From 3a9199b847d73199550359bff67c1002ffeb4ea5 Mon Sep 17 00:00:00 2001
From: xenofem
Date: Tue, 12 Mar 2024 03:23:57 -0400
Subject: [PATCH] prompt for how to handle PDF weird pages, instead of immediately bailing out

---
 dlibrary/dlibrary.py | 54 +++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index b0720ec..bd3fa8f 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -533,37 +533,35 @@ def pdf_images(pdf, force=False):
         return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
 
     print("Checking PDF images the quick way failed, trying the slow way")
-    def xref_or_image_generator():
-        xref_mode = not force
-        for (idx, page) in enumerate(pdf):
-            page_images = page.get_image_info(xrefs=True)
-            if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
-                xref = page_images[0]['xref']
-                if xref_mode:
-                    yield xref
-                else:
-                    yield extract_image(pdf, xref)
+    print(f'0/{pdf.page_count} pages processed...', end='')
+    image_extractors = []
+    for (idx, page) in enumerate(pdf):
+        page_images = page.get_image_info(xrefs=True)
+        if len(page_images) == 1 and page_images[0]['xref'] != 0:
+            xref = page_images[0]['xref']
+        else:
+            xref = None
+        if xref is not None and is_single_image(page):
+            image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
+        else:
+            print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
+            if force:
+                print(f'Converting page {idx+1}')
+                choice = 'c'
             else:
-                print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
-                if xref_mode:
-                    raise ValueError
+                if xref is None:
+                    choice = input('[N]ope out of this PDF or [c]onvert the page lossily to an image? [N/c] ')
                 else:
-                    print(f'Generating pixmap for page {idx+1}')
-                    pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
-                    yield { 'ext': 'png', 'image': pix.tobytes('png') }
-            print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
-        print('')
+                    choice = input('[N]ope out of this PDF, e[x]tract the image without the text, or [c]onvert the entire page lossily to an image? [N/x/c] ')
+            if xref is not None and choice != '' and choice[0].lower() == 'x':
+                image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
+            elif choice != '' and choice[0].lower() == 'c':
+                image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_FALLBACK_DPI).tobytes('png') })
+            else:
+                return None
+        print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
 
-    if force:
-        return xref_or_image_generator()
-
-    try:
-        xrefs = list(xref_or_image_generator())
-    except ValueError:
-        print('\nFailed')
-        return None
-    print('Success')
-    return (extract_image(pdf, xref) for xref in xrefs)
+    return (extractor() for extractor in image_extractors)
 
 def nfc(s):
     return unicodedata.normalize('NFC', s)