From 18fbc7f8dc18388185d0c39965b0155d8f3082f5 Mon Sep 17 00:00:00 2001 From: xenofem Date: Tue, 12 Mar 2024 15:50:12 -0400 Subject: [PATCH 1/2] fancier options for PDF strategy --- dlibrary/dlibrary.py | 59 +++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 16706db..0d8a3c3 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -382,7 +382,7 @@ class Collator: def link_pdf(self, src): with fitz.open(src) as pdf: - images = pdf_images(pdf, self.args.force_convert_pdf) + images = pdf_images(pdf, self.args.pdf_strategy) if images is None: print(f'Failed to enumerate page images in PDF, skipping {src}') return None @@ -569,7 +569,7 @@ def display_sixel_page(page): finally: sixel_output_unref(output) -def pdf_images(pdf, force=False): +def pdf_images(pdf, strategy): images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] if all(len(images) == 1 and single for (images, single) in images_by_page): return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page) @@ -587,24 +587,29 @@ def pdf_images(pdf, force=False): image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) else: print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects') - if force: - print(f'Converting page {idx+1}') - choice = 'c' - else: - shown = False - while True: - choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ') - if not shown and choice != '' and choice[0].lower() == 's': - display_sixel_page(page) - shown = True - else: - break - if xref is not None and choice != '' and choice[0].lower() == 'x': - image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) - elif choice != '' and choice[0].lower() == 'c': - image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) - else: - return None + choice = strategy + while True: + if choice.lower().startswith('n'): + return None + if choice.lower().startswith('c'): + if choice == strategy: + print(f'Converting page {idx+1}') + image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) + break + if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'): + if choice == strategy: + print(f'Extracting image from page {idx+1} without text') + image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) + break + if choice.lower().startswith('d'): + if choice == strategy: + print(f'Dropping page {idx+1}') + break + + if choice.lower().startswith('s'): + display_sixel_page(page) + + choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ') print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n')) return (extractor() for extractor in image_extractors) @@ -1182,9 +1187,17 @@ parser_collate.add_argument( help="only collate works that are explicitly specified", ) parser_collate.add_argument( - '--force-convert-pdf', - action='store_true', - help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", + '-p', '--pdf-strategy', + choices=[ + 'ask', '?', + 'show-ask', 's', + 'convert', 'c', + 'extract', 'x', + 'drop', 'd', + 'nope', 'n' + ], + default='show-ask', + help="how to handle PDF pages that aren't a single image with no text", ) parser_collate.add_argument( 'expression', From 26ec1901c34346608a3d89e91729ff8b4c564b45 Mon Sep 17 00:00:00 2001 From: xenofem Date: Tue, 12 Mar 2024 15:56:40 -0400 Subject: [PATCH 2/2] display progress for extracting pdf images as well as for analyzing them --- dlibrary/dlibrary.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 0d8a3c3..8d60a62 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -388,10 +388,14 @@ class Collator: return None self.dest.mkdir(parents=True, exist_ok=True) + + print(f'0 pages collated...', end='') for (idx, image) in enumerate(images, start=self.index): file_path = self.dest / f'{idx:04d}.{image["ext"]}' with open(file_path, 'wb') as f: f.write(image["image"]) + print(f'\x1b[2K\r{idx+1-self.index} pages collated...', end='') + print() self.index += pdf.page_count return True @@ -575,7 +579,7 @@ def pdf_images(pdf, strategy): return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page) print("Checking PDF images the quick way failed, trying the slow way") - print(f'0/{pdf.page_count} pages processed...', end='') + print(f'0/{pdf.page_count} pages analyzed...', end='') image_extractors = [] for (idx, page) in enumerate(pdf): page_images = page.get_image_info(xrefs=True) @@ -610,7 +614,7 @@ def pdf_images(pdf, strategy): display_sixel_page(page) choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ') - print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n')) + print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages analyzed...', end=('' if idx+1 < pdf.page_count else '\n')) return (extractor() for extractor in image_extractors)