From 18fbc7f8dc18388185d0c39965b0155d8f3082f5 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 12 Mar 2024 15:50:12 -0400
Subject: [PATCH 1/2] fancier options for PDF strategy

---
 dlibrary/dlibrary.py | 59 +++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 16706db..0d8a3c3 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -382,7 +382,7 @@ class Collator:
 
     def link_pdf(self, src):
         with fitz.open(src) as pdf:
-            images = pdf_images(pdf, self.args.force_convert_pdf)
+            images = pdf_images(pdf, self.args.pdf_strategy)
             if images is None:
                 print(f'Failed to enumerate page images in PDF, skipping {src}')
                 return None
@@ -569,7 +569,7 @@ def display_sixel_page(page):
     finally:
         sixel_output_unref(output)
 
-def pdf_images(pdf, force=False):
+def pdf_images(pdf, strategy):
     images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
     if all(len(images) == 1 and single for (images, single) in images_by_page):
         return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
@@ -587,24 +587,29 @@ def pdf_images(pdf, force=False):
             image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
         else:
             print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
-            if force:
-                print(f'Converting page {idx+1}')
-                choice = 'c'
-            else:
-                shown = False
-                while True:
-                    choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
-                    if not shown and choice != '' and choice[0].lower() == 's':
-                        display_sixel_page(page)
-                        shown = True
-                    else:
-                        break
-            if xref is not None and choice != '' and choice[0].lower() == 'x':
-                image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
-            elif choice != '' and choice[0].lower() == 'c':
-                image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
-            else:
-                return None
+            choice = strategy
+            while True:
+                if choice.lower().startswith('n'):
+                    return None
+                if choice.lower().startswith('c'):
+                    if choice == strategy:
+                        print(f'Converting page {idx+1}')
+                    image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
+                    break
+                if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'):
+                    if choice == strategy:
+                        print(f'Extracting image from page {idx+1} without text')
+                    image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
+                    break
+                if choice.lower().startswith('d'):
+                    if choice == strategy:
+                        print(f'Dropping page {idx+1}')
+                    break
+
+                if choice.lower().startswith('s'):
+                    display_sixel_page(page)
+
+                choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ')
         print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
 
     return (extractor() for extractor in image_extractors)
@@ -1182,9 +1187,17 @@ parser_collate.add_argument(
     help="only collate works that are explicitly specified",
 )
 parser_collate.add_argument(
-    '--force-convert-pdf',
-    action='store_true',
-    help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
+    '-p', '--pdf-strategy',
+    choices=[
+        'ask', '?',
+        'show-ask', 's',
+        'convert', 'c',
+        'extract', 'x',
+        'drop', 'd',
+        'nope', 'n'
+    ],
+    default='show-ask',
+    help="how to handle PDF pages that aren't a single image with no text",
 )
 parser_collate.add_argument(
     'expression',

From 26ec1901c34346608a3d89e91729ff8b4c564b45 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 12 Mar 2024 15:56:40 -0400
Subject: [PATCH 2/2] display progress for extracting pdf images as well as for
 analyzing them

---
 dlibrary/dlibrary.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 0d8a3c3..8d60a62 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -388,10 +388,14 @@ class Collator:
                 return None
 
             self.dest.mkdir(parents=True, exist_ok=True)
+
+            print(f'0 pages collated...', end='')
             for (idx, image) in enumerate(images, start=self.index):
                 file_path = self.dest / f'{idx:04d}.{image["ext"]}'
                 with open(file_path, 'wb') as f:
                     f.write(image["image"])
+                print(f'\x1b[2K\r{idx+1-self.index} pages collated...', end='')
+            print()
 
             self.index += pdf.page_count
             return True
@@ -575,7 +579,7 @@ def pdf_images(pdf, strategy):
         return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
 
     print("Checking PDF images the quick way failed, trying the slow way")
-    print(f'0/{pdf.page_count} pages processed...', end='')
+    print(f'0/{pdf.page_count} pages analyzed...', end='')
     image_extractors = []
     for (idx, page) in enumerate(pdf):
         page_images = page.get_image_info(xrefs=True)
@@ -610,7 +614,7 @@ def pdf_images(pdf, strategy):
                     display_sixel_page(page)
 
                 choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ')
-        print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
+        print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages analyzed...', end=('' if idx+1 < pdf.page_count else '\n'))
 
     return (extractor() for extractor in image_extractors)