From 9fea03c270d78746c534efb18000da38abc5fc7e Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sat, 2 Mar 2024 18:09:46 -0500
Subject: [PATCH 1/3] add option to convert PDF pages to pixmaps as needed

---
 dlibrary/dlibrary.py | 71 ++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index acc3291..fcaf440 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -77,6 +77,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
 IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
 IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
 
+PDF_FALLBACK_DPI = 300  # dpi passed to page.get_pixmap() when a PDF page is rendered to PNG instead of extracted
+
 def open_zipfile_with_encoding(path):
     try:
         return zipfile.ZipFile(path, metadata_encoding="utf-8")
@@ -232,7 +234,7 @@ def collate(args):
 
         work_staging_dir = collation_staging_area / work_id
 
-        collator = Collator(work_staging_dir, [], args.locale)
+        collator = Collator(work_staging_dir, [], args)
         collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
         if collation_result and collator.index > 0:
             print(f'Collated {collator.index} pages for {work_id}')
@@ -252,10 +254,10 @@ def collate(args):
     con.close()
 
 class Collator:
-    def __init__(self, dest, exclude, locale):
+    def __init__(self, dest, exclude, args):
         self.dest = dest
         self.exclude = exclude
-        self.locale = locale
+        self.args = args
         self.index = 0
 
     def collate_from_paths(self, srcs):
@@ -309,14 +311,13 @@ class Collator:
 
     def link_pdf(self, src):
         with fitz.open(src) as pdf:
-            xrefs = image_xrefs(pdf)
-            if xrefs is None:
+            images = pdf_images(pdf, self.args.force_convert_pdf)
+            if images is None:
                 print(f'Support for weirder PDFs not yet implemented, skipping {src}')
                 return None
 
             self.dest.mkdir(parents=True, exist_ok=True)
-            for (idx, xref) in enumerate(xrefs, start=self.index):
-                image = pdf.extract_image(xref)
+            for (idx, image) in enumerate(images, start=self.index):
                 file_path = self.dest / f'{idx:04d}.{image["ext"]}'
                 with open(file_path, 'wb') as f:
                     f.write(image["image"])
@@ -422,34 +423,53 @@ class Collator:
             return False
 
     def try_collate_select_language(self, srcs):
-        if self.locale not in LANGUAGE_REGEXES:
+        if self.args.locale not in LANGUAGE_REGEXES:
             return False
         if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
             return False
 
-        srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.locale].search(nname(src))]
+        srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
         if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
             return False
 
         return self.collate_from_paths(srcs_matching_language)
 
-def image_xrefs(pdf):
+def pdf_images(pdf, force=False):
     images_by_page = [page.get_images() for page in pdf]
     if all(len(images) == 1 for images in images_by_page):
-        return [images[0][0] for images in images_by_page]
+        return (pdf.extract_image(images[0][0]) for images in images_by_page)
 
     print("Checking PDF images the quick way failed, trying the slow way")
-    xrefs = []
-    for (idx, page) in enumerate(pdf):
-        print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
-        images = page.get_image_info(xrefs=True)
-        if len(images) != 1 or images[0]['xref'] == 0:
-            print('\nFailed')
-            return None
-        xrefs.append(images[0]['xref'])
+    def xref_or_image_generator():
+        xref_mode = not force
+        for (idx, page) in enumerate(pdf):
+            page_images = page.get_image_info(xrefs=True)
+            if len(page_images) == 1 and page_images[0]['xref'] != 0:
+                xref = page_images[0]['xref']
+                if xref_mode:
+                    yield xref
+                else:
+                    yield pdf.extract_image(xref)
+            else:
+                if xref_mode:
+                    raise ValueError  # page has no single extractable image; caught below, where pdf_images returns None
+                else:
+                    print(f'\nGenerating pixmap for page {idx+1}')
+                    pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
+                    yield { 'ext': 'png', 'image': pix.tobytes('png') }
+            print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
+        print('')
 
-    print('\nSuccess')
-    return xrefs
+    if force:
+        return xref_or_image_generator()  # lazy: pages are inspected/rendered only as the caller iterates
+
+    try:
+        xrefs = list(xref_or_image_generator())
+    except ValueError:
+        print('\nFailed')
+        return None
+    print('Success')
+    return (pdf.extract_image(xref) for xref in xrefs)  # every page was checked above; extraction itself happens lazily
 
 def nfc(s):
     return unicodedata.normalize('NFC', s)
@@ -706,7 +726,7 @@ def manual_collate(args):
     work_staging_dir = collation_staging_area / work_id
     work_staging_dir.mkdir(parents=True)
 
-    collator = Collator(work_staging_dir, exclusions, args.locale)
+    collator = Collator(work_staging_dir, exclusions, args)
     for group in groups:
         collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
         if collation_result is None:
@@ -1002,7 +1022,7 @@ parser_collate.add_argument(
     nargs='*',
     help='paths within extraction folders as collation hints'
 )
-parser_collate.set_defaults(func=collate)
+parser_collate.set_defaults(func=collate, force_convert_pdf=False)
 
 parser_manual_collate = subparsers.add_parser(
     'manual-collate',
@@ -1042,6 +1062,11 @@ parser_manual_collate = subparsers.add_parser(
     only handle one work at a time.
 """),
 )
+parser_manual_collate.add_argument(
+    '--force-convert-pdf',
+    action='store_true',  # off unless passed; the collate subcommand pins it to False via set_defaults
+    help=f"convert a PDF page to a {PDF_FALLBACK_DPI}dpi image if there isn't a single image we can extract directly",
+)
 parser_manual_collate.add_argument(
     'expression',
     nargs='+',

From c042163e85dec7ecf4ce29d3e71fd417c795152b Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sat, 2 Mar 2024 18:10:22 -0500
Subject: [PATCH 2/3] properly handle edge case when we point collate or
 manual-collate directly at an extraction directory

---
 dlibrary/dlibrary.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index fcaf440..132e7b2 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -208,12 +208,15 @@ def fetch(args):
     asyncio.run(fetch_async(args))
 
 
+def self_and_parents(path):  # list holding the path itself, then each entry of path.parents in order
+    return [path, *path.parents]
+
 def collate(args):
     con = sqlite3.connect(args.destdir / 'meta.db')
     cur = con.cursor()
 
     extraction_dir = args.destdir / 'extract'
-    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
+    hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints}
 
     collation_staging_area = args.destdir / 'site' / 'images-staging'
     collation_staging_area.mkdir(parents=True)
@@ -697,7 +700,7 @@ def manual_collate(args):
     extraction_dir = args.destdir / 'extract'
 
     sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
-    work_id = Path(relpath(sample_path, extraction_dir)).parents[-2].name
+    work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name
 
     exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
 

From 7535cb6162b1a4802c25a14ff9e85ee7712939df Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sat, 2 Mar 2024 18:27:15 -0500
Subject: [PATCH 3/3] also check whether PDFs have text alongside images

---
 dlibrary/dlibrary.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 132e7b2..864d6b6 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -437,17 +437,21 @@ class Collator:
 
         return self.collate_from_paths(srcs_matching_language)
 
+def is_single_image(page):  # True when the page's 'blocks' text extraction yields exactly one block and it is of type 1
+    blocks = page.get_text('blocks')
+    return len(blocks) == 1 and blocks[0][6] == 1  # NOTE(review): item 6 is presumably PyMuPDF's block_type, 1 = image block — confirm
+
 def pdf_images(pdf, force=False):
-    images_by_page = [page.get_images() for page in pdf]
-    if all(len(images) == 1 for images in images_by_page):
-        return (pdf.extract_image(images[0][0]) for images in images_by_page)
+    images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
+    if all(len(images) == 1 and single for (images, single) in images_by_page):
+        return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page)
 
     print("Checking PDF images the quick way failed, trying the slow way")
     def xref_or_image_generator():
         xref_mode = not force
         for (idx, page) in enumerate(pdf):
             page_images = page.get_image_info(xrefs=True)
-            if len(page_images) == 1 and page_images[0]['xref'] != 0:
+            if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
                 xref = page_images[0]['xref']
                 if xref_mode:
                     yield xref