also check whether PDFs have text alongside images

2024-03-02 18:27:15 -05:00 · 2024-03-02 18:27:15 -05:00 · 7535cb6162
commit 7535cb6162
parent c042163e85
1 changed file with 8 additions and 4 deletions
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -437,17 +437,21 @@ class Collator:

        return self.collate_from_paths(srcs_matching_language)

+def is_single_image(page):
+    blocks = page.get_text('blocks')
+    return len(blocks) == 1 and blocks[0][6] == 1
+
 def pdf_images(pdf, force=False):
-    images_by_page = [page.get_images() for page in pdf]
-    if all(len(images) == 1 for images in images_by_page):
-        return (pdf.extract_image(images[0][0]) for images in images_by_page)
+    images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
+    if all(len(images) == 1 and single for (images, single) in images_by_page):
+        return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page)

    print("Checking PDF images the quick way failed, trying the slow way")
    def xref_or_image_generator():
        xref_mode = not force
        for (idx, page) in enumerate(pdf):
            page_images = page.get_image_info(xrefs=True)
-            if len(page_images) == 1 and page_images[0]['xref'] != 0:
+            if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
                xref = page_images[0]['xref']
                if xref_mode:
                    yield xref