From 7535cb6162b1a4802c25a14ff9e85ee7712939df Mon Sep 17 00:00:00 2001 From: xenofem Date: Sat, 2 Mar 2024 18:27:15 -0500 Subject: [PATCH] also check whether PDFs have text alongside images --- dlibrary/dlibrary.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 132e7b2..864d6b6 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -437,17 +437,21 @@ class Collator: return self.collate_from_paths(srcs_matching_language) +def is_single_image(page): + blocks = page.get_text('blocks') + return len(blocks) == 1 and blocks[0][6] == 1 + def pdf_images(pdf, force=False): - images_by_page = [page.get_images() for page in pdf] - if all(len(images) == 1 for images in images_by_page): - return (pdf.extract_image(images[0][0]) for images in images_by_page) + images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] + if all(len(images) == 1 and single for (images, single) in images_by_page): + return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page) print("Checking PDF images the quick way failed, trying the slow way") def xref_or_image_generator(): xref_mode = not force for (idx, page) in enumerate(pdf): page_images = page.get_image_info(xrefs=True) - if len(page_images) == 1 and page_images[0]['xref'] != 0: + if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page): xref = page_images[0]['xref'] if xref_mode: yield xref