From 435af20e596a8cd87679ec4817a3cb15ec7edd56 Mon Sep 17 00:00:00 2001 From: xenofem Date: Sat, 16 Mar 2024 01:51:01 -0400 Subject: [PATCH] identify PDF page image xrefs in a *much* faster and less stupid way than PyMuPDF does it --- dlibrary/dlibrary.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 012d1dd..60e77fd 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -88,6 +88,9 @@ IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I) MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I) MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I) +PDF_REFERENCED_IMAGE_REGEX = re.compile(r'(^|(?<=\s))/(?P<ref_name>\S+)\s+Do($|(?=\s))') +PDF_INLINE_IMAGE_REGEX = re.compile(r'(^|\s)(BI|ID|EI)($|\s)') + debug_mode = False def debug(s): if debug_mode: @@ -575,6 +578,33 @@ def extract_image(pdf, xref): pix = fitz.Pixmap(pdf, xref) return { 'ext': 'png', 'image': pix.tobytes('png') } +def get_displayed_image_xref(page): + ref_names = [] + for content_xref in page.get_contents(): + content = page.parent.xref_stream(content_xref).decode('ascii', 'replace') + if PDF_INLINE_IMAGE_REGEX.search(content): + debug('Inline image detected') + return None + for m in PDF_REFERENCED_IMAGE_REGEX.finditer(content): + ref_names.append(m.group('ref_name')) + + if len(ref_names) == 0: + debug('Page does not reference any xobjects') + return None + if len(ref_names) > 1: + debug(f'Page references multiple xobjects: {ref_names}') + return None + + image_xrefs = [image[0] for image in page.get_images() if image[7] == ref_names[0]] + if len(image_xrefs) == 1: + return image_xrefs[0] + + if len(image_xrefs) == 0: + debug(f'No images found matching ref name {ref_names[0]}') + else: + debug(f"Multiple images found matching ref name {ref_names[0]}, that probably shouldn't happen") + return None + def display_sixel_page(page): s = BytesIO() 
image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png'))) @@ -615,23 +645,15 @@ def display_sixel_page(page): sixel_output_unref(output) def pdf_images(pdf, strategy): - images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] - if all(len(images) == 1 and single for (images, single) in images_by_page): - return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page) - - print("Checking PDF images the quick way failed, trying the slow way") print(f'0/{pdf.page_count} pages analyzed...', end='') image_extractors = [] for (idx, page) in enumerate(pdf): - page_images = page.get_image_info(xrefs=True) - if len(page_images) == 1 and page_images[0]['xref'] != 0: - xref = page_images[0]['xref'] - else: - xref = None + xref = get_displayed_image_xref(page) if xref is not None and is_single_image(page): image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) else: - print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects') + page_images = page.get_image_info() + print(f'\nPage {idx+1}: {len(page_images)} images, {len(relevant_blocks(page))} total relevant objects') choice = strategy while True: if choice.lower().startswith('n'):