def extract_image(pdf, xref):
    """Return the image stored at ``xref``, re-encoded as PNG if necessary.

    The embedded image is first pulled out of the PDF unchanged with
    ``pdf.extract_image``.  If its extension (with a leading dot) appears in
    ``IMAGE_FILE_EXTENSIONS`` that result is returned untouched.  Any other
    encoding (JBIG2, for example) is decoded into a pixmap and re-encoded
    as PNG.

    Args:
        pdf: The PDF document that owns ``xref``; it must provide
            ``extract_image`` and be accepted by ``fitz.Pixmap``.
        xref: Cross-reference number of the image object in ``pdf``.

    Returns:
        Either the dict produced by ``pdf.extract_image(xref)`` or, for a
        converted image, a dict holding only ``'ext'`` (always ``'png'``)
        and ``'image'`` (the PNG-encoded bytes).
    """
    image = pdf.extract_image(xref)
    if f'.{image["ext"]}' in IMAGE_FILE_EXTENSIONS:
        return image

    print(f'Converting image from {image["ext"]} to png')
    pix = fitz.Pixmap(pdf, xref)
    # PNG output only supports grayscale and RGB pixmaps.  More than three
    # colour components (alpha excluded) means CMYK or similar, which would
    # make tobytes('png') fail, so convert to RGB first.
    if pix.n - pix.alpha > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    return {'ext': 'png', 'image': pix.tobytes('png')}