convert extra-weird PDF image formats like JBIG2 to PNG

This commit is contained in:
xenofem 2024-03-03 01:47:29 -05:00
parent 0d1bff74c2
commit f94f23186b

View file

@ -441,10 +441,18 @@ def is_single_image(page):
blocks = page.get_text('blocks') blocks = page.get_text('blocks')
return len(blocks) == 1 and blocks[0][6] == 1 return len(blocks) == 1 and blocks[0][6] == 1
def extract_image(pdf, xref):
    """Extract the image at *xref* from *pdf*, converting it to PNG if needed.

    The image is first pulled out in its embedded encoding via
    ``pdf.extract_image``.  If its extension (with a leading dot) is listed
    in ``IMAGE_FILE_EXTENSIONS`` that dictionary is returned unchanged.
    Otherwise the image is decoded into a ``fitz.Pixmap`` and re-encoded as
    PNG.

    Args:
        pdf: The open document object the image belongs to.
        xref: Cross-reference number of the image object inside *pdf*.

    Returns:
        A dict with at least the keys ``'ext'`` (extension without the dot)
        and ``'image'`` (the encoded bytes).  For converted images the dict
        contains only those two keys, with ``'ext'`` set to ``'png'``.
    """
    image = pdf.extract_image(xref)
    if f'.{image["ext"]}' in IMAGE_FILE_EXTENSIONS:
        return image

    print(f'Converting image from {image["ext"]} to png')
    pix = fitz.Pixmap(pdf, xref)
    # PNG can only store gray or RGB samples.  A pixmap with more than three
    # colour components (e.g. CMYK) must be converted to RGB first, otherwise
    # encoding it as PNG fails.
    if pix.n - pix.alpha > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    return {'ext': 'png', 'image': pix.tobytes('png')}
def pdf_images(pdf, force=False): def pdf_images(pdf, force=False):
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
if all(len(images) == 1 and single for (images, single) in images_by_page): if all(len(images) == 1 and single for (images, single) in images_by_page):
return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page) return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
print("Checking PDF images the quick way failed, trying the slow way") print("Checking PDF images the quick way failed, trying the slow way")
def xref_or_image_generator(): def xref_or_image_generator():
@ -456,7 +464,7 @@ def pdf_images(pdf, force=False):
if xref_mode: if xref_mode:
yield xref yield xref
else: else:
yield pdf.extract_image(xref) yield extract_image(pdf, xref)
else: else:
if xref_mode: if xref_mode:
raise ValueError raise ValueError
@ -476,7 +484,7 @@ def pdf_images(pdf, force=False):
print('\nFailed') print('\nFailed')
return None return None
print('Success') print('Success')
return (pdf.extract_image(xref) for xref in xrefs) return (extract_image(pdf, xref) for xref in xrefs)
def nfc(s): def nfc(s):
return unicodedata.normalize('NFC', s) return unicodedata.normalize('NFC', s)