convert extra-weird PDF image formats like JBIG2 to PNG
This commit is contained in:
parent
0d1bff74c2
commit
f94f23186b
|
@ -441,10 +441,18 @@ def is_single_image(page):
|
||||||
blocks = page.get_text('blocks')
|
blocks = page.get_text('blocks')
|
||||||
return len(blocks) == 1 and blocks[0][6] == 1
|
return len(blocks) == 1 and blocks[0][6] == 1
|
||||||
|
|
||||||
|
def extract_image(pdf, xref):
    """Extract the image at ``xref`` from ``pdf``, converting to PNG if needed.

    The image is first pulled out with ``pdf.extract_image(xref)``. If its
    extension (with a leading dot prepended) is listed in
    ``IMAGE_FILE_EXTENSIONS``, that result is returned untouched. Otherwise
    the image is rendered through a ``fitz.Pixmap`` and re-encoded as PNG.

    Args:
        pdf: The open document; it must provide ``extract_image(xref)`` and
            be accepted by ``fitz.Pixmap(pdf, xref)``.
        xref: Cross-reference number of the image object inside ``pdf``.

    Returns:
        Either the dict produced by ``pdf.extract_image(xref)``, or a dict
        holding only the keys ``'ext'`` (always ``'png'``) and ``'image'``
        (the PNG-encoded bytes) when a conversion took place.

    NOTE(review): ``IMAGE_FILE_EXTENSIONS`` is defined elsewhere in the file;
    it is assumed to hold dot-prefixed extensions such as ``'.png'`` --
    confirm against its definition.
    """
    image = pdf.extract_image(xref)
    if f'.{image["ext"]}' in IMAGE_FILE_EXTENSIONS:
        return image

    print(f'Converting image from {image["ext"]} to png')
    pix = fitz.Pixmap(pdf, xref)
    # PNG can only store grayscale or RGB samples (optionally with alpha).
    # More than three colour components (e.g. CMYK) would make the PNG
    # encoder fail, so convert such pixmaps to RGB first.
    if pix.n - pix.alpha > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    return { 'ext': 'png', 'image': pix.tobytes('png') }
|
||||||
|
|
||||||
def pdf_images(pdf, force=False):
|
def pdf_images(pdf, force=False):
|
||||||
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
||||||
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
||||||
return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page)
|
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
print("Checking PDF images the quick way failed, trying the slow way")
|
||||||
def xref_or_image_generator():
|
def xref_or_image_generator():
|
||||||
|
@ -456,7 +464,7 @@ def pdf_images(pdf, force=False):
|
||||||
if xref_mode:
|
if xref_mode:
|
||||||
yield xref
|
yield xref
|
||||||
else:
|
else:
|
||||||
yield pdf.extract_image(xref)
|
yield extract_image(pdf, xref)
|
||||||
else:
|
else:
|
||||||
if xref_mode:
|
if xref_mode:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
@ -476,7 +484,7 @@ def pdf_images(pdf, force=False):
|
||||||
print('\nFailed')
|
print('\nFailed')
|
||||||
return None
|
return None
|
||||||
print('Success')
|
print('Success')
|
||||||
return (pdf.extract_image(xref) for xref in xrefs)
|
return (extract_image(pdf, xref) for xref in xrefs)
|
||||||
|
|
||||||
def nfc(s):
|
def nfc(s):
|
||||||
return unicodedata.normalize('NFC', s)
|
return unicodedata.normalize('NFC', s)
|
||||||
|
|
Loading…
Reference in a new issue