Prompt for how to handle weird PDF pages instead of immediately bailing out
This commit is contained in:
parent
c7f95d50f9
commit
3a9199b847
|
@ -533,37 +533,35 @@ def pdf_images(pdf, force=False):
|
||||||
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
print("Checking PDF images the quick way failed, trying the slow way")
|
||||||
def xref_or_image_generator():
|
print(f'0/{pdf.page_count} pages processed...', end='')
|
||||||
xref_mode = not force
|
image_extractors = []
|
||||||
for (idx, page) in enumerate(pdf):
|
for (idx, page) in enumerate(pdf):
|
||||||
page_images = page.get_image_info(xrefs=True)
|
page_images = page.get_image_info(xrefs=True)
|
||||||
if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
|
if len(page_images) == 1 and page_images[0]['xref'] != 0:
|
||||||
xref = page_images[0]['xref']
|
xref = page_images[0]['xref']
|
||||||
if xref_mode:
|
else:
|
||||||
yield xref
|
xref = None
|
||||||
else:
|
if xref is not None and is_single_image(page):
|
||||||
yield extract_image(pdf, xref)
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
|
else:
|
||||||
|
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
||||||
|
if force:
|
||||||
|
print(f'Converting page {idx+1}')
|
||||||
|
choice = 'c'
|
||||||
else:
|
else:
|
||||||
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
if xref is None:
|
||||||
if xref_mode:
|
choice = input('[N]ope out of this PDF or [c]onvert the page lossily to an image? [N/c] ')
|
||||||
raise ValueError
|
|
||||||
else:
|
else:
|
||||||
print(f'Generating pixmap for page {idx+1}')
|
choice = input('[N]ope out of this PDF, e[x]tract the image without the text, or [c]onvert the entire page lossily to an image? [N/x/c] ')
|
||||||
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
|
if xref is not None and choice != '' and choice[0].lower() == 'x':
|
||||||
yield { 'ext': 'png', 'image': pix.tobytes('png') }
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
|
elif choice != '' and choice[0].lower() == 'c':
|
||||||
print('')
|
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_FALLBACK_DPI).tobytes('png') })
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
|
||||||
|
|
||||||
if force:
|
return (extractor() for extractor in image_extractors)
|
||||||
return xref_or_image_generator()
|
|
||||||
|
|
||||||
try:
|
|
||||||
xrefs = list(xref_or_image_generator())
|
|
||||||
except ValueError:
|
|
||||||
print('\nFailed')
|
|
||||||
return None
|
|
||||||
print('Success')
|
|
||||||
return (extract_image(pdf, xref) for xref in xrefs)
|
|
||||||
|
|
||||||
def nfc(s):
|
def nfc(s):
|
||||||
return unicodedata.normalize('NFC', s)
|
return unicodedata.normalize('NFC', s)
|
||||||
|
|
Loading…
Reference in a new issue