more useful explanations of PDF failures

This commit is contained in:
xenofem 2024-03-11 05:47:47 -04:00
parent 6f4444f67e
commit 11ea5a0d58

View file

@@ -316,7 +316,7 @@ class Collator:
with fitz.open(src) as pdf: with fitz.open(src) as pdf:
images = pdf_images(pdf, self.args.force_convert_pdf) images = pdf_images(pdf, self.args.force_convert_pdf)
if images is None: if images is None:
print(f'Support for weirder PDFs not yet implemented, skipping {src}') print(f'Failed to enumerate page images in PDF, skipping {src}')
return None return None
self.dest.mkdir(parents=True, exist_ok=True) self.dest.mkdir(parents=True, exist_ok=True)
@@ -466,10 +466,11 @@ def pdf_images(pdf, force=False):
else: else:
yield extract_image(pdf, xref) yield extract_image(pdf, xref)
else: else:
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(page.get_text("blocks"))} total objects')
if xref_mode: if xref_mode:
raise ValueError raise ValueError
else: else:
print(f'\nGenerating pixmap for page {idx+1}') print(f'Generating pixmap for page {idx+1}')
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI) pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
yield { 'ext': 'png', 'image': pix.tobytes('png') } yield { 'ext': 'png', 'image': pix.tobytes('png') }
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='') print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')