Compare commits

..

No commits in common. "26ec1901c34346608a3d89e91729ff8b4c564b45" and "35f0c1fffe9d79e1fc4e182d2385f26fd9dfbabe" have entirely different histories.

View file

@ -382,20 +382,16 @@ class Collator:
def link_pdf(self, src): def link_pdf(self, src):
with fitz.open(src) as pdf: with fitz.open(src) as pdf:
images = pdf_images(pdf, self.args.pdf_strategy) images = pdf_images(pdf, self.args.force_convert_pdf)
if images is None: if images is None:
print(f'Failed to enumerate page images in PDF, skipping {src}') print(f'Failed to enumerate page images in PDF, skipping {src}')
return None return None
self.dest.mkdir(parents=True, exist_ok=True) self.dest.mkdir(parents=True, exist_ok=True)
print(f'0 pages collated...', end='')
for (idx, image) in enumerate(images, start=self.index): for (idx, image) in enumerate(images, start=self.index):
file_path = self.dest / f'{idx:04d}.{image["ext"]}' file_path = self.dest / f'{idx:04d}.{image["ext"]}'
with open(file_path, 'wb') as f: with open(file_path, 'wb') as f:
f.write(image["image"]) f.write(image["image"])
print(f'\x1b[2K\r{idx+1-self.index} pages collated...', end='')
print()
self.index += pdf.page_count self.index += pdf.page_count
return True return True
@ -573,13 +569,13 @@ def display_sixel_page(page):
finally: finally:
sixel_output_unref(output) sixel_output_unref(output)
def pdf_images(pdf, strategy): def pdf_images(pdf, force=False):
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
if all(len(images) == 1 and single for (images, single) in images_by_page): if all(len(images) == 1 and single for (images, single) in images_by_page):
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page) return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
print("Checking PDF images the quick way failed, trying the slow way") print("Checking PDF images the quick way failed, trying the slow way")
print(f'0/{pdf.page_count} pages analyzed...', end='') print(f'0/{pdf.page_count} pages processed...', end='')
image_extractors = [] image_extractors = []
for (idx, page) in enumerate(pdf): for (idx, page) in enumerate(pdf):
page_images = page.get_image_info(xrefs=True) page_images = page.get_image_info(xrefs=True)
@ -591,30 +587,25 @@ def pdf_images(pdf, strategy):
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
else: else:
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects') print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
choice = strategy if force:
while True: print(f'Converting page {idx+1}')
if choice.lower().startswith('n'): choice = 'c'
return None else:
if choice.lower().startswith('c'): shown = False
if choice == strategy: while True:
print(f'Converting page {idx+1}') choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) if not shown and choice != '' and choice[0].lower() == 's':
break display_sixel_page(page)
if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'): shown = True
if choice == strategy: else:
print(f'Extracting image from page {idx+1} without text') break
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) if xref is not None and choice != '' and choice[0].lower() == 'x':
break image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
if choice.lower().startswith('d'): elif choice != '' and choice[0].lower() == 'c':
if choice == strategy: image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
print(f'Dropping page {idx+1}') else:
break return None
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
if choice.lower().startswith('s'):
display_sixel_page(page)
choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ')
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages analyzed...', end=('' if idx+1 < pdf.page_count else '\n'))
return (extractor() for extractor in image_extractors) return (extractor() for extractor in image_extractors)
@ -1191,17 +1182,9 @@ parser_collate.add_argument(
help="only collate works that are explicitly specified", help="only collate works that are explicitly specified",
) )
parser_collate.add_argument( parser_collate.add_argument(
'-p', '--pdf-strategy', '--force-convert-pdf',
choices=[ action='store_true',
'ask', '?', help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
'show-ask', 's',
'convert', 'c',
'extract', 'x',
'drop', 'd',
'nope', 'n'
],
default='show-ask',
help="how to handle PDF pages that aren't a single image with no text",
) )
parser_collate.add_argument( parser_collate.add_argument(
'expression', 'expression',