Compare commits
2 commits
35f0c1fffe
...
26ec1901c3
Author | SHA1 | Date | |
---|---|---|---|
xenofem | 26ec1901c3 | | |
xenofem | 18fbc7f8dc | | |
|
@ -382,16 +382,20 @@ class Collator:
|
||||||
|
|
||||||
def link_pdf(self, src):
|
def link_pdf(self, src):
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
images = pdf_images(pdf, self.args.force_convert_pdf)
|
images = pdf_images(pdf, self.args.pdf_strategy)
|
||||||
if images is None:
|
if images is None:
|
||||||
print(f'Failed to enumerate page images in PDF, skipping {src}')
|
print(f'Failed to enumerate page images in PDF, skipping {src}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.dest.mkdir(parents=True, exist_ok=True)
|
self.dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
print(f'0 pages collated...', end='')
|
||||||
for (idx, image) in enumerate(images, start=self.index):
|
for (idx, image) in enumerate(images, start=self.index):
|
||||||
file_path = self.dest / f'{idx:04d}.{image["ext"]}'
|
file_path = self.dest / f'{idx:04d}.{image["ext"]}'
|
||||||
with open(file_path, 'wb') as f:
|
with open(file_path, 'wb') as f:
|
||||||
f.write(image["image"])
|
f.write(image["image"])
|
||||||
|
print(f'\x1b[2K\r{idx+1-self.index} pages collated...', end='')
|
||||||
|
print()
|
||||||
|
|
||||||
self.index += pdf.page_count
|
self.index += pdf.page_count
|
||||||
return True
|
return True
|
||||||
|
@ -569,13 +573,13 @@ def display_sixel_page(page):
|
||||||
finally:
|
finally:
|
||||||
sixel_output_unref(output)
|
sixel_output_unref(output)
|
||||||
|
|
||||||
def pdf_images(pdf, force=False):
|
def pdf_images(pdf, strategy):
|
||||||
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
||||||
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
||||||
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
print("Checking PDF images the quick way failed, trying the slow way")
|
||||||
print(f'0/{pdf.page_count} pages processed...', end='')
|
print(f'0/{pdf.page_count} pages analyzed...', end='')
|
||||||
image_extractors = []
|
image_extractors = []
|
||||||
for (idx, page) in enumerate(pdf):
|
for (idx, page) in enumerate(pdf):
|
||||||
page_images = page.get_image_info(xrefs=True)
|
page_images = page.get_image_info(xrefs=True)
|
||||||
|
@ -587,25 +591,30 @@ def pdf_images(pdf, force=False):
|
||||||
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
else:
|
else:
|
||||||
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
||||||
if force:
|
choice = strategy
|
||||||
print(f'Converting page {idx+1}')
|
while True:
|
||||||
choice = 'c'
|
if choice.lower().startswith('n'):
|
||||||
else:
|
return None
|
||||||
shown = False
|
if choice.lower().startswith('c'):
|
||||||
while True:
|
if choice == strategy:
|
||||||
choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
|
print(f'Converting page {idx+1}')
|
||||||
if not shown and choice != '' and choice[0].lower() == 's':
|
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
|
||||||
display_sixel_page(page)
|
break
|
||||||
shown = True
|
if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'):
|
||||||
else:
|
if choice == strategy:
|
||||||
break
|
print(f'Extracting image from page {idx+1} without text')
|
||||||
if xref is not None and choice != '' and choice[0].lower() == 'x':
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
break
|
||||||
elif choice != '' and choice[0].lower() == 'c':
|
if choice.lower().startswith('d'):
|
||||||
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
|
if choice == strategy:
|
||||||
else:
|
print(f'Dropping page {idx+1}')
|
||||||
return None
|
break
|
||||||
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
|
|
||||||
|
if choice.lower().startswith('s'):
|
||||||
|
display_sixel_page(page)
|
||||||
|
|
||||||
|
choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ')
|
||||||
|
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages analyzed...', end=('' if idx+1 < pdf.page_count else '\n'))
|
||||||
|
|
||||||
return (extractor() for extractor in image_extractors)
|
return (extractor() for extractor in image_extractors)
|
||||||
|
|
||||||
|
@ -1182,9 +1191,17 @@ parser_collate.add_argument(
|
||||||
help="only collate works that are explicitly specified",
|
help="only collate works that are explicitly specified",
|
||||||
)
|
)
|
||||||
parser_collate.add_argument(
|
parser_collate.add_argument(
|
||||||
'--force-convert-pdf',
|
'-p', '--pdf-strategy',
|
||||||
action='store_true',
|
choices=[
|
||||||
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
|
'ask', '?',
|
||||||
|
'show-ask', 's',
|
||||||
|
'convert', 'c',
|
||||||
|
'extract', 'x',
|
||||||
|
'drop', 'd',
|
||||||
|
'nope', 'n'
|
||||||
|
],
|
||||||
|
default='show-ask',
|
||||||
|
help="how to handle PDF pages that aren't a single image with no text",
|
||||||
)
|
)
|
||||||
parser_collate.add_argument(
|
parser_collate.add_argument(
|
||||||
'expression',
|
'expression',
|
||||||
|
|
Loading…
Reference in a new issue