add a second fallback method for getting PDF image xrefs per page

main
xenofem 2024-01-22 10:36:20 -05:00
parent 578a46c217
commit 716a42858c
1 changed file with 28 additions and 10 deletions

View File

@@ -111,19 +111,37 @@ def fetch(args):
asyncio.run(fetch_async(args))
def image_xrefs(pdf):
    """Return the xref of the single image on each page of ``pdf``, in page order.

    Two strategies are tried in turn:

    1. Quick: call ``page.get_images()`` on every page. If every page lists
       exactly one image, the first field of each entry is taken as its xref.
    2. Slow: call ``page.get_image_info(xrefs=True)`` page by page, printing
       an in-place progress counter to stdout.

    Args:
        pdf: An open document that yields pages when iterated and exposes
            ``page_count`` (presumably a PyMuPDF document -- confirm against
            the callers).

    Returns:
        A list with one xref per page, or ``None`` if, under the slow method,
        some page does not report exactly one image with a nonzero xref.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for idx, page in enumerate(pdf):
        # "\x1b[2K" erases the current terminal line and "\r" returns to
        # column 0, so the counter redraws in place. No newline is written,
        # so flush explicitly: line-buffered stdout would otherwise hold
        # every update back until the final "Failed"/"Success" line.
        print(
            f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...',
            end='',
            flush=True,
        )
        images = page.get_image_info(xrefs=True)
        # An xref of 0 is rejected; presumably it means there is no
        # extractable image object behind the entry -- TODO confirm.
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])
    print('\nSuccess')
    return xrefs
def link_pdf(src, dest, start_index=0):
    """Extract the single image from each page of a PDF into ``dest``.

    The per-page image xrefs come from ``image_xrefs``, which already tries
    the quick lookup before falling back to the slow one, so no separate
    single-image check is done here.

    Args:
        src: Path of the PDF to open with ``fitz.open``.
        dest: Output directory (a ``pathlib.Path``-like object); created,
            including parents, if it does not exist.
        start_index: Number given to the first extracted page; later pages
            count up from it.

    Returns:
        None. Files are written as ``<dest>/<NNNN>.<ext>``, with the number
        zero-padded to four digits and the extension taken from the
        extracted image. If ``image_xrefs`` cannot resolve one image per
        page, a message is printed and nothing is written.
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return

        dest.mkdir(parents=True, exist_ok=True)
        for idx, xref in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            file_path.write_bytes(image["image"])
def complete_prefix_number_ordering(entries):
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))