Add a second, fallback method for getting PDF image xrefs per page
This commit is contained in:
parent
578a46c217
commit
716a42858c
30
dlibrary.py
30
dlibrary.py
|
@ -111,19 +111,37 @@ def fetch(args):
|
|||
asyncio.run(fetch_async(args))
|
||||
|
||||
|
||||
def link_pdf(src, dest, start_index=0):
|
||||
with fitz.open(src) as pdf:
|
||||
def image_xrefs(pdf):
    """Return one image xref per page of *pdf*, or None if that is impossible.

    Two strategies are tried in order:

    1. Quick: call ``page.get_images()`` on every page. If every page lists
       exactly one image, the first field of each entry is returned as the
       page's xref.
    2. Slow: call ``page.get_image_info(xrefs=True)`` page by page while
       printing a progress counter. Every page must report exactly one image
       whose ``'xref'`` is non-zero.

    Args:
        pdf: An opened document; it is iterated for its pages and must expose
            ``page_count`` (presumably a PyMuPDF ``Document`` -- confirm
            against the caller).

    Returns:
        A list of xrefs in page order, or ``None`` as soon as the slow pass
        meets a page that does not have exactly one image with a non-zero
        xref.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for (idx, page) in enumerate(pdf):
        # ESC[2K erases the current terminal line and \r returns to column 0,
        # so the counter is redrawn in place instead of scrolling.
        print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
        images = page.get_image_info(xrefs=True)
        # NOTE(review): an xref of 0 presumably means the image has no
        # extractable object (e.g. an inline image) -- confirm in PyMuPDF docs.
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])

    print('\nSuccess')
    return xrefs
|
||||
|
||||
def link_pdf(src, dest, start_index=0):
    """Write the single image of each page of the PDF at *src* into *dest*.

    The per-page image xrefs are obtained from ``image_xrefs``. When that
    returns ``None`` a message is printed and the PDF is skipped without
    creating *dest*.

    Args:
        src: Path of the PDF, passed to ``fitz.open``.
        dest: Output directory; must support ``mkdir`` and the ``/`` operator
            (presumably a ``pathlib.Path`` -- confirm against the caller).
            It is created, including parents, if missing.
        start_index: Number given to the first extracted image; later images
            count up from it.

    Returns:
        None. Images are written as ``<dest>/<NNNN>.<ext>``, where ``NNNN`` is
        the zero-padded four-digit index and ``ext`` comes from the extracted
        image's ``"ext"`` entry.
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return

        dest.mkdir(parents=True, exist_ok=True)
        for (idx, xref) in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            with open(file_path, 'wb') as f:
                f.write(image["image"])
|
||||
|
||||
def complete_prefix_number_ordering(entries):
|
||||
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
|
||||
|
|
Loading…
Reference in a new issue