add a second fallback method for getting PDF image xrefs per page
This commit is contained in:
parent
578a46c217
commit
716a42858c
38
dlibrary.py
38
dlibrary.py
|
@@ -111,19 +111,37 @@ def fetch(args):
|
||||||
asyncio.run(fetch_async(args))
|
asyncio.run(fetch_async(args))
|
||||||
|
|
||||||
|
|
||||||
|
def image_xrefs(pdf):
    """Return the xref of the single image on each page of ``pdf``, or None.

    Two strategies are tried in order:

    1. Quick: call ``page.get_images()`` on every page; if every page
       reports exactly one entry, the first field of that entry is used
       as the page's xref.
    2. Slow: call ``page.get_image_info(xrefs=True)`` page by page and
       require exactly one entry whose ``'xref'`` is non-zero.

    Status and progress messages are printed to stdout.

    Args:
        pdf: An open document. Iterating it yields page objects, and it
            exposes ``page_count``.

    Returns:
        A list with one xref per page, in page order, or ``None`` when
        the slow path finds a page that does not have exactly one image
        with a non-zero xref.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for idx, page in enumerate(pdf):
        # The progress text has no trailing newline, so flush explicitly;
        # a line-buffered stdout would otherwise hold it back until the
        # final status message is printed.
        print(
            f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...',
            end='',
            flush=True,
        )
        images = page.get_image_info(xrefs=True)
        # NOTE(review): an xref of 0 is rejected here — presumably it marks
        # an image with no extractable object; confirm against PyMuPDF docs.
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])

    print('\nSuccess')
    return xrefs
|
||||||
|
|
||||||
def link_pdf(src, dest, start_index=0):
|
def link_pdf(src, dest, start_index=0):
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
images_by_page = [page.get_images() for page in pdf]
|
xrefs = image_xrefs(pdf)
|
||||||
if all(len(images) == 1 for images in images_by_page):
|
if xrefs is None:
|
||||||
dest.mkdir(parents=True, exist_ok=True)
|
|
||||||
for (idx, images) in enumerate(images_by_page, start=start_index):
|
|
||||||
xref = images[0][0]
|
|
||||||
image = pdf.extract_image(xref)
|
|
||||||
file_path = dest / f'{idx:04d}.{image["ext"]}'
|
|
||||||
with open(file_path, 'wb') as f:
|
|
||||||
f.write(image["image"])
|
|
||||||
else:
|
|
||||||
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
||||||
|
return
|
||||||
|
|
||||||
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
for (idx, xref) in enumerate(xrefs, start=start_index):
|
||||||
|
image = pdf.extract_image(xref)
|
||||||
|
file_path = dest / f'{idx:04d}.{image["ext"]}'
|
||||||
|
with open(file_path, 'wb') as f:
|
||||||
|
f.write(image["image"])
|
||||||
|
|
||||||
def complete_prefix_number_ordering(entries):
|
def complete_prefix_number_ordering(entries):
|
||||||
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
|
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
|
||||||
|
|
Loading…
Reference in a new issue