def image_xrefs(pdf):
    """Return one image xref per page of *pdf*, or None if that is impossible.

    *pdf* is an opened PyMuPDF document: it is iterated page by page and its
    ``page_count`` attribute is read for progress output.

    Two strategies are tried in order:

    1. Quick: ``page.get_images()`` on every page. If each page lists exactly
       one image, the first field of that entry is taken as its xref.
    2. Slow: ``page.get_image_info(xrefs=True)`` on every page, with a
       progress line printed to stdout. The page must report exactly one
       image with a non-zero xref.

    Returns a list of xrefs in page order, or None as soon as a page fails
    the slow check.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for idx, page in enumerate(pdf):
        # flush=True: the line has no trailing newline, so a line-buffered
        # stdout would otherwise hold the progress text back until the end.
        print(
            f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...',
            end='',
            flush=True,
        )
        images = page.get_image_info(xrefs=True)
        # NOTE(review): an xref of 0 presumably marks an image with no
        # extractable object (e.g. inline) -- confirm against PyMuPDF docs.
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])

    print('\nSuccess')
    return xrefs


def link_pdf(src, dest, start_index=0):
    """Extract the single image of each page of the PDF *src* into *dest*.

    *dest* is a ``pathlib``-style directory path; it is created (with
    parents) if needed. Images are written as ``NNNN.<ext>``, where ``NNNN``
    is a zero-padded counter starting at *start_index* and ``<ext>`` is the
    extension reported by ``pdf.extract_image``.

    If the PDF does not have exactly one usable image per page, a message is
    printed and nothing is written.
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return

        dest.mkdir(parents=True, exist_ok=True)
        for idx, xref in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            file_path.write_bytes(image["image"])