From 9fea03c270d78746c534efb18000da38abc5fc7e Mon Sep 17 00:00:00 2001 From: xenofem Date: Sat, 2 Mar 2024 18:09:46 -0500 Subject: [PATCH 1/3] add option to convert PDF pages to pixmaps as needed --- dlibrary/dlibrary.py | 71 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index acc3291..fcaf440 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -77,6 +77,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp'] IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store'] IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4'] +PDF_FALLBACK_DPI = 300 + def open_zipfile_with_encoding(path): try: return zipfile.ZipFile(path, metadata_encoding="utf-8") @@ -232,7 +234,7 @@ def collate(args): work_staging_dir = collation_staging_area / work_id - collator = Collator(work_staging_dir, [], args.locale) + collator = Collator(work_staging_dir, [], args) collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)]) if collation_result and collator.index > 0: print(f'Collated {collator.index} pages for {work_id}') @@ -252,10 +254,10 @@ def collate(args): con.close() class Collator: - def __init__(self, dest, exclude, locale): + def __init__(self, dest, exclude, args): self.dest = dest self.exclude = exclude - self.locale = locale + self.args = args self.index = 0 def collate_from_paths(self, srcs): @@ -309,14 +311,13 @@ class Collator: def link_pdf(self, src): with fitz.open(src) as pdf: - xrefs = image_xrefs(pdf) - if xrefs is None: + images = pdf_images(pdf, self.args.force_convert_pdf) + if images is None: print(f'Support for weirder PDFs not yet implemented, skipping {src}') return None self.dest.mkdir(parents=True, exist_ok=True) - for (idx, xref) in enumerate(xrefs, start=self.index): - image = pdf.extract_image(xref) + for (idx, image) in enumerate(images, start=self.index): file_path = self.dest / f'{idx:04d}.{image["ext"]}' with open(file_path, 'wb') as f: f.write(image["image"]) @@ -422,34 +423,53 @@ class Collator: return False def try_collate_select_language(self, srcs): - if self.locale not in LANGUAGE_REGEXES: + if self.args.locale not in LANGUAGE_REGEXES: return False if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs): return False - srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.locale].search(nname(src))] + srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))] if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0: return False return self.collate_from_paths(srcs_matching_language) -def image_xrefs(pdf): +def pdf_images(pdf, force=False): images_by_page = [page.get_images() for page in pdf] if all(len(images) == 1 for images in images_by_page): - return [images[0][0] for images in images_by_page] + return (pdf.extract_image(images[0][0]) for images in images_by_page) print("Checking PDF images the quick way failed, trying the slow way") - xrefs = [] - for (idx, page) in enumerate(pdf): - print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='') - images = page.get_image_info(xrefs=True) - if len(images) != 1 or images[0]['xref'] == 0: - print('\nFailed') - return None - xrefs.append(images[0]['xref']) + def xref_or_image_generator(): + xref_mode = not force + for (idx, page) in enumerate(pdf): + page_images = page.get_image_info(xrefs=True) + if len(page_images) == 1 and page_images[0]['xref'] != 0: + xref = page_images[0]['xref'] + if xref_mode: + yield xref + else: + yield pdf.extract_image(xref) + else: + if xref_mode: + raise ValueError + else: + print(f'\nGenerating pixmap for page {idx+1}') + pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI) + yield { 'ext': 'png', 'image': pix.tobytes('png') } + print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='') + print('') - print('\nSuccess') - return xrefs + if force: + return xref_or_image_generator() + + try: + xrefs = list(xref_or_image_generator()) + except ValueError: + print('\nFailed') + return None + print('Success') + return (pdf.extract_image(xref) for xref in xrefs) def nfc(s): return unicodedata.normalize('NFC', s) @@ -706,7 +726,7 @@ def manual_collate(args): work_staging_dir = collation_staging_area / work_id work_staging_dir.mkdir(parents=True) - collator = Collator(work_staging_dir, exclusions, args.locale) + collator = Collator(work_staging_dir, exclusions, args) for group in groups: collation_result = collator.collate_from_paths([item for item in group if item not in exclusions]) if collation_result is None: @@ -1002,7 +1022,7 @@ parser_collate.add_argument( nargs='*', help='paths within extraction folders as collation hints' ) -parser_collate.set_defaults(func=collate) +parser_collate.set_defaults(func=collate, force_convert_pdf=False) parser_manual_collate = subparsers.add_parser( 'manual-collate', @@ -1042,6 +1062,11 @@ parser_manual_collate = subparsers.add_parser( only handle one work at a time. """), ) +parser_manual_collate.add_argument( + '--force-convert-pdf', + action='store_true', + help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", +) parser_manual_collate.add_argument( 'expression', nargs='+', From c042163e85dec7ecf4ce29d3e71fd417c795152b Mon Sep 17 00:00:00 2001 From: xenofem Date: Sat, 2 Mar 2024 18:10:22 -0500 Subject: [PATCH 2/3] properly handle edge case when we point collate or manual-collate directly at an extraction directory --- dlibrary/dlibrary.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index fcaf440..132e7b2 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -208,12 +208,15 @@ def fetch(args): asyncio.run(fetch_async(args)) +def self_and_parents(path): + return [path] + list(path.parents) + def collate(args): con = sqlite3.connect(args.destdir / 'meta.db') cur = con.cursor() extraction_dir = args.destdir / 'extract' - hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints} + hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints} collation_staging_area = args.destdir / 'site' / 'images-staging' collation_staging_area.mkdir(parents=True) @@ -697,7 +700,7 @@ def manual_collate(args): extraction_dir = args.destdir / 'extract' sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group) - work_id = Path(relpath(sample_path, extraction_dir)).parents[-2].name + work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions] From 7535cb6162b1a4802c25a14ff9e85ee7712939df Mon Sep 17 00:00:00 2001 From: xenofem Date: Sat, 2 Mar 2024 18:27:15 -0500 Subject: [PATCH 3/3] also check whether PDFs have text alongside images --- dlibrary/dlibrary.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 132e7b2..864d6b6 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -437,17 +437,21 @@ class Collator: return self.collate_from_paths(srcs_matching_language) +def is_single_image(page): + blocks = page.get_text('blocks') + return len(blocks) == 1 and blocks[0][6] == 1 + def pdf_images(pdf, force=False): - images_by_page = [page.get_images() for page in pdf] - if all(len(images) == 1 for images in images_by_page): - return (pdf.extract_image(images[0][0]) for images in images_by_page) + images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] + if all(len(images) == 1 and single for (images, single) in images_by_page): + return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page) print("Checking PDF images the quick way failed, trying the slow way") def xref_or_image_generator(): xref_mode = not force for (idx, page) in enumerate(pdf): page_images = page.get_image_info(xrefs=True) - if len(page_images) == 1 and page_images[0]['xref'] != 0: + if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page): xref = page_images[0]['xref'] if xref_mode: yield xref