diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index c246cc9..25cccb5 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -79,8 +79,6 @@ IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4'] PDF_FALLBACK_DPI = 300 -IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I) - def open_zipfile_with_encoding(path): try: return zipfile.ZipFile(path, metadata_encoding="utf-8") @@ -318,7 +316,7 @@ class Collator: with fitz.open(src) as pdf: images = pdf_images(pdf, self.args.force_convert_pdf) if images is None: - print(f'Failed to enumerate page images in PDF, skipping {src}') + print(f'Support for weirder PDFs not yet implemented, skipping {src}') return None self.dest.mkdir(parents=True, exist_ok=True) @@ -439,22 +437,9 @@ class Collator: return self.collate_from_paths(srcs_matching_language) -def block_is_image(block): - return block[6] == 1 - -def block_text(block): - return block[4] - -def block_relevant(block): - return block_is_image(block) or not IRRELEVANT_PDF_BLOCK_REGEX.search(block_text(block)) - -def relevant_blocks(page): - blocks = page.get_text('blocks') - return [block for block in blocks if block_relevant(block)] - def is_single_image(page): - blocks = relevant_blocks(page) - return len(blocks) == 1 and block_is_image(blocks[0]) + blocks = page.get_text('blocks') + return len(blocks) == 1 and blocks[0][6] == 1 def extract_image(pdf, xref): image = pdf.extract_image(xref) @@ -481,11 +466,10 @@ def pdf_images(pdf, force=False): else: yield extract_image(pdf, xref) else: - print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects') if xref_mode: raise ValueError else: - print(f'Generating pixmap for page {idx+1}') + print(f'\nGenerating pixmap for page {idx+1}') pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI) yield { 'ext': 'png', 'image': pix.tobytes('png') } print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')