From 83a836f9b5fa14c7a567185fe626073be85be21c Mon Sep 17 00:00:00 2001
From: xenofem
Date: Sun, 1 Sep 2024 13:13:14 -0400
Subject: [PATCH] pymupdf TextPage.extractBLOCKS() no longer includes images by default

---
 dlibrary/dlibrary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 049e583..7b97233 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -697,7 +697,7 @@ def block_relevant(block):
     return block_is_image(block) or not IRRELEVANT_PDF_BLOCK_REGEX.search(block_text(block))
 
 def relevant_blocks(page):
-    blocks = page.get_text('blocks')
+    blocks = page.get_text('blocks', flags=(fitz.TEXTFLAGS_BLOCKS | fitz.TEXT_PRESERVE_IMAGES))
     return [block for block in blocks if block_relevant(block)]
 
 def is_single_image(page):