pymupdf TextPage.extractBLOCKS() no longer includes images by default

This commit is contained in:
xenofem 2024-09-01 13:13:14 -04:00
parent 1d798476c9
commit 83a836f9b5

View file

@@ -697,7 +697,7 @@ def block_relevant(block):
     return block_is_image(block) or not IRRELEVANT_PDF_BLOCK_REGEX.search(block_text(block))
 
 def relevant_blocks(page):
-    blocks = page.get_text('blocks')
+    blocks = page.get_text('blocks', flags=(fitz.TEXTFLAGS_BLOCKS | fitz.TEXT_PRESERVE_IMAGES))
     return [block for block in blocks if block_relevant(block)]
 
 def is_single_image(page):