identify PDF page image xrefs in a *much* faster and less stupid way than PyMuPDF does it
This commit is contained in:
parent
5378899b2e
commit
435af20e59
|
@ -88,6 +88,9 @@ IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
||||||
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
|
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
|
||||||
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
|
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
|
||||||
|
|
||||||
|
# Matches "/<name>", then whitespace, then the token "Do", with the whole match bounded by
# whitespace or the string boundary on both sides. The name (without its leading slash) is
# captured as the "ref_name" group. No re.M flag is passed, so ^ and $ anchor to the string
# as a whole rather than to each line.
PDF_REFERENCED_IMAGE_REGEX = re.compile(r'(^|(?<=\s))/(?P<ref_name>\S+)\s+Do($|(?=\s))')
|
||||||
|
# Matches any of the tokens BI, ID or EI standing alone, i.e. bounded by whitespace or the
# string boundary. get_displayed_image_xref() treats a match in a page's content stream as
# evidence of an inline image.
PDF_INLINE_IMAGE_REGEX = re.compile(r'(^|\s)(BI|ID|EI)($|\s)')
|
||||||
|
|
||||||
debug_mode = False
|
debug_mode = False
|
||||||
def debug(s):
|
def debug(s):
|
||||||
if debug_mode:
|
if debug_mode:
|
||||||
|
@ -575,6 +578,33 @@ def extract_image(pdf, xref):
|
||||||
pix = fitz.Pixmap(pdf, xref)
|
pix = fitz.Pixmap(pdf, xref)
|
||||||
return { 'ext': 'png', 'image': pix.tobytes('png') }
|
return { 'ext': 'png', 'image': pix.tobytes('png') }
|
||||||
|
|
||||||
|
def get_displayed_image_xref(page):
    """Find the one image a page references by scanning its content streams.

    Every content stream of ``page`` is decoded as ASCII (undecodable bytes
    are replaced) and searched with the module's PDF regexes.

    Returns element 0 of the single ``page.get_images()`` entry whose
    element 7 equals the referenced name. Returns ``None`` (after a
    ``debug`` message) when:

    * any content stream matches ``PDF_INLINE_IMAGE_REGEX``;
    * the streams contain zero, or more than one, match of
      ``PDF_REFERENCED_IMAGE_REGEX`` in total;
    * zero, or more than one, ``page.get_images()`` entry carries the
      referenced name.
    """
    ref_names = []
    for stream_xref in page.get_contents():
        raw = page.parent.xref_stream(stream_xref)
        text = raw.decode('ascii', 'replace')

        # An inline image cannot be resolved to an entry of get_images(),
        # so give up on the whole page as soon as one is seen.
        if PDF_INLINE_IMAGE_REGEX.search(text) is not None:
            debug('Inline image detected')
            return None

        ref_names.extend(
            hit.group('ref_name')
            for hit in PDF_REFERENCED_IMAGE_REGEX.finditer(text)
        )

    if not ref_names:
        debug('Page does not reference any xobjects')
        return None
    if len(ref_names) != 1:
        debug(f'Page references multiple xobjects: {ref_names}')
        return None

    # Keep only the entries whose name field (index 7) is the referenced name.
    image_xrefs = []
    for entry in page.get_images():
        if entry[7] == ref_names[0]:
            image_xrefs.append(entry[0])

    if len(image_xrefs) == 1:
        return image_xrefs[0]

    if image_xrefs:
        debug(f"Multiple images found matching ref name {ref_names[0]}, that probably shouldn't happen")
    else:
        debug(f'No images found matching ref name {ref_names[0]}')
    return None
|
||||||
|
|
||||||
def display_sixel_page(page):
|
def display_sixel_page(page):
|
||||||
s = BytesIO()
|
s = BytesIO()
|
||||||
image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png')))
|
image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png')))
|
||||||
|
@ -615,23 +645,15 @@ def display_sixel_page(page):
|
||||||
sixel_output_unref(output)
|
sixel_output_unref(output)
|
||||||
|
|
||||||
def pdf_images(pdf, strategy):
|
def pdf_images(pdf, strategy):
|
||||||
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
|
||||||
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
|
||||||
return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
|
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
|
||||||
print(f'0/{pdf.page_count} pages analyzed...', end='')
|
print(f'0/{pdf.page_count} pages analyzed...', end='')
|
||||||
image_extractors = []
|
image_extractors = []
|
||||||
for (idx, page) in enumerate(pdf):
|
for (idx, page) in enumerate(pdf):
|
||||||
page_images = page.get_image_info(xrefs=True)
|
xref = get_displayed_image_xref(page)
|
||||||
if len(page_images) == 1 and page_images[0]['xref'] != 0:
|
|
||||||
xref = page_images[0]['xref']
|
|
||||||
else:
|
|
||||||
xref = None
|
|
||||||
if xref is not None and is_single_image(page):
|
if xref is not None and is_single_image(page):
|
||||||
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
else:
|
else:
|
||||||
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
page_images = page.get_image_info()
|
||||||
|
print(f'\nPage {idx+1}: {len(page_images)} images, {len(relevant_blocks(page))} total relevant objects')
|
||||||
choice = strategy
|
choice = strategy
|
||||||
while True:
|
while True:
|
||||||
if choice.lower().startswith('n'):
|
if choice.lower().startswith('n'):
|
||||||
|
|
Loading…
Reference in a new issue