also check whether PDFs have text alongside images
This commit is contained in:
parent
c042163e85
commit
7535cb6162
|
@ -437,17 +437,21 @@ class Collator:
|
||||||
|
|
||||||
return self.collate_from_paths(srcs_matching_language)
|
return self.collate_from_paths(srcs_matching_language)
|
||||||
|
|
||||||
|
def is_single_image(page):
    """Return True when the page's block listing is exactly one type-1 block.

    The page is asked for its content via ``page.get_text('blocks')``.
    The result is True only if that listing has exactly one entry and the
    entry's element at index 6 equals 1.

    NOTE(review): ``page`` is presumably a PyMuPDF ``Page``; in PyMuPDF's
    "blocks" output index 6 is the block type and 1 marks an image block
    (0 marks text) -- confirm against the PyMuPDF version in use.
    """
    page_blocks = page.get_text('blocks')

    # Anything other than exactly one block (including an empty page)
    # cannot be a lone image.
    if len(page_blocks) != 1:
        return False

    sole_block = page_blocks[0]
    return sole_block[6] == 1
|
||||||
|
|
||||||
def pdf_images(pdf, force=False):
|
def pdf_images(pdf, force=False):
|
||||||
images_by_page = [page.get_images() for page in pdf]
|
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
||||||
if all(len(images) == 1 for images in images_by_page):
|
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
||||||
return (pdf.extract_image(images[0][0]) for images in images_by_page)
|
return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page)
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
print("Checking PDF images the quick way failed, trying the slow way")
|
||||||
def xref_or_image_generator():
|
def xref_or_image_generator():
|
||||||
xref_mode = not force
|
xref_mode = not force
|
||||||
for (idx, page) in enumerate(pdf):
|
for (idx, page) in enumerate(pdf):
|
||||||
page_images = page.get_image_info(xrefs=True)
|
page_images = page.get_image_info(xrefs=True)
|
||||||
if len(page_images) == 1 and page_images[0]['xref'] != 0:
|
if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
|
||||||
xref = page_images[0]['xref']
|
xref = page_images[0]['xref']
|
||||||
if xref_mode:
|
if xref_mode:
|
||||||
yield xref
|
yield xref
|
||||||
|
|
Loading…
Reference in a new issue