detect when a PDF has images at the same resolution, just chopped up
This commit is contained in:
parent
7a96bc5655
commit
33c6b9fa1f
|
@ -470,6 +470,12 @@ def median(items):
|
||||||
items.sort()
|
items.sort()
|
||||||
return items[len(items) // 2]
|
return items[len(items) // 2]
|
||||||
|
|
||||||
|
def mean(items):
    """Return the arithmetic mean of *items*, or ``None`` if it is empty.

    *items* must be a sized collection of numbers (it is passed to both
    ``sum`` and ``len``). ``None`` is returned for an empty collection
    instead of raising ``ZeroDivisionError``, so callers must check for it
    before using the result in arithmetic.
    """
    if not items:
        return None

    return sum(items) / len(items)
|
||||||
|
|
||||||
def superior_or_equal(a, b):
    """Return True if *a* is element-wise greater than or equal to *b*.

    *a* must have at least as many elements as *b*; any extra trailing
    elements of *a* are ignored. Each element of *b* is compared against
    the element of *a* at the same position.
    """
    # The length check comes first so that zip (which stops at the shorter
    # input) always covers every element of b.
    return len(a) >= len(b) and all(x >= y for x, y in zip(a, b))
|
||||||
|
|
||||||
|
@ -500,14 +506,29 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
|
||||||
|
|
||||||
pdf_sizes = pdf_image_sizes(inner_pdf)
|
pdf_sizes = pdf_image_sizes(inner_pdf)
|
||||||
standalone_sizes = [standalone_image_size(f) for f in images]
|
standalone_sizes = [standalone_image_size(f) for f in images]
|
||||||
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
|
|
||||||
return False
|
|
||||||
|
|
||||||
median_pdf_size = median(pdf_sizes)
|
median_pdf_size = median(pdf_sizes)
|
||||||
median_standalone_size = median(standalone_sizes)
|
median_standalone_size = median(standalone_sizes)
|
||||||
if not (median_pdf_size and median_standalone_size):
|
if not (median_pdf_size and median_standalone_size):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
|
||||||
|
with fitz.open(inner_pdf) as pdf:
|
||||||
|
pdf_page_count = len(pdf)
|
||||||
|
height_adjusted_pdf_image_count = (
|
||||||
|
len(pdf_sizes) *
|
||||||
|
mean([size[1] for size in pdf_sizes]) / mean([size[1] for size in standalone_sizes])
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
abs(pdf_page_count - len(standalone_sizes)) <= 2 and
|
||||||
|
len(pdf_sizes) > len(standalone_sizes) and
|
||||||
|
median_pdf_size[0] == median_standalone_size[0] and
|
||||||
|
abs(height_adjusted_pdf_image_count - len(standalone_sizes)) <= 2
|
||||||
|
):
|
||||||
|
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
if superior_or_equal(median_standalone_size, median_pdf_size):
|
if superior_or_equal(median_standalone_size, median_pdf_size):
|
||||||
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
|
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
|
||||||
elif superior_or_equal(median_pdf_size, median_standalone_size):
|
elif superior_or_equal(median_pdf_size, median_standalone_size):
|
||||||
|
|
Loading…
Reference in a new issue