detect when a PDF has images at the same resolution, just chopped up
This commit is contained in:
parent
7a96bc5655
commit
33c6b9fa1f
|
@ -470,6 +470,12 @@ def median(items):
|
|||
items.sort()
|
||||
return items[len(items) // 2]
|
||||
|
||||
def mean(items):
    """Return the arithmetic mean of *items*, or None when it is empty.

    *items* must support ``len()`` and ``sum()`` (e.g. a list or tuple of
    numbers). None is returned for an empty input instead of raising
    ZeroDivisionError.
    """
    if not items:
        return None

    count = len(items)
    return sum(items) / count
|
||||
|
||||
def superior_or_equal(a, b):
    """Return True when *a* is element-wise greater than or equal to *b*.

    *a* must have at least as many elements as *b*; only the first
    ``len(b)`` positions are compared, and any extra trailing elements of
    *a* are ignored. Two empty sequences compare as True.
    """
    # The length check comes first so a shorter `a` is rejected outright;
    # zip() would otherwise silently stop at the end of `a`.
    if len(a) < len(b):
        return False
    return all(x >= y for x, y in zip(a, b))
|
||||
|
||||
|
@ -500,14 +506,29 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
|
|||
|
||||
pdf_sizes = pdf_image_sizes(inner_pdf)
|
||||
standalone_sizes = [standalone_image_size(f) for f in images]
|
||||
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
|
||||
return False
|
||||
|
||||
median_pdf_size = median(pdf_sizes)
|
||||
median_standalone_size = median(standalone_sizes)
|
||||
if not (median_pdf_size and median_standalone_size):
|
||||
return False
|
||||
|
||||
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
|
||||
with fitz.open(inner_pdf) as pdf:
|
||||
pdf_page_count = len(pdf)
|
||||
height_adjusted_pdf_image_count = (
|
||||
len(pdf_sizes) *
|
||||
mean([size[1] for size in pdf_sizes]) / mean([size[1] for size in standalone_sizes])
|
||||
)
|
||||
if (
|
||||
abs(pdf_page_count - len(standalone_sizes)) <= 2 and
|
||||
len(pdf_sizes) > len(standalone_sizes) and
|
||||
median_pdf_size[0] == median_standalone_size[0] and
|
||||
abs(height_adjusted_pdf_image_count - len(standalone_sizes)) <= 2
|
||||
):
|
||||
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
|
||||
else:
|
||||
return False
|
||||
|
||||
if superior_or_equal(median_standalone_size, median_pdf_size):
|
||||
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
|
||||
elif superior_or_equal(median_pdf_size, median_standalone_size):
|
||||
|
|
Loading…
Reference in a new issue