detect when a PDF has images at the same resolution, just chopped up

This commit is contained in:
xenofem 2024-02-15 19:32:50 -05:00
parent 7a96bc5655
commit 33c6b9fa1f

View file

@ -470,6 +470,12 @@ def median(items):
items.sort() items.sort()
return items[len(items) // 2] return items[len(items) // 2]
def mean(items):
    """Return the arithmetic mean of *items*, or None when there are none.

    Accepts any iterable of numbers, including generators: the input is
    materialized once so that it can be both summed and counted.
    """
    values = list(items)
    if not values:
        # An empty input has no mean; signal that with None rather than
        # raising ZeroDivisionError.
        return None
    return sum(values) / len(values)
def superior_or_equal(a, b):
    """Return True if *a* is at least as long as *b* and dominates it.

    "Dominates" means every element of *a* is >= the element of *b* at the
    same position. Elements of *a* beyond the length of *b* are ignored.
    """
    # The length check runs first, so zip() pairs up exactly len(b) elements.
    return len(a) >= len(b) and all(x >= y for x, y in zip(a, b))
@ -500,14 +506,29 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
pdf_sizes = pdf_image_sizes(inner_pdf) pdf_sizes = pdf_image_sizes(inner_pdf)
standalone_sizes = [standalone_image_size(f) for f in images] standalone_sizes = [standalone_image_size(f) for f in images]
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
return False
median_pdf_size = median(pdf_sizes) median_pdf_size = median(pdf_sizes)
median_standalone_size = median(standalone_sizes) median_standalone_size = median(standalone_sizes)
if not (median_pdf_size and median_standalone_size): if not (median_pdf_size and median_standalone_size):
return False return False
if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
with fitz.open(inner_pdf) as pdf:
pdf_page_count = len(pdf)
height_adjusted_pdf_image_count = (
len(pdf_sizes) *
mean([size[1] for size in pdf_sizes]) / mean([size[1] for size in standalone_sizes])
)
if (
abs(pdf_page_count - len(standalone_sizes)) <= 2 and
len(pdf_sizes) > len(standalone_sizes) and
median_pdf_size[0] == median_standalone_size[0] and
abs(height_adjusted_pdf_image_count - len(standalone_sizes)) <= 2
):
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
else:
return False
if superior_or_equal(median_standalone_size, median_pdf_size): if superior_or_equal(median_standalone_size, median_pdf_size):
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude) return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
elif superior_or_equal(median_pdf_size, median_standalone_size): elif superior_or_equal(median_pdf_size, median_standalone_size):