From 33c6b9fa1f6ee55bfad6d44c9a53bb82e4ac13ac Mon Sep 17 00:00:00 2001
From: xenofem
Date: Thu, 15 Feb 2024 19:32:50 -0500
Subject: [PATCH] detect when a PDF has images at the same resolution, just chopped up

---
Reviewer notes (free-form area; not part of the commit message):

NOTE(review): line breaks and indentation in this patch were restored from a
whitespace-collapsed copy. Confirm the context lines against
dlibrary/dlibrary.py before applying.

* Adds mean(items): returns None for an empty list, otherwise
  sum(items) / len(items).
* try_collate_images_vs_pdf: the "image counts differ by more than 2" check
  used to return False immediately. It now runs after the median-size check
  and returns collate_from_paths(non_pdf_srcs, ...) when ALL of the following
  hold, and False otherwise:
    1. the PDF page count (len() of the document opened with fitz.open) is
       within 2 of the number of standalone images;
    2. the PDF holds more images than there are standalone images;
    3. the two median sizes agree on element [0] (presumably the width;
       confirm against pdf_image_sizes / standalone_image_size);
    4. len(pdf_sizes) scaled by mean(size[1]) over the PDF images divided by
       mean(size[1]) over the standalone images is within 2 of the number of
       standalone images (size[1] is presumably the height, going by the
       name height_adjusted_pdf_image_count).
* The mean() calls cannot receive an empty list here: the preceding
  "if not (median_pdf_size and median_standalone_size)" guard has already
  returned False in that case, assuming median() yields a falsy value for an
  empty list (its body is not fully visible in this patch).

 dlibrary/dlibrary.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index af96bc4..ff19e73 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -470,6 +470,12 @@ def median(items):
     items.sort()
     return items[len(items) // 2]
 
+def mean(items):
+    if len(items) == 0:
+        return None
+
+    return sum(items) / len(items)
+
 def superior_or_equal(a, b):
     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
 
@@ -500,14 +506,29 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
 
     pdf_sizes = pdf_image_sizes(inner_pdf)
     standalone_sizes = [standalone_image_size(f) for f in images]
-    if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
-        return False
 
     median_pdf_size = median(pdf_sizes)
     median_standalone_size = median(standalone_sizes)
     if not (median_pdf_size and median_standalone_size):
         return False
 
+    if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
+        with fitz.open(inner_pdf) as pdf:
+            pdf_page_count = len(pdf)
+        height_adjusted_pdf_image_count = (
+            len(pdf_sizes) *
+            mean([size[1] for size in pdf_sizes]) / mean([size[1] for size in standalone_sizes])
+        )
+        if (
+            abs(pdf_page_count - len(standalone_sizes)) <= 2 and
+            len(pdf_sizes) > len(standalone_sizes) and
+            median_pdf_size[0] == median_standalone_size[0] and
+            abs(height_adjusted_pdf_image_count - len(standalone_sizes)) <= 2
+        ):
+            return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
+        else:
+            return False
+
     if superior_or_equal(median_standalone_size, median_pdf_size):
         return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
     elif superior_or_equal(median_pdf_size, median_standalone_size):