From c24c811115368dd1a2549fc063c5694d3f273dee Mon Sep 17 00:00:00 2001 From: xenofem Date: Wed, 7 Feb 2024 19:11:37 -0500 Subject: [PATCH] handle when the pdf is in a folder called PDF instead of being right there in the directory --- dlibrary/dlibrary.py | 90 ++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 374386f..b637bc7 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -338,9 +338,12 @@ def ls_ignore(directory): if not ignoreable(path) ] -def descendant_files_ignore(directory): +def descendant_files_ignore(path): + if path.is_file(): + return [path] + result = [] - for item in ls_ignore(directory): + for item in ls_ignore(path): if item.is_dir(): result.extend(descendant_files_ignore(item)) else: @@ -389,7 +392,7 @@ def collate(args): collation_staging_area.rmdir() con.close() -def collate_split_regex(srcs, dest, start_index, earlier=None, later=None): +def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None): early_srcs = [] middle_srcs = [] late_srcs = [] @@ -443,6 +446,51 @@ def median(items): items.sort() return items[len(items) // 2] +def superior_or_equal(a, b): + return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) + +def try_collate_images_vs_pdf(srcs, dest, start_index): + pdfs = [src for src in srcs if 'pdf' in src.name.lower()] + if len(pdfs) != 1: + return False + outer_pdf = pdfs[0] + + inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if f.suffix.lower() == '.pdf'] + if len(inner_pdfs) != 1: + return False + inner_pdf = inner_pdfs[0] + + non_pdf_srcs = [src for src in srcs if src != outer_pdf] + images = [] + non_images = [] + descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)] + for f in descendant_files: + if f.suffix.lower() in IMAGE_FILE_EXTENSIONS: + images.append(f) + else: + non_images.append(f) + break + + if len(non_images) != 0 or len(images) == 0: + return False + + pdf_sizes = pdf_image_sizes(inner_pdf) + standalone_sizes = [standalone_image_size(f) for f in images] + if abs(len(pdf_sizes) - len(standalone_sizes)) > 2: + return False + + median_pdf_size = median(pdf_sizes) + median_standalone_size = median(standalone_sizes) + if not (median_pdf_size and median_standalone_size): + return False + + if superior_or_equal(median_standalone_size, median_pdf_size): + return collate_from_paths(non_pdf_srcs, dest, start_index) + elif superior_or_equal(median_pdf_size, median_standalone_size): + return collate_from_paths([outer_pdf], dest, start_index) + else: + return False + def collate_from_paths(srcs, dest, start_index): if len(srcs) == 1 and srcs[0].is_dir(): return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) @@ -462,15 +510,15 @@ def collate_from_paths(srcs, dest, start_index): if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)): return collate_from_paths([hi_res_dir], dest, start_index) - textless_split = collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) + textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) if textless_split != False: return textless_split - epilogue_split = collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) + epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) if epilogue_split != False: return epilogue_split - cover_split = collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) + cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) if cover_split != False: return cover_split @@ -483,33 +531,9 @@ def collate_from_paths(srcs, dest, start_index): else: return None - pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf'] - if len(pdfs) == 1: - pdf = pdfs[0] - images = [] - non_images = [] - descendant_files = [ - src for src in srcs if src != pdf and src.is_file() - ] + [ - f for src in srcs if src.is_dir() for f in descendant_files_ignore(src) - ] - for f in descendant_files: - if f.suffix.lower() in IMAGE_FILE_EXTENSIONS: - images.append(f) - else: - non_images.append(f) - break - if len(non_images) == 0 and len(images) > 0: - pdf_sizes = pdf_image_sizes(pdf) - standalone_sizes = [standalone_image_size(f) for f in images] - if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2: - median_pdf_size = median(pdf_sizes) - median_standalone_size = median(standalone_sizes) - if median_pdf_size and median_standalone_size: - if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]: - return collate_from_paths([src for src in srcs if src != pdf], dest, start_index) - if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]: - return collate_from_paths([pdf], dest, start_index) + images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index) + if images_vs_pdf != False: + return images_vs_pdf return None