Handle the case where the PDF is in a folder called PDF instead of sitting directly in the directory

This commit is contained in:
xenofem 2024-02-07 19:11:37 -05:00
parent 9ff18f933b
commit c24c811115

View file

@ -338,9 +338,12 @@ def ls_ignore(directory):
if not ignoreable(path) if not ignoreable(path)
] ]
def descendant_files_ignore(directory): def descendant_files_ignore(path):
if path.is_file():
return [path]
result = [] result = []
for item in ls_ignore(directory): for item in ls_ignore(path):
if item.is_dir(): if item.is_dir():
result.extend(descendant_files_ignore(item)) result.extend(descendant_files_ignore(item))
else: else:
@ -389,7 +392,7 @@ def collate(args):
collation_staging_area.rmdir() collation_staging_area.rmdir()
con.close() con.close()
def collate_split_regex(srcs, dest, start_index, earlier=None, later=None): def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
early_srcs = [] early_srcs = []
middle_srcs = [] middle_srcs = []
late_srcs = [] late_srcs = []
@ -443,6 +446,51 @@ def median(items):
items.sort() items.sort()
return items[len(items) // 2] return items[len(items) // 2]
def superior_or_equal(a, b):
    """Return True if `a` is at least as long as `b` and dominates it element-wise.

    Each of the first ``len(b)`` elements of `a` must be ``>=`` the element of
    `b` at the same position. Extra trailing elements of `a` are ignored.
    If `a` is shorter than `b` the result is False.
    """
    # The length guard runs first, so zip() (which stops at the shorter input)
    # is guaranteed to walk every element of b.
    return len(a) >= len(b) and all(x >= y for x, y in zip(a, b))
def try_collate_images_vs_pdf(srcs, dest, start_index):
    """Try to collate from either a single PDF source or the images beside it.

    The strategy applies only when:

    * exactly one entry of `srcs` has "pdf" (case-insensitive) in its name --
      this may be a PDF file itself or a directory such as ``PDF/``;
    * that entry yields exactly one file with a ``.pdf`` suffix;
    * every other file found under the remaining sources has a suffix in
      ``IMAGE_FILE_EXTENSIONS`` and there is at least one such file;
    * the number of sizes reported for the PDF and the number of standalone
      images differ by at most 2.

    The median size of each set is then compared with `superior_or_equal`,
    and whichever side is at least as large is handed to `collate_from_paths`
    (the standalone images win when both directions hold).

    Returns:
        ``False`` if the strategy does not apply or neither side dominates;
        otherwise whatever `collate_from_paths` returns for the chosen
        sources. Callers distinguish "not applicable" by comparing the result
        against ``False``, so a bare falsy value such as ``None`` must not be
        substituted here.
    """
    pdf_candidates = [src for src in srcs if 'pdf' in src.name.lower()]
    if len(pdf_candidates) != 1:
        return False
    outer_pdf = pdf_candidates[0]

    # The matched source may be a directory wrapping the real PDF, so look
    # through its descendant files for the single actual .pdf.
    inner_pdfs = [
        f for f in descendant_files_ignore(outer_pdf)
        if f.suffix.lower() == '.pdf'
    ]
    if len(inner_pdfs) != 1:
        return False
    inner_pdf = inner_pdfs[0]

    non_pdf_srcs = [src for src in srcs if src != outer_pdf]
    images = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
    # Bail out if there is nothing to compare against, or if anything other
    # than an image sits next to the PDF.
    if not images or any(
        f.suffix.lower() not in IMAGE_FILE_EXTENSIONS for f in images
    ):
        return False

    pdf_sizes = pdf_image_sizes(inner_pdf)
    # Compare counts before measuring each standalone image: the number of
    # standalone sizes always equals len(images), so a mismatch can be
    # rejected without probing every image.
    if abs(len(pdf_sizes) - len(images)) > 2:
        return False
    standalone_sizes = [standalone_image_size(f) for f in images]

    median_pdf_size = median(pdf_sizes)
    median_standalone_size = median(standalone_sizes)
    if not (median_pdf_size and median_standalone_size):
        return False

    if superior_or_equal(median_standalone_size, median_pdf_size):
        return collate_from_paths(non_pdf_srcs, dest, start_index)
    if superior_or_equal(median_pdf_size, median_standalone_size):
        return collate_from_paths([outer_pdf], dest, start_index)
    return False
def collate_from_paths(srcs, dest, start_index): def collate_from_paths(srcs, dest, start_index):
if len(srcs) == 1 and srcs[0].is_dir(): if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
@ -462,15 +510,15 @@ def collate_from_paths(srcs, dest, start_index):
if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)): if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
return collate_from_paths([hi_res_dir], dest, start_index) return collate_from_paths([hi_res_dir], dest, start_index)
textless_split = collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
if textless_split != False: if textless_split != False:
return textless_split return textless_split
epilogue_split = collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
if epilogue_split != False: if epilogue_split != False:
return epilogue_split return epilogue_split
cover_split = collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
if cover_split != False: if cover_split != False:
return cover_split return cover_split
@ -483,33 +531,9 @@ def collate_from_paths(srcs, dest, start_index):
else: else:
return None return None
pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf'] images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
if len(pdfs) == 1: if images_vs_pdf != False:
pdf = pdfs[0] return images_vs_pdf
images = []
non_images = []
descendant_files = [
src for src in srcs if src != pdf and src.is_file()
] + [
f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
]
for f in descendant_files:
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
images.append(f)
else:
non_images.append(f)
break
if len(non_images) == 0 and len(images) > 0:
pdf_sizes = pdf_image_sizes(pdf)
standalone_sizes = [standalone_image_size(f) for f in images]
if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
median_pdf_size = median(pdf_sizes)
median_standalone_size = median(standalone_sizes)
if median_pdf_size and median_standalone_size:
if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
return collate_from_paths([pdf], dest, start_index)
return None return None