diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index bfe629a..374386f 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -329,17 +329,8 @@ def link_ordered_files(ordering, dest, start_index): link_path = dest / f'{idx:04d}{ext}' link_path.symlink_to(relpath(src_path, dest)) -def check_extension(path, exts): - return path.suffix.lower() in exts - -def is_pdf(path): - check_extension(path, ['.pdf']) - -def is_image(path): - check_extension(path, IMAGE_FILE_EXTENSIONS) - def ignoreable(path): - return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS) + return path.name in IGNOREABLE_FILES or path.suffix.lower() in IGNOREABLE_EXTENSIONS def ls_ignore(directory): return [ @@ -347,12 +338,9 @@ def ls_ignore(directory): if not ignoreable(path) ] -def descendant_files_ignore(path): - if path.is_file(): - return [path] - +def descendant_files_ignore(directory): result = [] - for item in ls_ignore(path): + for item in ls_ignore(directory): if item.is_dir(): result.extend(descendant_files_ignore(item)) else: @@ -401,7 +389,7 @@ def collate(args): collation_staging_area.rmdir() con.close() -def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None): +def collate_split_regex(srcs, dest, start_index, earlier=None, later=None): early_srcs = [] middle_srcs = [] late_srcs = [] @@ -455,56 +443,11 @@ def median(items): items.sort() return items[len(items) // 2] -def superior_or_equal(a, b): - return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) - -def try_collate_images_vs_pdf(srcs, dest, start_index): - pdfs = [src for src in srcs if 'pdf' in src.name.lower()] - if len(pdfs) != 1: - return False - outer_pdf = pdfs[0] - - inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)] - if len(inner_pdfs) != 1: - return False - inner_pdf = inner_pdfs[0] - - non_pdf_srcs = [src for src in srcs if src != outer_pdf] - images = [] - non_images = [] - descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)] - for f in descendant_files: - if is_image(f): - images.append(f) - else: - non_images.append(f) - break - - if len(non_images) != 0 or len(images) == 0: - return False - - pdf_sizes = pdf_image_sizes(inner_pdf) - standalone_sizes = [standalone_image_size(f) for f in images] - if abs(len(pdf_sizes) - len(standalone_sizes)) > 2: - return False - - median_pdf_size = median(pdf_sizes) - median_standalone_size = median(standalone_sizes) - if not (median_pdf_size and median_standalone_size): - return False - - if superior_or_equal(median_standalone_size, median_pdf_size): - return collate_from_paths(non_pdf_srcs, dest, start_index) - elif superior_or_equal(median_pdf_size, median_standalone_size): - return collate_from_paths([outer_pdf], dest, start_index) - else: - return False - def collate_from_paths(srcs, dest, start_index): if len(srcs) == 1 and srcs[0].is_dir(): return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) - if len(srcs) == 1 and is_pdf(srcs[0]): + if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf': print(f'Extracting images from {srcs[0]}') return link_pdf(srcs[0], dest, start_index) @@ -519,19 +462,19 @@ def collate_from_paths(srcs, dest, start_index): if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)): return collate_from_paths([hi_res_dir], dest, start_index) - textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) + textless_split = collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) if textless_split != False: return textless_split - epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) + epilogue_split = collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) if epilogue_split != False: return epilogue_split - cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) + cover_split = collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) if cover_split != False: return cover_split - if all(src.is_file() and is_image(src) for src in srcs): + if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs): ordering = complete_prefix_number_ordering(srcs) if ordering: print(f'Symlinking image files: {ordering[0]}...') @@ -540,9 +483,33 @@ def collate_from_paths(srcs, dest, start_index): else: return None - images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index) - if images_vs_pdf != False: - return images_vs_pdf + pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf'] + if len(pdfs) == 1: + pdf = pdfs[0] + images = [] + non_images = [] + descendant_files = [ + src for src in srcs if src != pdf and src.is_file() + ] + [ + f for src in srcs if src.is_dir() for f in descendant_files_ignore(src) + ] + for f in descendant_files: + if f.suffix.lower() in IMAGE_FILE_EXTENSIONS: + images.append(f) + else: + non_images.append(f) + break + if len(non_images) == 0 and len(images) > 0: + pdf_sizes = pdf_image_sizes(pdf) + standalone_sizes = [standalone_image_size(f) for f in images] + if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2: + median_pdf_size = median(pdf_sizes) + median_standalone_size = median(standalone_sizes) + if median_pdf_size and median_standalone_size: + if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]: + return collate_from_paths([src for src in srcs if src != pdf], dest, start_index) + if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]: + return collate_from_paths([pdf], dest, start_index) return None @@ -567,17 +534,17 @@ def manual_collate(args): index = 0 for path in args.paths: if path.is_dir(): - entries = [p for p in path.iterdir() if p.is_file() and is_image(p)] + entries = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS] ordering = complete_prefix_number_ordering(entries) if ordering is None: ordering = entries ordering.sort() link_ordered_files(ordering, collation_dir, index) index += len(ordering) - elif is_image(path): + elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS: link_ordered_files([path], collation_dir, index) index += 1 - elif is_pdf(path): + elif path.suffix.lower() == ".pdf": pdf_page_count = link_pdf(path, collation_dir, index) if pdf_page_count is None: return @@ -596,10 +563,10 @@ def analyze(args): for f in files: print(f'{relpath(f, extract_dir)}', end='') - if is_image(f): + if f.suffix.lower() in IMAGE_FILE_EXTENSIONS: size = standalone_image_size(f) print(f'\t{fmt_size(size)}') - elif is_pdf(f): + elif f.suffix.lower() == '.pdf': sizes = pdf_image_sizes(f) if len(sizes) == 0: print(f'\tContains no images')