Compare commits
2 commits
9ff18f933b
...
9c6328659f
Author | SHA1 | Date | |
---|---|---|---|
xenofem | 9c6328659f | ||
xenofem | c24c811115 |
|
@ -329,8 +329,17 @@ def link_ordered_files(ordering, dest, start_index):
|
|||
link_path = dest / f'{idx:04d}{ext}'
|
||||
link_path.symlink_to(relpath(src_path, dest))
|
||||
|
||||
def check_extension(path, exts):
    """Tell whether the suffix of ``path`` is one of ``exts``.

    The suffix is lowercased before the membership test, so the entries
    of ``exts`` are expected to be lowercase and to include the leading
    dot (for example ``'.pdf'``).
    """
    lowered_suffix = path.suffix.lower()
    return lowered_suffix in exts
|
||||
|
||||
def is_pdf(path):
    """Return True when ``path`` has a ``.pdf`` suffix (case-insensitive).

    The result of ``check_extension`` must be returned explicitly;
    without the ``return`` this predicate always evaluated to ``None``
    and every caller treated every path as "not a PDF".
    """
    return check_extension(path, ['.pdf'])
|
||||
|
||||
def is_image(path):
    """Return True when the suffix of ``path`` is in ``IMAGE_FILE_EXTENSIONS``.

    The comparison is delegated to ``check_extension``, which lowercases
    the suffix first. The result must be returned explicitly; without the
    ``return`` this predicate always evaluated to ``None`` and every
    caller treated every path as "not an image".
    """
    return check_extension(path, IMAGE_FILE_EXTENSIONS)
|
||||
|
||||
def ignoreable(path):
    """Return True when ``path`` should be skipped.

    A path is skipped when its file name is listed in ``IGNOREABLE_FILES``
    or when its lowercased suffix is in ``IGNOREABLE_EXTENSIONS``.
    """
    # Single return via the shared helper; the earlier duplicate return
    # performed the same suffix test inline and left this one unreachable.
    return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)
|
||||
|
||||
def ls_ignore(directory):
|
||||
return [
|
||||
|
@ -338,9 +347,12 @@ def ls_ignore(directory):
|
|||
if not ignoreable(path)
|
||||
]
|
||||
|
||||
def descendant_files_ignore(directory):
|
||||
def descendant_files_ignore(path):
|
||||
if path.is_file():
|
||||
return [path]
|
||||
|
||||
result = []
|
||||
for item in ls_ignore(directory):
|
||||
for item in ls_ignore(path):
|
||||
if item.is_dir():
|
||||
result.extend(descendant_files_ignore(item))
|
||||
else:
|
||||
|
@ -389,7 +401,7 @@ def collate(args):
|
|||
collation_staging_area.rmdir()
|
||||
con.close()
|
||||
|
||||
def collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
|
||||
def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
|
||||
early_srcs = []
|
||||
middle_srcs = []
|
||||
late_srcs = []
|
||||
|
@ -443,11 +455,56 @@ def median(items):
|
|||
items.sort()
|
||||
return items[len(items) // 2]
|
||||
|
||||
def superior_or_equal(a, b):
    """Return True when ``a`` is at least as large as ``b`` in every position.

    ``a`` must have at least as many elements as ``b``, and each of the
    first ``len(b)`` elements of ``a`` must be ``>=`` the element of ``b``
    at the same index. Extra trailing elements of ``a`` are ignored.
    """
    # zip() stops at the shorter sequence, which is ``b`` once the length
    # check has passed, so exactly the first len(b) positions are compared.
    return len(a) >= len(b) and all(x >= y for x, y in zip(a, b))
|
||||
|
||||
def try_collate_images_vs_pdf(srcs, dest, start_index):
    """Try to collate ``srcs`` as one PDF source plus a set of loose images.

    The attempt only applies when:

    * exactly one entry of ``srcs`` has ``pdf`` in its name
      (case-insensitive), and the files found under it by
      ``descendant_files_ignore`` include exactly one PDF;
    * every file found under the remaining sources is an image, and there
      is at least one such file;
    * the number of image sizes reported for the PDF and the number of
      standalone images differ by at most 2;
    * both median sizes are truthy.

    The medians are then compared with ``superior_or_equal``: if the
    standalone images win (or tie), the non-PDF sources are collated;
    otherwise, if the PDF wins, the PDF source is collated.

    Returns ``False`` when any of the conditions above fails or neither
    median dominates the other; otherwise returns whatever
    ``collate_from_paths`` returns for the chosen sources.
    """
    pdf_named = [src for src in srcs if 'pdf' in src.name.lower()]
    if len(pdf_named) != 1:
        return False
    (outer_pdf,) = pdf_named

    contained_pdfs = [
        candidate
        for candidate in descendant_files_ignore(outer_pdf)
        if is_pdf(candidate)
    ]
    if len(contained_pdfs) != 1:
        return False
    (inner_pdf,) = contained_pdfs

    non_pdf_srcs = [src for src in srcs if src != outer_pdf]
    loose_files = [
        found
        for src in non_pdf_srcs
        for found in descendant_files_ignore(src)
    ]
    # Give up on the first non-image file, or when there are no files at all.
    if not loose_files or not all(is_image(found) for found in loose_files):
        return False

    pdf_sizes = pdf_image_sizes(inner_pdf)
    standalone_sizes = [standalone_image_size(image) for image in loose_files]
    if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
        return False

    median_pdf_size = median(pdf_sizes)
    median_standalone_size = median(standalone_sizes)
    if not median_pdf_size or not median_standalone_size:
        return False

    # The standalone images are checked first, so they are preferred on a tie.
    if superior_or_equal(median_standalone_size, median_pdf_size):
        return collate_from_paths(non_pdf_srcs, dest, start_index)
    if superior_or_equal(median_pdf_size, median_standalone_size):
        return collate_from_paths([outer_pdf], dest, start_index)
    return False
|
||||
|
||||
def collate_from_paths(srcs, dest, start_index):
|
||||
if len(srcs) == 1 and srcs[0].is_dir():
|
||||
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
|
||||
|
||||
if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf':
|
||||
if len(srcs) == 1 and is_pdf(srcs[0]):
|
||||
print(f'Extracting images from {srcs[0]}')
|
||||
return link_pdf(srcs[0], dest, start_index)
|
||||
|
||||
|
@ -462,19 +519,19 @@ def collate_from_paths(srcs, dest, start_index):
|
|||
if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
|
||||
return collate_from_paths([hi_res_dir], dest, start_index)
|
||||
|
||||
textless_split = collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
|
||||
textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
|
||||
if textless_split != False:
|
||||
return textless_split
|
||||
|
||||
epilogue_split = collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
|
||||
epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
|
||||
if epilogue_split != False:
|
||||
return epilogue_split
|
||||
|
||||
cover_split = collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
|
||||
cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
|
||||
if cover_split != False:
|
||||
return cover_split
|
||||
|
||||
if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs):
|
||||
if all(src.is_file() and is_image(src) for src in srcs):
|
||||
ordering = complete_prefix_number_ordering(srcs)
|
||||
if ordering:
|
||||
print(f'Symlinking image files: {ordering[0]}...')
|
||||
|
@ -483,33 +540,9 @@ def collate_from_paths(srcs, dest, start_index):
|
|||
else:
|
||||
return None
|
||||
|
||||
pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
|
||||
if len(pdfs) == 1:
|
||||
pdf = pdfs[0]
|
||||
images = []
|
||||
non_images = []
|
||||
descendant_files = [
|
||||
src for src in srcs if src != pdf and src.is_file()
|
||||
] + [
|
||||
f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
|
||||
]
|
||||
for f in descendant_files:
|
||||
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
||||
images.append(f)
|
||||
else:
|
||||
non_images.append(f)
|
||||
break
|
||||
if len(non_images) == 0 and len(images) > 0:
|
||||
pdf_sizes = pdf_image_sizes(pdf)
|
||||
standalone_sizes = [standalone_image_size(f) for f in images]
|
||||
if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
|
||||
median_pdf_size = median(pdf_sizes)
|
||||
median_standalone_size = median(standalone_sizes)
|
||||
if median_pdf_size and median_standalone_size:
|
||||
if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
|
||||
return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
|
||||
if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
|
||||
return collate_from_paths([pdf], dest, start_index)
|
||||
images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
|
||||
if images_vs_pdf != False:
|
||||
return images_vs_pdf
|
||||
|
||||
return None
|
||||
|
||||
|
@ -534,17 +567,17 @@ def manual_collate(args):
|
|||
index = 0
|
||||
for path in args.paths:
|
||||
if path.is_dir():
|
||||
entries = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS]
|
||||
entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
|
||||
ordering = complete_prefix_number_ordering(entries)
|
||||
if ordering is None:
|
||||
ordering = entries
|
||||
ordering.sort()
|
||||
link_ordered_files(ordering, collation_dir, index)
|
||||
index += len(ordering)
|
||||
elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
||||
elif is_image(path):
|
||||
link_ordered_files([path], collation_dir, index)
|
||||
index += 1
|
||||
elif path.suffix.lower() == ".pdf":
|
||||
elif is_pdf(path):
|
||||
pdf_page_count = link_pdf(path, collation_dir, index)
|
||||
if pdf_page_count is None:
|
||||
return
|
||||
|
@ -563,10 +596,10 @@ def analyze(args):
|
|||
|
||||
for f in files:
|
||||
print(f'{relpath(f, extract_dir)}', end='')
|
||||
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
||||
if is_image(f):
|
||||
size = standalone_image_size(f)
|
||||
print(f'\t{fmt_size(size)}')
|
||||
elif f.suffix.lower() == '.pdf':
|
||||
elif is_pdf(f):
|
||||
sizes = pdf_image_sizes(f)
|
||||
if len(sizes) == 0:
|
||||
print(f'\tContains no images')
|
||||
|
|
Loading…
Reference in a new issue