completely refactor manual collation to be more ergonomic

This commit is contained in:
xenofem 2024-02-07 22:32:31 -05:00
parent 9353357dc9
commit bee5f7c58a

View file

@ -341,20 +341,20 @@ def is_image(path):
def ignoreable(path):
    """Tell whether *path* should be left out of directory listings.

    A path is skipped when its file name is listed in IGNOREABLE_FILES;
    otherwise the decision is whatever check_extension reports for
    IGNOREABLE_EXTENSIONS.
    """
    if path.name in IGNOREABLE_FILES:
        return True
    return check_extension(path, IGNOREABLE_EXTENSIONS)
def ls_ignore(directory):
def ls_ignore(directory, exclude):
return [
path for path in directory.iterdir()
if not ignoreable(path)
if not ignoreable(path) and path not in exclude
]
def descendant_files_ignore(path):
def descendant_files_ignore(path, exclude):
if path.is_file():
return [path]
result = []
for item in ls_ignore(path):
for item in ls_ignore(path, exclude):
if item.is_dir():
result.extend(descendant_files_ignore(item))
result.extend(descendant_files_ignore(item, exclude))
else:
result.append(item)
@ -365,7 +365,7 @@ def collate(args):
cur = con.cursor()
extraction_dir = args.destdir / 'extract'
hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints}
hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
collation_staging_area = args.destdir / 'site' / 'images-staging'
collation_staging_area.mkdir(parents=True)
@ -383,7 +383,7 @@ def collate(args):
work_staging_dir = collation_staging_area / work_id
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
if pages_collated:
print(f'Collated {pages_collated} pages for {work_id}')
work_staging_dir.rename(collation_dir)
@ -401,7 +401,7 @@ def collate(args):
collation_staging_area.rmdir()
con.close()
def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
early_srcs = []
middle_srcs = []
late_srcs = []
@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
return False
early_page_count = collate_from_paths(early_srcs, dest, start_index)
early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude)
if early_page_count is None:
return None
start_index += early_page_count
middle_page_count = collate_from_paths(middle_srcs, dest, start_index)
middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude)
if middle_page_count is None:
return None
start_index += middle_page_count
late_page_count = collate_from_paths(late_srcs, dest, start_index)
late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude)
if late_page_count is None:
return None
@ -458,13 +458,13 @@ def median(items):
def superior_or_equal(a, b):
    """Return True when *a* dominates *b* element by element.

    *a* must have at least as many elements as *b*, and each of the
    leading elements of *a* must be >= the element of *b* at the same
    position. Extra trailing elements of *a* are not compared.
    """
    if len(a) < len(b):
        return False
    # zip stops at the end of b, which is the shorter (or equal) sequence here.
    return all(left >= right for left, right in zip(a, b))
def try_collate_images_vs_pdf(srcs, dest, start_index):
def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
if len(pdfs) != 1:
return False
outer_pdf = pdfs[0]
inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)]
inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
if len(inner_pdfs) != 1:
return False
inner_pdf = inner_pdfs[0]
@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
non_pdf_srcs = [src for src in srcs if src != outer_pdf]
images = []
non_images = []
descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
for f in descendant_files:
if is_image(f):
images.append(f)
@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
return False
if superior_or_equal(median_standalone_size, median_pdf_size):
return collate_from_paths(non_pdf_srcs, dest, start_index)
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
elif superior_or_equal(median_pdf_size, median_standalone_size):
return collate_from_paths([outer_pdf], dest, start_index)
return collate_from_paths([outer_pdf], dest, start_index, exclude)
else:
return False
def collate_from_paths(srcs, dest, start_index):
def collate_from_paths(srcs, dest, start_index, exclude):
if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)
if len(srcs) == 1 and is_pdf(srcs[0]):
print(f'Extracting images from {srcs[0]}')
@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index):
if len(hi_res_dirs) == 1:
hi_res_dir = hi_res_dirs[0]
lo_res_dir = next(src for src in srcs if src != hi_res_dir)
if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
return collate_from_paths([hi_res_dir], dest, start_index)
if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
return collate_from_paths([hi_res_dir], dest, start_index, exclude)
textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
if textless_split != False:
return textless_split
epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
if epilogue_split != False:
return epilogue_split
cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX)
if cover_split != False:
return cover_split
@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index):
else:
return None
images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
if images_vs_pdf != False:
return images_vs_pdf
@ -549,49 +549,104 @@ def collate_from_paths(srcs, dest, start_index):
def self_and_parents(path):
    """Return a list holding *path* followed by each entry of path.parents, in order."""
    return [path, *path.parents]
def manual_collate(args):
work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name
def parse_expressions(tokens):
groups = []
exclusions = []
collation_dir = args.destdir / 'site' / 'images' / work_id
if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
while tokens:
token = tokens.pop(0)
if token == '!':
exclusions.extend(parse_exclusion(tokens))
elif token == '(':
groups.append(parse_group(tokens))
else:
groups.append([token])
return (groups, exclusions)
def parse_exclusion(tokens):
    """Parse the operand that follows a ``!`` token.

    The operand is taken from the front of *tokens*: either a single
    token, or a parenthesised group when the next token is ``(``.
    Consumed tokens are removed from *tokens* in place, so the caller can
    keep parsing whatever remains.

    Returns:
        A list of the excluded tokens.

    Raises:
        IndexError: if *tokens* is empty, i.e. ``!`` was the last token.
    """
    if not tokens:
        # Same exception type a bare pop(0) would raise, but with a message
        # that tells the user what is actually wrong with the expression.
        raise IndexError("expected a path or '(' after '!'")
    token = tokens.pop(0)
    if token == '(':
        return parse_group(tokens)
    return [token]
def parse_group(tokens):
    """Collect the members of a parenthesised group.

    Expects the opening ``(`` to have been consumed already. Tokens are
    removed from the front of *tokens* in place, up to and including the
    closing ``)``, so the caller can keep parsing whatever remains.

    Returns:
        A list of the tokens that appeared before the closing ``)``.

    Raises:
        IndexError: if *tokens* runs out before a ``)`` is found.
    """
    items = []
    while tokens:
        token = tokens.pop(0)
        if token == ')':
            return items
        items.append(token)
    # Same exception type a bare pop(0) on an empty list would raise, but
    # with a message that points at the unbalanced parenthesis.
    raise IndexError("unterminated group: missing closing ')'")
def normalize_to(path, ref):
    """Re-express *path* as a path that begins with *ref*.

    The relative route from *ref* to *path* is computed with relpath and
    then joined back onto *ref*.
    """
    relative_part = relpath(path, ref)
    return ref / Path(relative_part)
def manual_collate(args):
(raw_groups, raw_exclusions) = parse_expressions(args.expression)
exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
if raw_groups:
groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
else:
groups = [[args.destdir / 'extract' / args.work_id]]
collation_dir = args.destdir / 'site' / 'images' / args.work_id
if collation_dir.exists():
if len(list(collation_dir.iterdir())) > 0:
print(f'Collation directory already exists!')
return
else:
collation_dir.rmdir()
nonexistent = [path for path in args.paths if not path.exists()]
nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
if len(nonexistent) > 0:
print(f'Nonexistent paths: {nonexistent}')
return
collation_dir.mkdir(parents=True, exist_ok=True)
collation_staging_area = args.destdir / 'site' / 'images-staging'
work_staging_dir = collation_staging_area / args.work_id
work_staging_dir.mkdir(parents=True)
index = 0
for path in args.paths:
if path.is_dir():
entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
ordering = complete_prefix_number_ordering(entries)
if ordering is None:
ordering = entries
ordering.sort()
link_ordered_files(ordering, collation_dir, index)
index += len(ordering)
elif is_image(path):
link_ordered_files([path], collation_dir, index)
index += 1
elif is_pdf(path):
pdf_page_count = link_pdf(path, collation_dir, index)
if pdf_page_count is None:
return
index += pdf_page_count
pages_collated = 0
for group in groups:
pages_added = collate_from_paths(
[item for item in group if item not in exclusions],
work_staging_dir,
pages_collated,
exclusions,
)
if pages_added is None:
print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}')
pages_collated = None
break
pages_collated += pages_added
if pages_collated:
print(f'Collated {pages_collated} pages for {args.work_id}')
work_staging_dir.rename(collation_dir)
else:
print(f'Unknown file type {path}, stopping')
return
for f in work_staging_dir.iterdir():
f.unlink()
work_staging_dir.rmdir()
if pages_collated == 0:
print(f'No files found for {work_id}')
collation_staging_area.rmdir()
def fmt_size(s):
    """Format the first two elements of *s* as '<first>x<second>px'."""
    first, second = s[0], s[1]
    return f'{first}x{second}px'
def analyze(args):
extract_dir = args.destdir / 'extract'
files = descendant_files_ignore(extract_dir / args.work_id)
files = descendant_files_ignore(extract_dir / args.work_id, [])
files.sort()
for f in files:
@ -869,28 +924,43 @@ parser_manual_collate = subparsers.add_parser(
help='collate a single work manually',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""\
Provide an expression or sequence of expressions specifying groups
of paths to collate or skip. An expression can be:
PATH
A single path. If this is an image, it will be appended to
the sequence of collated images; if this is a PDF, images will be
extracted from it and concatenated to the sequence; if this is a
directory, the contents of the directory will be collated based on
the normal heuristics and concatenated to the sequence.
( PATH [PATH ...] )
A group of paths contained in parentheses. You may need to escape
the parentheses to avoid them getting parsed by your shell.
All the paths in this group will be considered together, and
collated based on the normal heuristics, regardless of what
order the paths are provided in.
! PATH
! ( PATH [PATH ...] )
A path or group of paths to exclude from collation. You may
need to escape the !. If an excluded path appears within any
of the other specified paths, it will be ignored.
If the only expressions provided are negations, then auto-collation
will start from the top level of the extracted work while excluding
the negated paths.
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
for the work being manually collated. `manual-collate` can
only handle one work at a time. Paths are used as follows:
- If a path is a directory, all *image files* immediately
inside that directory will be appended to the sequence. If
files are named in a way which indicates a clear ordering,
that ordering will be used. Otherwise, filenames will be
sorted lexicographically. Non-image files and
subdirectories will be ignored.
- If a path is an image file, that image file will be
appended to the sequence.
- If a path is a PDF file, page images will be extracted
from that PDF and appended to the sequence.
"""),
)
parser_manual_collate.add_argument('work_id')
parser_manual_collate.add_argument(
'paths',
metavar='PATH',
type=Path,
'expression',
nargs='+',
help='paths within a single work to be collated in sequence',
help='expressions indicating paths to collate or skip',
)
parser_manual_collate.set_defaults(func=manual_collate)