completely refactor manual collation to be more ergonomic

This commit is contained in:
xenofem 2024-02-07 22:32:31 -05:00
parent 9353357dc9
commit bee5f7c58a

View file

@ -341,20 +341,20 @@ def is_image(path):
def ignoreable(path):
    """Return True when `path` should be skipped during collation.

    A path is skipped when its file name is listed in IGNOREABLE_FILES or
    when check_extension matches it against IGNOREABLE_EXTENSIONS.
    """
    return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)
def ls_ignore(directory, exclude=()):
    """List the direct children of `directory`, minus ignored and excluded ones.

    Args:
        directory: a path object whose iterdir() yields the entries to filter.
        exclude: container of paths to leave out. Defaults to empty so that
            callers using the older one-argument form keep working.

    Returns:
        A list of the entries that are neither ignoreable() nor in `exclude`.
    """
    return [
        path for path in directory.iterdir()
        if not ignoreable(path) and path not in exclude
    ]
def descendant_files_ignore(path): def descendant_files_ignore(path, exclude):
if path.is_file(): if path.is_file():
return [path] return [path]
result = [] result = []
for item in ls_ignore(path): for item in ls_ignore(path, exclude):
if item.is_dir(): if item.is_dir():
result.extend(descendant_files_ignore(item)) result.extend(descendant_files_ignore(item, exclude))
else: else:
result.append(item) result.append(item)
@ -365,7 +365,7 @@ def collate(args):
cur = con.cursor() cur = con.cursor()
extraction_dir = args.destdir / 'extract' extraction_dir = args.destdir / 'extract'
hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints} hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
collation_staging_area = args.destdir / 'site' / 'images-staging' collation_staging_area = args.destdir / 'site' / 'images-staging'
collation_staging_area.mkdir(parents=True) collation_staging_area.mkdir(parents=True)
@ -383,7 +383,7 @@ def collate(args):
work_staging_dir = collation_staging_area / work_id work_staging_dir = collation_staging_area / work_id
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0) pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
if pages_collated: if pages_collated:
print(f'Collated {pages_collated} pages for {work_id}') print(f'Collated {pages_collated} pages for {work_id}')
work_staging_dir.rename(collation_dir) work_staging_dir.rename(collation_dir)
@ -401,7 +401,7 @@ def collate(args):
collation_staging_area.rmdir() collation_staging_area.rmdir()
con.close() con.close()
def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None): def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
early_srcs = [] early_srcs = []
middle_srcs = [] middle_srcs = []
late_srcs = [] late_srcs = []
@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1: if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
return False return False
early_page_count = collate_from_paths(early_srcs, dest, start_index) early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude)
if early_page_count is None: if early_page_count is None:
return None return None
start_index += early_page_count start_index += early_page_count
middle_page_count = collate_from_paths(middle_srcs, dest, start_index) middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude)
if middle_page_count is None: if middle_page_count is None:
return None return None
start_index += middle_page_count start_index += middle_page_count
late_page_count = collate_from_paths(late_srcs, dest, start_index) late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude)
if late_page_count is None: if late_page_count is None:
return None return None
@ -458,13 +458,13 @@ def median(items):
def superior_or_equal(a, b):
    """Return True when `a` dominates `b` component-wise.

    `a` must have at least as many entries as `b`, and each entry of `a`
    must be >= the entry of `b` at the same index. Extra trailing entries
    of `a` are not compared.
    """
    # zip stops at the shorter sequence, which is `b` once the length check passes.
    return len(a) >= len(b) and all(x >= y for x, y in zip(a, b))
def try_collate_images_vs_pdf(srcs, dest, start_index): def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
pdfs = [src for src in srcs if 'pdf' in src.name.lower()] pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
if len(pdfs) != 1: if len(pdfs) != 1:
return False return False
outer_pdf = pdfs[0] outer_pdf = pdfs[0]
inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)] inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
if len(inner_pdfs) != 1: if len(inner_pdfs) != 1:
return False return False
inner_pdf = inner_pdfs[0] inner_pdf = inner_pdfs[0]
@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
non_pdf_srcs = [src for src in srcs if src != outer_pdf] non_pdf_srcs = [src for src in srcs if src != outer_pdf]
images = [] images = []
non_images = [] non_images = []
descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)] descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
for f in descendant_files: for f in descendant_files:
if is_image(f): if is_image(f):
images.append(f) images.append(f)
@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
return False return False
if superior_or_equal(median_standalone_size, median_pdf_size): if superior_or_equal(median_standalone_size, median_pdf_size):
return collate_from_paths(non_pdf_srcs, dest, start_index) return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
elif superior_or_equal(median_pdf_size, median_standalone_size): elif superior_or_equal(median_pdf_size, median_standalone_size):
return collate_from_paths([outer_pdf], dest, start_index) return collate_from_paths([outer_pdf], dest, start_index, exclude)
else: else:
return False return False
def collate_from_paths(srcs, dest, start_index): def collate_from_paths(srcs, dest, start_index, exclude):
if len(srcs) == 1 and srcs[0].is_dir(): if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)
if len(srcs) == 1 and is_pdf(srcs[0]): if len(srcs) == 1 and is_pdf(srcs[0]):
print(f'Extracting images from {srcs[0]}') print(f'Extracting images from {srcs[0]}')
@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index):
if len(hi_res_dirs) == 1: if len(hi_res_dirs) == 1:
hi_res_dir = hi_res_dirs[0] hi_res_dir = hi_res_dirs[0]
lo_res_dir = next(src for src in srcs if src != hi_res_dir) lo_res_dir = next(src for src in srcs if src != hi_res_dir)
if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)): if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
return collate_from_paths([hi_res_dir], dest, start_index) return collate_from_paths([hi_res_dir], dest, start_index, exclude)
textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
if textless_split != False: if textless_split != False:
return textless_split return textless_split
epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
if epilogue_split != False: if epilogue_split != False:
return epilogue_split return epilogue_split
cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX)
if cover_split != False: if cover_split != False:
return cover_split return cover_split
@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index):
else: else:
return None return None
images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index) images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
if images_vs_pdf != False: if images_vs_pdf != False:
return images_vs_pdf return images_vs_pdf
@ -549,49 +549,104 @@ def collate_from_paths(srcs, dest, start_index):
def self_and_parents(path):
    """Return a list holding `path` itself followed by each of its parents.

    The order is that of `path.parents`: nearest ancestor first.
    """
    return [path, *path.parents]
def parse_expressions(tokens):
    """Split a flat token sequence into collation groups and exclusions.

    Grammar, as implemented below:
        TOKEN            -> a group containing just that token
        ( TOKEN ... )    -> a group containing every token up to ')'
        ! TOKEN          -> one excluded token
        ! ( TOKEN ... )  -> several excluded tokens

    Args:
        tokens: iterable of string tokens. It is copied, so the caller's
            sequence is left untouched.

    Returns:
        A tuple (groups, exclusions): `groups` is a list of token lists in
        the order they appeared; `exclusions` is a flat list of tokens.

    Raises:
        ValueError: if a '(' is never closed or a '!' has nothing after it.
    """
    # Work on a private copy; the helpers consume tokens from the front.
    tokens = list(tokens)
    groups = []
    exclusions = []

    while tokens:
        token = tokens.pop(0)
        if token == '!':
            exclusions.extend(parse_exclusion(tokens))
        elif token == '(':
            groups.append(parse_group(tokens))
        else:
            groups.append([token])

    return (groups, exclusions)


def parse_exclusion(tokens):
    """Consume the operand of a '!' from the front of `tokens`.

    The operand is either a single token or a parenthesized group. `tokens`
    is mutated: the consumed tokens are removed.

    Returns:
        The list of excluded tokens.

    Raises:
        ValueError: if `tokens` is empty or the group is never closed.
    """
    if not tokens:
        raise ValueError("'!' must be followed by a path or a parenthesized group")

    token = tokens.pop(0)
    if token == '(':
        return parse_group(tokens)
    return [token]


def parse_group(tokens):
    """Consume tokens from the front of `tokens` up to and including ')'.

    `tokens` is mutated: everything through the closing ')' is removed.

    Returns:
        The list of tokens that preceded the ')'.

    Raises:
        ValueError: if `tokens` runs out before a ')' is seen.
    """
    items = []
    while tokens:
        token = tokens.pop(0)
        if token == ')':
            return items
        items.append(token)
    raise ValueError("unterminated group: missing ')'")


def normalize_to(path, ref):
    """Re-express `path` as `ref` joined with the relative path from `ref` to `path`."""
    return ref / Path(relpath(path, ref))


def manual_collate(args):
    """Collate one work from user-supplied path expressions.

    Reads `args.expression` (tokens for parse_expressions), `args.destdir`
    and `args.work_id`. Each parsed group is handed to collate_from_paths in
    turn, with the running page count as the start index and the excluded
    paths passed along. If no groups were given, the single group is the
    work's directory under `destdir / 'extract'`.

    Pages are written to a staging directory under
    `destdir / 'site' / 'images-staging'`, which is renamed to
    `destdir / 'site' / 'images' / work_id` only when at least one page was
    collated; otherwise the staging directory is emptied and removed.
    Problems are reported with print() and the function returns None.
    """
    try:
        raw_groups, raw_exclusions = parse_expressions(args.expression)
    except ValueError as err:
        print(f'Invalid expression: {err}')
        return

    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]

    if raw_groups:
        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
    else:
        # Only negations (or nothing) were given: start from the whole work.
        groups = [[args.destdir / 'extract' / args.work_id]]

    collation_dir = args.destdir / 'site' / 'images' / args.work_id
    if collation_dir.exists():
        if any(collation_dir.iterdir()):
            print('Collation directory already exists!')
            return
        # An empty leftover directory would block the rename below.
        collation_dir.rmdir()

    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
    if nonexistent:
        print(f'Nonexistent paths: {nonexistent}')
        return

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    work_staging_dir = collation_staging_area / args.work_id
    work_staging_dir.mkdir(parents=True)

    pages_collated = 0
    for group in groups:
        pages_added = collate_from_paths(
            [item for item in group if item not in exclusions],
            work_staging_dir,
            pages_collated,
            exclusions,
        )
        if pages_added is None:
            print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}')
            # None (as opposed to 0) marks failure for the cleanup branch below.
            pages_collated = None
            break

        pages_collated += pages_added

    if pages_collated:
        print(f'Collated {pages_collated} pages for {args.work_id}')
        # rename() requires the destination's parent directory to exist.
        collation_dir.parent.mkdir(parents=True, exist_ok=True)
        work_staging_dir.rename(collation_dir)
    else:
        for f in work_staging_dir.iterdir():
            f.unlink()
        work_staging_dir.rmdir()

        if pages_collated == 0:
            # Was `{work_id}`, an undefined name that raised NameError here.
            print(f'No files found for {args.work_id}')

    collation_staging_area.rmdir()
def fmt_size(s):
    """Format the first two entries of `s` as '<s[0]>x<s[1]>px'."""
    return f'{s[0]}x{s[1]}px'
def analyze(args): def analyze(args):
extract_dir = args.destdir / 'extract' extract_dir = args.destdir / 'extract'
files = descendant_files_ignore(extract_dir / args.work_id) files = descendant_files_ignore(extract_dir / args.work_id, [])
files.sort() files.sort()
for f in files: for f in files:
@ -869,28 +924,43 @@ parser_manual_collate = subparsers.add_parser(
help='collate a single work manually', help='collate a single work manually',
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""\ description=textwrap.dedent("""\
Provide an expression or sequence of expressions specifying groups
of paths to collate or skip. An expression can be:
PATH
A single path. If this is an image, it will be appended to
the sequence of collated images; if this is a PDF, images will be
extracted from it and concatenated to the sequence; if this is a
directory, the contents of the directory will be collated based on
the normal heuristics and concatenated to the sequence.
( PATH [PATH ...] )
A group of paths contained in parentheses. You may need to escape
the parentheses to avoid them getting parsed by your shell.
All the paths in this group will be considered together, and
collated based on the normal heuristics, regardless of what
order the paths are provided in.
! PATH
! ( PATH [PATH ...] )
A path or group of paths to exclude from collation. You may
need to escape the !. If an excluded path appears within any
of the other specified paths, it will be ignored.
If the only expressions provided are negations, then auto-collation
will start from the top level of the extracted work while excluding
the negated paths.
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
for the work being manually collated. `manual-collate` can for the work being manually collated. `manual-collate` can
only handle one work at a time. Paths are used as follows: only handle one work at a time. Paths are used as follows:
- If a path is a directory, all *image files* immediately
inside that directory will be appended to the sequence. If
files are named in a way which indicates a clear ordering,
that ordering will be used. Otherwise, filenames will be
sorted lexicographically. Non-image files and
subdirectories will be ignored.
- If a path is an image file, that image file will be
appended to the sequence.
- If a path is a PDF file, page images will be extracted
from that PDF and appended to the sequence.
"""), """),
) )
parser_manual_collate.add_argument('work_id')
parser_manual_collate.add_argument( parser_manual_collate.add_argument(
'paths', 'expression',
metavar='PATH',
type=Path,
nargs='+', nargs='+',
help='paths within a single work to be collated in sequence', help='expressions indicating paths to collate or skip',
) )
parser_manual_collate.set_defaults(func=manual_collate) parser_manual_collate.set_defaults(func=manual_collate)