completely refactor manual collation to be more ergonomic

2024-02-07 22:32:31 -05:00 · 2024-02-07 22:32:31 -05:00 · bee5f7c58a
commit bee5f7c58a
parent 9353357dc9
1 changed files with 138 additions and 68 deletions
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@ -341,20 +341,20 @@ def is_image(path):
 def ignoreable(path):
    """Tell whether ``path`` should be left out of directory listings.

    The result is truthy when the path's file name appears in
    ``IGNOREABLE_FILES`` or when
    ``check_extension(path, IGNOREABLE_EXTENSIONS)`` is truthy.
    """
    if path.name in IGNOREABLE_FILES:
        return True
    return check_extension(path, IGNOREABLE_EXTENSIONS)
-def ls_ignore(directory):
+def ls_ignore(directory, exclude):
    return [
        path for path in directory.iterdir()
-        if not ignoreable(path)
+        if not ignoreable(path) and path not in exclude
    ]
-def descendant_files_ignore(path):
+def descendant_files_ignore(path, exclude):
    if path.is_file():
        return [path]
    result = []
-    for item in ls_ignore(path):
+    for item in ls_ignore(path, exclude):
        if item.is_dir():
-            result.extend(descendant_files_ignore(item))
+            result.extend(descendant_files_ignore(item, exclude))
        else:
            result.append(item)
@ -365,7 +365,7 @@ def collate(args):
    cur = con.cursor()
    extraction_dir = args.destdir / 'extract'
-    hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints}
+    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
    collation_staging_area = args.destdir / 'site' / 'images-staging'
    collation_staging_area.mkdir(parents=True)
@ -383,7 +383,7 @@ def collate(args):
        work_staging_dir = collation_staging_area / work_id
-        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
        if pages_collated:
            print(f'Collated {pages_collated} pages for {work_id}')
            work_staging_dir.rename(collation_dir)
@ -401,7 +401,7 @@ def collate(args):
    collation_staging_area.rmdir()
    con.close()
-def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
+def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
    early_srcs = []
    middle_srcs = []
    late_srcs = []
@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
    if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
        return False
-    early_page_count = collate_from_paths(early_srcs, dest, start_index)
+    early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude)
    if early_page_count is None:
        return None
    start_index += early_page_count
-    middle_page_count = collate_from_paths(middle_srcs, dest, start_index)
+    middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude)
    if middle_page_count is None:
        return None
    start_index += middle_page_count
-    late_page_count = collate_from_paths(late_srcs, dest, start_index)
+    late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude)
    if late_page_count is None:
        return None
@ -458,13 +458,13 @@ def median(items):
 def superior_or_equal(a, b):
    """Return True when ``a`` dominates ``b`` element-wise.

    ``a`` must be at least as long as ``b``, and every element of ``a``
    must be ``>=`` the element of ``b`` at the same index. Extra trailing
    elements of ``a`` are not compared.
    """
    if len(a) < len(b):
        return False
    # zip stops at the shorter sequence, which is b once the guard has passed.
    return all(left >= right for left, right in zip(a, b))
-def try_collate_images_vs_pdf(srcs, dest, start_index):
+def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
    pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
    if len(pdfs) != 1:
        return False
    outer_pdf = pdfs[0]
-    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)]
+    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
    if len(inner_pdfs) != 1:
        return False
    inner_pdf = inner_pdfs[0]
@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
    non_pdf_srcs = [src for src in srcs if src != outer_pdf]
    images = []
    non_images = []
-    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
+    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
    for f in descendant_files:
        if is_image(f):
            images.append(f)
@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
        return False
    if superior_or_equal(median_standalone_size, median_pdf_size):
-        return collate_from_paths(non_pdf_srcs, dest, start_index)
+        return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
    elif superior_or_equal(median_pdf_size, median_standalone_size):
-        return collate_from_paths([outer_pdf], dest, start_index)
+        return collate_from_paths([outer_pdf], dest, start_index, exclude)
    else:
        return False
-def collate_from_paths(srcs, dest, start_index):
+def collate_from_paths(srcs, dest, start_index, exclude):
    if len(srcs) == 1 and srcs[0].is_dir():
-        return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
+        return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)
    if len(srcs) == 1 and is_pdf(srcs[0]):
        print(f'Extracting images from {srcs[0]}')
@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index):
        if len(hi_res_dirs) == 1:
            hi_res_dir = hi_res_dirs[0]
            lo_res_dir = next(src for src in srcs if src != hi_res_dir)
-            if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
+            if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
-                return collate_from_paths([hi_res_dir], dest, start_index)
+                return collate_from_paths([hi_res_dir], dest, start_index, exclude)
-    textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
+    textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
    if textless_split != False:
        return textless_split
-    epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
+    epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
    if epilogue_split != False:
        return epilogue_split
-    cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
+    cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX)
    if cover_split != False:
        return cover_split
@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index):
        else:
            return None
-    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
+    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
    if images_vs_pdf != False:
        return images_vs_pdf
@ -549,49 +549,104 @@ def collate_from_paths(srcs, dest, start_index):
 def self_and_parents(path):
    """Return ``path`` followed by each of its ancestors, nearest first."""
    return [path, *path.parents]
-def manual_collate(args):
+def parse_expressions(tokens):
-    work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name
+    groups = []
    exclusions = []
-    collation_dir = args.destdir / 'site' / 'images' / work_id
+    while tokens:
-    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
+        token = tokens.pop(0)
        if token == '!':
            exclusions.extend(parse_exclusion(tokens))
        elif token == '(':
            groups.append(parse_group(tokens))
        else:
            groups.append([token])
    return (groups, exclusions)
 def parse_exclusion(tokens):
    """Parse the operand that follows a ``!`` token.

    The operand is either a single token or a parenthesised group
    introduced by ``(``. Consumed tokens are removed from the front of
    ``tokens`` in place, so the caller can keep parsing what remains.

    Returns:
        A list of the excluded items.

    Raises:
        ValueError: if ``tokens`` is empty, i.e. the ``!`` was the last
            token and has nothing to negate.
    """
    if not tokens:
        # Fail with a readable message rather than "pop from empty list".
        raise ValueError("expected a path or a parenthesized group after '!'")
    token = tokens.pop(0)
    if token == '(':
        return parse_group(tokens)
    return [token]
 def parse_group(tokens):
    """Collect the members of a parenthesised group.

    Called after the opening ``(`` has already been consumed. Tokens are
    removed from the front of ``tokens`` in place, up to and including the
    closing ``)``; anything after it is left for the caller.

    Returns:
        A list of the tokens found before the closing ``)``, which may be
        empty.

    Raises:
        ValueError: if the tokens run out before a closing ``)`` is found.
    """
    items = []
    while tokens:
        token = tokens.pop(0)
        if token == ')':
            return items
        items.append(token)
    # Input exhausted without ')': report it instead of an opaque IndexError.
    raise ValueError("unterminated group: missing closing ')'")
 def normalize_to(path, ref):
    """Re-express ``path`` as ``ref`` joined with the path relative to ``ref``.

    The relative part comes from ``relpath(path, ref)``, so paths that
    spell the same location differently end up in one form rooted at
    ``ref``.
    """
    relative_part = Path(relpath(path, ref))
    return ref / relative_part
 def manual_collate(args):
    (raw_groups, raw_exclusions) = parse_expressions(args.expression)
    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
    if raw_groups:
        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
    else:
        groups = [[args.destdir / 'extract' / args.work_id]]
    collation_dir = args.destdir / 'site' / 'images' / args.work_id
    if collation_dir.exists():
        if len(list(collation_dir.iterdir())) > 0:
            print(f'Collation directory already exists!')
            return
        else:
            collation_dir.rmdir()
-    nonexistent = [path for path in args.paths if not path.exists()]
+    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
    if len(nonexistent) > 0:
        print(f'Nonexistent paths: {nonexistent}')
        return
-    collation_dir.mkdir(parents=True, exist_ok=True)
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
    work_staging_dir = collation_staging_area / args.work_id
    work_staging_dir.mkdir(parents=True)
-    index = 0
+    pages_collated = 0
-    for path in args.paths:
+    for group in groups:
-        if path.is_dir():
+        pages_added = collate_from_paths(
-            entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
+            [item for item in group if item not in exclusions],
-            ordering = complete_prefix_number_ordering(entries)
+            work_staging_dir,
-            if ordering is None:
+            pages_collated,
-                ordering = entries
+            exclusions,
-                ordering.sort()
+        )
-            link_ordered_files(ordering, collation_dir, index)
+        if pages_added is None:
-            index += len(ordering)
+            print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}')
-        elif is_image(path):
+            pages_collated = None
-            link_ordered_files([path], collation_dir, index)
+            break
-            index += 1
+
-        elif is_pdf(path):
+        pages_collated += pages_added
-            pdf_page_count = link_pdf(path, collation_dir, index)
+
-            if pdf_page_count is None:
+    if pages_collated:
-                return
+        print(f'Collated {pages_collated} pages for {args.work_id}')
-            index += pdf_page_count
+        work_staging_dir.rename(collation_dir)
    else:
-            print(f'Unknown file type {path}, stopping')
+        for f in work_staging_dir.iterdir():
-            return
+            f.unlink()
        work_staging_dir.rmdir()
        if pages_collated == 0:
            print(f'No files found for {work_id}')
    collation_staging_area.rmdir()
 def fmt_size(s):
    """Format the first two items of ``s`` as ``'<s[0]>x<s[1]>px'``."""
    first, second = s[0], s[1]
    return f'{first}x{second}px'
 def analyze(args):
    extract_dir = args.destdir / 'extract'
-    files = descendant_files_ignore(extract_dir / args.work_id)
+    files = descendant_files_ignore(extract_dir / args.work_id, [])
    files.sort()
    for f in files:
@ -869,28 +924,43 @@ parser_manual_collate = subparsers.add_parser(
    help='collate a single work manually',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
    Provide an expression or sequence of expressions specifying groups
    of paths to collate or skip. An expression can be:
    PATH
      A single path. If this is an image, it will be appended to
      the sequence of collated images; if this is a PDF, images will be
      extracted from it and concatenated to the sequence; if this is a
      directory, the contents of the directory will be collated based on
      the normal heuristics and concatenated to the sequence.
    ( PATH [PATH ...] )
      A group of paths contained in parentheses. You may need to escape
      the parentheses to avoid them getting parsed by your shell.
      All the paths in this group will be considered together, and
      collated based on the normal heuristics, regardless of what
      order the paths are provided in.
    ! PATH
    ! ( PATH [PATH ...] )
      A path or group of paths to exclude from collation. You may
      need to escape the !. If an excluded path appears within any
      of the other specified paths, it will be ignored.
    If the only expressions provided are negations, then auto-collation
    will start from the top level of the extracted work while excluding
    the negated paths.
    All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
    for the work being manually collated. `manual-collate` can
    only handle one work at a time. Paths are used as follows:
    - If a path is a directory, all *image files* immediately
      inside that directory will be appended to the sequence. If
      files are named in a way which indicates a clear ordering,
      that ordering will be used. Otherwise, filenames will be
      sorted lexicographically. Non-image files and
      subdirectories will be ignored.
    - If a path is an image file, that image file will be
      appended to the sequence.
    - If a path is a PDF file, page images will be extracted
      from that PDF and appended to the sequence.
 """),
 )
 parser_manual_collate.add_argument('work_id')
 parser_manual_collate.add_argument(
-    'paths',
+    'expression',
    metavar='PATH',
    type=Path,
    nargs='+',
-    help='paths within a single work to be collated in sequence',
+    help='expressions indicating paths to collate or skip',
 )
 parser_manual_collate.set_defaults(func=manual_collate)