From bee5f7c58a40e16eccd005a2b7c559f7b877109c Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Wed, 7 Feb 2024 22:32:31 -0500
Subject: [PATCH] completely refactor manual collation to be more ergonomic

---
 dlibrary/dlibrary.py | 206 +++++++++++++++++++++++++++++--------------
 1 file changed, 138 insertions(+), 68 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index f1cbf3d..30f2e62 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -341,20 +341,20 @@ def is_image(path):
 def ignoreable(path):
     return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)
 
-def ls_ignore(directory):
+def ls_ignore(directory, exclude):  # direct children of `directory` that are neither ignoreable nor excluded
     return [
         path for path in directory.iterdir()
-        if not ignoreable(path)
+        if not ignoreable(path) and path not in exclude  # `exclude` entries are matched by path equality
     ]
 
-def descendant_files_ignore(path):
+def descendant_files_ignore(path, exclude):
     if path.is_file():
         return [path]
 
     result = []
-    for item in ls_ignore(path):
+    for item in ls_ignore(path, exclude):
         if item.is_dir():
-            result.extend(descendant_files_ignore(item))
+            result.extend(descendant_files_ignore(item, exclude))
         else:
             result.append(item)
 
@@ -365,7 +365,7 @@ def collate(args):
     cur = con.cursor()
 
     extraction_dir = args.destdir / 'extract'
-    hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints}
+    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
 
     collation_staging_area = args.destdir / 'site' / 'images-staging'
     collation_staging_area.mkdir(parents=True)
@@ -383,7 +383,7 @@ def collate(args):
 
         work_staging_dir = collation_staging_area / work_id
 
-        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
         if pages_collated:
             print(f'Collated {pages_collated} pages for {work_id}')
             work_staging_dir.rename(collation_dir)
@@ -401,7 +401,7 @@ def collate(args):
     collation_staging_area.rmdir()
     con.close()
 
-def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
+def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
     early_srcs = []
     middle_srcs = []
     late_srcs = []
@@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
     if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
         return False
 
-    early_page_count = collate_from_paths(early_srcs, dest, start_index)
+    early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude)
     if early_page_count is None:
         return None
     start_index += early_page_count
 
-    middle_page_count = collate_from_paths(middle_srcs, dest, start_index)
+    middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude)
     if middle_page_count is None:
         return None
     start_index += middle_page_count
 
-    late_page_count = collate_from_paths(late_srcs, dest, start_index)
+    late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude)
     if late_page_count is None:
         return None
 
@@ -458,13 +458,13 @@ def median(items):
 def superior_or_equal(a, b):
     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
 
-def try_collate_images_vs_pdf(srcs, dest, start_index):
+def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
     pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
     if len(pdfs) != 1:
         return False
     outer_pdf = pdfs[0]
 
-    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)]
+    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
     if len(inner_pdfs) != 1:
         return False
     inner_pdf = inner_pdfs[0]
@@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
     non_pdf_srcs = [src for src in srcs if src != outer_pdf]
     images = []
     non_images = []
-    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
+    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
     for f in descendant_files:
         if is_image(f):
             images.append(f)
@@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
         return False
 
     if superior_or_equal(median_standalone_size, median_pdf_size):
-        return collate_from_paths(non_pdf_srcs, dest, start_index)
+        return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
     elif superior_or_equal(median_pdf_size, median_standalone_size):
-        return collate_from_paths([outer_pdf], dest, start_index)
+        return collate_from_paths([outer_pdf], dest, start_index, exclude)
     else:
         return False
 
-def collate_from_paths(srcs, dest, start_index):
+def collate_from_paths(srcs, dest, start_index, exclude):
     if len(srcs) == 1 and srcs[0].is_dir():
-        return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
+        return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)
 
     if len(srcs) == 1 and is_pdf(srcs[0]):
         print(f'Extracting images from {srcs[0]}')
@@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index):
         if len(hi_res_dirs) == 1:
             hi_res_dir = hi_res_dirs[0]
             lo_res_dir = next(src for src in srcs if src != hi_res_dir)
-            if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
-                return collate_from_paths([hi_res_dir], dest, start_index)
+            if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
+                return collate_from_paths([hi_res_dir], dest, start_index, exclude)
 
-    textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
+    textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
     if textless_split != False:
         return textless_split
 
-    epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
+    epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
     if epilogue_split != False:
         return epilogue_split
 
-    cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
+    cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX)
     if cover_split != False:
         return cover_split
 
@@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index):
         else:
             return None
 
-    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
+    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
     if images_vs_pdf != False:
         return images_vs_pdf
 
@@ -549,49 +549,104 @@ def collate_from_paths(srcs, dest, start_index):
 def self_and_parents(path):
     return [path] + list(path.parents)
 
+def parse_expressions(tokens):
+    """Split expression tokens into (groups, exclusions), consuming `tokens`.
+
+    A bare token is a one-item group, '( ... )' is one group, and '!' sends
+    the path or group that follows it to exclusions instead.
+    """
+    groups, exclusions = [], []
+    while tokens:
+        head = tokens.pop(0)
+        if head == '!':
+            exclusions += parse_exclusion(tokens)
+        else:
+            groups.append(parse_group(tokens) if head == '(' else [head])
+    return (groups, exclusions)
+
+def parse_exclusion(tokens):
+    """Parse what follows a '!': one path, or a '( ... )' group of paths."""
+    if not tokens:
+        raise ValueError("'!' must be followed by a path or a parenthesized group")
+    token = tokens.pop(0)
+    # A '(' opens a group of excluded paths; anything else is one excluded path.
+    return parse_group(tokens) if token == '(' else [token]
+
+def parse_group(tokens):
+    """Consume tokens through the next ')' and return the ones before it."""
+    items = []
+    while tokens:
+        token = tokens.pop(0)
+        if token == ')':
+            return items
+        items.append(token)
+    raise ValueError("unterminated group: expected ')' before end of expression")
+
+def normalize_to(path, ref):  # rewrite `path` as ref / <path expressed relative to ref>
+    return ref / Path(relpath(path, ref))  # NOTE(review): relpath is presumably os.path.relpath - confirm import
+
 def manual_collate(args):
-    work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name
+    """Collate the single work args.work_id from the path groups given in args.expression.
+
+    Pages are staged under site/images-staging and renamed into site/images only on success."""
+    (raw_groups, raw_exclusions) = parse_expressions(args.expression)
 
-    collation_dir = args.destdir / 'site' / 'images' / work_id
-    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
-        print(f'Collation directory already exists!')
-        return
+    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
 
-    nonexistent = [path for path in args.paths if not path.exists()]
+    if raw_groups:
+        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
+    else:
+        groups = [[args.destdir / 'extract' / args.work_id]]
+    # Refuse to overwrite an existing collation; an empty leftover directory is removed first.
+    collation_dir = args.destdir / 'site' / 'images' / args.work_id
+    if collation_dir.exists():
+        if len(list(collation_dir.iterdir())) > 0:
+            print(f'Collation directory already exists!')
+            return
+        collation_dir.rmdir()
+
+    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
     if len(nonexistent) > 0:
         print(f'Nonexistent paths: {nonexistent}')
         return
 
-    collation_dir.mkdir(parents=True, exist_ok=True)
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
+    work_staging_dir = collation_staging_area / args.work_id
+    work_staging_dir.mkdir(parents=True)
+    # Groups are collated in order; the running page count is the start index for the next group.
+    pages_collated = 0
+    for group in groups:
+        pages_added = collate_from_paths(
+            [item for item in group if item not in exclusions],
+            work_staging_dir,
+            pages_collated,
+            exclusions,
+        )
+        if pages_added is None:
+            print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}')
+            pages_collated = None
+            break
+        pages_collated += pages_added
+
+    if pages_collated:
+        print(f'Collated {pages_collated} pages for {args.work_id}')
+        work_staging_dir.rename(collation_dir)
+    else:
+        for f in work_staging_dir.iterdir():
+            f.unlink()
+        work_staging_dir.rmdir()
+        if pages_collated == 0:  # None means a group failed, which was already reported above
+            print(f'No files found for {args.work_id}')
+
+    collation_staging_area.rmdir()
 
-    index = 0
-    for path in args.paths:
-        if path.is_dir():
-            entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
-            ordering = complete_prefix_number_ordering(entries)
-            if ordering is None:
-                ordering = entries
-                ordering.sort()
-            link_ordered_files(ordering, collation_dir, index)
-            index += len(ordering)
-        elif is_image(path):
-            link_ordered_files([path], collation_dir, index)
-            index += 1
-        elif is_pdf(path):
-            pdf_page_count = link_pdf(path, collation_dir, index)
-            if pdf_page_count is None:
-                return
-            index += pdf_page_count
-        else:
-            print(f'Unknown file type {path}, stopping')
-            return
 
 def fmt_size(s):
     return f'{s[0]}x{s[1]}px'
 
 def analyze(args):
     extract_dir = args.destdir / 'extract'
-    files = descendant_files_ignore(extract_dir / args.work_id)
+    files = descendant_files_ignore(extract_dir / args.work_id, [])
     files.sort()
 
     for f in files:
@@ -869,28 +924,43 @@ parser_manual_collate = subparsers.add_parser(
     help='collate a single work manually',
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description=textwrap.dedent("""\
+    Provide an expression or sequence of expressions specifying groups
+    of paths to collate or skip. An expression can be:
+
+    PATH
+      A single path. If this is an image, it will be appended to
+      the sequence of collated images; if this is a PDF, images will be
+      extracted from it and concatenated to the sequence; if this is a
+      directory, the contents of the directory will be collated based on
+      the normal heuristics and concatenated to the sequence.
+
+    ( PATH [PATH ...] )
+      A group of paths contained in parentheses. You may need to escape
+      the parentheses to avoid them getting parsed by your shell.
+      All the paths in this group will be considered together, and
+      collated based on the normal heuristics, regardless of what
+      order the paths are provided in.
+
+    ! PATH
+    ! ( PATH [PATH ...] )
+      A path or group of paths to exclude from collation. You may
+      need to escape the !. If an excluded path appears within any
+      of the other specified paths, it will be ignored.
+
+    If the only expressions provided are negations, then auto-collation
+    will start from the top level of the extracted work while excluding
+    the negated paths.
+
     All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
     for the work being manually collated. `manual-collate` can
-    only handle one work at a time. Paths are used as follows:
+    only handle one work at a time.
-
-    - If a path is a directory, all *image files* immediately
-      inside that directory will be appended to the sequence. If
-      files are named in a way which indicates a clear ordering,
-      that ordering will be used. Otherwise, filenames will be
-      sorted lexicographically. Non-image files and
-      subdirectories will be ignored.
-    - If a path is an image file, that image file will be
-      appended to the sequence.
-    - If a path is a PDF file, page images will be extracted
-      from that PDF and appended to the sequence.
 """),
 )
+parser_manual_collate.add_argument('work_id')
 parser_manual_collate.add_argument(
-    'paths',
-    metavar='PATH',
-    type=Path,
+    'expression',
     nargs='+',
-    help='paths within a single work to be collated in sequence',
+    help='expressions indicating paths to collate or skip',
 )
 parser_manual_collate.set_defaults(func=manual_collate)