From aefaf824a81d71b466f66aeee48f4195bada5fb1 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 6 Feb 2024 11:02:08 -0500
Subject: [PATCH 1/3] handle hierarchical numbering schemes in auto-collation

---
 dlibrary/dlibrary.py | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index ec567ed..63a6983 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -202,7 +202,7 @@ def complete_prefix_number_ordering(entries):
         version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name)
         entries_by_version.setdefault(version, []).append(entry)
 
-    numberings_by_version = {ver: prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}
+    numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}
 
     unified_indices = set()
     for numbering in numberings_by_version.values():
@@ -212,8 +212,15 @@ def complete_prefix_number_ordering(entries):
     unified_indices = list(unified_indices)
     unified_indices.sort()
 
-    if len(unified_indices) > 1 and min(unified_indices[i] - unified_indices[i-1] for i in range(1, len(unified_indices))) > 2:
-        return None
+    if len(unified_indices) > 1:
+        for i in range(1, len(unified_indices)):
+            cur = unified_indices[i]
+            prev = unified_indices[i-1]
+            for level in range(min(len(cur), len(prev))):
+                if cur[level] != prev[level]:
+                    if cur[level] - prev[level] > 2:
+                        return None
+                    break
 
     versions = list(numberings_by_version.keys())
     versions.sort()
@@ -231,27 +238,37 @@ def complete_prefix_number_ordering(entries):
     for out_ver in outer_versions:
         for i in unified_indices:
             for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
-                entries_i_ver = numberings_by_version[ver].get(i, [])
-                if len(entries_i_ver) <= 1:
-                    result += entries_i_ver
-                else:
-                    return None
+                result += numberings_by_version[ver].get(i, [])
     return result
 
-def prefix_numbering(entries):
+def unique_hierarchical_prefix_numbering(entries, start_point=0):  # returns {index tuple: [entries]} or None
     matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
     for m in matches:
         pos = m.start()
+        if pos < start_point:  # numbers are tried last-to-first, so no remaining one starts at/after start_point
+            return None
         prefix = entries[0].name[:pos]
         if all(e.name.startswith(prefix) for e in entries):
-            entries_by_index = {}
+            numbering = {}  # keys are tuples of ints, one element per numbering level
             for e in entries:
                 n = NUMBER_REGEX.match(e.name[pos:])
                 if n is None:
                     return None
                 i = int(n.group())
-                entries_by_index.setdefault(i, []).append(e)
-            return entries_by_index
+                numbering.setdefault((i,), []).append(e)
+            # Entries sharing an index are split further by recursing on the text after that number.
+            indices = list(numbering.keys())  # snapshot, because the dict is mutated inside the loop
+            for idx in indices:
+                if len(numbering[idx]) > 1:
+                    ents_idx = numbering.pop(idx)
+                    next_layer_start = pos + NUMBER_REGEX.match(ents_idx[0].name[pos:]).end()
+                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start)
+                    if not sub_numbering:
+                        return None
+                    for sub_idx in sub_numbering:
+                        numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]  # parent key extended by the sub-level key
+
+            return numbering
 
     return None
 

From 330b10c85bbd95121e271d537688b3a8350ffa8e Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 6 Feb 2024 11:59:20 -0500
Subject: [PATCH 2/3] more flexible splitting out of textless pages

---
 dlibrary/dlibrary.py | 117 +++++++++++++++++++++++++------------------
 1 file changed, 68 insertions(+), 49 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 63a6983..cf5d4fc 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
 FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
 
 TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
-ALT_VERSIONS = ['褐色', '日焼け']
+ALT_VERSIONS = ['褐色', '日焼け', 'pink']
 
 IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
 
@@ -179,12 +179,12 @@ def image_xrefs(pdf):
     print('\nSuccess')
     return xrefs
 
-def link_pdf(src, dest, start_index=0):
+def link_pdf(src, dest, start_index):
     with fitz.open(src) as pdf:
         xrefs = image_xrefs(pdf)
         if xrefs is None:
             print(f'Support for weirder PDFs not yet implemented, skipping {src}')
-            return
+            return None
 
         dest.mkdir(parents=True, exist_ok=True)
         for (idx, xref) in enumerate(xrefs, start=start_index):
@@ -193,6 +193,8 @@ def link_pdf(src, dest, start_index=0):
             with open(file_path, 'wb') as f:
                 f.write(image["image"])
 
+        return pdf.page_count
+
 def complete_prefix_number_ordering(entries):
     if len(entries) == 1:
         return entries
@@ -272,7 +274,7 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
 
     return None
 
-def link_ordered_files(ordering, dest, start_index=0):
+def link_ordered_files(ordering, dest, start_index):
     dest.mkdir(parents=True, exist_ok=True)
 
     for (idx, src_path) in enumerate(ordering, start=start_index):
@@ -293,6 +295,9 @@ def collate(args):
     extraction_dir = args.destdir / 'extract'
     hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
 
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
+    collation_staging_area.mkdir(parents=True)
+
     for work_path in extraction_dir.iterdir():
         work_id = work_path.name
 
@@ -304,51 +309,64 @@ def collate(args):
         if virtual == (1,):
             continue
 
-        if work_id in hint_map:
-            hint = hint_map[work_id]
-            entries = [hint] if hint.is_file() else ls_ignore(hint)
+        work_staging_dir = collation_staging_area / work_id
+
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
+        if pages_collated:
+            print(f'Collated {pages_collated} pages for {work_id}')
+            work_staging_dir.rename(collation_dir)
         else:
-            search_dir = work_path
-            while True:
-                entries = ls_ignore(search_dir)
-                if len(entries) == 1 and entries[0].is_dir():
-                    search_dir = entries[0]
-                else:
-                    break
+            if work_staging_dir.is_dir():
+                for f in work_staging_dir.iterdir():
+                    f.unlink()
+                work_staging_dir.rmdir()
 
-        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
-            print(f'Extracting images from {entries[0]} for {work_id}')
-            link_pdf(entries[0], collation_dir)
-            continue
-
-        if len(entries) == 0:
-            print(f'{work_id} contains no files? skipping')
-            continue
-
-        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
-            ordering = complete_prefix_number_ordering(entries)
-            if not ordering:
-                with_text = []
-                textless = []
-                for entry in entries:
-                    if TEXTLESS_REGEX.search(entry.name):
-                        textless.append(entry)
-                    else:
-                        with_text.append(entry)
-                if with_text and textless:
-                    with_text_ordering = complete_prefix_number_ordering(with_text)
-                    textless_ordering = complete_prefix_number_ordering(textless)
-                    if with_text_ordering and textless_ordering:
-                        ordering = with_text_ordering + textless_ordering
-            if ordering:
-                print(f'Symlinking image files for {work_id}')
-                link_ordered_files(ordering, collation_dir)
-                continue
-
-        print(f'Unable to deduce file structure for {work_id}, skipping')
+            if pages_collated == 0:
+                print(f'{work_id} contains no files? skipping')
+            elif pages_collated is None:
+                print(f'Unable to deduce file structure for {work_id}, skipping')
 
+    collation_staging_area.rmdir()
     con.close()
 
+def collate_from_paths(srcs, dest, start_index):
+    """Collate the sources into ``dest`` as pages numbered from ``start_index``.
+
+    Returns the number of pages collated (0 when ``srcs`` is empty), or None
+    when the file structure cannot be deduced.
+    """
+    if not srcs:
+        return 0
+    # A lone directory is descended into; a lone PDF has its images extracted.
+    if len(srcs) == 1:
+        if srcs[0].is_dir():
+            return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
+        if srcs[0].suffix.lower() == '.pdf':
+            print(f'Extracting images from {srcs[0]}')
+            return link_pdf(srcs[0], dest, start_index)
+
+    # Sources whose names are flagged as textless are collated after the rest.
+    textless = [src for src in srcs if TEXTLESS_REGEX.search(src.name)]
+    with_text = [src for src in srcs if not TEXTLESS_REGEX.search(src.name)]
+    if with_text and textless:
+        total = 0
+        for group in (with_text, textless):
+            pages = collate_from_paths(group, dest, start_index + total)
+            if pages is None:
+                return None
+            total += pages
+        return total
+
+    # Only a flat list of image files can be ordered by its filename numbering.
+    if not all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs):
+        return None
+    ordering = complete_prefix_number_ordering(srcs)
+    if not ordering:
+        return None
+    print(f'Symlinking image files: {ordering[0]}...')
+    link_ordered_files(ordering, dest, start_index)
+    return len(ordering)
+
 def self_and_parents(path):
     return [path] + list(path.parents)
 
@@ -375,15 +393,16 @@ def manual_collate(args):
             if ordering is None:
                 ordering = entries
                 ordering.sort()
-            link_ordered_files(ordering, collation_dir, start_index=index)
+            link_ordered_files(ordering, collation_dir, index)
             index += len(ordering)
         elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
-            link_ordered_files([path], collation_dir, start_index=index)
+            link_ordered_files([path], collation_dir, index)
             index += 1
         elif path.suffix.lower() == ".pdf":
-            link_pdf(path, collation_dir, start_index=index)
-            with fitz.open(path) as pdf:
-                index += pdf.page_count
+            pdf_page_count = link_pdf(path, collation_dir, index)
+            if pdf_page_count is None:
+                return
+            index += pdf_page_count
         else:
             print(f'Unknown file type {path}, stopping')
             return

From 657ec65e4a3d788f42af8b5511789f0ead101e93 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 6 Feb 2024 12:19:11 -0500
Subject: [PATCH 3/3] handle alphabetic suffixes

---
 dlibrary/dlibrary.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index cf5d4fc..deaf3ee 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -264,7 +264,7 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
                 if len(numbering[idx]) > 1:
                     ents_idx = numbering.pop(idx)
                     next_layer_start = pos + NUMBER_REGEX.match(ents_idx[0].name[pos:]).end()
-                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start)
+                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                     if not sub_numbering:
                         return None
                     for sub_idx in sub_numbering:
@@ -274,6 +274,22 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
 
     return None
 
+def alphabetic_numbering(entries, start_point):
+    """Index entries by an optional one-letter suffix; None if that fails."""
+    alphabetized = {}
+    for entry in entries:
+        ending = entry.stem[start_point:]
+        if len(ending) > 1:
+            return None
+        index = 0 if ending == '' else ord(ending.lower()) - ord('a')
+        if (index,) in alphabetized:  # keys are 1-tuples, so test the tuple, not the bare int
+            return None
+        alphabetized[(index,)] = [entry]
+    # The indices must cover 0..n-1 without gaps (no suffix counts as 0, like 'a').
+    if sorted(alphabetized) != [(i,) for i in range(len(alphabetized))]:
+        return None
+    return alphabetized
+
 def link_ordered_files(ordering, dest, start_index):
     dest.mkdir(parents=True, exist_ok=True)