more flexible splitting out of textless pages

2024-02-06 11:59:20 -05:00 · 2024-02-06 11:59:20 -05:00 · 330b10c85b
commit 330b10c85b
parent aefaf824a8
1 changed files with 68 additions and 49 deletions
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
 FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
 TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
-ALT_VERSIONS = ['褐色', '日焼け']
+ALT_VERSIONS = ['褐色', '日焼け', 'pink']
 IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
@ -179,12 +179,12 @@ def image_xrefs(pdf):
    print('\nSuccess')
    return xrefs
-def link_pdf(src, dest, start_index=0):
+def link_pdf(src, dest, start_index):
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
-            return
+            return None
        dest.mkdir(parents=True, exist_ok=True)
        for (idx, xref) in enumerate(xrefs, start=start_index):
@ -193,6 +193,8 @@ def link_pdf(src, dest, start_index=0):
            with open(file_path, 'wb') as f:
                f.write(image["image"])
        return pdf.page_count
 def complete_prefix_number_ordering(entries):
    if len(entries) == 1:
        return entries
@ -272,7 +274,7 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
    return None
-def link_ordered_files(ordering, dest, start_index=0):
+def link_ordered_files(ordering, dest, start_index):
    dest.mkdir(parents=True, exist_ok=True)
    for (idx, src_path) in enumerate(ordering, start=start_index):
@ -293,6 +295,9 @@ def collate(args):
    extraction_dir = args.destdir / 'extract'
    hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
    collation_staging_area = args.destdir / 'site' / 'images-staging'
    collation_staging_area.mkdir(parents=True)
    for work_path in extraction_dir.iterdir():
        work_id = work_path.name
@ -304,51 +309,64 @@ def collate(args):
        if virtual == (1,):
            continue
-        if work_id in hint_map:
+        work_staging_dir = collation_staging_area / work_id
-            hint = hint_map[work_id]
+
-            entries = [hint] if hint.is_file() else ls_ignore(hint)
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
        if pages_collated:
            print(f'Collated {pages_collated} pages for {work_id}')
            work_staging_dir.rename(collation_dir)
        else:
-            search_dir = work_path
+            if work_staging_dir.is_dir():
-            while True:
+                for f in work_staging_dir.iterdir():
-                entries = ls_ignore(search_dir)
+                    f.unlink()
-                if len(entries) == 1 and entries[0].is_dir():
+                work_staging_dir.rmdir()
                    search_dir = entries[0]
                else:
                    break
-        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
+            if pages_collated == 0:
-            print(f'Extracting images from {entries[0]} for {work_id}')
+                print(f'{work_id} contains no files? skipping')
-            link_pdf(entries[0], collation_dir)
+            elif pages_collated is None:
-            continue
+                print(f'Unable to deduce file structure for {work_id}, skipping')
        if len(entries) == 0:
            print(f'{work_id} contains no files? skipping')
            continue
        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
            ordering = complete_prefix_number_ordering(entries)
            if not ordering:
                with_text = []
                textless = []
                for entry in entries:
                    if TEXTLESS_REGEX.search(entry.name):
                        textless.append(entry)
                    else:
                        with_text.append(entry)
                if with_text and textless:
                    with_text_ordering = complete_prefix_number_ordering(with_text)
                    textless_ordering = complete_prefix_number_ordering(textless)
                    if with_text_ordering and textless_ordering:
                        ordering = with_text_ordering + textless_ordering
            if ordering:
                print(f'Symlinking image files for {work_id}')
                link_ordered_files(ordering, collation_dir)
                continue
        print(f'Unable to deduce file structure for {work_id}, skipping')
    collation_staging_area.rmdir()
    con.close()
 def collate_from_paths(srcs, dest, start_index):
    """Collate the pages reachable from ``srcs`` into ``dest``.

    A lone directory is replaced by its ``ls_ignore`` listing, and a lone
    ``.pdf`` is handed to ``link_pdf``. Otherwise, entries whose names match
    ``TEXTLESS_REGEX`` are split out and collated after the remaining
    entries, so their numbering continues where the with-text pages stopped.
    A group made up solely of image files is ordered via
    ``complete_prefix_number_ordering`` and passed to ``link_ordered_files``.

    Returns the number of pages collated (0 for empty ``srcs``; for a PDF,
    whatever ``link_pdf`` returns), or None when no usable structure is found.
    Output files are numbered starting from ``start_index``.
    """
    n_srcs = len(srcs)
    if n_srcs == 0:
        return 0

    if n_srcs == 1:
        only = srcs[0]
        if only.is_dir():
            # Descend through a single wrapping directory.
            return collate_from_paths(ls_ignore(only), dest, start_index)
        if only.suffix.lower() == '.pdf':
            print(f'Extracting images from {only}')
            return link_pdf(only, dest, start_index)

    # Partition the entries by whether their name marks them as textless.
    with_text = []
    textless = []
    for candidate in srcs:
        bucket = textless if TEXTLESS_REGEX.search(candidate.name) else with_text
        bucket.append(candidate)

    if with_text and textless:
        # With-text pages go first; textless pages continue the numbering.
        text_pages = collate_from_paths(with_text, dest, start_index)
        if text_pages is not None:
            textless_pages = collate_from_paths(textless, dest, start_index + text_pages)
            if textless_pages is not None:
                return text_pages + textless_pages
        return None

    # Anything other than a flat set of image files cannot be ordered here.
    for candidate in srcs:
        if not (candidate.is_file() and candidate.suffix.lower() in IMAGE_FILE_EXTENSIONS):
            return None

    ordering = complete_prefix_number_ordering(srcs)
    if not ordering:
        return None
    print(f'Symlinking image files: {ordering[0]}...')
    link_ordered_files(ordering, dest, start_index)
    return len(ordering)
 def self_and_parents(path):
    """Return ``path`` followed by each of its ancestors, nearest first."""
    return [path, *path.parents]
@ -375,15 +393,16 @@ def manual_collate(args):
            if ordering is None:
                ordering = entries
                ordering.sort()
-            link_ordered_files(ordering, collation_dir, start_index=index)
+            link_ordered_files(ordering, collation_dir, index)
            index += len(ordering)
        elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
-            link_ordered_files([path], collation_dir, start_index=index)
+            link_ordered_files([path], collation_dir, index)
            index += 1
        elif path.suffix.lower() == ".pdf":
-            link_pdf(path, collation_dir, start_index=index)
+            pdf_page_count = link_pdf(path, collation_dir, index)
-            with fitz.open(path) as pdf:
+            if pdf_page_count is None:
-                index += pdf.page_count
+                return
            index += pdf_page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return