diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 63a6983..cf5d4fc 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -25,7 +25,7 @@
 FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
 FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
 TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
-ALT_VERSIONS = ['褐色', '日焼け']
+ALT_VERSIONS = ['褐色', '日焼け', 'pink']
 
 IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
 
@@ -179,12 +179,12 @@ def image_xrefs(pdf):
     print('\nSuccess')
     return xrefs
 
-def link_pdf(src, dest, start_index=0):
+def link_pdf(src, dest, start_index):
     with fitz.open(src) as pdf:
         xrefs = image_xrefs(pdf)
         if xrefs is None:
             print(f'Support for weirder PDFs not yet implemented, skipping {src}')
-            return
+            return None
 
         dest.mkdir(parents=True, exist_ok=True)
         for (idx, xref) in enumerate(xrefs, start=start_index):
@@ -193,6 +193,8 @@ def link_pdf(src, dest, start_index):
             with open(file_path, 'wb') as f:
                 f.write(image["image"])
 
+        return pdf.page_count
+
 def complete_prefix_number_ordering(entries):
     if len(entries) == 1:
         return entries
@@ -272,7 +274,7 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
 
     return None
 
-def link_ordered_files(ordering, dest, start_index=0):
+def link_ordered_files(ordering, dest, start_index):
     dest.mkdir(parents=True, exist_ok=True)
 
     for (idx, src_path) in enumerate(ordering, start=start_index):
@@ -293,6 +295,9 @@ def collate(args):
     extraction_dir = args.destdir / 'extract'
     hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
 
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
+    collation_staging_area.mkdir(parents=True)
+
     for work_path in extraction_dir.iterdir():
         work_id = work_path.name
 
@@ -304,51 +309,64 @@ def collate(args):
         if virtual == (1,):
             continue
 
-        if work_id in hint_map:
-            hint = hint_map[work_id]
-            entries = [hint] if hint.is_file() else ls_ignore(hint)
+        work_staging_dir = collation_staging_area / work_id
+
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
+        if pages_collated:
+            print(f'Collated {pages_collated} pages for {work_id}')
+            work_staging_dir.rename(collation_dir)
         else:
-            search_dir = work_path
-            while True:
-                entries = ls_ignore(search_dir)
-                if len(entries) == 1 and entries[0].is_dir():
-                    search_dir = entries[0]
-                else:
-                    break
+            if work_staging_dir.is_dir():
+                for f in work_staging_dir.iterdir():
+                    f.unlink()
+                work_staging_dir.rmdir()
 
-        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
-            print(f'Extracting images from {entries[0]} for {work_id}')
-            link_pdf(entries[0], collation_dir)
-            continue
-
-        if len(entries) == 0:
-            print(f'{work_id} contains no files? skipping')
-            continue
-
-        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
-            ordering = complete_prefix_number_ordering(entries)
-            if not ordering:
-                with_text = []
-                textless = []
-                for entry in entries:
-                    if TEXTLESS_REGEX.search(entry.name):
-                        textless.append(entry)
-                    else:
-                        with_text.append(entry)
-                if with_text and textless:
-                    with_text_ordering = complete_prefix_number_ordering(with_text)
-                    textless_ordering = complete_prefix_number_ordering(textless)
-                    if with_text_ordering and textless_ordering:
-                        ordering = with_text_ordering + textless_ordering
-            if ordering:
-                print(f'Symlinking image files for {work_id}')
-                link_ordered_files(ordering, collation_dir)
-                continue
-
-        print(f'Unable to deduce file structure for {work_id}, skipping')
+            if pages_collated == 0:
+                print(f'{work_id} contains no files? skipping')
+            elif pages_collated is None:
+                print(f'Unable to deduce file structure for {work_id}, skipping')
 
+    collation_staging_area.rmdir()
     con.close()
 
+def collate_from_paths(srcs, dest, start_index):
+    if len(srcs) == 1 and srcs[0].is_dir():
+        return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
+
+    if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf':
+        print(f'Extracting images from {srcs[0]}')
+        return link_pdf(srcs[0], dest, start_index)
+
+    if len(srcs) == 0:
+        return 0
+
+    with_text = []
+    textless = []
+    for src in srcs:
+        if TEXTLESS_REGEX.search(src.name):
+            textless.append(src)
+        else:
+            with_text.append(src)
+    if with_text and textless:
+        text_pages = collate_from_paths(with_text, dest, start_index)
+        if text_pages is None:
+            return None
+        textless_pages = collate_from_paths(textless, dest, start_index+text_pages)
+        if textless_pages is None:
+            return None
+        return text_pages + textless_pages
+
+    if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs):
+        ordering = complete_prefix_number_ordering(srcs)
+        if ordering:
+            print(f'Symlinking image files: {ordering[0]}...')
+            link_ordered_files(ordering, dest, start_index)
+            return len(ordering)
+        else:
+            return None
+
+    return None
+
 def self_and_parents(path):
     return [path] + list(path.parents)
 
@@ -375,15 +393,16 @@ def manual_collate(args):
             if ordering is None:
                 ordering = entries
                 ordering.sort()
-            link_ordered_files(ordering, collation_dir, start_index=index)
+            link_ordered_files(ordering, collation_dir, index)
             index += len(ordering)
         elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
-            link_ordered_files([path], collation_dir, start_index=index)
+            link_ordered_files([path], collation_dir, index)
             index += 1
         elif path.suffix.lower() == ".pdf":
-            link_pdf(path, collation_dir, start_index=index)
-            with fitz.open(path) as pdf:
-                index += pdf.page_count
+            pdf_page_count = link_pdf(path, collation_dir, index)
+            if pdf_page_count is None:
+                return
+            index += pdf_page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return
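
Note on the new control flow (an illustration, not part of the patch): collate_from_paths and link_pdf now communicate through their return value, reporting the number of pages collated on success, 0 for an empty work, and None when no ordering can be deduced, and a textless variant of a work is numbered after the text variant by advancing start_index. The sketch below is a minimal standalone model of that contract; fake_collate and TEXTLESS_MARKERS are hypothetical stand-ins, and the real regex matching, symlinking, and PDF extraction are not reproduced.

# Illustration only: a simplified model of the return-value contract used by
# collate_from_paths / link_pdf in the patch above.  fake_collate and
# TEXTLESS_MARKERS are hypothetical stand-ins, not code from dlibrary.py.

TEXTLESS_MARKERS = ('notext', 'textless')  # simplified stand-in for TEXTLESS_REGEX

def fake_collate(names, start_index):
    """Pretend to collate a flat list of image names.

    Returns the number of pages collated, 0 for an empty list, or None when
    no ordering can be deduced (the failure branch is not exercised here).
    """
    if not names:
        return 0
    with_text = [n for n in names if not any(m in n for m in TEXTLESS_MARKERS)]
    textless = [n for n in names if any(m in n for m in TEXTLESS_MARKERS)]
    if with_text and textless:
        text_pages = fake_collate(with_text, start_index)
        if text_pages is None:
            return None
        # Textless pages continue numbering where the text version stopped.
        textless_pages = fake_collate(textless, start_index + text_pages)
        if textless_pages is None:
            return None
        return text_pages + textless_pages
    for idx, name in enumerate(names, start=start_index):
        print(f'{idx:04d} <- {name}')
    return len(names)

if __name__ == '__main__':
    total = fake_collate(['01.png', '02.png', '01_textless.png', '02_textless.png'], 0)
    print(f'collated {total} pages')

Run as a script, this numbers the two text pages first (0000, 0001) and continues with the textless pages (0002, 0003), which is how collate ends up with a single page total to report, or a 0/None to explain, for each work.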