diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index deaf3ee..ec567ed 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$') FAKKU_ID_REGEX = re.compile('.*_FAKKU$') TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless') -ALT_VERSIONS = ['褐色', '日焼け', 'pink'] +ALT_VERSIONS = ['褐色', '日焼け'] IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff'] @@ -179,12 +179,12 @@ def image_xrefs(pdf): print('\nSuccess') return xrefs -def link_pdf(src, dest, start_index): +def link_pdf(src, dest, start_index=0): with fitz.open(src) as pdf: xrefs = image_xrefs(pdf) if xrefs is None: print(f'Support for weirder PDFs not yet implemented, skipping {src}') - return None + return dest.mkdir(parents=True, exist_ok=True) for (idx, xref) in enumerate(xrefs, start=start_index): @@ -193,8 +193,6 @@ def link_pdf(src, dest, start_index): with open(file_path, 'wb') as f: f.write(image["image"]) - return pdf.page_count - def complete_prefix_number_ordering(entries): if len(entries) == 1: return entries @@ -204,7 +202,7 @@ def complete_prefix_number_ordering(entries): version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name) entries_by_version.setdefault(version, []).append(entry) - numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version} + numberings_by_version = {ver: prefix_numbering(entries_by_version[ver]) for ver in entries_by_version} unified_indices = set() for numbering in numberings_by_version.values(): @@ -214,15 +212,8 @@ def complete_prefix_number_ordering(entries): unified_indices = list(unified_indices) unified_indices.sort() - if len(unified_indices) > 1: - for i in range(1, len(unified_indices)): - cur = unified_indices[i] - prev = unified_indices[i-1] - for level in range(min(len(cur), len(prev))): - if cur[level] != prev[level]: - if cur[level] - prev[level] > 2: - return None - break + if len(unified_indices) > 1 and min(unified_indices[i] - unified_indices[i-1] for i in range(1, len(unified_indices))) > 2: + return None versions = list(numberings_by_version.keys()) versions.sort() @@ -240,57 +231,31 @@ def complete_prefix_number_ordering(entries): for out_ver in outer_versions: for i in unified_indices: for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])): - result += numberings_by_version[ver].get(i, []) + entries_i_ver = numberings_by_version[ver].get(i, []) + if len(entries_i_ver) <= 1: + result += entries_i_ver + else: + return None return result -def unique_hierarchical_prefix_numbering(entries, start_point=0): +def prefix_numbering(entries): matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name))) for m in matches: pos = m.start() - if pos < start_point: - return None prefix = entries[0].name[:pos] if all(e.name.startswith(prefix) for e in entries): - numbering = {} + entries_by_index = {} for e in entries: n = NUMBER_REGEX.match(e.name[pos:]) if n is None: return None i = int(n.group()) - numbering.setdefault((i,), []).append(e) - - indices = list(numbering.keys()) - for idx in indices: - if len(numbering[idx]) > 1: - ents_idx = numbering.pop(idx) - next_layer_start = pos + NUMBER_REGEX.match(ents_idx[0].name[pos:]).end() - sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start) - if not sub_numbering: - return None - for sub_idx in sub_numbering: - numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx] - - return numbering + entries_by_index.setdefault(i, []).append(e) + return entries_by_index return None -def alphabetic_numbering(entries, start_point): - alphabetized = {} - for entry in entries: - ending = entry.stem[start_point:] - if len(ending) > 1: - return None - index = 0 if ending == '' else ord(ending.lower()) - ord('a') - if index in alphabetized: - return None - alphabetized[(index,)] = [entry] - indices = list(alphabetized.keys()) - indices.sort() - if indices != [(i,) for i in range(len(indices))]: - return None - return alphabetized - -def link_ordered_files(ordering, dest, start_index): +def link_ordered_files(ordering, dest, start_index=0): dest.mkdir(parents=True, exist_ok=True) for (idx, src_path) in enumerate(ordering, start=start_index): @@ -311,9 +276,6 @@ def collate(args): extraction_dir = args.destdir / 'extract' hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints} - collation_staging_area = args.destdir / 'site' / 'images-staging' - collation_staging_area.mkdir(parents=True) - for work_path in extraction_dir.iterdir(): work_id = work_path.name @@ -325,64 +287,51 @@ def collate(args): if virtual == (1,): continue - work_staging_dir = collation_staging_area / work_id - - pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0) - if pages_collated: - print(f'Collated {pages_collated} pages for {work_id}') - work_staging_dir.rename(collation_dir) + if work_id in hint_map: + hint = hint_map[work_id] + entries = [hint] if hint.is_file() else ls_ignore(hint) else: - if work_staging_dir.is_dir(): - for f in work_staging_dir.iterdir(): - f.unlink() - work_staging_dir.rmdir() + search_dir = work_path + while True: + entries = ls_ignore(search_dir) + if len(entries) == 1 and entries[0].is_dir(): + search_dir = entries[0] + else: + break - if pages_collated == 0: - print(f'{work_id} contains no files? skipping') - elif pages_collated is None: - print(f'Unable to deduce file structure for {work_id}, skipping') + if len(entries) == 1 and entries[0].suffix.lower() == '.pdf': + print(f'Extracting images from {entries[0]} for {work_id}') + link_pdf(entries[0], collation_dir) + continue + + if len(entries) == 0: + print(f'{work_id} contains no files? skipping') + continue + + if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries): + ordering = complete_prefix_number_ordering(entries) + if not ordering: + with_text = [] + textless = [] + for entry in entries: + if TEXTLESS_REGEX.search(entry.name): + textless.append(entry) + else: + with_text.append(entry) + if with_text and textless: + with_text_ordering = complete_prefix_number_ordering(with_text) + textless_ordering = complete_prefix_number_ordering(textless) + if with_text_ordering and textless_ordering: + ordering = with_text_ordering + textless_ordering + if ordering: + print(f'Symlinking image files for {work_id}') + link_ordered_files(ordering, collation_dir) + continue + + print(f'Unable to deduce file structure for {work_id}, skipping') - collation_staging_area.rmdir() con.close() -def collate_from_paths(srcs, dest, start_index): - if len(srcs) == 1 and srcs[0].is_dir(): - return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) - - if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf': - print(f'Extracting images from {srcs[0]}') - return link_pdf(srcs[0], dest, start_index) - - if len(srcs) == 0: - return 0 - - with_text = [] - textless = [] - for src in srcs: - if TEXTLESS_REGEX.search(src.name): - textless.append(src) - else: - with_text.append(src) - if with_text and textless: - text_pages = collate_from_paths(with_text, dest, start_index) - if text_pages is None: - return None - textless_pages = collate_from_paths(textless, dest, start_index+text_pages) - if textless_pages is None: - return None - return text_pages + textless_pages - - if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs): - ordering = complete_prefix_number_ordering(srcs) - if ordering: - print(f'Symlinking image files: {ordering[0]}...') - link_ordered_files(ordering, dest, start_index) - return len(ordering) - else: - return None - - return None - def self_and_parents(path): return [path] + list(path.parents) @@ -409,16 +358,15 @@ def manual_collate(args): if ordering is None: ordering = entries ordering.sort() - link_ordered_files(ordering, collation_dir, index) + link_ordered_files(ordering, collation_dir, start_index=index) index += len(ordering) elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS: - link_ordered_files([path], collation_dir, index) + link_ordered_files([path], collation_dir, start_index=index) index += 1 elif path.suffix.lower() == ".pdf": - pdf_page_count = link_pdf(path, collation_dir, index) - if pdf_page_count is None: - return - index += pdf_page_count + link_pdf(path, collation_dir, start_index=index) + with fitz.open(path) as pdf: + index += pdf.page_count else: print(f'Unknown file type {path}, stopping') return