diff --git a/dlibrary.py b/dlibrary.py index 9f04a08..aff179c 100755 --- a/dlibrary.py +++ b/dlibrary.py @@ -106,55 +106,12 @@ def fetch(args): asyncio.run(fetch_async(args)) -def collate(args): - con = sqlite3.connect(args.destdir / 'meta.db') - cur = con.cursor() - - for work_path in (args.destdir / 'extract').iterdir(): - work_id = work_path.name - - collation_dir = args.destdir / 'site' / 'works' / work_id - if collation_dir.exists(): - continue - - virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone() - if virtual == (1,): - continue - - search_dir = work_path - while True: - entries = list(search_dir.iterdir()) - if len(entries) == 1 and entries[0].is_dir(): - search_dir = entries[0] - else: - break - - if len(entries) == 1 and entries[0].suffix.lower() == '.pdf': - print(f'Extracting images from {entries[0].name} for {work_id}') - link_pdf(entries[0], collation_dir) - continue - - if len(entries) == 0: - print(f'{work_id} contains no files? skipping') - continue - - if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries): - ordering = complete_prefix_number_ordering(entries) - if ordering: - print(f'Symlinking image files for {work_id}') - link_ordered_files(ordering, collation_dir) - continue - - print(f'Unable to deduce file structure for {work_id}, skipping') - - con.close() - -def link_pdf(src, dest): +def link_pdf(src, dest, start_index=0): with fitz.open(src) as pdf: images_by_page = [page.get_images() for page in pdf] if all(len(images) == 1 for images in images_by_page): - dest.mkdir(parents=True) - for (idx, images) in enumerate(images_by_page): + dest.mkdir(parents=True, exist_ok=True) + for (idx, images) in enumerate(images_by_page, start=start_index): xref = images[0][0] image = pdf.extract_image(xref) file_path = dest / f'{idx:04d}.{image["ext"]}' @@ -184,14 +141,92 @@ def complete_prefix_number_ordering(entries): return [e for (e, i) in entries_with_indices] return None 
def link_ordered_files(ordering, dest, start_index=0):
    """Symlink *ordering*'s files into *dest* as 0000.ext, 0001.ext, ...

    ordering    -- sequence of Paths, in the page order to present
    dest        -- collation directory (created if missing)
    start_index -- first number to use, so multiple sources can be
                   appended into one collation directory
    """
    dest.mkdir(parents=True, exist_ok=True)

    for (idx, src_path) in enumerate(ordering, start=start_index):
        ext = src_path.suffix.lower()
        link_path = dest / f'{idx:04d}{ext}'
        # relative symlink so the site tree can be moved wholesale
        link_path.symlink_to(relpath(src_path, dest))


def collate(args):
    """Build one ordered image sequence per extracted work.

    For each work under <destdir>/extract, deduce its page files and
    collate them (symlinks / PDF extraction) into
    <destdir>/site/works/<work_id>.  Works already collated, or marked
    virtual in meta.db, are skipped.  args.hints may pre-select a
    subdirectory or PDF inside a work's extraction folder (at most one
    hint per work).
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    extraction_dir = args.destdir / 'extract'
    # Map each hint to its work: the work id is the first path component
    # below the extraction dir.  (parts[0] also handles a hint that IS a
    # top-level entry, where parents[-2] would raise IndexError.)
    hint_map = {hint.relative_to(extraction_dir).parts[0]: hint for hint in args.hints}

    for work_path in extraction_dir.iterdir():
        work_id = work_path.name

        collation_dir = args.destdir / 'site' / 'works' / work_id
        if collation_dir.exists():
            continue

        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue

        if work_id in hint_map:
            # A hint names either a directory of pages or a single PDF.
            hint = hint_map[work_id]
            entries = list(hint.iterdir()) if hint.is_dir() else [hint]
        else:
            # Descend through single-directory wrappers (e.g. an archive
            # that extracted to one nested folder).
            search_dir = work_path
            while True:
                entries = list(search_dir.iterdir())
                if len(entries) == 1 and entries[0].is_dir():
                    search_dir = entries[0]
                else:
                    break

        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
            print(f'Extracting images from {entries[0]} for {work_id}')
            link_pdf(entries[0], collation_dir)
            continue

        if len(entries) == 0:
            print(f'{work_id} contains no files? skipping')
            continue

        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
            ordering = complete_prefix_number_ordering(entries)
            if ordering:
                print(f'Symlinking image files for {work_id}')
                link_ordered_files(ordering, collation_dir)
                continue

        print(f'Unable to deduce file structure for {work_id}, skipping')

    con.close()


def manual_collate(args):
    """Collate one work from an explicit list of paths.

    args.paths are processed in order; images are symlinked, PDFs have
    their pages extracted, and directories contribute all their image
    files (prefix-number order when deducible, else sorted).  Numbering
    continues across paths via the running *index*.
    """
    collation_dir = args.destdir / 'site' / 'works' / args.work_id
    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
        print('Collation directory already exists!')
        return
    collation_dir.mkdir(parents=True, exist_ok=True)

    index = 0
    for path in args.paths:
        if path.is_dir():
            entries = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS]
            ordering = complete_prefix_number_ordering(entries)
            if ordering is None:
                # Best effort: fall back to lexicographic order.
                ordering = entries
                ordering.sort()
            link_ordered_files(ordering, collation_dir, start_index=index)
            index += len(ordering)
        elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
            link_ordered_files([path], collation_dir, start_index=index)
            index += 1
        elif path.suffix.lower() == ".pdf":
            link_pdf(path, collation_dir, start_index=index)
            # link_pdf numbers one image per page, so advance by page count.
            with fitz.open(path) as pdf:
                index += pdf.page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return
help='collate a specific work manually, specifying all paths to include') +parser_manual_collate.add_argument('work_id') +parser_manual_collate.add_argument( + 'paths', + metavar='PATH', + type=Path, + nargs='+', + help='paths of files (images to symlink, pdfs to extract) or directories (symlink all images in the directory, no recursion, best-effort sorting)' +) +parser_manual_collate.set_defaults(func=manual_collate) + parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work') parser_metadata.add_argument('work_id') parser_metadata.add_argument(