diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 30f2e62..f1cbf3d 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -341,20 +341,20 @@ def is_image(path): def ignoreable(path): return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS) -def ls_ignore(directory, exclude): +def ls_ignore(directory): return [ path for path in directory.iterdir() - if not ignoreable(path) and path not in exclude + if not ignoreable(path) ] -def descendant_files_ignore(path, exclude): +def descendant_files_ignore(path): if path.is_file(): return [path] result = [] - for item in ls_ignore(path, exclude): + for item in ls_ignore(path): if item.is_dir(): - result.extend(descendant_files_ignore(item, exclude)) + result.extend(descendant_files_ignore(item)) else: result.append(item) @@ -365,7 +365,7 @@ def collate(args): cur = con.cursor() extraction_dir = args.destdir / 'extract' - hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints} + hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints} collation_staging_area = args.destdir / 'site' / 'images-staging' collation_staging_area.mkdir(parents=True) @@ -383,7 +383,7 @@ def collate(args): work_staging_dir = collation_staging_area / work_id - pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, []) + pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0) if pages_collated: print(f'Collated {pages_collated} pages for {work_id}') work_staging_dir.rename(collation_dir) @@ -401,7 +401,7 @@ def collate(args): collation_staging_area.rmdir() con.close() -def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None): +def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None): early_srcs = [] middle_srcs = [] late_srcs = [] @@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, late if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1: return False - early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude) + early_page_count = collate_from_paths(early_srcs, dest, start_index) if early_page_count is None: return None start_index += early_page_count - middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude) + middle_page_count = collate_from_paths(middle_srcs, dest, start_index) if middle_page_count is None: return None start_index += middle_page_count - late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude) + late_page_count = collate_from_paths(late_srcs, dest, start_index) if late_page_count is None: return None @@ -458,13 +458,13 @@ def median(items): def superior_or_equal(a, b): return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) -def try_collate_images_vs_pdf(srcs, dest, start_index, exclude): +def try_collate_images_vs_pdf(srcs, dest, start_index): pdfs = [src for src in srcs if 'pdf' in src.name.lower()] if len(pdfs) != 1: return False outer_pdf = pdfs[0] - inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)] + inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)] if len(inner_pdfs) != 1: return False inner_pdf = inner_pdfs[0] @@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude): non_pdf_srcs = [src for src in srcs if src != outer_pdf] images = [] non_images = [] - descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)] + descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)] for f in descendant_files: if is_image(f): images.append(f) @@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude): return False if superior_or_equal(median_standalone_size, median_pdf_size): - return collate_from_paths(non_pdf_srcs, dest, start_index, exclude) + return collate_from_paths(non_pdf_srcs, dest, start_index) elif superior_or_equal(median_pdf_size, median_standalone_size): - return collate_from_paths([outer_pdf], dest, start_index, exclude) + return collate_from_paths([outer_pdf], dest, start_index) else: return False -def collate_from_paths(srcs, dest, start_index, exclude): +def collate_from_paths(srcs, dest, start_index): if len(srcs) == 1 and srcs[0].is_dir(): - return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude) + return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) if len(srcs) == 1 and is_pdf(srcs[0]): print(f'Extracting images from {srcs[0]}') @@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index, exclude): if len(hi_res_dirs) == 1: hi_res_dir = hi_res_dirs[0] lo_res_dir = next(src for src in srcs if src != hi_res_dir) - if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)): - return collate_from_paths([hi_res_dir], dest, start_index, exclude) + if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)): + return collate_from_paths([hi_res_dir], dest, start_index) - textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX) + textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX) if textless_split != False: return textless_split - epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX) + epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX) if epilogue_split != False: return epilogue_split - cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX) + cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX) if cover_split != False: return cover_split @@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index, exclude): else: return None - images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude) + images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index) if images_vs_pdf != False: return images_vs_pdf @@ -549,104 +549,49 @@ def collate_from_paths(srcs, dest, start_index, exclude): def self_and_parents(path): return [path] + list(path.parents) -def parse_expressions(tokens): - groups = [] - exclusions = [] - - while tokens: - token = tokens.pop(0) - if token == '!': - exclusions.extend(parse_exclusion(tokens)) - elif token == '(': - groups.append(parse_group(tokens)) - else: - groups.append([token]) - - return (groups, exclusions) - -def parse_exclusion(tokens): - token = tokens.pop(0) - - if token == '(': - return parse_group(tokens) - else: - return [token] - -def parse_group(tokens): - items = [] - - while True: - token = tokens.pop(0) - if token == ')': - return items - else: - items.append(token) - -def normalize_to(path, ref): - return ref / Path(relpath(path, ref)) - def manual_collate(args): - (raw_groups, raw_exclusions) = parse_expressions(args.expression) + work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name - exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions] + collation_dir = args.destdir / 'site' / 'images' / work_id + if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0: + print(f'Collation directory already exists!') + return - if raw_groups: - groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups] - else: - groups = [[args.destdir / 'extract' / args.work_id]] - - collation_dir = args.destdir / 'site' / 'images' / args.work_id - if collation_dir.exists(): - if len(list(collation_dir.iterdir())) > 0: - print(f'Collation directory already exists!') - return - else: - collation_dir.rmdir() - - nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()] + nonexistent = [path for path in args.paths if not path.exists()] if len(nonexistent) > 0: print(f'Nonexistent paths: {nonexistent}') return - collation_staging_area = args.destdir / 'site' / 'images-staging' - work_staging_dir = collation_staging_area / args.work_id - work_staging_dir.mkdir(parents=True) - - pages_collated = 0 - for group in groups: - pages_added = collate_from_paths( - [item for item in group if item not in exclusions], - work_staging_dir, - pages_collated, - exclusions, - ) - if pages_added is None: - print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}') - pages_collated = None - break - - pages_collated += pages_added - - if pages_collated: - print(f'Collated {pages_collated} pages for {args.work_id}') - work_staging_dir.rename(collation_dir) - else: - for f in work_staging_dir.iterdir(): - f.unlink() - work_staging_dir.rmdir() - - if pages_collated == 0: - print(f'No files found for {work_id}') - - collation_staging_area.rmdir() + collation_dir.mkdir(parents=True, exist_ok=True) + index = 0 + for path in args.paths: + if path.is_dir(): + entries = [p for p in path.iterdir() if p.is_file() and is_image(p)] + ordering = complete_prefix_number_ordering(entries) + if ordering is None: + ordering = entries + ordering.sort() + link_ordered_files(ordering, collation_dir, index) + index += len(ordering) + elif is_image(path): + link_ordered_files([path], collation_dir, index) + index += 1 + elif is_pdf(path): + pdf_page_count = link_pdf(path, collation_dir, index) + if pdf_page_count is None: + return + index += pdf_page_count + else: + print(f'Unknown file type {path}, stopping') + return def fmt_size(s): return f'{s[0]}x{s[1]}px' def analyze(args): extract_dir = args.destdir / 'extract' - files = descendant_files_ignore(extract_dir / args.work_id, []) + files = descendant_files_ignore(extract_dir / args.work_id) files.sort() for f in files: @@ -924,43 +869,28 @@ parser_manual_collate = subparsers.add_parser( help='collate a single work manually', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent("""\ - Provide an expression or sequence of expressions specifying groups - of paths to collate or skip. An expression can be: - - PATH - A single path. If this is an image, it will be appended to - the sequence of collated images; if this is a PDF, images will be - extracted from it and concatenated to the sequence; if this is a - directory, the contents of the directory will be collated based on - the normal heuristics and concatenated to the sequence. - - ( PATH [PATH ...] ) - A group of paths contained in parentheses. You may need to escape - the parentheses to avoid them getting parsed by your shell. - All the paths in this group will be considered together, and - collated based on the normal heuristics, regardless of what - order the paths are provided in. - - ! PATH - ! ( PATH [PATH ...] ) - A path or group of paths to exclude from collation. You may - need to escape the !. If an excluded path appears within any - of the other specified paths, it will be ignored. - - If the only expressions provided are negations, then auto-collation - will start from the top level of the extracted work while excluding - the negated paths. - All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ for the work being manually collated. `manual-collate` can only handle one work at a time. Paths are used as follows: + + - If a path is a directory, all *image files* immediately + inside that directory will be appended to the sequence. If + files are named in a way which indicates a clear ordering, + that ordering will be used. Otherwise, filenames will be + sorted lexicographically. Non-image files and + subdirectories will be ignored. + - If a path is an image file, that image file will be + appended to the sequence. + - If a path is a PDF file, page images will be extracted + from that PDF and appended to the sequence. """), ) -parser_manual_collate.add_argument('work_id') parser_manual_collate.add_argument( - 'expression', + 'paths', + metavar='PATH', + type=Path, nargs='+', - help='expressions indicating paths to collate or skip', + help='paths within a single work to be collated in sequence', ) parser_manual_collate.set_defaults(func=manual_collate) diff --git a/flake.nix b/flake.nix index dc893cb..4ae830b 100644 --- a/flake.nix +++ b/flake.nix @@ -47,7 +47,7 @@ dlibrary = buildPythonApplication { pname = "dlibrary"; - version = "0.2"; + version = "0.1"; pyproject = true; propagatedBuildInputs = [ pymupdf diff --git a/pyproject.toml b/pyproject.toml index acc2a38..7ecfcfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dlibrary" -version = "0.2" +version = "0.1" description = "Cataloging tool and viewer for downloaded DLSite purchases" license = {file = "LICENSE"} authors = [{name = "xenofem"}]