consolidate manual-collate functionality into collate
This commit is contained in:
		
parent b7b989433a
commit c7f95d50f9
1 changed file with 126 additions and 138 deletions
				
			
		|  | @ -214,11 +214,52 @@ def self_and_parents(path): | |||
|     return [path] + list(path.parents) | ||||
| 
 | ||||
| def collate(args): | ||||
|     con = sqlite3.connect(args.destdir / 'meta.db') | ||||
|     cur = con.cursor() | ||||
| 
 | ||||
|     extraction_dir = args.destdir / 'extract' | ||||
|     hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints} | ||||
| 
 | ||||
|     def extracted_path_work_id(path): | ||||
|         trail = self_and_parents(Path(relpath(path, extraction_dir))) | ||||
|         if len(trail) < 2: | ||||
|             return None | ||||
|         result = trail[-2].name | ||||
|         if result == '..': | ||||
|             return None | ||||
|         return result | ||||
| 
 | ||||
|     (raw_groups, raw_exclusions) = parse_expressions(args.expression) | ||||
| 
 | ||||
|     specified_works = set() | ||||
|     works_groups = {} | ||||
|     for group in raw_groups: | ||||
|         if len(group) == 0: | ||||
|             continue | ||||
|         work_id = extracted_path_work_id(group[0]) | ||||
|         if not work_id: | ||||
|             print(f'Group {group} contains paths outside an extracted work!') | ||||
|             exit(1) | ||||
|         if not all(extracted_path_work_id(item) == work_id for item in group[1:]): | ||||
|             print(f'Group {group} contains paths from multiple works!') | ||||
|             exit(1) | ||||
|         specified_works.add(work_id) | ||||
|         if work_id not in works_groups: | ||||
|             works_groups[work_id] = [] | ||||
|         normalized_paths = [normalize_to(item, args.destdir) for item in group] | ||||
|         if not all(path.exists() for path in normalized_paths): | ||||
|             print(f'Group {group} contains nonexistent paths!') | ||||
|             exit(1) | ||||
|         works_groups[work_id].append(normalized_paths) | ||||
| 
 | ||||
|     exclusions = [] | ||||
|     for exclusion in raw_exclusions: | ||||
|         work_id = extracted_path_work_id(exclusion) | ||||
|         if not work_id: | ||||
|             print(f'Excluded path {exclusion} does not belong to an extracted work!') | ||||
|             exit(1) | ||||
|         specified_works.add(work_id) | ||||
|         normalized_path = normalize_to(exclusion, args.destdir) | ||||
|         if not normalized_path.exists(): | ||||
|             print(f'Excluded path {exclusion} does not exist!') | ||||
|             exit(1) | ||||
|         exclusions.append(normalized_path) | ||||
| 
 | ||||
|     collation_staging_area = args.destdir / 'site' / 'images-staging' | ||||
|     collation_staging_area.mkdir(parents=True) | ||||
|  | @ -226,21 +267,41 @@ def collate(args): | |||
|     collation_area = args.destdir / 'site' / 'images' | ||||
|     collation_area.mkdir(parents=True, exist_ok=True) | ||||
| 
 | ||||
|     con = sqlite3.connect(args.destdir / 'meta.db') | ||||
|     cur = con.cursor() | ||||
| 
 | ||||
|     for work_path in extraction_dir.iterdir(): | ||||
|         work_id = work_path.name | ||||
| 
 | ||||
|         if args.only_specified_works and work_id not in specified_works: | ||||
|             continue | ||||
| 
 | ||||
|         work_collation_dir = collation_area / work_id | ||||
|         if work_collation_dir.exists(): | ||||
|             if work_id not in specified_works: | ||||
|                 continue | ||||
|             if len(list(work_collation_dir.iterdir())) > 0: | ||||
|                 print(f'Collation directory for work {work_id} already exists!') | ||||
|                 break | ||||
|             else: | ||||
|                 work_collation_dir.rmdir() | ||||
| 
 | ||||
|         virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone() | ||||
|         if virtual == (1,): | ||||
|             if work_id in specified_works: | ||||
|                 print(f'Work {work_id} is virtual!') | ||||
|                 break | ||||
|             continue | ||||
| 
 | ||||
|         work_staging_dir = collation_staging_area / work_id | ||||
| 
 | ||||
|         collator = Collator(work_staging_dir, [], args) | ||||
|         collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)]) | ||||
|         collator = Collator(work_staging_dir, exclusions, args) | ||||
|         for group in works_groups.get(work_id, [[work_path]]): | ||||
|             collation_result = collator.collate_from_paths([item for item in group if item not in exclusions]) | ||||
|             if not collation_result: | ||||
|                 print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}') | ||||
|                 break | ||||
| 
 | ||||
|         if collation_result and collator.index > 0: | ||||
|             print(f'Collated {collator.index} pages for {work_id}') | ||||
|             work_staging_dir.rename(work_collation_dir) | ||||
|  | @ -253,7 +314,7 @@ def collate(args): | |||
|             if not collation_result: | ||||
|                 print(f'Unable to deduce file structure for {work_id}, skipping') | ||||
|             elif collator.index == 0: | ||||
|                 print(f'{work_id} contains no files? skipping') | ||||
|                 print(f'No files found for {work_id}, skipping') | ||||
| 
 | ||||
|     collation_staging_area.rmdir() | ||||
|     con.close() | ||||
|  | @ -688,9 +749,6 @@ def superior_or_equal(a, b): | |||
|     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) | ||||
| 
 | ||||
| 
 | ||||
| def self_and_parents(path): | ||||
|     return [path] + list(path.parents) | ||||
| 
 | ||||
| def parse_expressions(tokens): | ||||
|     groups = [] | ||||
|     exclusions = [] | ||||
|  | @ -727,61 +785,6 @@ def parse_group(tokens): | |||
| def normalize_to(path, ref): | ||||
|     return ref / Path(relpath(path, ref)) | ||||
| 
 | ||||
| def manual_collate(args): | ||||
|     (raw_groups, raw_exclusions) = parse_expressions(args.expression) | ||||
| 
 | ||||
|     extraction_dir = args.destdir / 'extract' | ||||
| 
 | ||||
|     sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group) | ||||
|     work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name | ||||
| 
 | ||||
|     exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions] | ||||
| 
 | ||||
|     if raw_groups: | ||||
|         groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups] | ||||
|     else: | ||||
|         groups = [[extraction_dir / work_id]] | ||||
| 
 | ||||
|     collation_area = args.destdir / 'site' / 'images' | ||||
|     collation_area.mkdir(parents=True, exist_ok=True) | ||||
| 
 | ||||
|     work_collation_dir = collation_area / work_id | ||||
|     if work_collation_dir.exists(): | ||||
|         if len(list(work_collation_dir.iterdir())) > 0: | ||||
|             print('Collation directory already exists!') | ||||
|             return | ||||
|         else: | ||||
|             work_collation_dir.rmdir() | ||||
| 
 | ||||
|     nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()] | ||||
|     if len(nonexistent) > 0: | ||||
|         print(f'Nonexistent paths: {nonexistent}') | ||||
|         return | ||||
| 
 | ||||
|     collation_staging_area = args.destdir / 'site' / 'images-staging' | ||||
|     work_staging_dir = collation_staging_area / work_id | ||||
|     work_staging_dir.mkdir(parents=True) | ||||
| 
 | ||||
|     collator = Collator(work_staging_dir, exclusions, args) | ||||
|     for group in groups: | ||||
|         collation_result = collator.collate_from_paths([item for item in group if item not in exclusions]) | ||||
|         if collation_result is None: | ||||
|             print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}') | ||||
|             break | ||||
| 
 | ||||
|     if collation_result and collator.index > 0: | ||||
|         print(f'Collated {collator.index} pages for {work_id}') | ||||
|         work_staging_dir.rename(work_collation_dir) | ||||
|     else: | ||||
|         for f in work_staging_dir.iterdir(): | ||||
|             f.unlink() | ||||
|         work_staging_dir.rmdir() | ||||
| 
 | ||||
|         if collation_result and collator.index == 0: | ||||
|             print(f'No files found for {work_id}') | ||||
| 
 | ||||
|     collation_staging_area.rmdir() | ||||
| 
 | ||||
| 
 | ||||
| def fmt_size(s): | ||||
|     return f'{s[0]}x{s[1]}px' | ||||
|  | @ -1022,9 +1025,9 @@ argparser = argparse.ArgumentParser( | |||
|       subfolder. | ||||
|     - `fetch` metadata and thumbnail images for extracted works | ||||
|       from DLSite. | ||||
|     - `collate` and/or `manual-collate` extracted works, | ||||
|       producing a single sequence of image files (or symlinks | ||||
|       into the extracted data, when possible) for each work. | ||||
|     - `collate` extracted works, producing a single sequence of | ||||
|       image files (or symlinks into the extracted data, when | ||||
|       possible) for each work. | ||||
|     - Manually adjust works' `metadata` when necessary. | ||||
|     - `generate` a static website providing a catalog and viewer | ||||
|       for all collated works. | ||||
|  | @ -1047,7 +1050,7 @@ argparser.add_argument( | |||
| ) | ||||
| subparsers = argparser.add_subparsers(title="subcommands", required=True) | ||||
| 
 | ||||
| parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles') | ||||
| parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles') | ||||
| parser_extract.add_argument( | ||||
|     '-r', '--remove', | ||||
|     action='store_true', | ||||
|  | @ -1062,111 +1065,96 @@ parser_extract.add_argument( | |||
| ) | ||||
| parser_extract.set_defaults(func=extract) | ||||
| 
 | ||||
| parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails') | ||||
| parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails') | ||||
| parser_fetch.set_defaults(func=fetch) | ||||
| 
 | ||||
| parser_collate = subparsers.add_parser( | ||||
|     'collate', | ||||
|     aliases=['c', 'co', 'col'], | ||||
|     help='collate each work into a sequence of image files', | ||||
|     aliases=['c'], | ||||
|     help='collate works into sequences of image files', | ||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, | ||||
|     description=textwrap.dedent("""\ | ||||
|     For each extracted work that has not already been collated, | ||||
|     DLibrary will attempt to intuit its structure as follows: | ||||
|     DLibrary will attempt to intuit its structure and create | ||||
|     a single ordered list of image files in the site data | ||||
|     directory. Each image will either be a symlink to an image | ||||
|     file in the extraction folder, or a single page extracted | ||||
|     from a PDF file. | ||||
| 
 | ||||
|     - Enter the work's directory. If the directory contains | ||||
|       nothing except a single subdirectory (ignoring a few types | ||||
|       of files that are definitely not relevant), traverse | ||||
|       downwards repeatedly. | ||||
|     - If the current directory contains nothing except a single | ||||
|       PDF (again, ignoring irrelevant files), attempt to extract | ||||
|       a series of images from the PDF. This process expects that | ||||
|       each page of the PDF consists of a single embedded image, | ||||
|       which will be extracted at full resolution. Support for | ||||
|       more complex PDFs is not yet implemented. | ||||
|     - If the current directory contains nothing except image | ||||
|       files, and the image files are named in a way that clearly | ||||
|       indicates a complete numerical order (each filename | ||||
|       consists of a shared prefix followed by a distinct | ||||
|       number), symlink files in the inferred order. | ||||
|     - Otherwise, skip processing this work for now. | ||||
|     DLibrary may fail to automatically collate a work if its | ||||
|     files and subdirectories are not named in a way that | ||||
|     indicates a clear linear ordering. In order to assist with | ||||
|     collation, you can provide a list of expressions specifying | ||||
|     where to start traversing the directory structure, what | ||||
|     files to include in what order, and/or what files to ignore | ||||
|     entirely. | ||||
| 
 | ||||
|     DLibrary can be given "collation hints" which provide | ||||
|     alternative starting points for this search process. A hint | ||||
|     is a path under $DLIBRARY_DIR/extract/[work id]/ | ||||
|     indicating a different directory or PDF file to begin the | ||||
|     search process for that work, rather than starting at the | ||||
|     top level of the extracted data. There can be at most one | ||||
|     hint per work; for more complicated scenarios where a work | ||||
|     includes multiple folders that need to be collated together, | ||||
|     or where filenames do not clearly indicate an ordering, use | ||||
|     `manual-collate` instead. | ||||
|     """), | ||||
| ) | ||||
| parser_collate.add_argument( | ||||
|     'hints', | ||||
|     metavar='PATH', | ||||
|     type=Path, | ||||
|     nargs='*', | ||||
|     help='paths within extraction folders as collation hints' | ||||
| ) | ||||
| parser_collate.set_defaults(func=collate, force_convert_pdf=False) | ||||
| 
 | ||||
| parser_manual_collate = subparsers.add_parser( | ||||
|     'manual-collate', | ||||
|     aliases=['mc', 'man', 'manual'], | ||||
|     help='collate a single work manually', | ||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, | ||||
|     description=textwrap.dedent("""\ | ||||
|     Provide an expression or sequence of expressions specifying groups | ||||
|     of paths to collate or skip. An expression can be: | ||||
|     An expression can be: | ||||
| 
 | ||||
|     PATH | ||||
|       A single path. If this is an image, it will be appended to | ||||
|       the sequence of collated images; if this is a PDF, images will be | ||||
|       extracted from it and concatenated to the sequence; if this is a | ||||
|       directory, the contents of the directory will be collated based on | ||||
|       the normal heuristics and concatenated to the sequence. | ||||
|       the sequence of collated images for the work it belongs to; | ||||
|       if this is a PDF, images will be extracted from it and | ||||
|       concatenated to the sequence; if this is a directory, the | ||||
|       contents of the directory will be automatically collated | ||||
|       using DLibrary's default heuristics, and concatenated | ||||
|       to the sequence. | ||||
| 
 | ||||
|     ( PATH [PATH ...] ) | ||||
|       A group of paths contained in parentheses. You may need to escape | ||||
|       the parentheses to avoid them getting parsed by your shell. | ||||
|       All the paths in this group will be considered together, and | ||||
|       collated based on the normal heuristics, regardless of what | ||||
|       order the paths are provided in. | ||||
|       automatically collated using the default heuristics, regardless | ||||
|       of what order the paths are provided in. | ||||
| 
 | ||||
|     ! PATH | ||||
|     ! ( PATH [PATH ...] ) | ||||
|       A path or group of paths to exclude from collation. You may | ||||
|       need to escape the !. If an excluded path appears within any | ||||
|       of the other specified paths, it will be ignored. | ||||
|       of the other specified paths, it will be skipped by the collation | ||||
|       heuristics. | ||||
| 
 | ||||
|     If the only expressions provided are negations, then auto-collation | ||||
|     will start from the top level of the extracted work while excluding | ||||
|     the negated paths. | ||||
|     will start from the top level of the extracted work while skipping | ||||
|     the excluded paths. | ||||
| 
 | ||||
|     All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ | ||||
|     for the work being manually collated. `manual-collate` can | ||||
|     only handle one work at a time. | ||||
|     for some not-yet-collated work. Paths belonging to multiple | ||||
|     different works can all be provided on the same command line, and | ||||
|     expressions will be clustered together by work id while otherwise | ||||
|     preserving the order they were provided in. A parenthesized group | ||||
|     expression must only contain paths belonging to a single work. | ||||
| 
 | ||||
|     By default, DLibrary will attempt to collate every not-yet-collated | ||||
|     work (excluding "virtual" works), using the provided expressions | ||||
|     to assist in collation when available. The `-o` flag will direct | ||||
|     DLibrary to *only* collate works included in the provided expressions, | ||||
|     even if other uncollated works are present. | ||||
|     """), | ||||
| ) | ||||
| parser_manual_collate.add_argument( | ||||
| parser_collate.add_argument( | ||||
|     '-o', '--only-specified-works', | ||||
|     action='store_true', | ||||
|     help="only collate works that are explicitly specified", | ||||
| ) | ||||
| parser_collate.add_argument( | ||||
|     '--force-convert-pdf', | ||||
|     action='store_true', | ||||
|     help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", | ||||
| ) | ||||
| parser_manual_collate.add_argument( | ||||
| parser_collate.add_argument( | ||||
|     'expression', | ||||
|     nargs='+', | ||||
|     nargs='*', | ||||
|     help='expressions indicating paths to collate or skip', | ||||
| ) | ||||
| parser_manual_collate.set_defaults(func=manual_collate) | ||||
| parser_collate.set_defaults(func=collate) | ||||
| 
 | ||||
| parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation') | ||||
| 
 | ||||
| parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation') | ||||
| parser_analyze.add_argument('work_id') | ||||
| parser_analyze.set_defaults(func=analyze) | ||||
| 
 | ||||
| parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work') | ||||
| parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work') | ||||
| parser_metadata.add_argument('work_id') | ||||
| parser_metadata.add_argument( | ||||
|     '--virtual', | ||||
|  | @ -1177,7 +1165,7 @@ parser_metadata.set_defaults(func=metadata) | |||
| 
 | ||||
| parser_generate = subparsers.add_parser( | ||||
|     'generate', | ||||
|     aliases=['g', 'gen'], | ||||
|     aliases=['g'], | ||||
|     help='generate HTML/CSS/JS for library site', | ||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, | ||||
|     description=textwrap.dedent("""\ | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue