consolidate manual-collate functionality into collate
This commit is contained in:
		
							parent
							
								
									b7b989433a
								
							
						
					
					
						commit
						c7f95d50f9
					
				
					 1 changed files with 126 additions and 138 deletions
				
			
		|  | @ -214,11 +214,52 @@ def self_and_parents(path): | ||||||
|     return [path] + list(path.parents) |     return [path] + list(path.parents) | ||||||
| 
 | 
 | ||||||
| def collate(args): | def collate(args): | ||||||
|     con = sqlite3.connect(args.destdir / 'meta.db') |  | ||||||
|     cur = con.cursor() |  | ||||||
| 
 |  | ||||||
|     extraction_dir = args.destdir / 'extract' |     extraction_dir = args.destdir / 'extract' | ||||||
|     hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints} | 
 | ||||||
|  |     def extracted_path_work_id(path): | ||||||
|  |         trail = self_and_parents(Path(relpath(path, extraction_dir))) | ||||||
|  |         if len(trail) < 2: | ||||||
|  |             return None | ||||||
|  |         result = trail[-2].name | ||||||
|  |         if result == '..': | ||||||
|  |             return None | ||||||
|  |         return result | ||||||
|  | 
 | ||||||
|  |     (raw_groups, raw_exclusions) = parse_expressions(args.expression) | ||||||
|  | 
 | ||||||
|  |     specified_works = set() | ||||||
|  |     works_groups = {} | ||||||
|  |     for group in raw_groups: | ||||||
|  |         if len(group) == 0: | ||||||
|  |             continue | ||||||
|  |         work_id = extracted_path_work_id(group[0]) | ||||||
|  |         if not work_id: | ||||||
|  |             print(f'Group {group} contains paths outside an extracted work!') | ||||||
|  |             exit(1) | ||||||
|  |         if not all(extracted_path_work_id(item) == work_id for item in group[1:]): | ||||||
|  |             print(f'Group {group} contains paths from multiple works!') | ||||||
|  |             exit(1) | ||||||
|  |         specified_works.add(work_id) | ||||||
|  |         if work_id not in works_groups: | ||||||
|  |             works_groups[work_id] = [] | ||||||
|  |         normalized_paths = [normalize_to(item, args.destdir) for item in group] | ||||||
|  |         if not all(path.exists() for path in normalized_paths): | ||||||
|  |             print(f'Group {group} contains nonexistent paths!') | ||||||
|  |             exit(1) | ||||||
|  |         works_groups[work_id].append(normalized_paths) | ||||||
|  | 
 | ||||||
|  |     exclusions = [] | ||||||
|  |     for exclusion in raw_exclusions: | ||||||
|  |         work_id = extracted_path_work_id(exclusion) | ||||||
|  |         if not work_id: | ||||||
|  |             print(f'Excluded path {exclusion} does not belong to an extracted work!') | ||||||
|  |             exit(1) | ||||||
|  |         specified_works.add(work_id) | ||||||
|  |         normalized_path = normalize_to(exclusion, args.destdir) | ||||||
|  |         if not normalized_path.exists(): | ||||||
|  |             print(f'Excluded path {exclusion} does not exist!') | ||||||
|  |             exit(1) | ||||||
|  |         exclusions.append(normalized_path) | ||||||
| 
 | 
 | ||||||
|     collation_staging_area = args.destdir / 'site' / 'images-staging' |     collation_staging_area = args.destdir / 'site' / 'images-staging' | ||||||
|     collation_staging_area.mkdir(parents=True) |     collation_staging_area.mkdir(parents=True) | ||||||
|  | @ -226,21 +267,41 @@ def collate(args): | ||||||
|     collation_area = args.destdir / 'site' / 'images' |     collation_area = args.destdir / 'site' / 'images' | ||||||
|     collation_area.mkdir(parents=True, exist_ok=True) |     collation_area.mkdir(parents=True, exist_ok=True) | ||||||
| 
 | 
 | ||||||
|  |     con = sqlite3.connect(args.destdir / 'meta.db') | ||||||
|  |     cur = con.cursor() | ||||||
|  | 
 | ||||||
|     for work_path in extraction_dir.iterdir(): |     for work_path in extraction_dir.iterdir(): | ||||||
|         work_id = work_path.name |         work_id = work_path.name | ||||||
| 
 | 
 | ||||||
|  |         if args.only_specified_works and work_id not in specified_works: | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|         work_collation_dir = collation_area / work_id |         work_collation_dir = collation_area / work_id | ||||||
|         if work_collation_dir.exists(): |         if work_collation_dir.exists(): | ||||||
|  |             if work_id not in specified_works: | ||||||
|                 continue |                 continue | ||||||
|  |             if len(list(work_collation_dir.iterdir())) > 0: | ||||||
|  |                 print(f'Collation directory for work {work_id} already exists!') | ||||||
|  |                 break | ||||||
|  |             else: | ||||||
|  |                 work_collation_dir.rmdir() | ||||||
| 
 | 
 | ||||||
|         virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone() |         virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone() | ||||||
|         if virtual == (1,): |         if virtual == (1,): | ||||||
|  |             if work_id in specified_works: | ||||||
|  |                 print(f'Work {work_id} is virtual!') | ||||||
|  |                 break | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         work_staging_dir = collation_staging_area / work_id |         work_staging_dir = collation_staging_area / work_id | ||||||
| 
 | 
 | ||||||
|         collator = Collator(work_staging_dir, [], args) |         collator = Collator(work_staging_dir, exclusions, args) | ||||||
|         collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)]) |         for group in works_groups.get(work_id, [[work_path]]): | ||||||
|  |             collation_result = collator.collate_from_paths([item for item in group if item not in exclusions]) | ||||||
|  |             if not collation_result: | ||||||
|  |                 print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}') | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|         if collation_result and collator.index > 0: |         if collation_result and collator.index > 0: | ||||||
|             print(f'Collated {collator.index} pages for {work_id}') |             print(f'Collated {collator.index} pages for {work_id}') | ||||||
|             work_staging_dir.rename(work_collation_dir) |             work_staging_dir.rename(work_collation_dir) | ||||||
|  | @ -253,7 +314,7 @@ def collate(args): | ||||||
|             if not collation_result: |             if not collation_result: | ||||||
|                 print(f'Unable to deduce file structure for {work_id}, skipping') |                 print(f'Unable to deduce file structure for {work_id}, skipping') | ||||||
|             elif collator.index == 0: |             elif collator.index == 0: | ||||||
|                 print(f'{work_id} contains no files? skipping') |                 print(f'No files found for {work_id}, skipping') | ||||||
| 
 | 
 | ||||||
|     collation_staging_area.rmdir() |     collation_staging_area.rmdir() | ||||||
|     con.close() |     con.close() | ||||||
|  | @ -688,9 +749,6 @@ def superior_or_equal(a, b): | ||||||
|     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) |     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def self_and_parents(path): |  | ||||||
|     return [path] + list(path.parents) |  | ||||||
| 
 |  | ||||||
| def parse_expressions(tokens): | def parse_expressions(tokens): | ||||||
|     groups = [] |     groups = [] | ||||||
|     exclusions = [] |     exclusions = [] | ||||||
|  | @ -727,61 +785,6 @@ def parse_group(tokens): | ||||||
| def normalize_to(path, ref): | def normalize_to(path, ref): | ||||||
|     return ref / Path(relpath(path, ref)) |     return ref / Path(relpath(path, ref)) | ||||||
| 
 | 
 | ||||||
| def manual_collate(args): |  | ||||||
|     (raw_groups, raw_exclusions) = parse_expressions(args.expression) |  | ||||||
| 
 |  | ||||||
|     extraction_dir = args.destdir / 'extract' |  | ||||||
| 
 |  | ||||||
|     sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group) |  | ||||||
|     work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name |  | ||||||
| 
 |  | ||||||
|     exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions] |  | ||||||
| 
 |  | ||||||
|     if raw_groups: |  | ||||||
|         groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups] |  | ||||||
|     else: |  | ||||||
|         groups = [[extraction_dir / work_id]] |  | ||||||
| 
 |  | ||||||
|     collation_area = args.destdir / 'site' / 'images' |  | ||||||
|     collation_area.mkdir(parents=True, exist_ok=True) |  | ||||||
| 
 |  | ||||||
|     work_collation_dir = collation_area / work_id |  | ||||||
|     if work_collation_dir.exists(): |  | ||||||
|         if len(list(work_collation_dir.iterdir())) > 0: |  | ||||||
|             print('Collation directory already exists!') |  | ||||||
|             return |  | ||||||
|         else: |  | ||||||
|             work_collation_dir.rmdir() |  | ||||||
| 
 |  | ||||||
|     nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()] |  | ||||||
|     if len(nonexistent) > 0: |  | ||||||
|         print(f'Nonexistent paths: {nonexistent}') |  | ||||||
|         return |  | ||||||
| 
 |  | ||||||
|     collation_staging_area = args.destdir / 'site' / 'images-staging' |  | ||||||
|     work_staging_dir = collation_staging_area / work_id |  | ||||||
|     work_staging_dir.mkdir(parents=True) |  | ||||||
| 
 |  | ||||||
|     collator = Collator(work_staging_dir, exclusions, args) |  | ||||||
|     for group in groups: |  | ||||||
|         collation_result = collator.collate_from_paths([item for item in group if item not in exclusions]) |  | ||||||
|         if collation_result is None: |  | ||||||
|             print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}') |  | ||||||
|             break |  | ||||||
| 
 |  | ||||||
|     if collation_result and collator.index > 0: |  | ||||||
|         print(f'Collated {collator.index} pages for {work_id}') |  | ||||||
|         work_staging_dir.rename(work_collation_dir) |  | ||||||
|     else: |  | ||||||
|         for f in work_staging_dir.iterdir(): |  | ||||||
|             f.unlink() |  | ||||||
|         work_staging_dir.rmdir() |  | ||||||
| 
 |  | ||||||
|         if collation_result and collator.index == 0: |  | ||||||
|             print(f'No files found for {work_id}') |  | ||||||
| 
 |  | ||||||
|     collation_staging_area.rmdir() |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| def fmt_size(s): | def fmt_size(s): | ||||||
|     return f'{s[0]}x{s[1]}px' |     return f'{s[0]}x{s[1]}px' | ||||||
|  | @ -1022,9 +1025,9 @@ argparser = argparse.ArgumentParser( | ||||||
|       subfolder. |       subfolder. | ||||||
|     - `fetch` metadata and thumbnail images for extracted works |     - `fetch` metadata and thumbnail images for extracted works | ||||||
|       from DLSite. |       from DLSite. | ||||||
|     - `collate` and/or `manual-collate` extracted works, |     - `collate` extracted works, producing a single sequence of | ||||||
|       producing a single sequence of image files (or symlinks |       image files (or symlinks into the extracted data, when | ||||||
|       into the extracted data, when possible) for each work. |       possible) for each work. | ||||||
|     - Manually adjust works' `metadata` when necessary. |     - Manually adjust works' `metadata` when necessary. | ||||||
|     - `generate` a static website providing a catalog and viewer |     - `generate` a static website providing a catalog and viewer | ||||||
|       for all collated works. |       for all collated works. | ||||||
|  | @ -1047,7 +1050,7 @@ argparser.add_argument( | ||||||
| ) | ) | ||||||
| subparsers = argparser.add_subparsers(title="subcommands", required=True) | subparsers = argparser.add_subparsers(title="subcommands", required=True) | ||||||
| 
 | 
 | ||||||
| parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles') | parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles') | ||||||
| parser_extract.add_argument( | parser_extract.add_argument( | ||||||
|     '-r', '--remove', |     '-r', '--remove', | ||||||
|     action='store_true', |     action='store_true', | ||||||
|  | @ -1062,111 +1065,96 @@ parser_extract.add_argument( | ||||||
| ) | ) | ||||||
| parser_extract.set_defaults(func=extract) | parser_extract.set_defaults(func=extract) | ||||||
| 
 | 
 | ||||||
| parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails') | parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails') | ||||||
| parser_fetch.set_defaults(func=fetch) | parser_fetch.set_defaults(func=fetch) | ||||||
| 
 | 
 | ||||||
| parser_collate = subparsers.add_parser( | parser_collate = subparsers.add_parser( | ||||||
|     'collate', |     'collate', | ||||||
|     aliases=['c', 'co', 'col'], |     aliases=['c'], | ||||||
|     help='collate each work into a sequence of image files', |     help='collate works into sequences of image files', | ||||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, |     formatter_class=argparse.RawDescriptionHelpFormatter, | ||||||
|     description=textwrap.dedent("""\ |     description=textwrap.dedent("""\ | ||||||
|     For each extracted work that has not already been collated, |     For each extracted work that has not already been collated, | ||||||
|     DLibrary will attempt to intuit its structure as follows: |     DLibrary will attempt to intuit its structure and create | ||||||
|  |     a single ordered list of image files in the site data | ||||||
|  |     directory. Each image will either be a symlink to an image | ||||||
|  |     file in the extraction folder, or a single page extracted | ||||||
|  |     from a PDF file. | ||||||
| 
 | 
 | ||||||
|     - Enter the work's directory. If the directory contains |     DLibrary may fail to automatically collate a work if its | ||||||
|       nothing except a single subdirectory (ignoring a few types |     files and subdirectories are not named in a way that | ||||||
|       of files that are definitely not relevant), traverse |     indicates a clear linear ordering. In order to assist with | ||||||
|       downwards repeatedly. |     collation, you can provide a list of expressions specifying | ||||||
|     - If the current directory contains nothing except a single |     where to start traversing the directory structure, what | ||||||
|       PDF (again, ignoring irrelevant files), attempt to extract |     files to include in what order, and/or what files to ignore | ||||||
|       a series of images from the PDF. This process expects that |     entirely. | ||||||
|       each page of the PDF consists of a single embedded image, |  | ||||||
|       which will be extracted at full resolution. Support for |  | ||||||
|       more complex PDFs is not yet implemented. |  | ||||||
|     - If the current directory contains nothing except image |  | ||||||
|       files, and the image files are named in a way that clearly |  | ||||||
|       indicates a complete numerical order (each filename |  | ||||||
|       consists of a shared prefix followed by a distinct |  | ||||||
|       number), symlink files in the inferred order. |  | ||||||
|     - Otherwise, skip processing this work for now. |  | ||||||
| 
 | 
 | ||||||
|     DLibrary can be given "collation hints" which provide |     An expression can be: | ||||||
|     alternative starting points for this search process. A hint |  | ||||||
|     is a path under $DLIBRARY_DIR/extract/[work id]/ |  | ||||||
|     indicating a different directory or PDF file to begin the |  | ||||||
|     search process for that work, rather than starting at the |  | ||||||
|     top level of the extracted data. There can be at most one |  | ||||||
|     hint per work; for more complicated scenarios where a work |  | ||||||
|     includes multiple folders that need to be collated together, |  | ||||||
|     or where filenames do not clearly indicate an ordering, use |  | ||||||
|     `manual-collate` instead. |  | ||||||
|     """), |  | ||||||
| ) |  | ||||||
| parser_collate.add_argument( |  | ||||||
|     'hints', |  | ||||||
|     metavar='PATH', |  | ||||||
|     type=Path, |  | ||||||
|     nargs='*', |  | ||||||
|     help='paths within extraction folders as collation hints' |  | ||||||
| ) |  | ||||||
| parser_collate.set_defaults(func=collate, force_convert_pdf=False) |  | ||||||
| 
 |  | ||||||
| parser_manual_collate = subparsers.add_parser( |  | ||||||
|     'manual-collate', |  | ||||||
|     aliases=['mc', 'man', 'manual'], |  | ||||||
|     help='collate a single work manually', |  | ||||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, |  | ||||||
|     description=textwrap.dedent("""\ |  | ||||||
|     Provide an expression or sequence of expressions specifying groups |  | ||||||
|     of paths to collate or skip. An expression can be: |  | ||||||
| 
 | 
 | ||||||
|     PATH |     PATH | ||||||
|       A single path. If this is an image, it will be appended to |       A single path. If this is an image, it will be appended to | ||||||
|       the sequence of collated images; if this is a PDF, images will be |       the sequence of collated images for the work it belongs to; | ||||||
|       extracted from it and concatenated to the sequence; if this is a |       if this is a PDF, images will be extracted from it and | ||||||
|       directory, the contents of the directory will be collated based on |       concatenated to the sequence; if this is a directory, the | ||||||
|       the normal heuristics and concatenated to the sequence. |       contents of the directory will be automatically collated | ||||||
|  |       using DLibrary's default heuristics, and concatenated | ||||||
|  |       to the sequence. | ||||||
| 
 | 
 | ||||||
|     ( PATH [PATH ...] ) |     ( PATH [PATH ...] ) | ||||||
|       A group of paths contained in parentheses. You may need to escape |       A group of paths contained in parentheses. You may need to escape | ||||||
|       the parentheses to avoid them getting parsed by your shell. |       the parentheses to avoid them getting parsed by your shell. | ||||||
|       All the paths in this group will be considered together, and |       All the paths in this group will be considered together, and | ||||||
|       collated based on the normal heuristics, regardless of what |       automatically collated using the default heuristics, regardless | ||||||
|       order the paths are provided in. |       of what order the paths are provided in. | ||||||
| 
 | 
 | ||||||
|     ! PATH |     ! PATH | ||||||
|     ! ( PATH [PATH ...] ) |     ! ( PATH [PATH ...] ) | ||||||
|       A path or group of paths to exclude from collation. You may |       A path or group of paths to exclude from collation. You may | ||||||
|       need to escape the !. If an excluded path appears within any |       need to escape the !. If an excluded path appears within any | ||||||
|       of the other specified paths, it will be ignored. |       of the other specified paths, it will be skipped by the collation | ||||||
|  |       heuristics. | ||||||
| 
 | 
 | ||||||
|     If the only expressions provided are negations, then auto-collation |     If the only expressions provided are negations, then auto-collation | ||||||
|     will start from the top level of the extracted work while excluding |     will start from the top level of the extracted work while skipping | ||||||
|     the negated paths. |     the excluded paths. | ||||||
| 
 | 
 | ||||||
|     All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ |     All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ | ||||||
|     for the work being manually collated. `manual-collate` can |     for some not-yet-collated work. Paths belonging to multiple | ||||||
|     only handle one work at a time. |     different works can all be provided on the same command line, and | ||||||
|  |     expressions will be clustered together by work id while otherwise | ||||||
|  |     preserving the order they were provided in. A parenthesized group | ||||||
|  |     expression must only contain paths belonging to a single work. | ||||||
|  | 
 | ||||||
|  |     By default, DLibrary will attempt to collate every not-yet-collated | ||||||
|  |     work (excluding "virtual" works), using the provided expressions | ||||||
|  |     to assist in collation when available. The `-o` flag will direct | ||||||
|  |     DLibrary to *only* collate works included in the provided expressions, | ||||||
|  |     even if other uncollated works are present. | ||||||
|     """), |     """), | ||||||
| ) | ) | ||||||
| parser_manual_collate.add_argument( | parser_collate.add_argument( | ||||||
|  |     '-o', '--only-specified-works', | ||||||
|  |     action='store_true', | ||||||
|  |     help="only collate works that are explicitly specified", | ||||||
|  | ) | ||||||
|  | parser_collate.add_argument( | ||||||
|     '--force-convert-pdf', |     '--force-convert-pdf', | ||||||
|     action='store_true', |     action='store_true', | ||||||
|     help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", |     help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", | ||||||
| ) | ) | ||||||
| parser_manual_collate.add_argument( | parser_collate.add_argument( | ||||||
|     'expression', |     'expression', | ||||||
|     nargs='+', |     nargs='*', | ||||||
|     help='expressions indicating paths to collate or skip', |     help='expressions indicating paths to collate or skip', | ||||||
| ) | ) | ||||||
| parser_manual_collate.set_defaults(func=manual_collate) | parser_collate.set_defaults(func=collate) | ||||||
| 
 | 
 | ||||||
| parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation') | 
 | ||||||
|  | parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation') | ||||||
| parser_analyze.add_argument('work_id') | parser_analyze.add_argument('work_id') | ||||||
| parser_analyze.set_defaults(func=analyze) | parser_analyze.set_defaults(func=analyze) | ||||||
| 
 | 
 | ||||||
| parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work') | parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work') | ||||||
| parser_metadata.add_argument('work_id') | parser_metadata.add_argument('work_id') | ||||||
| parser_metadata.add_argument( | parser_metadata.add_argument( | ||||||
|     '--virtual', |     '--virtual', | ||||||
|  | @ -1177,7 +1165,7 @@ parser_metadata.set_defaults(func=metadata) | ||||||
| 
 | 
 | ||||||
| parser_generate = subparsers.add_parser( | parser_generate = subparsers.add_parser( | ||||||
|     'generate', |     'generate', | ||||||
|     aliases=['g', 'gen'], |     aliases=['g'], | ||||||
|     help='generate HTML/CSS/JS for library site', |     help='generate HTML/CSS/JS for library site', | ||||||
|     formatter_class=argparse.RawDescriptionHelpFormatter, |     formatter_class=argparse.RawDescriptionHelpFormatter, | ||||||
|     description=textwrap.dedent("""\ |     description=textwrap.dedent("""\ | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue