consolidate manual-collate functionality into collate
This commit is contained in:
parent
b7b989433a
commit
c7f95d50f9
|
@ -214,11 +214,52 @@ def self_and_parents(path):
|
|||
return [path] + list(path.parents)
|
||||
|
||||
def collate(args):
|
||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||
cur = con.cursor()
|
||||
|
||||
extraction_dir = args.destdir / 'extract'
|
||||
hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints}
|
||||
|
||||
def extracted_path_work_id(path):
|
||||
trail = self_and_parents(Path(relpath(path, extraction_dir)))
|
||||
if len(trail) < 2:
|
||||
return None
|
||||
result = trail[-2].name
|
||||
if result == '..':
|
||||
return None
|
||||
return result
|
||||
|
||||
(raw_groups, raw_exclusions) = parse_expressions(args.expression)
|
||||
|
||||
specified_works = set()
|
||||
works_groups = {}
|
||||
for group in raw_groups:
|
||||
if len(group) == 0:
|
||||
continue
|
||||
work_id = extracted_path_work_id(group[0])
|
||||
if not work_id:
|
||||
print(f'Group {group} contains paths outside an extracted work!')
|
||||
exit(1)
|
||||
if not all(extracted_path_work_id(item) == work_id for item in group[1:]):
|
||||
print(f'Group {group} contains paths from multiple works!')
|
||||
exit(1)
|
||||
specified_works.add(work_id)
|
||||
if work_id not in works_groups:
|
||||
works_groups[work_id] = []
|
||||
normalized_paths = [normalize_to(item, args.destdir) for item in group]
|
||||
if not all(path.exists() for path in normalized_paths):
|
||||
print(f'Group {group} contains nonexistent paths!')
|
||||
exit(1)
|
||||
works_groups[work_id].append(normalized_paths)
|
||||
|
||||
exclusions = []
|
||||
for exclusion in raw_exclusions:
|
||||
work_id = extracted_path_work_id(exclusion)
|
||||
if not work_id:
|
||||
print(f'Excluded path {exclusion} does not belong to an extracted work!')
|
||||
exit(1)
|
||||
specified_works.add(work_id)
|
||||
normalized_path = normalize_to(exclusion, args.destdir)
|
||||
if not normalized_path.exists():
|
||||
print(f'Excluded path {exclusion} does not exist!')
|
||||
exit(1)
|
||||
exclusions.append(normalized_path)
|
||||
|
||||
collation_staging_area = args.destdir / 'site' / 'images-staging'
|
||||
collation_staging_area.mkdir(parents=True)
|
||||
|
@ -226,21 +267,41 @@ def collate(args):
|
|||
collation_area = args.destdir / 'site' / 'images'
|
||||
collation_area.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||
cur = con.cursor()
|
||||
|
||||
for work_path in extraction_dir.iterdir():
|
||||
work_id = work_path.name
|
||||
|
||||
if args.only_specified_works and work_id not in specified_works:
|
||||
continue
|
||||
|
||||
work_collation_dir = collation_area / work_id
|
||||
if work_collation_dir.exists():
|
||||
if work_id not in specified_works:
|
||||
continue
|
||||
if len(list(work_collation_dir.iterdir())) > 0:
|
||||
print(f'Collation directory for work {work_id} already exists!')
|
||||
break
|
||||
else:
|
||||
work_collation_dir.rmdir()
|
||||
|
||||
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
|
||||
if virtual == (1,):
|
||||
if work_id in specified_works:
|
||||
print(f'Work {work_id} is virtual!')
|
||||
break
|
||||
continue
|
||||
|
||||
work_staging_dir = collation_staging_area / work_id
|
||||
|
||||
collator = Collator(work_staging_dir, [], args)
|
||||
collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
|
||||
collator = Collator(work_staging_dir, exclusions, args)
|
||||
for group in works_groups.get(work_id, [[work_path]]):
|
||||
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
|
||||
if not collation_result:
|
||||
print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
|
||||
break
|
||||
|
||||
if collation_result and collator.index > 0:
|
||||
print(f'Collated {collator.index} pages for {work_id}')
|
||||
work_staging_dir.rename(work_collation_dir)
|
||||
|
@ -253,7 +314,7 @@ def collate(args):
|
|||
if not collation_result:
|
||||
print(f'Unable to deduce file structure for {work_id}, skipping')
|
||||
elif collator.index == 0:
|
||||
print(f'{work_id} contains no files? skipping')
|
||||
print(f'No files found for {work_id}, skipping')
|
||||
|
||||
collation_staging_area.rmdir()
|
||||
con.close()
|
||||
|
@ -688,9 +749,6 @@ def superior_or_equal(a, b):
|
|||
return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
|
||||
|
||||
|
||||
def self_and_parents(path):
    """Return *path* followed by each of its ancestors, nearest first.

    Args:
        path: A pathlib path object (anything exposing ``.parents``).

    Returns:
        A list whose first element is ``path`` itself, followed by the
        entries of ``path.parents`` in the order pathlib yields them.
    """
    return [path, *path.parents]
|
||||
|
||||
def parse_expressions(tokens):
|
||||
groups = []
|
||||
exclusions = []
|
||||
|
@ -727,61 +785,6 @@ def parse_group(tokens):
|
|||
def normalize_to(path, ref):
    """Re-express *path* as a path anchored at *ref*.

    The path is first made relative to ``ref`` with ``os.path.relpath``
    and then joined back onto ``ref``, so two spellings of the same
    location under ``ref`` produce the same result.

    Args:
        path: The path to normalize (string or path-like).
        ref: A ``Path`` used both as the reference point and as the
            prefix of the result.

    Returns:
        ``ref`` joined with the relative path from ``ref`` to ``path``.
        If ``path`` lies outside ``ref`` the result contains ``..``
        components, because ``Path`` does not collapse them.
    """
    relative = relpath(path, ref)
    return ref / Path(relative)
|
||||
|
||||
def manual_collate(args):
    """Collate a single extracted work from explicitly specified paths.

    Reads ``args.expression`` (parsed by ``parse_expressions`` into path
    groups and excluded paths) and ``args.destdir``. The work id is taken
    from the first path mentioned: it is the name of that path's top-level
    component relative to ``<destdir>/extract``. If no groups are given,
    only exclusions, collation starts from the work's extraction directory.

    Images are collated into ``<destdir>/site/images-staging/<work id>``
    and, on success, the staging directory is renamed to
    ``<destdir>/site/images/<work id>``. Problems are reported with
    ``print`` and the function returns without raising.

    Args:
        args: Parsed command-line namespace; also passed through to
            ``Collator``.
    """
    (raw_groups, raw_exclusions) = parse_expressions(args.expression)

    extraction_dir = args.destdir / 'extract'

    # Every path mentioned anywhere in the expressions, groups first.
    all_raw_paths = [path for group in (raw_groups + [raw_exclusions]) for path in group]
    if not all_raw_paths:
        # Previously an unguarded next() raised StopIteration here.
        print('No paths specified!')
        return

    sample_path = all_raw_paths[0]
    trail = self_and_parents(Path(relpath(sample_path, extraction_dir)))
    # trail[-1] is '.', so trail[-2] is the top-level component under the
    # extraction dir. A trail shorter than 2 means the path IS the
    # extraction dir; a '..' component means the path lies outside it.
    work_id = trail[-2].name if len(trail) >= 2 else None
    if not work_id or work_id == '..':
        print(f'Path {sample_path} does not belong to an extracted work!')
        return

    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]

    if raw_groups:
        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
    else:
        # Only exclusions were given: collate the whole work.
        groups = [[extraction_dir / work_id]]

    collation_area = args.destdir / 'site' / 'images'
    collation_area.mkdir(parents=True, exist_ok=True)

    work_collation_dir = collation_area / work_id
    if work_collation_dir.exists():
        if any(work_collation_dir.iterdir()):
            print('Collation directory already exists!')
            return
        # An empty leftover directory would block the rename below.
        work_collation_dir.rmdir()

    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
    if nonexistent:
        print(f'Nonexistent paths: {nonexistent}')
        return

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    work_staging_dir = collation_staging_area / work_id
    work_staging_dir.mkdir(parents=True)

    collator = Collator(work_staging_dir, exclusions, args)
    for group in groups:
        collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
        if collation_result is None:
            print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
            break

    if collation_result and collator.index > 0:
        print(f'Collated {collator.index} pages for {work_id}')
        work_staging_dir.rename(work_collation_dir)
    else:
        # Discard whatever was staged before the failure.
        for f in work_staging_dir.iterdir():
            f.unlink()
        work_staging_dir.rmdir()

    if collation_result and collator.index == 0:
        print(f'No files found for {work_id}')

    # Path.rmdir() only succeeds on an empty directory.
    collation_staging_area.rmdir()
|
||||
|
||||
|
||||
def fmt_size(s):
    """Format a two-element size as ``'<first>x<second>px'``.

    Args:
        s: An indexable whose elements 0 and 1 are the two dimensions.

    Returns:
        A string such as ``'640x480px'``.
    """
    first, second = s[0], s[1]
    return f'{first}x{second}px'
|
||||
|
@ -1022,9 +1025,9 @@ argparser = argparse.ArgumentParser(
|
|||
subfolder.
|
||||
- `fetch` metadata and thumbnail images for extracted works
|
||||
from DLSite.
|
||||
- `collate` and/or `manual-collate` extracted works,
|
||||
producing a single sequence of image files (or symlinks
|
||||
into the extracted data, when possible) for each work.
|
||||
- `collate` extracted works, producing a single sequence of
|
||||
image files (or symlinks into the extracted data, when
|
||||
possible) for each work.
|
||||
- Manually adjust works' `metadata` when necessary.
|
||||
- `generate` a static website providing a catalog and viewer
|
||||
for all collated works.
|
||||
|
@ -1047,7 +1050,7 @@ argparser.add_argument(
|
|||
)
|
||||
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
||||
|
||||
parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles')
|
||||
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles')
|
||||
parser_extract.add_argument(
|
||||
'-r', '--remove',
|
||||
action='store_true',
|
||||
|
@ -1062,111 +1065,96 @@ parser_extract.add_argument(
|
|||
)
|
||||
parser_extract.set_defaults(func=extract)
|
||||
|
||||
parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails')
|
||||
parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails')
|
||||
parser_fetch.set_defaults(func=fetch)
|
||||
|
||||
parser_collate = subparsers.add_parser(
|
||||
'collate',
|
||||
aliases=['c', 'co', 'col'],
|
||||
help='collate each work into a sequence of image files',
|
||||
aliases=['c'],
|
||||
help='collate works into sequences of image files',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=textwrap.dedent("""\
|
||||
For each extracted work that has not already been collated,
|
||||
DLibrary will attempt to intuit its structure as follows:
|
||||
DLibrary will attempt to intuit its structure and create
|
||||
a single ordered list of image files in the site data
|
||||
directory. Each image will either be a symlink to an image
|
||||
file in the extraction folder, or a single page extracted
|
||||
from a PDF file.
|
||||
|
||||
- Enter the work's directory. If the directory contains
|
||||
nothing except a single subdirectory (ignoring a few types
|
||||
of files that are definitely not relevant), traverse
|
||||
downwards repeatedly.
|
||||
- If the current directory contains nothing except a single
|
||||
PDF (again, ignoring irrelevant files), attempt to extract
|
||||
a series of images from the PDF. This process expects that
|
||||
each page of the PDF consists of a single embedded image,
|
||||
which will be extracted at full resolution. Support for
|
||||
more complex PDFs is not yet implemented.
|
||||
- If the current directory contains nothing except image
|
||||
files, and the image files are named in a way that clearly
|
||||
indicates a complete numerical order (each filename
|
||||
consists of a shared prefix followed by a distinct
|
||||
number), symlink files in the inferred order.
|
||||
- Otherwise, skip processing this work for now.
|
||||
DLibrary may fail to automatically collate a work if its
|
||||
files and subdirectories are not named in a way that
|
||||
indicates a clear linear ordering. In order to assist with
|
||||
collation, you can provide a list of expressions specifying
|
||||
where to start traversing the directory structure, what
|
||||
files to include in what order, and/or what files to ignore
|
||||
entirely.
|
||||
|
||||
DLibrary can be given "collation hints" which provide
|
||||
alternative starting points for this search process. A hint
|
||||
is a path under $DLIBRARY_DIR/extract/[work id]/
|
||||
indicating a different directory or PDF file to begin the
|
||||
search process for that work, rather than starting at the
|
||||
top level of the extracted data. There can be at most one
|
||||
hint per work; for more complicated scenarios where a work
|
||||
includes multiple folders that need to be collated together,
|
||||
or where filenames do not clearly indicate an ordering, use
|
||||
`manual-collate` instead.
|
||||
"""),
|
||||
)
|
||||
parser_collate.add_argument(
|
||||
'hints',
|
||||
metavar='PATH',
|
||||
type=Path,
|
||||
nargs='*',
|
||||
help='paths within extraction folders as collation hints'
|
||||
)
|
||||
parser_collate.set_defaults(func=collate, force_convert_pdf=False)
|
||||
|
||||
parser_manual_collate = subparsers.add_parser(
|
||||
'manual-collate',
|
||||
aliases=['mc', 'man', 'manual'],
|
||||
help='collate a single work manually',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=textwrap.dedent("""\
|
||||
Provide an expression or sequence of expressions specifying groups
|
||||
of paths to collate or skip. An expression can be:
|
||||
An expression can be:
|
||||
|
||||
PATH
|
||||
A single path. If this is an image, it will be appended to
|
||||
the sequence of collated images; if this is a PDF, images will be
|
||||
extracted from it and concatenated to the sequence; if this is a
|
||||
directory, the contents of the directory will be collated based on
|
||||
the normal heuristics and concatenated to the sequence.
|
||||
the sequence of collated images for the work it belongs to;
|
||||
if this is a PDF, images will be extracted from it and
|
||||
concatenated to the sequence; if this is a directory, the
|
||||
contents of the directory will be automatically collated
|
||||
using DLibrary's default heuristics, and concatenated
|
||||
to the sequence.
|
||||
|
||||
( PATH [PATH ...] )
|
||||
A group of paths contained in parentheses. You may need to escape
|
||||
the parentheses to avoid them getting parsed by your shell.
|
||||
All the paths in this group will be considered together, and
|
||||
collated based on the normal heuristics, regardless of what
|
||||
order the paths are provided in.
|
||||
automatically collated using the default heuristics, regardless
|
||||
of what order the paths are provided in.
|
||||
|
||||
! PATH
|
||||
! ( PATH [PATH ...] )
|
||||
A path or group of paths to exclude from collation. You may
|
||||
need to escape the !. If an excluded path appears within any
|
||||
of the other specified paths, it will be ignored.
|
||||
of the other specified paths, it will be skipped by the collation
|
||||
heuristics.
|
||||
|
||||
If the only expressions provided are negations, then auto-collation
|
||||
will start from the top level of the extracted work while excluding
|
||||
the negated paths.
|
||||
will start from the top level of the extracted work while skipping
|
||||
the excluded paths.
|
||||
|
||||
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
|
||||
for the work being manually collated. `manual-collate` can
|
||||
only handle one work at a time.
|
||||
"""),
|
||||
for some not-yet-collated work. Paths belonging to multiple
|
||||
different works can all be provided on the same command line, and
|
||||
expressions will be clustered together by work id while otherwise
|
||||
preserving the order they were provided in. A parenthesized group
|
||||
expression must only contain paths belonging to a single work.
|
||||
|
||||
By default, DLibrary will attempt to collate every not-yet-collated
|
||||
work (excluding "virtual" works), using the provided expressions
|
||||
to assist in collation when available. The `-o` flag will direct
|
||||
DLibrary to *only* collate works included in the provided expressions,
|
||||
even if other uncollated works are present.
|
||||
"""),
|
||||
)
|
||||
parser_manual_collate.add_argument(
|
||||
parser_collate.add_argument(
|
||||
'-o', '--only-specified-works',
|
||||
action='store_true',
|
||||
help="only collate works that are explicitly specified",
|
||||
)
|
||||
parser_collate.add_argument(
|
||||
'--force-convert-pdf',
|
||||
action='store_true',
|
||||
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
|
||||
)
|
||||
parser_manual_collate.add_argument(
|
||||
parser_collate.add_argument(
|
||||
'expression',
|
||||
nargs='+',
|
||||
nargs='*',
|
||||
help='expressions indicating paths to collate or skip',
|
||||
)
|
||||
parser_manual_collate.set_defaults(func=manual_collate)
|
||||
parser_collate.set_defaults(func=collate)
|
||||
|
||||
parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation')
|
||||
|
||||
parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation')
|
||||
parser_analyze.add_argument('work_id')
|
||||
parser_analyze.set_defaults(func=analyze)
|
||||
|
||||
parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work')
|
||||
parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work')
|
||||
parser_metadata.add_argument('work_id')
|
||||
parser_metadata.add_argument(
|
||||
'--virtual',
|
||||
|
@ -1177,7 +1165,7 @@ parser_metadata.set_defaults(func=metadata)
|
|||
|
||||
parser_generate = subparsers.add_parser(
|
||||
'generate',
|
||||
aliases=['g', 'gen'],
|
||||
aliases=['g'],
|
||||
help='generate HTML/CSS/JS for library site',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=textwrap.dedent("""\
|
||||
|
|
Loading…
Reference in a new issue