consolidate manual-collate functionality into collate
This commit is contained in:
parent
b7b989433a
commit
c7f95d50f9
|
@ -214,11 +214,52 @@ def self_and_parents(path):
|
||||||
return [path] + list(path.parents)
|
return [path] + list(path.parents)
|
||||||
|
|
||||||
def collate(args):
|
def collate(args):
|
||||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
|
||||||
cur = con.cursor()
|
|
||||||
|
|
||||||
extraction_dir = args.destdir / 'extract'
|
extraction_dir = args.destdir / 'extract'
|
||||||
hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints}
|
|
||||||
|
def extracted_path_work_id(path):
|
||||||
|
trail = self_and_parents(Path(relpath(path, extraction_dir)))
|
||||||
|
if len(trail) < 2:
|
||||||
|
return None
|
||||||
|
result = trail[-2].name
|
||||||
|
if result == '..':
|
||||||
|
return None
|
||||||
|
return result
|
||||||
|
|
||||||
|
(raw_groups, raw_exclusions) = parse_expressions(args.expression)
|
||||||
|
|
||||||
|
specified_works = set()
|
||||||
|
works_groups = {}
|
||||||
|
for group in raw_groups:
|
||||||
|
if len(group) == 0:
|
||||||
|
continue
|
||||||
|
work_id = extracted_path_work_id(group[0])
|
||||||
|
if not work_id:
|
||||||
|
print(f'Group {group} contains paths outside an extracted work!')
|
||||||
|
exit(1)
|
||||||
|
if not all(extracted_path_work_id(item) == work_id for item in group[1:]):
|
||||||
|
print(f'Group {group} contains paths from multiple works!')
|
||||||
|
exit(1)
|
||||||
|
specified_works.add(work_id)
|
||||||
|
if work_id not in works_groups:
|
||||||
|
works_groups[work_id] = []
|
||||||
|
normalized_paths = [normalize_to(item, args.destdir) for item in group]
|
||||||
|
if not all(path.exists() for path in normalized_paths):
|
||||||
|
print(f'Group {group} contains nonexistent paths!')
|
||||||
|
exit(1)
|
||||||
|
works_groups[work_id].append(normalized_paths)
|
||||||
|
|
||||||
|
exclusions = []
|
||||||
|
for exclusion in raw_exclusions:
|
||||||
|
work_id = extracted_path_work_id(exclusion)
|
||||||
|
if not work_id:
|
||||||
|
print(f'Excluded path {exclusion} does not belong to an extracted work!')
|
||||||
|
exit(1)
|
||||||
|
specified_works.add(work_id)
|
||||||
|
normalized_path = normalize_to(exclusion, args.destdir)
|
||||||
|
if not normalized_path.exists():
|
||||||
|
print(f'Excluded path {exclusion} does not exist!')
|
||||||
|
exit(1)
|
||||||
|
exclusions.append(normalized_path)
|
||||||
|
|
||||||
collation_staging_area = args.destdir / 'site' / 'images-staging'
|
collation_staging_area = args.destdir / 'site' / 'images-staging'
|
||||||
collation_staging_area.mkdir(parents=True)
|
collation_staging_area.mkdir(parents=True)
|
||||||
|
@ -226,21 +267,41 @@ def collate(args):
|
||||||
collation_area = args.destdir / 'site' / 'images'
|
collation_area = args.destdir / 'site' / 'images'
|
||||||
collation_area.mkdir(parents=True, exist_ok=True)
|
collation_area.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||||
|
cur = con.cursor()
|
||||||
|
|
||||||
for work_path in extraction_dir.iterdir():
|
for work_path in extraction_dir.iterdir():
|
||||||
work_id = work_path.name
|
work_id = work_path.name
|
||||||
|
|
||||||
|
if args.only_specified_works and work_id not in specified_works:
|
||||||
|
continue
|
||||||
|
|
||||||
work_collation_dir = collation_area / work_id
|
work_collation_dir = collation_area / work_id
|
||||||
if work_collation_dir.exists():
|
if work_collation_dir.exists():
|
||||||
|
if work_id not in specified_works:
|
||||||
continue
|
continue
|
||||||
|
if len(list(work_collation_dir.iterdir())) > 0:
|
||||||
|
print(f'Collation directory for work {work_id} already exists!')
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
work_collation_dir.rmdir()
|
||||||
|
|
||||||
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
|
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
|
||||||
if virtual == (1,):
|
if virtual == (1,):
|
||||||
|
if work_id in specified_works:
|
||||||
|
print(f'Work {work_id} is virtual!')
|
||||||
|
break
|
||||||
continue
|
continue
|
||||||
|
|
||||||
work_staging_dir = collation_staging_area / work_id
|
work_staging_dir = collation_staging_area / work_id
|
||||||
|
|
||||||
collator = Collator(work_staging_dir, [], args)
|
collator = Collator(work_staging_dir, exclusions, args)
|
||||||
collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
|
for group in works_groups.get(work_id, [[work_path]]):
|
||||||
|
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
|
||||||
|
if not collation_result:
|
||||||
|
print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
|
||||||
|
break
|
||||||
|
|
||||||
if collation_result and collator.index > 0:
|
if collation_result and collator.index > 0:
|
||||||
print(f'Collated {collator.index} pages for {work_id}')
|
print(f'Collated {collator.index} pages for {work_id}')
|
||||||
work_staging_dir.rename(work_collation_dir)
|
work_staging_dir.rename(work_collation_dir)
|
||||||
|
@ -253,7 +314,7 @@ def collate(args):
|
||||||
if not collation_result:
|
if not collation_result:
|
||||||
print(f'Unable to deduce file structure for {work_id}, skipping')
|
print(f'Unable to deduce file structure for {work_id}, skipping')
|
||||||
elif collator.index == 0:
|
elif collator.index == 0:
|
||||||
print(f'{work_id} contains no files? skipping')
|
print(f'No files found for {work_id}, skipping')
|
||||||
|
|
||||||
collation_staging_area.rmdir()
|
collation_staging_area.rmdir()
|
||||||
con.close()
|
con.close()
|
||||||
|
@ -688,9 +749,6 @@ def superior_or_equal(a, b):
|
||||||
return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
|
return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
|
||||||
|
|
||||||
|
|
||||||
def self_and_parents(path):
|
|
||||||
return [path] + list(path.parents)
|
|
||||||
|
|
||||||
def parse_expressions(tokens):
|
def parse_expressions(tokens):
|
||||||
groups = []
|
groups = []
|
||||||
exclusions = []
|
exclusions = []
|
||||||
|
@ -727,61 +785,6 @@ def parse_group(tokens):
|
||||||
def normalize_to(path, ref):
|
def normalize_to(path, ref):
|
||||||
return ref / Path(relpath(path, ref))
|
return ref / Path(relpath(path, ref))
|
||||||
|
|
||||||
def manual_collate(args):
|
|
||||||
(raw_groups, raw_exclusions) = parse_expressions(args.expression)
|
|
||||||
|
|
||||||
extraction_dir = args.destdir / 'extract'
|
|
||||||
|
|
||||||
sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
|
|
||||||
work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name
|
|
||||||
|
|
||||||
exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
|
|
||||||
|
|
||||||
if raw_groups:
|
|
||||||
groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
|
|
||||||
else:
|
|
||||||
groups = [[extraction_dir / work_id]]
|
|
||||||
|
|
||||||
collation_area = args.destdir / 'site' / 'images'
|
|
||||||
collation_area.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
work_collation_dir = collation_area / work_id
|
|
||||||
if work_collation_dir.exists():
|
|
||||||
if len(list(work_collation_dir.iterdir())) > 0:
|
|
||||||
print('Collation directory already exists!')
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
work_collation_dir.rmdir()
|
|
||||||
|
|
||||||
nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
|
|
||||||
if len(nonexistent) > 0:
|
|
||||||
print(f'Nonexistent paths: {nonexistent}')
|
|
||||||
return
|
|
||||||
|
|
||||||
collation_staging_area = args.destdir / 'site' / 'images-staging'
|
|
||||||
work_staging_dir = collation_staging_area / work_id
|
|
||||||
work_staging_dir.mkdir(parents=True)
|
|
||||||
|
|
||||||
collator = Collator(work_staging_dir, exclusions, args)
|
|
||||||
for group in groups:
|
|
||||||
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
|
|
||||||
if collation_result is None:
|
|
||||||
print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
|
|
||||||
break
|
|
||||||
|
|
||||||
if collation_result and collator.index > 0:
|
|
||||||
print(f'Collated {collator.index} pages for {work_id}')
|
|
||||||
work_staging_dir.rename(work_collation_dir)
|
|
||||||
else:
|
|
||||||
for f in work_staging_dir.iterdir():
|
|
||||||
f.unlink()
|
|
||||||
work_staging_dir.rmdir()
|
|
||||||
|
|
||||||
if collation_result and collator.index == 0:
|
|
||||||
print(f'No files found for {work_id}')
|
|
||||||
|
|
||||||
collation_staging_area.rmdir()
|
|
||||||
|
|
||||||
|
|
||||||
def fmt_size(s):
|
def fmt_size(s):
|
||||||
return f'{s[0]}x{s[1]}px'
|
return f'{s[0]}x{s[1]}px'
|
||||||
|
@ -1022,9 +1025,9 @@ argparser = argparse.ArgumentParser(
|
||||||
subfolder.
|
subfolder.
|
||||||
- `fetch` metadata and thumbnail images for extracted works
|
- `fetch` metadata and thumbnail images for extracted works
|
||||||
from DLSite.
|
from DLSite.
|
||||||
- `collate` and/or `manual-collate` extracted works,
|
- `collate` extracted works, producing a single sequence of
|
||||||
producing a single sequence of image files (or symlinks
|
image files (or symlinks into the extracted data, when
|
||||||
into the extracted data, when possible) for each work.
|
possible) for each work.
|
||||||
- Manually adjust works' `metadata` when necessary.
|
- Manually adjust works' `metadata` when necessary.
|
||||||
- `generate` a static website providing a catalog and viewer
|
- `generate` a static website providing a catalog and viewer
|
||||||
for all collated works.
|
for all collated works.
|
||||||
|
@ -1047,7 +1050,7 @@ argparser.add_argument(
|
||||||
)
|
)
|
||||||
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
||||||
|
|
||||||
parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles')
|
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles')
|
||||||
parser_extract.add_argument(
|
parser_extract.add_argument(
|
||||||
'-r', '--remove',
|
'-r', '--remove',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
|
@ -1062,111 +1065,96 @@ parser_extract.add_argument(
|
||||||
)
|
)
|
||||||
parser_extract.set_defaults(func=extract)
|
parser_extract.set_defaults(func=extract)
|
||||||
|
|
||||||
parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails')
|
parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails')
|
||||||
parser_fetch.set_defaults(func=fetch)
|
parser_fetch.set_defaults(func=fetch)
|
||||||
|
|
||||||
parser_collate = subparsers.add_parser(
|
parser_collate = subparsers.add_parser(
|
||||||
'collate',
|
'collate',
|
||||||
aliases=['c', 'co', 'col'],
|
aliases=['c'],
|
||||||
help='collate each work into a sequence of image files',
|
help='collate works into sequences of image files',
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
description=textwrap.dedent("""\
|
description=textwrap.dedent("""\
|
||||||
For each extracted work that has not already been collated,
|
For each extracted work that has not already been collated,
|
||||||
DLibrary will attempt to intuit its structure as follows:
|
DLibrary will attempt to intuit its structure and create
|
||||||
|
a single ordered list of image files in the site data
|
||||||
|
directory. Each image will either be a symlink to an image
|
||||||
|
file in the extraction folder, or a single page extracted
|
||||||
|
from a PDF file.
|
||||||
|
|
||||||
- Enter the work's directory. If the directory contains
|
DLibrary may fail to automatically collate a work if its
|
||||||
nothing except a single subdirectory (ignoring a few types
|
files and subdirectories are not named in a way that
|
||||||
of files that are definitely not relevant), traverse
|
indicates a clear linear ordering. In order to assist with
|
||||||
downwards repeatedly.
|
collation, you can provide a list of expressions specifying
|
||||||
- If the current directory contains nothing except a single
|
where to start traversing the directory structure, what
|
||||||
PDF (again, ignoring irrelevant files), attempt to extract
|
files to include in what order, and/or what files to ignore
|
||||||
a series of images from the PDF. This process expects that
|
entirely.
|
||||||
each page of the PDF consists of a single embedded image,
|
|
||||||
which will be extracted at full resolution. Support for
|
|
||||||
more complex PDFs is not yet implemented.
|
|
||||||
- If the current directory contains nothing except image
|
|
||||||
files, and the image files are named in a way that clearly
|
|
||||||
indicates a complete numerical order (each filename
|
|
||||||
consists of a shared prefix followed by a distinct
|
|
||||||
number), symlink files in the inferred order.
|
|
||||||
- Otherwise, skip processing this work for now.
|
|
||||||
|
|
||||||
DLibrary can be given "collation hints" which provide
|
An expression can be:
|
||||||
alternative starting points for this search process. A hint
|
|
||||||
is a path under $DLIBRARY_DIR/extract/[work id]/
|
|
||||||
indicating a different directory or PDF file to begin the
|
|
||||||
search process for that work, rather than starting at the
|
|
||||||
top level of the extracted data. There can be at most one
|
|
||||||
hint per work; for more complicated scenarios where a work
|
|
||||||
includes multiple folders that need to be collated together,
|
|
||||||
or where filenames do not clearly indicate an ordering, use
|
|
||||||
`manual-collate` instead.
|
|
||||||
"""),
|
|
||||||
)
|
|
||||||
parser_collate.add_argument(
|
|
||||||
'hints',
|
|
||||||
metavar='PATH',
|
|
||||||
type=Path,
|
|
||||||
nargs='*',
|
|
||||||
help='paths within extraction folders as collation hints'
|
|
||||||
)
|
|
||||||
parser_collate.set_defaults(func=collate, force_convert_pdf=False)
|
|
||||||
|
|
||||||
parser_manual_collate = subparsers.add_parser(
|
|
||||||
'manual-collate',
|
|
||||||
aliases=['mc', 'man', 'manual'],
|
|
||||||
help='collate a single work manually',
|
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
||||||
description=textwrap.dedent("""\
|
|
||||||
Provide an expression or sequence of expressions specifying groups
|
|
||||||
of paths to collate or skip. An expression can be:
|
|
||||||
|
|
||||||
PATH
|
PATH
|
||||||
A single path. If this is an image, it will be appended to
|
A single path. If this is an image, it will be appended to
|
||||||
the sequence of collated images; if this is a PDF, images will be
|
the sequence of collated images for the work it belongs to;
|
||||||
extracted from it and concatenated to the sequence; if this is a
|
if this is a PDF, images will be extracted from it and
|
||||||
directory, the contents of the directory will be collated based on
|
concatenated to the sequence; if this is a directory, the
|
||||||
the normal heuristics and concatenated to the sequence.
|
contents of the directory will be automatically collated
|
||||||
|
using DLibrary's default heuristics, and concatenated
|
||||||
|
to the sequence.
|
||||||
|
|
||||||
( PATH [PATH ...] )
|
( PATH [PATH ...] )
|
||||||
A group of paths contained in parentheses. You may need to escape
|
A group of paths contained in parentheses. You may need to escape
|
||||||
the parentheses to avoid them getting parsed by your shell.
|
the parentheses to avoid them getting parsed by your shell.
|
||||||
All the paths in this group will be considered together, and
|
All the paths in this group will be considered together, and
|
||||||
collated based on the normal heuristics, regardless of what
|
automatically collated using the default heuristics, regardless
|
||||||
order the paths are provided in.
|
of what order the paths are provided in.
|
||||||
|
|
||||||
! PATH
|
! PATH
|
||||||
! ( PATH [PATH ...] )
|
! ( PATH [PATH ...] )
|
||||||
A path or group of paths to exclude from collation. You may
|
A path or group of paths to exclude from collation. You may
|
||||||
need to escape the !. If an excluded path appears within any
|
need to escape the !. If an excluded path appears within any
|
||||||
of the other specified paths, it will be ignored.
|
of the other specified paths, it will be skipped by the collation
|
||||||
|
heuristics.
|
||||||
|
|
||||||
If the only expressions provided are negations, then auto-collation
|
If the only expressions provided are negations, then auto-collation
|
||||||
will start from the top level of the extracted work while excluding
|
will start from the top level of the extracted work while skipping
|
||||||
the negated paths.
|
the excluded paths.
|
||||||
|
|
||||||
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
|
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
|
||||||
for the work being manually collated. `manual-collate` can
|
for some not-yet-collated work. Paths belonging to multiple
|
||||||
only handle one work at a time.
|
different works can all be provided on the same command line, and
|
||||||
"""),
|
expressions will be clustered together by work id while otherwise
|
||||||
|
preserving the order they were provided in. A parenthesized group
|
||||||
|
expression must only contain paths belonging to a single work.
|
||||||
|
|
||||||
|
By default, DLibrary will attempt to collate every not-yet-collated
|
||||||
|
work (excluding "virtual" works), using the provided expressions
|
||||||
|
to assist in collation when available. The `-o` flag will direct
|
||||||
|
DLibrary to *only* collate works included in the provided expressions,
|
||||||
|
even if other uncollated works are present.
|
||||||
|
"""),
|
||||||
)
|
)
|
||||||
parser_manual_collate.add_argument(
|
parser_collate.add_argument(
|
||||||
|
'-o', '--only-specified-works',
|
||||||
|
action='store_true',
|
||||||
|
help="only collate works that are explicitly specified",
|
||||||
|
)
|
||||||
|
parser_collate.add_argument(
|
||||||
'--force-convert-pdf',
|
'--force-convert-pdf',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
|
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
|
||||||
)
|
)
|
||||||
parser_manual_collate.add_argument(
|
parser_collate.add_argument(
|
||||||
'expression',
|
'expression',
|
||||||
nargs='+',
|
nargs='*',
|
||||||
help='expressions indicating paths to collate or skip',
|
help='expressions indicating paths to collate or skip',
|
||||||
)
|
)
|
||||||
parser_manual_collate.set_defaults(func=manual_collate)
|
parser_collate.set_defaults(func=collate)
|
||||||
|
|
||||||
parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation')
|
|
||||||
|
parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation')
|
||||||
parser_analyze.add_argument('work_id')
|
parser_analyze.add_argument('work_id')
|
||||||
parser_analyze.set_defaults(func=analyze)
|
parser_analyze.set_defaults(func=analyze)
|
||||||
|
|
||||||
parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work')
|
parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work')
|
||||||
parser_metadata.add_argument('work_id')
|
parser_metadata.add_argument('work_id')
|
||||||
parser_metadata.add_argument(
|
parser_metadata.add_argument(
|
||||||
'--virtual',
|
'--virtual',
|
||||||
|
@ -1177,7 +1165,7 @@ parser_metadata.set_defaults(func=metadata)
|
||||||
|
|
||||||
parser_generate = subparsers.add_parser(
|
parser_generate = subparsers.add_parser(
|
||||||
'generate',
|
'generate',
|
||||||
aliases=['g', 'gen'],
|
aliases=['g'],
|
||||||
help='generate HTML/CSS/JS for library site',
|
help='generate HTML/CSS/JS for library site',
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
description=textwrap.dedent("""\
|
description=textwrap.dedent("""\
|
||||||
|
|
Loading…
Reference in a new issue