completely refactor manual collation to be more ergonomic
parent 9353357dc9
commit bee5f7c58a
@@ -341,20 +341,20 @@ def is_image(path):
 def ignoreable(path):
     return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)

-def ls_ignore(directory):
+def ls_ignore(directory, exclude):
     return [
         path for path in directory.iterdir()
-        if not ignoreable(path)
+        if not ignoreable(path) and path not in exclude
     ]

-def descendant_files_ignore(path):
+def descendant_files_ignore(path, exclude):
     if path.is_file():
         return [path]

     result = []
-    for item in ls_ignore(path):
+    for item in ls_ignore(path, exclude):
         if item.is_dir():
-            result.extend(descendant_files_ignore(item))
+            result.extend(descendant_files_ignore(item, exclude))
         else:
             result.append(item)

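To see what the new exclude parameter does, here is a minimal, self-contained sketch of the walk above (ignoreable() is reduced to a stub and the directory tree is made up for the example; the real script also filters IGNOREABLE_FILES and IGNOREABLE_EXTENSIONS):

import tempfile
from pathlib import Path

def ignoreable(path):
    return False  # stub for this sketch

def ls_ignore(directory, exclude):
    # Skip both "ignoreable" entries and anything explicitly excluded.
    return [
        path for path in directory.iterdir()
        if not ignoreable(path) and path not in exclude
    ]

def descendant_files_ignore(path, exclude):
    # Recursively list files, honouring the exclude list at every level.
    if path.is_file():
        return [path]
    result = []
    for item in ls_ignore(path, exclude):
        if item.is_dir():
            result.extend(descendant_files_ignore(item, exclude))
        else:
            result.append(item)
    return result

# Throwaway tree: one directory to keep, one to exclude.
root = Path(tempfile.mkdtemp())
(root / 'keep').mkdir()
(root / 'keep' / 'page1.png').touch()
(root / 'junk').mkdir()
(root / 'junk' / 'page2.png').touch()

print(descendant_files_ignore(root, [root / 'junk']))
# Only .../keep/page1.png is listed; the excluded directory is pruned during the walk.

Because the comparison is plain Path equality, manual_collate (later in this diff) runs both the group paths and the exclusions through normalize_to() so that equivalent spellings of the same path actually match.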
@@ -365,7 +365,7 @@ def collate(args):
     cur = con.cursor()

     extraction_dir = args.destdir / 'extract'
-    hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints}
+    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}

     collation_staging_area = args.destdir / 'site' / 'images-staging'
     collation_staging_area.mkdir(parents=True)
@@ -383,7 +383,7 @@ def collate(args):

         work_staging_dir = collation_staging_area / work_id

-        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
+        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
         if pages_collated:
             print(f'Collated {pages_collated} pages for {work_id}')
             work_staging_dir.rename(collation_dir)
@@ -401,7 +401,7 @@ def collate(args):
     collation_staging_area.rmdir()
     con.close()

-def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
+def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
     early_srcs = []
     middle_srcs = []
     late_srcs = []
@@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
     if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
         return False

-    early_page_count = collate_from_paths(early_srcs, dest, start_index)
+    early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude)
     if early_page_count is None:
         return None
     start_index += early_page_count

-    middle_page_count = collate_from_paths(middle_srcs, dest, start_index)
+    middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude)
     if middle_page_count is None:
         return None
     start_index += middle_page_count

-    late_page_count = collate_from_paths(late_srcs, dest, start_index)
+    late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude)
     if late_page_count is None:
         return None

@@ -458,13 +458,13 @@ def median(items):
 def superior_or_equal(a, b):
     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))

-def try_collate_images_vs_pdf(srcs, dest, start_index):
+def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
     pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
     if len(pdfs) != 1:
         return False
     outer_pdf = pdfs[0]

-    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)]
+    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
     if len(inner_pdfs) != 1:
         return False
     inner_pdf = inner_pdfs[0]
@@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
     non_pdf_srcs = [src for src in srcs if src != outer_pdf]
     images = []
     non_images = []
-    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
+    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
     for f in descendant_files:
         if is_image(f):
             images.append(f)
@@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index):
         return False

     if superior_or_equal(median_standalone_size, median_pdf_size):
-        return collate_from_paths(non_pdf_srcs, dest, start_index)
+        return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
     elif superior_or_equal(median_pdf_size, median_standalone_size):
-        return collate_from_paths([outer_pdf], dest, start_index)
+        return collate_from_paths([outer_pdf], dest, start_index, exclude)
     else:
         return False

-def collate_from_paths(srcs, dest, start_index):
+def collate_from_paths(srcs, dest, start_index, exclude):
     if len(srcs) == 1 and srcs[0].is_dir():
-        return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
+        return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)

     if len(srcs) == 1 and is_pdf(srcs[0]):
         print(f'Extracting images from {srcs[0]}')
@@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index):
     if len(hi_res_dirs) == 1:
         hi_res_dir = hi_res_dirs[0]
         lo_res_dir = next(src for src in srcs if src != hi_res_dir)
-        if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
-            return collate_from_paths([hi_res_dir], dest, start_index)
+        if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
+            return collate_from_paths([hi_res_dir], dest, start_index, exclude)

-    textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
+    textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
     if textless_split != False:
         return textless_split

-    epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
+    epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
     if epilogue_split != False:
         return epilogue_split

-    cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
+    cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX)
     if cover_split != False:
         return cover_split

@@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index):
     else:
         return None

-    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
+    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
     if images_vs_pdf != False:
         return images_vs_pdf

@@ -549,49 +549,104 @@ def collate_from_paths(srcs, dest, start_index):
 def self_and_parents(path):
     return [path] + list(path.parents)

-def manual_collate(args):
-    work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name
-
-    collation_dir = args.destdir / 'site' / 'images' / work_id
-    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
-        print(f'Collation directory already exists!')
-        return
-
-    nonexistent = [path for path in args.paths if not path.exists()]
-    if len(nonexistent) > 0:
-        print(f'Nonexistent paths: {nonexistent}')
-        return
-
-    collation_dir.mkdir(parents=True, exist_ok=True)
-
-    index = 0
-    for path in args.paths:
-        if path.is_dir():
-            entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
-            ordering = complete_prefix_number_ordering(entries)
-            if ordering is None:
-                ordering = entries
-                ordering.sort()
-            link_ordered_files(ordering, collation_dir, index)
-            index += len(ordering)
-        elif is_image(path):
-            link_ordered_files([path], collation_dir, index)
-            index += 1
-        elif is_pdf(path):
-            pdf_page_count = link_pdf(path, collation_dir, index)
-            if pdf_page_count is None:
-                return
-            index += pdf_page_count
-        else:
-            print(f'Unknown file type {path}, stopping')
-            return
+def parse_expressions(tokens):
+    groups = []
+    exclusions = []
+
+    while tokens:
+        token = tokens.pop(0)
+        if token == '!':
+            exclusions.extend(parse_exclusion(tokens))
+        elif token == '(':
+            groups.append(parse_group(tokens))
+        else:
+            groups.append([token])
+
+    return (groups, exclusions)
+
+def parse_exclusion(tokens):
+    token = tokens.pop(0)
+
+    if token == '(':
+        return parse_group(tokens)
+    else:
+        return [token]
+
+def parse_group(tokens):
+    items = []
+
+    while True:
+        token = tokens.pop(0)
+        if token == ')':
+            return items
+        else:
+            items.append(token)
+
+def normalize_to(path, ref):
+    return ref / Path(relpath(path, ref))
+
+def manual_collate(args):
+    (raw_groups, raw_exclusions) = parse_expressions(args.expression)
+
+    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
+
+    if raw_groups:
+        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
+    else:
+        groups = [[args.destdir / 'extract' / args.work_id]]
+
+    collation_dir = args.destdir / 'site' / 'images' / args.work_id
+    if collation_dir.exists():
+        if len(list(collation_dir.iterdir())) > 0:
+            print(f'Collation directory already exists!')
+            return
+        else:
+            collation_dir.rmdir()
+
+    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
+    if len(nonexistent) > 0:
+        print(f'Nonexistent paths: {nonexistent}')
+        return
+
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
+    work_staging_dir = collation_staging_area / args.work_id
+    work_staging_dir.mkdir(parents=True)
+
+    pages_collated = 0
+    for group in groups:
+        pages_added = collate_from_paths(
+            [item for item in group if item not in exclusions],
+            work_staging_dir,
+            pages_collated,
+            exclusions,
+        )
+        if pages_added is None:
+            print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}')
+            pages_collated = None
+            break
+
+        pages_collated += pages_added
+
+    if pages_collated:
+        print(f'Collated {pages_collated} pages for {args.work_id}')
+        work_staging_dir.rename(collation_dir)
+    else:
+        for f in work_staging_dir.iterdir():
+            f.unlink()
+        work_staging_dir.rmdir()
+
+        if pages_collated == 0:
+            print(f'No files found for {work_id}')
+
+    collation_staging_area.rmdir()
+

 def fmt_size(s):
     return f'{s[0]}x{s[1]}px'

 def analyze(args):
     extract_dir = args.destdir / 'extract'
-    files = descendant_files_ignore(extract_dir / args.work_id)
+    files = descendant_files_ignore(extract_dir / args.work_id, [])
     files.sort()

     for f in files:
@@ -869,28 +924,43 @@ parser_manual_collate = subparsers.add_parser(
     help='collate a single work manually',
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description=textwrap.dedent("""\
+    Provide an expression or sequence of expressions specifying groups
+    of paths to collate or skip. An expression can be:
+
+    PATH
+        A single path. If this is an image, it will be appended to
+        the sequence of collated images; if this is a PDF, images will be
+        extracted from it and concatenated to the sequence; if this is a
+        directory, the contents of the directory will be collated based on
+        the normal heuristics and concatenated to the sequence.
+
+    ( PATH [PATH ...] )
+        A group of paths contained in parentheses. You may need to escape
+        the parentheses to avoid them getting parsed by your shell.
+        All the paths in this group will be considered together, and
+        collated based on the normal heuristics, regardless of what
+        order the paths are provided in.
+
+    ! PATH
+    ! ( PATH [PATH ...] )
+        A path or group of paths to exclude from collation. You may
+        need to escape the !. If an excluded path appears within any
+        of the other specified paths, it will be ignored.
+
+    If the only expressions provided are negations, then auto-collation
+    will start from the top level of the extracted work while excluding
+    the negated paths.
+
     All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
     for the work being manually collated. `manual-collate` can
     only handle one work at a time. Paths are used as follows:
-
-    - If a path is a directory, all *image files* immediately
-      inside that directory will be appended to the sequence. If
-      files are named in a way which indicates a clear ordering,
-      that ordering will be used. Otherwise, filenames will be
-      sorted lexicographically. Non-image files and
-      subdirectories will be ignored.
-    - If a path is an image file, that image file will be
-      appended to the sequence.
-    - If a path is a PDF file, page images will be extracted
-      from that PDF and appended to the sequence.
     """),
 )
+parser_manual_collate.add_argument('work_id')
 parser_manual_collate.add_argument(
-    'paths',
-    metavar='PATH',
-    type=Path,
+    'expression',
     nargs='+',
-    help='paths within a single work to be collated in sequence',
+    help='expressions indicating paths to collate or skip',
 )
 parser_manual_collate.set_defaults(func=manual_collate)

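As a worked example of the expression grammar described above, here is a self-contained sketch that reuses the three parser functions from this commit on a made-up argument list (the file names are invented, and mimic what argparse would hand manual_collate as args.expression after shell splitting):

def parse_expressions(tokens):
    groups = []
    exclusions = []

    while tokens:
        token = tokens.pop(0)
        if token == '!':
            exclusions.extend(parse_exclusion(tokens))
        elif token == '(':
            groups.append(parse_group(tokens))
        else:
            groups.append([token])

    return (groups, exclusions)

def parse_exclusion(tokens):
    token = tokens.pop(0)

    if token == '(':
        return parse_group(tokens)
    else:
        return [token]

def parse_group(tokens):
    items = []

    while True:
        token = tokens.pop(0)
        if token == ')':
            return items
        else:
            items.append(token)

# e.g. manual-collate somework '(' scans/a.png scans/b.png ')' cover.jpg '!' scans/junk
tokens = ['(', 'scans/a.png', 'scans/b.png', ')', 'cover.jpg', '!', 'scans/junk']
groups, exclusions = parse_expressions(tokens)
print(groups)      # [['scans/a.png', 'scans/b.png'], ['cover.jpg']]
print(exclusions)  # ['scans/junk']

Each group is then normalized against the destination directory and passed to collate_from_paths() as its own subsequence, with the normalized exclusions threaded through every recursive call.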
|
|
Loading…
Reference in a new issue