Compare commits

..

No commits in common. "ad71c6c280ae9e1f91e66b6787b72d5b598809ae" and "9353357dc9a0443adb8b6eb903486686e00fad67" have entirely different histories.

3 changed files with 70 additions and 140 deletions

View file

@ -341,20 +341,20 @@ def is_image(path):
def ignoreable(path): def ignoreable(path):
return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS) return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)
def ls_ignore(directory, exclude): def ls_ignore(directory):
return [ return [
path for path in directory.iterdir() path for path in directory.iterdir()
if not ignoreable(path) and path not in exclude if not ignoreable(path)
] ]
def descendant_files_ignore(path, exclude): def descendant_files_ignore(path):
if path.is_file(): if path.is_file():
return [path] return [path]
result = [] result = []
for item in ls_ignore(path, exclude): for item in ls_ignore(path):
if item.is_dir(): if item.is_dir():
result.extend(descendant_files_ignore(item, exclude)) result.extend(descendant_files_ignore(item))
else: else:
result.append(item) result.append(item)
@ -365,7 +365,7 @@ def collate(args):
cur = con.cursor() cur = con.cursor()
extraction_dir = args.destdir / 'extract' extraction_dir = args.destdir / 'extract'
hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints} hint_map = {hint.absolute().relative_to(extraction_dir.absolute()).parents[-2].name: hint for hint in args.hints}
collation_staging_area = args.destdir / 'site' / 'images-staging' collation_staging_area = args.destdir / 'site' / 'images-staging'
collation_staging_area.mkdir(parents=True) collation_staging_area.mkdir(parents=True)
@ -383,7 +383,7 @@ def collate(args):
work_staging_dir = collation_staging_area / work_id work_staging_dir = collation_staging_area / work_id
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, []) pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
if pages_collated: if pages_collated:
print(f'Collated {pages_collated} pages for {work_id}') print(f'Collated {pages_collated} pages for {work_id}')
work_staging_dir.rename(collation_dir) work_staging_dir.rename(collation_dir)
@ -401,7 +401,7 @@ def collate(args):
collation_staging_area.rmdir() collation_staging_area.rmdir()
con.close() con.close()
def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None): def try_collate_split_regex(srcs, dest, start_index, earlier=None, later=None):
early_srcs = [] early_srcs = []
middle_srcs = [] middle_srcs = []
late_srcs = [] late_srcs = []
@ -416,17 +416,17 @@ def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, late
if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1: if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
return False return False
early_page_count = collate_from_paths(early_srcs, dest, start_index, exclude) early_page_count = collate_from_paths(early_srcs, dest, start_index)
if early_page_count is None: if early_page_count is None:
return None return None
start_index += early_page_count start_index += early_page_count
middle_page_count = collate_from_paths(middle_srcs, dest, start_index, exclude) middle_page_count = collate_from_paths(middle_srcs, dest, start_index)
if middle_page_count is None: if middle_page_count is None:
return None return None
start_index += middle_page_count start_index += middle_page_count
late_page_count = collate_from_paths(late_srcs, dest, start_index, exclude) late_page_count = collate_from_paths(late_srcs, dest, start_index)
if late_page_count is None: if late_page_count is None:
return None return None
@ -458,13 +458,13 @@ def median(items):
def superior_or_equal(a, b): def superior_or_equal(a, b):
return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b))) return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
def try_collate_images_vs_pdf(srcs, dest, start_index, exclude): def try_collate_images_vs_pdf(srcs, dest, start_index):
pdfs = [src for src in srcs if 'pdf' in src.name.lower()] pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
if len(pdfs) != 1: if len(pdfs) != 1:
return False return False
outer_pdf = pdfs[0] outer_pdf = pdfs[0]
inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)] inner_pdfs = [f for f in descendant_files_ignore(outer_pdf) if is_pdf(f)]
if len(inner_pdfs) != 1: if len(inner_pdfs) != 1:
return False return False
inner_pdf = inner_pdfs[0] inner_pdf = inner_pdfs[0]
@ -472,7 +472,7 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
non_pdf_srcs = [src for src in srcs if src != outer_pdf] non_pdf_srcs = [src for src in srcs if src != outer_pdf]
images = [] images = []
non_images = [] non_images = []
descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)] descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src)]
for f in descendant_files: for f in descendant_files:
if is_image(f): if is_image(f):
images.append(f) images.append(f)
@ -494,15 +494,15 @@ def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
return False return False
if superior_or_equal(median_standalone_size, median_pdf_size): if superior_or_equal(median_standalone_size, median_pdf_size):
return collate_from_paths(non_pdf_srcs, dest, start_index, exclude) return collate_from_paths(non_pdf_srcs, dest, start_index)
elif superior_or_equal(median_pdf_size, median_standalone_size): elif superior_or_equal(median_pdf_size, median_standalone_size):
return collate_from_paths([outer_pdf], dest, start_index, exclude) return collate_from_paths([outer_pdf], dest, start_index)
else: else:
return False return False
def collate_from_paths(srcs, dest, start_index, exclude): def collate_from_paths(srcs, dest, start_index):
if len(srcs) == 1 and srcs[0].is_dir(): if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude) return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
if len(srcs) == 1 and is_pdf(srcs[0]): if len(srcs) == 1 and is_pdf(srcs[0]):
print(f'Extracting images from {srcs[0]}') print(f'Extracting images from {srcs[0]}')
@ -516,18 +516,18 @@ def collate_from_paths(srcs, dest, start_index, exclude):
if len(hi_res_dirs) == 1: if len(hi_res_dirs) == 1:
hi_res_dir = hi_res_dirs[0] hi_res_dir = hi_res_dirs[0]
lo_res_dir = next(src for src in srcs if src != hi_res_dir) lo_res_dir = next(src for src in srcs if src != hi_res_dir)
if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)): if len(descendant_files_ignore(lo_res_dir)) == len(descendant_files_ignore(hi_res_dir)):
return collate_from_paths([hi_res_dir], dest, start_index, exclude) return collate_from_paths([hi_res_dir], dest, start_index)
textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX) textless_split = try_collate_split_regex(srcs, dest, start_index, later=TEXTLESS_REGEX)
if textless_split != False: if textless_split != False:
return textless_split return textless_split
epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX) epilogue_split = try_collate_split_regex(srcs, dest, start_index, later=EPILOGUE_REGEX)
if epilogue_split != False: if epilogue_split != False:
return epilogue_split return epilogue_split
cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=COVER_REGEX) cover_split = try_collate_split_regex(srcs, dest, start_index, earlier=COVER_REGEX)
if cover_split != False: if cover_split != False:
return cover_split return cover_split
@ -540,7 +540,7 @@ def collate_from_paths(srcs, dest, start_index, exclude):
else: else:
return None return None
images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude) images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index)
if images_vs_pdf != False: if images_vs_pdf != False:
return images_vs_pdf return images_vs_pdf
@ -549,104 +549,49 @@ def collate_from_paths(srcs, dest, start_index, exclude):
def self_and_parents(path): def self_and_parents(path):
return [path] + list(path.parents) return [path] + list(path.parents)
def parse_expressions(tokens):
    """Split a flat token sequence into collation groups and exclusions.

    Tokens are handled left to right:
      '!'    the following path or parenthesised group is an exclusion
      '('    starts a group that runs up to the matching ')'
      other  a single path, which becomes a group of its own

    Returns a tuple ``(groups, exclusions)`` where ``groups`` is a list of
    lists of tokens and ``exclusions`` is a flat list of tokens.

    The sequence passed in is not modified; parsing consumes a private copy.
    """
    # The helpers consume tokens destructively from the front. Work on a
    # copy so the caller's list is not emptied as a side effect (this also
    # lets any iterable of tokens be passed in).
    tokens = list(tokens)
    groups = []
    exclusions = []
    while tokens:
        token = tokens.pop(0)
        if token == '!':
            exclusions.extend(parse_exclusion(tokens))
        elif token == '(':
            groups.append(parse_group(tokens))
        else:
            groups.append([token])
    return (groups, exclusions)
def parse_exclusion(tokens):
    """Consume the operand that follows a '!' and return it as a list.

    ``tokens`` is consumed from the front. If the next token is '(' the
    rest of the group is read via ``parse_group``; otherwise the single
    token is returned wrapped in a list.

    Raises ValueError if no token follows the '!'.
    """
    if not tokens:
        # A bare trailing '!' would otherwise surface as an opaque
        # "pop from empty list" IndexError.
        raise ValueError("expected a path or '(' after '!'")
    token = tokens.pop(0)
    if token == '(':
        return parse_group(tokens)
    return [token]
def parse_group(tokens):
    """Consume tokens up to the closing ')' and return the ones before it.

    The opening '(' is expected to have been consumed by the caller.
    ``tokens`` is consumed from the front; the ')' itself is removed from
    ``tokens`` but is not part of the returned list.

    Raises ValueError if ``tokens`` runs out before a ')' is found.
    """
    items = []
    while tokens:
        token = tokens.pop(0)
        if token == ')':
            return items
        items.append(token)
    # Reaching here means the group was never closed; report that directly
    # instead of failing with "pop from empty list".
    raise ValueError("unterminated group: missing ')'")
def normalize_to(path, ref):
    """Return ``path`` re-expressed as a path underneath ``ref``.

    The relative route from ``ref`` to ``path`` is computed with
    ``relpath`` (presumably ``os.path.relpath`` -- confirm against the
    file's imports) and then joined back onto ``ref``.
    """
    relative = Path(relpath(path, ref))
    return ref / relative
def manual_collate(args): def manual_collate(args):
(raw_groups, raw_exclusions) = parse_expressions(args.expression) work_id = self_and_parents(args.paths[0].absolute().relative_to(args.destdir.absolute() / 'extract'))[-2].name
exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions] collation_dir = args.destdir / 'site' / 'images' / work_id
if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
if raw_groups:
groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
else:
groups = [[args.destdir / 'extract' / args.work_id]]
collation_dir = args.destdir / 'site' / 'images' / args.work_id
if collation_dir.exists():
if len(list(collation_dir.iterdir())) > 0:
print(f'Collation directory already exists!') print(f'Collation directory already exists!')
return return
else:
collation_dir.rmdir()
nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()] nonexistent = [path for path in args.paths if not path.exists()]
if len(nonexistent) > 0: if len(nonexistent) > 0:
print(f'Nonexistent paths: {nonexistent}') print(f'Nonexistent paths: {nonexistent}')
return return
collation_staging_area = args.destdir / 'site' / 'images-staging' collation_dir.mkdir(parents=True, exist_ok=True)
work_staging_dir = collation_staging_area / args.work_id
work_staging_dir.mkdir(parents=True)
pages_collated = 0 index = 0
for group in groups: for path in args.paths:
pages_added = collate_from_paths( if path.is_dir():
[item for item in group if item not in exclusions], entries = [p for p in path.iterdir() if p.is_file() and is_image(p)]
work_staging_dir, ordering = complete_prefix_number_ordering(entries)
pages_collated, if ordering is None:
exclusions, ordering = entries
) ordering.sort()
if pages_added is None: link_ordered_files(ordering, collation_dir, index)
print(f'Unable to deduce file structure for {args.work_id} subgroup {[str(path) for path in group]}') index += len(ordering)
pages_collated = None elif is_image(path):
break link_ordered_files([path], collation_dir, index)
index += 1
pages_collated += pages_added elif is_pdf(path):
pdf_page_count = link_pdf(path, collation_dir, index)
if pages_collated: if pdf_page_count is None:
print(f'Collated {pages_collated} pages for {args.work_id}') return
work_staging_dir.rename(collation_dir) index += pdf_page_count
else: else:
for f in work_staging_dir.iterdir(): print(f'Unknown file type {path}, stopping')
f.unlink() return
work_staging_dir.rmdir()
if pages_collated == 0:
print(f'No files found for {work_id}')
collation_staging_area.rmdir()
def fmt_size(s): def fmt_size(s):
return f'{s[0]}x{s[1]}px' return f'{s[0]}x{s[1]}px'
def analyze(args): def analyze(args):
extract_dir = args.destdir / 'extract' extract_dir = args.destdir / 'extract'
files = descendant_files_ignore(extract_dir / args.work_id, []) files = descendant_files_ignore(extract_dir / args.work_id)
files.sort() files.sort()
for f in files: for f in files:
@ -924,43 +869,28 @@ parser_manual_collate = subparsers.add_parser(
help='collate a single work manually', help='collate a single work manually',
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""\ description=textwrap.dedent("""\
Provide an expression or sequence of expressions specifying groups
of paths to collate or skip. An expression can be:
PATH
A single path. If this is an image, it will be appended to
the sequence of collated images; if this is a PDF, images will be
extracted from it and concatenated to the sequence; if this is a
directory, the contents of the directory will be collated based on
the normal heuristics and concatenated to the sequence.
( PATH [PATH ...] )
A group of paths contained in parentheses. You may need to escape
the parentheses to avoid them getting parsed by your shell.
All the paths in this group will be considered together, and
collated based on the normal heuristics, regardless of what
order the paths are provided in.
! PATH
! ( PATH [PATH ...] )
A path or group of paths to exclude from collation. You may
need to escape the !. If an excluded path appears within any
of the other specified paths, it will be ignored.
If the only expressions provided are negations, then auto-collation
will start from the top level of the extracted work while excluding
the negated paths.
All provided paths must be under $DLIBRARY_DIR/extract/[work id]/ All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
for the work being manually collated. `manual-collate` can for the work being manually collated. `manual-collate` can
only handle one work at a time. Paths are used as follows: only handle one work at a time. Paths are used as follows:
- If a path is a directory, all *image files* immediately
inside that directory will be appended to the sequence. If
files are named in a way which indicates a clear ordering,
that ordering will be used. Otherwise, filenames will be
sorted lexicographically. Non-image files and
subdirectories will be ignored.
- If a path is an image file, that image file will be
appended to the sequence.
- If a path is a PDF file, page images will be extracted
from that PDF and appended to the sequence.
"""), """),
) )
parser_manual_collate.add_argument('work_id')
parser_manual_collate.add_argument( parser_manual_collate.add_argument(
'expression', 'paths',
metavar='PATH',
type=Path,
nargs='+', nargs='+',
help='expressions indicating paths to collate or skip', help='paths within a single work to be collated in sequence',
) )
parser_manual_collate.set_defaults(func=manual_collate) parser_manual_collate.set_defaults(func=manual_collate)

View file

@ -47,7 +47,7 @@
dlibrary = buildPythonApplication { dlibrary = buildPythonApplication {
pname = "dlibrary"; pname = "dlibrary";
version = "0.2"; version = "0.1";
pyproject = true; pyproject = true;
propagatedBuildInputs = [ propagatedBuildInputs = [
pymupdf pymupdf

View file

@ -1,6 +1,6 @@
[project] [project]
name = "dlibrary" name = "dlibrary"
version = "0.2" version = "0.1"
description = "Cataloging tool and viewer for downloaded DLSite purchases" description = "Cataloging tool and viewer for downloaded DLSite purchases"
license = {file = "LICENSE"} license = {file = "LICENSE"}
authors = [{name = "xenofem"}] authors = [{name = "xenofem"}]