add support for explicitly specifying paths for collation

This commit is contained in:
xenofem 2024-01-22 03:49:00 -05:00
parent 3714eecab9
commit 7680a174fc

View file

@ -106,55 +106,12 @@ def fetch(args):
asyncio.run(fetch_async(args))
# Pre-change version of collate() (replaced below by the hint-aware variant).
# Collates each extracted work into one ordered image sequence under site/works/.
def collate(args):
# Metadata DB created by the fetch step.
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
# One directory per downloaded work under extract/.
for work_path in (args.destdir / 'extract').iterdir():
work_id = work_path.name
collation_dir = args.destdir / 'site' / 'works' / work_id
# Already collated on a previous run; leave it alone.
if collation_dir.exists():
continue
# fetchone() returns a 1-tuple; (1,) marks a virtual work with no files of its own.
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
if virtual == (1,):
continue
# Descend through solitary wrapper directories until real content is reached.
search_dir = work_path
while True:
entries = list(search_dir.iterdir())
if len(entries) == 1 and entries[0].is_dir():
search_dir = entries[0]
else:
break
# A single PDF: extract its page images into the collation directory.
if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
print(f'Extracting images from {entries[0].name} for {work_id}')
link_pdf(entries[0], collation_dir)
continue
if len(entries) == 0:
print(f'{work_id} contains no files? skipping')
continue
# A flat directory of image files: symlink them in deduced page order.
if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
ordering = complete_prefix_number_ordering(entries)
if ordering:
print(f'Symlinking image files for {work_id}')
link_ordered_files(ordering, collation_dir)
continue
print(f'Unable to deduce file structure for {work_id}, skipping')
con.close()
def link_pdf(src, dest):
def link_pdf(src, dest, start_index=0):
with fitz.open(src) as pdf:
images_by_page = [page.get_images() for page in pdf]
if all(len(images) == 1 for images in images_by_page):
dest.mkdir(parents=True)
for (idx, images) in enumerate(images_by_page):
dest.mkdir(parents=True, exist_ok=True)
for (idx, images) in enumerate(images_by_page, start=start_index):
xref = images[0][0]
image = pdf.extract_image(xref)
file_path = dest / f'{idx:04d}.{image["ext"]}'
@ -184,14 +141,92 @@ def complete_prefix_number_ordering(entries):
return [e for (e, i) in entries_with_indices]
return None
# Diff pair: old signature (removed) followed by new signature (added).
# The new form accepts start_index so callers can append to an existing sequence.
def link_ordered_files(ordering, dest):
# Old: mkdir fails if dest exists.  New: exist_ok allows incremental collation.
dest.mkdir(parents=True)
def link_ordered_files(ordering, dest, start_index=0):
dest.mkdir(parents=True, exist_ok=True)
# Old: numbering always starts at 0.  New: numbering continues from start_index.
for (idx, src_path) in enumerate(ordering):
for (idx, src_path) in enumerate(ordering, start=start_index):
ext = src_path.suffix.lower()
# Zero-padded page number keeps lexicographic order equal to page order.
link_path = dest / f'{idx:04d}{ext}'
# Relative symlink so the site tree stays relocatable.
link_path.symlink_to(relpath(src_path, dest))
def collate(args):
    """Collate each extracted work into one ordered image sequence.

    For every work directory under ``destdir/extract``, deduce the page
    image sequence and populate ``destdir/site/works/<work_id>`` via
    ``link_pdf`` / ``link_ordered_files``.  A hint path inside a work's
    extraction folder may be supplied (``args.hints``) to override the
    automatic search for that work.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()
    extraction_dir = args.destdir / 'extract'
    # Map each hint to its work via the first path component below extract/.
    # parts[0] is equivalent to parents[-2].name for nested hints, and it
    # also works when the hint is the work directory itself (a one-component
    # relative path), where parents[-2] would raise IndexError.
    hint_map = {hint.relative_to(extraction_dir).parts[0]: hint for hint in args.hints}
    for work_path in extraction_dir.iterdir():
        work_id = work_path.name
        collation_dir = args.destdir / 'site' / 'works' / work_id
        # Already collated on a previous run; leave it alone.
        if collation_dir.exists():
            continue
        # fetchone() returns a 1-tuple; (1,) marks a virtual work with no files.
        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue
        if work_id in hint_map:
            # Explicit hint: use it directly instead of searching.
            hint = hint_map[work_id]
            entries = list(hint.iterdir()) if hint.is_dir() else [hint]
        else:
            # Descend through solitary wrapper directories to the real content.
            search_dir = work_path
            while True:
                entries = list(search_dir.iterdir())
                if len(entries) == 1 and entries[0].is_dir():
                    search_dir = entries[0]
                else:
                    break
        # A single PDF: extract its page images into the collation directory.
        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
            print(f'Extracting images from {entries[0]} for {work_id}')
            link_pdf(entries[0], collation_dir)
            continue
        if len(entries) == 0:
            print(f'{work_id} contains no files? skipping')
            continue
        # A flat directory of image files: symlink them in deduced page order.
        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
            ordering = complete_prefix_number_ordering(entries)
            if ordering:
                print(f'Symlinking image files for {work_id}')
                link_ordered_files(ordering, collation_dir)
                continue
        print(f'Unable to deduce file structure for {work_id}, skipping')
    con.close()
def manual_collate(args):
    """Collate one work from an explicit, ordered list of paths.

    Each path is either an image file (symlinked as the next page), a PDF
    (its page images are extracted), or a directory (images directly inside
    it are symlinked, best-effort ordered).  Page numbers run consecutively
    across all paths, in the order given on the command line.
    """
    collation_dir = args.destdir / 'site' / 'works' / args.work_id
    # Refuse to mix a manual collation into an existing non-empty one.
    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
        print('Collation directory already exists!')
        return
    collation_dir.mkdir(parents=True, exist_ok=True)
    index = 0  # next page number to assign
    for path in args.paths:
        if path.is_dir():
            # Only direct children that look like images; no recursion.
            entries = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS]
            ordering = complete_prefix_number_ordering(entries)
            if ordering is None:
                # Numbering could not be deduced; fall back to lexicographic order.
                ordering = entries
                ordering.sort()
            link_ordered_files(ordering, collation_dir, start_index=index)
            index += len(ordering)
        elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
            link_ordered_files([path], collation_dir, start_index=index)
            index += 1
        elif path.suffix.lower() == ".pdf":
            link_pdf(path, collation_dir, start_index=index)
            # NOTE(review): advancing by page_count assumes link_pdf emitted
            # exactly one image per page; link_pdf only extracts when every
            # page has a single image, so the index may drift on PDFs that
            # fail that check — confirm against link_pdf's full body.
            with fitz.open(path) as pdf:
                index += pdf.page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return
def metadata(args):
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
@ -251,8 +286,26 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
parser_fetch.set_defaults(func=fetch)
# CLI wiring for the two collation sub-commands.
# `collate` walks every extracted work; optional hint paths override its search.
parser_collate = subparsers.add_parser('collate', help='collate a single sequence of image files for each work')
parser_collate.add_argument(
'hints',
metavar='PATH',
type=Path,
# Zero or more hints; collate() keys them by work id, so at most one per work is honored.
nargs='*',
help='manually-specified paths of subdirectories or PDFs within extraction folders, at most one per work',
)
parser_collate.set_defaults(func=collate)
# `manual-collate` targets one work and takes the full page-source list explicitly.
parser_manual_collate = subparsers.add_parser('manual-collate', help='collate a specific work manually, specifying all paths to include')
parser_manual_collate.add_argument('work_id')
parser_manual_collate.add_argument(
'paths',
metavar='PATH',
type=Path,
# At least one path is required; order determines page order.
nargs='+',
help='paths of files (images to symlink, pdfs to extract) or directories (symlink all images in the directory, no recursion, best-effort sorting)'
)
parser_manual_collate.set_defaults(func=manual_collate)
parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
parser_metadata.add_argument('work_id')
parser_metadata.add_argument(