add support for explicitly specifying paths for collation
This commit is contained in:
parent
3714eecab9
commit
7680a174fc
151
dlibrary.py
151
dlibrary.py
|
@ -106,55 +106,12 @@ def fetch(args):
|
|||
asyncio.run(fetch_async(args))
|
||||
|
||||
|
||||
def collate(args):
|
||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||
cur = con.cursor()
|
||||
|
||||
for work_path in (args.destdir / 'extract').iterdir():
|
||||
work_id = work_path.name
|
||||
|
||||
collation_dir = args.destdir / 'site' / 'works' / work_id
|
||||
if collation_dir.exists():
|
||||
continue
|
||||
|
||||
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
|
||||
if virtual == (1,):
|
||||
continue
|
||||
|
||||
search_dir = work_path
|
||||
while True:
|
||||
entries = list(search_dir.iterdir())
|
||||
if len(entries) == 1 and entries[0].is_dir():
|
||||
search_dir = entries[0]
|
||||
else:
|
||||
break
|
||||
|
||||
if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
|
||||
print(f'Extracting images from {entries[0].name} for {work_id}')
|
||||
link_pdf(entries[0], collation_dir)
|
||||
continue
|
||||
|
||||
if len(entries) == 0:
|
||||
print(f'{work_id} contains no files? skipping')
|
||||
continue
|
||||
|
||||
if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
|
||||
ordering = complete_prefix_number_ordering(entries)
|
||||
if ordering:
|
||||
print(f'Symlinking image files for {work_id}')
|
||||
link_ordered_files(ordering, collation_dir)
|
||||
continue
|
||||
|
||||
print(f'Unable to deduce file structure for {work_id}, skipping')
|
||||
|
||||
con.close()
|
||||
|
||||
def link_pdf(src, dest):
|
||||
def link_pdf(src, dest, start_index=0):
|
||||
with fitz.open(src) as pdf:
|
||||
images_by_page = [page.get_images() for page in pdf]
|
||||
if all(len(images) == 1 for images in images_by_page):
|
||||
dest.mkdir(parents=True)
|
||||
for (idx, images) in enumerate(images_by_page):
|
||||
dest.mkdir(parents=True, exist_ok=True)
|
||||
for (idx, images) in enumerate(images_by_page, start=start_index):
|
||||
xref = images[0][0]
|
||||
image = pdf.extract_image(xref)
|
||||
file_path = dest / f'{idx:04d}.{image["ext"]}'
|
||||
|
@ -184,14 +141,92 @@ def complete_prefix_number_ordering(entries):
|
|||
return [e for (e, i) in entries_with_indices]
|
||||
return None
|
||||
|
||||
def link_ordered_files(ordering, dest, start_index=0):
    """Symlink the files in `ordering` into `dest` with zero-padded sequential names.

    Args:
        ordering: iterable of source file Paths, already in display order.
        dest: destination directory Path; created (with parents) if missing.
        start_index: first number to use for the generated names, so multiple
            sources can be collated into one directory without collisions.

    Each link is named `NNNN<ext>` (4-digit, zero-padded) and points to the
    source via a path relative to `dest`, keeping the tree relocatable.
    """
    dest.mkdir(parents=True, exist_ok=True)

    for (idx, src_path) in enumerate(ordering, start=start_index):
        # Normalize the extension so e.g. '.JPG' and '.jpg' collate identically.
        ext = src_path.suffix.lower()
        link_path = dest / f'{idx:04d}{ext}'
        link_path.symlink_to(relpath(src_path, dest))
|
||||
|
||||
def collate(args):
    """Build one flat, ordered image directory per extracted work.

    For each work under `<destdir>/extract`, produce
    `<destdir>/site/works/<work_id>` containing numbered symlinks (or images
    extracted from a single PDF). Works already collated, or marked virtual
    in meta.db, are skipped. `args.hints` may name a subdirectory or PDF
    inside a work's extraction folder to use instead of auto-detection
    (at most one hint per work).
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    extraction_dir = args.destdir / 'extract'
    # Map each hint to its work: the first component of the hint's path
    # relative to the extraction dir is the work id. (`parts[0]` also works
    # when the hint IS the work directory, where `parents[-2]` would raise.)
    hint_map = {hint.relative_to(extraction_dir).parts[0]: hint for hint in args.hints}

    for work_path in extraction_dir.iterdir():
        work_id = work_path.name

        collation_dir = args.destdir / 'site' / 'works' / work_id
        if collation_dir.exists():
            # Already collated on a previous run; never redo work.
            continue

        # Schema (from this query): works(id, virtual); virtual==1 means the
        # work has no files of its own.
        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue

        if work_id in hint_map:
            # Explicit hint: a directory's contents, or a single file.
            hint = hint_map[work_id]
            entries = list(hint.iterdir()) if hint.is_dir() else [hint]
        else:
            # Auto-detect: descend through single-directory wrappers until we
            # reach a level with real content.
            search_dir = work_path
            while True:
                entries = list(search_dir.iterdir())
                if len(entries) == 1 and entries[0].is_dir():
                    search_dir = entries[0]
                else:
                    break

        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
            print(f'Extracting images from {entries[0]} for {work_id}')
            link_pdf(entries[0], collation_dir)
            continue

        if len(entries) == 0:
            print(f'{work_id} contains no files? skipping')
            continue

        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
            ordering = complete_prefix_number_ordering(entries)
            if ordering:
                print(f'Symlinking image files for {work_id}')
                link_ordered_files(ordering, collation_dir)
                continue

        # Mixed content or un-orderable images: leave for manual collation.
        print(f'Unable to deduce file structure for {work_id}, skipping')

    con.close()
|
||||
|
||||
def manual_collate(args):
    """Collate one work from an explicit, ordered list of paths.

    `args.paths` entries are processed in order into
    `<destdir>/site/works/<work_id>`:
      - directory: symlink its image files (no recursion), using
        `complete_prefix_number_ordering` when possible, else a plain sort;
      - image file: symlink it;
      - PDF: extract its images.
    A running index keeps the output numbering contiguous across paths.
    Aborts if the collation directory already has content, or on an
    unrecognized file type.
    """
    collation_dir = args.destdir / 'site' / 'works' / args.work_id
    if collation_dir.exists() and any(collation_dir.iterdir()):
        print('Collation directory already exists!')
        return
    collation_dir.mkdir(parents=True, exist_ok=True)

    index = 0
    for path in args.paths:
        if path.is_dir():
            entries = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS]
            ordering = complete_prefix_number_ordering(entries)
            if ordering is None:
                # No deducible numbering; fall back to lexicographic order.
                ordering = entries
                ordering.sort()
            link_ordered_files(ordering, collation_dir, start_index=index)
            index += len(ordering)
        elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
            link_ordered_files([path], collation_dir, start_index=index)
            index += 1
        elif path.suffix.lower() == ".pdf":
            link_pdf(path, collation_dir, start_index=index)
            # NOTE(review): advances by page count, which assumes link_pdf
            # emits one image per page — confirm against link_pdf's contract.
            with fitz.open(path) as pdf:
                index += pdf.page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return
|
||||
|
||||
def metadata(args):
|
||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||
cur = con.cursor()
|
||||
|
@ -251,8 +286,26 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
|
|||
parser_fetch.set_defaults(func=fetch)
|
||||
|
||||
# CLI registration for the two collation entry points. `subparsers` is the
# argparse subparser factory created earlier in the file.
parser_collate = subparsers.add_parser('collate', help='collate a single sequence of image files for each work')
parser_collate.add_argument(
    'hints',
    metavar='PATH',
    type=Path,
    nargs='*',
    help='manually-specified paths of subdirectories or PDFs within extraction folders, at most one per work',
)
parser_collate.set_defaults(func=collate)

parser_manual_collate = subparsers.add_parser('manual-collate', help='collate a specific work manually, specifying all paths to include')
parser_manual_collate.add_argument('work_id')
parser_manual_collate.add_argument(
    'paths',
    metavar='PATH',
    type=Path,
    nargs='+',
    help='paths of files (images to symlink, pdfs to extract) or directories (symlink all images in the directory, no recursion, best-effort sorting)',
)
parser_manual_collate.set_defaults(func=manual_collate)
|
||||
|
||||
parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
|
||||
parser_metadata.add_argument('work_id')
|
||||
parser_metadata.add_argument(
|
||||
|
|
Loading…
Reference in a new issue