Compare commits
3 commits
d66b467c5c
...
657ec65e4a
Author | SHA1 | Date | |
---|---|---|---|
xenofem | 657ec65e4a | ||
xenofem | 330b10c85b | ||
xenofem | aefaf824a8 |
|
@ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
|
||||||
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
|
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
|
||||||
|
|
||||||
TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
|
TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
|
||||||
ALT_VERSIONS = ['褐色', '日焼け']
|
ALT_VERSIONS = ['褐色', '日焼け', 'pink']
|
||||||
|
|
||||||
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
|
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
|
||||||
|
|
||||||
|
@ -179,12 +179,12 @@ def image_xrefs(pdf):
|
||||||
print('\nSuccess')
|
print('\nSuccess')
|
||||||
return xrefs
|
return xrefs
|
||||||
|
|
||||||
def link_pdf(src, dest, start_index=0):
|
def link_pdf(src, dest, start_index):
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
xrefs = image_xrefs(pdf)
|
xrefs = image_xrefs(pdf)
|
||||||
if xrefs is None:
|
if xrefs is None:
|
||||||
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
||||||
return
|
return None
|
||||||
|
|
||||||
dest.mkdir(parents=True, exist_ok=True)
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
for (idx, xref) in enumerate(xrefs, start=start_index):
|
for (idx, xref) in enumerate(xrefs, start=start_index):
|
||||||
|
@ -193,6 +193,8 @@ def link_pdf(src, dest, start_index=0):
|
||||||
with open(file_path, 'wb') as f:
|
with open(file_path, 'wb') as f:
|
||||||
f.write(image["image"])
|
f.write(image["image"])
|
||||||
|
|
||||||
|
return pdf.page_count
|
||||||
|
|
||||||
def complete_prefix_number_ordering(entries):
|
def complete_prefix_number_ordering(entries):
|
||||||
if len(entries) == 1:
|
if len(entries) == 1:
|
||||||
return entries
|
return entries
|
||||||
|
@ -202,7 +204,7 @@ def complete_prefix_number_ordering(entries):
|
||||||
version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name)
|
version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name)
|
||||||
entries_by_version.setdefault(version, []).append(entry)
|
entries_by_version.setdefault(version, []).append(entry)
|
||||||
|
|
||||||
numberings_by_version = {ver: prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}
|
numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}
|
||||||
|
|
||||||
unified_indices = set()
|
unified_indices = set()
|
||||||
for numbering in numberings_by_version.values():
|
for numbering in numberings_by_version.values():
|
||||||
|
@ -212,8 +214,15 @@ def complete_prefix_number_ordering(entries):
|
||||||
unified_indices = list(unified_indices)
|
unified_indices = list(unified_indices)
|
||||||
unified_indices.sort()
|
unified_indices.sort()
|
||||||
|
|
||||||
if len(unified_indices) > 1 and min(unified_indices[i] - unified_indices[i-1] for i in range(1, len(unified_indices))) > 2:
|
if len(unified_indices) > 1:
|
||||||
|
for i in range(1, len(unified_indices)):
|
||||||
|
cur = unified_indices[i]
|
||||||
|
prev = unified_indices[i-1]
|
||||||
|
for level in range(min(len(cur), len(prev))):
|
||||||
|
if cur[level] != prev[level]:
|
||||||
|
if cur[level] - prev[level] > 2:
|
||||||
return None
|
return None
|
||||||
|
break
|
||||||
|
|
||||||
versions = list(numberings_by_version.keys())
|
versions = list(numberings_by_version.keys())
|
||||||
versions.sort()
|
versions.sort()
|
||||||
|
@ -231,31 +240,57 @@ def complete_prefix_number_ordering(entries):
|
||||||
for out_ver in outer_versions:
|
for out_ver in outer_versions:
|
||||||
for i in unified_indices:
|
for i in unified_indices:
|
||||||
for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
|
for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
|
||||||
entries_i_ver = numberings_by_version[ver].get(i, [])
|
result += numberings_by_version[ver].get(i, [])
|
||||||
if len(entries_i_ver) <= 1:
|
|
||||||
result += entries_i_ver
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def prefix_numbering(entries):
|
def unique_hierarchical_prefix_numbering(entries, start_point=0):
|
||||||
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
|
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
|
||||||
for m in matches:
|
for m in matches:
|
||||||
pos = m.start()
|
pos = m.start()
|
||||||
|
if pos < start_point:
|
||||||
|
return None
|
||||||
prefix = entries[0].name[:pos]
|
prefix = entries[0].name[:pos]
|
||||||
if all(e.name.startswith(prefix) for e in entries):
|
if all(e.name.startswith(prefix) for e in entries):
|
||||||
entries_by_index = {}
|
numbering = {}
|
||||||
for e in entries:
|
for e in entries:
|
||||||
n = NUMBER_REGEX.match(e.name[pos:])
|
n = NUMBER_REGEX.match(e.name[pos:])
|
||||||
if n is None:
|
if n is None:
|
||||||
return None
|
return None
|
||||||
i = int(n.group())
|
i = int(n.group())
|
||||||
entries_by_index.setdefault(i, []).append(e)
|
numbering.setdefault((i,), []).append(e)
|
||||||
return entries_by_index
|
|
||||||
|
indices = list(numbering.keys())
|
||||||
|
for idx in indices:
|
||||||
|
if len(numbering[idx]) > 1:
|
||||||
|
ents_idx = numbering.pop(idx)
|
||||||
|
next_layer_start = pos + NUMBER_REGEX.match(ents_idx[0].name[pos:]).end()
|
||||||
|
sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
|
||||||
|
if not sub_numbering:
|
||||||
|
return None
|
||||||
|
for sub_idx in sub_numbering:
|
||||||
|
numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]
|
||||||
|
|
||||||
|
return numbering
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def link_ordered_files(ordering, dest, start_index=0):
|
def alphabetic_numbering(entries, start_point):
    """Number entries by an optional single trailing letter in their stem.

    For each entry, the part of ``entry.stem`` from ``start_point`` onwards
    is treated as a suffix.  An empty suffix gets index 0; a one-character
    suffix gets its lowercased offset from ``'a'`` (``a`` -> 0, ``b`` -> 1,
    ...).  Note that an empty suffix and ``a`` therefore share index 0.

    Args:
        entries: objects exposing a ``stem`` string attribute (path-like).
        start_point: offset into each stem at which the suffix begins.

    Returns:
        A dict mapping 1-tuples ``(index,)`` to single-element lists
        ``[entry]``, in the order the entries were given, or ``None`` when:
        a suffix is longer than one character, lowercasing a suffix does not
        yield exactly one character, two entries resolve to the same index,
        or the indices are not exactly ``0 .. len(entries) - 1``.
    """
    alphabetized = {}
    for entry in entries:
        ending = entry.stem[start_point:]
        if len(ending) > 1:
            return None

        if ending == '':
            index = 0
        else:
            lowered = ending.lower()
            # Some characters lowercase to more than one code point, which
            # ord() rejects; treat those as "not alphabetic" instead of
            # crashing.
            if len(lowered) != 1:
                return None
            index = ord(lowered) - ord('a')

        # Keys are 1-tuples, so the collision test must use the tuple form;
        # testing the bare int never matches and would let a later entry
        # silently overwrite an earlier one.
        key = (index,)
        if key in alphabetized:
            return None
        alphabetized[key] = [entry]

    # Only accept a gap-free run starting at 0 (this also rejects negative
    # indices produced by characters that sort before 'a').
    if sorted(alphabetized) != [(i,) for i in range(len(alphabetized))]:
        return None
    return alphabetized
|
||||||
|
|
||||||
|
def link_ordered_files(ordering, dest, start_index):
|
||||||
dest.mkdir(parents=True, exist_ok=True)
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
for (idx, src_path) in enumerate(ordering, start=start_index):
|
for (idx, src_path) in enumerate(ordering, start=start_index):
|
||||||
|
@ -276,6 +311,9 @@ def collate(args):
|
||||||
extraction_dir = args.destdir / 'extract'
|
extraction_dir = args.destdir / 'extract'
|
||||||
hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
|
hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
|
||||||
|
|
||||||
|
collation_staging_area = args.destdir / 'site' / 'images-staging'
|
||||||
|
collation_staging_area.mkdir(parents=True)
|
||||||
|
|
||||||
for work_path in extraction_dir.iterdir():
|
for work_path in extraction_dir.iterdir():
|
||||||
work_id = work_path.name
|
work_id = work_path.name
|
||||||
|
|
||||||
|
@ -287,51 +325,64 @@ def collate(args):
|
||||||
if virtual == (1,):
|
if virtual == (1,):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if work_id in hint_map:
|
work_staging_dir = collation_staging_area / work_id
|
||||||
hint = hint_map[work_id]
|
|
||||||
entries = [hint] if hint.is_file() else ls_ignore(hint)
|
|
||||||
else:
|
|
||||||
search_dir = work_path
|
|
||||||
while True:
|
|
||||||
entries = ls_ignore(search_dir)
|
|
||||||
if len(entries) == 1 and entries[0].is_dir():
|
|
||||||
search_dir = entries[0]
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
|
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0)
|
||||||
print(f'Extracting images from {entries[0]} for {work_id}')
|
if pages_collated:
|
||||||
link_pdf(entries[0], collation_dir)
|
print(f'Collated {pages_collated} pages for {work_id}')
|
||||||
continue
|
work_staging_dir.rename(collation_dir)
|
||||||
|
else:
|
||||||
|
if work_staging_dir.is_dir():
|
||||||
|
for f in work_staging_dir.iterdir():
|
||||||
|
f.unlink()
|
||||||
|
work_staging_dir.rmdir()
|
||||||
|
|
||||||
if len(entries) == 0:
|
if pages_collated == 0:
|
||||||
print(f'{work_id} contains no files? skipping')
|
print(f'{work_id} contains no files? skipping')
|
||||||
continue
|
elif pages_collated is None:
|
||||||
|
|
||||||
if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
|
|
||||||
ordering = complete_prefix_number_ordering(entries)
|
|
||||||
if not ordering:
|
|
||||||
with_text = []
|
|
||||||
textless = []
|
|
||||||
for entry in entries:
|
|
||||||
if TEXTLESS_REGEX.search(entry.name):
|
|
||||||
textless.append(entry)
|
|
||||||
else:
|
|
||||||
with_text.append(entry)
|
|
||||||
if with_text and textless:
|
|
||||||
with_text_ordering = complete_prefix_number_ordering(with_text)
|
|
||||||
textless_ordering = complete_prefix_number_ordering(textless)
|
|
||||||
if with_text_ordering and textless_ordering:
|
|
||||||
ordering = with_text_ordering + textless_ordering
|
|
||||||
if ordering:
|
|
||||||
print(f'Symlinking image files for {work_id}')
|
|
||||||
link_ordered_files(ordering, collation_dir)
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f'Unable to deduce file structure for {work_id}, skipping')
|
print(f'Unable to deduce file structure for {work_id}, skipping')
|
||||||
|
|
||||||
|
collation_staging_area.rmdir()
|
||||||
con.close()
|
con.close()
|
||||||
|
|
||||||
|
def collate_from_paths(srcs, dest, start_index):
    """Collate the given source paths into ``dest``, numbering from ``start_index``.

    Strategy, in order:
      * an empty list collates nothing and yields 0;
      * a lone directory is replaced by its ``ls_ignore`` listing;
      * a lone ``.pdf`` is handed to ``link_pdf``;
      * if the names split into both textless (``TEXTLESS_REGEX`` matches)
        and with-text groups, the with-text group is collated first and the
        textless group continues numbering after it;
      * otherwise, if every source is a file with an extension in
        ``IMAGE_FILE_EXTENSIONS``, the files are ordered with
        ``complete_prefix_number_ordering`` and linked via
        ``link_ordered_files``.

    Returns the count produced by whichever strategy applied, or ``None``
    when no strategy could handle the sources.
    """
    count = len(srcs)
    if count == 0:
        return 0

    if count == 1:
        only = srcs[0]
        if only.is_dir():
            # Descend into a lone directory and retry with its contents.
            return collate_from_paths(ls_ignore(only), dest, start_index)
        if only.suffix.lower() == '.pdf':
            print(f'Extracting images from {only}')
            return link_pdf(only, dest, start_index)

    # Partition by whether the file name marks a textless variant.
    with_text, textless = [], []
    for candidate in srcs:
        bucket = textless if TEXTLESS_REGEX.search(candidate.name) else with_text
        bucket.append(candidate)

    if with_text and textless:
        # With-text pages come first; textless pages are numbered after them.
        collated = 0
        for group in (with_text, textless):
            pages = collate_from_paths(group, dest, start_index + collated)
            if pages is None:
                return None
            collated += pages
        return collated

    images_only = all(
        candidate.is_file() and candidate.suffix.lower() in IMAGE_FILE_EXTENSIONS
        for candidate in srcs
    )
    if not images_only:
        return None

    ordering = complete_prefix_number_ordering(srcs)
    if not ordering:
        return None

    print(f'Symlinking image files: {ordering[0]}...')
    link_ordered_files(ordering, dest, start_index)
    return len(ordering)
|
||||||
|
|
||||||
def self_and_parents(path):
    """Return a list holding ``path`` itself followed by each of ``path.parents``."""
    return [path, *path.parents]
|
||||||
|
|
||||||
|
@ -358,15 +409,16 @@ def manual_collate(args):
|
||||||
if ordering is None:
|
if ordering is None:
|
||||||
ordering = entries
|
ordering = entries
|
||||||
ordering.sort()
|
ordering.sort()
|
||||||
link_ordered_files(ordering, collation_dir, start_index=index)
|
link_ordered_files(ordering, collation_dir, index)
|
||||||
index += len(ordering)
|
index += len(ordering)
|
||||||
elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
||||||
link_ordered_files([path], collation_dir, start_index=index)
|
link_ordered_files([path], collation_dir, index)
|
||||||
index += 1
|
index += 1
|
||||||
elif path.suffix.lower() == ".pdf":
|
elif path.suffix.lower() == ".pdf":
|
||||||
link_pdf(path, collation_dir, start_index=index)
|
pdf_page_count = link_pdf(path, collation_dir, index)
|
||||||
with fitz.open(path) as pdf:
|
if pdf_page_count is None:
|
||||||
index += pdf.page_count
|
return
|
||||||
|
index += pdf_page_count
|
||||||
else:
|
else:
|
||||||
print(f'Unknown file type {path}, stopping')
|
print(f'Unknown file type {path}, stopping')
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in a new issue