support DLSite's multipart RAR archives for larger works

This commit is contained in:
xenofem 2024-03-15 16:00:37 -04:00
parent 894b1d34b6
commit 9ab587d399
3 changed files with 63 additions and 25 deletions

View file

@@ -21,6 +21,7 @@ import fitz
from libsixel import * from libsixel import *
from PIL import Image from PIL import Image
from jinja2 import Environment, PackageLoader, select_autoescape from jinja2 import Environment, PackageLoader, select_autoescape
import rarfile
import requests import requests
NUMBER_REGEX = re.compile('[0-9-]+') NUMBER_REGEX = re.compile('[0-9-]+')
@@ -84,32 +85,67 @@ PDF_PREVIEW_DPI = 72
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I) IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
def open_zipfile_with_encoding(path): def open_zipfile_with_encoding(path):
try: for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
return zipfile.ZipFile(path, metadata_encoding="utf-8") try:
except UnicodeDecodeError: return zipfile.ZipFile(path, metadata_encoding=enc)
pass except UnicodeDecodeError:
pass
try: print(f'{path} contains filenames with unknown character encoding!')
return zipfile.ZipFile(path, metadata_encoding="shift-jis") exit(1)
except UnicodeDecodeError:
pass
return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213") def open_rarfile_with_encoding(path):
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
rf = rarfile.RarFile(path, charset=enc)
if all('<EFBFBD>' not in info.filename for info in rf.infolist()):
return rf
print(f'{path} contains filenames with unknown character encoding!')
exit(1)
def extract(args): def extract(args):
for zip_path in args.zipfiles: absolute_archive_paths = set(path.resolve(strict=True) for path in args.archives)
work_id = zip_path.stem
work_extract_path = args.destdir / 'extract' / work_id
work_extract_path.mkdir(parents=True)
print(f'Extracting {zip_path} to {work_extract_path}') for archive_path in args.archives:
if archive_path.suffix.lower() == '.zip':
work_id = archive_path.stem
work_extract_path = args.destdir / 'extract' / work_id
with open_zipfile_with_encoding(zip_path) as z: print(f'Extracting {archive_path} to {work_extract_path}')
z.extractall(path=work_extract_path)
if args.remove: with open_zipfile_with_encoding(archive_path) as z:
zip_path.unlink() work_extract_path.mkdir(parents=True)
z.extractall(path=work_extract_path)
if args.remove:
archive_path.unlink()
elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name):
work_id = rar_match.group(1)
work_extract_path = args.destdir / 'extract' / work_id
print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}')
with open_rarfile_with_encoding(archive_path) as r:
volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()]
if any(vol not in absolute_archive_paths for vol in volumes):
print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping')
continue
work_extract_path.mkdir(parents=True)
r.extractall(path=work_extract_path)
if args.remove:
for vol in volumes:
vol.unlink()
elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name):
pass
else:
print(f'Unknown archive file type {archive_path}, skipping')
def manual_input_metadata(work_id): def manual_input_metadata(work_id):
@@ -1073,9 +1109,8 @@ argparser = argparse.ArgumentParser(
that can be viewed in a web browser. that can be viewed in a web browser.
Intended workflow: Intended workflow:
- `extract` a collection of zipfiles downloaded from DLSite - `extract` a collection of archive files into DLibrary's data
into DLibrary's data directory, giving each work its own directory, automatically giving each work its own subfolder.
subfolder.
- `fetch` metadata and thumbnail images for extracted works - `fetch` metadata and thumbnail images for extracted works
from DLSite. from DLSite.
- `collate` extracted works, producing a single sequence of - `collate` extracted works, producing a single sequence of
@@ -1103,18 +1138,18 @@ argparser.add_argument(
) )
subparsers = argparser.add_subparsers(title="subcommands", required=True) subparsers = argparser.add_subparsers(title="subcommands", required=True)
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles') parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files')
parser_extract.add_argument( parser_extract.add_argument(
'-r', '--remove', '-r', '--remove',
action='store_true', action='store_true',
help='remove original zipfiles after extraction', help='remove original archive files after extraction',
) )
parser_extract.add_argument( parser_extract.add_argument(
'zipfiles', 'archives',
metavar='FILE', metavar='FILE',
type=Path, type=Path,
nargs='+', nargs='+',
help='zipfiles to extract', help='archive files to extract',
) )
parser_extract.set_defaults(func=extract) parser_extract.set_defaults(func=extract)

View file

@@ -6,6 +6,7 @@
outputs = { self, nixpkgs }: let outputs = { self, nixpkgs }: let
pkgs = import nixpkgs { pkgs = import nixpkgs {
system = "x86_64-linux"; system = "x86_64-linux";
config.allowUnfree = true;
}; };
in { in {
packages.x86_64-linux = with pkgs.python3Packages; rec { packages.x86_64-linux = with pkgs.python3Packages; rec {
@@ -58,6 +59,7 @@
importlib-resources importlib-resources
setuptools setuptools
libsixel libsixel
(rarfile.override { useUnrar = true; })
]; ];
src = ./.; src = ./.;
}; };

View file

@@ -12,6 +12,7 @@ dependencies = [
"jinja2", "jinja2",
"importlib_resources", "importlib_resources",
"libsixel-python", "libsixel-python",
"rarfile",
] ]
[project.scripts] [project.scripts]