From 9ab587d39941ef3b9a837076751999d50f131091 Mon Sep 17 00:00:00 2001 From: xenofem Date: Fri, 15 Mar 2024 16:00:37 -0400 Subject: [PATCH] support DLSite's multipart RAR archives for larger works --- dlibrary/dlibrary.py | 85 +++++++++++++++++++++++++++++++------------- flake.nix | 2 ++ pyproject.toml | 1 + 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 1dc8b39..3501161 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -21,6 +21,7 @@ import fitz from libsixel import * from PIL import Image from jinja2 import Environment, PackageLoader, select_autoescape +import rarfile import requests NUMBER_REGEX = re.compile('[0-90-9]+') @@ -84,32 +85,67 @@ PDF_PREVIEW_DPI = 72 IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I) +MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I) +MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I) + def open_zipfile_with_encoding(path): - try: - return zipfile.ZipFile(path, metadata_encoding="utf-8") - except UnicodeDecodeError: - pass + for enc in ["utf-8", "shift-jis", "shift-jisx0213"]: + try: + return zipfile.ZipFile(path, metadata_encoding=enc) + except UnicodeDecodeError: + pass - try: - return zipfile.ZipFile(path, metadata_encoding="shift-jis") - except UnicodeDecodeError: - pass + print(f'{path} contains filenames with unknown character encoding!') + exit(1) - return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213") +def open_rarfile_with_encoding(path): + for enc in ["utf-8", "shift-jis", "shift-jisx0213"]: + rf = rarfile.RarFile(path, charset=enc) + if all('�' not in info.filename for info in rf.infolist()): + return rf + + print(f'{path} contains filenames with unknown character encoding!') + exit(1) def extract(args): - for zip_path in args.zipfiles: - work_id = zip_path.stem - work_extract_path = args.destdir / 'extract' / work_id - work_extract_path.mkdir(parents=True) + absolute_archive_paths = set(path.resolve(strict=True) for path in args.archives) - print(f'Extracting {zip_path} to {work_extract_path}') + for archive_path in args.archives: + if archive_path.suffix.lower() == '.zip': + work_id = archive_path.stem + work_extract_path = args.destdir / 'extract' / work_id - with open_zipfile_with_encoding(zip_path) as z: - z.extractall(path=work_extract_path) + print(f'Extracting {archive_path} to {work_extract_path}') - if args.remove: - zip_path.unlink() + with open_zipfile_with_encoding(archive_path) as z: + work_extract_path.mkdir(parents=True) + z.extractall(path=work_extract_path) + + if args.remove: + archive_path.unlink() + + elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name): + work_id = rar_match.group(1) + work_extract_path = args.destdir / 'extract' / work_id + + print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}') + + with open_rarfile_with_encoding(archive_path) as r: + volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()] + if any(vol not in absolute_archive_paths for vol in volumes): + print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping') + continue + work_extract_path.mkdir(parents=True) + r.extractall(path=work_extract_path) + + if args.remove: + for vol in volumes: + vol.unlink() + + elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name): + pass + else: + print(f'Unknown archive file type {archive_path}, skipping') def manual_input_metadata(work_id): @@ -1073,9 +1109,8 @@ argparser = argparse.ArgumentParser( that can be viewed in a web browser. Intended workflow: - - `extract` a collection of zipfiles downloaded from DLSite - into DLibrary's data directory, giving each work its own - subfolder. + - `extract` a collection of archive files into DLibrary's data + directory, automatically giving each work its own subfolder. - `fetch` metadata and thumbnail images for extracted works from DLSite. - `collate` extracted works, producing a single sequence of @@ -1103,18 +1138,18 @@ argparser.add_argument( ) subparsers = argparser.add_subparsers(title="subcommands", required=True) -parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles') +parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files') parser_extract.add_argument( '-r', '--remove', action='store_true', - help='remove original zipfiles after extraction', + help='remove original archive files after extraction', ) parser_extract.add_argument( - 'zipfiles', + 'archives', metavar='FILE', type=Path, nargs='+', - help='zipfiles to extract', + help='archive files to extract', ) parser_extract.set_defaults(func=extract) diff --git a/flake.nix b/flake.nix index 8ef597c..6391e97 100644 --- a/flake.nix +++ b/flake.nix @@ -6,6 +6,7 @@ outputs = { self, nixpkgs }: let pkgs = import nixpkgs { system = "x86_64-linux"; + config.allowUnfree = true; }; in { packages.x86_64-linux = with pkgs.python3Packages; rec { @@ -58,6 +59,7 @@ importlib-resources setuptools libsixel + (rarfile.override { useUnrar = true; }) ]; src = ./.; }; diff --git a/pyproject.toml b/pyproject.toml index d05e950..7897dec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "jinja2", "importlib_resources", "libsixel-python", + "rarfile", ] [project.scripts]