support DLSite's multipart RAR archives for larger works

This commit is contained in:
xenofem 2024-03-15 16:00:37 -04:00
parent 894b1d34b6
commit 9ab587d399
3 changed files with 63 additions and 25 deletions

View file

@ -21,6 +21,7 @@ import fitz
from libsixel import *
from PIL import Image
from jinja2 import Environment, PackageLoader, select_autoescape
import rarfile
import requests
# Matches a run of one or more ASCII digits and/or hyphens.
NUMBER_REGEX = re.compile('[0-9-]+')
@ -84,32 +85,67 @@ PDF_PREVIEW_DPI = 72
# Case-insensitive match for the standalone word "TCPDF".
# NOTE(review): judging by the name, used to discard irrelevant PDF text
# blocks; the call site is not visible here.
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
# First volume of a multipart RAR set, shipped as a self-extracting
# executable: "<stem>.part1.exe", with any number of leading zeros before
# the 1 ("part01", "part001", ...). Group 1 captures <stem>.
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
# Later volumes of a multipart RAR set: "<stem>.part<N>.rar" where, after
# optional leading zeros, <N> is not just "1" (e.g. "part2", "part010").
# The part suffix is not restricted to digits by this pattern.
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
def open_zipfile_with_encoding(path):
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
try:
return zipfile.ZipFile(path, metadata_encoding="utf-8")
return zipfile.ZipFile(path, metadata_encoding=enc)
except UnicodeDecodeError:
pass
try:
return zipfile.ZipFile(path, metadata_encoding="shift-jis")
except UnicodeDecodeError:
pass
print(f'{path} contains filenames with unknown character encoding!')
exit(1)
return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213")
def open_rarfile_with_encoding(path):
    """Open the RAR archive at ``path``, guessing its filename encoding.

    UTF-8, Shift-JIS and Shift-JIS X 0213 are tried in that order as the
    ``charset``. A guess is accepted when no member filename contains the
    Unicode replacement character U+FFFD; the accepted ``rarfile.RarFile``
    is returned still open.

    If every guess is rejected, a message is printed and the program exits
    with status 1.
    """
    for enc in ("utf-8", "shift-jis", "shift-jisx0213"):
        rf = rarfile.RarFile(path, charset=enc)
        # Presumably rarfile substitutes U+FFFD for bytes it cannot decode
        # with the given charset; this check relies on that. The character
        # is written as an escape so it survives re-encoding of this file.
        if all('\ufffd' not in info.filename for info in rf.infolist()):
            return rf
        # Wrong guess: release this handle before trying the next charset.
        rf.close()

    print(f'{path} contains filenames with unknown character encoding!')
    # Same effect as exit(1), without relying on the helper that the `site`
    # module injects into builtins.
    raise SystemExit(1)
def extract(args):
    """Extract each archive in ``args.archives`` into its own work folder.

    Reads from ``args``:
      * ``archives`` -- paths of the archive files given on the command line
      * ``destdir``  -- data directory; works go to ``destdir/extract/<work_id>``
      * ``remove``   -- when true, delete the source archive(s) after extraction

    Handling per file:
      * ``*.zip``: extracted directly; the work id is the file stem.
      * First volume of a multipart RAR set (``<id>.part1.exe``): the whole
        set is extracted, but only if every volume was also listed on the
        command line; otherwise the set is skipped with a message.
      * Later volumes of a multipart RAR set: ignored here, since they are
        consumed when the first volume is processed.
      * Anything else: reported and skipped.

    The target directory is created only after the archive has been opened,
    and ``mkdir`` raises if the directory already exists.
    """
    # strict=True makes a nonexistent path fail here, before anything is extracted.
    absolute_archive_paths = {path.resolve(strict=True) for path in args.archives}

    for archive_path in args.archives:
        if archive_path.suffix.lower() == '.zip':
            work_id = archive_path.stem
            work_extract_path = args.destdir / 'extract' / work_id

            print(f'Extracting {archive_path} to {work_extract_path}')

            with open_zipfile_with_encoding(archive_path) as z:
                work_extract_path.mkdir(parents=True)
                z.extractall(path=work_extract_path)

            if args.remove:
                archive_path.unlink()

        elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name):
            work_id = rar_match.group(1)
            work_extract_path = args.destdir / 'extract' / work_id

            print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}')

            with open_rarfile_with_encoding(archive_path) as r:
                volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()]
                # Never touch (or later delete) volume files the user did not
                # explicitly name on the command line.
                if any(vol not in absolute_archive_paths for vol in volumes):
                    print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping')
                    continue
                work_extract_path.mkdir(parents=True)
                r.extractall(path=work_extract_path)

            if args.remove:
                for vol in volumes:
                    vol.unlink()

        elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name):
            # Non-first volume: handled together with its head volume above.
            pass
        else:
            print(f'Unknown archive file type {archive_path}, skipping')
def manual_input_metadata(work_id):
@ -1073,9 +1109,8 @@ argparser = argparse.ArgumentParser(
that can be viewed in a web browser.
Intended workflow:
- `extract` a collection of zipfiles downloaded from DLSite
into DLibrary's data directory, giving each work its own
subfolder.
- `extract` a collection of archive files into DLibrary's data
directory, automatically giving each work its own subfolder.
- `fetch` metadata and thumbnail images for extracted works
from DLSite.
- `collate` extracted works, producing a single sequence of
@ -1103,18 +1138,18 @@ argparser.add_argument(
)
subparsers = argparser.add_subparsers(title="subcommands", required=True)

# `extract` (alias `x`): unpack one or more archive files; dispatches to extract().
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files')
parser_extract.add_argument(
    '-r', '--remove',
    action='store_true',
    help='remove original archive files after extraction',
)
parser_extract.add_argument(
    'archives',
    metavar='FILE',
    type=Path,
    nargs='+',
    help='archive files to extract',
)
parser_extract.set_defaults(func=extract)

View file

@ -6,6 +6,7 @@
outputs = { self, nixpkgs }: let
pkgs = import nixpkgs {
system = "x86_64-linux";
config.allowUnfree = true;
};
in {
packages.x86_64-linux = with pkgs.python3Packages; rec {
@ -58,6 +59,7 @@
importlib-resources
setuptools
libsixel
(rarfile.override { useUnrar = true; })
];
src = ./.;
};

View file

@ -12,6 +12,7 @@ dependencies = [
"jinja2",
"importlib_resources",
"libsixel-python",
"rarfile",
]
[project.scripts]