support DLSite's multipart RAR archives for larger works
This commit is contained in:
parent
894b1d34b6
commit
9ab587d399
|
@ -21,6 +21,7 @@ import fitz
|
|||
from libsixel import *
|
||||
from PIL import Image
|
||||
from jinja2 import Environment, PackageLoader, select_autoescape
|
||||
import rarfile
|
||||
import requests
|
||||
|
||||
# Matches runs of ASCII or full-width (zenkaku) digits — DLSite work titles
# frequently number volumes/parts with full-width characters.  (The scraped
# source shows '[0-90-9]+', a redundant class that is the mojibake of the
# full-width range ０-９ — TODO confirm against the original repository.)
NUMBER_REGEX = re.compile('[0-9０-９]+')
|
||||
|
@ -84,32 +85,67 @@ PDF_PREVIEW_DPI = 72
|
|||
|
||||
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
||||
|
||||
# Head volume of a DLSite multipart RAR set: a self-extracting
# `<work>.part1.exe` (any number of leading zeros in the part number).
# Group 1 captures the work id.
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
# Any non-first volume, `<work>.partN.rar` with N != 1 (the alternation
# rejects bare "1" and zero-padded "0...1"), so tail volumes can be
# recognized and skipped instead of reported as unknown archives.
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
|
||||
|
||||
def open_zipfile_with_encoding(path):
|
||||
try:
|
||||
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
|
||||
try:
|
||||
return zipfile.ZipFile(path, metadata_encoding=enc)
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
return zipfile.ZipFile(path, metadata_encoding="shift-jis")
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
print(f'{path} contains filenames with unknown character encoding!')
|
||||
exit(1)
|
||||
|
||||
return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213")
|
||||
def open_rarfile_with_encoding(path):
    """Open *path* as a RarFile, picking the first charset that decodes every name.

    rarfile substitutes U+FFFD (the Unicode replacement character) for bytes
    the chosen charset cannot decode, so a listing free of replacement
    characters means the charset fit.  (The scraped source shows the literal
    '<EFBFBD>', which is the UTF-8 byte sequence of U+FFFD rendered as text —
    restored here as an explicit escape.)

    Returns an open rarfile.RarFile.
    Exits the process (SystemExit) if no candidate charset works.
    """
    for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
        rf = rarfile.RarFile(path, charset=enc)
        if all('\ufffd' not in info.filename for info in rf.infolist()):
            return rf

    print(f'{path} contains filenames with unknown character encoding!')
    raise SystemExit(1)
|
||||
|
||||
def extract(args):
    """Extract each archive in args.archives into args.destdir / 'extract' / <work_id>.

    Zipfiles are extracted directly, one work per subfolder.  A DLSite
    multipart RAR set is driven from its head volume (the `.part1.exe`
    self-extractor); trailing `.partN.rar` volumes are deliberately ignored,
    since extracting the head volume consumes them.  With args.remove set,
    source archives are deleted after successful extraction.
    """
    # Resolve up front so the absolute volume paths reported by rarfile can
    # be checked against the files the user actually passed on the
    # command line (strict=True also verifies each argument exists).
    absolute_archive_paths = set(path.resolve(strict=True) for path in args.archives)

    for archive_path in args.archives:
        if archive_path.suffix.lower() == '.zip':
            work_id = archive_path.stem
            work_extract_path = args.destdir / 'extract' / work_id

            print(f'Extracting {archive_path} to {work_extract_path}')

            with open_zipfile_with_encoding(archive_path) as z:
                # mkdir only after the archive opens cleanly, so a broken
                # archive doesn't leave an empty work directory behind.
                work_extract_path.mkdir(parents=True)
                z.extractall(path=work_extract_path)

            if args.remove:
                archive_path.unlink()

        elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name):
            work_id = rar_match.group(1)
            work_extract_path = args.destdir / 'extract' / work_id

            print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}')

            with open_rarfile_with_encoding(archive_path) as r:
                volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()]
                # Refuse to proceed if the set references volumes the user
                # didn't list — --remove must never delete surprise files.
                if any(vol not in absolute_archive_paths for vol in volumes):
                    print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping')
                    continue
                work_extract_path.mkdir(parents=True)
                r.extractall(path=work_extract_path)

            # Unlink after the with-block so no volume is deleted while the
            # RarFile still holds it open.  (NOTE(review): the scraped diff
            # leaves the original indentation of this cleanup ambiguous —
            # confirm placement against the original repository.)
            if args.remove:
                for vol in volumes:
                    vol.unlink()

        elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name):
            # Non-head volume of a multipart set; handled via its head volume.
            pass
        else:
            print(f'Unknown archive file type {archive_path}, skipping')
|
||||
|
||||
|
||||
def manual_input_metadata(work_id):
|
||||
|
@ -1073,9 +1109,8 @@ argparser = argparse.ArgumentParser(
|
|||
that can be viewed in a web browser.
|
||||
|
||||
Intended workflow:
|
||||
- `extract` a collection of zipfiles downloaded from DLSite
|
||||
into DLibrary's data directory, giving each work its own
|
||||
subfolder.
|
||||
- `extract` a collection of archive files into DLibrary's data
|
||||
directory, automatically giving each work its own subfolder.
|
||||
- `fetch` metadata and thumbnail images for extracted works
|
||||
from DLSite.
|
||||
- `collate` extracted works, producing a single sequence of
|
||||
|
@ -1103,18 +1138,18 @@ argparser.add_argument(
|
|||
)
|
||||
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
||||
|
||||
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles')
|
||||
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files')
|
||||
parser_extract.add_argument(
|
||||
'-r', '--remove',
|
||||
action='store_true',
|
||||
help='remove original zipfiles after extraction',
|
||||
help='remove original archive files after extraction',
|
||||
)
|
||||
parser_extract.add_argument(
|
||||
'zipfiles',
|
||||
'archives',
|
||||
metavar='FILE',
|
||||
type=Path,
|
||||
nargs='+',
|
||||
help='zipfiles to extract',
|
||||
help='archive files to extract',
|
||||
)
|
||||
parser_extract.set_defaults(func=extract)
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
outputs = { self, nixpkgs }: let
|
||||
pkgs = import nixpkgs {
|
||||
system = "x86_64-linux";
|
||||
config.allowUnfree = true;
|
||||
};
|
||||
in {
|
||||
packages.x86_64-linux = with pkgs.python3Packages; rec {
|
||||
|
@ -58,6 +59,7 @@
|
|||
importlib-resources
|
||||
setuptools
|
||||
libsixel
|
||||
(rarfile.override { useUnrar = true; })
|
||||
];
|
||||
src = ./.;
|
||||
};
|
||||
|
|
|
@ -12,6 +12,7 @@ dependencies = [
|
|||
"jinja2",
|
||||
"importlib_resources",
|
||||
"libsixel-python",
|
||||
"rarfile",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
|
Loading…
Reference in a new issue