support DLSite's multipart RAR archives for larger works
This commit is contained in:
parent
894b1d34b6
commit
9ab587d399
|
@ -21,6 +21,7 @@ import fitz
|
||||||
from libsixel import *
|
from libsixel import *
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from jinja2 import Environment, PackageLoader, select_autoescape
|
from jinja2 import Environment, PackageLoader, select_autoescape
|
||||||
|
import rarfile
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
NUMBER_REGEX = re.compile('[0-90-9]+')
|
NUMBER_REGEX = re.compile('[0-90-9]+')
|
||||||
|
@ -84,32 +85,67 @@ PDF_PREVIEW_DPI = 72
|
||||||
|
|
||||||
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
||||||
|
|
||||||
|
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
|
||||||
|
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)
|
||||||
|
|
||||||
def open_zipfile_with_encoding(path):
|
def open_zipfile_with_encoding(path):
|
||||||
|
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
|
||||||
try:
|
try:
|
||||||
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
return zipfile.ZipFile(path, metadata_encoding=enc)
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
print(f'{path} contains filenames with unknown character encoding!')
|
||||||
return zipfile.ZipFile(path, metadata_encoding="shift-jis")
|
exit(1)
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213")
|
def open_rarfile_with_encoding(path):
|
||||||
|
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
|
||||||
|
rf = rarfile.RarFile(path, charset=enc)
|
||||||
|
if all('\ufffd' not in info.filename for info in rf.infolist()):
|
||||||
|
return rf
|
||||||
|
|
||||||
|
print(f'{path} contains filenames with unknown character encoding!')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
def extract(args):
|
def extract(args):
|
||||||
for zip_path in args.zipfiles:
|
absolute_archive_paths = set(path.resolve(strict=True) for path in args.archives)
|
||||||
work_id = zip_path.stem
|
|
||||||
|
for archive_path in args.archives:
|
||||||
|
if archive_path.suffix.lower() == '.zip':
|
||||||
|
work_id = archive_path.stem
|
||||||
work_extract_path = args.destdir / 'extract' / work_id
|
work_extract_path = args.destdir / 'extract' / work_id
|
||||||
|
|
||||||
|
print(f'Extracting {archive_path} to {work_extract_path}')
|
||||||
|
|
||||||
|
with open_zipfile_with_encoding(archive_path) as z:
|
||||||
work_extract_path.mkdir(parents=True)
|
work_extract_path.mkdir(parents=True)
|
||||||
|
|
||||||
print(f'Extracting {zip_path} to {work_extract_path}')
|
|
||||||
|
|
||||||
with open_zipfile_with_encoding(zip_path) as z:
|
|
||||||
z.extractall(path=work_extract_path)
|
z.extractall(path=work_extract_path)
|
||||||
|
|
||||||
if args.remove:
|
if args.remove:
|
||||||
zip_path.unlink()
|
archive_path.unlink()
|
||||||
|
|
||||||
|
elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name):
|
||||||
|
work_id = rar_match.group(1)
|
||||||
|
work_extract_path = args.destdir / 'extract' / work_id
|
||||||
|
|
||||||
|
print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}')
|
||||||
|
|
||||||
|
with open_rarfile_with_encoding(archive_path) as r:
|
||||||
|
volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()]
|
||||||
|
if any(vol not in absolute_archive_paths for vol in volumes):
|
||||||
|
print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping')
|
||||||
|
continue
|
||||||
|
work_extract_path.mkdir(parents=True)
|
||||||
|
r.extractall(path=work_extract_path)
|
||||||
|
|
||||||
|
if args.remove:
|
||||||
|
for vol in volumes:
|
||||||
|
vol.unlink()
|
||||||
|
|
||||||
|
elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
print(f'Unknown archive file type {archive_path}, skipping')
|
||||||
|
|
||||||
|
|
||||||
def manual_input_metadata(work_id):
|
def manual_input_metadata(work_id):
|
||||||
|
@ -1073,9 +1109,8 @@ argparser = argparse.ArgumentParser(
|
||||||
that can be viewed in a web browser.
|
that can be viewed in a web browser.
|
||||||
|
|
||||||
Intended workflow:
|
Intended workflow:
|
||||||
- `extract` a collection of zipfiles downloaded from DLSite
|
- `extract` a collection of archive files into DLibrary's data
|
||||||
into DLibrary's data directory, giving each work its own
|
directory, automatically giving each work its own subfolder.
|
||||||
subfolder.
|
|
||||||
- `fetch` metadata and thumbnail images for extracted works
|
- `fetch` metadata and thumbnail images for extracted works
|
||||||
from DLSite.
|
from DLSite.
|
||||||
- `collate` extracted works, producing a single sequence of
|
- `collate` extracted works, producing a single sequence of
|
||||||
|
@ -1103,18 +1138,18 @@ argparser.add_argument(
|
||||||
)
|
)
|
||||||
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
||||||
|
|
||||||
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles')
|
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files')
|
||||||
parser_extract.add_argument(
|
parser_extract.add_argument(
|
||||||
'-r', '--remove',
|
'-r', '--remove',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help='remove original zipfiles after extraction',
|
help='remove original archive files after extraction',
|
||||||
)
|
)
|
||||||
parser_extract.add_argument(
|
parser_extract.add_argument(
|
||||||
'zipfiles',
|
'archives',
|
||||||
metavar='FILE',
|
metavar='FILE',
|
||||||
type=Path,
|
type=Path,
|
||||||
nargs='+',
|
nargs='+',
|
||||||
help='zipfiles to extract',
|
help='archive files to extract',
|
||||||
)
|
)
|
||||||
parser_extract.set_defaults(func=extract)
|
parser_extract.set_defaults(func=extract)
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
outputs = { self, nixpkgs }: let
|
outputs = { self, nixpkgs }: let
|
||||||
pkgs = import nixpkgs {
|
pkgs = import nixpkgs {
|
||||||
system = "x86_64-linux";
|
system = "x86_64-linux";
|
||||||
|
config.allowUnfree = true;
|
||||||
};
|
};
|
||||||
in {
|
in {
|
||||||
packages.x86_64-linux = with pkgs.python3Packages; rec {
|
packages.x86_64-linux = with pkgs.python3Packages; rec {
|
||||||
|
@ -58,6 +59,7 @@
|
||||||
importlib-resources
|
importlib-resources
|
||||||
setuptools
|
setuptools
|
||||||
libsixel
|
libsixel
|
||||||
|
(rarfile.override { useUnrar = true; })
|
||||||
];
|
];
|
||||||
src = ./.;
|
src = ./.;
|
||||||
};
|
};
|
||||||
|
|
|
@ -12,6 +12,7 @@ dependencies = [
|
||||||
"jinja2",
|
"jinja2",
|
||||||
"importlib_resources",
|
"importlib_resources",
|
||||||
"libsixel-python",
|
"libsixel-python",
|
||||||
|
"rarfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|
Loading…
Reference in a new issue