From 380a481d9be01184cf0dbbe620bbc55ff68f1597 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sun, 8 Dec 2024 17:07:23 -0500
Subject: [PATCH] extract images from epubs

---
 dlibrary/dlibrary.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 7b97233..8868f83 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -97,8 +97,20 @@ ALT_VERSIONS = [
 
 IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
 
-IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store', 'desktop.ini']
-IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
+IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store', 'desktop.ini', 'mimetype']
+IGNOREABLE_EXTENSIONS = [
+    '.txt',
+    '.xml',
+    '.html',
+    '.htm',
+    '.xhtml',
+    '.css',
+    '.js',
+    '.psd',
+    '.mp4',
+    '.opf',
+    '.ncx',
+]
 
 PDF_CONVERSION_DPI = 300
 PDF_PREVIEW_DPI = 72
@@ -179,7 +191,7 @@ def extract(args):
     any_skipped = False
 
     for archive_path in args.archives:
-        if archive_path.suffix.lower() == '.zip':
+        if archive_path.suffix.lower() in ['.zip', '.epub']:
             work_id = archive_path.stem
             work_extract_path = args.destdir / 'extract' / work_id