diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
index c7dc8f87..417c095b 100644
--- a/gallery_dl/extractor/hentaifox.py
+++ b/gallery_dl/extractor/hentaifox.py
@@ -12,16 +12,20 @@ from .common import GalleryExtractor, Extractor, Message
from .. import text
-class HentaifoxGalleryExtractor(GalleryExtractor):
- """Extractor for image galleries on hentaifox.com"""
+class HentaifoxBase():
+ """Base class for hentaifox extractors"""
category = "hentaifox"
+ root = "https://hentaifox.com"
+
+
+class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
+ """Extractor for image galleries on hentaifox.com"""
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
test = ("https://hentaifox.com/gallery/56622/", {
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
"count": 24,
- "keyword": "d0df47e073e32a7752236ab151949c3820f9d81e",
+ "keyword": "38f8517605feb6854d48833297da6b05c6541b69",
})
- root = "https://hentaifox.com"
def __init__(self, match):
GalleryExtractor.__init__(self, match)
@@ -30,7 +34,7 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
def metadata(self, page):
title, pos = text.extract(page, "
", "
")
data = text.extract_all(page, (
- ("parodies" , ">Parodies:" , ""),
+ ("parody" , ">Parodies:" , ""),
("characters", ">Characters:", ""),
("tags" , ">Tags:" , ""),
("artist" , ">Artists:" , ""),
@@ -39,9 +43,10 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
), pos)[0]
for key, value in data.items():
- data[key] = text.remove_html(value).replace(" , ", ", ")
+ data[key] = text.split_html(value)[::2]
data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = text.unescape(title)
+ data["type"] = data["type"][0] if data["type"] else ""
data["language"] = "English"
data["lang"] = "en"
return data
@@ -53,9 +58,8 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
]
-class HentaifoxSearchExtractor(Extractor):
+class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
"""Extractor for search results and listings on hentaifox.com"""
- category = "hentaifox"
subcategory = "search"
pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
@@ -76,7 +80,6 @@ class HentaifoxSearchExtractor(Extractor):
},
}),
)
- root = "https://hentaifox.com"
def __init__(self, match):
Extractor.__init__(self, match)
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 4de7c938..5a518beb 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -20,7 +20,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = (
("https://hitomi.la/galleries/867789.html", {
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
- "keyword": "52951edb50163180eb669a78aef0bab0522d32b7",
+ "keyword": "07536afc5696cb4983a4831ab4c70c1d155f875c",
}),
("https://hitomi.la/galleries/1036181.html", {
# "aa" subdomain for gallery-id ending in 1 (#142)
@@ -30,8 +30,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
- self.gid = text.parse_int(match.group(1))
- url = "https://hitomi.la/galleries/{}.html".format(self.gid)
+ self.gallery_id = text.parse_int(match.group(1))
+ url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -49,23 +49,22 @@ class HitomiGalleryExtractor(GalleryExtractor):
lang = None if lang == "N/A" else text.remove_html(lang)
return {
- "gallery_id": self.gid,
- "title": text.unescape(" ".join(title.split())),
- "artist": self._prepare(artist),
- "group": self._prepare(group),
- "type": text.remove_html(gtype).capitalize(),
- "lang": util.language_to_code(lang),
- "language": lang,
- "date": date,
- "series": self._prepare(series),
+ "gallery_id": self.gallery_id,
+ "title" : text.unescape(title.strip()),
+ "artist" : self._prepare(artist),
+ "group" : self._prepare(group),
+ "parody" : self._prepare(series),
"characters": self._prepare(chars),
- "tags": self._prepare(tags),
+ "tags" : self._prepare(tags),
+ "type" : text.remove_html(gtype).capitalize(),
+ "lang" : util.language_to_code(lang),
+ "language" : lang,
+ "date" : date,
}
def images(self, page):
# see https://ltn.hitomi.la/common.js
- frontends = 2
- offset = self.gid % frontends if self.gid % 10 != 1 else 0
+ offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
subdomain = chr(97 + offset) + "a"
base = "https://" + subdomain + ".hitomi.la/galleries/"
@@ -78,10 +77,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
@staticmethod
def _prepare(value):
- if not value or "', '<'))
- return string.capwords(
- text.unescape(value)
- )
+ return [
+ string.capwords(text.unescape(v))  # unescape before recasing
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index bd23d9fd..746144a9 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -32,6 +32,7 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"title_ja" : str,
"gallery_id": 147850,
"media_id" : 867789,
+ "count" : 16,
"date" : 1446050915,
"scanlator" : "",
"artist" : ["morris"],
@@ -40,8 +41,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": list,
"tags" : list,
"type" : "manga",
- "language" : ["translated", "english"],
"lang" : "en",
+ "language" : "English",
"width" : int,
"height" : int,
},
@@ -63,12 +64,11 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
for tag in data["tags"]:
info[tag["type"]].append(tag["name"])
+ language = ""
- for language in info["language"]:
- if language != "translated":
- lang = util.language_to_code(language)
+ for name in info["language"]:
+ if name != "translated":  # marker tag, not a language
+ language = name.capitalize()
break
- else:
- lang = ""
return {
"title" : title_en or title_ja,
@@ -84,8 +84,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": info["character"],
"tags" : info["tag"],
"type" : info["category"][0] if info["category"] else "",
- "language" : info["language"],
- "lang" : lang,
+ "lang" : util.language_to_code(language),
+ "language" : language,
}
def images(self, _):
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index 35ca86aa..151f5f3a 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -20,24 +20,24 @@ class PururinGalleryExtractor(GalleryExtractor):
test = ("https://pururin.io/gallery/38661/iowant-2", {
"pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
"keyword": {
- "artist": "Shoda Norihiro",
+ "title" : "Iowant 2!!",
+ "title_en" : "Iowant 2!!",
+ "title_jp" : "",
+ "gallery_id": 38661,
+ "count" : 19,
+ "artist" : ["Shoda Norihiro"],
+ "group" : ["Obsidian Order"],
+ "parody" : ["Kantai Collection"],
"characters": ["Iowa", "Teitoku"],
+ "tags" : list,
+ "type" : "Doujinshi",
"collection": "",
"convention": "C92",
- "count": 19,
- "extension": "jpg",
- "gallery_id": 38661,
- "group": "Obsidian Order",
- "lang": "en",
- "language": "English",
- "parody": "Kantai Collection",
- "rating": float,
- "scanlator": "",
- "tags": list,
- "title": "Iowant 2!!",
- "title_jp": str,
- "type": "Doujinshi",
- "uploader": "demo"
+ "rating" : float,
+ "uploader" : "demo",
+ "scanlator" : "",
+ "lang" : "en",
+ "language" : "English",
}
})
root = "https://pururin.io"
@@ -74,18 +74,19 @@ class PururinGalleryExtractor(GalleryExtractor):
self._ext = info["image_extension"]
self._cnt = info["total_pages"]
- for key in ("tags", "characters"):
+ for key in ("artist", "group", "parody", "tags", "characters"):
data[key] = [
text.unescape(item)
for item in text.extract_iter(data[key], 'title="', '"')
]
- for key in ("artist", "group", "parody", "type", "collection",
- "language", "scanlator", "convention"):
+ for key in ("type", "collection", "language", "scanlator",
+ "convention"):
data[key] = text.unescape(text.extract(
data[key], 'title="', '"')[0] or "")
data["gallery_id"] = text.parse_int(self.gallery_id)
- data["title"] = info["title"]
+ data["title"] = info["title"] or info.get("j_title") or ""
+ data["title_en"] = info["title"]
data["title_jp"] = info.get("j_title") or ""
data["uploader"] = text.remove_html(data["uploader"])
data["rating"] = text.parse_float(data["rating"])
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index d9a8ebba..44dc6fe8 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), {
"url": "258289249990502c3138719cb89e995a60861e49",
- "keyword": "468a0a3db4fc6ad7fcae0facefb9753831c0404d",
+ "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
}),
("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException,
@@ -55,14 +55,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
return {
"gallery_id": text.parse_int(gid),
- "title": text.unescape(title),
- "series": text.remove_html(series),
- "characters": ", ".join(text.split_html(chars)),
- "tags": text.split_html(tags),
- "artist": ", ".join(text.split_html(artist)),
- "lang": util.language_to_code(lang),
- "language": lang,
- "date": text.remove_html(date),
+ "title" : text.unescape(title),
+ "artist" : text.split_html(artist),
+ "parody" : text.split_html(series),
+ "characters": text.split_html(chars),
+ "tags" : text.split_html(tags),
+ "lang" : util.language_to_code(lang),
+ "language" : lang,
+ "date" : text.remove_html(date),
}
def images(self, _):
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index c0d5a32b..ec80c78d 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -48,24 +48,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
("https://www.tsumino.com/Book/Info/40996", {
"url": "84bf30a86623039fc87855680fada884dc8a1ddd",
"keyword": {
- "artist": "Itou Life",
- "characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale",
- "collection": "",
- "count": 42,
- "date": "2018 June 29",
+ "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"gallery_id": 40996,
- "group": "Itou Life",
- "lang": "en",
- "language": "English",
- "page": int,
- "parodies": "Fate/Grand Order",
- "rating": float,
- "tags": str,
- "thumbnail": "http://www.tsumino.com/Image/Thumb/40996",
- "title": r"re:Shikoshiko Daisuki Nightingale \+ Kaijou Gentei",
- "title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本",
- "type": "Doujinshi",
- "uploader": "sehki"
+ "date" : "2018 June 29",
+ "count" : 42,
+ "collection": "",
+ "artist" : ["Itou Life"],
+ "group" : ["Itou Life"],
+ "parody" : ["Fate/Grand Order"],
+ "characters": list,
+ "tags" : list,
+ "type" : "Doujinshi",
+ "rating" : float,
+ "uploader" : "sehki",
+ "lang" : "en",
+ "language" : "English",
+ "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
},
}),
("https://www.tsumino.com/Read/View/45834"),
@@ -81,6 +81,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
title, pos = extr(page, '"og:title" content="', '"')
thumb, pos = extr(page, '"og:image" content="', '"', pos)
title_en, _, title_jp = text.unescape(title).partition("/")
+ title_en = title_en.strip()
+ title_jp = title_jp.strip()
uploader , pos = extr(page, 'id="Uploader">' , '', pos)
date , pos = extr(page, 'id="Uploaded">' , '', pos)
@@ -95,19 +97,20 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
return {
"gallery_id": text.parse_int(self.gallery_id),
- "title": title_en.strip(),
- "title_jp": title_jp.strip(),
+ "title": title_en or title_jp,
+ "title_en": title_en,
+ "title_jp": title_jp,
"thumbnail": thumb,
"uploader": text.remove_html(uploader),
"date": date.strip(),
"rating": text.parse_float(rating.partition(" ")[0]),
"type": text.remove_html(gtype),
"collection": text.remove_html(collection),
- "group": text.remove_html(group),
- "artist": ", ".join(text.split_html(artist)),
- "parodies": ", ".join(text.split_html(parody)),
- "characters": ", ".join(text.split_html(character)),
- "tags": ", ".join(text.split_html(tags)),
+ "group": text.split_html(group),
+ "artist": text.split_html(artist),
+ "parody": text.split_html(parody),
+ "characters": text.split_html(character),
+ "tags": text.split_html(tags),
"language": "English",
"lang": "en",
}