1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 18:53:21 +01:00

adjust metadata types for GalleryExtractors

This commit is contained in:
Mike Fährmann 2019-03-01 23:13:40 +01:00
parent 13e0f2a78f
commit 26c4365baa
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
6 changed files with 92 additions and 89 deletions

View File

@ -12,16 +12,20 @@ from .common import GalleryExtractor, Extractor, Message
from .. import text
class HentaifoxGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries on hentaifox.com"""
class HentaifoxBase():
"""Base class for hentaifox extractors"""
category = "hentaifox"
root = "https://hentaifox.com"
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
"""Extractor for image galleries on hentaifox.com"""
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
test = ("https://hentaifox.com/gallery/56622/", {
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
"count": 24,
"keyword": "d0df47e073e32a7752236ab151949c3820f9d81e",
"keyword": "38f8517605feb6854d48833297da6b05c6541b69",
})
root = "https://hentaifox.com"
def __init__(self, match):
GalleryExtractor.__init__(self, match)
@ -30,7 +34,7 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
def metadata(self, page):
title, pos = text.extract(page, "<h1>", "</h1>")
data = text.extract_all(page, (
("parodies" , ">Parodies:" , "</a></span>"),
("parody" , ">Parodies:" , "</a></span>"),
("characters", ">Characters:", "</a></span>"),
("tags" , ">Tags:" , "</a></span>"),
("artist" , ">Artists:" , "</a></span>"),
@ -39,9 +43,10 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
), pos)[0]
for key, value in data.items():
data[key] = text.remove_html(value).replace(" , ", ", ")
data[key] = text.split_html(value)[::2]
data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = text.unescape(title)
data["type"] = data["type"][0] if data["type"] else ""
data["language"] = "English"
data["lang"] = "en"
return data
@ -53,9 +58,8 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
]
class HentaifoxSearchExtractor(Extractor):
class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
"""Extractor for search results and listings on hentaifox.com"""
category = "hentaifox"
subcategory = "search"
pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
@ -76,7 +80,6 @@ class HentaifoxSearchExtractor(Extractor):
},
}),
)
root = "https://hentaifox.com"
def __init__(self, match):
Extractor.__init__(self, match)

View File

@ -20,7 +20,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = (
("https://hitomi.la/galleries/867789.html", {
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
"keyword": "52951edb50163180eb669a78aef0bab0522d32b7",
"keyword": "07536afc5696cb4983a4831ab4c70c1d155f875c",
}),
("https://hitomi.la/galleries/1036181.html", {
# "aa" subdomain for gallery-id ending in 1 (#142)
@ -30,8 +30,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
self.gid = text.parse_int(match.group(1))
url = "https://hitomi.la/galleries/{}.html".format(self.gid)
self.gallery_id = text.parse_int(match.group(1))
url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@ -49,23 +49,22 @@ class HitomiGalleryExtractor(GalleryExtractor):
lang = None if lang == "N/A" else text.remove_html(lang)
return {
"gallery_id": self.gid,
"title": text.unescape(" ".join(title.split())),
"artist": self._prepare(artist),
"group": self._prepare(group),
"type": text.remove_html(gtype).capitalize(),
"lang": util.language_to_code(lang),
"language": lang,
"date": date,
"series": self._prepare(series),
"gallery_id": self.gallery_id,
"title" : text.unescape(title.strip()),
"artist" : self._prepare(artist),
"group" : self._prepare(group),
"parody" : self._prepare(series),
"characters": self._prepare(chars),
"tags": self._prepare(tags),
"tags" : self._prepare(tags),
"type" : text.remove_html(gtype).capitalize(),
"lang" : util.language_to_code(lang),
"language" : lang,
"date" : date,
}
def images(self, page):
# see https://ltn.hitomi.la/common.js
frontends = 2
offset = self.gid % frontends if self.gid % 10 != 1 else 0
offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
subdomain = chr(97 + offset) + "a"
base = "https://" + subdomain + ".hitomi.la/galleries/"
@ -78,10 +77,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
@staticmethod
def _prepare(value):
if not value or "<ul " not in value:
return ""
value = ", ".join(text.extract_iter(
value, '.html">', '<'))
return string.capwords(
text.unescape(value)
)
return [
text.unescape(string.capwords(v))
for v in text.extract_iter(value or "", '.html">', '<')
]

View File

@ -32,6 +32,7 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"title_ja" : str,
"gallery_id": 147850,
"media_id" : 867789,
"count" : 16,
"date" : 1446050915,
"scanlator" : "",
"artist" : ["morris"],
@ -40,8 +41,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": list,
"tags" : list,
"type" : "manga",
"language" : ["translated", "english"],
"lang" : "en",
"language" : "English",
"width" : int,
"height" : int,
},
@ -63,12 +64,11 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
for tag in data["tags"]:
info[tag["type"]].append(tag["name"])
language = ""
for language in info["language"]:
if language != "translated":
lang = util.language_to_code(language)
language = language.capitalize()
break
else:
lang = ""
return {
"title" : title_en or title_ja,
@ -84,8 +84,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": info["character"],
"tags" : info["tag"],
"type" : info["category"][0] if info["category"] else "",
"language" : info["language"],
"lang" : lang,
"lang" : util.language_to_code(language),
"language" : language,
}
def images(self, _):

View File

@ -20,24 +20,24 @@ class PururinGalleryExtractor(GalleryExtractor):
test = ("https://pururin.io/gallery/38661/iowant-2", {
"pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
"keyword": {
"artist": "Shoda Norihiro",
"title" : "Iowant 2!!",
"title_en" : "Iowant 2!!",
"title_jp" : "",
"gallery_id": 38661,
"count" : 19,
"artist" : ["Shoda Norihiro"],
"group" : ["Obsidian Order"],
"parody" : ["Kantai Collection"],
"characters": ["Iowa", "Teitoku"],
"tags" : list,
"type" : "Doujinshi",
"collection": "",
"convention": "C92",
"count": 19,
"extension": "jpg",
"gallery_id": 38661,
"group": "Obsidian Order",
"lang": "en",
"language": "English",
"parody": "Kantai Collection",
"rating": float,
"scanlator": "",
"tags": list,
"title": "Iowant 2!!",
"title_jp": str,
"type": "Doujinshi",
"uploader": "demo"
"rating" : float,
"uploader" : "demo",
"scanlator" : "",
"lang" : "en",
"language" : "English",
}
})
root = "https://pururin.io"
@ -74,18 +74,19 @@ class PururinGalleryExtractor(GalleryExtractor):
self._ext = info["image_extension"]
self._cnt = info["total_pages"]
for key in ("tags", "characters"):
for key in ("artist", "group", "parody", "tags", "characters"):
data[key] = [
text.unescape(item)
for item in text.extract_iter(data[key], 'title="', '"')
]
for key in ("artist", "group", "parody", "type", "collection",
"language", "scanlator", "convention"):
for key in ("type", "collection", "language", "scanlator",
"convention"):
data[key] = text.unescape(text.extract(
data[key], 'title="', '"')[0] or "")
data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = info["title"]
data["title"] = info["title"] or info.get("j_title") or ""
data["title_en"] = info["title"]
data["title_jp"] = info.get("j_title") or ""
data["uploader"] = text.remove_html(data["uploader"])
data["rating"] = text.parse_float(data["rating"])

View File

@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), {
"url": "258289249990502c3138719cb89e995a60861e49",
"keyword": "468a0a3db4fc6ad7fcae0facefb9753831c0404d",
"keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
}),
("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException,
@ -55,14 +55,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
return {
"gallery_id": text.parse_int(gid),
"title": text.unescape(title),
"series": text.remove_html(series),
"characters": ", ".join(text.split_html(chars)),
"tags": text.split_html(tags),
"artist": ", ".join(text.split_html(artist)),
"lang": util.language_to_code(lang),
"language": lang,
"date": text.remove_html(date),
"title" : text.unescape(title),
"artist" : text.split_html(artist),
"parody" : text.split_html(series),
"characters": text.split_html(chars),
"tags" : text.split_html(tags),
"lang" : util.language_to_code(lang),
"language" : lang,
"date" : text.remove_html(date),
}
def images(self, _):

View File

@ -48,24 +48,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
("https://www.tsumino.com/Book/Info/40996", {
"url": "84bf30a86623039fc87855680fada884dc8a1ddd",
"keyword": {
"artist": "Itou Life",
"characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale",
"collection": "",
"count": 42,
"date": "2018 June 29",
"title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"gallery_id": 40996,
"group": "Itou Life",
"lang": "en",
"language": "English",
"page": int,
"parodies": "Fate/Grand Order",
"rating": float,
"tags": str,
"thumbnail": "http://www.tsumino.com/Image/Thumb/40996",
"title": r"re:Shikoshiko Daisuki Nightingale \+ Kaijou Gentei",
"title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"type": "Doujinshi",
"uploader": "sehki"
"date" : "2018 June 29",
"count" : 42,
"collection": "",
"artist" : ["Itou Life"],
"group" : ["Itou Life"],
"parody" : ["Fate/Grand Order"],
"characters": list,
"tags" : list,
"type" : "Doujinshi",
"rating" : float,
"uploader" : "sehki",
"lang" : "en",
"language" : "English",
"thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
},
}),
("https://www.tsumino.com/Read/View/45834"),
@ -81,6 +81,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
title, pos = extr(page, '"og:title" content="', '"')
thumb, pos = extr(page, '"og:image" content="', '"', pos)
title_en, _, title_jp = text.unescape(title).partition("/")
title_en = title_en.strip()
title_jp = title_jp.strip()
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
@ -95,19 +97,20 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
return {
"gallery_id": text.parse_int(self.gallery_id),
"title": title_en.strip(),
"title_jp": title_jp.strip(),
"title": title_en or title_jp,
"title_en": title_en,
"title_jp": title_jp,
"thumbnail": thumb,
"uploader": text.remove_html(uploader),
"date": date.strip(),
"rating": text.parse_float(rating.partition(" ")[0]),
"type": text.remove_html(gtype),
"collection": text.remove_html(collection),
"group": text.remove_html(group),
"artist": ", ".join(text.split_html(artist)),
"parodies": ", ".join(text.split_html(parody)),
"characters": ", ".join(text.split_html(character)),
"tags": ", ".join(text.split_html(tags)),
"group": text.split_html(group),
"artist": text.split_html(artist),
"parody": text.split_html(parody),
"characters": text.split_html(character),
"tags": text.split_html(tags),
"language": "English",
"lang": "en",
}