From f170d73ffc543cfc51d1ee0c21267886a1a6c334 Mon Sep 17 00:00:00 2001 From: space-nuko <24979496+space-nuko@users.noreply.github.com> Date: Wed, 1 Mar 2023 19:00:20 -0800 Subject: [PATCH 1/2] [hitomi] add 'index' and 'search' extractors - Support hitomi.la multiple tag searches - Support hitomi.la index searches - Fix tests --- gallery_dl/extractor/hitomi.py | 120 +++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 18df9dfa..384908d3 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -16,6 +16,22 @@ import string import re +def get_nozomi_args(query): + ns, tag = query.strip().split(":") + area = ns + language = "all" + + if ns == "female" or ns == "male": + area = "tag" + tag = query + elif "language" == ns: + area = None + language = tag + tag = "index" + + return area, tag, language + + class HitomiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from hitomi.la""" category = "hitomi" @@ -103,6 +119,53 @@ class HitomiGalleryExtractor(GalleryExtractor): return result +class HitomiIndexExtractor(Extractor): + """Extractor for galleries from index searches on hitomi.la""" + category = "hitomi" + subcategory = "index" + root = "https://hitomi.la" + pattern = (r"(?:https?://)?hitomi\.la/" + r"([a-zA-Z0-9_]+)-([a-zA-Z0-9_]+)\.html") + test = ( + ("https://hitomi.la/index-japanese.html", { + "pattern": HitomiGalleryExtractor.pattern, + "count": ">= 35", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.tag, self.language = match.groups() + + def items(self): + data = {"_extractor": HitomiGalleryExtractor} + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(self.tag, self.language) + headers = { + "Origin": self.root, + "Cache-Control": "max-age=0", + } + + offset = 0 + total = None + while True: + headers["Referer"] = "{}/{}-{}.html?page={}".format( + self.root, self.tag, self.language, offset // 100 + 1) + headers["Range"] = "bytes={}-{}".format(offset, offset+99) + response = self.request(nozomi_url, headers=headers) + + for gallery_id in decode_nozomi(response.content): + gallery_url = "{}/galleries/{}.html".format( + self.root, gallery_id) + yield Message.Queue, gallery_url, data + + offset += 100 + if total is None: + total = text.parse_int( + response.headers["content-range"].rpartition("/")[2]) + if offset >= total: + return + + class HitomiTagExtractor(Extractor): """Extractor for galleries from tag searches on hitomi.la""" category = "hitomi" @@ -151,6 +214,63 @@ class HitomiTagExtractor(Extractor): return +class HitomiSearchExtractor(Extractor): + """Extractor for galleries from multiple tag searches on hitomi.la""" + category = "hitomi" + subcategory = "search" + root = "https://hitomi.la" + pattern = (r"(?:https?://)?hitomi\.la/search.html" + r"\?([^/?#]+)") + test = ( + ("https://hitomi.la/search.html?tag%3Ascreenshots%20language%3Ajapanese", { + "pattern": HitomiGalleryExtractor.pattern, + "count": ">= 35", + }), + ("https://hitomi.la/search.html?language%3Ajapanese%20artist%3Asumiya"), + ("https://hitomi.la/search.html?group:initial_g"), + ("https://hitomi.la/search.html?series:amnesia"), + ("https://hitomi.la/search.html?type%3Adoujinshi"), + ("https://hitomi.la/search.html?character%3Aa2"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + self.tags = text.unquote(self.query).split(" ") + + def get_nozomi_items(self, full_tag): + area, tag, language = get_nozomi_args(full_tag) + + if area: + referer_base = "{}/n/{}/{}-{}.html".format(self.root, area, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}/{}-{}.nozomi".format(area, tag, language) + else: + referer_base = "{}/n/{}-{}.html".format(self.root, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(tag, language) + + headers = { + "Origin": self.root, + "Cache-Control": "max-age=0", + } + + headers["Referer"] = f"{referer_base}/search.html?{self.query}" + response = self.request(nozomi_url, headers=headers) + + result = set(decode_nozomi(response.content)) + return result + + def items(self): + data = {"_extractor": HitomiGalleryExtractor} + + results = [self.get_nozomi_items(tag) for tag in self.tags] + intersects = set.intersection(*results) + + for gallery_id in sorted(intersects, reverse=True): + gallery_url = "{}/galleries/{}.html".format( + self.root, gallery_id) + yield Message.Queue, gallery_url, data + + @memcache(maxage=1800) def _parse_gg(extr): page = extr.request("https://ltn.hitomi.la/gg.js").text From 6f54328a397e283e69a1f35cf901c69d40d41843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 29 Oct 2024 16:51:19 +0100 Subject: [PATCH 2/2] [hitomi] update - remove f-strings - fix flake8 warnings - move tests to test/results/hitomi.py --- docs/supportedsites.md | 2 +- gallery_dl/extractor/hitomi.py | 183 +++++++++++++++------------------ test/results/hitomi.py | 37 +++++++ 3 files changed, 121 insertions(+), 101 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2d3f1774..ba5aed8f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -364,7 +364,7 @@ Consider all listed sites to potentially be NSFW. Hitomi.la https://hitomi.la/ - Galleries, Tag Searches + Galleries, Site Index, Search Results, Tag Searches diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 384908d3..308b42c6 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -16,22 +16,6 @@ import string import re -def get_nozomi_args(query): - ns, tag = query.strip().split(":") - area = ns - language = "all" - - if ns == "female" or ns == "male": - area = "tag" - tag = query - elif "language" == ns: - area = None - language = tag - tag = "index" - - return area, tag, language - - class HitomiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from hitomi.la""" category = "hitomi" @@ -119,61 +103,14 @@ class HitomiGalleryExtractor(GalleryExtractor): return result -class HitomiIndexExtractor(Extractor): - """Extractor for galleries from index searches on hitomi.la""" - category = "hitomi" - subcategory = "index" - root = "https://hitomi.la" - pattern = (r"(?:https?://)?hitomi\.la/" - r"([a-zA-Z0-9_]+)-([a-zA-Z0-9_]+)\.html") - test = ( - ("https://hitomi.la/index-japanese.html", { - "pattern": HitomiGalleryExtractor.pattern, - "count": ">= 35", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.tag, self.language = match.groups() - - def items(self): - data = {"_extractor": HitomiGalleryExtractor} - nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(self.tag, self.language) - headers = { - "Origin": self.root, - "Cache-Control": "max-age=0", - } - - offset = 0 - total = None - while True: - headers["Referer"] = "{}/{}-{}.html?page={}".format( - self.root, self.tag, self.language, offset // 100 + 1) - headers["Range"] = "bytes={}-{}".format(offset, offset+99) - response = self.request(nozomi_url, headers=headers) - - for gallery_id in decode_nozomi(response.content): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) - yield Message.Queue, gallery_url, data - - offset += 100 - if total is None: - total = text.parse_int( - response.headers["content-range"].rpartition("/")[2]) - if offset >= total: - return - - class HitomiTagExtractor(Extractor): """Extractor for galleries from tag searches on hitomi.la""" category = "hitomi" subcategory = "tag" root = "https://hitomi.la" - pattern = (r"(?:https?://)?hitomi\.la/" - r"(tag|artist|group|series|type|character)/" - r"([^/?#]+)\.html") + pattern = (r"(?:https?://)?hitomi\.la" + r"/(tag|artist|group|series|type|character)" + r"/([^/?#]+)\.html") example = "https://hitomi.la/tag/TAG-LANG.html" def __init__(self, match): @@ -214,50 +151,58 @@ class HitomiTagExtractor(Extractor): return -class HitomiSearchExtractor(Extractor): - """Extractor for galleries from multiple tag searches on hitomi.la""" - category = "hitomi" - subcategory = "search" - root = "https://hitomi.la" - pattern = (r"(?:https?://)?hitomi\.la/search.html" - r"\?([^/?#]+)") - test = ( - ("https://hitomi.la/search.html?tag%3Ascreenshots%20language%3Ajapanese", { - "pattern": HitomiGalleryExtractor.pattern, - "count": ">= 35", - }), - ("https://hitomi.la/search.html?language%3Ajapanese%20artist%3Asumiya"), - ("https://hitomi.la/search.html?group:initial_g"), - ("https://hitomi.la/search.html?series:amnesia"), - ("https://hitomi.la/search.html?type%3Adoujinshi"), - ("https://hitomi.la/search.html?character%3Aa2"), - ) +class HitomiIndexExtractor(HitomiTagExtractor): + """Extractor for galleries from index searches on hitomi.la""" + subcategory = "index" + pattern = r"(?:https?://)?hitomi\.la/(\w+)-(\w+)\.html" + example = "https://hitomi.la/index-LANG.html" def __init__(self, match): Extractor.__init__(self, match) - self.query = match.group(1) - self.tags = text.unquote(self.query).split(" ") - - def get_nozomi_items(self, full_tag): - area, tag, language = get_nozomi_args(full_tag) - - if area: - referer_base = "{}/n/{}/{}-{}.html".format(self.root, area, tag, language) - nozomi_url = "https://ltn.hitomi.la/{}/{}-{}.nozomi".format(area, tag, language) - else: - referer_base = "{}/n/{}-{}.html".format(self.root, tag, language) - nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(tag, language) + self.tag, self.language = match.groups() + def items(self): + data = {"_extractor": HitomiGalleryExtractor} + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( + self.tag, self.language) headers = { "Origin": self.root, "Cache-Control": "max-age=0", } - headers["Referer"] = f"{referer_base}/search.html?{self.query}" - response = self.request(nozomi_url, headers=headers) + offset = 0 + total = None + while True: + headers["Referer"] = "{}/{}-{}.html?page={}".format( + self.root, self.tag, self.language, offset // 100 + 1) + headers["Range"] = "bytes={}-{}".format(offset, offset+99) + response = self.request(nozomi_url, headers=headers) - result = set(decode_nozomi(response.content)) - return result + for gallery_id in decode_nozomi(response.content): + gallery_url = "{}/galleries/{}.html".format( + self.root, gallery_id) + yield Message.Queue, gallery_url, data + + offset += 100 + if total is None: + total = text.parse_int( + response.headers["content-range"].rpartition("/")[2]) + if offset >= total: + return + + +class HitomiSearchExtractor(Extractor): + """Extractor for galleries from multiple tag searches on hitomi.la""" + category = "hitomi" + subcategory = "search" + root = "https://hitomi.la" + pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" + example = "https://hitomi.la/search.html?QUERY" + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + self.tags = text.unquote(self.query).split(" ") def items(self): data = {"_extractor": HitomiGalleryExtractor} @@ -270,6 +215,44 @@ class HitomiSearchExtractor(Extractor): self.root, gallery_id) yield Message.Queue, gallery_url, data + def get_nozomi_items(self, full_tag): + area, tag, language = self.get_nozomi_args(full_tag) + + if area: + referer_base = "{}/n/{}/{}-{}.html".format( + self.root, area, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}/{}-{}.nozomi".format( + area, tag, language) + else: + referer_base = "{}/n/{}-{}.html".format( + self.root, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( + tag, language) + + headers = { + "Origin": self.root, + "Cache-Control": "max-age=0", + "Referer": "{}/search.html?{}".format(referer_base, self.query), + } + + response = self.request(nozomi_url, headers=headers) + return set(decode_nozomi(response.content)) + + def get_nozomi_args(self, query): + ns, _, tag = query.strip().partition(":") + area = ns + language = "all" + + if ns == "female" or ns == "male": + area = "tag" + tag = query + elif ns == "language": + area = None + language = tag + tag = "index" + + return area, tag, language + @memcache(maxage=1800) def _parse_gg(extr): diff --git a/test/results/hitomi.py b/test/results/hitomi.py index 78fa0799..1b0ffcba 100644 --- a/test/results/hitomi.py +++ b/test/results/hitomi.py @@ -194,4 +194,41 @@ __tests__ = ( "#class" : hitomi.HitomiTagExtractor, }, +{ + "#url" : "https://hitomi.la/index-japanese.html", + "#class" : hitomi.HitomiIndexExtractor, + "#pattern" : hitomi.HitomiGalleryExtractor.pattern, + "#range" : "1-150", + "#count" : 150, +}, + +{ + "#url" : "https://hitomi.la/search.html?tag%3Ascreenshots%20language%3Ajapanese", + "#class" : hitomi.HitomiSearchExtractor, + "#pattern" : hitomi.HitomiGalleryExtractor.pattern, + "#range" : "1-150", + "#count" : 150, +}, + +{ + "#url" : "https://hitomi.la/search.html?language%3Ajapanese%20artist%3Asumiya", + "#class" : hitomi.HitomiSearchExtractor, +}, +{ + "#url" : "https://hitomi.la/search.html?group:initial_g", + "#class" : hitomi.HitomiSearchExtractor, +}, +{ + "#url" : "https://hitomi.la/search.html?series:amnesia", + "#class" : hitomi.HitomiSearchExtractor, +}, +{ + "#url" : "https://hitomi.la/search.html?type%3Adoujinshi", + "#class" : hitomi.HitomiSearchExtractor, +}, +{ + "#url" : "https://hitomi.la/search.html?character%3Aa2", + "#class" : hitomi.HitomiSearchExtractor, +}, + )