From 50098762e326063e8f6e4b44ab1572878d73e918 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 14 Oct 2021 16:23:47 +0200
Subject: [PATCH] [nhentai] add 'tag' extractor (closes #1950)

---
 docs/supportedsites.md          |  2 +-
 gallery_dl/extractor/nhentai.py | 53 +++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 27ffade3..56802401 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -508,7 +508,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
     <td>nhentai</td>
     <td>https://nhentai.net/</td>
-    <td>Favorites, Galleries, Search Results</td>
+    <td>Favorites, Galleries, Search Results, Tag Searches</td>
     <td></td>
 </tr>
 <tr>
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 20b716b2..5b5da6a8 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -99,6 +99,59 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
         ]
 
 
+class NhentaiTagExtractor(NhentaiBase, Extractor):
+    """Extractor for nhentai tag searches
+
+    Matches listing URLs below /artist, /category, /character, /group,
+    /language, /parody and /tag, with an optional 'popular...' path
+    segment, and queues each gallery ID found there with
+    NhentaiGalleryExtractor set as its '_extractor'.
+    """
+    subcategory = "tag"
+    pattern = (r"(?:https?://)?nhentai\.net("
+               r"/(?:artist|category|character|group|language|parody|tag)"
+               r"/[^/?#]+(?:/popular[^/?#]*)?/?)(?:\?([^#]+))?")
+    test = (
+        ("https://nhentai.net/tag/sole-female/", {
+            "pattern": NhentaiGalleryExtractor.pattern,
+            "count": 30,
+            "range": "1-30",
+        }),
+        ("https://nhentai.net/artist/itou-life/"),
+        ("https://nhentai.net/group/itou-life/"),
+        ("https://nhentai.net/parody/touhou-project/"),
+        ("https://nhentai.net/character/patchouli-knowledge/popular"),
+        ("https://nhentai.net/category/doujinshi/popular-today"),
+        ("https://nhentai.net/language/english/popular-week"),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        # group 1: listing path; group 2: query string (None if absent)
+        self.path, self.query = match.groups()
+
+    def items(self):
+        """Yield a Message.Queue entry for every paginated gallery ID"""
+        root = self.root
+        target = {"_extractor": NhentaiGalleryExtractor}
+        for gid in self._pagination():
+            yield Message.Queue, "{}/g/{}/".format(root, gid), target
+
+    def _pagination(self):
+        """Yield gallery IDs from consecutive result pages"""
+        listing = self.root + self.path
+        query = text.parse_query(self.query)
+        query["page"] = text.parse_int(query.get("page"), 1)
+
+        more = True
+        while more:
+            html = self.request(listing, params=query).text
+            yield from text.extract_iter(html, 'href="/g/', '/')
+            # keep going only while the page has a 'class="next"' marker
+            more = 'class="next"' in html
+            query["page"] += 1
+
+
 class NhentaiSearchExtractor(NhentaiBase, Extractor):
     """Extractor for nhentai search results"""
     subcategory = "search"