[nhentai] add 'tag' extractor (closes #1950)

2024-11-23 03:02:50 +01:00 · 2021-10-14 16:23:47 +02:00 · 2021-10-14 16:23:47 +02:00 · 50098762e3
commit 50098762e3
parent fe6ce5495a
2 changed files with 44 additions and 1 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -508,7 +508,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
    <td>nhentai</td>
    <td>https://nhentai.net/</td>
-    <td>Favorites, Galleries, Search Results</td>
+    <td>Favorites, Galleries, Search Results, Tag Searches</td>
    <td></td>
 </tr>
 <tr>
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -99,6 +99,49 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
        ]
 class NhentaiTagExtractor(NhentaiBase, Extractor):
    """Extractor for nhentai tag searches"""
    subcategory = "tag"
    # Capture group 1 is the listing path (one of the listed namespaces,
    # a name, and an optional "/popular..." suffix); capture group 2 is
    # the raw query string after "?", or None when the URL has none.
    pattern = (r"(?:https?://)?nhentai\.net("
               r"/(?:artist|category|character|group|language|parody|tag)"
               r"/[^/?#]+(?:/popular[^/?#]*)?/?)(?:\?([^#]+))?")
    test = (
        ("https://nhentai.net/tag/sole-female/", {
            "pattern": NhentaiGalleryExtractor.pattern,
            "count": 30,
            "range": "1-30",
        }),
        ("https://nhentai.net/artist/itou-life/"),
        ("https://nhentai.net/group/itou-life/"),
        ("https://nhentai.net/parody/touhou-project/"),
        ("https://nhentai.net/character/patchouli-knowledge/popular"),
        ("https://nhentai.net/category/doujinshi/popular-today"),
        ("https://nhentai.net/language/english/popular-week"),
    )

    def __init__(self, match):
        """Keep the listing path and query string captured by 'pattern'"""
        Extractor.__init__(self, match)
        listing_path, raw_query = match.groups()
        self.path = listing_path
        self.query = raw_query

    def items(self):
        """Yield one Message.Queue entry per ID from _pagination()"""
        # The same metadata dict is handed out with every queued URL.
        queue_meta = {"_extractor": NhentaiGalleryExtractor}
        for gid in self._pagination():
            gallery_url = "{}/g/{}/".format(self.root, gid)
            yield Message.Queue, gallery_url, queue_meta

    def _pagination(self):
        """Yield gallery IDs page by page until no 'next' marker is left"""
        listing_url = self.root + self.path
        query = text.parse_query(self.query)
        # Start at the page given in the URL; fall back to page 1.
        query["page"] = text.parse_int(query.get("page"), 1)

        while True:
            html = self.request(listing_url, params=query).text
            # Presumably each result is the text between 'href="/g/' and
            # the next '/' -- verify against text.extract_iter.
            yield from text.extract_iter(html, 'href="/g/', '/')
            if 'class="next"' in html:
                query["page"] += 1
            else:
                break
 class NhentaiSearchExtractor(NhentaiBase, Extractor):
    """Extractor for nhentai search results"""
    subcategory = "search"