# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hentaifox.com/"""

from .common import ChapterExtractor, Extractor, Message
from .. import text


class HentaifoxGalleryExtractor(ChapterExtractor):
    """Extractor for image galleries on hentaifox.com"""
    category = "hentaifox"
    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
    directory_fmt = ("{category}", "{gallery_id} {title}")
    archive_fmt = "{gallery_id}_{page}"
    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com/gallery/(\d+)"
    test = ("https://hentaifox.com/gallery/56622/", {
        "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
        "count": 24,
        "keyword": "80fc0fb5db9626fffb078dd2e4f9aff4a9348686",
    })
    root = "https://hentaifox.com"

    def __init__(self, match):
        self.gallery_id = match.group(1)
        url = "{}/gallery/{}".format(self.root, self.gallery_id)
        ChapterExtractor.__init__(self, url)
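
    # collect title and tag metadata from the gallery page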
    def get_metadata(self, page):
        title, pos = text.extract(page, "<h1>", "</h1>")
        data = text.extract_all(page, (
            ("parodies"  , ">Parodies:"  , "</a></span>"),
            ("characters", ">Characters:", "</a></span>"),
            ("tags"      , ">Tags:"      , "</a></span>"),
            ("artist"    , ">Artists:"   , "</a></span>"),
            ("group"     , ">Groups:"    , "</a></span>"),
            ("type"      , ">Category:"  , "</a></span>"),
        ), pos)[0]

        for key, value in data.items():
            data[key] = text.remove_html(value).replace(" , ", ", ")

        data["gallery_id"] = text.parse_int(self.gallery_id)
        data["title"] = text.unescape(title)
        data["language"] = "English"
        data["lang"] = "en"
        return data
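
    # thumbnail URLs are embedded in 'data-src' attributes;
    # dropping the "t" filename suffix yields the full-size image URL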
    def get_images(self, page):
        return [
            (text.urljoin(self.root, url.replace("t.", ".")), None)
            for url in text.extract_iter(page, 'data-src="', '"')
        ]


class HentaifoxSearchExtractor(Extractor):
    """Extractor for search results and listings on hentaifox.com"""
    category = "hentaifox"
    subcategory = "search"
    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
               r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
    test = (
        ("https://hentaifox.com/parody/touhou-project/"),
        ("https://hentaifox.com/character/reimu-hakurei/"),
        ("https://hentaifox.com/artist/distance/"),
        ("https://hentaifox.com/search/touhou/"),
        ("https://hentaifox.com/tag/full-colour/", {
            "pattern": HentaifoxGalleryExtractor.pattern,
            "count": ">= 40",
            "keyword": {
                "url": str,
                "gallery_id": int,
                "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.",
                "title": str,
                "tags": list,
            },
        }),
    )
    root = "https://hentaifox.com"

    def __init__(self, match):
        Extractor.__init__(self)
        self.path = match.group(1)
    def items(self):
        yield Message.Version, 1
        for gallery in self.galleries():
            yield Message.Queue, gallery["url"], gallery
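
    # walk all result pages and yield one metadata dict per gallery entry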
    def galleries(self):
        # self.path already starts with "/" (captured by the pattern)
        url = "{}{}/".format(self.root, self.path)

        while True:
            page = self.request(url).text
            info, gpos = text.extract(
                page, 'class="galleries_overview">', 'class="clear">')

            for ginfo in text.extract_iter(info, '<div class="item', '</a>'):
                tags , pos = text.extract(ginfo, '', '"')
                url  , pos = text.extract(ginfo, 'href="', '"', pos)
                title, pos = text.extract(ginfo, 'alt="', '"', pos)
                thumb, pos = text.extract(ginfo, 'src="', '"', pos)

                yield {
                    "url": text.urljoin(self.root, url),
                    "gallery_id": text.parse_int(
                        url.strip("/").rpartition("/")[2]),
                    "thumbnail": text.urljoin(self.root, thumb),
                    "title": text.unescape(title),
                    "tags": tags.split(),
                }
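
            # follow the link right after the "current" page marker;
            # stop when it no longer points to another listing page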
            pos = page.find('class="current"', gpos)
            url = text.extract(page, 'href="', '"', pos)[0]
            if pos == -1 or "/pag" not in url:
                return
            url = text.urljoin(self.root, url)