2019-01-28 18:00:32 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-02-07 23:14:53 +01:00
|
|
|
# Copyright 2019-2023 Mike Fährmann
|
2019-01-28 18:00:32 +01:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://hentaifox.com/"""
|
|
|
|
|
2019-02-26 14:08:02 +01:00
|
|
|
from .common import GalleryExtractor, Extractor, Message
|
2023-02-07 23:14:53 +01:00
|
|
|
from .. import text, util
|
2019-01-28 18:00:32 +01:00
|
|
|
|
|
|
|
|
2019-03-01 23:13:40 +01:00
|
|
|
class HentaifoxBase():
    """Common attributes shared by all hentaifox extractors"""
    root = "https://hentaifox.com"
    category = "hentaifox"
|
|
|
|
|
|
|
|
|
|
|
|
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
    """Extractor for image galleries on hentaifox.com"""
    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
    example = "https://hentaifox.com/gallery/12345/"

    def __init__(self, match):
        GalleryExtractor.__init__(self, match)
        # numeric gallery ID captured by the second pattern group
        self.gallery_id = match.group(2)

    @staticmethod
    def _split(txt):
        """Extract a list of tag names from a tag-list HTML fragment.

        Each tag sits between a "class='tag_btn" marker and the
        "<span class='t_badge" counter that follows it; everything after
        the first '>' is the visible tag text, stripped of HTML.
        """
        return [
            text.remove_html(tag.partition(">")[2], "", "")
            for tag in text.extract_iter(
                txt, "class='tag_btn", "<span class='t_badge")
        ]

    def metadata(self, page):
        """Collect metadata for the current gallery from its HTML page"""
        extr = text.extract_from(page)
        split = self._split

        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "parody"    : split(extr(">Parodies:" , "</ul>")),
            "characters": split(extr(">Characters:", "</ul>")),
            "tags"      : split(extr(">Tags:"      , "</ul>")),
            "artist"    : split(extr(">Artists:"   , "</ul>")),
            "group"     : split(extr(">Groups:"    , "</ul>")),
            "type"      : text.remove_html(extr(">Category:", "<span")),
            "title"     : text.unescape(extr(
                'id="gallery_title" value="', '"')),
            "language"  : "English",
            "lang"      : "en",
        }

    def images(self, page):
        """Return a list of (image URL, metadata) tuples for this gallery.

        Image info comes from an embedded "$.parseJSON('...')" blob mapping
        page number -> "<ext code>,<width>,<height>"; the server path is
        derived from the cover image URL.
        """
        cover, pos = text.extract(page, '<img src="', '"')
        data , pos = text.extract(page, "$.parseJSON('", "');", pos)
        path = "/".join(cover.split("/")[3:-1])

        result = []
        append = result.append
        # NOTE(review): original map covered only j/p/g; 'w' (webp) and
        # 'b' (bmp) codes show up in newer galleries on similar sites and
        # would otherwise raise KeyError — confirm against live galleries.
        extmap = {
            "j": "jpg",
            "p": "png",
            "g": "gif",
            "w": "webp",
            "b": "bmp",
        }
        urlfmt = ("/" + path + "/{}.{}").format

        server1 = "https://i.hentaifox.com"
        server2 = "https://i2.hentaifox.com"

        for num, image in util.json_loads(data).items():
            ext, width, height = image.split(",")
            path = urlfmt(num, extmap[ext])
            append((server1 + path, {
                "width"    : width,
                "height"   : height,
                # secondary image server used if the primary 404s
                "_fallback": (server2 + path,),
            }))

        return result
|
2019-01-28 22:38:32 +01:00
|
|
|
|
|
|
|
|
2019-03-01 23:13:40 +01:00
|
|
|
class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
    """Extractor for search results and listings on hentaifox.com"""
    subcategory = "search"
    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
               r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)")
    example = "https://hentaifox.com/tag/TAG/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # listing path, e.g. "/tag/TAG" (first pattern group)
        self.path = match.group(1)

    def items(self):
        # delegate each found gallery to HentaifoxGalleryExtractor
        for gallery in self.galleries():
            yield Message.Queue, gallery["url"], gallery

    def galleries(self):
        """Yield one metadata dict per gallery across all listing pages"""
        num = 1

        while True:
            url = "{}{}/pag/{}/".format(self.root, self.path, num)
            page = self.request(url).text

            for info in text.extract_iter(
                    page, 'class="g_title"><a href="', '</a>'):
                url, _, title = info.partition('">')

                yield {
                    "url"       : text.urljoin(self.root, url),
                    "gallery_id": text.parse_int(
                        url.strip("/").rpartition("/")[2]),
                    "title"     : text.unescape(title),
                    "_extractor": HentaifoxGalleryExtractor,
                }

            # stop when there is no usable 'Next' link;
            # text.rextract() can return None for 'url' when no "href="
            # precedes 'pos', which would make the 'in' test raise TypeError
            pos = page.find(">Next<")
            url = text.rextract(page, "href=", ">", pos)[0]
            if pos == -1 or url is None or "/pag" not in url:
                return
            num += 1
|