gallery-dl/gallery_dl/extractor/hiperdex.py

# -*- coding: utf-8 -*-

# Copyright 2020-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hiperdex.com/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text
from ..cache import memcache
import re

# Regex fragment shared by all URL patterns in this module. Its single
# capture group holds the site root: optional scheme, optional "www.",
# optional "1st" prefix, optional trailing digit, and a com/net/info TLD.
BASE_PATTERN = (
    r"("
    r"(?:https?://)?(?:www\.)?"
    r"(?:1st)?hiperdex\d?\.(?:com|net|info)"
    r")"
)


class HiperdexBase:
    """Shared metadata helpers for the hiperdex extractors"""
    category = "hiperdex"
    root = "https://hiperdex.com"

    @memcache(keyarg=1)
    def manga_data(self, manga, page=None):
        """Return a dict of manga-level metadata parsed from 'page'

        When 'page' is missing or empty, the HTML is requested from
        '<root>/manga/<manga>/' first.
        """
        if not page:
            page = self.request(
                "{}/manga/{}/".format(self.root, manga)).text

        # One extr() call per field, issued in exactly this order.
        # NOTE(review): the same 'summary-content' marker is used three
        # times for different fields, so extr() presumably advances
        # through the page -- do not reorder these calls.
        extr = text.extract_from(page)

        url = text.unescape(
            extr('property="og:url" content="', '"'))
        title = text.unescape(
            extr('"headline": "', '"'))
        score = text.parse_float(
            extr('id="averagerate">', '<'))
        author = text.remove_html(
            extr('class="author-content">', '</div>'))
        artist = text.remove_html(
            extr('class="artist-content">', '</div>'))
        # keep every second element of the split genre markup
        genres = text.split_html(
            extr('class="genres-content">', '</div>'))[::2]
        kind = extr('class="summary-content">', '<').strip()
        release = text.parse_int(text.remove_html(
            extr('class="summary-content">', '</div>')))
        status = extr('class="summary-content">', '<').strip()
        description = text.remove_html(text.unescape(
            extr('class="description-summary">', '</div>')))

        return {
            "url"        : url,
            "manga"      : title,
            "score"      : score,
            "author"     : author,
            "artist"     : artist,
            "genre"      : genres,
            "type"       : kind,
            "release"    : release,
            "status"     : status,
            "description": description,
            "language"   : "English",
            "lang"       : "en",
        }

    def chapter_data(self, chapter):
        """Return chapter metadata merged with the manga's metadata

        'chapter' is a chapter slug. An optional 'chapter-' prefix is
        dropped, the text before the first '-' is parsed as the chapter
        number, and the remainder (unless empty or 'end') becomes
        'chapter_minor' with a leading '.'.
        """
        prefix = "chapter-"
        if chapter.startswith(prefix):
            chapter = chapter[len(prefix):]

        major, _, minor = chapter.partition("-")
        if minor and minor != "end":
            chapter_minor = "." + minor
        else:
            chapter_minor = ""

        data = {
            "chapter"      : text.parse_int(major),
            "chapter_minor": chapter_minor,
        }
        data.update(self.manga_data(self.manga.lower()))
        return data


class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
    """Extractor for a single manga chapter on hiperdex.com"""
    pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))"
    example = "https://hiperdex.com/manga/MANGA/CHAPTER/"

    def __init__(self, match):
        # groups: site root, URL path, manga slug, chapter slug
        root, path, manga, chapter = match.groups()
        self.manga = manga
        self.chapter = chapter
        # use the domain from the input URL instead of the class default
        self.root = text.ensure_http_scheme(root)
        ChapterExtractor.__init__(self, match, self.root + path + "/")

    def metadata(self, _):
        """Return metadata for this chapter; the page argument is unused"""
        return self.chapter_data(self.chapter)

    def images(self, page):
        """Return a list of (url, None) tuples for all images in 'page'"""
        # accepts both 'src' and 'data-src' on elements with id="image-N"
        find_sources = re.compile(
            r'id="image-\d+"\s+(?:data-)?src="([^"]+)').findall

        images = []
        for src in find_sources(page):
            images.append((src.strip(), None))
        return images


class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
    """Extractor for manga from hiperdex.com"""
    chapterclass = HiperdexChapterExtractor
    pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$"
    example = "https://hiperdex.com/manga/MANGA/"

    def __init__(self, match):
        # groups: site root, URL path, manga slug
        root, path, self.manga = match.groups()
        # use the domain from the input URL instead of the class default
        self.root = text.ensure_http_scheme(root)
        MangaExtractor.__init__(self, match, self.root + path + "/")

    def chapters(self, page):
        """Return a list of (chapter URL, metadata) tuples for this manga

        'page' is the HTML of the manga overview page. The chapter list
        itself is fetched with a POST request to the manga's
        'ajax/chapters/' endpoint.
        """
        # Look the manga up under its lower-cased slug, the same value
        # chapter_data() passes to manga_data(). With @memcache(keyarg=1)
        # this presumably is the cache key, so the page parsed here gets
        # reused instead of being requested again for mixed-case URLs.
        data = self.manga_data(self.manga.lower(), page)
        self.manga_url = data["url"]

        # tolerate a canonical URL with or without a trailing slash
        url = self.manga_url.rstrip("/") + "/ajax/chapters/"

        # Percent-encode everything after the scheme (unicode titles)
        # without assuming the canonical URL starts with "https://".
        scheme, sep, rest = self.manga_url.partition("://")
        headers = {
            "Accept": "*/*",
            "X-Requested-With": "XMLHttpRequest",
            "Origin": self.root,
            "Referer": scheme + sep + text.quote(rest),
        }
        html = self.request(url, method="POST", headers=headers).text

        results = []
        for item in text.extract_iter(
                html, '<li class="wp-manga-chapter', '</li>'):
            chapter_url = text.extr(item, 'href="', '"')
            # the last path segment is the chapter slug
            chapter = chapter_url.rstrip("/").rpartition("/")[2]
            results.append((chapter_url, self.chapter_data(chapter)))
        return results


class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
    """Extractor for an artist's manga on hiperdex.com"""
    subcategory = "artist"
    categorytransfer = False
    chapterclass = HiperdexMangaExtractor
    reverse = False
    pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
    example = "https://hiperdex.com/manga-artist/NAME/"

    def __init__(self, match):
        # group 1 is the site root, group 2 the artist/author path
        root = text.ensure_http_scheme(match.group(1))
        self.root = root
        MangaExtractor.__init__(self, match, root + match.group(2) + "/")

    def chapters(self, page):
        """Return a list of (manga URL, {}) tuples found in 'page'"""
        # each listed manga gets its own, initially empty metadata dict
        return [
            (text.extr(entry, 'href="', '"'), {})
            for entry in text.extract_iter(page, 'id="manga-item-', '<img')
        ]
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`# -- coding: utf-8 --`

[hiperdex] update domain (#3572) 2023-01-26 12:01:16 +01:00			`# Copyright 2020-2023 Mike Fährmann`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[hiperdex] fix extraction (#3768) 2023-03-15 14:28:03 +01:00			`"""Extractors for https://hiperdex.com/"""`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00
			`from .common import ChapterExtractor, MangaExtractor`
			`from .. import text`
			`from ..cache import memcache`
			`import re`

[hiperdex] update domain (#3572) 2023-01-26 12:01:16 +01:00			`BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"`
			`r"(?:1st)?hiperdex\d?\.(?:com\|net\|info))")`
[hiperdex] update domain to hiperdex.info 2020-05-11 23:37:47 +02:00

[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`class HiperdexBase():`
			`"""Base class for hiperdex extractors"""`
			`category = "hiperdex"`
[hiperdex] fix extraction (#3768) 2023-03-15 14:28:03 +01:00			`root = "https://hiperdex.com"`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00
			`@memcache(keyarg=1)`
			`def manga_data(self, manga, page=None):`
			`if not page:`
			`url = "{}/manga/{}/".format(self.root, manga)`
			`page = self.request(url).text`
			`extr = text.extract_from(page)`

			`return {`
[hiperdex] fix extraction 2023-03-25 18:18:27 +01:00			`"url" : text.unescape(extr(`
			`'property="og:url" content="', '"')),`
[hiperdex] fix 'manga' metadata 2023-11-26 01:24:42 +01:00			`"manga" : text.unescape(extr(`
			`'"headline": "', '"')),`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`"score" : text.parse_float(extr(`
			`'id="averagerate">', '<')),`
			`"author" : text.remove_html(extr(`
			`'class="author-content">', '</div>')),`
			`"artist" : text.remove_html(extr(`
			`'class="artist-content">', '</div>')),`
			`"genre" : text.split_html(extr(`
			`'class="genres-content">', '</div>'))[::2],`
			`"type" : extr(`
			`'class="summary-content">', '<').strip(),`
			`"release": text.parse_int(text.remove_html(extr(`
			`'class="summary-content">', '</div>'))),`
			`"status" : extr(`
			`'class="summary-content">', '<').strip(),`
			`"description": text.remove_html(text.unescape(extr(`
			`'class="description-summary">', '</div>'))),`
			`"language": "English",`
			`"lang" : "en",`
			`}`

			`def chapter_data(self, chapter):`
[hiperdex] update domain (#3572) 2023-01-26 12:01:16 +01:00			`if chapter.startswith("chapter-"):`
			`chapter = chapter[8:]`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`chapter, _, minor = chapter.partition("-")`
			`data = {`
			`"chapter" : text.parse_int(chapter),`
[hiperdex] use proper name for 'chapter_minor' 2020-02-29 00:18:54 +01:00			`"chapter_minor": "." + minor if minor and minor != "end" else "",`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`}`
			`data.update(self.manga_data(self.manga.lower()))`
			`return data`


			`class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):`
[hiperdex] fix extraction (#3768) 2023-03-15 14:28:03 +01:00			`"""Extractor for manga chapters from hiperdex.com"""`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://hiperdex.com/manga/MANGA/CHAPTER/"`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00
			`def __init__(self, match):`
[hiperdex] use domain from input URL 2021-07-02 23:23:42 +02:00			`root, path, self.manga, self.chapter = match.groups()`
			`self.root = text.ensure_http_scheme(root)`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`ChapterExtractor.__init__(self, match, self.root + path + "/")`

			`def metadata(self, _):`
			`return self.chapter_data(self.chapter)`

			`def images(self, page):`
			`return [`
			`(url.strip(), None)`
[hiperdex] fix extraction 2020-04-03 21:25:25 +02:00			`for url in re.findall(`
			`r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page)`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`]`


			`class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):`
[hiperdex] fix extraction (#3768) 2023-03-15 14:28:03 +01:00			`"""Extractor for manga from hiperdex.com"""`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`chapterclass = HiperdexChapterExtractor`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://hiperdex.com/manga/MANGA/"`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00
			`def __init__(self, match):`
[hiperdex] use domain from input URL 2021-07-02 23:23:42 +02:00			`root, path, self.manga = match.groups()`
			`self.root = text.ensure_http_scheme(root)`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`MangaExtractor.__init__(self, match, self.root + path + "/")`

			`def chapters(self, page):`
[hiperdex] fix extraction 2023-03-25 18:18:27 +01:00			`data = self.manga_data(self.manga, page)`
			`self.manga_url = url = data["url"]`

			`url = self.manga_url + "ajax/chapters/"`
			`headers = {`
			`"Accept": "/",`
			`"X-Requested-With": "XMLHttpRequest",`
			`"Origin": self.root,`
[hiperdex] fix for unicode titles (#4325) 2023-07-22 16:15:55 +02:00			`"Referer": "https://" + text.quote(self.manga_url[8:]),`
[hiperdex] fix extraction 2023-03-25 18:18:27 +01:00			`}`
			`html = self.request(url, method="POST", headers=headers).text`
[hiperdex] fix manga extraction 2020-04-12 02:27:13 +02:00
[hiperdex] fix extraction 2023-03-25 18:18:27 +01:00			`results = []`
			`for item in text.extract_iter(`
			`html, '<li class="wp-manga-chapter', '</li>'):`
			`url = text.extr(item, 'href="', '"')`
			`chapter = url.rstrip("/").rpartition("/")[2]`
[hiperdex] fix manga extraction 2020-04-12 02:27:13 +02:00			`results.append((url, self.chapter_data(chapter)))`
[hiperdex] add chapter and manga extractors (closes #606) 2020-02-21 23:40:32 +01:00			`return results`
[hiperdex] add 'artist' extractor (#606) 2020-04-12 02:32:37 +02:00

			`class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):`
update extractor test results - don't run Instagram tests on Travis anymore - replace Twitter test because timeline was made private - update Hiperdex domain to '.com' (again ...) 2020-05-28 01:55:32 +02:00			`"""Extractor for an artists's manga on hiperdex.com"""`
[hiperdex] add 'artist' extractor (#606) 2020-04-12 02:32:37 +02:00			`subcategory = "artist"`
			`categorytransfer = False`
			`chapterclass = HiperdexMangaExtractor`
			`reverse = False`
[hiperdex] use domain from input URL 2021-07-02 23:23:42 +02:00			`pattern = BASE_PATTERN + r"(/manga-a(?:rtist\|uthor)/(?:[^/?#]+))"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://hiperdex.com/manga-artist/NAME/"`
[hiperdex] add 'artist' extractor (#606) 2020-04-12 02:32:37 +02:00
			`def __init__(self, match):`
[hiperdex] use domain from input URL 2021-07-02 23:23:42 +02:00			`self.root = text.ensure_http_scheme(match.group(1))`
			`MangaExtractor.__init__(self, match, self.root + match.group(2) + "/")`
[hiperdex] add 'artist' extractor (#606) 2020-04-12 02:32:37 +02:00
			`def chapters(self, page):`
			`results = []`
			`for info in text.extract_iter(page, 'id="manga-item-', '<img'):`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`url = text.extr(info, 'href="', '"')`
[hiperdex] add 'artist' extractor (#606) 2020-04-12 02:32:37 +02:00			`results.append((url, {}))`
			`return results`