gallery-dl/gallery_dl/extractor/mangapark.py

# -*- coding: utf-8 -*-

# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://mangapark.net/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text, exception
import json
import re


class MangaparkBase():
    """Base class for mangapark extractors

    Provides the shared category name, the root-URL template and two
    static helpers that fill a metadata dict from a chapter's URL path
    or from its title.
    """
    category = "mangapark"
    # '{}' is replaced with the top-level domain taken from the input URL
    root_fmt = "https://mangapark.{}"

    @staticmethod
    def parse_chapter_path(path, data):
        """Get volume/chapter information from url-path of a chapter

        'path' is split on "/" and everything before the first slash is
        discarded. Each remaining segment is dispatched on its first
        character, with the rest of the segment as its value:

            c -> "chapter" (part before the first ".") and
                 "chapter_minor" (the "." and everything after it)
            i -> "chapter_id"
            v -> "volume"
            s -> "stream"
            e -> "chapter_minor" ("v" + value)

        Segments with any other first character, as well as empty
        segments, are ignored. 'data' is modified in place; "volume" and
        "chapter_minor" are always reset to 0 and "" first.
        """
        data["volume"], data["chapter_minor"] = 0, ""
        for part in path.split("/")[1:]:
            # Slice instead of index: an empty segment (trailing or
            # doubled slash) yields an empty key and is skipped rather
            # than raising IndexError.
            key, value = part[:1], part[1:]
            if key == "c":
                chapter, dot, minor = value.partition(".")
                data["chapter"] = text.parse_int(chapter)
                data["chapter_minor"] = dot + minor
            elif key == "i":
                data["chapter_id"] = text.parse_int(value)
            elif key == "v":
                data["volume"] = text.parse_int(value)
            elif key == "s":
                data["stream"] = text.parse_int(value)
            elif key == "e":
                data["chapter_minor"] = "v" + value

    @staticmethod
    def parse_chapter_title(title, data):
        """Get volume/chapter information from the title of a chapter

        Searches 'title' case-insensitively for an optional
        "vol"/"volume" number followed by a "ch"/"chapter" number with
        an optional ".suffix". On a match, "volume", "chapter" and
        "chapter_minor" in 'data' are overwritten; without a match
        'data' is left untouched.
        """
        match = re.search(r"(?i)(?:vol(?:ume)?[ .]*(\d+) )?"
                          r"ch(?:apter)?[ .]*(\d+)(\.\w+)?", title)
        if match:
            vol, ch, minor = match.groups()
            # 'vol' is None when the title has no volume part; it is
            # handed to text.parse_int() as is.
            data["volume"] = text.parse_int(vol)
            data["chapter"] = text.parse_int(ch)
            # The suffix group is optional and None when absent; store ""
            # so "chapter_minor" is always a string, the same as in
            # parse_chapter_path().
            data["chapter_minor"] = minor or ""


class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
    """Extractor for manga-chapters from mangapark.net"""
    pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
               r"/manga/([^?#]+/i\d+)")
    test = (
        ("https://mangapark.net/manga/gosu/i811653/c055/1", {
            "count": 50,
            "keyword": "8344bdda8cd8414e7729a4e148379f147e3437da",
        }),
        (("https://mangapark.net/manga"
          "/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), {
            "count": 40,
            "keyword": "2bb3a8f426383ea13f17ff5582f3070d096d30ac",
        }),
        (("https://mangapark.net/manga"
          "/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), {
            "count": 15,
            "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1",
        }),
        ("https://mangapark.me/manga/gosu/i811615/c55/1"),
        ("https://mangapark.com/manga/gosu/i811615/c55/1"),
    )

    def __init__(self, match):
        """Build the chapter URL from the matched TLD and chapter path"""
        tld = match.group(1)
        self.path = match.group(2)
        self.root = self.root_fmt.format(tld)
        chapter_url = self.root + "/manga/" + self.path + "?zoom=2"
        ChapterExtractor.__init__(self, match, chapter_url)

    def metadata(self, page):
        """Collect chapter metadata from the chapter page

        Raises exception.NotFoundError when the page carries no
        '_book_link' value.
        """
        rules = (
            ("manga_id"  , "var _manga_id = '", "'"),
            ("chapter_id", "var _book_id = '", "'"),
            ("stream"    , "var _stream = '", "'"),
            ("path"      , "var _book_link = '", "'"),
            ("manga"     , "<h2>", "</h2>"),
            ("title"     , "</a>", "<"),
        )
        defaults = {"lang": "en", "language": "English"}
        info = text.extract_all(page, rules, values=defaults)[0]

        if not info["path"]:
            raise exception.NotFoundError("chapter")

        # prefer the URL path; fall back to the title only when the
        # path did not provide a chapter number
        self.parse_chapter_path(info["path"], info)
        if "chapter" not in info:
            self.parse_chapter_title(info["title"], info)

        # the last word of the <h2> heading is stored as "type",
        # everything before it as the manga name
        name, _, kind = info["manga"].rpartition(" ")
        info["manga"] = text.unescape(name)
        info["type"] = kind
        info["title"] = info["title"].partition(": ")[2]

        for field in ("manga_id", "chapter_id", "stream"):
            info[field] = text.parse_int(info[field])

        return info

    def images(self, page):
        """Return a list of (image URL, {width, height}) tuples"""
        raw = text.extract(page, "var _load_pages =", ";")[0]
        result = []
        for entry in json.loads(raw):
            url = text.urljoin(self.root, entry["u"])
            size = {
                "width": text.parse_int(entry["w"]),
                "height": text.parse_int(entry["h"]),
            }
            result.append((url, size))
        return result


class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
    """Extractor for manga from mangapark.net"""
    chapterclass = MangaparkChapterExtractor
    pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
               r"(/manga/[^/?#]+)/?$")
    test = (
        ("https://mangapark.net/manga/aria", {
            "url": "9b62883c25c8de471f8ab43651e1448536c4ce3f",
            "keyword": "eb4a9b273c69acf31efa731eba713e1cfa14bab6",
        }),
        ("https://mangapark.me/manga/aria"),
        ("https://mangapark.com/manga/aria"),
    )

    def __init__(self, match):
        """Build the manga URL from the matched TLD and manga path"""
        tld, manga_path = match.group(1), match.group(2)
        self.root = self.root_fmt.format(tld)
        MangaExtractor.__init__(self, match, self.root + manga_path)

    def chapters(self, page):
        """Return a list of (chapter URL, metadata dict) tuples

        One entry is produced for every '<li ...>' element inside every
        '<div id="stream_...' section of the manga page.
        """
        manga = text.extract(page, '<title>', ' Manga - ')[0]
        data = {
            "lang": "en",
            "language": "English",
            "manga": text.unescape(manga),
        }
        results = []

        # the text before the first stream marker is not a stream
        sections = page.split('<div id="stream_')
        for section in sections[1:]:
            # each section starts with the stream number, ended by '"'
            stream_id = text.extract(section, '', '"')[0]
            data["stream"] = text.parse_int(stream_id)

            for item in text.extract_iter(section, '<li ', '</li>'):
                path  , pos = text.extract(item, 'href="', '"')
                title1, pos = text.extract(item, '>', '<', pos)
                title2, pos = text.extract(item, '>: </span>', '<', pos)
                count , pos = text.extract(item, '  of ', ' ', pos)

                self.parse_chapter_path(path[8:], data)
                if "chapter" not in data:
                    self.parse_chapter_title(title1, data)

                data["title"] = (
                    title2.strip() if title2 else
                    title1.partition(":")[2].strip()
                )
                data["count"] = text.parse_int(count)

                results.append((self.root + path, dict(data)))
                # 'data' is reused for the next item; drop "chapter" so
                # the title fallback above can trigger again
                data.pop("chapter", None)

        return results
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00			`# -- coding: utf-8 --`

[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`# Copyright 2015-2020 Mike Fährmann`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`"""Extractors for https://mangapark.net/"""`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00
implement generic manga-chapter extractor 2018-02-03 23:14:32 +01:00			`from .common import ChapterExtractor, MangaExtractor`
[mangapark] detect non-existent chapters 2018-12-27 21:41:50 +01:00			`from .. import text, exception`
[mangapark] fix extraction 2019-01-17 21:21:57 +01:00			`import json`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`import re`
[mangapark] add manga extractor 2015-12-09 00:07:18 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
rename some base classes They shouldn't be called …Extractor if they don't have 'Extractor' as their base class. 2019-02-08 11:43:40 +01:00			`class MangaparkBase():`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`"""Base class for mangapark extractors"""`
[mangapark] add manga extractor 2015-12-09 00:07:18 +01:00			`category = "mangapark"`
[mangapark] support .net and .com mirrors 2018-07-05 14:45:05 +02:00			`root_fmt = "https://mangapark.{}"`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00
			`@staticmethod`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 2017-09-24 15:59:25 +02:00			`def parse_chapter_path(path, data):`
			`"""Get volume/chapter information from url-path of a chapter"""`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`data["volume"], data["chapter_minor"] = 0, ""`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`for part in path.split("/")[1:]:`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`key, value = part[0], part[1:]`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`if key == "c":`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`chapter, dot, minor = value.partition(".")`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`data["chapter"] = text.parse_int(chapter)`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`data["chapter_minor"] = dot + minor`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`elif key == "i":`
			`data["chapter_id"] = text.parse_int(value)`
			`elif key == "v":`
			`data["volume"] = text.parse_int(value)`
			`elif key == "s":`
			`data["stream"] = text.parse_int(value)`
[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00			`elif key == "e":`
			`data["chapter_minor"] = "v" + value`

[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`@staticmethod`
			`def parse_chapter_title(title, data):`
			`match = re.search(r"(?i)(?:vol(?:ume)?[ .]*(\d+) )?"`
			`r"ch(?:apter)?[ .]*(\d+)(\.\w+)?", title)`
			`if match:`
			`vol, ch, data["chapter_minor"] = match.groups()`
			`data["volume"] = text.parse_int(vol)`
			`data["chapter"] = text.parse_int(ch)`

[mangapark] extract manga metadata + code improvements 2017-09-22 17:33:58 +02:00
rename some base classes They shouldn't be called …Extractor if they don't have 'Extractor' as their base class. 2019-02-08 11:43:40 +01:00			`class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`"""Extractor for manga-chapters from mangapark.net"""`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me\|net\|com)"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`r"/manga/([^?#]+/i\d+)")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`test = (`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`("https://mangapark.net/manga/gosu/i811653/c055/1", {`
implement support for additional unit test result types - "pattern" matches all resulting URLs against the given regex - "count" allows to specify the amount of returned URLs 2017-08-25 22:01:14 +02:00			`"count": 50,`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`"keyword": "8344bdda8cd8414e7729a4e148379f147e3437da",`
[mangapark] small fixes and additions - add a 'title' keyword for chapter-titles and update the directory format accordingly - add a 'type' keyword to distinguish between manga and manhwa - fix an issue where an exception would be thrown if a chapter number did not have any special additions (2.5, 55a, v2, etc.) - add a test-case without a special chapter number - unescape manga title 2016-11-16 14:20:25 +01:00			`}),`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`(("https://mangapark.net/manga"`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`"/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), {`
implement support for additional unit test result types - "pattern" matches all resulting URLs against the given regex - "count" allows to specify the amount of returned URLs 2017-08-25 22:01:14 +02:00			`"count": 40,`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`"keyword": "2bb3a8f426383ea13f17ff5582f3070d096d30ac",`
more extractor test-cases 2015-12-14 03:00:58 +01:00			`}),`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`(("https://mangapark.net/manga"`
			`"/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), {`
[kissmanga] use HTTPS 2018-12-30 13:19:35 +01:00			`"count": 15,`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`"keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1",`
implement support for additional unit test result types - "pattern" matches all resulting URLs against the given regex - "count" allows to specify the amount of returned URLs 2017-08-25 22:01:14 +02:00			`}),`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`("https://mangapark.me/manga/gosu/i811615/c55/1"),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`("https://mangapark.com/manga/gosu/i811615/c55/1"),`
			`)`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00
			`def __init__(self, match):`
[mangapark] support .net and .com mirrors 2018-07-05 14:45:05 +02:00			`tld, self.path = match.groups()`
			`self.root = self.root_fmt.format(tld)`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`url = "{}/manga/{}?zoom=2".format(self.root, self.path)`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`ChapterExtractor.__init__(self, match, url)`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def metadata(self, page):`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`data = text.extract_all(page, (`
change keyword names to valid Python identifiers This commit mostly replaces all minus-signs ('-') in keyword names with underscores ('_') to allow them to be used in filter-expressions. For example 'gallery-id' got renamed to 'gallery_id'. (It is theoretically possible to access any variable, regardless of its name, with 'locals()["NAME"]', but that seems a bit too convoluted if just 'NAME' could be enough) 2017-09-10 22:20:47 +02:00			`("manga_id" , "var _manga_id = '", "'"),`
			`("chapter_id", "var _book_id = '", "'"),`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`("stream" , "var _stream = '", "'"),`
			`("path" , "var _book_link = '", "'"),`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00			`("manga" , "<h2>", "</h2>"),`
[mangapark] small fixes and additions - add a 'title' keyword for chapter-titles and update the directory format accordingly - add a 'type' keyword to distinguish between manga and manhwa - fix an issue where an exception would be thrown if a chapter number did not have any special additions (2.5, 55a, v2, etc.) - add a test-case without a special chapter number - unescape manga title 2016-11-16 14:20:25 +01:00			`("title" , "</a>", "<"),`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`), values={"lang": "en", "language": "English"})[0]`
[mangapark] detect non-existent chapters 2018-12-27 21:41:50 +01:00
			`if not data["path"]:`
			`raise exception.NotFoundError("chapter")`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`self.parse_chapter_path(data["path"], data)`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`if "chapter" not in data:`
			`self.parse_chapter_title(data["title"], data)`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00
use 'str.partition()' The (r)partition method is always faster then split() or any other method that has been replaced in this commit. 2017-08-21 18:29:50 +02:00			`data["manga"], _, data["type"] = data["manga"].rpartition(" ")`
[mangapark] small fixes and additions - add a 'title' keyword for chapter-titles and update the directory format accordingly - add a 'type' keyword to distinguish between manga and manhwa - fix an issue where an exception would be thrown if a chapter number did not have any special additions (2.5, 55a, v2, etc.) - add a test-case without a special chapter number - unescape manga title 2016-11-16 14:20:25 +01:00			`data["manga"] = text.unescape(data["manga"])`
use 'str.partition()' The (r)partition method is always faster then split() or any other method that has been replaced in this commit. 2017-08-21 18:29:50 +02:00			`data["title"] = data["title"].partition(": ")[2]`
[mangapark] fix extraction 2019-01-17 21:21:57 +01:00			`for key in ("manga_id", "chapter_id", "stream"):`
[mangapark] fix extraction 2018-12-21 23:32:04 +01:00			`data[key] = text.parse_int(data[key])`

[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00			`return data`

change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def images(self, page):`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`data = json.loads(text.extract(page, "var _load_pages =", ";")[0])`
[mangapark] fix extraction 2019-01-17 21:21:57 +01:00			`return [`
			`(text.urljoin(self.root, item["u"]), {`
			`"width": text.parse_int(item["w"]),`
			`"height": text.parse_int(item["h"]),`
			`})`
			`for item in data`
			`]`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00

			`class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`"""Extractor for manga from mangapark.net"""`
add '_extractor' info to manga extractor results 2019-02-13 13:23:36 +01:00			`chapterclass = MangaparkChapterExtractor`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me\|net\|com)"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`r"(/manga/[^/?#]+)/?$")`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`test = (`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`("https://mangapark.net/manga/aria", {`
update extractor test results - sensescans: replace 404d chapters - mangapark: replace 404d chapters - subscribestar: update test for attached files 2020-08-28 22:26:54 +02:00			`"url": "9b62883c25c8de471f8ab43651e1448536c4ce3f",`
			`"keyword": "eb4a9b273c69acf31efa731eba713e1cfa14bab6",`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`}),`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`("https://mangapark.me/manga/aria"),`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`("https://mangapark.com/manga/aria"),`
			`)`

			`def __init__(self, match):`
			`self.root = self.root_fmt.format(match.group(1))`
			`MangaExtractor.__init__(self, match, self.root + match.group(2))`

			`def chapters(self, page):`
			`results = []`
			`data = {"lang": "en", "language": "English"}`
			`data["manga"] = text.unescape(`
			`text.extract(page, '<title>', ' Manga - ')[0])`

			`for stream in page.split('<div id="stream_')[1:]:`
			`data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])`

			`for chapter in text.extract_iter(stream, '<li ', '</li>'):`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`path , pos = text.extract(chapter, 'href="', '"')`
			`title1, pos = text.extract(chapter, '>', '<', pos)`
			`title2, pos = text.extract(chapter, '>: </span>', '<', pos)`
			`count , pos = text.extract(chapter, ' of ', ' ', pos)`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00
			`self.parse_chapter_path(path[8:], data)`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`if "chapter" not in data:`
			`self.parse_chapter_title(title1, data)`

			`if title2:`
			`data["title"] = title2.strip()`
			`else:`
			`data["title"] = title1.partition(":")[2].strip()`

change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`data["count"] = text.parse_int(count)`
			`results.append((self.root + path, data.copy()))`
[mangapark] fix metadata extraction 2020-03-27 22:07:42 +01:00			`data.pop("chapter", None)`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00
			`return results`