gallery-dl/gallery_dl/extractor/mangadex.py

# -*- coding: utf-8 -*-

# Copyright 2018-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract manga-chapters and entire manga from https://mangadex.org/"""

from .common import Extractor, Message
from .. import text, util


class MangadexExtractor(Extractor):
    """Common functionality shared by all mangadex extractors"""
    category = "mangadex"
    root = "https://mangadex.org"

    # language codes used by mangadex that differ from ISO 639-1,
    # mapped to the ISO 639-1 code used in their place
    iso639_map = {
        "br": "pt",
        "ct": "ca",
        "gb": "en",
        "vn": "vi",
    }

    def chapter_data(self, chapter_id):
        """Fetch the chapter API endpoint for 'chapter_id'

        Returns the JSON-decoded response body. Nothing is cached,
        so every call performs a request.
        """
        endpoint = "{}/api/chapter/{}".format(self.root, chapter_id)
        response = self.request(endpoint)
        return response.json()

    def manga_data(self, manga_id, *, cache={}):
        """Fetch the manga API endpoint for 'manga_id'

        Returns the JSON-decoded response body. Results are memoized in
        'cache', keyed by 'manga_id'. The default dict is intentionally
        mutable: it is created once at definition time and therefore shared
        by every call and every extractor instance that does not pass its
        own 'cache'.
        """
        try:
            # cache hit: skip the request entirely
            return cache[manga_id]
        except KeyError:
            pass

        endpoint = "{}/api/manga/{}".format(self.root, manga_id)
        result = self.request(endpoint).json()
        cache[manga_id] = result
        return result


class MangadexChapterExtractor(MangadexExtractor):
    """Extractor for manga-chapters from mangadex.org"""
    subcategory = "chapter"
    directory_fmt = (
        "{category}", "{manga}",
        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}")
    filename_fmt = (
        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
    archive_fmt = "{chapter_id}_{page}"
    pattern = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"
    # NOTE(review): the "keyword" hashes below were recorded while "group"
    # was still emitted without unescaping; re-record them if the group name
    # of either chapter contains HTML entities.
    test = (
        ("https://mangadex.org/chapter/122094", {
            "keyword": "7bd7f82ab9d3f06976c4b68afe78d0040851ac3c",
            "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
        }),
        # oneshot
        ("https://mangadex.org/chapter/138086", {
            "count": 64,
            "keyword": "435e157dc5529d152458ba751ffe5bfbaf4850fb",
        }),
    )

    def __init__(self, match):
        """Store the chapter ID captured by 'pattern'"""
        MangadexExtractor.__init__(self, match)
        # kept as the matched string; it is used as-is as a key into the
        # manga API's "chapter" mapping in metadata()
        self.chapter_id = match.group(1)
        # chapter API result; set by metadata() and read by images()
        self.data = None

    def items(self):
        """Yield version, directory and one URL message per page"""
        # metadata() has to run before images(), since it is what
        # populates self.data
        data = self.metadata()
        imgs = self.images()
        data["count"] = len(imgs)

        yield Message.Version, 1
        yield Message.Directory, data
        # the same 'data' dict is reused for every page; "page" is
        # overwritten in place with the 1-based page number
        for data["page"], url in enumerate(imgs, 1):
            yield Message.Url, url, text.nameext_from_url(url, data)

    def metadata(self):
        """Return a dict with general metadata

        Requests the chapter API result (stored in self.data as a side
        effect) as well as the API result of the manga it belongs to.
        """
        cdata = self.chapter_data(self.chapter_id)
        mdata = self.manga_data(cdata["manga_id"])
        self.data = cdata

        # split e.g. "12.5" into "12", "." and "5"; without a "." both
        # 'sep' and 'minor' are empty strings
        chapter, sep, minor = cdata["chapter"].partition(".")
        return {
            "manga": mdata["manga"]["title"],
            "manga_id": cdata["manga_id"],
            "artist": mdata["manga"]["artist"],
            "author": mdata["manga"]["author"],
            "title": text.unescape(cdata["title"]),
            "volume": text.parse_int(cdata["volume"]),
            "chapter": text.parse_int(chapter),
            "chapter_minor": sep + minor,
            "chapter_id": cdata["id"],
            # unescaped like "title", and like the same "group_name" field
            # is by MangadexMangaExtractor.chapters(), so that both
            # extractors report identical group names for a chapter
            "group": text.unescape(
                mdata["chapter"][self.chapter_id]["group_name"]),
            "date": cdata["timestamp"],
            "lang": util.language_to_code(cdata["lang_name"]),
            "language": cdata["lang_name"],
        }

    def images(self):
        """Return a list of all image URLs

        Requires self.data to have been set by metadata().
        """
        base = self.data["server"] + self.data["hash"] + "/"
        # a "server" value starting with "/" is relative to the site root
        if base.startswith("/"):
            base = text.urljoin(self.root, base)
        return [base + page for page in self.data["page_array"]]


class MangadexMangaExtractor(MangadexExtractor):
    """Extractor for manga from mangadex.org"""
    subcategory = "manga"
    categorytransfer = True
    pattern = (r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)"
               r"/(?:title|manga)/(\d+)")
    test = (
        ("https://mangadex.org/manga/2946/souten-no-koumori", {
            "pattern": r"https://mangadex.org/chapter/\d+",
            # the key is "keyword" (singular), as in the other metadata
            # tests of this module; this entry was misspelt "keywords"
            "keyword": {
                "manga": "Souten no Koumori",
                "manga_id": 2946,
                "title": "Oneshot",
                "volume": 0,
                "chapter": 0,
                "chapter_minor": "",
                "chapter_id": int,
                "group": str,
                "date": int,
                "lang": str,
                "language": str,
            },
        }),
        ("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", {
            "count": ">= 100",
        }),
        ("https://mangadex.org/title/13004/yorumori-no-kuni-no-sora-ni", {
            "count": 0,
        }),
        ("https://mangadex.org/title/2946/souten-no-koumori"),
    )

    def __init__(self, match):
        """Store the manga ID captured by 'pattern' as an integer"""
        MangadexExtractor.__init__(self, match)
        self.manga_id = text.parse_int(match.group(1))

    def items(self):
        """Yield a version message and one queue message per chapter"""
        yield Message.Version, 1
        for data in self.chapters():
            # each chapter is queued by its chapter-page URL, together with
            # the metadata collected for it
            url = "{}/chapter/{}".format(self.root, data["chapter_id"])
            yield Message.Queue, url, data

    def chapters(self):
        """Return a sorted list of chapter-metadata dicts

        The list is ordered by ("chapter", "chapter_minor"). An empty tuple
        is returned if the manga API result has no "chapter" entry.
        """
        data = self.manga_data(self.manga_id)
        if "chapter" not in data:
            return ()
        manga = data["manga"]

        results = []
        for chid, info in data["chapter"].items():
            # split e.g. "12.5" into "12", "." and "5"; without a "." both
            # 'sep' and 'minor' are empty strings
            chapter, sep, minor = info["chapter"].partition(".")
            # translate mangadex-specific codes, keep all others unchanged
            lang = self.iso639_map.get(info["lang_code"], info["lang_code"])
            results.append({
                "manga": manga["title"],
                "manga_id": self.manga_id,
                "artist": manga["artist"],
                "author": manga["author"],
                "title": text.unescape(info["title"]),
                "volume": text.parse_int(info["volume"]),
                "chapter": text.parse_int(chapter),
                "chapter_minor": sep + minor,
                "chapter_id": text.parse_int(chid),
                "group": text.unescape(info["group_name"]),
                "date": info["timestamp"],
                "lang": lang,
                "language": util.code_to_language(lang),
            })

        # "chapter" is compared as an integer, "chapter_minor" as a string
        results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))
        return results
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`# -- coding: utf-8 --`

[mangadex] handle manga pages without chapters 2019-01-03 11:52:00 +01:00			`# Copyright 2018-2019 Mike Fährmann`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract manga-chapters and entire manga from https://mangadex.org/"""`

[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`from .common import Extractor, Message`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`from .. import text, util`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00

[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`class MangadexExtractor(Extractor):`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"""Base class for mangadex extractors"""`
			`category = "mangadex"`
			`root = "https://mangadex.org"`

[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`# mangadex-to-iso639-1 codes`
			`iso639_map = {`
			`"br": "pt",`
			`"ct": "ca",`
			`"gb": "en",`
			`"vn": "vi",`
			`}`

[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`def chapter_data(self, chapter_id):`
			`"""Request API results for 'chapter_id'"""`
			`url = "{}/api/chapter/{}".format(self.root, chapter_id)`
			`return self.request(url).json()`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`def manga_data(self, manga_id, *, cache={}):`
			`"""Request API results for 'manga_id'"""`
			`if manga_id not in cache:`
			`url = "{}/api/manga/{}".format(self.root, manga_id)`
			`cache[manga_id] = self.request(url).json()`
			`return cache[manga_id]`


			`class MangadexChapterExtractor(MangadexExtractor):`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"""Extractor for manga-chapters from mangadex.org"""`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`subcategory = "chapter"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = (`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"{category}", "{manga}",`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`"{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}")`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`filename_fmt = (`
			`"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")`
[mangadex] general improvements - support >100 chapter entries per manga - custom archive ID format - detect non-existing chapters 2018-03-06 14:15:15 +01:00			`archive_fmt = "{chapter_id}_{page}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = r"(?:https?://)?(?:www\.)?mangadex\.(?:org\|com)/chapter/(\d+)"`
			`test = (`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`("https://mangadex.org/chapter/122094", {`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"keyword": "7bd7f82ab9d3f06976c4b68afe78d0040851ac3c",`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",`
			`}),`
			`# oneshot`
			`("https://mangadex.org/chapter/138086", {`
			`"count": 64,`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"keyword": "435e157dc5529d152458ba751ffe5bfbaf4850fb",`
[mangadex] fix parsing of unusual chapter strings 2018-05-23 18:37:12 +02:00			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`MangadexExtractor.__init__(self, match)`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`self.chapter_id = match.group(1)`
[mangadex] fix parsing of unusual chapter strings 2018-05-23 18:37:12 +02:00			`self.data = None`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`def items(self):`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`data = self.metadata()`
			`imgs = self.images()`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`data["count"] = len(imgs)`

			`yield Message.Version, 1`
			`yield Message.Directory, data`
			`for data["page"], url in enumerate(imgs, 1):`
			`yield Message.Url, url, text.nameext_from_url(url, data)`
[mangadex] add title info for chapter extractors 2018-04-21 16:18:03 +02:00
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def metadata(self):`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"""Return a dict with general metadata"""`
			`cdata = self.chapter_data(self.chapter_id)`
			`mdata = self.manga_data(cdata["manga_id"])`
			`self.data = cdata`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`chapter, sep, minor = cdata["chapter"].partition(".")`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`return {`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`"manga": mdata["manga"]["title"],`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"manga_id": cdata["manga_id"],`
			`"artist": mdata["manga"]["artist"],`
			`"author": mdata["manga"]["author"],`
			`"title": text.unescape(cdata["title"]),`
			`"volume": text.parse_int(cdata["volume"]),`
[mangadex] fix parsing of unusual chapter strings 2018-05-23 18:37:12 +02:00			`"chapter": text.parse_int(chapter),`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`"chapter_minor": sep + minor,`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"chapter_id": cdata["id"],`
			`"group": mdata["chapter"][self.chapter_id]["group_name"],`
			`"date": cdata["timestamp"],`
			`"lang": util.language_to_code(cdata["lang_name"]),`
			`"language": cdata["lang_name"],`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`}`

change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def images(self):`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"""Return a list of all image URLs"""`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`base = self.data["server"] + self.data["hash"] + "/"`
[mangadex] fix relative page URLs 2018-08-25 11:06:28 +02:00			`if base.startswith("/"):`
			`base = text.urljoin(self.root, base)`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`return [base + page for page in self.data["page_array"]]`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00

[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`class MangadexMangaExtractor(MangadexExtractor):`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"""Extractor for manga from mangadex.org"""`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`subcategory = "manga"`
			`categorytransfer = True`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:www\.)?mangadex\.(?:org\|com)"`
			`r"/(?:title\|manga)/(\d+)")`
			`test = (`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`("https://mangadex.org/manga/2946/souten-no-koumori", {`
[mangadex] handle manga pages without chapters 2019-01-03 11:52:00 +01:00			`"pattern": r"https://mangadex.org/chapter/\d+",`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"keywords": {`
			`"manga": "Souten no Koumori",`
			`"manga_id": 2946,`
			`"title": "Oneshot",`
[mangadex] general improvements - support >100 chapter entries per manga - custom archive ID format - detect non-existing chapters 2018-03-06 14:15:15 +01:00			`"volume": 0,`
			`"chapter": 0,`
			`"chapter_minor": "",`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"chapter_id": int,`
			`"group": str,`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`"date": int,`
[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`"lang": str,`
			`"language": str,`
			`},`
			`}),`
[mangadex] fix manga extraction (closes #84) Chapter listings for manga now use https://mangadex.org/manga/<id>/_/chapters/2/ as URL instead of https://mangadex.org/manga/<id>/_//2/ 2018-05-06 17:43:50 +02:00			`("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", {`
			`"count": ">= 100",`
			`}),`
[mangadex] handle manga pages without chapters 2019-01-03 11:52:00 +01:00			`("https://mangadex.org/title/13004/yorumori-no-kuni-no-sora-ni", {`
			`"count": 0,`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`("https://mangadex.org/title/2946/souten-no-koumori"),`
			`)`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`MangadexExtractor.__init__(self, match)`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`self.manga_id = text.parse_int(match.group(1))`

			`def items(self):`
			`yield Message.Version, 1`
			`for data in self.chapters():`
			`url = "{}/chapter/{}".format(self.root, data["chapter_id"])`
			`yield Message.Queue, url, data`

			`def chapters(self):`
			`"""Return a sorted list of chapter-metadata dicts"""`
			`data = self.manga_data(self.manga_id)`
[mangadex] handle manga pages without chapters 2019-01-03 11:52:00 +01:00			`if "chapter" not in data:`
			`return ()`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`manga = data["manga"]`

[mangadex] add chapter- and manga-extractor 2018-03-05 18:37:21 +01:00			`results = []`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`for chid, info in data["chapter"].items():`
			`chapter, sep, minor = info["chapter"].partition(".")`
			`lang = self.iso639_map.get(info["lang_code"], info["lang_code"])`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`results.append({`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`"manga": manga["title"],`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`"manga_id": self.manga_id,`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`"artist": manga["artist"],`
			`"author": manga["author"],`
			`"title": text.unescape(info["title"]),`
			`"volume": text.parse_int(info["volume"]),`
			`"chapter": text.parse_int(chapter),`
			`"chapter_minor": sep + minor,`
			`"chapter_id": text.parse_int(chid),`
			`"group": text.unescape(info["group_name"]),`
			`"date": info["timestamp"],`
			`"lang": lang,`
			`"language": util.code_to_language(lang),`
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`})`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00
[mangadex] improve extraction - cache manga API results - add artist, author and date fields to chapter metadata - remove Manga-/ChapterExtractor inheritance - minor code simplifications and improvements 2018-08-10 16:26:10 +02:00			`results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))`
[mangadex] fix extraction 2018-08-08 18:08:26 +02:00			`return results`