[mangadex] add chapter- and manga-extractor

2024-11-22 10:42:34 +01:00 · 2018-03-05 18:37:21 +01:00 · 2018-03-05 18:37:21 +01:00 · 749fbbfa6c
commit 749fbbfa6c
parent b58449fd88
8 changed files with 293 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,7 @@
 # Changelog

+## Unreleased
+
 ## 1.3.0 - 2018-03-02
 - Added `--proxy` to explicitly specify a proxy server ([#76](https://github.com/mikf/gallery-dl/issues/76))
 - Added options to customize [archive ID formats](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorarchive-format) and [undefined replacement fields](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorkeywords-default)
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@ -0,0 +1,136 @@
+{
+    "base-directory": "/tmp/",
+    "netrc": false,
+
+    "downloader":
+    {
+        "part": true,
+        "part-directory": null,
+        "http":
+        {
+            "rate": null,
+            "retries": 5,
+            "timeout": 30,
+            "verify": true
+        }
+    },
+    "extractor":
+    {
+        "archive": null,
+        "proxy": null,
+        "skip": true,
+        "sleep": 0,
+
+        "pixiv":
+        {
+            "user":
+            {
+                "directory": ["{category}", "{user[id]}"]
+            },
+            "bookmark":
+            {
+                "directory": ["{category}", "my bookmarks"]
+            },
+            "ugoira": true,
+            "username": null,
+            "password": null
+        },
+        "batoto":
+        {
+            "username": null,
+            "password": null
+        },
+        "exhentai":
+        {
+            "wait-min": 3,
+            "wait-max": 6,
+            "original": true,
+            "username": null,
+            "password": null,
+            "cookies": {
+                "igneous": null,
+                "s": null,
+                "yay": "louder"
+            }
+        },
+        "nijie":
+        {
+            "username": null,
+            "password": null
+        },
+        "sankaku":
+        {
+            "wait-min": 2,
+            "wait-max": 4,
+            "username": null,
+            "password": null
+        },
+        "seiga":
+        {
+            "username": null,
+            "password": null
+        },
+        "gelbooru":
+        {
+            "filename": "{category}_{id:>07}_{md5}.{extension}",
+            "api": true
+        },
+        "reddit":
+        {
+            "refresh-token": null,
+            "comments": 500,
+            "morecomments": false,
+            "date-min": 0,
+            "date-max": 253402210800,
+            "date-format": "%Y-%m-%dT%H:%M:%S",
+            "id-min": "0",
+            "id-max": "ZIK0ZJ",
+            "recursion": 0
+        },
+        "flickr":
+        {
+            "access-token": null,
+            "access-token-secret": null,
+            "metadata": false,
+            "size-max": null
+        },
+        "deviantart":
+        {
+            "refresh-token": null,
+            "flat": true,
+            "mature": true,
+            "original": true
+        },
+        "gfycat":
+        {
+            "format": "mp4"
+        },
+        "imgur":
+        {
+            "mp4": true
+        },
+        "tumblr":
+        {
+            "posts": "photo",
+            "inline": false,
+            "reblogs": true,
+            "external": false
+        },
+        "recursive":
+        {
+            "blacklist": ["directlink", "oauth", "recursive", "test"]
+        },
+        "oauth":
+        {
+            "browser": true
+        }
+    },
+    "output":
+    {
+        "mode": "auto",
+        "shorten": true,
+        "progress": true,
+        "logfile": null,
+        "unsupportedfile": null
+    }
+}
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@ -47,6 +47,7 @@ Luscious             https://luscious.net/               Albums
 Manga Fox            http://fanfox.net/                  Chapters
 Manga Here           http://www.mangahere.co/            Chapters, Manga
 Manga Stream         https://mangastream.com/            Chapters
+MangaDex             https://mangadex.org/               Chapters, Manga
 Mangapanda           https://www.mangapanda.com/         Chapters, Manga
 MangaPark            https://mangapark.me/               Chapters, Manga
 Mangareader          https://www.mangareader.net/        Chapters, Manga
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -51,6 +51,7 @@ modules = [
    "konachan",
    "loveisover",
    "luscious",
+    "mangadex",
    "mangafox",
    "mangahere",
    "mangapanda",
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://mangadex.org/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+from urllib.parse import urljoin
+import json
+import re
+
+
+class MangadexExtractor:
+    """Attributes shared by all mangadex extractors"""
+    root = "https://mangadex.org"
+    category = "mangadex"
+
+
+class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
+    """Extractor for manga-chapters from mangadex.org"""
+    pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"]
+    test = [
+        ("https://mangadex.org/chapter/122094", {
+            "keyword": "b4c83fe41f125eae745c2e00d29e087cc4eb78df",
+            "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
+        }),
+        # oneshot
+        ("https://mangadex.org/chapter/138086", {
+            "count": 64,
+            "keyword": "9b1b7292f7dbcf10983fbdc34b8cdceeb47328ee",
+        }),
+    ]
+
+    def __init__(self, match):
+        """Build the chapter URL from the ID captured by 'pattern'"""
+        self.chapter_id = match.group(1)
+        url = self.root + "/chapter/" + self.chapter_id
+        ChapterExtractor.__init__(self, url)
+
+    def get_metadata(self, page):
+        """Return a dict with the chapter metadata found in 'page'
+
+        The "og:title" value must start with either
+        "[Vol. <n> ]Ch. <n>[<minor>] (<manga>)" or "<title> (<manga>)".
+        """
+        # each call continues at the position returned by the previous one
+        info    , pos = text.extract(page, '="og:title" content="', '"')
+        manga_id, pos = text.extract(page, '/images/manga/', '.', pos)
+        # move past ' id="jump_group"' and the following ' selected '
+        _       , pos = text.extract(page, ' id="jump_group"', '', pos)
+        _       , pos = text.extract(page, ' selected ', '', pos)
+        # the title='...' lookup starts 100 characters before 'pos';
+        # presumably the attribute precedes the ' selected ' marker
+        language, ___ = text.extract(page, " title='", "'", pos-100)
+        group   , pos = text.extract(page, '>', '<', pos)
+
+        info = text.unescape(info)
+        # groups: 1 = volume, 2 = chapter, 3 = chapter suffix,
+        #         4 = free-form title, 5 = manga name in parentheses
+        # NOTE(review): 'match' is None if the title has another format,
+        # in which case the group() calls below raise AttributeError
+        match = re.match(
+            r"(?:(?:Vol\. (\d+) )?Ch\. (\d+)([^ ]*)|(.*)) "
+            r"\(([^)]+)\)",
+            info)
+
+        # str.rstrip() takes a set of characters, not a suffix, and could
+        # remove trailing characters of the title itself; cut off the
+        # literal site-name suffix instead
+        suffix = " - MangaDex"
+        chapter_string = info
+        if chapter_string.endswith(suffix):
+            chapter_string = chapter_string[:-len(suffix)]
+
+        return {
+            "manga": match.group(5),
+            "manga_id": util.safe_int(manga_id),
+            "volume": util.safe_int(match.group(1)),
+            "chapter": util.safe_int(match.group(2)),
+            "chapter_minor": match.group(3) or "",
+            "chapter_id": util.safe_int(self.chapter_id),
+            "chapter_string": chapter_string,
+            "group": text.unescape(group),
+            "lang": util.language_to_code(language),
+            "language": language,
+        }
+
+    def get_images(self, page):
+        """Return a list of (image-URL, None) tuples, one per chapter page"""
+        dataurl , pos = text.extract(page, "var dataurl = '", "'")
+        pagelist, pos = text.extract(page, "var page_array = [", "]", pos)
+        server  , pos = text.extract(page, "var server = '", "'", pos)
+
+        # urljoin() resolves 'server' against the site root in case it
+        # is a relative path
+        base = urljoin(self.root, server + dataurl + "/")
+
+        # convert the JavaScript array body (single-quoted strings and a
+        # possible trailing comma) into valid JSON before parsing it
+        filenames = json.loads(
+            "[" + pagelist.replace("'", '"').rstrip(",") + "]")
+
+        return [(base + name, None) for name in filenames]
+
+
+class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
+    """Extractor for manga from mangadex.org"""
+    pattern = [r"(?:https?://)?(?:www\.)?(mangadex\.(?:org|com)/manga/\d+)"]
+    test = [
+        ("https://mangadex.org/manga/2946/souten-no-koumori", {
+            "url": "9e77934759828458d0424473922e41f348719472",
+            "keywords": {
+                "manga": "Souten no Koumori",
+                "manga_id": 2946,
+                "title": "Oneshot",
+                "volume": int,
+                "chapter": int,
+                "chapter_minor": str,
+                "chapter_id": int,
+                "group": str,
+                "contributor": str,
+                "date": str,
+                "views": int,
+                "lang": str,
+                "language": str,
+            },
+        }),
+    ]
+
+    def chapters(self, page):
+        """Return a list of (chapter-URL, metadata-dict) tuples
+
+        One entry is created for each '<tr id=' ... '</tr>' section
+        found in 'page'.
+        """
+        results = []
+        extr = text.extract
+
+        # values shared by all chapters; the manga name is the part of
+        # the "og:title" value in front of its last " ("
+        manga = text.unescape(extr(
+            page, '"og:title" content="', '"')[0].rpartition(" (")[0])
+        manga_id = util.safe_int(extr(
+            page, '/images/manga/', '.')[0])
+
+        for info in text.extract_iter(page, "<tr id=", "</tr>"):
+            # the fields are looked up in the order given here; each
+            # call continues at the position returned by the previous one
+            chid    , pos = extr(info, 'data-chapter-id="', '"')
+            chapter , pos = extr(info, 'data-chapter-num="', '"', pos)
+            volume  , pos = extr(info, 'data-volume-num="', '"', pos)
+            title   , pos = extr(info, 'data-chapter-name="', '"', pos)
+            language, pos = extr(info, " title='", "'", pos)
+            group   , pos = extr(info, "<td>", "</td>", pos)
+            user    , pos = extr(info, "<td>", "</td>", pos)
+            views   , pos = extr(info, ">", "<", pos)
+            date    , pos = extr(info, ' datetime="', '"', pos)
+
+            # split e.g. "10.5" into "10" and the minor part ".5"
+            chapter, sep, minor = chapter.partition(".")
+
+            results.append((self.root + "/chapter/" + chid, {
+                "manga": manga,
+                # already converted to an integer above
+                "manga_id": manga_id,
+                "title": text.unescape(title),
+                "volume": util.safe_int(volume),
+                "chapter": util.safe_int(chapter),
+                "chapter_minor": sep + minor,
+                "chapter_id": util.safe_int(chid),
+                "group": text.unescape(text.remove_html(group)),
+                "contributor": text.remove_html(user),
+                "views": util.safe_int(views),
+                "date": date,
+                "lang": util.language_to_code(language),
+                "language": language,
+            }))
+
+        return results
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@ -155,6 +155,8 @@ def language_to_code(lang, default=None):

 CODES = {
    "ar": "Arabic",
+    "bg": "Bulgarian",
+    "ca": "Catalan",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.3.0"
+__version__ = "1.3.1-dev"
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@ -18,7 +18,9 @@ SKIP = {
    "archivedmoe", "archiveofsins", "thebarchive",

    # temporary issues
+    "imgchili",
    "powermanga",
+    "pinterest",
 }