gallery-dl/gallery_dl/extractor/mangahere.py

# -*- coding: utf-8 -*-

# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract manga-chapters and entire manga from https://www.mangahere.cc/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from ..cache import memcache
from urllib.parse import urljoin
import re


class MangahereBase:
    """Settings shared by the mangahere extractor classes"""
    # identifier shared by the extractor classes deriving from this base
    category = "mangahere"
    # scheme and host from which request URLs are built
    root = "https://www.mangahere.cc"
    # template with two placeholders, filled via str.format()
    url_fmt = "".join((root, "/manga/{}/{}.html"))


class MangahereMangaExtractor(MangahereBase, MangaExtractor):
    """Extractor for manga from mangahere.cc"""
    # accepts an optional scheme, an optional "www." or "m." prefix and
    # both the ".cc" and ".co" TLD; group 1 is the manga's path segment
    pattern = [r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
               r"([^/]+)/?(?:#.*)?$"]
    test = [
        ("https://www.mangahere.cc/manga/aria/", {
            "url": "e8971b1605d9888d978ebb2895adb1c7c37d663c",
            "keyword": "951eef36a3775525a31ca78c9d9cea546f4cf2f5",
        }),
        ("http://www.mangahere.cc/manga/hiyokoi#50", {
            "url": "6df27c0e105d9ee0b78a7aa77340d0891e6c7fc6",
            "keyword": "9542283639bd082fabf3a14b6695697d3ef15111",
        }),
        ("http://www.mangahere.co/manga/aria/", None),
        ("http://m.mangahere.co/manga/aria/", None),
    ]

    def __init__(self, match):
        """Initialize with the canonical manga URL built from 'match'"""
        slug = match.group(1)
        manga_url = "{}/manga/{}/".format(self.root, slug)
        MangaExtractor.__init__(self, match, manga_url)

    def chapters(self, page):
        """Return a list of (chapter-url, metadata) tuples found in 'page'

        Raises ValueError (from str.index) if 'page' does not contain
        the '<div class="detail_list">' marker.
        """
        found = []
        offset = page.index('<div class="detail_list">')
        name, offset = text.extract(
            page, '<h3>Read ', ' Online</h3>', offset)
        name = text.unescape(name)

        while True:
            href, offset = text.extract(
                page, '<a class="color_0077" href="', '"', offset)
            if not href:
                # no further chapter links
                break

            # drop the last character of the link (presumably a trailing
            # slash), then split what follows the final "/c" at the first "."
            number = href[:-1].rpartition("/c")[2]
            major, sep, minor = number.partition(".")

            vol, offset = text.extract(
                page, 'span class="mr6">', '<', offset)
            heading, offset = text.extract(page, '/span>', '<', offset)
            released, offset = text.extract(
                page, 'class="right">', '</span>', offset)

            # joining with "http:" supplies a scheme for links without one
            chapter_url = urljoin("http:", href)
            info = {
                "manga": name, "title": heading, "date": released,
                # only the text after the last space is parsed as a number
                "volume": util.safe_int(vol.rpartition(" ")[2]),
                "chapter": util.safe_int(major),
                "chapter_minor": sep + minor,
                "lang": "en", "language": "English",
            }
            found.append((chapter_url, info))

        return found


class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
    """Extractor for manga-chapters from mangahere.cc"""
    # group 1: path after "/manga/" up to and including the chapter part
    # group 2: volume number without leading zeros (optional)
    # group 3: chapter identifier following "/c"
    pattern = [(r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
                r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))")]
    test = [
        ("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", {
            "keyword": "0e1cee6dd377da02ad51aa810ba65db3e811aef9",
            "content": "708d475f06893b88549cbd30df1e3f9428f2c884",
        }),
        ("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/", None),
        ("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/", None),
    ]

    def __init__(self, match):
        """Store the matched URL parts and set up the first page's URL"""
        groups = match.groups()
        self.part, self.volume, self.chapter = groups
        # remove ".html" for the first chapter page to avoid redirects
        first_page = self.url_fmt.format(self.part, "")
        ChapterExtractor.__init__(self, first_page[:-5])

    def get_metadata(self, page):
        """Build the metadata dictionary for this chapter from 'page'"""
        heading, pos = text.extract(page, '<title>', '</title>')
        manga_id, pos = text.extract(page, '.net/store/manga/', '/', pos)
        selector, pos = text.extract(
            page, ' class="wid60"', '</select>', pos)

        # the last number found between ">" and "<" inside the extracted
        # select markup is used as the page count
        numbers = re.findall(r">(\d+)<", selector)
        count = numbers[-1]

        # keep only the leading part of the <title> text, i.e. everything
        # before the chapter number and the "Read ... Online" suffix
        title_pattern = (r"(.+) \d+(\.\d+)? - Read .+ Chapter "
                         r"\d+(\.\d+)? Online")
        manga = re.match(title_pattern, heading).group(1)

        major, sep, minor = self.chapter.partition(".")

        metadata = {
            "manga": text.unescape(manga),
            "manga_id": util.safe_int(manga_id),
            "title": self._get_title_map(manga_id).get(self.chapter),
            "volume": util.safe_int(self.volume),
            "chapter": util.safe_int(major),
            "chapter_minor": sep + minor,
            "count": util.safe_int(count),
            "lang": "en",
            "language": "English",
        }
        return metadata

    def get_images(self, page):
        """Yield (image-url, None) tuples, two per fetched chapter page

        This generator never terminates on its own; it keeps requesting
        every second page until the consumer stops iterating.
        """
        needle = '<img src="'
        pnum = 1
        while True:
            # the first image on the page
            current, pos = text.extract(page, needle, '"')
            yield current, None

            # the second and third images are skipped ...
            for _ in range(2):
                _, pos = text.extract(page, needle, '"', pos)
            # ... and the fourth one is yielded as well
            following, pos = text.extract(page, needle, '"', pos)
            yield following, None

            # two images were taken from this page, so advance by two
            pnum += 2
            next_url = self.url_fmt.format(self.part, pnum)
            page = self.request(next_url).text

    @memcache(keyarg=1)
    def _get_title_map(self, manga_id):
        """Return a dict mapping chapter identifiers to chapter titles

        The data comes from the site's "get_chapters<manga_id>.js" file.
        NOTE(review): presumably memcache() caches the result per
        'manga_id' -- confirm against the cache module.
        """
        script_url = "{}/get_chapters{}.js".format(self.root, manga_id)
        script = self.request(script_url).text

        title_map = {}
        for entry in text.extract_iter(script, '["', '"]'):
            label, _, link = entry.partition('","')
            # keep only what follows the first ": " of the label
            name = label.partition(": ")[2]
            # chapter identifier: text after the last "c" of the link,
            # without trailing slashes
            key = link.rpartition("c")[2].rstrip("/")
            title_map[key] = text.unescape(name)

        return title_map
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`# -- coding: utf-8 --`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`# Copyright 2015-2018 Mike Fährmann`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`"""Extract manga-chapters and entire manga from http://www.mangahere.co/"""`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`from .common import ChapterExtractor, MangaExtractor`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 2017-09-24 15:59:25 +02:00			`from .. import text, util`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`from ..cache import memcache`
[mangahere] fix extraction would switch to HTTPS, but there seem to be certificate issues 2017-09-26 17:08:59 +02:00			`from urllib.parse import urljoin`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`import re`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`class MangahereBase():`
			`"""Base class for mangahere extractors"""`
[mangahere] add manga-extractor 2015-11-28 00:11:28 +01:00			`category = "mangahere"`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`root = "https://www.mangahere.cc"`
			`url_fmt = root + "/manga/{}/{}.html"`


			`class MangahereMangaExtractor(MangahereBase, MangaExtractor):`
			`"""Extractor for manga from mangahere.cc"""`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`pattern = [r"(?:https?://)?(?:www\.\|m\.)?mangahere\.c[co]/manga/"`
			`r"([^/]+)/?(?:#.*)?$"]`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`test = [`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`("https://www.mangahere.cc/manga/aria/", {`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`"url": "e8971b1605d9888d978ebb2895adb1c7c37d663c",`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`"keyword": "951eef36a3775525a31ca78c9d9cea546f4cf2f5",`
			`}),`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`("http://www.mangahere.cc/manga/hiyokoi#50", {`
			`"url": "6df27c0e105d9ee0b78a7aa77340d0891e6c7fc6",`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`"keyword": "9542283639bd082fabf3a14b6695697d3ef15111",`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`}),`
			`("http://www.mangahere.co/manga/aria/", None),`
			`("http://m.mangahere.co/manga/aria/", None),`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`]`
[mangahere] add manga-extractor 2015-11-28 00:11:28 +01:00
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`def __init__(self, match):`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`url = "{}/manga/{}/".format(self.root, match.group(1))`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`MangaExtractor.__init__(self, match, url)`

simplify code by using a MangaExtractor base class 2017-05-20 11:27:43 +02:00			`def chapters(self, page):`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`results = []`
			`pos = page.index('<div class="detail_list">')`
			`manga, pos = text.extract(page, '<h3>Read ', ' Online</h3>', pos)`
			`manga = text.unescape(manga)`

			`while True:`
			`url, pos = text.extract(`
			`page, '<a class="color_0077" href="', '"', pos)`
			`if not url:`
			`return results`
			`chapter, dot, minor = url[:-1].rpartition("/c")[2].partition(".")`
			`volume, pos = text.extract(page, 'span class="mr6">', '<', pos)`
			`title, pos = text.extract(page, '/span>', '<', pos)`
			`date, pos = text.extract(page, 'class="right">', '</span>', pos)`
[mangahere] fix extraction would switch to HTTPS, but there seem to be certificate issues 2017-09-26 17:08:59 +02:00			`results.append((urljoin("http:", url), {`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`"manga": manga, "title": title, "date": date,`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 2017-09-24 15:59:25 +02:00			`"volume": util.safe_int(volume.rpartition(" ")[2]),`
			`"chapter": util.safe_int(chapter),`
			`"chapter_minor": dot + minor,`
[mangahere] extract manga metadata 2017-09-22 14:55:37 +02:00			`"lang": "en", "language": "English",`
			`}))`
[mangahere] add manga-extractor 2015-11-28 00:11:28 +01:00

[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`class MangahereChapterExtractor(MangahereBase, ChapterExtractor):`
			`"""Extractor for manga-chapters from mangahere.cc"""`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`pattern = [(r"(?:https?://)?(?:www\.\|m\.)?mangahere\.c[co]/manga/"`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))")]`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`test = [`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", {`
			`"keyword": "0e1cee6dd377da02ad51aa810ba65db3e811aef9",`
			`"content": "708d475f06893b88549cbd30df1e3f9428f2c884",`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`}),`
			`("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/", None),`
			`("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/", None),`
			`]`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00
			`def __init__(self, match):`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`self.part, self.volume, self.chapter = match.groups()`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00			`# remove ".html" for the first chapter page to avoid redirects`
			`url = self.url_fmt.format(self.part, "")[:-5]`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`ChapterExtractor.__init__(self, url)`
[mangahere] support ".cc" TLD and mobile URLs 2017-12-20 21:34:25 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def get_metadata(self, page):`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`"""Collect metadata for extractor-job"""`
			`manga, pos = text.extract(page, '<title>', '</title>')`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`mid , pos = text.extract(page, '.net/store/manga/', '/', pos)`
[mangahere] fix metadata extraction 2017-11-03 14:54:46 +01:00			`pages, pos = text.extract(page, ' class="wid60"', '</select>', pos)`
			`count = re.findall(r">(\d+)<", pages)[-1]`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`manga = re.match((r"(.+) \d+(\.\d+)? - Read .+ Chapter "`
			`r"\d+(\.\d+)? Online"), manga).group(1)`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`chapter, dot, minor = self.chapter.partition(".")`

[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`return {`
			`"manga": text.unescape(manga),`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`"manga_id": util.safe_int(mid),`
			`"title": self._get_title_map(mid).get(self.chapter),`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 2017-09-24 15:59:25 +02:00			`"volume": util.safe_int(self.volume),`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00			`"chapter": util.safe_int(chapter),`
			`"chapter_minor": dot + minor,`
implement and use 'util.safe_int()' same as Python's 'int()', except it doesn't raise any exceptions and accepts a default value 2017-09-24 15:59:25 +02:00			`"count": util.safe_int(count),`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`"lang": "en",`
			`"language": "English",`
			`}`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def get_images(self, page):`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`"""Yield all image-urls for this chapter"""`
			`pnum = 1`
			`while True:`
			`url, pos = text.extract(page, '<img src="', '"')`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`yield url, None`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`_ , pos = text.extract(page, '<img src="', '"', pos)`
[mangahere] fix parsing 2016-04-20 08:33:06 +02:00			`_ , pos = text.extract(page, '<img src="', '"', pos)`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`url, pos = text.extract(page, '<img src="', '"', pos)`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`yield url, None`

[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`pnum += 2`
			`page = self.request(self.url_fmt.format(self.part, pnum)).text`
[mangahere] extract chapter titles 2018-05-16 16:22:05 +02:00
			`@memcache(keyarg=1)`
			`def _get_title_map(self, manga_id):`
			`url = "{}/get_chapters{}.js".format(self.root, manga_id)`
			`page = self.request(url).text`

			`chapters = {}`
			`for info in text.extract_iter(page, '["', '"]'):`
			`title, _, url = info.partition('","')`
			`title = title.partition(": ")[2]`
			`num = url.rpartition("c")[2].rstrip("/")`
			`chapters[num] = text.unescape(title)`

			`return chapters`