gallery-dl/gallery_dl/extractor/senmanga.py

# -*- coding: utf-8 -*-

# Copyright 2016-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract manga-chapters from from http://raw.senmanga.com/"""

from .common import Extractor, Message
from .. import text


class SenmangaChapterExtractor(Extractor):
    """Extractor for a single manga chapter hosted on raw.senmanga.com"""
    category = "senmanga"
    subcategory = "chapter"
    directory_fmt = ["{category}", "{manga}", "c{chapter:>03}"]
    filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
    pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]
    test = [("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
        "url": "32d88382fcad66859d089cd9a61249f375492ec5",
        "keyword": "bd25a8d00c8507faa5cdd6146a872797486fbf93",
        "content": "a791dda85ac0d37e3b36d754560cbb65b8dab5b9",
    })]
    url_base = "http://raw.senmanga.com"

    def __init__(self, match):
        """Derive the chapter and image URLs from the matched URL.

        'match' is the result of matching one of the class patterns;
        its first group holds the two path segments after the host.
        """
        Extractor.__init__(self)
        path = match.group(1)
        root = self.url_base
        self.chapter_url = "{}/{}/".format(root, path)
        self.img_url = "{}/viewer/{}/".format(root, path)

        # Every request made through the session carries the chapter
        # page as referer and a fixed user-agent string.
        headers = self.session.headers
        headers["Referer"] = self.chapter_url
        headers["User-Agent"] = "Mozilla 5.0"

    def items(self):
        """Yield the version, directory and per-page URL messages.

        The same metadata dict is updated in place and re-yielded for
        every page; page numbers start at 1 and are passed as strings.
        """
        metadata = self.get_job_metadata()
        yield Message.Version, 1
        yield Message.Directory, metadata

        last_page = int(metadata["count"])
        for number in range(1, last_page + 1):
            page = str(number)
            # The extension is left empty here; presumably it gets
            # filled in later from the response (TODO confirm).
            metadata.update(page=page, extension="")
            yield Message.Url, self.img_url + page, metadata

    def get_job_metadata(self):
        """Fetch the chapter page and build the metadata dict.

        Returns a dict with the keys "manga", "chapter", "count",
        "lang" and "language". "count" and "chapter" are kept exactly
        as extracted from the page.
        """
        html = self.request(self.chapter_url).text

        # Page count is searched for in the markup following <title>.
        title, offset = text.extract(html, '<title>', '</title>')
        count, _ = text.extract(html, '</select> of ', ' ', offset)

        # Manga name and chapter are both taken from the title text;
        # the chapter search resumes where the manga search stopped.
        manga, offset = text.extract(title, '| Raw | ', '  |  Chapter ')
        chapter, _ = text.extract(title, '', ' |  Page ', offset)

        manga_name = text.unescape(manga.replace("-", " "))
        return {
            "manga": manga_name,
            "chapter": chapter,
            "count": count,
            "lang": "jp",
            "language": "Japanese",
        }
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`# -- coding: utf-8 --`

share extractor and downloader sessions There was never any "good" reason for the strict separation between extractors and downloaders. This change allows for reduced resource usage (probably unnoticeable) and less lines of code at the "cost" of tighter coupling. 2017-06-30 19:38:14 +02:00			`# Copyright 2016-2017 Mike Fährmann`
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract manga-chapters from from http://raw.senmanga.com/"""`

			`from .common import Extractor, Message`
			`from .. import text`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`class SenmangaChapterExtractor(Extractor):`
			`"""Extractor for manga-chapters from raw.senmanga.com"""`
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`category = "senmanga"`
			`subcategory = "chapter"`
			`directory_fmt = ["{category}", "{manga}", "c{chapter:>03}"]`
			`filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"`
			`pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]`
			`test = [("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {`
			`"url": "32d88382fcad66859d089cd9a61249f375492ec5",`
allow extension by Content-Type for exhentai, seiga, senmanga 2016-09-30 16:43:43 +02:00			`"keyword": "bd25a8d00c8507faa5cdd6146a872797486fbf93",`
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`"content": "a791dda85ac0d37e3b36d754560cbb65b8dab5b9",`
			`})]`
			`url_base = "http://raw.senmanga.com"`

			`def __init__(self, match):`
			`Extractor.__init__(self)`
			`part = match.group(1)`
			`self.chapter_url = "{}/{}/".format(self.url_base, part)`
			`self.img_url = "{}/viewer/{}/".format(self.url_base, part)`
			`self.session.headers["Referer"] = self.chapter_url`
			`self.session.headers["User-Agent"] = "Mozilla 5.0"`

			`def items(self):`
			`data = self.get_job_metadata()`
			`yield Message.Version, 1`
			`yield Message.Directory, data`
			`for i in range(int(data["count"])):`
			`page = str(i+1)`
			`data["page"] = page`
allow extension by Content-Type for exhentai, seiga, senmanga 2016-09-30 16:43:43 +02:00			`data["extension"] = ""`
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`yield Message.Url, self.img_url + page, data`

			`def get_job_metadata(self):`
			`"""Collect metadata for extractor-job"""`
			`page = self.request(self.chapter_url).text`
			`title, pos = text.extract(page, '<title>', '</title>')`
			`count, pos = text.extract(page, '</select> of ', ' ', pos)`
			`manga, pos = text.extract(title, '\| Raw \| ', ' \| Chapter ')`
			`chapter, pos = text.extract(title, '', ' \| Page ', pos)`
			`return {`
			`"manga": text.unescape(manga.replace("-", " ")),`
			`"chapter": chapter,`
			`"count": count,`
			`"lang": "jp",`
			`"language": "Japanese",`
			`}`