gallery-dl/gallery_dl/extractor/readcomiconline.py

# -*- coding: utf-8 -*-

# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract comic-issues and entire comics from http://readcomiconline.to/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare
import re


class ReadcomiconlineBase():
    """Base class for readcomiconline extractors"""
    category = "readcomiconline"
    directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
    archive_fmt = "{issue_id}_{page}"
    root = "http://readcomiconline.to"
    useragent = "Wget/1.19.2 (linux-gnu)"

    request = cloudflare.request_func


class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
    """Extractor for comics from readcomiconline.to"""
    subcategory = "comic"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to"
               r"/Comic/[^/?&#]+/?)$"]
    test = [
        ("http://readcomiconline.to/Comic/W-i-t-c-h", {
            "url": "c5a530538a30b176916e30cbe223a93d83cb2691",
            "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
        }),
        ("http://readcomiconline.to/Comic/Bazooka-Jules", {
            "url": "e517dca61dff489f18ca781084f59a9eeb60a6b6",
            "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
        }),
    ]

    def __init__(self, match):
        MangaExtractor.__init__(self, match)
        self.session.headers["User-Agent"] = self.useragent

    def chapters(self, page):
        results = []
        comic, pos = text.extract(page, '<div class="heading"><h3>', '<')
        page , pos = text.extract(page, '<ul class="list">', '</ul>', pos)

        for item in text.extract_iter(page, '<a href="', '</span>'):
            url, _, issue = item.partition('"><span>')
            if issue.startswith('Issue #'):
                issue = issue[7:]
            results.append((self.root + url, {
                "comic": comic, "issue": issue,
                "issue_id": text.parse_int(url.rpartition("=")[2]),
                "lang": "en", "language": "English",
            }))
        return results


class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    """Extractor for comic-issues from readcomiconline.to"""
    subcategory = "issue"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+)"]
    test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
        "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
        "keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",
    })]

    def __init__(self, match):
        ChapterExtractor.__init__(self, match.group(0))
        self.issue_id = match.group(1)
        self.session.headers["User-Agent"] = self.useragent

    def get_metadata(self, page):
        comic, pos = text.extract(page, "   - Read\r\n    ", "\r\n")
        iinfo, pos = text.extract(page, "    ", "\r\n", pos)
        match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
        return {
            "comic": comic,
            "issue": match.group(1) or match.group(2),
            "issue_id": text.parse_int(self.issue_id),
            "lang": "en",
            "language": "English",
        }

    def get_images(self, page):
        self.session.headers["Referer"] = None
        return [
            (url, None)
            for url in text.extract_iter(
                page, 'lstImages.push("', '"'
            )
        ]
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`# -- coding: utf-8 --`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`# Copyright 2016-2018 Mike Fährmann`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract comic-issues and entire comics from http://readcomiconline.to/"""`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`from .common import ChapterExtractor, MangaExtractor`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`from .. import text, cloudflare`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`import re`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineBase():`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Base class for readcomiconline extractors"""`
			`category = "readcomiconline"`
			`directory_fmt = ["{category}", "{comic}", "{issue:>03}"]`
			`filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`archive_fmt = "{issue_id}_{page}"`
rewrite parts of the cloudflare bypass system 2016-12-16 13:28:36 +01:00			`root = "http://readcomiconline.to"`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`useragent = "Wget/1.19.2 (linux-gnu)"`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`request = cloudflare.request_func`
add 'extractor.*.user-agent' config option 2017-11-15 13:54:40 +01:00
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Extractor for comics from readcomiconline.to"""`
			`subcategory = "comic"`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to"`
			`r"/Comic/[^/?&#]+/?)$"]`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`test = [`
			`("http://readcomiconline.to/Comic/W-i-t-c-h", {`
			`"url": "c5a530538a30b176916e30cbe223a93d83cb2691",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
			`("http://readcomiconline.to/Comic/Bazooka-Jules", {`
			`"url": "e517dca61dff489f18ca781084f59a9eeb60a6b6",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
			`]`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def __init__(self, match):`
			`MangaExtractor.__init__(self, match)`
			`self.session.headers["User-Agent"] = self.useragent`

[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`def chapters(self, page):`
			`results = []`
			`comic, pos = text.extract(page, '<div class="heading"><h3>', '<')`
			`page , pos = text.extract(page, '<ul class="list">', '</ul>', pos)`

			`for item in text.extract_iter(page, '<a href="', '</span>'):`
			`url, _, issue = item.partition('"><span>')`
			`if issue.startswith('Issue #'):`
			`issue = issue[7:]`
			`results.append((self.root + url, {`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"comic": comic, "issue": issue,`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`"issue_id": text.parse_int(url.rpartition("=")[2]),`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`"lang": "en", "language": "English",`
			`}))`
			`return results`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Extractor for comic-issues from readcomiconline.to"""`
			`subcategory = "issue"`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`r"/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+)"]`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {`
[gelbooru] tag-splitting for non-api mode 2018-07-06 15:18:49 +02:00			`"url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`})]`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def __init__(self, match):`
			`ChapterExtractor.__init__(self, match.group(0))`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`self.issue_id = match.group(1)`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`self.session.headers["User-Agent"] = self.useragent`

			`def get_metadata(self, page):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`comic, pos = text.extract(page, " - Read\r\n ", "\r\n")`
			`iinfo, pos = text.extract(page, " ", "\r\n", pos)`
			`match = re.match(r"(?:Issue )?#(\d+)\|(.+)", iinfo)`
			`return {`
			`"comic": comic,`
			`"issue": match.group(1) or match.group(2),`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`"issue_id": text.parse_int(self.issue_id),`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"lang": "en",`
			`"language": "English",`
			`}`
[kissmanga] re-enable module 2017-04-05 12:16:23 +02:00
smaller changes and fixes - fix the cloudflare challenge result if the last decimal places are zero (JS`s toFixed() removes trailing zeroes) - fix downloading of kissmanga chapter-pages hosted on blogspot (accessing blogspot with "kissmanga.com" as referrer yields a 401) - disable certificate validation for 'mangahere' tests - update flickr test result 2018-04-06 15:30:09 +02:00			`def get_images(self, page):`
			`self.session.headers["Referer"] = None`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`return [`
			`(url, None)`
			`for url in text.extract_iter(`
			`page, 'lstImages.push("', '"'`
			`)`
			`]`