gallery-dl/gallery_dl/extractor/readcomiconline.py

# -*- coding: utf-8 -*-

# Copyright 2016-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract comic-issues and entire comics from https://readcomiconline.to/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare
import re


class ReadcomiconlineBase:
    """Settings shared by every readcomiconline.to extractor"""
    # site identification
    category = "readcomiconline"
    root = "https://readcomiconline.to"

    # templates for the target directory, the file name and the archive id
    directory_fmt = [
        "{category}",
        "{comic}",
        "{issue:>03}",
    ]
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
    archive_fmt = "{issue_id}_{page}"

    # bind the cloudflare module's request function as this class's
    # 'request' attribute
    request = cloudflare.request_func


class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
    """Extractor for comics from readcomiconline.to"""
    subcategory = "comic"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"(/Comic/[^/?&#]+/?)$"]
    test = [
        ("https://readcomiconline.to/Comic/W-i-t-c-h", {
            "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",
            "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
        }),
        ("https://readcomiconline.to/Comic/Bazooka-Jules", {
            "url": "711674cb78ed10bd2557315f7a67552d01b33985",
            "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
        }),
    ]

    def __init__(self, match):
        # first capture group of 'pattern' is the comic's path on the site
        comic_url = self.root + match.group(1)
        MangaExtractor.__init__(self, match, comic_url)

    def chapters(self, page):
        """Return a list of (issue-URL, metadata-dict) tuples from 'page'"""
        heading, pos = text.extract(page, ' class="barTitle">', '<')
        listing, _ = text.extract(
            page, ' class="listing">', '</table>', pos)

        # everything before the last "information" in the heading is taken
        # as the comic title; the not-yet-unescaped form is used to build
        # the marker that is searched for in each link
        raw_title = heading.rpartition("information")[0].strip()
        marker = ' title="Read {} '.format(raw_title)
        comic = text.unescape(raw_title)

        prefix = 'Issue #'
        results = []
        for entry in text.extract_iter(listing, ' href="', ' comic online '):
            # text before the marker is the link target (plus its closing
            # quote), text after it is the issue designation
            link, _, issue = entry.partition(marker)
            path = link.rpartition('"')[0]
            if issue.startswith(prefix):
                issue = issue[len(prefix):]

            metadata = {
                "comic": comic, "issue": issue,
                "issue_id": text.parse_int(path.rpartition("=")[2]),
                "lang": "en", "language": "English",
            }
            results.append((self.root + path, metadata))
        return results


class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    """Extractor for comic-issues from readcomiconline.to"""
    subcategory = "issue"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))"]
    test = [
        ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
            "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
            "keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",
        }),
    ]

    def __init__(self, match):
        # group 1: path of the issue page, group 2: value of its 'id' query
        issue_url = self.root + match.group(1)
        ChapterExtractor.__init__(self, issue_url)
        self.issue_id = match.group(2)

    def get_metadata(self, page):
        """Build the metadata dict for the issue shown on 'page'"""
        comic, pos = text.extract(page, "   - Read\r\n    ", "\r\n")
        info, _ = text.extract(page, "    ", "\r\n", pos)

        # either a number following an optional "Issue " and a '#',
        # or - if that does not match - the whole info text
        number, other = re.match(r"(?:Issue )?#(\d+)|(.+)", info).groups()

        return {
            "comic": comic,
            "issue": number or other,
            "issue_id": text.parse_int(self.issue_id),
            "lang": "en",
            "language": "English",
        }

    def get_images(self, page):
        """Return a list of (image-URL, None) tuples found in 'page'"""
        # NOTE(review): presumably this stops the session from sending a
        # Referer header with the following requests - confirm
        self.session.headers["Referer"] = None

        urls = text.extract_iter(page, 'lstImages.push("', '"')
        return [(url, None) for url in urls]
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`# -- coding: utf-8 --`

update URLs for supportedsites.rst 2019-01-30 16:18:22 +01:00			`# Copyright 2016-2019 Mike Fährmann`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

update URLs for supportedsites.rst 2019-01-30 16:18:22 +01:00			`"""Extract comic-issues and entire comics from https://readcomiconline.to/"""`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`from .common import ChapterExtractor, MangaExtractor`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`from .. import text, cloudflare`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`import re`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineBase():`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Base class for readcomiconline extractors"""`
			`category = "readcomiconline"`
			`directory_fmt = ["{category}", "{comic}", "{issue:>03}"]`
			`filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`archive_fmt = "{issue_id}_{page}"`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`root = "https://readcomiconline.to"`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`request = cloudflare.request_func`
add 'extractor.*.user-agent' config option 2017-11-15 13:54:40 +01:00
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Extractor for comics from readcomiconline.to"""`
			`subcategory = "comic"`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"`
			`r"(/Comic/[^/?&#]+/?)$"]`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`test = [`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`("https://readcomiconline.to/Comic/W-i-t-c-h", {`
			`"url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`("https://readcomiconline.to/Comic/Bazooka-Jules", {`
			`"url": "711674cb78ed10bd2557315f7a67552d01b33985",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
			`]`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def __init__(self, match):`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`MangaExtractor.__init__(self, match, self.root + match.group(1))`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`def chapters(self, page):`
			`results = []`
[readcomiconline] improve comic-page parsing 2018-12-30 13:13:25 +01:00			`comic, pos = text.extract(page, ' class="barTitle">', '<')`
			`page , pos = text.extract(page, ' class="listing">', '</table>', pos)`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00
[readcomiconline] improve comic-page parsing 2018-12-30 13:13:25 +01:00			`comic = comic.rpartition("information")[0].strip()`
			`needle = ' title="Read {} '.format(comic)`
			`comic = text.unescape(comic)`

			`for item in text.extract_iter(page, ' href="', ' comic online '):`
			`url, _, issue = item.partition(needle)`
			`url = url.rpartition('"')[0]`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`if issue.startswith('Issue #'):`
			`issue = issue[7:]`
			`results.append((self.root + url, {`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"comic": comic, "issue": issue,`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`"issue_id": text.parse_int(url.rpartition("=")[2]),`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`"lang": "en", "language": "English",`
			`}))`
			`return results`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Extractor for comic-issues from readcomiconline.to"""`
			`subcategory = "issue"`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))"]`
			`test = [("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {`
[gelbooru] tag-splitting for non-api mode 2018-07-06 15:18:49 +02:00			`"url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`})]`

use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`def __init__(self, match):`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`ChapterExtractor.__init__(self, self.root + match.group(1))`
			`self.issue_id = match.group(2)`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00
			`def get_metadata(self, page):`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`comic, pos = text.extract(page, " - Read\r\n ", "\r\n")`
			`iinfo, pos = text.extract(page, " ", "\r\n", pos)`
			`match = re.match(r"(?:Issue )?#(\d+)\|(.+)", iinfo)`
			`return {`
			`"comic": comic,`
			`"issue": match.group(1) or match.group(2),`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`"issue_id": text.parse_int(self.issue_id),`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"lang": "en",`
			`"language": "English",`
			`}`
[kissmanga] re-enable module 2017-04-05 12:16:23 +02:00
smaller changes and fixes - fix the cloudflare challenge result if the last decimal places are zero (JS`s toFixed() removes trailing zeroes) - fix downloading of kissmanga chapter-pages hosted on blogspot (accessing blogspot with "kissmanga.com" as referrer yields a 401) - disable certificate validation for 'mangahere' tests - update flickr test result 2018-04-06 15:30:09 +02:00			`def get_images(self, page):`
			`self.session.headers["Referer"] = None`
use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`return [`
			`(url, None)`
			`for url in text.extract_iter(`
			`page, 'lstImages.push("', '"'`
			`)`
			`]`