gallery-dl/gallery_dl/extractor/readcomiconline.py

# -*- coding: utf-8 -*-

# Copyright 2016-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://readcomiconline.li/"""

from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re

# Case-insensitive URL prefix reused by the extractor patterns in this module:
# an optional "http://" or "https://", an optional "www.", and then the host
# "readcomiconline.li" or "readcomiconline.to".
BASE_PATTERN = r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.(?:li|to)"


class ReadcomiconlineBase():
    """Base class for readcomiconline extractors

    Holds the constants shared by the extractors of this module and a
    request() override that detects redirects to the site's CAPTCHA page.
    """
    category = "readcomiconline"
    # The templates below refer to metadata keys such as "comic", "issue"
    # and "issue_id", which the extractors in this module put into their
    # metadata dictionaries.
    directory_fmt = ("{category}", "{comic}", "{issue:>03}")
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
    archive_fmt = "{issue_id}_{page}"
    root = "https://readcomiconline.li"

    def request(self, url, **kwargs):
        """Detect and handle redirects to CAPTCHA pages

        The request itself is delegated to Extractor.request(); 'url' and
        all keyword arguments are passed through unchanged.

        A response counts as a CAPTCHA redirect when it has a redirect
        history and its final URL contains "/AreYouHuman".  What happens
        then depends on the "captcha" option (default "stop"):

        - "wait": log a warning, wait for the user to press ENTER and
          send the same request again.
        - anything else: raise StopExtraction.

        Returns:
            The first response that is not a CAPTCHA redirect.

        Raises:
            exception.StopExtraction: on a CAPTCHA redirect when the
                "captcha" option is not "wait", or when it is "wait" but
                reading from standard input fails.
        """
        while True:
            response = Extractor.request(self, url, **kwargs)
            if not response.history or "/AreYouHuman" not in response.url:
                return response

            if self.config("captcha", "stop") == "wait":
                self.log.warning(
                    "Redirect to \n%s\nVisit this URL in your browser, solve "
                    "the CAPTCHA, and press ENTER to continue", response.url)
                try:
                    input()
                except (EOFError, OSError):
                    # Standard input cannot be read (closed, redirected,
                    # not interactive), so there is no way to wait for
                    # the user.  Retrying right away would only repeat
                    # the redirected request in an endless loop; stop
                    # with the regular CAPTCHA error below instead.
                    pass
                else:
                    continue

            raise exception.StopExtraction(
                "Redirect to \n%s\nVisit this URL in your browser and "
                "solve the CAPTCHA to continue", response.url)


class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    """Extractor for a single comic issue on readcomiconline.li"""
    subcategory = "issue"
    pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))"
    test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", {
        "url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6",
        "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5",
    })

    def __init__(self, match):
        ChapterExtractor.__init__(self, match)
        # second capture group of 'pattern': the numeric 'id' query value
        self.issue_id = match.group(2)
        self.gallery_url = self.gallery_url + "&quality=hq"

    def metadata(self, page):
        """Build the metadata dictionary for this issue from 'page'"""
        # comic name: the line following "   - Read"
        comic, offset = text.extract(page, "   - Read\r\n    ", "\r\n")
        # issue description: the next indented line after the comic name
        issue_info = text.extract(page, "    ", "\r\n", offset)[0]

        # either "#<digits>" (optionally prefixed with "Issue ") or,
        # failing that, the whole description text
        found = re.match(r"(?:Issue )?#(\d+)|(.+)", issue_info)
        number, description = found.groups()

        return {
            "comic": comic,
            "issue": number or description,
            "issue_id": text.parse_int(self.issue_id),
            "lang": "en",
            "language": "English",
        }

    def images(self, page):
        """Return a list of (image URL, None) tuples found in 'page'"""
        image_urls = text.extract_iter(page, 'lstImages.push("', '"')
        return [(image_url, None) for image_url in image_urls]


class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
    """Extractor for whole comics on readcomiconline.li"""
    chapterclass = ReadcomiconlineIssueExtractor
    subcategory = "comic"
    pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$"
    test = (
        ("https://readcomiconline.li/Comic/W-i-t-c-h", {
            "url": "74eb8b9504b4084fcc9367b341300b2c52260918",
            "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
        }),
        ("https://readcomiconline.to/Comic/Bazooka-Jules", {
            "url": "2f66a467a772df4d4592e97a059ddbc3e8991799",
            "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
        }),
    )

    def chapters(self, page):
        """Return a list of (issue URL, metadata) tuples found in 'page'"""
        title, pos = text.extract(page, ' class="barTitle">', '<')
        listing, pos = text.extract(
            page, ' class="listing">', '</table>', pos)

        # keep only the text in front of the last "information"
        title = title.rpartition("information")[0].strip()
        # the needle is built from the still-escaped title,
        # the metadata value from its unescaped form
        needle = ' title="Read ' + title + ' '
        comic = text.unescape(title)

        prefix = 'Issue #'
        results = []
        for entry in text.extract_iter(listing, ' href="', ' comic online '):
            link, _, issue = entry.partition(needle)
            path = link.rpartition('"')[0]
            if issue.startswith(prefix):
                issue = issue[len(prefix):]
            metadata = {
                "comic": comic,
                "issue": issue,
                "issue_id": text.parse_int(path.rpartition("=")[2]),
                "lang": "en",
                "language": "English",
            }
            results.append((self.root + path, metadata))
        return results
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`# -- coding: utf-8 --`

[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`# Copyright 2016-2021 Mike Fährmann`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`"""Extractors for https://readcomiconline.li/"""`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
[kissmanga] remove module 2020-10-04 22:46:41 +02:00			`from .common import Extractor, ChapterExtractor, MangaExtractor`
			`from .. import text, exception`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`import re`

[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`BASE_PATTERN = r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.(?:li\|to)"`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
[kissmanga] remove module 2020-10-04 22:46:41 +02:00			`class ReadcomiconlineBase():`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"""Base class for readcomiconline extractors"""`
			`category = "readcomiconline"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{comic}", "{issue:>03}")`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`archive_fmt = "{issue_id}_{page}"`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`root = "https://readcomiconline.li"`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
[kissmanga] remove module 2020-10-04 22:46:41 +02:00			`def request(self, url, **kwargs):`
			`"""Detect and handle redirects to CAPTCHA pages"""`
			`while True:`
			`response = Extractor.request(self, url, **kwargs)`
			`if not response.history or "/AreYouHuman" not in response.url:`
			`return response`
			`if self.config("captcha", "stop") == "wait":`
			`self.log.warning(`
			`"Redirect to \n%s\nVisit this URL in your browser, solve "`
			`"the CAPTCHA, and press ENTER to continue", response.url)`
			`try:`
			`input()`
			`except (EOFError, OSError):`
			`pass`
			`else:`
			`raise exception.StopExtraction(`
			`"Redirect to \n%s\nVisit this URL in your browser and "`
			`"solve the CAPTCHA to continue", response.url)`

[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`"""Extractor for comic-issues from readcomiconline.li"""`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`subcategory = "issue"`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))"`
			`test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", {`
[readcomiconline] download high quality image versions (fixes #1347) 2021-02-28 01:07:13 +01:00			`"url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6",`
			`"keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5",`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`})`

			`def __init__(self, match):`
			`ChapterExtractor.__init__(self, match)`
[readcomiconline] download high quality image versions (fixes #1347) 2021-02-28 01:07:13 +01:00			`self.gallery_url += "&quality=hq"`
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`self.issue_id = match.group(2)`

			`def metadata(self, page):`
			`comic, pos = text.extract(page, " - Read\r\n ", "\r\n")`
			`iinfo, pos = text.extract(page, " ", "\r\n", pos)`
			`match = re.match(r"(?:Issue )?#(\d+)\|(.+)", iinfo)`
			`return {`
			`"comic": comic,`
			`"issue": match.group(1) or match.group(2),`
			`"issue_id": text.parse_int(self.issue_id),`
			`"lang": "en",`
			`"language": "English",`
			`}`

			`def images(self, page):`
			`return [`
			`(url, None)`
			`for url in text.extract_iter(`
			`page, 'lstImages.push("', '"'`
			`)`
			`]`


use generic chapter-extractor in more modules 2018-02-07 11:22:47 +01:00			`class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`"""Extractor for comics from readcomiconline.li"""`
add '_extractor' info to manga extractor results 2019-02-13 13:23:36 +01:00			`chapterclass = ReadcomiconlineIssueExtractor`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`subcategory = "comic"`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`test = (`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`("https://readcomiconline.li/Comic/W-i-t-c-h", {`
			`"url": "74eb8b9504b4084fcc9367b341300b2c52260918",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
[readcomiconline] use HTTPS 2018-12-09 14:54:55 +01:00			`("https://readcomiconline.to/Comic/Bazooka-Jules", {`
[readcomiconline] change domain to 'readcomiconline.li' (closes #1517) 2021-05-01 16:41:16 +02:00			`"url": "2f66a467a772df4d4592e97a059ddbc3e8991799",`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00
			`def chapters(self, page):`
			`results = []`
[readcomiconline] improve comic-page parsing 2018-12-30 13:13:25 +01:00			`comic, pos = text.extract(page, ' class="barTitle">', '<')`
			`page , pos = text.extract(page, ' class="listing">', '</table>', pos)`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00
[readcomiconline] improve comic-page parsing 2018-12-30 13:13:25 +01:00			`comic = comic.rpartition("information")[0].strip()`
			`needle = ' title="Read {} '.format(comic)`
			`comic = text.unescape(comic)`

			`for item in text.extract_iter(page, ' href="', ' comic online '):`
			`url, _, issue = item.partition(needle)`
			`url = url.rpartition('"')[0]`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`if issue.startswith('Issue #'):`
			`issue = issue[7:]`
			`results.append((self.root + url, {`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`"comic": comic, "issue": issue,`
rename safe_int to parse_int; move parse_* to text module 2018-04-20 14:53:21 +02:00			`"issue_id": text.parse_int(url.rpartition("=")[2]),`
[readcomiconline] extract comic metadata 2017-09-18 19:18:24 +02:00			`"lang": "en", "language": "English",`
			`}))`
			`return results`