gallery-dl/gallery_dl/extractor/keenspot.py

# -*- coding: utf-8 -*-

# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for http://www.keenspot.com/"""

from .common import Extractor, Message
from .. import text


class KeenspotComicExtractor(Extractor):
    """Extractor for webcomics from keenspot.com

    Starting at a comic's first page (or at the page given in the input
    URL), this follows the layout-specific "next" links from page to page
    and yields every image found after the layout-specific image marker.
    """
    category = "keenspot"
    subcategory = "comic"
    directory_fmt = ("{category}", "{comic}")
    filename_fmt = "{filename}.{extension}"
    archive_fmt = "{comic}_{filename}"
    pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
    test = (
        # link
        ("http://marksmen.keenspot.com/", {
            "range": "1-3",
            "url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6",
        }),
        # id
        ("http://barkercomic.keenspot.com/", {
            "range": "1-3",
            "url": "c4080926db18d00bac641fdd708393b7d61379e6",
        }),
        # id v2
        ("http://crowscare.keenspot.com/", {
            "range": "1-3",
            "url": "a00e66a133dd39005777317da90cef921466fcaa"
        }),
        # ks
        ("http://supernovas.keenspot.com/", {
            "range": "1-3",
            "url": "de21b12887ef31ff82edccbc09d112e3885c3aab"
        }),
        # "random" access
        ("http://twokinds.keenspot.com/comic/1066/", {
            "range": "1-3",
            "url": "6a784e11370abfb343dcad9adbb7718f9b7be350",
        })
    )

    def __init__(self, match):
        """Store the comic name and optional path from the URL match"""
        Extractor.__init__(self, match)
        self.comic = match.group(1).lower()
        self.path = match.group(2)
        self.root = "http://" + self.comic + ".keenspot.com"

        # layout-dependent settings; _first() adjusts them per comic
        self._needle = ""               # marker preceding the "next" link
        self._image = 'class="ksc"'     # marker preceding an image 'src'
        self._next = self._next_needle  # callable: page text -> next URL

    def items(self):
        """Yield one directory message and a URL message per comic image

        Pagination ends when the "next" URL is empty or identical to the
        URL of the page that was just processed.
        """
        data = {"comic": self.comic}
        yield Message.Directory, data

        with self.request(self.root + "/") as response:
            if response.history:
                # the request got redirected: use scheme and host of the
                # final URL for all further requests
                # (searching from offset 8 skips the '//' of the scheme)
                url = response.request.url
                self.root = url[:url.index("/", 8)]
            page = response.text
            del response

        # always run the layout detection, since it also configures
        # self._image / self._next even when an explicit path was given
        url = self._first(page)
        if self.path:
            url = self.root + self.path

        prev = None
        ilen = len(self._image)  # must be computed after _first()
        while url and url != prev:
            prev = url
            page = self.request(text.urljoin(self.root, url)).text

            pos = 0
            while True:
                pos = page.find(self._image, pos)
                if pos < 0:
                    break
                img, pos = text.extract(page, 'src="', '"', pos + ilen)
                if img is None:
                    # no 'src="' after this marker, hence none after any
                    # later marker either: done with this page
                    break
                if not img or img.endswith(".js"):
                    # empty 'src' attribute or a script, not an image
                    continue
                if img[0] == "/":
                    img = self.root + img
                elif "youtube.com/" in img:
                    img = "ytdl:" + img
                yield Message.Url, img, text.nameext_from_url(img, data)

            url = self._next(page)

    def _first(self, page):
        """Detect the page layout and return the URL of the first page

        As a side effect, 'self._next', 'self._image', and 'self._needle'
        are set to the values matching the detected layout.
        Logs an error and returns None if no known layout is recognized.
        """
        if self.comic == "brawlinthefamily":
            self._next = self._next_brawl
            self._image = '<div id="comic">'
            return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"

        url = text.extr(page, '<link rel="first" href="', '"')
        if url:
            if self.comic == "porcelain":
                self._needle = 'id="porArchivetop_"'
            else:
                self._next = self._next_link
            return url

        pos = page.find('id="first_day1"')
        if pos >= 0:
            self._next = self._next_id
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('>FIRST PAGE<')
        if pos >= 0:
            if self.comic == "lastblood":
                self._next = self._next_lastblood
                self._image = '<div id="comic">'
            else:
                self._next = self._next_id
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('<div id="kscomicpart"')
        if pos >= 0:
            self._needle = '<a href="/archive.html'
            return text.extract(page, 'href="', '"', pos)[0]

        pos = page.find('>First Comic<')  # twokinds
        if pos >= 0:
            self._image = '</header>'
            self._needle = 'class="navarchive"'
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('id="flip_FirstDay"')  # flipside
        if pos >= 0:
            self._image = 'class="flip_Pages ksc"'
            self._needle = 'id="flip_ArcButton"'
            return text.rextract(page, 'href="', '"', pos)[0]

        self.log.error("Unrecognized page layout")
        return None

    def _next_needle(self, page):
        """Return the first 'href' value located after 'self._needle'

        Raises ValueError (from str.index) if the needle is not on 'page'.
        """
        pos = page.index(self._needle) + len(self._needle)
        return text.extract(page, 'href="', '"', pos)[0]

    @staticmethod
    def _next_link(page):
        """Return the 'href' value of the page's '<link rel="next"'"""
        return text.extr(page, '<link rel="next" href="', '"')

    @staticmethod
    def _next_id(page):
        """Return the last 'href' value before 'id="next_', if present

        Returns None when 'page' has no 'id="next_' marker.
        """
        pos = page.find('id="next_')
        return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None

    @staticmethod
    def _next_lastblood(page):
        """Return the single-quoted 'href' value after "link rel='next'"

        Raises ValueError (from str.index) if the marker is not on 'page'.
        """
        pos = page.index("link rel='next'")
        return text.extract(page, "href='", "'", pos)[0]

    @staticmethod
    def _next_brawl(page):
        """Return the last 'href' value before "comic-nav-next"

        Returns None when no such link exists or when it is a "?random"
        link, which ends pagination.
        Raises ValueError (from str.index) if the marker is not on 'page'.
        """
        pos = page.index("comic-nav-next")
        url = text.rextract(page, 'href="', '"', pos)[0]
        # a missing link must not reach the 'in' test below (TypeError)
        if not url or "?random" in url:
            return None
        return url
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`# -- coding: utf-8 --`

update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# Copyright 2019-2023 Mike Fährmann`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for http://www.keenspot.com/"""`

			`from .common import Extractor, Message`
			`from .. import text`


			`class KeenspotComicExtractor(Extractor):`
			`"""Extractor for webcomics from keenspot.com"""`
			`category = "keenspot"`
			`subcategory = "comic"`
			`directory_fmt = ("{category}", "{comic}")`
			`filename_fmt = "{filename}.{extension}"`
			`archive_fmt = "{comic}_{filename}"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`pattern = r"(?:https?://)?(?!www\.\|forums\.)([\w-]+)\.keenspot\.com(/.+)?"`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`test = (`
update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# link`
			`("http://marksmen.keenspot.com/", {`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`"range": "1-3",`
			`"url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6",`
			`}),`
update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# id`
			`("http://barkercomic.keenspot.com/", {`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`"range": "1-3",`
			`"url": "c4080926db18d00bac641fdd708393b7d61379e6",`
			`}),`
update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# id v2`
			`("http://crowscare.keenspot.com/", {`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`"range": "1-3",`
			`"url": "a00e66a133dd39005777317da90cef921466fcaa"`
			`}),`
update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# ks`
			`("http://supernovas.keenspot.com/", {`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`"range": "1-3",`
			`"url": "de21b12887ef31ff82edccbc09d112e3885c3aab"`
			`}),`
update test comment positions always put them above the test they're referring to 2023-09-06 18:16:09 +02:00			`# "random" access`
			`("http://twokinds.keenspot.com/comic/1066/", {`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`"range": "1-3",`
[keenspot] improve redirect handling Before it would use http:// for all requests and get a redirect to a https:// version if those are supported. Now the redirect only happens once during the first request. 2020-12-26 21:38:40 +01:00			`"url": "6a784e11370abfb343dcad9adbb7718f9b7be350",`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`})`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`)`

			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`self.comic = match.group(1).lower()`
			`self.path = match.group(2)`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`self.root = "http://" + self.comic + ".keenspot.com"`
[keenspot] improve pagination (#223) The old code would skip the last comic page for some series. 2019-06-02 22:12:21 +02:00
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`self._needle = ""`
			`self._image = 'class="ksc"'`
			`self._next = self._next_needle`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00
			`def items(self):`
			`data = {"comic": self.comic}`
			`yield Message.Directory, data`

[keenspot] improve redirect handling Before it would use http:// for all requests and get a redirect to a https:// version if those are supported. Now the redirect only happens once during the first request. 2020-12-26 21:38:40 +01:00			`with self.request(self.root + "/") as response:`
			`if response.history:`
			`url = response.request.url`
			`self.root = url[:url.index("/", 8)]`
			`page = response.text`
			`del response`

			`url = self._first(page)`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`if self.path:`
			`url = self.root + self.path`

[keenspot] improve pagination (#223) The old code would skip the last comic page for some series. 2019-06-02 22:12:21 +02:00			`prev = None`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`ilen = len(self._image)`
[keenspot] improve pagination (#223) The old code would skip the last comic page for some series. 2019-06-02 22:12:21 +02:00			`while url and url != prev:`
			`prev = url`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`page = self.request(text.urljoin(self.root, url)).text`

			`pos = 0`
			`while True:`
			`pos = page.find(self._image, pos)`
			`if pos < 0:`
			`break`
			`img, pos = text.extract(page, 'src="', '"', pos + ilen)`
			`if img.endswith(".js"):`
			`continue`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`if img[0] == "/":`
			`img = self.root + img`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`elif "youtube.com/" in img:`
			`img = "ytdl:" + img`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`yield Message.Url, img, text.nameext_from_url(img, data)`

			`url = self._next(page)`

			`def _first(self, page):`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`if self.comic == "brawlinthefamily":`
			`self._next = self._next_brawl`
			`self._image = '<div id="comic">'`
			`return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"`

replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`url = text.extr(page, '<link rel="first" href="', '"')`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`if url:`
			`if self.comic == "porcelain":`
			`self._needle = 'id="porArchivetop_"'`
			`else:`
			`self._next = self._next_link`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`return url`

			`pos = page.find('id="first_day1"')`
			`if pos >= 0:`
			`self._next = self._next_id`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`return text.rextract(page, 'href="', '"', pos)[0]`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00
			`pos = page.find('>FIRST PAGE<')`
			`if pos >= 0:`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`if self.comic == "lastblood":`
			`self._next = self._next_lastblood`
			`self._image = '<div id="comic">'`
			`else:`
			`self._next = self._next_id`
			`return text.rextract(page, 'href="', '"', pos)[0]`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00
			`pos = page.find('<div id="kscomicpart"')`
			`if pos >= 0:`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`self._needle = '<a href="/archive.html'`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`return text.extract(page, 'href="', '"', pos)[0]`

[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`pos = page.find('>First Comic<') # twokinds`
			`if pos >= 0:`
[keenspot] fix extraction for "TwoKinds" 2019-06-17 19:33:16 +02:00			`self._image = '</header>'`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`self._needle = 'class="navarchive"'`
			`return text.rextract(page, 'href="', '"', pos)[0]`

			`pos = page.find('id="flip_FirstDay"') # flipside`
			`if pos >= 0:`
			`self._image = 'class="flip_Pages ksc"'`
			`self._needle = 'id="flip_ArcButton"'`
			`return text.rextract(page, 'href="', '"', pos)[0]`

[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`self.log.error("Unrecognized page layout")`
			`return None`

[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`def _next_needle(self, page):`
			`pos = page.index(self._needle) + len(self._needle)`
			`return text.extract(page, 'href="', '"', pos)[0]`

[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00			`@staticmethod`
			`def _next_link(page):`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`return text.extr(page, '<link rel="next" href="', '"')`
[keenspot] add comic extractor (#223) Doesn't work for - http://brawlinthefamily.keenspot.com/ - http://flipside.keenspot.com/ - http://lastblood.keenspot.com/ - http://mysticrevolution.keenspot.com/ - http://porcelain.keenspot.com/ - http://twokinds.keenspot.com/ yet, because of custom layouts. 2019-05-28 21:34:38 +02:00
			`@staticmethod`
			`def _next_id(page):`
			`pos = page.find('id="next_')`
			`return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None`

			`@staticmethod`
[keenspot] support all comics and "random" access (#223) 2019-06-01 18:43:54 +02:00			`def _next_lastblood(page):`
			`pos = page.index("link rel='next'")`
			`return text.extract(page, "href='", "'", pos)[0]`

			`@staticmethod`
			`def _next_brawl(page):`
			`pos = page.index("comic-nav-next")`
			`url = text.rextract(page, 'href="', '"', pos)[0]`
			`return None if "?random" in url else url`