2019-05-28 21:34:38 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-09-06 18:16:09 +02:00
|
|
|
# Copyright 2019-2023 Mike Fährmann
|
2019-05-28 21:34:38 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for http://www.keenspot.com/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text
|
|
|
|
|
|
|
|
|
|
|
|
class KeenspotComicExtractor(Extractor):
    """Extractor for webcomics from keenspot.com"""
    category = "keenspot"
    subcategory = "comic"
    directory_fmt = ("{category}", "{comic}")
    filename_fmt = "{filename}.{extension}"
    archive_fmt = "{comic}_{filename}"
    # group 1: comic subdomain (anything except 'www.' / 'forums.')
    # group 2: optional path of the page to start from
    pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
    example = "http://COMIC.keenspot.com/"

    def __init__(self, match):
        """Store comic name, start path and default scraping settings"""
        Extractor.__init__(self, match)
        self.comic = match.group(1).lower()
        self.path = match.group(2)
        self.root = "http://" + self.comic + ".keenspot.com"

        # Layout-dependent scraping state; _first() may override these.
        #   _needle: marker preceding the 'next page' link
        #   _image:  marker preceding each image's 'src' attribute
        #   _next:   callable returning the next page URL for a page
        self._needle = ""
        self._image = 'class="ksc"'
        self._next = self._next_needle

    def items(self):
        """Yield one Directory message, then a Url message per image

        Starts at the comic's first page, or at the path given in the
        input URL if there is one, and follows 'next' links until no
        link is found or a page links to itself.
        """
        data = {"comic": self.comic}
        yield Message.Directory, data

        with self.request(self.root + "/") as response:
            if response.history:
                # redirected: keep only scheme + host of the final URL
                # (offset 8 skips past the '//' of 'http(s)://')
                url = response.request.url
                self.root = url[:url.index("/", 8)]
            page = response.text
            del response

        # _first() must run even when a start path was given, since it
        # also selects the _image / _needle / _next settings
        url = self._first(page)
        if self.path:
            url = self.root + self.path

        prev = None
        ilen = len(self._image)
        while url and url != prev:
            prev = url
            page = self.request(text.urljoin(self.root, url)).text

            pos = 0
            while True:
                pos = page.find(self._image, pos)
                if pos < 0:
                    break
                img, pos = text.extract(page, 'src="', '"', pos + ilen)
                if img is None:
                    # no complete src="..." after this marker, hence
                    # none after any later marker on this page either
                    break
                if not img or img.endswith(".js"):
                    # skip empty src attributes and script tags
                    continue
                if img[0] == "/":
                    img = self.root + img
                elif "youtube.com/" in img:
                    img = "ytdl:" + img
                yield Message.Url, img, text.nameext_from_url(img, data)

            url = self._next(page)

    def _first(self, page):
        """Return the first comic page's URL and configure the scraper

        Inspects the main page for one of several known layouts and
        sets self._image, self._needle and self._next accordingly.
        Logs an error and returns None if no known layout matches.
        """
        if self.comic == "brawlinthefamily":
            self._next = self._next_brawl
            self._image = '<div id="comic">'
            return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"

        url = text.extr(page, '<link rel="first" href="', '"')
        if url:
            if self.comic == "porcelain":
                self._needle = 'id="porArchivetop_"'
            else:
                self._next = self._next_link
            return url

        pos = page.find('id="first_day1"')
        if pos >= 0:
            self._next = self._next_id
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('>FIRST PAGE<')
        if pos >= 0:
            if self.comic == "lastblood":
                self._next = self._next_lastblood
                self._image = '<div id="comic">'
            else:
                self._next = self._next_id
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('<div id="kscomicpart"')
        if pos >= 0:
            self._needle = '<a href="/archive.html'
            return text.extract(page, 'href="', '"', pos)[0]

        pos = page.find('>First Comic<')  # twokinds
        if pos >= 0:
            self._image = '</header>'
            self._needle = 'class="navarchive"'
            return text.rextract(page, 'href="', '"', pos)[0]

        pos = page.find('id="flip_FirstDay"')  # flipside
        if pos >= 0:
            self._image = 'class="flip_Pages ksc"'
            self._needle = 'id="flip_ArcButton"'
            return text.rextract(page, 'href="', '"', pos)[0]

        self.log.error("Unrecognized page layout")
        return None

    def _next_needle(self, page):
        """Return the first href value after self._needle, or None"""
        pos = page.find(self._needle)
        if pos < 0:
            # marker missing: report 'no next page' instead of raising
            return None
        return text.extract(page, 'href="', '"', pos + len(self._needle))[0]

    @staticmethod
    def _next_link(page):
        """Return the href of the page's <link rel="next"> element"""
        return text.extr(page, '<link rel="next" href="', '"')

    @staticmethod
    def _next_id(page):
        """Return the href preceding the first 'id="next_' marker, or None"""
        pos = page.find('id="next_')
        return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None

    @staticmethod
    def _next_lastblood(page):
        """Return the single-quoted href after "link rel='next'", or None"""
        pos = page.find("link rel='next'")
        if pos < 0:
            # marker missing: report 'no next page' instead of raising
            return None
        return text.extract(page, "href='", "'", pos)[0]

    @staticmethod
    def _next_brawl(page):
        """Return the href preceding 'comic-nav-next', or None

        Also returns None when that link contains '?random', so that
        such a link is never followed.
        """
        pos = page.find("comic-nav-next")
        if pos < 0:
            # marker missing: report 'no next page' instead of raising
            return None
        url = text.rextract(page, 'href="', '"', pos)[0]
        if not url or "?random" in url:
            return None
        return url
|