2014-10-12 21:56:44 +02:00
|
|
|
import re
from urllib.parse import unquote, urljoin

from .common import BasicExtractor
|
|
|
|
|
|
|
|
class Extractor(BasicExtractor):
    """Extractor for the files linked from a single 8chan thread page.

    The thread page is fetched once and scanned with ``regex``; every hit
    is turned into a ``(url, filename)`` pair by ``images()``.
    """

    # Site root, used to build the thread page URL.
    url_base = "https://8ch.net"

    # Thread page URL template: {0} is the board, {1} the thread id.
    thread_url_fmt = url_base + "/{0}/res/{1}.html"

    # Capture groups used by images():
    #   1 - href of the file link
    #   2 - link text without its final ".ext" part
    #   4 - optional title attribute of the "postfilename" span
    #   5 - text content of the "postfilename" span
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        """Initialise the extractor from a matched thread URL.

        Args:
            match: regex match object whose group 1 consists of exactly
                three "/"-separated parts; the first is taken as the board
                and the third as the thread id (presumably something like
                "board/res/1234" -- confirm against the URL pattern that
                produces this match).
            config: passed through unchanged to ``BasicExtractor``.

        Raises:
            ValueError: if group 1 does not split into exactly three parts.
        """
        super().__init__(config)
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = self.board + "-" + self.thread_id

    def images(self):
        """Yield a ``(url, filename)`` tuple for every file in the thread.

        ``url`` is the file link made absolute; ``filename`` is the link
        text stem, a "-" and the percent-decoded display name (the span's
        title attribute when present, otherwise the span's text).
        """
        thread_url = self.thread_url_fmt.format(self.board, self.thread_id)
        page = self.request(thread_url).text
        for found in re.finditer(self.regex, page):
            href, prefix, fullname, name = found.group(1, 2, 4, 5)
            # Resolve against the page URL instead of blindly prefixing the
            # site root: root-relative ("/b/src/x.jpg") and absolute links
            # come out as before, while protocol-relative ("//host/x.jpg")
            # and page-relative links are now resolved correctly.
            file_url = urljoin(thread_url, href)
            yield (file_url, prefix + "-" + unquote(fullname or name))
|