From 60833abcc6942275491a60e81f169d38fc2e70ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Mon, 9 Nov 2015 02:29:33 +0100
Subject: [PATCH] [sankaku] re-enable extractor

---
 gallery_dl/extractor/__init__.py |  1 +
 gallery_dl/extractor/sankaku.py  | 98 +++++++++++++++++++++++---------
 2 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3571efca..f16596a9 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -37,6 +37,7 @@ modules = [
     "powermanga",
     "redhawkscans",
     "safebooru",
+    "sankaku",
     "yandere",
 ]
 
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ad110e2b..0b719663 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -1,35 +1,81 @@
-from .common import AsyncExtractor
-from ..util import filename_from_url
+# -*- coding: utf-8 -*-
 
-class Extractor(AsyncExtractor):
+# Copyright 2014, 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://chan.sankakucomplex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import os.path
+
+info = {
+    "category": "sankaku",
+    "extractor": "SankakuExtractor",
+    "directory": ["{category}", "{tags}"],
+    "filename": "{category}_{id}_{md5}.{extension}",
+    "pattern": [
+        r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)",
+    ],
+}
+
+class SankakuExtractor(Extractor):
 
     url = "https://chan.sankakucomplex.com/"
 
-    def __init__(self, match, config):
-        AsyncExtractor.__init__(self, config)
-        self.tags = match.group(1)
-        self.category = "sankaku"
-        self.directory = self.tags.replace("/", "_")
-        self.enable_useragent()
+    def __init__(self, match):
+        Extractor.__init__(self)
+        self.tags = text.unquote(match.group(1))
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 Gecko/20100101 Firefox/40.0"
+        )
 
-    def images(self):
-        needle = ' src="//c.sankakucomplex.com/data/preview/'
-        params = {"tags": self.tags, "page":1}
+    def items(self):
+        yield Message.Version, 1
+        data = self.get_job_metadata()
+        yield Message.Directory, data
+        for image in self.get_images():
+            data.update(image)
+            yield Message.Url, image["file-url"], data
+
+    def get_job_metadata(self):
+        """Collect metadata for extractor-job"""
+        return {
+            "category": info["category"],
+            "tags": self.tags,
+        }
+
+    def get_images(self):
+        image = {}
+        params = {
+            "tags": self.tags,
+            "page": 1,
+        }
         while True:
-            text = self.request(self.url, params=params).text
-            print(text)
-            return
-            pos = 0
-            found = 0
+            pos = 0
+            count = 0
+            page = self.request(self.url, params=params).text
             while True:
-                try:
-                    url, pos = self.extract(text, needle, '"', pos)
-                    found += 1
-                    print("https://cs.sankakucomplex.com/data/" + url)
-                    yield ("https://cs.sankakucomplex.com/data/" + url,
-                           "%s_%s" % (self.category, filename_from_url(url)))
-                except:
+                image["id"], pos = text.extract(page,
+                    '', pos)
+                if not image["id"]:
                     break
-            if found == 0:
-                break
+                url , pos = text.extract(page, ' src="//c.sankakucomplex.com/', '"', pos)
+                tags, pos = text.extract(page, ' title="', '"', pos)
+                self.get_image_metadata(image, url)
+                count += 1
+                yield image
+            if count < 20:
+                return
             params["page"] += 1
+
+    @staticmethod
+    def get_image_metadata(image, url):
+        image["file-url"] = "https://cs.sankakucomplex.com/data/" + url[13:]
+        filename = text.filename_from_url(url)
+        name, ext = os.path.splitext(filename)
+        image["name"] = image["md5"] = name
+        image["extension"] = ext[1:]