From 595593a35ef5729c65cd094fd7ebfab8d2ed7a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 11 Dec 2017 21:44:27 +0100 Subject: [PATCH] [sankaku] rewrite - better code structure and extensibility - better metadata --- CHANGELOG.md | 2 + gallery_dl/extractor/sankaku.py | 179 ++++++++++++++++++++------------ gallery_dl/version.py | 2 +- 3 files changed, 116 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4191d03..baadca74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Changelog +## Unreleased + ## 1.1.0 - 2017-12-08 - Added the ``-r/--limit-rate`` command-line option to set a maximum download rate - Added the ``--sleep`` command-line option to specify the number of seconds to sleep before each download diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 2215146e..fb5b944d 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -8,102 +8,86 @@ """Extract images from https://chan.sankakucomplex.com/""" -from .common import Extractor, Message +from .common import SharedConfigExtractor, Message from .. import text, util, exception from ..cache import cache import time import random -class SankakuTagExtractor(Extractor): - """Extractor for images from chan.sankakucomplex.com by search-tags""" +class SankakuExtractor(SharedConfigExtractor): + """Base class for sankaku extractors""" + basecategory = "booru" category = "sankaku" - subcategory = "tag" - directory_fmt = ["{category}", "{tags}"] filename_fmt = "{category}_{id}_{md5}.{extension}" - pattern = [r"(?:https?://)?chan\.sankakucomplex\.com" - r"/\?(?:[^&#]*&)*tags=([^&#]+)"] - test = [("https://chan.sankakucomplex.com/?tags=bonocho", { - "count": 5, - "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" - r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"), - })] root = "https://chan.sankakucomplex.com" cookienames = ("login", "pass_hash") cookiedomain = "chan.sankakucomplex.com" - def __init__(self, match): - Extractor.__init__(self) + def __init__(self): + SharedConfigExtractor.__init__(self) self.logged_in = True - self.pagestart = 1 - self.tags = text.unquote(match.group(1).replace("+", " ")) + self.start_post = 0 self.wait_min = self.config("wait-min", 2) self.wait_max = self.config("wait-max", 4) if self.wait_max < self.wait_min: self.wait_max = self.wait_min - def skip(self, num): - pages = min(num // 20, 49) - self.pagestart += pages - return pages * 20 - def items(self): self.login() - data = self.get_job_metadata() yield Message.Version, 1 - yield Message.Directory, data - for image in self.get_images(): - image.update(data) - yield Message.Url, image["file_url"], image + yield Message.Directory, self.get_metadata() - def get_job_metadata(self): - """Collect metadata for extractor-job""" - return {"tags": self.tags} + for post_id in util.advance(self.get_posts(), self.start_post): + self.wait() + data = self.get_post_data(post_id) + url = data["file_url"] + yield Message.Url, url, text.nameext_from_url(url, data) - def get_images(self): - """Yield all available images for the given tags""" - params = { - "tags": self.tags, - "page": self.pagestart, - } - while self.logged_in or params["page"] <= 25: - image = None - page = self.request(self.root, params=params, retries=10).text - pos = text.extract(page, '