diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4191d03..baadca74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
# Changelog
+## Unreleased
+
## 1.1.0 - 2017-12-08
- Added the ``-r/--limit-rate`` command-line option to set a maximum download rate
- Added the ``--sleep`` command-line option to specify the number of seconds to sleep before each download
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 2215146e..fb5b944d 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -8,102 +8,86 @@
"""Extract images from https://chan.sankakucomplex.com/"""
-from .common import Extractor, Message
+from .common import SharedConfigExtractor, Message
from .. import text, util, exception
from ..cache import cache
import time
import random
-class SankakuTagExtractor(Extractor):
- """Extractor for images from chan.sankakucomplex.com by search-tags"""
+class SankakuExtractor(SharedConfigExtractor):
+ """Base class for sankaku extractors"""
+ basecategory = "booru"
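+    # config lookups fall back to the shared "booru" section via basecategory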
category = "sankaku"
- subcategory = "tag"
- directory_fmt = ["{category}", "{tags}"]
filename_fmt = "{category}_{id}_{md5}.{extension}"
-    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com"
-               r"/\?(?:[^&#]*&)*tags=([^&#]+)"]
- test = [("https://chan.sankakucomplex.com/?tags=bonocho", {
- "count": 5,
- "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
-                    r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
- })]
root = "https://chan.sankakucomplex.com"
cookienames = ("login", "pass_hash")
cookiedomain = "chan.sankakucomplex.com"
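+    # cookies that mark an authenticated chan.sankakucomplex.com session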
- def __init__(self, match):
- Extractor.__init__(self)
+ def __init__(self):
+ SharedConfigExtractor.__init__(self)
self.logged_in = True
- self.pagestart = 1
- self.tags = text.unquote(match.group(1).replace("+", " "))
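+        # number of leading posts to drop; incremented by skip()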
+ self.start_post = 0
self.wait_min = self.config("wait-min", 2)
self.wait_max = self.config("wait-max", 4)
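+        # clamp the random wait interval so wait_max is never below wait_min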
if self.wait_max < self.wait_min:
self.wait_max = self.wait_min
- def skip(self, num):
- pages = min(num // 20, 49)
- self.pagestart += pages
- return pages * 20
-
def items(self):
self.login()
- data = self.get_job_metadata()
yield Message.Version, 1
- yield Message.Directory, data
- for image in self.get_images():
- image.update(data)
- yield Message.Url, image["file_url"], image
+ yield Message.Directory, self.get_metadata()
- def get_job_metadata(self):
- """Collect metadata for extractor-job"""
- return {"tags": self.tags}
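+        # util.advance() drops the first 'start_post' ids (set via skip());
+        # wait() sleeps for a random interval between wait_min and wait_max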
+ for post_id in util.advance(self.get_posts(), self.start_post):
+ self.wait()
+ data = self.get_post_data(post_id)
+ url = data["file_url"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
- def get_images(self):
- """Yield all available images for the given tags"""
- params = {
- "tags": self.tags,
- "page": self.pagestart,
- }
- while self.logged_in or params["page"] <= 25:
- image = None
- page = self.request(self.root, params=params, retries=10).text
-            pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]
-            for image_id in text.extract_iter(
-                    page, '<span class="thumb blacklisted" id=p', '"', pos):
- self.wait()
- image = self.get_image_metadata(image_id)
- yield image
- if not image:
- return
- params["page"] += 1
- params["next"] = image["id"] - 1
- self.log.warning(
- "Unauthenticated users may only access the first 500 images / 25 "
- "pages. (Use '--range 501-' to continue downloading from this "
- "point onwards after setting up an account.)")
+ def skip(self, num):
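+        # skipping is exact and per-post; items() applies it via util.advance()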
+ self.start_post += num
+ return num
- def get_image_metadata(self, image_id):
- """Collect metadata for a single image"""
- url = "https://chan.sankakucomplex.com/post/show/" + image_id
+ def get_metadata(self):
+ """Return general metadata"""
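+        # overridden by subclasses, e.g. to supply the search tags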
+ return {}
+
+ def get_posts(self):
+ """Return an iterable containing all relevant post ids"""
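+        # implemented by each subclass; the base implementation yields nothing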
+
+ def get_post_data(self, post_id, extr=text.extract):
+ """Extract metadata of a single post"""
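+        # keeping text.extract as a default argument gives a fast local lookup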
+ url = self.root + "/post/show/" + post_id
page = self.request(url, retries=10).text
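+        # each extr() call resumes at 'pos', so markers are matched in page order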
-        file_url, pos = text.extract(page, '<li>Original: <a href="', '"')
+        tags   , pos = extr(page, "<title>", " | Sankaku Channel")
+        vavg   , pos = extr(page, "itemprop=ratingValue>", "<", pos)
+        vcnt   , pos = extr(page, "itemprop=reviewCount>", "<", pos)
+        _      , pos = extr(page, "Posted: <", "", pos)
+        created, pos = extr(page, ' title="', '"', pos)
+        rating = extr(page, "Rating: ", "<", pos)[0]
+
+        file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
         if file_url:
-            width , pos = text.extract(page, '>', 'x', pos)
-            height, pos = text.extract(page, '', ' ', pos)
+            width , pos = extr(page, '>', 'x', pos)
+            height, pos = extr(page, '', ' ', pos)
         else:
-            width , pos = text.extract(page, '<object width=', ' ', pos)