
[sankaku] rewrite/improve (fixes #44)

- add wait-time between HTTP requests similar to exhentai
- add 'wait-min' and 'wait-max' options
- increase retry-count for HTTP requests to 10
- implement user authentication (non-authenticated users can only view
  images up to page 25)
- implement 'skip()' functionality (only works up to page 50)
- implement image-retrieval for pages >= 51
- fix issue with multiple tags
Mike Fährmann 2017-10-14 23:01:33 +02:00
parent 9aecc67841
commit 6af921a952
6 changed files with 95 additions and 25 deletions

docs/configuration.rst

@@ -238,7 +238,8 @@ Description The username and password to use when attempting to log in to
 
             Specifying username and password is
             required for the ``pixiv``, ``nijie`` and ``seiga`` modules and
-            optional (but strongly recommended) for ``batoto`` and ``exhentai``.
+            optional (but strongly recommended) for ``batoto``, ``exhentai``
+            and ``sankaku``.
 
             These values can also be set via the ``-u/--username`` and
             ``-p/--password`` command-line options or by using a |.netrc|_ file.

@@ -522,6 +523,20 @@ Description The ``refresh_token`` value you get from linking your Reddit account
 =========== =====
 
 
+extractor.sankaku.wait-min & .wait-max
+--------------------------------------
+=========== =====
+Type        ``float``
+Default     ``2.0`` and ``4.0``
+Description Minimum and maximum wait time in seconds between each image
+            download.
+
+            Sankaku Channel responds with ``429 Too Many Requests`` if it
+            receives too many HTTP requests in a certain amount of time.
+            Waiting a few seconds between each request tries to prevent that.
+=========== =====
+
 
 API Tokens & IDs
 ================
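
For illustration, a user-side configuration enabling the new credentials and
wait-time options could look like this. All values are placeholders; only the
option names come from the documentation above:

    {
        "extractor":
        {
            "sankaku":
            {
                "username": "your-username",
                "password": "your-password",
                "wait-min": 3.0,
                "wait-max": 6.0
            }
        }
    }

If wait-max is set below wait-min, the extractor clamps wait-max up to
wait-min (see __init__ in the extractor diff below).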

docs/gallery-dl.conf

@@ -49,6 +49,11 @@
         "username": null,
         "password": null
     },
+    "sankaku":
+    {
+        "username": null,
+        "password": null
+    },
     "seiga":
     {
         "username": null,

docs/supportedsites.rst

@@ -63,7 +63,7 @@ RebeccaBlackTech     https://rbt.asia/                 Threads
 Reddit               https://reddit.com/               Submissions, Subreddits Optional (OAuth)
 Rule 34              https://rule34.xxx/               Posts, Tag-Searches
 Safebooru            https://safebooru.org/            Posts, Tag-Searches
-Sankaku Channel      https://chan.sankakucomplex.com/  Tag-Searches
+Sankaku Channel      https://chan.sankakucomplex.com/  Tag-Searches            Optional
 Sea Otter Scans      https://reader.seaotterscans.com/ Chapters, Manga
 Sen Manga            http://raw.senmanga.com/          Chapters
 Sense-Scans          http://sensescans.com/            Chapters, Manga

gallery_dl/extractor/sankaku.py

@@ -8,11 +8,14 @@
 
 """Extract images from https://chan.sankakucomplex.com/"""
 
-from .common import AsynchronousExtractor, Message
-from .. import text
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+import time
+import random
 
 
-class SankakuTagExtractor(AsynchronousExtractor):
+class SankakuTagExtractor(Extractor):
     """Extractor for images from chan.sankakucomplex.com by search-tags"""
     category = "sankaku"
     subcategory = "tag"

@@ -24,16 +27,30 @@ class SankakuTagExtractor(AsynchronousExtractor):
         "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                     r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
     })]
-    url = "https://chan.sankakucomplex.com/"
+    root = "https://chan.sankakucomplex.com"
+    cookienames = ("login", "pass_hash")
+    cookiedomain = "chan.sankakucomplex.com"
 
     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.tags = text.unquote(match.group(1))
+        Extractor.__init__(self)
+        self.logged_in = True
+        self.pagestart = 1
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+        self.wait_min = self.config("wait-min", 2)
+        self.wait_max = self.config("wait-max", 4)
+        if self.wait_max < self.wait_min:
+            self.wait_max = self.wait_min
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 Gecko/20100101 Firefox/40.0"
+        )
+
+    def skip(self, num):
+        pages = min(num // 20, 49)
+        self.pagestart += pages
+        return pages * 20
 
     def items(self):
+        self.login()
         data = self.get_job_metadata()
         yield Message.Version, 1
         yield Message.Directory, data

@@ -48,36 +65,67 @@ class SankakuTagExtractor(AsynchronousExtractor):
     def get_images(self):
         params = {
             "tags": self.tags,
-            "page": 1,
+            "page": self.pagestart,
         }
-        while True:
-            count = 0
-            page = self.request(self.url, params=params).text
+        while self.logged_in or params["page"] <= 25:
+            image = None
+            page = self.request(self.root, params=params, retries=10).text
             pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]
-            while True:
-                image_id, pos = text.extract(
-                    page, '<span class="thumb blacklisted" id=p', '>', pos
-                )
-                if not image_id:
-                    break
+            for image_id in text.extract_iter(
+                    page, '<span class="thumb blacklisted" id=p', '>', pos):
+                self.wait()
                 image = self.get_image_metadata(image_id)
-                count += 1
                 yield image
-            if count < 20:
+            if not image:
                 return
             params["page"] += 1
+            params["next"] = image["id"] - 1
 
     def get_image_metadata(self, image_id):
         url = "https://chan.sankakucomplex.com/post/show/" + image_id
-        page = self.request(url).text
+        page = self.request(url, retries=10).text
         image_url, pos = text.extract(page, '<li>Original: <a href="', '"')
         width    , pos = text.extract(page, '>', 'x', pos)
         height   , pos = text.extract(page, '', ' ', pos)
         data = text.nameext_from_url(image_url, {
-            "id": image_id,
+            "id": util.safe_int(image_id),
             "file_url": "https:" + text.unescape(image_url),
-            "width": width,
-            "height": height,
+            "width": util.safe_int(width),
+            "height": util.safe_int(height),
         })
         data["md5"] = data["name"]
         return data
+
+    def wait(self):
+        """Wait for a randomly chosen amount of seconds"""
+        time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+    def login(self):
+        """Login and set necessary cookies"""
+        if self._check_cookies(self.cookienames):
+            return
+        username, password = self._get_auth_info()
+        if username:
+            cookies = self._login_impl(username, password)
+            for key, value in cookies.items():
+                self.session.cookies.set(
+                    key, value, domain=self.cookiedomain)
+        else:
+            self.logged_in = False
+
+    @cache(maxage=90*24*60*60, keyarg=1)
+    def _login_impl(self, username, password):
+        """Actual login implementation"""
+        self.log.info("Logging in as %s", username)
+        params = {
+            "url": "",
+            "user[name]": username,
+            "user[password]": password,
+            "commit": "Login",
+        }
+        response = self.request(self.root + "/user/authenticate",
+                                method="POST", params=params)
+        if not response.history or response.url != self.root + "/user/home":
+            raise exception.AuthenticationError()
+        response = response.history[0]
+        return {c: response.cookies[c] for c in self.cookienames}
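
As a reading aid, here is a minimal standalone sketch of the three mechanisms
the extractor above combines: a randomized wait between requests, page-based
skipping capped at page 50, and continued pagination via the 'next' parameter
for pages >= 51. Only the constants (20 posts per page, at most 50 addressable
pages, 2-4 second waits) are taken from the diff; the function names and the
example post id are hypothetical and not part of gallery-dl's API.

    import random
    import time

    WAIT_MIN, WAIT_MAX = 2.0, 4.0   # defaults of the new wait-min/wait-max


    def wait():
        """Sleep 2-4 seconds so Sankaku doesn't answer with HTTP 429."""
        time.sleep(random.uniform(WAIT_MIN, WAIT_MAX))


    def skip(num, pagestart=1):
        """Skip 'num' posts by advancing whole pages of 20 posts each.

        Only pages up to 50 are addressable by number, so at most
        49 pages (980 posts) can be skipped this way.
        """
        pages = min(num // 20, 49)
        return pagestart + pages, pages * 20


    def advance(params, last_post_id):
        """Advance pagination like the tail of get_images() above.

        Beyond page 50 the page number alone no longer works; passing
        'next' = (id of the last post seen) - 1 keeps pagination going,
        since post ids decrease within a search result.
        """
        params["page"] += 1
        params["next"] = last_post_id - 1
        return params


    if __name__ == "__main__":
        print(skip(120))                       # (7, 120): start at page 7
        print(advance({"page": 50}, 6507860))  # {'page': 51, 'next': 6507859}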

scripts/supportedsites.py

@@ -82,6 +82,7 @@ AUTH_MAP = {
     "nijie"   : "Required",
     "pixiv"   : "Required",
     "reddit"  : "Optional (OAuth)",
+    "sankaku" : "Optional",
     "seiga"   : "Required",
 }

test/test_cookies.py

@@ -90,7 +90,7 @@ class TestCookiedict(unittest.TestCase):
         self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values()))
 
     def test_domain(self):
-        for category in ["batoto", "exhentai", "nijie", "seiga"]:
+        for category in ["batoto", "exhentai", "nijie", "sankaku", "seiga"]:
             extr = _get_extractor(category)
             cookies = extr.session.cookies
             for key in self.cdict.keys():

@@ -109,6 +109,7 @@ class TestCookieLogin(unittest.TestCase):
             "batoto": ("member_id", "pass_hash"),
             "exhentai": ("ipb_member_id", "ipb_pass_hash"),
             "nijie": ("nemail", "nlogin"),
+            "sankaku": ("login", "pass_hash"),
             "seiga": ("user_session",),
         }
         for category, cookienames in extr_cookies.items():