From 6af921a9523cf8a310acd121a88a5cdd3e878ea1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sat, 14 Oct 2017 23:01:33 +0200
Subject: [PATCH] [sankaku] rewrite/improve (fixes #44)

- add wait-time between HTTP requests similar to exhentai
- add 'wait-min' and 'wait-max' options
- increase retry-count for HTTP requests to 10
- implement user authentication (non-authenticated users can only
  view images up to page 25)
- implement 'skip()' functionality (only works up to page 50)
- implement image-retrieval for pages >= 51
- fix issue with multiple tags
---
 docs/configuration.rst          | 17 +++++-
 docs/gallery-dl.conf            |  5 ++
 docs/supportedsites.rst         |  2 +-
 gallery_dl/extractor/sankaku.py | 92 +++++++++++++++++++++++++--------
 scripts/build_supportedsites.py |  1 +
 test/test_cookies.py            |  3 +-
 6 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 340498d0..23bed31d 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -238,7 +238,8 @@ Description The username and password to use when attempting to log in to
             Specifying username and password is required for the
             ``pixiv``, ``nijie`` and ``seiga`` modules and
-            optional (but strongly recommended) for ``batoto`` and ``exhentai``.
+            optional (but strongly recommended) for ``batoto``, ``exhentai``
+            and ``sankaku``.
 
             These values can also be set via the ``-u/--username`` and
             ``-p/--password`` command-line options or by using a |.netrc|_ file.
@@ -522,6 +523,20 @@ Description The ``refresh_token`` value you get from linking your Reddit account
 =========== =====
 
 
+extractor.sankaku.wait-min & .wait-max
+--------------------------------------
+=========== =====
+Type        ``float``
+Default     ``2.0`` and ``4.0``
+Description Minimum and maximum wait time in seconds between each image
+
+            Sankaku Channel responds with ``429 Too Many Requests`` if it
+            receives too many HTTP requests in a certain amount of time.
+            Waiting a few seconds between each request tries to prevent that.
+=========== =====
+
+
+
 API Tokens & IDs
 ================
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 60d1a153..05092747 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -49,6 +49,11 @@
             "username": null,
             "password": null
         },
+        "sankaku":
+        {
+            "username": null,
+            "password": null
+        },
         "seiga":
         {
             "username": null,
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index 9d4d6e66..43d7e929 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -63,7 +63,7 @@
 RebeccaBlackTech   https://rbt.asia/                 Threads
 Reddit             https://reddit.com/               Submissions, Subreddits Optional (OAuth)
 Rule 34            https://rule34.xxx/               Posts, Tag-Searches
 Safebooru          https://safebooru.org/            Posts, Tag-Searches
-Sankaku Channel    https://chan.sankakucomplex.com/  Tag-Searches
+Sankaku Channel    https://chan.sankakucomplex.com/  Tag-Searches            Optional
 Sea Otter Scans    https://reader.seaotterscans.com/ Chapters, Manga
 Sen Manga          http://raw.senmanga.com/          Chapters
 Sense-Scans        http://sensescans.com/            Chapters, Manga
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 9c2bf6c5..32cf69e9 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -8,11 +8,14 @@
 
 """Extract images from https://chan.sankakucomplex.com/"""
 
-from .common import AsynchronousExtractor, Message
-from .. import text
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+import time
+import random
 
 
-class SankakuTagExtractor(AsynchronousExtractor):
+class SankakuTagExtractor(Extractor):
     """Extractor for images from chan.sankakucomplex.com by search-tags"""
     category = "sankaku"
     subcategory = "tag"
@@ -24,16 +27,30 @@ class SankakuTagExtractor(AsynchronousExtractor):
         "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                     r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
     })]
-    url = "https://chan.sankakucomplex.com/"
+    root = "https://chan.sankakucomplex.com"
+    cookienames = ("login", "pass_hash")
+    cookiedomain = "chan.sankakucomplex.com"
 
     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.tags = text.unquote(match.group(1))
+        Extractor.__init__(self)
+        self.logged_in = True
+        self.pagestart = 1
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+        self.wait_min = self.config("wait-min", 2)
+        self.wait_max = self.config("wait-max", 4)
+        if self.wait_max < self.wait_min:
+            self.wait_max = self.wait_min
         self.session.headers["User-Agent"] = (
             "Mozilla/5.0 Gecko/20100101 Firefox/40.0"
         )
 
+    def skip(self, num):
+        pages = min(num // 20, 49)
+        self.pagestart += pages
+        return pages * 20
+
     def items(self):
+        self.login()
         data = self.get_job_metadata()
         yield Message.Version, 1
         yield Message.Directory, data
@@ -48,36 +65,67 @@ class SankakuTagExtractor(AsynchronousExtractor):
     def get_images(self):
         params = {
             "tags": self.tags,
-            "page": 1,
+            "page": self.pagestart,
         }
-        while True:
-            count = 0
-            page = self.request(self.url, params=params).text
+        while self.logged_in or params["page"] <= 25:
+            image = None
+            page = self.request(self.root, params=params, retries=10).text
             pos = text.extract(page, '
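
Note: the hunk above ends before the code that actually sleeps between
requests, so the throttle itself is not visible. A minimal sketch of what
the commit message and the 'wait-min' / 'wait-max' handling in __init__
imply; the helper name 'wait' and its call sites are assumptions, but the
patch's 'time' and 'random' imports suggest a randomized sleep along
these lines:

    import random
    import time

    def wait(wait_min=2.0, wait_max=4.0):
        """Sleep for a random duration between wait_min and wait_max seconds."""
        # Sankaku Channel answers "429 Too Many Requests" when queried
        # too quickly; a randomized pause between image requests avoids
        # that. Clamp wait_max the same way __init__ does above.
        if wait_max < wait_min:
            wait_max = wait_min
        time.sleep(random.uniform(wait_min, wait_max))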
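The skip() method in the diff works in whole result pages of 20 images
and is capped at 49 skipped pages, because starting beyond page 50 needs
the separate pages >= 51 handling mentioned in the commit message. A
standalone illustration of the arithmetic (the free function and its
signature are for demonstration only):

    def skip_pages(num, pagestart=1):
        # Same arithmetic as SankakuTagExtractor.skip(): turn an image
        # count into whole pages of 20 images, capped at 49 pages so
        # the extractor never starts beyond page 50.
        pages = min(num // 20, 49)
        return pagestart + pages, pages * 20

    # Skipping 1000 images would need 50 pages and hits the cap:
    print(skip_pages(1000))  # (50, 980) -> start on page 50, 980 skipped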
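Putting the new options together with the credentials added to
docs/gallery-dl.conf, a user configuration might look like this; a
sketch in which only "wait-min" and "wait-max" are new, the values shown
are the documented defaults, and the credential strings are placeholders:

    {
        "extractor":
        {
            "sankaku":
            {
                "username": "your-username",
                "password": "your-password",
                "wait-min": 2.0,
                "wait-max": 4.0
            }
        }
    }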