[tumblr] implement 'pagination' option (#5880)

restore pagination behavior from before de670bd7de
2024-11-22 02:32:33 +01:00 · 2024-07-23 20:31:04 +02:00 · 2024-07-23 20:31:04 +02:00 · 540eaa5add
commit 540eaa5add
parent 7b445ec255
2 changed files with 52 additions and 10 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -3735,6 +3735,23 @@ Description
    use an extra HTTP request to find the URL to its full-resolution version.


+extractor.tumblr.pagination
+---------------------------
+Type
+    ``string``
+Default
+    ``"offset"``
+Description
+    Controls how to paginate over blog posts.
+
+    * ``"api"``: ``next`` parameter provided by the API
+      (potentially misses posts due to a
+      `bug <https://github.com/tumblr/docs/issues/76>`__
+      in Tumblr's API)
+    * ``"before"``: ``before`` parameter set from the timestamp of the last post received
+    * ``"offset"``: ``offset`` parameter incremented by the page size after each request
+
+
 extractor.tumblr.ratelimit
 --------------------------
 Type
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@ -386,7 +386,7 @@ class TumblrAPI(oauth.OAuth1API):
    def posts(self, blog, params):
        """Retrieve published posts"""
        params["offset"] = self.extractor.config("offset")
-        params["limit"] = "50"
+        params["limit"] = 50
        params["reblog_info"] = "true"
        params["type"] = self.posts_type
        params["before"] = self.before
@ -398,8 +398,14 @@ class TumblrAPI(oauth.OAuth1API):

    def likes(self, blog):
        """Retrieve liked posts"""
+        endpoint = "/v2/blog/{}/likes".format(blog)
        params = {"limit": "50", "before": self.before}
-        return self._pagination(blog, "/likes", params, key="liked_posts")
+        while True:
+            posts = self._call(endpoint, params)["liked_posts"]
+            if not posts:
+                return
+            yield from posts
+            params["before"] = posts[-1]["liked_timestamp"]

    def _call(self, endpoint, params, **kwargs):
        url = self.ROOT + endpoint
@ -474,6 +480,7 @@ class TumblrAPI(oauth.OAuth1API):
        if self.api_key:
            params["api_key"] = self.api_key

+        strategy = self.extractor.config("pagination")
        while True:
            data = self._call(endpoint, params)

@ -481,13 +488,31 @@ class TumblrAPI(oauth.OAuth1API):
                self.BLOG_CACHE[blog] = data["blog"]
                cache = False

-            yield from data[key]
+            posts = data[key]
+            yield from posts

-            try:
-                endpoint = data["_links"]["next"]["href"]
-            except KeyError:
-                return
+            if strategy == "api":
+                try:
+                    endpoint = data["_links"]["next"]["href"]
+                except KeyError:
+                    return

-            params = None
-            if self.api_key:
-                endpoint += "&api_key=" + self.api_key
+                params = None
+                if self.api_key:
+                    endpoint += "&api_key=" + self.api_key
+
+            elif strategy == "before":
+                if not posts:
+                    return
+                timestamp = posts[-1]["timestamp"] + 1
+                if params["before"] and timestamp >= params["before"]:
+                    return
+                params["before"] = timestamp
+                params["offset"] = None
+
+            else:  # offset
+                params["offset"] = \
+                    text.parse_int(params["offset"]) + params["limit"]
+                params["before"] = None
+                if params["offset"] >= data["total_posts"]:
+                    return