mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-24 19:52:32 +01:00

[tumblr] update

- simplify
- fix search pagination
- support custom search mode and post types
Mike Fährmann 2024-11-07 22:15:49 +01:00
parent 0f94fa9015
commit 33778d35ba
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 50 additions and 58 deletions
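The reworked TumblrSearchExtractor pattern (first file below) splits a search URL into the search term, an optional sort mode, an optional post type, and an optional query string. A minimal sketch of how those groups fall out, with BASE_PATTERN replaced by a simplified stand-in (an assumption for this sketch; the real BASE_PATTERN also matches blog-style URLs and contributes three capture groups of its own, hence the "_, _, _" unpacking in posts()):

import re

# Simplified stand-in for gallery-dl's BASE_PATTERN (assumption for this sketch)
BASE = r"(?:https?://)?(?:www\.)?tumblr\.com"
SEARCH = re.compile(BASE + r"/search/([^/?#]+)"
                    r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")

print(SEARCH.match("https://www.tumblr.com/search/nathan fielder").groups())
# ('nathan fielder', None, None, None)
print(SEARCH.match("https://www.tumblr.com/search/nathan fielder"
                   "/recent/quote?src=typed_query").groups())
# ('nathan fielder', 'recent', 'quote', 'src=typed_query')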

View File

@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, util, oauth, exception
 from datetime import datetime, date, timedelta
-from urllib.parse import urlparse
 import re
@@ -43,8 +42,6 @@ class TumblrExtractor(Extractor):
         else:
             self.blog = match.group(1) or match.group(3)

-        self.is_timeline = False
-
     def _init(self):
         self.api = TumblrAPI(self)
         self.types = self._setup_posttypes()
@@ -87,24 +84,20 @@ class TumblrExtractor(Extractor):
             if post["type"] not in self.types:
                 continue

-            if not blog:
-                if self.is_timeline:
-                    blog = post.get("blog")
-                    self.blog = blog.get("name") + ".tumblr.com"
-                    for image in blog.get("avatar", []):
-                        if int(image.get("width")) == 512:
-                            avatar_url = image.get("url")
-                            break
-                else:
+            if "blog" in post:
+                blog = post["blog"]
+                self.blog = blog["name"] + ".tumblr.com"
+            else:
+                if not blog:
                     blog = self.api.info(self.blog)
                     blog["uuid"] = self.blog

-                if self.avatar:
-                    url = avatar_url or self.api.avatar(self.blog)
-                    yield Message.Directory, {"blog": blog}
-                    yield self._prepare_avatar(url, post.copy(), blog)
+            if self.avatar:
+                url = self.api.avatar(self.blog)
+                yield Message.Directory, {"blog": blog}
+                yield self._prepare_avatar(url, post.copy(), blog)
+
+            post["blog"] = blog
             reblog = "reblogged_from_id" in post
             if reblog and self._skip_reblog(post):
@@ -113,7 +106,6 @@ class TumblrExtractor(Extractor):
             if "trail" in post:
                 del post["trail"]
-            post["blog"] = blog
             post["date"] = text.parse_timestamp(post["timestamp"])

             posts = []
@@ -366,32 +358,14 @@ class TumblrLikesExtractor(TumblrExtractor):
 class TumblrSearchExtractor(TumblrExtractor):
     """Extractor for a Tumblr search"""
     subcategory = "search"
-    pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
+    pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+               r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
     example = "https://www.tumblr.com/search/QUERY"

-    def __init__(self, match):
-        TumblrExtractor.__init__(self, match)
-        self.is_timeline = True
-        self.query = text.unquote(match.group(4))
-        parsed_url = urlparse(self.url)
-        self.params = text.parse_query(parsed_url.query)
-
-    def search(self, query, params):
-        """Retrieve search results"""
-        params["limit"] = 50
-        params["days"] = self.params.get("t") or 0
-        params["query"] = query
-        params["mode"] = "top"
-        params["reblog_info"] = "true" if self.reblogs else "false"
-        endpoint = "/v2/timeline/search"
-        return self.api._pagination(endpoint, params, cache=True)
-
     def posts(self):
-        return self.search(self.query, {})
+        _, _, _, search, mode, post_type, query = self.groups
+        params = text.parse_query(query)
+        return self.api.search(text.unquote(search), params, mode, post_type)


 class TumblrAPI(oauth.OAuth1API):
@@ -440,7 +414,7 @@ class TumblrAPI(oauth.OAuth1API):
             self.log.warning("'offset' and 'date-max' cannot be used together")

         endpoint = "/v2/blog/{}/posts".format(blog)
-        return self._pagination(endpoint, params, cache=True)
+        return self._pagination(endpoint, params, blog=blog, cache=True)
def likes(self, blog):
"""Retrieve liked posts"""
@@ -456,6 +430,20 @@
             yield from posts
             params["before"] = posts[-1]["liked_timestamp"]

+    def search(self, query, params, mode="top", post_type=None):
+        """Retrieve search results"""
+        endpoint = "/v2/timeline/search"
+
+        params["limit"] = "50"
+        params["days"] = params.pop("t", None)
+        params["query"] = query
+        params["mode"] = mode
+        params["reblog_info"] = "true" if self.extractor.reblogs else "false"
+        if post_type:
+            params["post_type_filter"] = post_type
+
+        return self._pagination(endpoint, params)
+
     def _call(self, endpoint, params, **kwargs):
         url = self.ROOT + endpoint
         kwargs["params"] = params
@@ -524,29 +512,27 @@
         raise exception.StopExtraction(data)

-    def _pagination(self, endpoint, params, key="posts", cache=False):
-        if endpoint[-1] != "?":
-            endpoint += "?"
+    def _pagination(self, endpoint, params,
+                    blog=None, key="posts", cache=False):
         if self.api_key:
             params["api_key"] = self.api_key

         strategy = self.extractor.config("pagination")
         if not strategy and "offset" not in params:
             strategy = "api"

         while True:
             data = self._call(endpoint, params)

-            if "/timeline/" in endpoint:
-                key = "elements"
-                posts = data.get("timeline", {}).get(key, [])
-            else:
-                posts = data[key]
+            if "timeline" in data:
+                data = data["timeline"]
+                posts = data["elements"]

-            if cache:
-                for post in posts:
-                    p_blog = post.get("blog", {})
-                    self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
-                cache = False
+            else:
+                if cache:
+                    self.BLOG_CACHE[blog] = data["blog"]
+                    cache = False
+                posts = data[key]

             yield from posts
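For reference, the request that the new TumblrAPI.search() assembles can be sketched on its own. This is an approximation for illustration only: text.parse_query() and text.unquote() are replaced by their urllib.parse counterparts, and the extractor's reblogs option is passed in as a plain flag.

from urllib.parse import parse_qsl, unquote

def build_search_request(search, mode="top", post_type=None,
                         query_string="", reblogs=True):
    # Mirrors the parameter assembly of TumblrAPI.search() above (sketch only)
    params = dict(parse_qsl(query_string))
    params["limit"] = "50"
    params["days"] = params.pop("t", None)   # the URL's "t" value is forwarded as "days"
    params["query"] = unquote(search)
    params["mode"] = mode
    params["reblog_info"] = "true" if reblogs else "false"
    if post_type:
        params["post_type_filter"] = post_type
    return "/v2/timeline/search", params

# A URL such as https://www.tumblr.com/search/cats/recent/photo?t=7 would
# roughly translate to:
#   ("/v2/timeline/search",
#    {"limit": "50", "days": "7", "query": "cats", "mode": "recent",
#     "reblog_info": "true", "post_type_filter": "photo"})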

View File

@@ -361,7 +361,13 @@ __tests__ = (
 },

 {
-    "#url"     : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
+    "#url"     : "https://www.tumblr.com/search/nathan fielder",
     "#category": ("", "tumblr", "search"),
     "#class"   : tumblr.TumblrSearchExtractor,
 },
+
+{
+    "#url"     : "https://www.tumblr.com/search/nathan fielder/recent/quote?src=typed_query",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},