merge #6394: [tumblr] add 'search' extractor

2024-11-25 04:02:32 +01:00 · 2024-11-08 08:17:46 +01:00 · 2024-11-08 08:17:46 +01:00 · 6205e255f4
commit 6205e255f4
parent ce90566c56 33778d35ba
3 changed files with 77 additions and 18 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -934,7 +934,7 @@ Consider all listed sites to potentially be NSFW.
 <tr>
    <td>Tumblr</td>
    <td>https://www.tumblr.com/</td>
-    <td>Days, Likes, Posts, Tag Searches, User Profiles</td>
+    <td>Days, Likes, Posts, Search Results, Tag Searches, User Profiles</td>
    <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
 </tr>
 <tr>
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -21,8 +21,8 @@ BASE_PATTERN = (
    r"([\w-]+\.tumblr\.com)))"
 )

-POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+POST_TYPES = frozenset(("text", "quote", "link", "answer", "video",
+                        "audio", "photo", "chat", "search"))


 class TumblrExtractor(Extractor):
@@ -83,6 +83,11 @@ class TumblrExtractor(Extractor):
                return
            if post["type"] not in self.types:
                continue
+
+            if "blog" in post:
+                blog = post["blog"]
+                self.blog = blog["name"] + ".tumblr.com"
+            else:
                if not blog:
                    blog = self.api.info(self.blog)
                    blog["uuid"] = self.blog
@@ -92,6 +97,8 @@ class TumblrExtractor(Extractor):
                        yield Message.Directory, {"blog": blog}
                        yield self._prepare_avatar(url, post.copy(), blog)

+                post["blog"] = blog
+
            reblog = "reblogged_from_id" in post
            if reblog and self._skip_reblog(post):
                continue
@@ -99,7 +106,6 @@ class TumblrExtractor(Extractor):

            if "trail" in post:
                del post["trail"]
-            post["blog"] = blog
            post["date"] = text.parse_timestamp(post["timestamp"])
            posts = []

@@ -349,6 +355,19 @@ class TumblrLikesExtractor(TumblrExtractor):
        return self.api.likes(self.blog)


+class TumblrSearchExtractor(TumblrExtractor):
+    """Extractor for a Tumblr search"""
+    subcategory = "search"
+    pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+               r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
+    example = "https://www.tumblr.com/search/QUERY"
+
+    def posts(self):
+        _, _, _, search, mode, post_type, query = self.groups
+        params = text.parse_query(query)
+        return self.api.search(text.unquote(search), params, mode, post_type)
+
+
 class TumblrAPI(oauth.OAuth1API):
    """Interface for the Tumblr API v2

@@ -394,7 +413,8 @@ class TumblrAPI(oauth.OAuth1API):
        if self.before and params["offset"]:
            self.log.warning("'offset' and 'date-max' cannot be used together")

-        return self._pagination(blog, "/posts", params, cache=True)
+        endpoint = "/v2/blog/{}/posts".format(blog)
+        return self._pagination(endpoint, params, blog=blog, cache=True)

    def likes(self, blog):
        """Retrieve liked posts"""
@@ -410,6 +430,20 @@ class TumblrAPI(oauth.OAuth1API):
            yield from posts
            params["before"] = posts[-1]["liked_timestamp"]

+    def search(self, query, params, mode="top", post_type=None):
+        """Retrieve search results"""
+        endpoint = "/v2/timeline/search"
+
+        params["limit"] = "50"
+        params["days"] = params.pop("t", None)
+        params["query"] = query
+        params["mode"] = mode
+        params["reblog_info"] = "true" if self.extractor.reblogs else "false"
+        if post_type:
+            params["post_type_filter"] = post_type
+
+        return self._pagination(endpoint, params)
+
    def _call(self, endpoint, params, **kwargs):
        url = self.ROOT + endpoint
        kwargs["params"] = params
@@ -478,20 +512,28 @@ class TumblrAPI(oauth.OAuth1API):

            raise exception.StopExtraction(data)

-    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
-        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+    def _pagination(self, endpoint, params,
+                    blog=None, key="posts", cache=False):
        if self.api_key:
            params["api_key"] = self.api_key

        strategy = self.extractor.config("pagination")
+        if not strategy and "offset" not in params:
+            strategy = "api"
+
        while True:
            data = self._call(endpoint, params)

+            if "timeline" in data:
+                data = data["timeline"]
+                posts = data["elements"]
+
+            else:
                if cache:
                    self.BLOG_CACHE[blog] = data["blog"]
                    cache = False
-
                posts = data[key]
+
            yield from posts

            if strategy == "api":
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -360,4 +360,21 @@ __tests__ = (
    "#class"   : tumblr.TumblrLikesExtractor,
 },

+{
+    "#url"     : "https://www.tumblr.com/search/nathan fielder",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
+
+{
+    "#url"     : "https://www.tumblr.com/search/nathan fielder/recent/quote?src=typed_query",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
+
+{
+    "#url"     : "https://www.tumblr.com/search/nathan%20fielder?t=90",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
 )