From d2ef9a590f0cfc4591da68c263372f9e91c98184 Mon Sep 17 00:00:00 2001
From: Allen <64094914+allendema@users.noreply.github.com>
Date: Tue, 3 Sep 2024 08:18:58 +0200
Subject: [PATCH 1/3] [tumblr] add search extractor

---
 gallery_dl/extractor/tumblr.py | 78 ++++++++++++++++++++++++++++++----
 test/results/tumblr.py         | 11 +++++
 2 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 73455d2f..4a32879f 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -11,6 +11,7 @@
 from .common import Extractor, Message
 from .. import text, util, oauth, exception
 from datetime import datetime, date, timedelta
+from urllib.parse import urlparse
 import re
 
 
@@ -22,7 +23,7 @@ BASE_PATTERN = (
 )
 
 POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
 
 
 class TumblrExtractor(Extractor):
@@ -37,11 +38,14 @@ class TumblrExtractor(Extractor):
         Extractor.__init__(self, match)
 
         name = match.group(2)
+
         if name:
             self.blog = name + ".tumblr.com"
         else:
             self.blog = match.group(1) or match.group(3)
 
+        self.is_timeline = False
+
     def _init(self):
         self.api = TumblrAPI(self)
         self.types = self._setup_posttypes()
@@ -83,12 +87,23 @@ class TumblrExtractor(Extractor):
                 return
             if post["type"] not in self.types:
                 continue
+
             if not blog:
-                blog = self.api.info(self.blog)
-                blog["uuid"] = self.blog
+                if self.is_timeline:
+                    blog = post.get("blog")
+                    self.blog = blog.get("name") + ".tumblr.com"
+
+                    for image in blog.get("avatar", []):
+                        if int(image.get("width")) == 512:
+                            avatar_url = image.get("url")
+                            break
+
+                else:
+                    blog = self.api.info(self.blog)
+                    blog["uuid"] = self.blog
 
                 if self.avatar:
-                    url = self.api.avatar(self.blog)
+                    url = avatar_url or self.api.avatar(self.blog)
                     yield Message.Directory, {"blog": blog}
                     yield self._prepare_avatar(url, post.copy(), blog)
 
@@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor):
         return self.api.likes(self.blog)
 
 
+class TumblrSearchExtractor(TumblrExtractor):
+    """Extractor for a Tumblr search"""
+    subcategory = "search"
+    """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """
+    pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
+    example = "https://www.tumblr.com/search/QUERY"
+
+    def __init__(self, match):
+        TumblrExtractor.__init__(self, match)
+
+        self.is_timeline = True
+        self.query = text.unquote(match.group(4))
+
+        parsed_url = urlparse(self.url)
+        self.params = text.parse_query(parsed_url.query)
+
+    def search(self, query, params):
+        """Retrieve published posts"""
+
+        params["limit"] = 50
+        params["days"] = self.params.get("t") or 0
+        params["query"] = query
+        params["mode"] = "top"
+        params["reblog_info"] = "true" if self.reblogs else "false"
+
+        endpoint = "/v2/timeline/search"
+        return self.api._pagination(endpoint, params, cache=True)
+
+    def posts(self):
+        return self.search(self.query, {})
+
+
 class TumblrAPI(oauth.OAuth1API):
     """Interface for the Tumblr API v2
 
@@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API):
         if self.before and params["offset"]:
             self.log.warning("'offset' and 'date-max' cannot be used together")
 
-        return self._pagination(blog, "/posts", params, cache=True)
+        endpoint = "/v2/blog/{}/posts".format(blog)
+        return self._pagination(endpoint, params, cache=True)
 
     def likes(self, blog):
         """Retrieve liked posts"""
@@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API):
 
             raise exception.StopExtraction(data)
 
-    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
-        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+    def _pagination(self, full_endpoint, params, key="posts", cache=False):
+        if not full_endpoint.endswith("?"):
+            full_endpoint = full_endpoint + "?"
+
+        endpoint = full_endpoint
+
         if self.api_key:
             params["api_key"] = self.api_key
 
         strategy = self.extractor.config("pagination")
+
         while True:
             data = self._call(endpoint, params)
 
+            if "/timeline/" in endpoint:
+                key = "elements"
+                posts = data.get("timeline", {}).get(key, [])
+            else:
+                posts = data[key]
+
             if cache:
-                self.BLOG_CACHE[blog] = data["blog"]
+                for post in posts:
+                    p_blog = post.get("blog", {})
+                    self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
                 cache = False
 
-            posts = data[key]
             yield from posts
 
             if strategy == "api":
diff --git a/test/results/tumblr.py b/test/results/tumblr.py
index 4d0d6abc..74de2f64 100644
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -360,4 +360,15 @@ __tests__ = (
     "#class"   : tumblr.TumblrLikesExtractor,
 },
 
+{
+    "#url"     : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
+
+{
+    "#url"     : "https://www.tumblr.com/search/nathan%20fielder?t=90",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
 )

From 0f94fa9015615baae379bc035d905844a2043432 Mon Sep 17 00:00:00 2001
From: Allen <64094914+allendema@users.noreply.github.com>
Date: Tue, 29 Oct 2024 12:44:27 +0100
Subject: [PATCH 2/3] [tumblr] search extractor minimal styling changes

---
 docs/supportedsites.md         |  2 +-
 gallery_dl/extractor/tumblr.py | 16 ++++++----------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 2790476f..4f2c0a2e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -886,7 +886,7 @@ Consider all listed sites to potentially be NSFW.
 <tr>
     <td>Tumblr</td>
     <td>https://www.tumblr.com/</td>
-    <td>Days, Likes, Posts, Tag Searches, User Profiles</td>
+    <td>Days, Likes, Posts, Search Results, Tag Searches, User Profiles</td>
     <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
 </tr>
 <tr>
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 4a32879f..8fc46f63 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -22,8 +22,8 @@ BASE_PATTERN = (
     r"([\w-]+\.tumblr\.com)))"
 )
 
-POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
+POST_TYPES = frozenset(("text", "quote", "link", "answer", "video",
+                        "audio", "photo", "chat", "search"))
 
 
 class TumblrExtractor(Extractor):
@@ -38,7 +38,6 @@ class TumblrExtractor(Extractor):
         Extractor.__init__(self, match)
 
         name = match.group(2)
-
         if name:
             self.blog = name + ".tumblr.com"
         else:
@@ -367,7 +366,6 @@ class TumblrLikesExtractor(TumblrExtractor):
 class TumblrSearchExtractor(TumblrExtractor):
     """Extractor for a Tumblr search"""
     subcategory = "search"
-    """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """
     pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
     example = "https://www.tumblr.com/search/QUERY"
 
@@ -381,7 +379,7 @@ class TumblrSearchExtractor(TumblrExtractor):
         self.params = text.parse_query(parsed_url.query)
 
     def search(self, query, params):
-        """Retrieve published posts"""
+        """Retrieve search results"""
 
         params["limit"] = 50
         params["days"] = self.params.get("t") or 0
@@ -526,11 +524,9 @@ class TumblrAPI(oauth.OAuth1API):
 
             raise exception.StopExtraction(data)
 
-    def _pagination(self, full_endpoint, params, key="posts", cache=False):
-        if not full_endpoint.endswith("?"):
-            full_endpoint = full_endpoint + "?"
-
-        endpoint = full_endpoint
+    def _pagination(self, endpoint, params, key="posts", cache=False):
+        if endpoint[-1] != "?":
+            endpoint += "?"
 
         if self.api_key:
             params["api_key"] = self.api_key

From 33778d35baa14c8c0edfbdd8305822f6e1a73f5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 7 Nov 2024 22:15:49 +0100
Subject: [PATCH 3/3] [tumblr] update search extractor

- simplify extractor and pagination code
- fix search pagination
- support custom search mode and post types
---
 gallery_dl/extractor/tumblr.py | 100 ++++++++++++++-------------------
 test/results/tumblr.py         |   8 ++-
 2 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 8fc46f63..5dcd3374 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, util, oauth, exception
 from datetime import datetime, date, timedelta
-from urllib.parse import urlparse
 import re
 
 
@@ -43,8 +42,6 @@ class TumblrExtractor(Extractor):
         else:
             self.blog = match.group(1) or match.group(3)
 
-        self.is_timeline = False
-
     def _init(self):
         self.api = TumblrAPI(self)
         self.types = self._setup_posttypes()
@@ -87,24 +84,20 @@ class TumblrExtractor(Extractor):
             if post["type"] not in self.types:
                 continue
 
-            if not blog:
-                if self.is_timeline:
-                    blog = post.get("blog")
-                    self.blog = blog.get("name") + ".tumblr.com"
-
-                    for image in blog.get("avatar", []):
-                        if int(image.get("width")) == 512:
-                            avatar_url = image.get("url")
-                            break
-
-                else:
+            if "blog" in post:
+                blog = post["blog"]
+                self.blog = blog["name"] + ".tumblr.com"
+            else:
+                if not blog:
                     blog = self.api.info(self.blog)
                     blog["uuid"] = self.blog
 
-                if self.avatar:
-                    url = avatar_url or self.api.avatar(self.blog)
-                    yield Message.Directory, {"blog": blog}
-                    yield self._prepare_avatar(url, post.copy(), blog)
+                    if self.avatar:
+                        url = self.api.avatar(self.blog)
+                        yield Message.Directory, {"blog": blog}
+                        yield self._prepare_avatar(url, post.copy(), blog)
+
+                post["blog"] = blog
 
             reblog = "reblogged_from_id" in post
             if reblog and self._skip_reblog(post):
@@ -113,7 +106,6 @@ class TumblrExtractor(Extractor):
 
             if "trail" in post:
                 del post["trail"]
-            post["blog"] = blog
             post["date"] = text.parse_timestamp(post["timestamp"])
             posts = []
 
@@ -366,32 +358,14 @@ class TumblrLikesExtractor(TumblrExtractor):
 class TumblrSearchExtractor(TumblrExtractor):
     """Extractor for a Tumblr search"""
     subcategory = "search"
-    pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
+    pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+               r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
     example = "https://www.tumblr.com/search/QUERY"
 
-    def __init__(self, match):
-        TumblrExtractor.__init__(self, match)
-
-        self.is_timeline = True
-        self.query = text.unquote(match.group(4))
-
-        parsed_url = urlparse(self.url)
-        self.params = text.parse_query(parsed_url.query)
-
-    def search(self, query, params):
-        """Retrieve search results"""
-
-        params["limit"] = 50
-        params["days"] = self.params.get("t") or 0
-        params["query"] = query
-        params["mode"] = "top"
-        params["reblog_info"] = "true" if self.reblogs else "false"
-
-        endpoint = "/v2/timeline/search"
-        return self.api._pagination(endpoint, params, cache=True)
-
     def posts(self):
-        return self.search(self.query, {})
+        _, _, _, search, mode, post_type, query = self.groups
+        params = text.parse_query(query)
+        return self.api.search(text.unquote(search), params, mode, post_type)
 
 
 class TumblrAPI(oauth.OAuth1API):
@@ -440,7 +414,7 @@ class TumblrAPI(oauth.OAuth1API):
             self.log.warning("'offset' and 'date-max' cannot be used together")
 
         endpoint = "/v2/blog/{}/posts".format(blog)
-        return self._pagination(endpoint, params, cache=True)
+        return self._pagination(endpoint, params, blog=blog, cache=True)
 
     def likes(self, blog):
         """Retrieve liked posts"""
@@ -456,6 +430,20 @@ class TumblrAPI(oauth.OAuth1API):
             yield from posts
             params["before"] = posts[-1]["liked_timestamp"]
 
+    def search(self, query, params, mode="top", post_type=None):
+        """Retrieve search results"""
+        endpoint = "/v2/timeline/search"
+
+        params["limit"] = "50"
+        params["days"] = params.pop("t", None)
+        params["query"] = query
+        params["mode"] = mode
+        params["reblog_info"] = "true" if self.extractor.reblogs else "false"
+        if post_type:
+            params["post_type_filter"] = post_type
+
+        return self._pagination(endpoint, params)
+
     def _call(self, endpoint, params, **kwargs):
         url = self.ROOT + endpoint
         kwargs["params"] = params
@@ -524,29 +512,27 @@ class TumblrAPI(oauth.OAuth1API):
 
             raise exception.StopExtraction(data)
 
-    def _pagination(self, endpoint, params, key="posts", cache=False):
-        if endpoint[-1] != "?":
-            endpoint += "?"
-
+    def _pagination(self, endpoint, params,
+                    blog=None, key="posts", cache=False):
         if self.api_key:
             params["api_key"] = self.api_key
 
         strategy = self.extractor.config("pagination")
+        if not strategy and "offset" not in params:
+            strategy = "api"
 
         while True:
             data = self._call(endpoint, params)
 
-            if "/timeline/" in endpoint:
-                key = "elements"
-                posts = data.get("timeline", {}).get(key, [])
-            else:
-                posts = data[key]
+            if "timeline" in data:
+                data = data["timeline"]
+                posts = data["elements"]
 
-            if cache:
-                for post in posts:
-                    p_blog = post.get("blog", {})
-                    self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
-                cache = False
+            else:
+                if cache:
+                    self.BLOG_CACHE[blog] = data["blog"]
+                    cache = False
+                posts = data[key]
 
             yield from posts
 
diff --git a/test/results/tumblr.py b/test/results/tumblr.py
index 74de2f64..50b67676 100644
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -361,7 +361,13 @@ __tests__ = (
 },
 
 {
-    "#url"     : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
+    "#url"     : "https://www.tumblr.com/search/nathan fielder",
+    "#category": ("", "tumblr", "search"),
+    "#class"   : tumblr.TumblrSearchExtractor,
+},
+
+{
+    "#url"     : "https://www.tumblr.com/search/nathan fielder/recent/quote?src=typed_query",
     "#category": ("", "tumblr", "search"),
     "#class"   : tumblr.TumblrSearchExtractor,
 },