From d2ef9a590f0cfc4591da68c263372f9e91c98184 Mon Sep 17 00:00:00 2001
From: Allen <64094914+allendema@users.noreply.github.com>
Date: Tue, 3 Sep 2024 08:18:58 +0200
Subject: [PATCH 1/3] [tumblr] add search extractor
---
gallery_dl/extractor/tumblr.py | 78 ++++++++++++++++++++++++++++++----
test/results/tumblr.py | 11 +++++
2 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 73455d2f..4a32879f 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta
+from urllib.parse import urlparse
import re
@@ -22,7 +23,7 @@ BASE_PATTERN = (
)
POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
class TumblrExtractor(Extractor):
@@ -37,11 +38,14 @@ class TumblrExtractor(Extractor):
Extractor.__init__(self, match)
name = match.group(2)
+
if name:
self.blog = name + ".tumblr.com"
else:
self.blog = match.group(1) or match.group(3)
+ self.is_timeline = False
+
def _init(self):
self.api = TumblrAPI(self)
self.types = self._setup_posttypes()
@@ -83,12 +87,23 @@ class TumblrExtractor(Extractor):
return
if post["type"] not in self.types:
continue
+
if not blog:
- blog = self.api.info(self.blog)
- blog["uuid"] = self.blog
+ if self.is_timeline:
+ blog = post.get("blog")
+ self.blog = blog.get("name") + ".tumblr.com"
+
+ for image in blog.get("avatar", []):
+ if int(image.get("width")) == 512:
+ avatar_url = image.get("url")
+ break
+
+ else:
+ blog = self.api.info(self.blog)
+ blog["uuid"] = self.blog
if self.avatar:
- url = self.api.avatar(self.blog)
+ url = avatar_url or self.api.avatar(self.blog)
yield Message.Directory, {"blog": blog}
yield self._prepare_avatar(url, post.copy(), blog)
@@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor):
return self.api.likes(self.blog)
+class TumblrSearchExtractor(TumblrExtractor):
+ """Extractor for a Tumblr search"""
+ subcategory = "search"
+ """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """
+ pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
+ example = "https://www.tumblr.com/search/QUERY"
+
+ def __init__(self, match):
+ TumblrExtractor.__init__(self, match)
+
+ self.is_timeline = True
+ self.query = text.unquote(match.group(4))
+
+ parsed_url = urlparse(self.url)
+ self.params = text.parse_query(parsed_url.query)
+
+ def search(self, query, params):
+ """Retrieve published posts"""
+
+ params["limit"] = 50
+ params["days"] = self.params.get("t") or 0
+ params["query"] = query
+ params["mode"] = "top"
+ params["reblog_info"] = "true" if self.reblogs else "false"
+
+ endpoint = "/v2/timeline/search"
+ return self.api._pagination(endpoint, params, cache=True)
+
+ def posts(self):
+ return self.search(self.query, {})
+
+
class TumblrAPI(oauth.OAuth1API):
"""Interface for the Tumblr API v2
@@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API):
if self.before and params["offset"]:
self.log.warning("'offset' and 'date-max' cannot be used together")
- return self._pagination(blog, "/posts", params, cache=True)
+ endpoint = "/v2/blog/{}/posts".format(blog)
+ return self._pagination(endpoint, params, cache=True)
def likes(self, blog):
"""Retrieve liked posts"""
@@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API):
raise exception.StopExtraction(data)
- def _pagination(self, blog, endpoint, params, key="posts", cache=False):
- endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+ def _pagination(self, full_endpoint, params, key="posts", cache=False):
+ if not full_endpoint.endswith("?"):
+ full_endpoint = full_endpoint + "?"
+
+ endpoint = full_endpoint
+
if self.api_key:
params["api_key"] = self.api_key
strategy = self.extractor.config("pagination")
+
while True:
data = self._call(endpoint, params)
+ if "/timeline/" in endpoint:
+ key = "elements"
+ posts = data.get("timeline", {}).get(key, [])
+ else:
+ posts = data[key]
+
if cache:
- self.BLOG_CACHE[blog] = data["blog"]
+ for post in posts:
+ p_blog = post.get("blog", {})
+ self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
cache = False
- posts = data[key]
yield from posts
if strategy == "api":
diff --git a/test/results/tumblr.py b/test/results/tumblr.py
index 4d0d6abc..74de2f64 100644
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -360,4 +360,15 @@ __tests__ = (
"#class" : tumblr.TumblrLikesExtractor,
},
+{
+ "#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
+ "#category": ("", "tumblr", "search"),
+ "#class" : tumblr.TumblrSearchExtractor,
+},
+
+{
+ "#url" : "https://www.tumblr.com/search/nathan%20fielder?t=90",
+ "#category": ("", "tumblr", "search"),
+ "#class" : tumblr.TumblrSearchExtractor,
+},
)
From 0f94fa9015615baae379bc035d905844a2043432 Mon Sep 17 00:00:00 2001
From: Allen <64094914+allendema@users.noreply.github.com>
Date: Tue, 29 Oct 2024 12:44:27 +0100
Subject: [PATCH 2/3] [tumblr] minor style cleanup for search extractor
---
docs/supportedsites.md | 2 +-
gallery_dl/extractor/tumblr.py | 16 ++++++----------
2 files changed, 7 insertions(+), 11 deletions(-)
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 2790476f..4f2c0a2e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -886,7 +886,7 @@ Consider all listed sites to potentially be NSFW.
Tumblr |
https://www.tumblr.com/ |
- Days, Likes, Posts, Tag Searches, User Profiles |
+ Days, Likes, Posts, Search Results, Tag Searches, User Profiles |
OAuth |
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 4a32879f..8fc46f63 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -22,8 +22,8 @@ BASE_PATTERN = (
r"([\w-]+\.tumblr\.com)))"
)
-POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
+POST_TYPES = frozenset(("text", "quote", "link", "answer", "video",
+ "audio", "photo", "chat", "search"))
class TumblrExtractor(Extractor):
@@ -38,7 +38,6 @@ class TumblrExtractor(Extractor):
Extractor.__init__(self, match)
name = match.group(2)
-
if name:
self.blog = name + ".tumblr.com"
else:
@@ -367,7 +366,6 @@ class TumblrLikesExtractor(TumblrExtractor):
class TumblrSearchExtractor(TumblrExtractor):
"""Extractor for a Tumblr search"""
subcategory = "search"
- """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """
pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
example = "https://www.tumblr.com/search/QUERY"
@@ -381,7 +379,7 @@ class TumblrSearchExtractor(TumblrExtractor):
self.params = text.parse_query(parsed_url.query)
def search(self, query, params):
- """Retrieve published posts"""
+ """Retrieve search results"""
params["limit"] = 50
params["days"] = self.params.get("t") or 0
@@ -526,11 +524,9 @@ class TumblrAPI(oauth.OAuth1API):
raise exception.StopExtraction(data)
- def _pagination(self, full_endpoint, params, key="posts", cache=False):
- if not full_endpoint.endswith("?"):
- full_endpoint = full_endpoint + "?"
-
- endpoint = full_endpoint
+ def _pagination(self, endpoint, params, key="posts", cache=False):
+ if endpoint[-1] != "?":
+ endpoint += "?"
if self.api_key:
params["api_key"] = self.api_key
From 33778d35baa14c8c0edfbdd8305822f6e1a73f5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Thu, 7 Nov 2024 22:15:49 +0100
Subject: [PATCH 3/3] [tumblr] update search extractor
- simplify (drop 'is_timeline' flag, move search() into TumblrAPI)
- fix search pagination
- support custom search mode and post types
---
gallery_dl/extractor/tumblr.py | 100 ++++++++++++++-------------------
test/results/tumblr.py | 8 ++-
2 files changed, 50 insertions(+), 58 deletions(-)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 8fc46f63..5dcd3374 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta
-from urllib.parse import urlparse
import re
@@ -43,8 +42,6 @@ class TumblrExtractor(Extractor):
else:
self.blog = match.group(1) or match.group(3)
- self.is_timeline = False
-
def _init(self):
self.api = TumblrAPI(self)
self.types = self._setup_posttypes()
@@ -87,24 +84,20 @@ class TumblrExtractor(Extractor):
if post["type"] not in self.types:
continue
- if not blog:
- if self.is_timeline:
- blog = post.get("blog")
- self.blog = blog.get("name") + ".tumblr.com"
-
- for image in blog.get("avatar", []):
- if int(image.get("width")) == 512:
- avatar_url = image.get("url")
- break
-
- else:
+ if "blog" in post:
+ blog = post["blog"]
+ self.blog = blog["name"] + ".tumblr.com"
+ else:
+ if not blog:
blog = self.api.info(self.blog)
blog["uuid"] = self.blog
- if self.avatar:
- url = avatar_url or self.api.avatar(self.blog)
- yield Message.Directory, {"blog": blog}
- yield self._prepare_avatar(url, post.copy(), blog)
+ if self.avatar:
+ url = self.api.avatar(self.blog)
+ yield Message.Directory, {"blog": blog}
+ yield self._prepare_avatar(url, post.copy(), blog)
+
+ post["blog"] = blog
reblog = "reblogged_from_id" in post
if reblog and self._skip_reblog(post):
@@ -113,7 +106,6 @@ class TumblrExtractor(Extractor):
if "trail" in post:
del post["trail"]
- post["blog"] = blog
post["date"] = text.parse_timestamp(post["timestamp"])
posts = []
@@ -366,32 +358,14 @@ class TumblrLikesExtractor(TumblrExtractor):
class TumblrSearchExtractor(TumblrExtractor):
"""Extractor for a Tumblr search"""
subcategory = "search"
- pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
+ pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+ r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
example = "https://www.tumblr.com/search/QUERY"
- def __init__(self, match):
- TumblrExtractor.__init__(self, match)
-
- self.is_timeline = True
- self.query = text.unquote(match.group(4))
-
- parsed_url = urlparse(self.url)
- self.params = text.parse_query(parsed_url.query)
-
- def search(self, query, params):
- """Retrieve search results"""
-
- params["limit"] = 50
- params["days"] = self.params.get("t") or 0
- params["query"] = query
- params["mode"] = "top"
- params["reblog_info"] = "true" if self.reblogs else "false"
-
- endpoint = "/v2/timeline/search"
- return self.api._pagination(endpoint, params, cache=True)
-
def posts(self):
- return self.search(self.query, {})
+ _, _, _, search, mode, post_type, query = self.groups
+ params = text.parse_query(query)
+ return self.api.search(text.unquote(search), params, mode, post_type)
class TumblrAPI(oauth.OAuth1API):
@@ -440,7 +414,7 @@ class TumblrAPI(oauth.OAuth1API):
self.log.warning("'offset' and 'date-max' cannot be used together")
endpoint = "/v2/blog/{}/posts".format(blog)
- return self._pagination(endpoint, params, cache=True)
+ return self._pagination(endpoint, params, blog=blog, cache=True)
def likes(self, blog):
"""Retrieve liked posts"""
@@ -456,6 +430,20 @@ class TumblrAPI(oauth.OAuth1API):
yield from posts
params["before"] = posts[-1]["liked_timestamp"]
+ def search(self, query, params, mode="top", post_type=None):
+ """Retrieve search results"""
+ endpoint = "/v2/timeline/search"
+
+ params["limit"] = "50"
+ params["days"] = params.pop("t", None)
+ params["query"] = query
+ params["mode"] = mode
+ params["reblog_info"] = "true" if self.extractor.reblogs else "false"
+ if post_type:
+ params["post_type_filter"] = post_type
+
+ return self._pagination(endpoint, params)
+
def _call(self, endpoint, params, **kwargs):
url = self.ROOT + endpoint
kwargs["params"] = params
@@ -524,29 +512,27 @@ class TumblrAPI(oauth.OAuth1API):
raise exception.StopExtraction(data)
- def _pagination(self, endpoint, params, key="posts", cache=False):
- if endpoint[-1] != "?":
- endpoint += "?"
-
+ def _pagination(self, endpoint, params,
+ blog=None, key="posts", cache=False):
if self.api_key:
params["api_key"] = self.api_key
strategy = self.extractor.config("pagination")
+ if not strategy and "offset" not in params:
+ strategy = "api"
while True:
data = self._call(endpoint, params)
- if "/timeline/" in endpoint:
- key = "elements"
- posts = data.get("timeline", {}).get(key, [])
- else:
- posts = data[key]
+ if "timeline" in data:
+ data = data["timeline"]
+ posts = data["elements"]
- if cache:
- for post in posts:
- p_blog = post.get("blog", {})
- self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
- cache = False
+ else:
+ if cache:
+ self.BLOG_CACHE[blog] = data["blog"]
+ cache = False
+ posts = data[key]
yield from posts
diff --git a/test/results/tumblr.py b/test/results/tumblr.py
index 74de2f64..50b67676 100644
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -361,7 +361,13 @@ __tests__ = (
},
{
- "#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
+ "#url" : "https://www.tumblr.com/search/nathan fielder",
+ "#category": ("", "tumblr", "search"),
+ "#class" : tumblr.TumblrSearchExtractor,
+},
+
+{
+ "#url" : "https://www.tumblr.com/search/nathan fielder/recent/quote?src=typed_query",
"#category": ("", "tumblr", "search"),
"#class" : tumblr.TumblrSearchExtractor,
},