From d2ef9a590f0cfc4591da68c263372f9e91c98184 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Tue, 3 Sep 2024 08:18:58 +0200 Subject: [PATCH 1/3] [tumblr] add search extractor --- gallery_dl/extractor/tumblr.py | 78 ++++++++++++++++++++++++++++++---- test/results/tumblr.py | 11 +++++ 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 73455d2f..4a32879f 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, util, oauth, exception from datetime import datetime, date, timedelta +from urllib.parse import urlparse import re @@ -22,7 +23,7 @@ BASE_PATTERN = ( ) POST_TYPES = frozenset(( - "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) + "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search")) class TumblrExtractor(Extractor): @@ -37,11 +38,14 @@ class TumblrExtractor(Extractor): Extractor.__init__(self, match) name = match.group(2) + if name: self.blog = name + ".tumblr.com" else: self.blog = match.group(1) or match.group(3) + self.is_timeline = False + def _init(self): self.api = TumblrAPI(self) self.types = self._setup_posttypes() @@ -83,12 +87,23 @@ class TumblrExtractor(Extractor): return if post["type"] not in self.types: continue + if not blog: - blog = self.api.info(self.blog) - blog["uuid"] = self.blog + if self.is_timeline: + blog = post.get("blog") + self.blog = blog.get("name") + ".tumblr.com" + + for image in blog.get("avatar", []): + if int(image.get("width")) == 512: + avatar_url = image.get("url") + break + + else: + blog = self.api.info(self.blog) + blog["uuid"] = self.blog if self.avatar: - url = self.api.avatar(self.blog) + url = avatar_url or self.api.avatar(self.blog) yield Message.Directory, {"blog": blog} yield self._prepare_avatar(url, post.copy(), blog) @@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor): return self.api.likes(self.blog) +class TumblrSearchExtractor(TumblrExtractor): + """Extractor for a Tumblr search""" + subcategory = "search" + """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """ + pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$' + example = "https://www.tumblr.com/search/QUERY" + + def __init__(self, match): + TumblrExtractor.__init__(self, match) + + self.is_timeline = True + self.query = text.unquote(match.group(4)) + + parsed_url = urlparse(self.url) + self.params = text.parse_query(parsed_url.query) + + def search(self, query, params): + """Retrieve published posts""" + + params["limit"] = 50 + params["days"] = self.params.get("t") or 0 + params["query"] = query + params["mode"] = "top" + params["reblog_info"] = "true" if self.reblogs else "false" + + endpoint = "/v2/timeline/search" + return self.api._pagination(endpoint, params, cache=True) + + def posts(self): + return self.search(self.query, {}) + + class TumblrAPI(oauth.OAuth1API): """Interface for the Tumblr API v2 @@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API): if self.before and params["offset"]: self.log.warning("'offset' and 'date-max' cannot be used together") - return self._pagination(blog, "/posts", params, cache=True) + endpoint = "/v2/blog/{}/posts".format(blog) + return self._pagination(endpoint, params, cache=True) def likes(self, blog): """Retrieve liked posts""" @@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API): raise exception.StopExtraction(data) - def _pagination(self, blog, endpoint, params, key="posts", cache=False): - endpoint = "/v2/blog/{}{}".format(blog, endpoint) + def _pagination(self, full_endpoint, params, key="posts", cache=False): + if not full_endpoint.endswith("?"): + full_endpoint = full_endpoint + "?" + + endpoint = full_endpoint + if self.api_key: params["api_key"] = self.api_key strategy = self.extractor.config("pagination") + while True: data = self._call(endpoint, params) + if "/timeline/" in endpoint: + key = "elements" + posts = data.get("timeline", {}).get(key, []) + else: + posts = data[key] + if cache: - self.BLOG_CACHE[blog] = data["blog"] + for post in posts: + p_blog = post.get("blog", {}) + self.BLOG_CACHE[p_blog.get("name", "")] = p_blog cache = False - posts = data[key] yield from posts if strategy == "api": diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 4d0d6abc..74de2f64 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -360,4 +360,15 @@ __tests__ = ( "#class" : tumblr.TumblrLikesExtractor, }, +{ + "#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query", + "#category": ("", "tumblr", "search"), + "#class" : tumblr.TumblrSearchExtractor, +}, + +{ + "#url" : "https://www.tumblr.com/search/nathan%20fielder?t=90", + "#category": ("", "tumblr", "search"), + "#class" : tumblr.TumblrSearchExtractor, +}, ) From 0f94fa9015615baae379bc035d905844a2043432 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:44:27 +0100 Subject: [PATCH 2/3] [tumblr] search extractor minimal styling changes --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tumblr.py | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2790476f..4f2c0a2e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -886,7 +886,7 @@ Consider all listed sites to potentially be NSFW. Tumblr https://www.tumblr.com/ - Days, Likes, Posts, Tag Searches, User Profiles + Days, Likes, Posts, Search Results, Tag Searches, User Profiles OAuth diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 4a32879f..8fc46f63 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -22,8 +22,8 @@ BASE_PATTERN = ( r"([\w-]+\.tumblr\.com)))" ) -POST_TYPES = frozenset(( - "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search")) +POST_TYPES = frozenset(("text", "quote", "link", "answer", "video", + "audio", "photo", "chat", "search")) class TumblrExtractor(Extractor): @@ -38,7 +38,6 @@ class TumblrExtractor(Extractor): Extractor.__init__(self, match) name = match.group(2) - if name: self.blog = name + ".tumblr.com" else: @@ -367,7 +366,6 @@ class TumblrLikesExtractor(TumblrExtractor): class TumblrSearchExtractor(TumblrExtractor): """Extractor for a Tumblr search""" subcategory = "search" - """ https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """ pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$' example = "https://www.tumblr.com/search/QUERY" @@ -381,7 +379,7 @@ class TumblrSearchExtractor(TumblrExtractor): self.params = text.parse_query(parsed_url.query) def search(self, query, params): - """Retrieve published posts""" + """Retrieve search results""" params["limit"] = 50 params["days"] = self.params.get("t") or 0 @@ -526,11 +524,9 @@ class TumblrAPI(oauth.OAuth1API): raise exception.StopExtraction(data) - def _pagination(self, full_endpoint, params, key="posts", cache=False): - if not full_endpoint.endswith("?"): - full_endpoint = full_endpoint + "?" - - endpoint = full_endpoint + def _pagination(self, endpoint, params, key="posts", cache=False): + if endpoint[-1] != "?": + endpoint += "?" if self.api_key: params["api_key"] = self.api_key From 33778d35baa14c8c0edfbdd8305822f6e1a73f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Nov 2024 22:15:49 +0100 Subject: [PATCH 3/3] [tumblr] update - simplify - fix search pagination - support custom search mode and post types --- gallery_dl/extractor/tumblr.py | 100 ++++++++++++++------------------- test/results/tumblr.py | 8 ++- 2 files changed, 50 insertions(+), 58 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 8fc46f63..5dcd3374 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. import text, util, oauth, exception from datetime import datetime, date, timedelta -from urllib.parse import urlparse import re @@ -43,8 +42,6 @@ class TumblrExtractor(Extractor): else: self.blog = match.group(1) or match.group(3) - self.is_timeline = False - def _init(self): self.api = TumblrAPI(self) self.types = self._setup_posttypes() @@ -87,24 +84,20 @@ class TumblrExtractor(Extractor): if post["type"] not in self.types: continue - if not blog: - if self.is_timeline: - blog = post.get("blog") - self.blog = blog.get("name") + ".tumblr.com" - - for image in blog.get("avatar", []): - if int(image.get("width")) == 512: - avatar_url = image.get("url") - break - - else: + if "blog" in post: + blog = post["blog"] + self.blog = blog["name"] + ".tumblr.com" + else: + if not blog: blog = self.api.info(self.blog) blog["uuid"] = self.blog - if self.avatar: - url = avatar_url or self.api.avatar(self.blog) - yield Message.Directory, {"blog": blog} - yield self._prepare_avatar(url, post.copy(), blog) + if self.avatar: + url = self.api.avatar(self.blog) + yield Message.Directory, {"blog": blog} + yield self._prepare_avatar(url, post.copy(), blog) + + post["blog"] = blog reblog = "reblogged_from_id" in post if reblog and self._skip_reblog(post): @@ -113,7 +106,6 @@ class TumblrExtractor(Extractor): if "trail" in post: del post["trail"] - post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) posts = [] @@ -366,32 +358,14 @@ class TumblrLikesExtractor(TumblrExtractor): class TumblrSearchExtractor(TumblrExtractor): """Extractor for a Tumblr search""" subcategory = "search" - pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$' + pattern = (BASE_PATTERN + r"/search/([^/?#]+)" + r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?") example = "https://www.tumblr.com/search/QUERY" - def __init__(self, match): - TumblrExtractor.__init__(self, match) - - self.is_timeline = True - self.query = text.unquote(match.group(4)) - - parsed_url = urlparse(self.url) - self.params = text.parse_query(parsed_url.query) - - def search(self, query, params): - """Retrieve search results""" - - params["limit"] = 50 - params["days"] = self.params.get("t") or 0 - params["query"] = query - params["mode"] = "top" - params["reblog_info"] = "true" if self.reblogs else "false" - - endpoint = "/v2/timeline/search" - return self.api._pagination(endpoint, params, cache=True) - def posts(self): - return self.search(self.query, {}) + _, _, _, search, mode, post_type, query = self.groups + params = text.parse_query(query) + return self.api.search(text.unquote(search), params, mode, post_type) class TumblrAPI(oauth.OAuth1API): @@ -440,7 +414,7 @@ class TumblrAPI(oauth.OAuth1API): self.log.warning("'offset' and 'date-max' cannot be used together") endpoint = "/v2/blog/{}/posts".format(blog) - return self._pagination(endpoint, params, cache=True) + return self._pagination(endpoint, params, blog=blog, cache=True) def likes(self, blog): """Retrieve liked posts""" @@ -456,6 +430,20 @@ class TumblrAPI(oauth.OAuth1API): yield from posts params["before"] = posts[-1]["liked_timestamp"] + def search(self, query, params, mode="top", post_type=None): + """Retrieve search results""" + endpoint = "/v2/timeline/search" + + params["limit"] = "50" + params["days"] = params.pop("t", None) + params["query"] = query + params["mode"] = mode + params["reblog_info"] = "true" if self.extractor.reblogs else "false" + if post_type: + params["post_type_filter"] = post_type + + return self._pagination(endpoint, params) + def _call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint kwargs["params"] = params @@ -524,29 +512,27 @@ class TumblrAPI(oauth.OAuth1API): raise exception.StopExtraction(data) - def _pagination(self, endpoint, params, key="posts", cache=False): - if endpoint[-1] != "?": - endpoint += "?" - + def _pagination(self, endpoint, params, + blog=None, key="posts", cache=False): if self.api_key: params["api_key"] = self.api_key strategy = self.extractor.config("pagination") + if not strategy and "offset" not in params: + strategy = "api" while True: data = self._call(endpoint, params) - if "/timeline/" in endpoint: - key = "elements" - posts = data.get("timeline", {}).get(key, []) - else: - posts = data[key] + if "timeline" in data: + data = data["timeline"] + posts = data["elements"] - if cache: - for post in posts: - p_blog = post.get("blog", {}) - self.BLOG_CACHE[p_blog.get("name", "")] = p_blog - cache = False + else: + if cache: + self.BLOG_CACHE[blog] = data["blog"] + cache = False + posts = data[key] yield from posts diff --git a/test/results/tumblr.py b/test/results/tumblr.py index 74de2f64..50b67676 100644 --- a/test/results/tumblr.py +++ b/test/results/tumblr.py @@ -361,7 +361,13 @@ __tests__ = ( }, { - "#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query", + "#url" : "https://www.tumblr.com/search/nathan fielder", + "#category": ("", "tumblr", "search"), + "#class" : tumblr.TumblrSearchExtractor, +}, + +{ + "#url" : "https://www.tumblr.com/search/nathan fielder/recent/quote?src=typed_query", "#category": ("", "tumblr", "search"), "#class" : tumblr.TumblrSearchExtractor, },