From 10365394d7ad2577ca1ec76de3fbeedd31aaeeb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 17 Aug 2018 20:04:11 +0200 Subject: [PATCH] [twitter] add support for user-timelines (closes #96) also adds a 'retweets' option to filter retweeted content --- CHANGELOG.md | 2 + docs/configuration.rst | 9 +++ docs/gallery-dl.conf | 4 + docs/supportedsites.rst | 2 +- gallery_dl/extractor/twitter.py | 133 ++++++++++++++++++++++++-------- gallery_dl/version.py | 2 +- 6 files changed, 118 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf7232e2..144a09e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Changelog +## Unreleased + ## 1.5.1 - 2018-08-17 - Added support for: - `piczel` - https://piczel.tv/ diff --git a/docs/configuration.rst b/docs/configuration.rst index 4bfe9201..88289cc0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -642,6 +642,15 @@ Description A (comma-separated) list of post types to extract images, etc. from. =========== ===== +extractor.twitter.retweets +-------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Extract images from retweets. +=========== ===== + + extractor.[booru].tags ---------------------- =========== ===== diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 8674e171..66cc0c45 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -107,6 +107,10 @@ "posts": "photo", "reblogs": true }, + "twitter": + { + "retweets": true + }, "yandere": { "tags": false diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 1edbdeb0..d87949a6 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -76,7 +76,7 @@ SmugMug https://www.smugmug.com/ |Albums, individ-5| Subapics https://subapics.com/ Chapters, Manga The /b/ Archive https://thebarchive.com/ Threads Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth) -Twitter https://twitter.com/ Tweets +Twitter https://twitter.com/ Images from Users, Tweets Warosu https://warosu.org/ Threads World Three http://www.slide.world-three.org/ Chapters, Manga XVideos https://www.xvideos.com/ Images from Users, Galleries diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 36779000..01f205f2 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -12,52 +12,121 @@ from .common import Extractor, Message from .. import text -class TwitterTweetExtractor(Extractor): - """Extractor for images from tweets on twitter.com""" +class TwitterExtractor(Extractor): + """Base class for twitter extractors""" category = "twitter" - subcategory = "tweet" directory_fmt = ["{category}", "{user}"] filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{num}" - pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/" - r"(([^/]+)/status/(\d+))"] + root = "https://twitter.com" + + def __init__(self): + Extractor.__init__(self) + self.user = None + self.retweets = self.config("retweets", True) + + def items(self): + yield Message.Version, 1 + yield Message.Directory, self.metadata() + + for tweet in self.tweets(): + data = self._data_from_tweet(tweet) + if not self.retweets and data["retweet_id"]: + continue + + images = text.extract_iter( + tweet, 'data-image-url="', '"') + for data["num"], url in enumerate(images, 1): + text.nameext_from_url(url, data) + yield Message.Url, url + ":orig", data + + def metadata(self): + """Return general metadata""" + + def tweets(self): + """Yield HTML content of all relevant tweets""" + + @staticmethod + def _data_from_tweet(tweet): + data = text.extract_all(tweet, ( + ("tweet_id" , 'data-tweet-id="' , '"'), + ("retweet_id", 'data-retweet-id="' , '"'), + ("retweeter" , 'data-retweeter="' , '"'), + ("user" , 'data-screen-name="', '"'), + ("username" , 'data-name="' , '"'), + ("userid" , 'data-user-id="' , '"'), + ))[0] + for key in ("tweet_id", "retweet_id", "userid"): + data[key] = text.parse_int(data[key]) + data["retweeter"] = data["retweeter"] or "" + return data + + +class TwitterUserExtractor(TwitterExtractor): + """Extractor for all tweeted images of a user""" + subcategory = "user" + archive_fmt = "{tweet_id}_{num}" + pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/([^/?&#]+)/?$"] + test = [("https://twitter.com/PicturesEarth", { + "range": (1, 40), + "url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771", + "keyword": "611066e488c233e0b1bd2ab45d5f7fca1335f691", + })] + + def __init__(self, match): + TwitterExtractor.__init__(self) + self.user = match.group(1) + + def metadata(self): + return {"user": self.user} + + def tweets(self): + url = "{}/i/profiles/show/{}/timeline/tweets".format( + self.root, self.user) + params = {} + tweet = None + + while True: + data = self.request(url, params=params).json() + html = data["items_html"] + + for tweet in text.extract_iter( + html, '
', '