gallery-dl/gallery_dl/extractor/twitter.py

# -*- coding: utf-8 -*-

# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import hashlib
import time


class TwitterExtractor(Extractor):
    """Base class for twitter extractors

    Subclasses provide the Tweets to process by overriding tweets() and
    may contribute additional metadata by overriding metadata().
    """
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    root = "https://twitter.com"
    # size suffixes appended to an image URL to build its list of
    # candidate URLs (see items())
    sizes = (":orig", ":large", ":medium", ":small")

    def __init__(self, match):
        """Store the first match group as 'user' and read all options"""
        Extractor.__init__(self, match)
        self.user = match.group(1)
        self.retweets = self.config("retweets", True)
        self.replies = self.config("replies", True)
        self.twitpic = self.config("twitpic", False)
        self.quoted = self.config("quoted", True)
        self.videos = self.config("videos", True)
        # maps user ID strings to already transformed user dicts
        self._user_cache = {}

    def items(self):
        """Yield messages for all media of all selected Tweets

        Emits one Version message, then for every Tweet that passes the
        'retweets', 'replies' and 'quoted' filters and has media
        a Directory message followed by one Url or Urllist message
        per media entry.
        """
        self.login()
        metadata = self.metadata()
        yield Message.Version, 1

        for tweet in self.tweets():

            if not self.retweets and "retweeted_status_id_str" in tweet:
                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
                continue
            if not self.replies and "in_reply_to_user_id_str" in tweet:
                self.log.debug("Skipping %s (reply)", tweet["id_str"])
                continue
            if not self.quoted and "quoted" in tweet:
                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
                continue

            if self.twitpic:
                # may add an 'extended_entities' entry to 'tweet'
                self._extract_twitpic(tweet)
            if "extended_entities" not in tweet:
                # Tweet without any media
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)

            yield Message.Directory, tdata
            # 'num' is the 1-based index of a media entry inside its Tweet
            for tdata["num"], media in enumerate(
                    tweet["extended_entities"]["media"], 1):

                tdata["width"] = media["original_info"].get("width", 0)
                tdata["height"] = media["original_info"].get("height", 0)

                if "video_info" in media:

                    if self.videos == "ytdl":
                        # emit a 'ytdl:'-prefixed Tweet URL
                        # instead of a direct video URL
                        url = "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"])
                        tdata["extension"] = None
                        yield Message.Url, url, tdata

                    elif self.videos:
                        # select the variant with the highest bitrate;
                        # variants without a 'bitrate' value count as 0
                        video_info = media["video_info"]
                        variant = max(
                            video_info["variants"],
                            key=lambda v: v.get("bitrate", 0),
                        )
                        tdata["duration"] = video_info.get(
                            "duration_millis", 0) / 1000
                        tdata["bitrate"] = variant.get("bitrate", 0)

                        url = variant["url"]
                        text.nameext_from_url(url, tdata)
                        yield Message.Url, url, tdata

                elif "media_url_https" in media:
                    # image: one candidate URL per entry in 'sizes'
                    url = media["media_url_https"]
                    urls = [url + size for size in self.sizes]
                    text.nameext_from_url(url, tdata)
                    yield Message.Urllist, urls, tdata

                else:
                    # entries without 'media_url_https',
                    # e.g. the ones added by _extract_twitpic()
                    url = media["media_url"]
                    text.nameext_from_url(url, tdata)
                    yield Message.Url, url, tdata

    def _extract_twitpic(self, tweet):
        """Add images from TwitPic links in 'tweet' to its media list

        Requests every twitpic.com URL found in the Tweet's URL entities
        and takes the image URL from the page's 'twitter:image' value.
        Pages that cannot be fetched or do not yield an image URL are
        skipped. 'tweet' is modified in place.
        """
        twitpics = []
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            if "//twitpic.com/" in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if not url:
                    # no image URL found on this page; adding an empty
                    # 'media_url' would break URL handling in items()
                    continue
                twitpics.append({
                    "original_info": {},
                    "media_url"    : url,
                })
        if twitpics:
            if "extended_entities" in tweet:
                tweet["extended_entities"]["media"].extend(twitpics)
            else:
                tweet["extended_entities"] = {"media": twitpics}

    def _transform_tweet(self, tweet):
        """Build a new metadata dict from a raw Tweet object

        'hashtags', 'mentions' and 'reply_to' are only included when the
        Tweet has corresponding data. 'author' is the transformed
        'author' user if the Tweet has one and the same dict as 'user'
        otherwise.
        """
        entities = tweet["entities"]
        tdata = {
            "tweet_id"      : text.parse_int(tweet["id_str"]),
            "retweet_id"    : text.parse_int(
                tweet.get("retweeted_status_id_str")),
            "quote_id"      : text.parse_int(
                tweet.get("quoted_status_id_str")),
            "reply_id"      : text.parse_int(
                tweet.get("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            "user"          : self._transform_user(tweet["user"]),
            "lang"          : tweet["lang"],
            "content"       : tweet["full_text"],
            "favorite_count": tweet["favorite_count"],
            "quote_count"   : tweet["quote_count"],
            "reply_count"   : tweet["reply_count"],
            "retweet_count" : tweet["retweet_count"],
        }

        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [t["text"] for t in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [{
                "id": text.parse_int(u["id_str"]),
                "name": u["screen_name"],
                "nick": u["name"],
            } for u in mentions]

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]

        if "author" in tweet:
            tdata["author"] = self._transform_user(tweet["author"])
        else:
            tdata["author"] = tdata["user"]

        return tdata

    def _transform_user(self, user):
        """Return the metadata dict for a raw user object

        Results are stored in 'self._user_cache' by user ID, so the same
        dict object is returned for every occurrence of a user.
        """
        uid = user["id_str"]
        cache = self._user_cache

        if uid not in cache:
            cache[uid] = {
                "id"              : text.parse_int(uid),
                "name"            : user["screen_name"],
                "nick"            : user["name"],
                "description"     : user["description"],
                "location"        : user["location"],
                "date"            : text.parse_datetime(
                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
                "verified"        : user.get("verified", False),
                "profile_banner"  : user.get("profile_banner_url", ""),
                # drop the '_normal' suffix from the profile image filename
                "profile_image"   : user.get(
                    "profile_image_url_https", "").replace("_normal.", "."),
                "favourites_count": user["favourites_count"],
                "followers_count" : user["followers_count"],
                "friends_count"   : user["friends_count"],
                "listed_count"    : user["listed_count"],
                "media_count"     : user["media_count"],
                "statuses_count"  : user["statuses_count"],
            }
        return cache[uid]

    def metadata(self):
        """Return general metadata"""
        return {}

    def tweets(self):
        """Yield all relevant tweet objects"""

    def login(self):
        """Log in and update session cookies if a username is configured"""
        username, password = self._get_auth_info()
        if username:
            self._update_cookies(self._login_impl(username, password))

    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Log in via mobile.twitter.com and return the session cookies

        Returns a dict of name-value pairs for all session cookies of
        'self.cookiedomain'. Raises AuthenticationError if the login
        response points to an error page or no 'auth_token' cookie
        was set.
        """
        self.log.info("Logging in as %s", username)

        # request the login form to get an 'authenticity_token'
        url = "https://mobile.twitter.com/i/nojs_router"
        params = {"path": "/login"}
        headers = {"Referer": self.root + "/", "Origin": self.root}
        page = self.request(
            url, method="POST", params=params, headers=headers, data={}).text

        pos = page.index('name="authenticity_token"')
        token = text.extract(page, 'value="', '"', pos)[0]

        # submit the login form
        url = "https://mobile.twitter.com/sessions"
        data = {
            "authenticity_token"        : token,
            "session[username_or_email]": username,
            "session[password]"         : password,
            "remember_me"               : "1",
            "wfa"                       : "1",
            "commit"                    : "+Log+in+",
            "ui_metrics"                : "",
        }
        response = self.request(url, method="POST", data=data)
        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
            if cookie.domain == self.cookiedomain
        }

        if "/error" in response.url or "auth_token" not in cookies:
            raise exception.AuthenticationError()
        return cookies


class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/(?!search)([^/?&#]+)/?(?:$|[?#])")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
    )

    def tweets(self):
        """Return the Tweets from the profile timeline of 'self.user'"""
        api = TwitterAPI(self)
        return api.timeline_profile(self.user)


class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/(?!search)([^/?&#]+)/media(?!\w)")
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
    )

    def tweets(self):
        """Return the Tweets from the media timeline of 'self.user'"""
        api = TwitterAPI(self)
        return api.timeline_media(self.user)


class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/(?!search)([^/?&#]+)/likes(?!\w)")
    test = ("https://twitter.com/supernaturepics/likes",)

    def tweets(self):
        """Return the Tweets from the favorites timeline of 'self.user'"""
        api = TwitterAPI(self)
        return api.timeline_favorites(self.user)


class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # the empty group at the end provides the 'group(1)'
    # that TwitterExtractor.__init__() reads
    pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        """Return the Tweets from the bookmark timeline"""
        api = TwitterAPI(self)
        return api.timeline_bookmark()


class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    directory_fmt = ("{category}", "Search", "{search}")
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
    })

    def metadata(self):
        """Return the unquoted search query as 'search' metadata"""
        # for this extractor 'self.user' holds the value of the 'q' parameter
        query = text.unquote(self.user)
        return {"search": query}

    def tweets(self):
        """Return the Tweets found by searching for the unquoted query"""
        query = text.unquote(self.user)
        api = TwitterAPI(self)
        return api.search(query)


class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/([^/?&#]+|i/web)/status/(\d+)")
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg",
            "count": 8,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", False),),
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True),),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
    )

    def __init__(self, match):
        """Additionally store the Tweet ID from the second match group"""
        TwitterExtractor.__init__(self, match)
        # numeric status ID, kept as a string
        self.tweet_id = match.group(2)

    def tweets(self):
        """Return the Tweet with ID 'self.tweet_id' (and Tweets it quotes)"""
        api = TwitterAPI(self)
        return api.tweet(self.tweet_id)


class TwitterAPI():
    """Interface for the API endpoints on https://api.twitter.com/

    All requests are sent through the given extractor, whose session
    cookies are updated with a CSRF token and, when the session has no
    'auth_token' cookie, a guest token.
    """

    def __init__(self, extractor):
        """Prepare request headers, default parameters and cookies"""
        self.extractor = extractor
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Origin": "https://twitter.com",
            "Referer": "https://twitter.com/",
        }
        # default query parameters for timeline requests;
        # _pagination() and search() work on copies of this dict
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_composer_source": "true",
            "include_ext_alt_text": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            #  "count": "20",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel,cameraMoment",
            "include_quote_count": "true",
        }

        cookies = self.extractor.session.cookies

        # CSRF
        # the same generated value is sent as 'x-csrf-token' header
        # and as 'ct0' cookie
        csrf = hashlib.md5(str(time.time()).encode()).hexdigest()
        self.headers["x-csrf-token"] = csrf
        cookies.set("ct0", csrf, domain=".twitter.com")

        if cookies.get("auth_token", domain=".twitter.com"):
            # logged-in session
            self.headers["x-twitter-auth-type"] = "OAuth2Session"
        else:
            # guest token
            guest_token = self._guest_token()
            self.headers["x-guest-token"] = guest_token
            cookies.set("gt", guest_token, domain=".twitter.com")

    def tweet(self, tweet_id):
        """Return a list with the given Tweet and the Tweets it quotes

        Goes through the conversation timeline of 'tweet_id' and collects
        the Tweet with that ID. Whenever a collected Tweet quotes another
        one, the quoted Tweet's ID becomes the next ID to look for.
        """
        endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
        tweets = []
        for tweet in self._pagination(endpoint):
            if tweet["id_str"] == tweet_id:
                tweets.append(tweet)
                if "quoted_status_id_str" in tweet:
                    tweet_id = tweet["quoted_status_id_str"]
                else:
                    break
        return tweets

    def timeline_profile(self, screen_name):
        """Yield Tweets from the profile timeline of 'screen_name'"""
        user = self.user_by_screen_name(screen_name)
        endpoint = "2/timeline/profile/{}.json".format(user["rest_id"])
        return self._pagination(endpoint)

    def timeline_media(self, screen_name):
        """Yield Tweets from the media timeline of 'screen_name'"""
        user = self.user_by_screen_name(screen_name)
        endpoint = "2/timeline/media/{}.json".format(user["rest_id"])
        return self._pagination(endpoint)

    def timeline_favorites(self, screen_name):
        """Yield Tweets from the favorites timeline of 'screen_name'"""
        user = self.user_by_screen_name(screen_name)
        endpoint = "2/timeline/favorites/{}.json".format(user["rest_id"])
        return self._pagination(endpoint)

    def timeline_bookmark(self):
        """Yield Tweets from the bookmark timeline"""
        endpoint = "2/timeline/bookmark.json"
        return self._pagination(endpoint)

    def search(self, query):
        """Yield Tweets from the search results for 'query'"""
        endpoint = "2/search/adaptive.json"
        params = self.params.copy()
        params["q"] = query
        params["tweet_search_mode"] = "live"
        params["query_source"] = "typed_query"
        params["pc"] = "1"
        params["spelling_corrections"] = "1"
        # search results use different entry ID prefixes than timelines
        return self._pagination(
            endpoint, params, "sq-I-t-", "sq-cursor-bottom")

    def user_by_screen_name(self, screen_name):
        """Return the user object for 'screen_name'"""
        endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName"
        params = {
            "variables": '{"screen_name":"' + screen_name + '"'
                         ',"withHighlightedLabel":true}'
        }
        return self._call(endpoint, params)["data"]["user"]

    @cache(maxage=3600)
    def _guest_token(self):
        """Request and return a new guest token"""
        endpoint = "1.1/guest/activate.json"
        return self._call(endpoint, None, "POST")["guest_token"]

    def _call(self, endpoint, params, method="GET"):
        """Send an API request and return its decoded JSON response

        After a 429 response, waits until the time given by the
        'x-rate-limit-reset' header and repeats the same request.
        Any other response with a status code >= 400 raises
        StopExtraction with the error messages from the response.
        """
        url = "https://api.twitter.com/" + endpoint
        response = self.extractor.request(
            url, method=method, params=params, headers=self.headers,
            fatal=None)
        if response.status_code < 400:
            return response.json()
        if response.status_code == 429:
            self.extractor.wait(until=response.headers["x-rate-limit-reset"])
            # repeat with the original HTTP method; falling back to the
            # default "GET" would turn a rate-limited POST into a GET
            return self._call(endpoint, params, method)

        try:
            msg = ", ".join(
                '"' + error["message"] + '"'
                for error in response.json()["errors"]
            )
        except Exception:
            # no JSON body or unexpected structure: report the raw text
            msg = response.text
        raise exception.StopExtraction(
            "%s %s (%s)", response.status_code, response.reason, msg)

    def _pagination(self, endpoint, params=None,
                    entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
        """Yield all Tweets from a cursor-paginated timeline endpoint

        'params' are the query parameters to send (default: a copy of
        'self.params'); its 'cursor' value is updated for every new page.
        'entry_tweet' and 'entry_cursor' are the entry ID prefixes that
        identify Tweet and cursor entries.

        Every yielded Tweet gets a 'user' value; retweets additionally
        get the original Tweet's user as 'author' when that Tweet is
        part of the response. An available quoted Tweet is yielded
        right after the Tweet quoting it, marked with 'quoted': True.
        """
        if params is None:
            params = self.params.copy()

        while True:
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            for entry in instr[0]["addEntries"]["entries"]:

                if entry["entryId"].startswith(entry_tweet):
                    try:
                        tweet = tweets[
                            entry["content"]["item"]["content"]["tweet"]["id"]]
                    except KeyError:
                        self.extractor.log.debug(
                            "Skipping %s (deleted)",
                            entry["entryId"][len(entry_tweet):])
                        continue
                    tweet["user"] = users[tweet["user_id_str"]]

                    if "retweeted_status_id_str" in tweet:
                        retweet = tweets.get(tweet["retweeted_status_id_str"])
                        if retweet:
                            tweet["author"] = users[retweet["user_id_str"]]
                    yield tweet

                    if "quoted_status_id_str" in tweet:
                        quoted = tweets.get(tweet["quoted_status_id_str"])
                        if quoted:
                            # 'user' stays the user of the quoting Tweet;
                            # the quoted Tweet's own user becomes 'author'
                            quoted["author"] = users[quoted["user_id_str"]]
                            quoted["user"] = tweet["user"]
                            quoted["quoted"] = True
                            yield quoted

                elif entry["entryId"].startswith(entry_cursor):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse"):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

            if "replaceEntry" in instr[-1]:
                # a cursor from a 'replaceEntry' instruction overrides
                # any cursor found in the entries above
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            # stop without a next cursor, or when this page had no Tweet
            # entries and its cursor did not ask to continue regardless
            if not cursor or not tweet:
                return
            params["cursor"] = cursor
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								# -*- coding: utf-8 -*-
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											2020-02-02 17:19:14 +01:00
+								# Copyright 2016-2020 Mike Fährmann
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								#
 								# This program is free software; you can redistribute it and/or modify
 								# it under the terms of the GNU General Public License version 2 as
 								# published by the Free Software Foundation.
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											2020-02-02 17:19:14 +01:00
+								"""Extractors for https://twitter.com/"""
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
 								from .common import Extractor, Message
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								from .. import text, exception
-												[twitter] don't cache results of 'user_by_screen_name()'

A 'keyarg=1' argument to the memcache decorator would have worked as
well, but keeping the user object in memory isn't useful for the vast
majority of use cases and only wastes space.

(closes #817)

											
										
										
											2020-06-10 20:58:42 +02:00
+								from ..cache import cache
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								import hashlib
 								import time
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
-												code adjustments according to pep8 nr2

											
										
										
											2017-02-01 00:53:19 +01:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								class TwitterExtractor(Extractor):
 								    """Base class for twitter extractors"""
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								    category = "twitter"
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								    directory_fmt = ("{category}", "{user[name]}")
 								    filename_fmt = "{tweet_id}_{num}.{extension}"
 								    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											2020-03-12 22:02:12 +01:00
+								    cookiedomain = ".twitter.com"
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    root = "https://twitter.com"
-												[twitter] add fallback URLs (#237)

											
										
										
											2019-04-30 15:43:43 +02:00
+								    sizes = (":orig", ":large", ":medium", ":small")
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								    def __init__(self, match):
-												propagate 'match' to base extractor constructor

											
										
										
											2019-02-11 13:31:10 +01:00
+								        Extractor.__init__(self, match)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								        self.user = match.group(1)
 								        self.retweets = self.config("retweets", True)
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								        self.replies = self.config("replies", True)
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											2020-01-18 21:26:46 +01:00
+								        self.twitpic = self.config("twitpic", False)
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								        self.quoted = self.config("quoted", True)
-												[twitter] change default value for 'videos' to 'true'

Every other 'videos' option defaulted to 'true', except Twitter.

											
										
										
											2020-02-14 01:03:42 +01:00
+								        self.videos = self.config("videos", True)
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								        self._user_cache = {}
-												[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as
"unsupported" URLs, so that these can then be downloaded with youtube-dl

There are a lot of improvements to be made to the current
implementation, but it works and does what it is supposed to, even if
inefficient as can be ...

											
										
										
											2018-09-30 18:41:39 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def items(self):
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        self.login()
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
+								        metadata = self.metadata()
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								        yield Message.Version, 1
 								        for tweet in self.tweets():
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											2020-07-11 00:41:50 +02:00
+								            if not self.retweets and "retweeted_status_id_str" in tweet:
 								                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
 								                continue
 								            if not self.replies and "in_reply_to_user_id_str" in tweet:
 								                self.log.debug("Skipping %s (reply)", tweet["id_str"])
 								                continue
 								            if not self.quoted and "quoted" in tweet:
 								                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                continue
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								            if self.twitpic:
 								                self._extract_twitpic(tweet)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            if "extended_entities" not in tweet:
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								                continue
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								            tdata = self._transform_tweet(tweet)
 								            tdata.update(metadata)
-												[twitter] small metadata cleanup

- add 'date' field
- remove 'entities' and 'extended_entities'
- don't include 'focus_fields' from 'original_info'

											
										
										
											2020-06-04 18:21:54 +02:00
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								            yield Message.Directory, tdata
 								            for tdata["num"], media in enumerate(
 								                    tweet["extended_entities"]["media"], 1):
-												[twitter] small metadata cleanup

- add 'date' field
- remove 'entities' and 'extended_entities'
- don't include 'focus_fields' from 'original_info'

											
										
										
											2020-06-04 18:21:54 +02:00
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                tdata["width"] = media["original_info"].get("width", 0)
 								                tdata["height"] = media["original_info"].get("height", 0)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] don't download video previews (#833)

when 'videos' is set to False

											
										
										
											2020-06-16 14:10:51 +02:00
+								                if "video_info" in media:
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								                    if self.videos == "ytdl":
 								                        url = "ytdl:{}/i/web/status/{}".format(
 								                            self.root, tweet["id_str"])
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                        tdata["extension"] = None
 								                        yield Message.Url, url, tdata
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] don't download video previews (#833)

when 'videos' is set to False

											
										
										
											2020-06-16 14:10:51 +02:00
+								                    elif self.videos:
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                        video_info = media["video_info"]
 								                        variant = max(
 								                            video_info["variants"],
 								                            key=lambda v: v.get("bitrate", 0),
 								                        )
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                        tdata["duration"] = video_info.get(
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                            "duration_millis", 0) / 1000
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                        tdata["bitrate"] = variant.get("bitrate", 0)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								                        url = variant["url"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                        text.nameext_from_url(url, tdata)
 								                        yield Message.Url, url, tdata
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								                elif "media_url_https" in media:
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                    url = media["media_url_https"]
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
+								                    urls = [url + size for size in self.sizes]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                    text.nameext_from_url(url, tdata)
 								                    yield Message.Urllist, urls, tdata
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											2020-01-18 21:26:46 +01:00
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								                else:
 								                    url = media["media_url"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                    text.nameext_from_url(url, tdata)
 								                    yield Message.Url, url, tdata
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
 								    def _extract_twitpic(self, tweet):
 								        twitpics = []
 								        for url in tweet["entities"].get("urls", ()):
 								            url = url["expanded_url"]
 								            if "//twitpic.com/" in url:
 								                response = self.request(url, fatal=False)
 								                if response.status_code >= 400:
 								                    continue
 								                url = text.extract(
 								                    response.text, 'name="twitter:image" value="', '"')[0]
 								                twitpics.append({
 								                    "original_info": {},
 								                    "media_url"    : url,
 								                })
 								        if twitpics:
 								            if "extended_entities" in tweet:
 								                tweet["extended_entities"]["media"].extend(twitpics)
 								            else:
 								                tweet["extended_entities"] = {"media": twitpics}
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								    def _transform_tweet(self, tweet):
 								        entities = tweet["entities"]
 								        tdata = {
 								            "tweet_id"      : text.parse_int(tweet["id_str"]),
 								            "retweet_id"    : text.parse_int(
 								                tweet.get("retweeted_status_id_str")),
 								            "quote_id"      : text.parse_int(
 								                tweet.get("quoted_status_id_str")),
 								            "reply_id"      : text.parse_int(
 								                tweet.get("in_reply_to_status_id_str")),
 								            "date"          : text.parse_datetime(
 								                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								            "user"          : self._transform_user(tweet["user"]),
 								            "lang"          : tweet["lang"],
 								            "content"       : tweet["full_text"],
 								            "favorite_count": tweet["favorite_count"],
 								            "quote_count"   : tweet["quote_count"],
 								            "reply_count"   : tweet["reply_count"],
 								            "retweet_count" : tweet["retweet_count"],
 								        }
 								        hashtags = entities.get("hashtags")
 								        if hashtags:
 								            tdata["hashtags"] = [t["text"] for t in hashtags]
 								        mentions = entities.get("user_mentions")
 								        if mentions:
 								            tdata["mentions"] = [{
 								                "id": text.parse_int(u["id_str"]),
 								                "name": u["screen_name"],
 								                "nick": u["name"],
 								            } for u in mentions]
-												[twitter] add 'reply_to' metadata to replies

											
										
										
											2020-06-09 21:48:04 +02:00
+								        if "in_reply_to_screen_name" in tweet:
 								            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								        if "author" in tweet:
 								            tdata["author"] = self._transform_user(tweet["author"])
-												[twitter] always provide an 'author' field (#831, #833)

The idea was to have less metadata clutter for most Tweets where
'author' and 'user' are the same (non-retweets), and only provide
a 'user' field.

The original Tweet author could be gotten with
{author[…]|user[…]}, but basically no one knows about that.

											
										
										
											2020-06-18 00:12:36 +02:00
+								        else:
 								            tdata["author"] = tdata["user"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
 								        return tdata
 								    def _transform_user(self, user):
 								        uid = user["id_str"]
 								        cache = self._user_cache
 								        if uid not in cache:
 								            cache[uid] = {
 								                "id"              : text.parse_int(uid),
 								                "name"            : user["screen_name"],
 								                "nick"            : user["name"],
 								                "description"     : user["description"],
 								                "location"        : user["location"],
 								                "date"            : text.parse_datetime(
 								                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								                "verified"        : user.get("verified", False),
 								                "profile_banner"  : user.get("profile_banner_url", ""),
 								                "profile_image"   : user.get(
 								                    "profile_image_url_https", "").replace("_normal.", "."),
 								                "favourites_count": user["favourites_count"],
 								                "followers_count" : user["followers_count"],
 								                "friends_count"   : user["friends_count"],
 								                "listed_count"    : user["listed_count"],
 								                "media_count"     : user["media_count"],
 								                "statuses_count"  : user["statuses_count"],
 								            }
 								        return cache[uid]
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def metadata(self):
 								        """Return general metadata"""
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
+								        return {}
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        """Yield all relevant tweet objects"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								    def login(self):
 								        username, password = self._get_auth_info()
 								        if username:
 								            self._update_cookies(self._login_impl(username, password))
 								    @cache(maxage=360*24*3600, keyarg=1)
 								    def _login_impl(self, username, password):
 								        self.log.info("Logging in as %s", username)
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								        url = "https://mobile.twitter.com/i/nojs_router"
 								        params = {"path": "/login"}
 								        headers = {"Referer": self.root + "/", "Origin": self.root}
 								        page = self.request(
 								            url, method="POST", params=params, headers=headers, data={}).text
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        pos = page.index('name="authenticity_token"')
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								        token = text.extract(page, 'value="', '"', pos)[0]
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								        url = "https://mobile.twitter.com/sessions"
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        data = {
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								            "authenticity_token"        : token,
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								            "session[username_or_email]": username,
 								            "session[password]"         : password,
 								            "remember_me"               : "1",
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								            "wfa"                       : "1",
 								            "commit"                    : "+Log+in+",
 								            "ui_metrics"                : "",
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								        response = self.request(url, method="POST", data=data)
 								        cookies = {
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											2020-03-12 22:02:12 +01:00
+								            cookie.name: cookie.value
 								            for cookie in self.session.cookies
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								            if cookie.domain == self.cookiedomain
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											2020-03-12 22:02:12 +01:00
+								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
 								        if "/error" in response.url or "auth_token" not in cookies:
 								            raise exception.AuthenticationError()
 								        return cookies
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								class TwitterTimelineExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's timeline"""
 								    subcategory = "timeline"
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								               r"/(?!search)([^/?&#]+)/?(?:$|[?#])")
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    test = (
 								        ("https://twitter.com/supernaturepics", {
 								            "range": "1-40",
 								            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
 								        }),
 								        ("https://mobile.twitter.com/supernaturepics?p=i"),
 								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).timeline_profile(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
 								class TwitterMediaExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's Media Tweets"""
 								    subcategory = "media"
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								               r"/(?!search)([^/?&#]+)/media(?!\w)")
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    test = (
 								        ("https://twitter.com/supernaturepics/media", {
 								            "range": "1-40",
 								            "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
 								        }),
 								        ("https://mobile.twitter.com/supernaturepics/media#t"),
 								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).timeline_media(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								class TwitterLikesExtractor(TwitterExtractor):
 								    """Extractor for liked tweets"""
 								    subcategory = "likes"
 								    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
 								               r"/(?!search)([^/?&#]+)/likes(?!\w)")
 								    test = ("https://twitter.com/supernaturepics/likes",)
 								    def tweets(self):
 								        return TwitterAPI(self).timeline_favorites(self.user)
 								class TwitterBookmarkExtractor(TwitterExtractor):
 								    """Extractor for bookmarked tweets"""
 								    subcategory = "bookmark"
 								    # the empty group keeps 'match.group(1)' valid for the base class
 								    pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
 								    test = ("https://twitter.com/i/bookmarks",)
 								    def tweets(self):
 								        """Return the bookmark timeline provided by TwitterAPI"""
 								        api = TwitterAPI(self)
 								        return api.timeline_bookmark()
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the sections in time you were stopped at. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											2019-10-16 18:23:10 +02:00
+								class TwitterSearchExtractor(TwitterExtractor):
 								    """Extractor for all images from a search timeline"""
 								    subcategory = "search"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								    directory_fmt = ("{category}", "Search", "{search}")
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the sections in time you were stopped at. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											2019-10-16 18:23:10 +02:00
+								    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								               r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
 								    test = ("https://twitter.com/search?q=nature", {
 								        "range": "1-40",
 								        "count": 40,
 								    })
 								    def metadata(self):
 								        """Return the metadata dict for this search

 								        Its "search" value is the query captured from the URL, passed
 								        through text.unquote(); 'directory_fmt' refers to it as "{search}".
 								        """
 								        return {"search": text.unquote(self.user)}
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a users most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the sections in time you were stopped at. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											2019-10-16 18:23:10 +02:00
+								    def tweets(self):
-												[twitter] improve search results (fixes #847)

Adding 'tweet_search_mode=live' to the query parameters
is the most important part here.

											
										
										
											2020-06-21 15:43:27 +02:00
+								        return TwitterAPI(self).search(text.unquote(self.user))
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								class TwitterTweetExtractor(TwitterExtractor):
-												[twitter] changes and improvements

- rename User- to TimelineExtractor
- rename 'userid' to 'user_id' to conform to the other ..._id values
- adjust archive_fmt to deal with retweets
- emulate browser behavior for API calls

											
										
										
											2018-08-18 18:58:10 +02:00
+								    """Extractor for images from individual tweets"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    subcategory = "tweet"
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-												[twitter] match and use 'i/web' status URLs

											
										
										
											2019-09-24 21:18:05 +02:00
+								               r"/([^/?&#]+|i/web)/status/(\d+)")
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    test = (
-												[twitter] replace unit test URLs

https://twitter.com/PicturesEarth was deleted

											
										
										
											2019-05-09 10:17:55 +02:00
+								        ("https://twitter.com/supernaturepics/status/604341487988576256", {
 								            "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
 								            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        }),
-												[twitter] extract 'date' metadata (#224)

											
										
										
											2019-04-21 15:41:22 +02:00
+								        # 4 images
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        ("https://twitter.com/perrypumas/status/894001459754180609", {
 								            "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
-												[twitter] extract 'date' metadata (#224)

											
										
										
											2019-04-21 15:41:22 +02:00
+								        }),
 								        # video
 								        ("https://twitter.com/perrypumas/status/1065692031626829824", {
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        }),
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											2019-07-17 15:35:42 +02:00
+								        # content with emoji, newlines, hashtags (#338)
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											2020-05-28 01:55:32 +02:00
+								        ("https://twitter.com/playpokemon/status/1263832915173048321", {
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								            "keyword": {"content": (
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											2020-05-28 01:55:32 +02:00
+								                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
 								                "Gifts! \n\nYou’ll be able to receive four Galarian form "
 								                "Pokémon with Hidden Abilities, plus some very useful items. "
 								                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
-												update test results

- twitter:

    Don't test the whole kwdict, only the actual content, since the
    keyword hash changes whenever that user changes his display name.

- khinsider:

    Download host changed

											
										
										
											2020-02-22 02:59:56 +01:00
+								            )},
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											2019-07-17 15:35:42 +02:00
+								        }),
-												[twitter] update tests

											
										
										
											2020-06-19 18:12:57 +02:00
+								        # Reply to deleted tweet (#403, #838)
 								        ("https://twitter.com/i/web/status/1170041925560258560", {
 								            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								        }),
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								        # 'replies' option (#705)
-												[twitter] update tests

											
										
										
											2020-06-19 18:12:57 +02:00
+								        ("https://twitter.com/i/web/status/1170041925560258560", {
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								            "options": (("replies", False),),
 								            "count": 0,
 								        }),
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								        # quoted tweet (#526, #854)
 								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
 								            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg",
 								            "count": 8,
 								        }),
 								        # "quoted" option (#854)
 								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
 								            "options": (("quoted", False),),
 								            "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg",
 								            "count": 4,
-												[twitter] handle quoted tweets (#526)

… and categorize them as retweets

											
										
										
											2020-01-04 21:26:55 +01:00
+								        }),
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											2020-01-18 21:26:46 +01:00
+								        # TwitPic embeds (#579)
 								        ("https://twitter.com/i/web/status/112900228289540096", {
 								            "options": (("twitpic", True),),
 								            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
 								            "count": 3,
 								        }),
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    )
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
 								    def __init__(self, match):
 								        """Initialize the base extractor and remember the status ID"""
 								        TwitterExtractor.__init__(self, match)
 								        # second capture group of 'pattern': the digits after "/status/"
 								        self.tweet_id = match.group(2)
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).tweet(self.tweet_id)
-												[twitter] handle API rate limits (#526)

											
										
										
											2020-01-04 23:46:29 +01:00
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								class TwitterAPI():
 								    def __init__(self, extractor):
 								        self.extractor = extractor
 								        self.headers = {
 								            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
 								                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
 								                             "4FA33AGWWjCpTnA",
 								            "x-guest-token": None,
 								            "x-twitter-client-language": "en",
 								            "x-twitter-active-user": "yes",
 								            "x-csrf-token": None,
 								            "Origin": "https://twitter.com",
 								            "Referer": "https://twitter.com/",
 								        }
 								        self.params = {
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            "include_profile_interstitial_type": "1",
 								            "include_blocking": "1",
 								            "include_blocked_by": "1",
 								            "include_followed_by": "1",
 								            "include_want_retweets": "1",
 								            "include_mute_edge": "1",
 								            "include_can_dm": "1",
 								            "include_can_media_tag": "1",
 								            "skip_status": "1",
 								            "cards_platform": "Web-12",
 								            "include_cards": "1",
 								            "include_composer_source": "true",
 								            "include_ext_alt_text": "true",
 								            "include_reply_count": "1",
 								            "tweet_mode": "extended",
 								            "include_entities": "true",
 								            "include_user_entities": "true",
 								            "include_ext_media_color": "true",
 								            "include_ext_media_availability": "true",
 								            "send_error_codes": "true",
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "simple_quoted_tweet": "true",
 								            #  "count": "20",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            "count": "100",
 								            "cursor": None,
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            "ext": "mediaStats,highlightedLabel,cameraMoment",
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "include_quote_count": "true",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								        }
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								        cookies = self.extractor.session.cookies
 								        # CSRF
 								        csrf = hashlib.md5(str(time.time()).encode()).hexdigest()
 								        self.headers["x-csrf-token"] = csrf
 								        cookies.set("ct0", csrf, domain=".twitter.com")
 								        if cookies.get("auth_token", domain=".twitter.com"):
 								            self.headers["x-twitter-auth-type"] = "OAuth2Session"
 								        else:
 								            # guest token
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
+								            guest_token = self._guest_token()
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            self.headers["x-guest-token"] = guest_token
 								            cookies.set("gt", guest_token, domain=".twitter.com")
 								    def tweet(self, tweet_id):
 								        endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								        tweets = []
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        for tweet in self._pagination(endpoint):
 								            if tweet["id_str"] == tweet_id:
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								                tweets.append(tweet)
 								                if "quoted_status_id_str" in tweet:
 								                    tweet_id = tweet["quoted_status_id_str"]
 								                else:
 								                    break
 								        return tweets
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								    def timeline_profile(self, screen_name):
 								        user = self.user_by_screen_name(screen_name)
 								        endpoint = "2/timeline/profile/{}.json".format(user["rest_id"])
 								        return self._pagination(endpoint)
 								    def timeline_media(self, screen_name):
 								        user = self.user_by_screen_name(screen_name)
 								        endpoint = "2/timeline/media/{}.json".format(user["rest_id"])
 								        return self._pagination(endpoint)
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								    def timeline_favorites(self, screen_name):
 								        user = self.user_by_screen_name(screen_name)
 								        endpoint = "2/timeline/favorites/{}.json".format(user["rest_id"])
 								        return self._pagination(endpoint)
 								    def timeline_bookmark(self):
 								        endpoint = "2/timeline/bookmark.json"
 								        return self._pagination(endpoint)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								    def search(self, query):
 								        endpoint = "2/search/adaptive.json"
 								        params = self.params.copy()
-												[twitter] improve search results (fixes #847)

Adding 'tweet_search_mode=live' to the query parameters
is the most important part here.

											
										
										
											2020-06-21 15:43:27 +02:00
+								        params["q"] = query
 								        params["tweet_search_mode"] = "live"
 								        params["query_source"] = "typed_query"
 								        params["pc"] = "1"
 								        params["spelling_corrections"] = "1"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return self._pagination(
 								            endpoint, params, "sq-I-t-", "sq-cursor-bottom")
 								    def user_by_screen_name(self, screen_name):
 								        endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName"
 								        params = {
 								            "variables": '{"screen_name":"' + screen_name + '"'
 								                         ',"withHighlightedLabel":true}'
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								        }
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return self._call(endpoint, params)["data"]["user"]
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
+								    @cache(maxage=3600)
 								    def _guest_token(self):
 								        endpoint = "1.1/guest/activate.json"
 								        return self._call(endpoint, None, "POST")["guest_token"]
 								    def _call(self, endpoint, params, method="GET"):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        url = "https://api.twitter.com/" + endpoint
 								        response = self.extractor.request(
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
+								            url, method=method, params=params, headers=self.headers,
 								            fatal=None)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        if response.status_code < 400:
 								            return response.json()
 								        if response.status_code == 429:
 								            self.extractor.wait(until=response.headers["x-rate-limit-reset"])
 								            return self._call(endpoint, params)
-												[twitter] improve error message formatting

											
										
										
											2020-07-06 23:13:05 +02:00
 								        try:
 								            msg = ", ".join(
 								                '"' + error["message"] + '"'
 								                for error in response.json()["errors"]
 								            )
 								        except Exception:
 								            msg = response.text
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        raise exception.StopExtraction(
-												[twitter] improve error message formatting

											
										
										
											2020-07-06 23:13:05 +02:00
+								            "%s %s (%s)", response.status_code, response.reason, msg)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								    def _pagination(self, endpoint, params=None,
 								                    entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
 								        if params is None:
 								            params = self.params.copy()
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
 								        while True:
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            cursor = tweet = None
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            data = self._call(endpoint, params)
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
 								            instr = data["timeline"]["instructions"]
 								            if not instr:
 								                return
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            tweets = data["globalObjects"]["tweets"]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            users = data["globalObjects"]["users"]
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            for entry in instr[0]["addEntries"]["entries"]:
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								                if entry["entryId"].startswith(entry_tweet):
-												[twitter] improve handling of deleted tweets (fixes #838)

											
										
										
											2020-06-19 14:40:17 +02:00
+								                    try:
 								                        tweet = tweets[
 								                            entry["content"]["item"]["content"]["tweet"]["id"]]
 								                    except KeyError:
-												[twitter] skip unavailable tweets

											
										
										
											2020-06-04 14:51:25 +02:00
+								                        self.extractor.log.debug(
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											2020-07-11 00:41:50 +02:00
+								                            "Skipping %s (deleted)",
 								                            entry["entryId"][len(entry_tweet):])
-												[twitter] skip unavailable tweets

											
										
										
											2020-06-04 14:51:25 +02:00
+								                        continue
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                    tweet["user"] = users[tweet["user_id_str"]]
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								                    if "retweeted_status_id_str" in tweet:
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								                        retweet = tweets.get(tweet["retweeted_status_id_str"])
 								                        if retweet:
 								                            tweet["author"] = users[retweet["user_id_str"]]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                    yield tweet
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								                    if "quoted_status_id_str" in tweet:
 								                        quoted = tweets.get(tweet["quoted_status_id_str"])
 								                        if quoted:
 								                            quoted["author"] = users[quoted["user_id_str"]]
 								                            quoted["user"] = tweet["user"]
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								                            quoted["quoted"] = True
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								                            yield quoted
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                elif entry["entryId"].startswith(entry_cursor):
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								                    cursor = entry["content"]["operation"]["cursor"]
 								                    if not cursor.get("stopOnEmptyResponse"):
 								                        # keep going even if there are no tweets
 								                        tweet = True
 								                    cursor = cursor["value"]
 								            if "replaceEntry" in instr[-1] :
 								                cursor = (instr[-1]["replaceEntry"]["entry"]
 								                          ["content"]["operation"]["cursor"]["value"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            if not cursor or not tweet:
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								                return
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            params["cursor"] = cursor