gallery-dl/gallery_dl/extractor/twitter.py

# -*- coding: utf-8 -*-

# Copyright 2016-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import json

# Shared URL prefix for the extractor patterns below: optional scheme,
# optional "www." or "mobile." subdomain, then twitter.com or nitter.net.
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.|mobile\.)?"
    r"(?:twitter\.com|nitter\.net)"
)


class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    cookienames = ("auth_token",)
    root = "https://twitter.com"

    def __init__(self, match):
        """Store the first pattern group and read the extractor options"""
        Extractor.__init__(self, match)
        # first capture group; subclasses use it as screen name, list ID
        # or search query depending on their pattern
        self.user = match.group(1)
        self.textonly = self.config("text-tweets", False)
        self.retweets = self.config("retweets", True)
        self.replies = self.config("replies", True)
        self.twitpic = self.config("twitpic", False)
        self.quoted = self.config("quoted", True)
        self.videos = self.config("videos", True)
        self.cards = self.config("cards", False)
        # transformed user objects, keyed by their 'id_str'
        self._user_cache = {}

    def items(self):
        """Yield Directory and Url messages for all selected tweets"""
        self.login()
        metadata = self.metadata()
        yield Message.Version, 1

        for tweet in self.tweets():

            # apply the 'retweets', 'replies' and 'quoted' options
            if not self.retweets and "retweeted_status_id_str" in tweet:
                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
                continue
            if not self.replies and "in_reply_to_user_id_str" in tweet:
                self.log.debug("Skipping %s (reply)", tweet["id_str"])
                continue
            if not self.quoted and "quoted" in tweet:
                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
                continue

            # collect one dict per downloadable file; each needs a "url"
            files = []
            if "extended_entities" in tweet:
                self._extract_media(tweet, files)
            if "card" in tweet and self.cards:
                self._extract_card(tweet, files)
            if self.twitpic:
                self._extract_twitpic(tweet, files)
            # tweets without files are only kept with 'text-tweets' enabled
            if not files and not self.textonly:
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)
            yield Message.Directory, tdata
            # the loop target stores the 1-based file index in tdata["num"]
            # before tdata is merged into the file's metadata
            for tdata["num"], file in enumerate(files, 1):
                file.update(tdata)
                url = file.pop("url")
                if "extension" not in file:
                    text.nameext_from_url(url, file)
                yield Message.Url, url, file

    def _extract_media(self, tweet, files):
        """Append file entries for a tweet's 'extended_entities' media"""
        for media in tweet["extended_entities"]["media"]:
            width = media["original_info"].get("width", 0)
            height = media["original_info"].get("height", 0)

            if "video_info" in media:
                if self.videos == "ytdl":
                    # "extension" is set explicitly so that items() does
                    # not derive one from the 'ytdl:' URL
                    files.append({
                        "url": "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"]),
                        "width"    : width,
                        "height"   : height,
                        "extension": None,
                    })
                elif self.videos:
                    video_info = media["video_info"]
                    # pick the variant with the highest bitrate;
                    # variants without a "bitrate" key count as 0
                    variant = max(
                        video_info["variants"],
                        key=lambda v: v.get("bitrate", 0),
                    )
                    files.append({
                        "url"     : variant["url"],
                        "width"   : width,
                        "height"  : height,
                        "bitrate" : variant.get("bitrate", 0),
                        "duration": video_info.get(
                            "duration_millis", 0) / 1000,
                    })
            elif "media_url_https" in media:
                url = media["media_url_https"]
                # rewrite '<base>.<fmt>' as '<base>?format=<fmt>&name='
                base, _, fmt = url.rpartition(".")
                base += "?format=" + fmt + "&name="
                files.append(text.nameext_from_url(url, {
                    "url"      : base + "orig",
                    "width"    : width,
                    "height"   : height,
                    "_fallback": self._image_fallback(base, url + ":"),
                }))
            else:
                files.append({"url": media["media_url"]})

    @staticmethod
    def _image_fallback(new, old):
        """Yield alternative image URLs built from two URL prefixes

        'new' is a '...?format=<fmt>&name=' prefix and 'old' a
        '...<fmt>:' prefix; a size name gets appended to each of them.
        """
        yield old + "orig"

        for size in ("large", "medium", "small"):
            yield new + size
            yield old + size

    def _extract_card(self, tweet, files):
        """Append a file entry for a tweet's card

        For 'summary' and 'summary_large_image' cards the first usable
        image binding is appended. For every other card type a 'ytdl:'
        URL for the tweet is appended if the 'videos' option is enabled.
        """
        card = tweet["card"]
        if card["name"] in ("summary", "summary_large_image"):
            bvals = card["binding_values"]
            for prefix in ("photo_image_full_size_",
                           "summary_photo_image_",
                           "thumbnail_image_"):
                for size in ("original", "x_large", "large", "small"):
                    key = prefix + size
                    if key in bvals:
                        value = bvals[key].get("image_value")
                        # items() pops "url" from every file entry, so a
                        # binding without an image URL is skipped and the
                        # search continues with the next candidate
                        if value and "url" in value:
                            files.append(value)
                            return
        elif self.videos:
            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
            files.append({"url": url})

    def _extract_twitpic(self, tweet, files):
        """Append file entries for twitpic.com links in a tweet's URLs"""
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            if "//twitpic.com/" in url and "/photos/" not in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                # image URL from the page's 'twitter:image' meta value
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if url:
                    files.append({"url": url})

    def _transform_tweet(self, tweet):
        """Build the metadata dict for a tweet object"""
        entities = tweet["entities"]
        tdata = {
            "tweet_id"      : text.parse_int(tweet["id_str"]),
            "retweet_id"    : text.parse_int(
                tweet.get("retweeted_status_id_str")),
            "quote_id"      : text.parse_int(
                tweet.get("quoted_status_id_str")),
            "reply_id"      : text.parse_int(
                tweet.get("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            "user"          : self._transform_user(tweet["user"]),
            "lang"          : tweet["lang"],
            "favorite_count": tweet["favorite_count"],
            "quote_count"   : tweet["quote_count"],
            "reply_count"   : tweet["reply_count"],
            "retweet_count" : tweet["retweet_count"],
        }

        # "hashtags" and "mentions" are only set when non-empty
        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [t["text"] for t in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [{
                "id": text.parse_int(u["id_str"]),
                "name": u["screen_name"],
                "nick": u["name"],
            } for u in mentions]

        # replace shortened links with their expanded form
        content = tweet["full_text"]
        urls = entities.get("urls")
        if urls:
            for url in urls:
                content = content.replace(url["url"], url["expanded_url"])
        # drop a trailing 'https://t.co/...' token that was not expanded
        txt, _, tco = content.rpartition(" ")
        tdata["content"] = txt if tco.startswith("https://t.co/") else content

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]

        # "author" falls back to the tweet's "user" when not set separately
        if "author" in tweet:
            tdata["author"] = self._transform_user(tweet["author"])
        else:
            tdata["author"] = tdata["user"]

        return tdata

    def _transform_user(self, user):
        """Return the metadata dict for a user object

        Results are stored in self._user_cache by the user's 'id_str',
        so repeated calls for one user return the same dict object.
        """
        uid = user["id_str"]
        cache = self._user_cache

        if uid not in cache:
            cache[uid] = {
                "id"              : text.parse_int(uid),
                "name"            : user["screen_name"],
                "nick"            : user["name"],
                "description"     : user["description"],
                "location"        : user["location"],
                "date"            : text.parse_datetime(
                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
                "verified"        : user.get("verified", False),
                "profile_banner"  : user.get("profile_banner_url", ""),
                "profile_image"   : user.get(
                    "profile_image_url_https", "").replace("_normal.", "."),
                "favourites_count": user["favourites_count"],
                "followers_count" : user["followers_count"],
                "friends_count"   : user["friends_count"],
                "listed_count"    : user["listed_count"],
                "media_count"     : user["media_count"],
                "statuses_count"  : user["statuses_count"],
            }
        return cache[uid]

    def _users_result(self, users):
        """Yield a Queue message for each user object in 'users'

        The 'users' option selects the target: unset or "timeline"
        queues '/i/user/{rest_id}', "media" queues '/id:{rest_id}/media',
        and any other value is used as a format string for the URL.
        """
        userfmt = self.config("users")
        if not userfmt or userfmt == "timeline":
            cls = TwitterTimelineExtractor
            fmt = (self.root + "/i/user/{rest_id}").format_map
        elif userfmt == "media":
            cls = TwitterMediaExtractor
            fmt = (self.root + "/id:{rest_id}/media").format_map
        else:
            cls = None
            fmt = userfmt.format_map

        for user in users:
            user["_extractor"] = cls
            yield Message.Queue, fmt(user), user

    def metadata(self):
        """Return general metadata"""
        return {}

    def tweets(self):
        """Yield all relevant tweet objects"""

    def login(self):
        """Log in with username and password unless auth cookies exist"""
        if not self._check_cookies(self.cookienames):
            username, password = self._get_auth_info()
            if username:
                self._update_cookies(self._login_impl(username, password))

    # NOTE(review): presumably caches the returned cookies per username
    # for 360 days -- confirm against the 'cache' decorator in ..cache
    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the session cookies as a dict

        Raises exception.AuthenticationError if the response URL points
        to a login verification or error page, or if no 'auth_token'
        cookie is present afterwards.
        """
        self.log.info("Logging in as %s", username)

        token = util.generate_token()
        self.session.cookies.clear()
        self.request(self.root + "/login")

        url = self.root + "/sessions"
        # the same token is sent as '_mb_tk' cookie and as form field
        cookies = {
            "_mb_tk": token,
        }
        data = {
            "redirect_after_login"      : "/",
            "remember_me"               : "1",
            "authenticity_token"        : token,
            "wfa"                       : "1",
            "ui_metrics"                : "{}",
            "session[username_or_email]": username,
            "session[password]"         : password,
        }
        response = self.request(
            url, method="POST", cookies=cookies, data=data)

        if "/account/login_verification" in response.url:
            raise exception.AuthenticationError(
                "Login with two-factor authentication is not supported")

        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
        }

        if "/error" in response.url or "auth_token" not in cookies:
            raise exception.AuthenticationError()
        return cookies


class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
        ("https://www.twitter.com/id:2976459548"),
        ("https://twitter.com/i/user/2976459548"),
        ("https://twitter.com/intent/user?user_id=2976459548"),
    )

    def __init__(self, match):
        """Use 'id:<number>' as user when the URL contains a numeric ID"""
        super().__init__(match)
        # group 2 is only set for the '/i/user/' and '/intent/user' forms
        numeric_id = match.group(2)
        if numeric_id:
            self.user = "id:{}".format(numeric_id)

    def tweets(self):
        """Return the tweets of the user's profile timeline"""
        api = TwitterAPI(self)
        return api.timeline_profile(self.user)


class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
        ("https://www.twitter.com/id:2976459548/media"),
    )

    def tweets(self):
        """Return the tweets of the user's media timeline"""
        api = TwitterAPI(self)
        return api.timeline_media(self.user)


class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
    test = ("https://twitter.com/supernaturepics/likes",)

    def metadata(self):
        """Return the name from the URL as 'user_likes'"""
        liked_by = self.user
        return {"user_likes": liked_by}

    def tweets(self):
        """Return the tweets of the user's favorites timeline"""
        api = TwitterAPI(self)
        return api.timeline_favorites(self.user)


class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # the empty group provides the group(1) read by the base class
    pattern = BASE_PATTERN + r"/i/bookmarks()"
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        """Return the tweets of the bookmark timeline"""
        api = TwitterAPI(self)
        return api.timeline_bookmark()


class TwitterListExtractor(TwitterExtractor):
    """Extractor for Twitter lists"""
    subcategory = "list"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        """Return the tweets of the list timeline"""
        # for this pattern, self.user holds the numeric list ID
        list_id = self.user
        return TwitterAPI(self).timeline_list(list_id)


class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for members of a Twitter list"""
    subcategory = "list-members"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        """Return Queue messages for every member of the list"""
        self.login()
        # for this pattern, self.user holds the numeric list ID
        members = TwitterAPI(self).list_members(self.user)
        return self._users_result(members)


class TwitterFollowingExtractor(TwitterExtractor):
    """Extractor for followed users"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/following"),
        ("https://www.twitter.com/id:2976459548/following"),
    )

    def items(self):
        """Return Queue messages for every user followed by the user"""
        self.login()
        followed = TwitterAPI(self).user_following(self.user)
        return self._users_result(followed)


class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for all images from a search timeline"""
    subcategory = "search"
    directory_fmt = ("{category}", "Search", "{search}")
    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        """Return the unquoted search query as 'search'"""
        # for this pattern, self.user holds the raw 'q' parameter value
        query = text.unquote(self.user)
        return {"search": query}

    def tweets(self):
        """Return the tweets found for the unquoted search query"""
        query = text.unquote(self.user)
        return TwitterAPI(self).search(query)


class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
            "count": 8,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", False),),
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True),),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id": 1296296016002547713,
                "date"    : "dt:2020-08-20 04:00:28",
            },
        }),
        # all Tweets from a conversation (#1319)
        ("https://twitter.com/BlankArts_/status/1323314488611872769", {
            "options": (("conversations", True),),
            "count": ">= 50",
        }),
        # retweet with missing media entities (#1555)
        ("https://twitter.com/morino_ya/status/1392763691599237121", {
            "count": 4,
        }),
    )

    def __init__(self, match):
        """Remember the numeric status ID from the URL"""
        super().__init__(match)
        self.tweet_id = match.group(2)

    def tweets(self):
        """Return the tweet itself or, with 'conversations', its thread"""
        whole_thread = self.config("conversations", False)
        api = TwitterAPI(self)
        if whole_thread:
            return api.conversation(self.tweet_id)
        return api.tweet(self.tweet_id)


class TwitterImageExtractor(Extractor):
    """Extractor for single images from pbs.twimg.com media URLs"""
    category = "twitter"
    subcategory = "image"
    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
    test = (
        # query parameters are separated by '&' (was a stray '%')
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig"),
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
    )

    def __init__(self, match):
        """Store the media ID and the image format from the URL"""
        Extractor.__init__(self, match)
        self.id, self.fmt = match.groups()

    def items(self):
        """Yield a Directory and a Url message for the 'orig' image"""
        base = "https://pbs.twimg.com/media/" + self.id
        # both URL styles are built; 'new' is tried first and the
        # remaining combinations are offered as fallback URLs
        new = base + "?format=" + self.fmt + "&name="
        old = base + "." + self.fmt + ":"

        data = {
            "filename": self.id,
            "extension": self.fmt,
            "_fallback": TwitterExtractor._image_fallback(new, old),
        }

        yield Message.Directory, data
        yield Message.Url, new + "orig", data


class TwitterAPI():
    """Client for the twitter.com web API endpoints used by the extractors"""

    def __init__(self, extractor):
        """Set up headers, default parameters and CSRF/auth/guest tokens

        Requests are sent through 'extractor' and its session cookies
        decide between logged-in and guest mode.
        """
        self.extractor = extractor

        self.root = "https://twitter.com/i/api"
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-auth-type": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Referer": "https://twitter.com/",
        }
        # default query parameters for the timeline/search endpoints
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_ext_alt_text": "true",
            "include_quote_count": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel",
        }

        cookies = self.extractor.session.cookies
        cookiedomain = ".twitter.com"

        # CSRF
        csrf_token = cookies.get("ct0", domain=cookiedomain)
        if not csrf_token:
            csrf_token = util.generate_token()
            cookies.set("ct0", csrf_token, domain=cookiedomain)
        self.headers["x-csrf-token"] = csrf_token

        if cookies.get("auth_token", domain=cookiedomain):
            # logged in
            self.headers["x-twitter-auth-type"] = "OAuth2Session"
        else:
            # guest
            guest_token = self._guest_token()
            cookies.set("gt", guest_token, domain=cookiedomain)
            self.headers["x-guest-token"] = guest_token

    def tweet(self, tweet_id):
        """Return a list with the given tweet and its chain of quoted tweets

        A tweet matches if its 'id_str' or '_retweet_id_str' equals the
        ID currently searched for. After a match, the search continues
        with that tweet's 'quoted_status_id_str', if it has one.
        """
        endpoint = "/2/timeline/conversation/{}.json".format(tweet_id)
        tweets = []
        for tweet in self._pagination(endpoint):
            if tweet["id_str"] == tweet_id or \
                    tweet.get("_retweet_id_str") == tweet_id:
                tweets.append(tweet)
                if "quoted_status_id_str" in tweet:
                    tweet_id = tweet["quoted_status_id_str"]
                else:
                    break
        return tweets

    def conversation(self, conversation_id):
        """Yield all tweets of the conversation timeline for an ID"""
        endpoint = "/2/timeline/conversation/{}.json".format(conversation_id)
        return self._pagination(endpoint)

    def timeline_profile(self, screen_name):
        """Yield the tweets of a user's profile timeline"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "/2/timeline/profile/{}.json".format(user_id)
        params = self.params.copy()
        params["include_tweet_replies"] = "false"
        return self._pagination(endpoint, params)

    def timeline_media(self, screen_name):
        """Yield the tweets of a user's media timeline"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "/2/timeline/media/{}.json".format(user_id)
        return self._pagination(endpoint)

    def timeline_favorites(self, screen_name):
        """Yield the tweets of a user's favorites timeline"""
        user_id = self._user_id_by_screen_name(screen_name)
        endpoint = "/2/timeline/favorites/{}.json".format(user_id)
        params = self.params.copy()
        params["sorted_by_time"] = "true"
        # pass the customized copy; without it 'sorted_by_time' is lost
        return self._pagination(endpoint, params)

    def timeline_bookmark(self):
        """Yield the tweets of the bookmark timeline"""
        endpoint = "/2/timeline/bookmark.json"
        return self._pagination(endpoint)

    def timeline_list(self, list_id):
        """Yield the tweets of a list timeline"""
        endpoint = "/2/timeline/list.json"
        params = self.params.copy()
        params["list_id"] = list_id
        params["ranking_mode"] = "reverse_chronological"
        return self._pagination(endpoint, params)

    def search(self, query):
        """Yield the tweets returned by the search endpoint for 'query'"""
        endpoint = "/2/search/adaptive.json"
        params = self.params.copy()
        params["q"] = query
        params["tweet_search_mode"] = "live"
        params["query_source"] = "typed_query"
        params["pc"] = "1"
        params["spelling_corrections"] = "1"
        return self._pagination(endpoint, params)

    def list_by_rest_id(self, list_id):
        """Return the 'list' object for a list ID

        Raises exception.NotFoundError if the response has no such object.
        """
        endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId"
        # json.dumps escapes quotes and backslashes in the ID;
        # compact separators keep the output free of extra whitespace
        params = {"variables": json.dumps({
            "listId": list_id,
            "withUserResult": False,
        }, separators=(",", ":"))}
        try:
            return self._call(endpoint, params)["data"]["list"]
        except KeyError:
            raise exception.NotFoundError("list")

    def list_members(self, list_id):
        """Yield the user objects of a list's members timeline"""
        endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers"
        variables = {
            "listId": list_id,
            "count" : 100,
            "withTweetResult": False,
            "withUserResult" : False,
        }
        return self._pagination_graphql(
            endpoint, variables, "list", "members_timeline")

    def user_following(self, screen_name):
        """Yield the user objects of a user's following timeline"""
        endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following"
        variables = {
            "userId": self._user_id_by_screen_name(screen_name),
            "count" : 100,
            "withTweetResult": False,
            "withUserResult" : False,
            "withTweetQuoteCount"   : False,
            "withHighlightedLabel"  : False,
            "includePromotedContent": False,
        }
        return self._pagination_graphql(
            endpoint, variables, "user", "following_timeline")

    def user_by_screen_name(self, screen_name):
        """Return the 'user' object for a screen name

        Raises exception.NotFoundError if the response has no such object.
        """
        endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName"
        # the name comes from the input URL, so it is serialized with
        # json.dumps instead of being spliced into a JSON string
        params = {"variables": json.dumps({
            "screen_name": screen_name,
            "withHighlightedLabel": True,
        }, separators=(",", ":"))}
        try:
            return self._call(endpoint, params)["data"]["user"]
        except KeyError:
            raise exception.NotFoundError("user")

    def _user_id_by_screen_name(self, screen_name):
        """Return the user ID for a screen name or an 'id:<ID>' string"""
        if screen_name.startswith("id:"):
            return screen_name[3:]
        return self.user_by_screen_name(screen_name)["rest_id"]

    # NOTE(review): presumably reuses the token for up to an hour --
    # confirm against the 'cache' decorator in ..cache
    @cache(maxage=3600)
    def _guest_token(self):
        """Request a guest token from api.twitter.com and return it"""
        root = "https://api.twitter.com"
        endpoint = "/1.1/guest/activate.json"
        return self._call(endpoint, None, root, "POST")["guest_token"]

    def _call(self, endpoint, params, root=None, method="GET"):
        """Send an API request and return the decoded JSON response

        Status 429 responses are retried after waiting. Any other
        status >= 400 raises exception.StopExtraction with the error
        messages from the response.
        """
        if root is None:
            root = self.root

        while True:
            response = self.extractor.request(
                root + endpoint, method=method, params=params,
                headers=self.headers, fatal=None)

            # update 'x-csrf-token' header (#1170)
            csrf_token = response.cookies.get("ct0")
            if csrf_token:
                self.headers["x-csrf-token"] = csrf_token

            if response.status_code < 400:
                return response.json()
            if response.status_code == 429:
                # wait until the advertised reset value, or for
                # 60 seconds when the header is missing
                until = response.headers.get("x-rate-limit-reset")
                seconds = None if until else 60
                self.extractor.wait(until=until, seconds=seconds)
                continue

            try:
                msg = ", ".join(
                    '"' + error["message"] + '"'
                    for error in response.json()["errors"]
                )
            except Exception:
                # no parsable error list; report the raw response body
                msg = response.text
            raise exception.StopExtraction(
                "%s %s (%s)", response.status_code, response.reason, msg)

    def _pagination(self, endpoint, params=None):
        """Yield tweet objects from all pages of a timeline endpoint

        'params' defaults to a copy of self.params; its "cursor" entry
        is updated in place before each following request.
        """
        if params is None:
            params = self.params.copy()
        original_retweets = (self.extractor.retweets == "original")
        # a pinned tweet is only looked for in the first response
        pinned_tweet = True

        while True:
            # 'tweet' doubles as a "keep paginating" flag: it is truthy
            # after at least one tweet was processed or when the bottom
            # cursor does not set 'stopOnEmptyResponse'
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweet_ids = []
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            if pinned_tweet:
                if "pinEntry" in instr[-1]:
                    tweet_ids.append(instr[-1]["pinEntry"]["entry"]["content"]
                                     ["item"]["content"]["tweet"]["id"])
                pinned_tweet = False

            # collect tweet IDs and cursor value
            for entry in instr[0]["addEntries"]["entries"]:
                entry_startswith = entry["entryId"].startswith

                if entry_startswith(("tweet-", "sq-I-t-")):
                    tweet_ids.append(
                        entry["content"]["item"]["content"]["tweet"]["id"])

                elif entry_startswith("homeConversation-"):
                    tweet_ids.extend(
                        entry["content"]["timelineModule"]["metadata"]
                        ["conversationMetadata"]["allTweetIds"][::-1])

                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse"):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

                elif entry_startswith("conversationThread-"):
                    # strip the "tweet-" prefix to get the tweet ID
                    tweet_ids.extend(
                        item["entryId"][6:]
                        for item in entry["content"]["timelineModule"]["items"]
                        if item["entryId"].startswith("tweet-")
                    )

            # process tweets
            for tweet_id in tweet_ids:
                try:
                    tweet = tweets[tweet_id]
                except KeyError:
                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
                    continue

                if "retweeted_status_id_str" in tweet:
                    retweet = tweets.get(tweet["retweeted_status_id_str"])
                    if original_retweets:
                        # yield the retweeted tweet in place of the retweet
                        if not retweet:
                            continue
                        retweet["_retweet_id_str"] = tweet["id_str"]
                        tweet = retweet
                    elif retweet:
                        tweet["author"] = users[retweet["user_id_str"]]
                        # copy media entities missing from the retweet
                        if "extended_entities" in retweet and \
                                "extended_entities" not in tweet:
                            tweet["extended_entities"] = \
                                retweet["extended_entities"]
                tweet["user"] = users[tweet["user_id_str"]]
                yield tweet

                if "quoted_status_id_str" in tweet:
                    quoted = tweets.get(tweet["quoted_status_id_str"])
                    if quoted:
                        # the quoted tweet keeps its own user as "author"
                        # and takes the quoting tweet's "user"
                        quoted["author"] = users[quoted["user_id_str"]]
                        quoted["user"] = tweet["user"]
                        quoted["quoted"] = True
                        yield quoted

            # update cursor value
            if "replaceEntry" in instr[-1]:
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            if not cursor or not tweet:
                return
            params["cursor"] = cursor

    def _pagination_graphql(self, endpoint, variables, key, timeline):
        """Yield user objects from all pages of a GraphQL timeline

        The instructions are read from data[key][timeline] of each
        response; 'variables' gets its "cursor" entry updated in place.
        Raises exception.AuthorizationError if that path is missing.
        """
        while True:
            # 'entry' stays None when a response adds no entries at all
            cursor = entry = stop = None
            params = {"variables": json.dumps(variables)}
            data = self._call(endpoint, params)

            try:
                instructions = \
                    data["data"][key][timeline]["timeline"]["instructions"]
            except KeyError:
                raise exception.AuthorizationError()

            for instr in instructions:
                if instr["type"] == "TimelineAddEntries":
                    for entry in instr["entries"]:
                        if entry["entryId"].startswith("user-"):
                            yield entry["content"]["itemContent"]["user"]
                        elif entry["entryId"].startswith("cursor-bottom-"):
                            cursor = entry["content"]["value"]
                elif instr["type"] == "TimelineTerminateTimeline":
                    if instr["direction"] == "Bottom":
                        stop = True

            if stop or not cursor or not entry:
                return
            variables["cursor"] = cursor
            variables["cursor"] = cursor
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								# -*- coding: utf-8 -*-
-												rename 'generate_csrf_token()' to just 'generate_token()'

and add a 'size' argument

											
										
										
											2021-01-11 22:12:40 +01:00
+								# Copyright 2016-2021 Mike Fährmann
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								#
 								# This program is free software; you can redistribute it and/or modify
 								# it under the terms of the GNU General Public License version 2 as
 								# published by the Free Software Foundation.
-												[twitter] force old login page layout (fixes #584, fixes #598)

											
										
										
											2020-02-02 17:19:14 +01:00
+								"""Extractors for https://twitter.com/"""
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
 								from .common import Extractor, Message
-												add a general 'generate_csrf_token()' function

											
										
										
											2020-10-15 00:43:26 +02:00
+								from .. import text, util, exception
-												[twitter] don't cache results of 'user_by_screen_name()'

A 'keyarg=1' argument to the memcache decorator would have worked as
well, but keeping the user object in memory isn't useful for the vast
majority of use cases and only wastes space.

(closes #817)

											
										
										
											2020-06-10 20:58:42 +02:00
+								from ..cache import cache
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
+								import json
-												code adjustments according to pep8 nr2

											
										
										
											2017-02-01 00:53:19 +01:00
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated", all requests are still
done always via the Twitter API.
											
										
										
											2020-07-13 23:48:42 +02:00
+								BASE_PATTERN = (
 								    r"(?:https?://)?(?:www\.|mobile\.)?"
 								    r"(?:twitter\.com|nitter\.net)"
 								)
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								class TwitterExtractor(Extractor):
 								    """Base class for twitter extractors"""
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
+								    category = "twitter"
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								    directory_fmt = ("{category}", "{user[name]}")
 								    filename_fmt = "{tweet_id}_{num}.{extension}"
 								    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											2020-03-12 22:02:12 +01:00
+								    cookiedomain = ".twitter.com"
-												[twitter] skip login if 'auth_token' cookie is present

											
										
										
											2021-01-25 14:52:22 +01:00
+								    cookienames = ("auth_token",)
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    root = "https://twitter.com"
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								    def __init__(self, match):
-												propagate 'match' to base extractor constructor

											
										
										
											2019-02-11 13:31:10 +01:00
+								        Extractor.__init__(self, match)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								        self.user = match.group(1)
-												[twitter] rename 'text-only' to 'text-tweets' (#570)

											
										
										
											2021-05-22 21:07:21 +02:00
+								        self.textonly = self.config("text-tweets", False)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								        self.retweets = self.config("retweets", True)
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								        self.replies = self.config("replies", True)
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											2020-01-18 21:26:46 +01:00
+								        self.twitpic = self.config("twitpic", False)
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								        self.quoted = self.config("quoted", True)
-												[twitter] change default value for 'videos' to 'true'

Every other 'videos' option defaulted to 'true', except Twitter.

											
										
										
											2020-02-14 01:03:42 +01:00
+								        self.videos = self.config("videos", True)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								        self.cards = self.config("cards", False)
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								        self._user_cache = {}
-												[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as
"unsupported" URLs, so that these can then be downloaded with youtube-dl

There are a lot of improvements to be made to the current
implementation, but it works and does what it is supposed to, even if
inefficient as can be ...

											
										
										
											2018-09-30 18:41:39 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def items(self):
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        self.login()
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
+								        metadata = self.metadata()
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								        yield Message.Version, 1
 								        for tweet in self.tweets():
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
-												[twitter] add debug messages for all skipped Tweets (#867)

											
										
										
											2020-07-11 00:41:50 +02:00
+								            if not self.retweets and "retweeted_status_id_str" in tweet:
 								                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
 								                continue
 								            if not self.replies and "in_reply_to_user_id_str" in tweet:
 								                self.log.debug("Skipping %s (reply)", tweet["id_str"])
 								                continue
 								            if not self.quoted and "quoted" in tweet:
 								                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								                continue
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								            files = []
 								            if "extended_entities" in tweet:
 								                self._extract_media(tweet, files)
 								            if "card" in tweet and self.cards:
 								                self._extract_card(tweet, files)
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								            if self.twitpic:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								                self._extract_twitpic(tweet, files)
-												[twitter] add 'text-only' option (#570)

											
										
										
											2021-05-22 17:01:49 +02:00
+								            if not files and not self.textonly:
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								                continue
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								            tdata = self._transform_tweet(tweet)
 								            tdata.update(metadata)
 								            yield Message.Directory, tdata
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								            for tdata["num"], file in enumerate(files, 1):
 								                file.update(tdata)
 								                url = file.pop("url")
 								                if "extension" not in file:
 								                    text.nameext_from_url(url, file)
 								                yield Message.Url, url, file
 								    def _extract_media(self, tweet, files):
 								        for media in tweet["extended_entities"]["media"]:
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											2020-11-05 22:53:29 +01:00
+								            width = media["original_info"].get("width", 0)
 								            height = media["original_info"].get("height", 0)
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
 								            if "video_info" in media:
 								                if self.videos == "ytdl":
 								                    files.append({
 								                        "url": "ytdl:{}/i/web/status/{}".format(
 								                            self.root, tweet["id_str"]),
 								                        "width"    : width,
 								                        "height"   : height,
 								                        "extension": None,
 								                    })
 								                elif self.videos:
 								                    video_info = media["video_info"]
 								                    variant = max(
 								                        video_info["variants"],
 								                        key=lambda v: v.get("bitrate", 0),
 								                    )
 								                    files.append({
 								                        "url"     : variant["url"],
 								                        "width"   : width,
 								                        "height"  : height,
 								                        "bitrate" : variant.get("bitrate", 0),
 								                        "duration": video_info.get(
 								                            "duration_millis", 0) / 1000,
 								                    })
 								            elif "media_url_https" in media:
 								                url = media["media_url_https"]
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								                base, _, fmt = url.rpartition(".")
 								                base += "?format=" + fmt + "&name="
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								                files.append(text.nameext_from_url(url, {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								                    "url"      : base + "orig",
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								                    "width"    : width,
 								                    "height"   : height,
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								                    "_fallback": self._image_fallback(base, url + ":"),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								                }))
 								            else:
 								                files.append({"url": media["media_url"]})
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								    @staticmethod
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								    def _image_fallback(new, old):
 								        yield old + "orig"
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
 								        for size in ("large", "medium", "small"):
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								            yield new + size
 								            yield old + size
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								    def _extract_card(self, tweet, files):
 								        card = tweet["card"]
 								        if card["name"] in ("summary", "summary_large_image"):
 								            bvals = card["binding_values"]
 								            for prefix in ("photo_image_full_size_",
 								                           "summary_photo_image_",
 								                           "thumbnail_image_"):
 								                for size in ("original", "x_large", "large", "small"):
 								                    key = prefix + size
 								                    if key in bvals:
 								                        files.append(bvals[key]["image_value"])
 								                        return
-												[twitter] don't use youtube-dl for cards when videos are disabled

(#1416)

											
										
										
											2021-04-01 14:26:08 +02:00
+								        elif self.videos:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
 								            files.append({"url": url})
 								    def _extract_twitpic(self, tweet, files):
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								        for url in tweet["entities"].get("urls", ()):
 								            url = url["expanded_url"]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											2020-09-21 22:21:16 +02:00
+								            if "//twitpic.com/" in url and "/photos/" not in url:
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
+								                response = self.request(url, fatal=False)
 								                if response.status_code >= 400:
 								                    continue
 								                url = text.extract(
 								                    response.text, 'name="twitter:image" value="', '"')[0]
-												[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs

											
										
										
											2020-09-21 22:21:16 +02:00
+								                if url:
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								                    files.append({"url": url})
-												[twitter] restore TwitPic support

											
										
										
											2020-06-04 01:22:34 +02:00
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								    def _transform_tweet(self, tweet):
 								        entities = tweet["entities"]
 								        tdata = {
 								            "tweet_id"      : text.parse_int(tweet["id_str"]),
 								            "retweet_id"    : text.parse_int(
 								                tweet.get("retweeted_status_id_str")),
 								            "quote_id"      : text.parse_int(
 								                tweet.get("quoted_status_id_str")),
 								            "reply_id"      : text.parse_int(
 								                tweet.get("in_reply_to_status_id_str")),
 								            "date"          : text.parse_datetime(
 								                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								            "user"          : self._transform_user(tweet["user"]),
 								            "lang"          : tweet["lang"],
 								            "favorite_count": tweet["favorite_count"],
 								            "quote_count"   : tweet["quote_count"],
 								            "reply_count"   : tweet["reply_count"],
 								            "retweet_count" : tweet["retweet_count"],
 								        }
 								        hashtags = entities.get("hashtags")
 								        if hashtags:
 								            tdata["hashtags"] = [t["text"] for t in hashtags]
 								        mentions = entities.get("user_mentions")
 								        if mentions:
 								            tdata["mentions"] = [{
 								                "id": text.parse_int(u["id_str"]),
 								                "name": u["screen_name"],
 								                "nick": u["name"],
 								            } for u in mentions]
-												[twitter] resolve t.co URLs in 'content' (#1532)

											
										
										
											2021-05-15 02:46:46 +02:00
+								        content = tweet["full_text"]
 								        urls = entities.get("urls")
 								        if urls:
 								            for url in urls:
 								                content = content.replace(url["url"], url["expanded_url"])
-												[twitter] strip useless t.co links (#1532)

The 'full_text' of Tweets with media content usually ends with a t.co
link to itself. This commit removes those.

											
										
										
											2021-05-16 02:35:55 +02:00
+								        txt, _, tco = content.rpartition(" ")
 								        tdata["content"] = txt if tco.startswith("https://t.co/") else content
-												[twitter] resolve t.co URLs in 'content' (#1532)

											
										
										
											2021-05-15 02:46:46 +02:00
-												[twitter] add 'reply_to' metadata to replies

											
										
										
											2020-06-09 21:48:04 +02:00
+								        if "in_reply_to_screen_name" in tweet:
 								            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								        if "author" in tweet:
 								            tdata["author"] = self._transform_user(tweet["author"])
-												[twitter] always provide an 'author' field (#831, #833)

The idea was to have less metadata clutter for most Tweets where
'author' and 'user' are the same (non-retweets), and only provide
a 'user' field.

The original Tweet author could be gotten with
{author[…]|user[…]}, but basically no one knows about that.

											
										
										
											2020-06-18 00:12:36 +02:00
+								        else:
 								            tdata["author"] = tdata["user"]
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
 								        return tdata
 								    def _transform_user(self, user):
 								        uid = user["id_str"]
 								        cache = self._user_cache
 								        if uid not in cache:
 								            cache[uid] = {
 								                "id"              : text.parse_int(uid),
 								                "name"            : user["screen_name"],
 								                "nick"            : user["name"],
 								                "description"     : user["description"],
 								                "location"        : user["location"],
 								                "date"            : text.parse_datetime(
 								                    user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
 								                "verified"        : user.get("verified", False),
 								                "profile_banner"  : user.get("profile_banner_url", ""),
 								                "profile_image"   : user.get(
 								                    "profile_image_url_https", "").replace("_normal.", "."),
 								                "favourites_count": user["favourites_count"],
 								                "followers_count" : user["followers_count"],
 								                "friends_count"   : user["friends_count"],
 								                "listed_count"    : user["listed_count"],
 								                "media_count"     : user["media_count"],
 								                "statuses_count"  : user["statuses_count"],
 								            }
 								        return cache[uid]
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
+								    def _users_result(self, users):
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											2021-03-20 01:31:12 +01:00
+								        userfmt = self.config("users")
 								        if not userfmt or userfmt == "timeline":
 								            cls = TwitterTimelineExtractor
 								            fmt = (self.root + "/i/user/{rest_id}").format_map
 								        elif userfmt == "media":
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
+								            cls = TwitterMediaExtractor
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											2021-03-20 01:31:12 +01:00
+								            fmt = (self.root + "/id:{rest_id}/media").format_map
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
+								        else:
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											2021-03-20 01:31:12 +01:00
+								            cls = None
 								            fmt = userfmt.format_map
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
 								        for user in users:
 								            user["_extractor"] = cls
-												[twitter] allow specifying a custom format for user results

(#1337)

											
										
										
											2021-03-20 01:31:12 +01:00
+								            yield Message.Queue, fmt(user), user
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def metadata(self):
 								        """Return general metadata"""
-												[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs

											
										
										
											2019-11-30 21:51:08 +01:00
+								        return {}
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        """Yield all relevant tweet objects"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								    def login(self):
-												[twitter] skip login if 'auth_token' cookie is present

											
										
										
											2021-01-25 14:52:22 +01:00
+								        if not self._check_cookies(self.cookienames):
 								            username, password = self._get_auth_info()
 								            if username:
 								                self._update_cookies(self._login_impl(username, password))
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
 								    @cache(maxage=360*24*3600, keyarg=1)
 								    def _login_impl(self, username, password):
 								        self.log.info("Logging in as %s", username)
-												rename 'generate_csrf_token()' to just 'generate_token()'

and add a 'size' argument

											
										
										
											2021-01-11 22:12:40 +01:00
+								        token = util.generate_token()
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											2020-12-28 15:54:47 +01:00
+								        self.session.cookies.clear()
 								        self.request(self.root + "/login")
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											2020-12-28 15:54:47 +01:00
+								        url = self.root + "/sessions"
 								        cookies = {
 								            "_mb_tk": token,
 								        }
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								        data = {
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											2020-12-28 15:54:47 +01:00
+								            "redirect_after_login"      : "/",
 								            "remember_me"               : "1",
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								            "authenticity_token"        : token,
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											2020-12-28 15:54:47 +01:00
+								            "wfa"                       : "1",
 								            "ui_metrics"                : "{}",
-												[twitter] add login support (#214)

											
										
										
											2019-04-07 23:06:57 +02:00
+								            "session[username_or_email]": username,
 								            "session[password]"         : password,
 								        }
-												[twitter] fix login with username & password

It is no longer possible to get an 'authenticity_token' from Twitter's
JavaScript-free login form, which got disabled a few days ago.

Generating a random 16 byte hex string client-side and sending that as
a cookie alongside the regular login form works just as well.

											
										
										
											2020-12-28 15:54:47 +01:00
+								        response = self.request(
 								            url, method="POST", cookies=cookies, data=data)
-												[twitter] better error message when logging in with 2FA (#1409)

											
										
										
											2021-03-26 21:52:55 +01:00
+								        if "/account/login_verification" in response.url:
 								            raise exception.AuthenticationError(
 								                "Login with two-factor authentication is not supported")
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
+								        cookies = {
-												[twitter] use a simpler data structure to store cookies in cache

Use a dict with name-value pairs instead of an entire
RequestsCookieJar object.

											
										
										
											2020-03-12 22:02:12 +01:00
+								            cookie.name: cookie.value
 								            for cookie in self.session.cookies
 								        }
-												[twitter] login using the mobile nojs login page

											
										
										
											2020-06-04 00:07:12 +02:00
 								        if "/error" in response.url or "auth_token" not in cookies:
 								            raise exception.AuthenticationError()
 								        return cookies
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								class TwitterTimelineExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's timeline"""
 								    subcategory = "timeline"
-												[twitter] match '/i/user/ID' URLs

											
										
										
											2021-01-20 00:33:57 +01:00
+								    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
 								               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    test = (
 								        ("https://twitter.com/supernaturepics", {
 								            "range": "1-40",
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								        }),
 								        ("https://mobile.twitter.com/supernaturepics?p=i"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								        ("https://www.twitter.com/id:2976459548"),
-												[twitter] match '/i/user/ID' URLs

											
										
										
											2021-01-20 00:33:57 +01:00
+								        ("https://twitter.com/i/user/2976459548"),
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											2020-09-08 23:17:50 +02:00
+								        ("https://twitter.com/intent/user?user_id=2976459548"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											2020-09-08 23:17:50 +02:00
+								    def __init__(self, match):
 								        TwitterExtractor.__init__(self, match)
-												[twitter] match '/i/user/ID' URLs

											
										
										
											2021-01-20 00:33:57 +01:00
+								        user_id = match.group(2)
 								        if user_id:
 								            self.user = "id:" + user_id
-												[twitter] support '/intent/user?user_id=…' URLs (#980)

											
										
										
											2020-09-08 23:17:50 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).timeline_profile(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
 								class TwitterMediaExtractor(TwitterExtractor):
 								    """Extractor for all images from a user's Media Tweets"""
 								    subcategory = "media"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											2020-10-22 23:12:59 +02:00
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    test = (
 								        ("https://twitter.com/supernaturepics/media", {
 								            "range": "1-40",
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								        }),
 								        ("https://mobile.twitter.com/supernaturepics/media#t"),
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								        ("https://www.twitter.com/id:2976459548/media"),
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								    )
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
 								    def tweets(self):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).timeline_media(self.user)
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								class TwitterLikesExtractor(TwitterExtractor):
 								    """Extractor for liked tweets"""
 								    subcategory = "likes"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											2020-10-22 23:12:59 +02:00
+								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								    test = ("https://twitter.com/supernaturepics/likes",)
-												[twitter] add 'user_likes' metadata field for liked tweets

i.e. the 'screen_name' of the user whose liked tweets get extracted.

Ideally this would replace 'user' or at least be in the same format,
but that would break backwards compatibility or be impossible/too
complicated thanks to API result differences.

(#1421)

											
										
										
											2021-04-02 02:52:01 +02:00
+								    def metadata(self):
 								        return {"user_likes": self.user}
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								    def tweets(self):
 								        return TwitterAPI(self).timeline_favorites(self.user)
 								class TwitterBookmarkExtractor(TwitterExtractor):
 								    """Extractor for bookmarked tweets"""
 								    subcategory = "bookmark"
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											2020-07-13 23:48:42 +02:00
+								    pattern = BASE_PATTERN + r"/i/bookmarks()"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								    test = ("https://twitter.com/i/bookmarks",)
 								    def tweets(self):
 								        return TwitterAPI(self).timeline_bookmark()
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								class TwitterListExtractor(TwitterExtractor):
 								    """Extractor for Twitter lists"""
 								    subcategory = "list"
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
+								    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								    test = ("https://twitter.com/i/lists/784214683683127296", {
 								        "range": "1-40",
 								        "count": 40,
 								        "archive": False,
 								    })
 								    def tweets(self):
 								        return TwitterAPI(self).timeline_list(self.user)
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
+								class TwitterListMembersExtractor(TwitterExtractor):
 								    """Extractor for members of a Twitter list"""
 								    subcategory = "list-members"
 								    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
 								    test = ("https://twitter.com/i/lists/784214683683127296/members",)
 								    def items(self):
 								        self.login()
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
+								        return self._users_result(TwitterAPI(self).list_members(self.user))
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											2021-02-22 18:18:33 +01:00
+								class TwitterFollowingExtractor(TwitterExtractor):
 								    """Extractor for followed users"""
 								    subcategory = "following"
 								    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
 								    test = (
 								        ("https://twitter.com/supernaturepics/following"),
 								        ("https://www.twitter.com/id:2976459548/following"),
 								    )
 								    def items(self):
 								        self.login()
-												[twitter] implement 'users' option (#1337)

											
										
										
											2021-03-15 22:55:24 +01:00
+								        return self._users_result(TwitterAPI(self).user_following(self.user))
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											2021-02-22 18:18:33 +01:00
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the time period at which you were stopped. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											2019-10-16 18:23:10 +02:00
+								class TwitterSearchExtractor(TwitterExtractor):
 								    """Extractor for all images from a search timeline"""
 								    subcategory = "search"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								    directory_fmt = ("{category}", "Search", "{search}")
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always made via the Twitter API.
											
										
										
											2020-07-13 23:48:42 +02:00
+								    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								    test = ("https://twitter.com/search?q=nature", {
 								        "range": "1-40",
 								        "count": 40,
-												update extractor test results

											
										
										
											2020-10-03 19:24:19 +02:00
+								        "archive": False,
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
+								    })
 								    def metadata(self):
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								        return {"search": text.unquote(self.user)}
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												Add search downloading to twitter.py (#448)

Adds the functionality to download search results on twitter.com/search. Since twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the twitter search to get the tweets from the time period at which you were stopped. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en

The _tweets_from_api function had to be changed because it would not get the next page of results using the last "data-tweet-id". It would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. This change does not interfere with how the other extractors work as far as I know. The 2 regex patterns in the extractors had to be changed to not match the search URL.
											
										
										
											2019-10-16 18:23:10 +02:00
+								    def tweets(self):
-												[twitter] improve search results (fixes #847)

Adding 'tweet_search_mode=live' to the query parameters
is the most important part here.

											
										
										
											2020-06-21 15:43:27 +02:00
+								        return TwitterAPI(self).search(text.unquote(self.user))
-												[twitter] small improvements to search extractor

- put search results in separate directories
- set 'max_position' to '-1' for first request
  -> prevent duplicate results
- add a test
- flake8

											
										
										
											2019-10-17 18:34:07 +02:00
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								class TwitterTweetExtractor(TwitterExtractor):
-												[twitter] changes and improvements

- rename User- to TimelineExtractor
- rename 'userid' to 'user_id' to conform to the other ..._id values
- adjust archive_fmt to deal with retweets
- emulate browser behavior for API calls

											
										
										
											2018-08-18 18:58:10 +02:00
+								    """Extractor for images from individual tweets"""
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    subcategory = "tweet"
-												remove '&' from URL patterns

'/?&#' -> '/?#' and '?&#' -> '?#'

According to https://www.ietf.org/rfc/rfc3986.txt, URLs are
"organized hierarchically" by using "the slash ("/"), question
mark ("?"), and number sign ("#") characters to delimit components"

											
										
										
											2020-10-22 23:12:59 +02:00
+								    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    test = (
-												[twitter] replace unit test URLs

https://twitter.com/PicturesEarth was deleted

											
										
										
											2019-05-09 10:17:55 +02:00
+								        ("https://twitter.com/supernaturepics/status/604341487988576256", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
-												[twitter] replace unit test URLs

https://twitter.com/PicturesEarth was deleted

											
										
										
											2019-05-09 10:17:55 +02:00
+								            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        }),
-												[twitter] extract 'date' metadata (#224)

											
										
										
											2019-04-21 15:41:22 +02:00
+								        # 4 images
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        ("https://twitter.com/perrypumas/status/894001459754180609", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
-												[twitter] extract 'date' metadata (#224)

											
										
										
											2019-04-21 15:41:22 +02:00
+								        }),
 								        # video
 								        ("https://twitter.com/perrypumas/status/1065692031626829824", {
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
-												[twitter] ignore "Promoted Tweets"

											
										
										
											2017-08-06 13:43:08 +02:00
+								        }),
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											2019-07-17 15:35:42 +02:00
+								        # content with emoji, newlines, hashtags (#338)
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											2020-05-28 01:55:32 +02:00
+								        ("https://twitter.com/playpokemon/status/1263832915173048321", {
-												[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'

											
										
										
											2020-06-06 23:51:54 +02:00
+								            "keyword": {"content": (
-												update extractor test results

- don't run Instagram tests on Travis anymore
- replace Twitter test because timeline was made private
- update Hiperdex domain to '.com' (again ...)

											
										
										
											2020-05-28 01:55:32 +02:00
+								                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
 								                "Gifts! \n\nYou’ll be able to receive four Galarian form "
 								                "Pokémon with Hidden Abilities, plus some very useful items. "
 								                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
-												update test results

- twitter:

    Don't test the whole kwdict, only the actual content, since the
    keyword hash changes whenever that user changes his display name.

- khinsider:

    Download host changed

											
										
										
											2020-02-22 02:59:56 +01:00
+								            )},
-												[twitter] improve 'content' formatting; add option (#338)

- include emoticons
- leave newlines intact
- remove pic.twitter.com/ links at the end

											
										
										
											2019-07-17 15:35:42 +02:00
+								        }),
-												[twitter] update tests

											
										
										
											2020-06-19 18:12:57 +02:00
+								        # Reply to deleted tweet (#403, #838)
 								        ("https://twitter.com/i/web/status/1170041925560258560", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
-												[twitter] small improvements

- handle reply tweets (#403)
- unset cookies in Tweet extractor to "force" the legacy interface

											
										
										
											2019-09-01 17:37:48 +02:00
+								        }),
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								        # 'replies' option (#705)
-												[twitter] update tests

											
										
										
											2020-06-19 18:12:57 +02:00
+								        ("https://twitter.com/i/web/status/1170041925560258560", {
-												[twitter] add 'replies' option (closes #705)

											
										
										
											2020-04-29 23:11:24 +02:00
+								            "options": (("replies", False),),
 								            "count": 0,
 								        }),
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								        # quoted tweet (#526, #854)
 								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								            "count": 8,
 								        }),
 								        # "quoted" option (#854)
 								        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
 								            "options": (("quoted", False),),
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
-												[twitter] add option to filter media from quoted tweets (#854)

											
										
										
											2020-06-24 21:13:16 +02:00
+								            "count": 4,
-												[twitter] handle quoted tweets (#526)

… and categorize them as retweets

											
										
										
											2020-01-04 21:26:55 +01:00
+								        }),
-												[twitter] add option to extract TwitPic embeds (#579)

											
										
										
											2020-01-18 21:26:46 +01:00
+								        # TwitPic embeds (#579)
 								        ("https://twitter.com/i/web/status/112900228289540096", {
 								            "options": (("twitpic", True),),
 								            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
 								            "count": 3,
 								        }),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								        # Nitter tweet (#890)
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always done via the Twitter API.
											
										
										
											2020-07-13 23:48:42 +02:00
+								        ("https://nitter.net/ed1conf/status/1163841619336007680", {
-												[twitter] update image URL format (#1145)

use
'/<name>?format=<fmt>&name=<size>'
instead of the potentially deprecated
'/<name>.<fmt>:<size>'

but keep all of them as fallback URLs

											
										
										
											2020-12-01 11:53:51 +01:00
+								            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
-												[twitter] add support for nitter.net URLs in pattern (#890)

Please note that URLs are only "translated"; all requests are still
always done via the Twitter API.
											
										
										
											2020-07-13 23:48:42 +02:00
+								            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
 								        }),
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								        # Twitter card (#1005)
 								        ("https://twitter.com/billboard/status/1306599586602135555", {
 								            "options": (("cards", True),),
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											2020-11-05 22:53:29 +01:00
+								            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
-												[twitter] support media from Cards (#1005, #937)

Can be enabled with 'extractor.twitter.cards', but for now disabled by
default because cards can redirect to rather large videos from YouTube
or Twitch.

											
										
										
											2020-10-22 21:33:53 +02:00
+								        }),
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											2020-09-28 23:03:35 +02:00
+								        # original retweets (#1026)
 								        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
 								            "options": (("retweets", "original"),),
 								            "count": 2,
 								            "keyword": {
 								                "tweet_id": 1296296016002547713,
 								                "date"    : "dt:2020-08-20 04:00:28",
 								            },
 								        }),
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											2021-02-26 13:50:46 +01:00
+								        # all Tweets from a conversation (#1319)
 								        ("https://twitter.com/BlankArts_/status/1323314488611872769", {
 								            "options": (("conversations", True),),
 								            "count": ">= 50",
 								        }),
-												[twitter] add missing retweet media entities (fixes #1555)

from the original tweets

											
										
										
											2021-05-14 22:46:06 +02:00
+								        # retweet with missing media entities (#1555)
 								        ("https://twitter.com/morino_ya/status/1392763691599237121", {
 								            "count": 4,
 								        }),
-												simplify extractor constants

- single strings for URL patterns
- tuples instead of lists for 'directory_fmt' and 'test'
- single-tuple tests where applicable

											
										
										
											2019-02-08 13:45:40 +01:00
+								    )
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
 								    def __init__(self, match):
-												[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
They are different from normal timelines in that they do not contain
any (re)tweets from other users and feature all media the user ever
posted, including responses to other tweets.

											
										
										
											2018-08-19 20:36:33 +02:00
+								        TwitterExtractor.__init__(self, match)
 								        self.tweet_id = match.group(2)
-												[twitter] add extractor

											
										
										
											2016-10-06 19:12:07 +02:00
-												[twitter] add support for user-timelines (closes #96)

also adds a 'retweets' option to filter retweeted content

											
										
										
											2018-08-17 20:04:11 +02:00
+								    def tweets(self):
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											2021-02-26 13:50:46 +01:00
+								        if self.config("conversations", False):
 								            return TwitterAPI(self).conversation(self.tweet_id)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return TwitterAPI(self).tweet(self.tweet_id)
-												[twitter] handle API rate limits (#526)

											
										
										
											2020-01-04 23:46:29 +01:00
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											2021-04-02 02:45:23 +02:00
+								class TwitterImageExtractor(Extractor):
 								    category = "twitter"
 								    subcategory = "image"
 								    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
 								    test = (
 								        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg%name=orig"),
 								        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
 								    )
 								    def __init__(self, match):
 								        Extractor.__init__(self, match)
 								        self.id, self.fmt = match.groups()
 								    def items(self):
 								        base = "https://pbs.twimg.com/media/" + self.id
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								        new = base + "?format=" + self.fmt + "&name="
 								        old = base + "." + self.fmt + ":"
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											2021-04-02 02:45:23 +02:00
 								        data = {
 								            "filename": self.id,
 								            "extension": self.fmt,
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								            "_fallback": TwitterExtractor._image_fallback(new, old),
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											2021-04-02 02:45:23 +02:00
+								        }
 								        yield Message.Directory, data
-												[twitter] rename variables

											
										
										
											2021-04-02 02:49:53 +02:00
+								        yield Message.Url, new + "orig", data
-												[twitter] add extractor for direct image links (closes #1417)

											
										
										
											2021-04-02 02:45:23 +02:00
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								class TwitterAPI():
 								    def __init__(self, extractor):
 								        self.extractor = extractor
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
 								        self.root = "https://twitter.com/i/api"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        self.headers = {
 								            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
 								                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
 								                             "4FA33AGWWjCpTnA",
 								            "x-guest-token": None,
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								            "x-twitter-auth-type": None,
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "x-twitter-client-language": "en",
 								            "x-twitter-active-user": "yes",
 								            "x-csrf-token": None,
 								            "Referer": "https://twitter.com/",
 								        }
 								        self.params = {
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            "include_profile_interstitial_type": "1",
 								            "include_blocking": "1",
 								            "include_blocked_by": "1",
 								            "include_followed_by": "1",
 								            "include_want_retweets": "1",
 								            "include_mute_edge": "1",
 								            "include_can_dm": "1",
 								            "include_can_media_tag": "1",
 								            "skip_status": "1",
 								            "cards_platform": "Web-12",
 								            "include_cards": "1",
 								            "include_ext_alt_text": "true",
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								            "include_quote_count": "true",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            "include_reply_count": "1",
 								            "tweet_mode": "extended",
 								            "include_entities": "true",
 								            "include_user_entities": "true",
 								            "include_ext_media_color": "true",
 								            "include_ext_media_availability": "true",
 								            "send_error_codes": "true",
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            "simple_quoted_tweet": "true",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            "count": "100",
 								            "cursor": None,
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								            "ext": "mediaStats,highlightedLabel",
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								        }
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								        cookies = self.extractor.session.cookies
-												[twitter] update 'x-csrf-token' header (fixes #1170)

Twitter started using a bigger (80 instead of 16 bytes) CSRF token for
logged in users, and expects those to be used as 'x-csrf-token' header
when sent via 'ct0' cookie.

Generating an 80 byte token ourselves doesn't work, and Twitter will
still insist on using its own.

											
										
										
											2020-12-11 13:40:57 +01:00
+								        cookiedomain = ".twitter.com"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								        # CSRF
-												[twitter] update 'x-csrf-token' header (fixes #1170)

Twitter started using a bigger (80 instead of 16 bytes) CSRF token for
logged in users, and expects those to be used as 'x-csrf-token' header
when sent via 'ct0' cookie.

Generating an 80 byte token ourselves doesn't work, and Twitter will
still insist on using its own.

											
										
										
											2020-12-11 13:40:57 +01:00
+								        csrf_token = cookies.get("ct0", domain=cookiedomain)
 								        if not csrf_token:
-												rename 'generate_csrf_token()' to just 'generate_token()'

and add a 'size' argument

											
										
										
											2021-01-11 22:12:40 +01:00
+								            csrf_token = util.generate_token()
-												[twitter] update 'x-csrf-token' header (fixes #1170)

Twitter started using a bigger (80 instead of 16 bytes) CSRF token for
logged in users, and expects those to be used as 'x-csrf-token' header
when sent via 'ct0' cookie.

Generating an 80 byte token ourselves doesn't work, and Twitter will
still insist on using its own.

											
										
										
											2020-12-11 13:40:57 +01:00
+								            cookies.set("ct0", csrf_token, domain=cookiedomain)
 								        self.headers["x-csrf-token"] = csrf_token
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] update 'x-csrf-token' header (fixes #1170)

Twitter started using a bigger (80 instead of 16 bytes) CSRF token for
logged in users, and expects those to be used as 'x-csrf-token' header
when sent via 'ct0' cookie.

Generating an 80 byte token ourselves doesn't work, and Twitter will
still insist on using its own.

											
										
										
											2020-12-11 13:40:57 +01:00
+								        if cookies.get("auth_token", domain=cookiedomain):
-												[twitter] use 'https://twitter.com/i/api/' for logged in users

Doesn't seem to make a difference from what I can tell,
i.e. downloaded files are the same, but the website does it.

											
										
										
											2020-11-16 11:26:37 +01:00
+								            # logged in
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            self.headers["x-twitter-auth-type"] = "OAuth2Session"
 								        else:
-												[twitter] use 'https://twitter.com/i/api/' for logged in users

Doesn't seem to make a difference from what I can tell,
i.e. downloaded files are the same, but the website does it.

											
										
										
											2020-11-16 11:26:37 +01:00
+								            # guest
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
+								            guest_token = self._guest_token()
-												[twitter] update 'x-csrf-token' header (fixes #1170)

Twitter started using a bigger (80 instead of 16 bytes) CSRF token for
logged in users, and expects those to be used as 'x-csrf-token' header
when sent via 'ct0' cookie.

Generating an 80 byte token ourselves doesn't work, and Twitter will
still insist on using its own.

											
										
										
											2020-12-11 13:40:57 +01:00
+								            cookies.set("gt", guest_token, domain=cookiedomain)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            self.headers["x-guest-token"] = guest_token
 								    def tweet(self, tweet_id):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/conversation/{}.json".format(tweet_id)
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								        tweets = []
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        for tweet in self._pagination(endpoint):
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											2020-09-28 23:03:35 +02:00
+								            if tweet["id_str"] == tweet_id or \
 								                    tweet.get("_retweet_id_str") == tweet_id:
-												[twitter] improve handling of quoted tweets (#854)

Split each "quote" into two parts:
- the original tweet
- the tweet that quoted the original

											
										
										
											2020-06-24 21:08:04 +02:00
+								                tweets.append(tweet)
 								                if "quoted_status_id_str" in tweet:
 								                    tweet_id = tweet["quoted_status_id_str"]
 								                else:
 								                    break
 								        return tweets
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											2021-02-26 13:50:46 +01:00
+								    def conversation(self, conversation_id):
 								        endpoint = "/2/timeline/conversation/{}.json".format(conversation_id)
 								        return self._pagination(endpoint)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								    def timeline_profile(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								        user_id = self._user_id_by_screen_name(screen_name)
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/profile/{}.json".format(user_id)
 								        params = self.params.copy()
 								        params["include_tweet_replies"] = "false"
 								        return self._pagination(endpoint, params)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
 								    def timeline_media(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								        user_id = self._user_id_by_screen_name(screen_name)
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/media/{}.json".format(user_id)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        return self._pagination(endpoint)
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								    def timeline_favorites(self, screen_name):
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								        user_id = self._user_id_by_screen_name(screen_name)
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/favorites/{}.json".format(user_id)
 								        params = self.params.copy()
 								        params["sorted_by_time"] = "true"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								        return self._pagination(endpoint)
 								    def timeline_bookmark(self):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/bookmark.json"
-												[twitter] add extractor for liked tweets (closes #837)

You need to be logged in to get access to anyone's liked tweets,
it seems.

											
										
										
											2020-06-16 14:27:22 +02:00
+								        return self._pagination(endpoint)
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								    def timeline_list(self, list_id):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/timeline/list.json"
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								        params = self.params.copy()
 								        params["list_id"] = list_id
 								        params["ranking_mode"] = "reverse_chronological"
 								        return self._pagination(endpoint, params)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								    def search(self, query):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        endpoint = "/2/search/adaptive.json"
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        params = self.params.copy()
-												[twitter] improve search results (fixes #847)

Adding 'tweet_search_mode=live' to the query parameters
is the most important part here.

											
										
										
											2020-06-21 15:43:27 +02:00
+								        params["q"] = query
 								        params["tweet_search_mode"] = "live"
 								        params["query_source"] = "typed_query"
 								        params["pc"] = "1"
 								        params["spelling_corrections"] = "1"
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								        return self._pagination(endpoint, params)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								    def list_by_rest_id(self, list_id):
-												[twitter] update GraphQL endpoints

											
										
										
											2021-02-20 02:09:17 +01:00
+								        endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId"
-												[twitter] add 'list' extractor (#1096)

											
										
										
											2020-11-05 22:55:38 +01:00
+								        params = {"variables": '{"listId":"' + list_id + '"'
 								                               ',"withUserResult":false}'}
 								        try:
 								            return self._call(endpoint, params)["data"]["list"]
 								        except KeyError:
 								            raise exception.NotFoundError("list")
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											2021-02-22 18:18:33 +01:00
+								    def list_members(self, list_id):
 								        endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers"
 								        variables = {
 								            "listId": list_id,
 								            "count" : 100,
 								            "withTweetResult": False,
 								            "withUserResult" : False,
 								        }
 								        return self._pagination_graphql(
 								            endpoint, variables, "list", "members_timeline")
 								    def user_following(self, screen_name):
 								        endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following"
 								        variables = {
 								            "userId": self._user_id_by_screen_name(screen_name),
 								            "count" : 100,
 								            "withTweetResult": False,
 								            "withUserResult" : False,
 								            "withTweetQuoteCount"   : False,
 								            "withHighlightedLabel"  : False,
 								            "includePromotedContent": False,
 								        }
 								        return self._pagination_graphql(
 								            endpoint, variables, "user", "following_timeline")
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								    def user_by_screen_name(self, screen_name):
-												[twitter] update GraphQL endpoints

											
										
										
											2021-02-20 02:09:17 +01:00
+								        endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName"
-												[twitter] update GraphQL endpoint & fix width/height entries

											
										
										
											2020-11-05 22:53:29 +01:00
+								        params = {"variables": '{"screen_name":"' + screen_name + '"'
 								                               ',"withHighlightedLabel":true}'}
-												[twitter] raise proper exception if user doesn't exist (#891)

											
										
										
											2020-07-14 16:47:25 +02:00
+								        try:
 								            return self._call(endpoint, params)["data"]["user"]
 								        except KeyError:
 								            raise exception.NotFoundError("user")
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] support specifying users by ID (#980)

by using 'id:…' as their screen name, i.e.
https://www.twitter.com/id:2976459548/media
instead of
https://twitter.com/supernaturepics/media

The user ID can, for example, be obtained from the output of
$ gallery-dl -j --range 1 https://twitter.com/<screen-name>

											
										
										
											2020-09-08 22:56:52 +02:00
+								    def _user_id_by_screen_name(self, screen_name):
 								        if screen_name.startswith("id:"):
 								            return screen_name[3:]
 								        return self.user_by_screen_name(screen_name)["rest_id"]
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
+								    @cache(maxage=3600)
 								    def _guest_token(self):
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								        root = "https://api.twitter.com"
 								        endpoint = "/1.1/guest/activate.json"
 								        return self._call(endpoint, None, root, "POST")["guest_token"]
-												[twitter] move '_guest_token()' into TwitterAPI class

											
										
										
											2020-06-18 00:28:38 +02:00
-												[twitter] update API calls

- use 'https://twitter.com/i/api' for all requests
  except '/guest/activate.json'
- update (default) URL parameters
- update GraphQL endpoints

											
										
										
											2020-12-28 22:05:48 +01:00
+								    def _call(self, endpoint, params, root=None, method="GET"):
 								        if root is None:
 								            root = self.root
-												[twitter] improve error message formatting

											
										
										
											2020-07-06 23:13:05 +02:00
-												[twitter] improve and fix retry after hitting rate limit

- replace recursive call with infinite loop
- fix function arguments for recursive call

											
										
										
											2021-01-19 23:15:57 +01:00
+								        while True:
 								            response = self.extractor.request(
 								                root + endpoint, method=method, params=params,
 								                headers=self.headers, fatal=None)
 								            # update 'x-csrf-token' header (#1170)
 								            csrf_token = response.cookies.get("ct0")
 								            if csrf_token:
 								                self.headers["x-csrf-token"] = csrf_token
 								            if response.status_code < 400:
 								                return response.json()
 								            if response.status_code == 429:
 								                until = response.headers.get("x-rate-limit-reset")
 								                seconds = None if until else 60
 								                self.extractor.wait(until=until, seconds=seconds)
 								                continue
 								            try:
 								                msg = ", ".join(
 								                    '"' + error["message"] + '"'
 								                    for error in response.json()["errors"]
 								                )
 								            except Exception:
 								                msg = response.text
 								            raise exception.StopExtraction(
 								                "%s %s (%s)", response.status_code, response.reason, msg)
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								    def _pagination(self, endpoint, params=None):
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								        if params is None:
 								            params = self.params.copy()
-												[twitter] extend 'retweets' option (closes #1026)

Setting 'retweets' to '"original"' will use metadata from the
original retweeted Tweets, and not from the Retweet entry.

											
										
										
											2020-09-28 23:03:35 +02:00
+								        original_retweets = (self.extractor.retweets == "original")
-												[twitter] fetch media from pinned tweets (#1203)

											
										
										
											2020-12-29 16:27:43 +01:00
+								        pinned_tweet = True
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
 								        while True:
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            cursor = tweet = None
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            data = self._call(endpoint, params)
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
 								            instr = data["timeline"]["instructions"]
 								            if not instr:
 								                return
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								            tweet_ids = []
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								            tweets = data["globalObjects"]["tweets"]
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            users = data["globalObjects"]["users"]
-												[twitter] fetch media from pinned tweets (#1203)

											
										
										
											2020-12-29 16:27:43 +01:00
+								            if pinned_tweet:
 								                if "pinEntry" in instr[-1]:
 								                    tweet_ids.append(instr[-1]["pinEntry"]["entry"]["content"]
 								                                     ["item"]["content"]["tweet"]["id"])
 								                pinned_tweet = False
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								            # collect tweet IDs and cursor value
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            for entry in instr[0]["addEntries"]["entries"]:
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								                entry_startswith = entry["entryId"].startswith
 								                if entry_startswith(("tweet-", "sq-I-t-")):
 								                    tweet_ids.append(
 								                        entry["content"]["item"]["content"]["tweet"]["id"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								                elif entry_startswith("homeConversation-"):
 								                    tweet_ids.extend(
 								                        entry["content"]["timelineModule"]["metadata"]
 								                        ["conversationMetadata"]["allTweetIds"][::-1])
 								                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								                    cursor = entry["content"]["operation"]["cursor"]
 								                    if not cursor.get("stopOnEmptyResponse"):
 								                        # keep going even if there are no tweets
 								                        tweet = True
 								                    cursor = cursor["value"]
-												[twitter] add option to download all media from a conversation

(fixes #1319)

											
										
										
											2021-02-26 13:50:46 +01:00
+								                elif entry_startswith("conversationThread-"):
 								                    tweet_ids.extend(
 								                        item["entryId"][6:]
 								                        for item in entry["content"]["timelineModule"]["items"]
 								                        if item["entryId"].startswith("tweet-")
 								                    )
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								            # process tweets
 								            for tweet_id in tweet_ids:
 								                try:
 								                    tweet = tweets[tweet_id]
 								                except KeyError:
 								                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
 								                    continue
 								                if "retweeted_status_id_str" in tweet:
 								                    retweet = tweets.get(tweet["retweeted_status_id_str"])
 								                    if original_retweets:
 								                        if not retweet:
 								                            continue
 								                        retweet["_retweet_id_str"] = tweet["id_str"]
 								                        tweet = retweet
 								                    elif retweet:
 								                        tweet["author"] = users[retweet["user_id_str"]]
-												[twitter] add missing retweet media entities (fixes #1555)

from the original tweets

											
										
										
											2021-05-14 22:46:06 +02:00
+								                        if "extended_entities" in retweet and \
 								                                "extended_entities" not in tweet:
 								                            tweet["extended_entities"] = \
 								                                retweet["extended_entities"]
-												[twitter] fetch tweets from  'homeConversation' entries

When logged in, some entries returned by Twitter's API are so called
'homeConversation's (they would be regular tweet entries otherwise.)

Those weren't picked up before and resulted in missing files compared
to accessing a timeline as guest.

('/media' timelines and search results were not affected)

											
										
										
											2020-12-28 23:34:46 +01:00
+								                tweet["user"] = users[tweet["user_id_str"]]
 								                yield tweet
 								                if "quoted_status_id_str" in tweet:
 								                    quoted = tweets.get(tweet["quoted_status_id_str"])
 								                    if quoted:
 								                        quoted["author"] = users[quoted["user_id_str"]]
 								                        quoted["user"] = tweet["user"]
 								                        quoted["quoted"] = True
 								                        yield quoted
 								            # update cursor value
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            if "replaceEntry" in instr[-1] :
 								                cursor = (instr[-1]["replaceEntry"]["entry"]
 								                          ["content"]["operation"]["cursor"]["value"])
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
-												[twitter] improve pagination

											
										
										
											2020-06-07 03:10:09 +02:00
+								            if not cursor or not tweet:
-												[twitter] add 'bookmark' extractor (closes #625)

											
										
										
											2020-03-05 22:55:26 +01:00
+								                return
-												[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.

											
										
										
											2020-06-03 20:51:29 +02:00
+								            params["cursor"] = cursor
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											2021-02-22 18:18:33 +01:00
+								    def _pagination_graphql(self, endpoint, variables, key, timeline):
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
+								        while True:
 								            cursor = entry = stop = None
 								            params = {"variables": json.dumps(variables)}
 								            data = self._call(endpoint, params)
 								            try:
-												[twitter] add extractor for followed users (#1337)

https://twitter.com/USER/following or
https://twitter.com/id:USERID/following

											
										
										
											2021-02-22 18:18:33 +01:00
+								                instructions = \
 								                    data["data"][key][timeline]["timeline"]["instructions"]
-												[twitter] add 'list-members' extractor (closes #1096)

											
										
										
											2020-11-13 06:47:45 +01:00
+								            except KeyError:
 								                raise exception.AuthorizationError()
 								            for instr in instructions:
 								                if instr["type"] == "TimelineAddEntries":
 								                    for entry in instr["entries"]:
 								                        if entry["entryId"].startswith("user-"):
 								                            yield entry["content"]["itemContent"]["user"]
 								                        elif entry["entryId"].startswith("cursor-bottom-"):
 								                            cursor = entry["content"]["value"]
 								                elif instr["type"] == "TimelineTerminateTimeline":
 								                    if instr["direction"] == "Bottom":
 								                        stop = True
 								            if stop or not cursor or not entry:
 								                return
 								            variables["cursor"] = cursor