# -*- coding: utf-8 -*-

# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""
|
2016-10-06 19:12:07 +02:00
|
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2020-10-15 00:43:26 +02:00
|
|
|
|
from .. import text, util, exception
|
2020-06-10 20:58:42 +02:00
|
|
|
|
from ..cache import cache
|
2020-11-13 06:47:45 +01:00
|
|
|
|
import json
|
2017-02-01 00:53:19 +01:00
|
|
|
|
|
2020-07-13 23:48:42 +02:00
|
|
|
|
# Common URL prefix shared by all patterns below: optional scheme,
# optional "www."/"mobile." subdomain, and the supported hosts
# (twitter.com, fxtwitter.com, vxtwitter.com, nitter.net)
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.|mobile\.)?"
    r"(?:(?:[fv]x)?twitter\.com|nitter\.net)"
)
|
|
|
|
|
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    # deduplicate by tweet id, retweet id, and file index
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    # presence of 'auth_token' is treated as "already logged in" (see login())
    cookienames = ("auth_token",)
    root = "https://twitter.com"
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
    def __init__(self, match):
        """Initialize extractor state from a pattern match and user config.

        match.group(1) is the username (or search/list/event identifier,
        depending on the subclass pattern).
        """
        Extractor.__init__(self, match)
        self.user = match.group(1)
        # user-configurable filters and feature toggles
        self.textonly = self.config("text-tweets", False)
        self.retweets = self.config("retweets", False)
        self.replies = self.config("replies", True)
        self.twitpic = self.config("twitpic", False)
        self.pinned = self.config("pinned", False)
        self.quoted = self.config("quoted", False)
        self.videos = self.config("videos", True)
        self.cards = self.config("cards", False)
        # transformed user metadata / raw user object of the account
        # being extracted; filled in lazily by subclasses
        self._user = self._user_obj = None
        # cache of transformed user dicts, keyed by user id (_transform_user)
        self._user_cache = {}
        self._init_sizes()
|
2018-09-30 18:41:39 +02:00
|
|
|
|
|
2021-11-16 22:57:46 +01:00
|
|
|
|
def _init_sizes(self):
|
2021-10-05 18:58:10 +02:00
|
|
|
|
size = self.config("size")
|
|
|
|
|
if size is None:
|
|
|
|
|
self._size_image = "orig"
|
2021-12-15 23:17:07 +01:00
|
|
|
|
self._size_fallback = ("4096x4096", "large", "medium", "small")
|
2021-10-05 18:58:10 +02:00
|
|
|
|
else:
|
|
|
|
|
if isinstance(size, str):
|
|
|
|
|
size = size.split(",")
|
|
|
|
|
self._size_image = size[0]
|
|
|
|
|
self._size_fallback = size[1:]
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
    def items(self):
        """Generate Directory and Url messages for all matching tweets.

        Logs in (if credentials/cookies are available), fetches tweets via
        TwitterAPI, applies the configured filters (retweets, quotes,
        replies, uniqueness), collects media files per tweet, and yields
        them with transformed metadata.
        """
        self.login()
        self.api = TwitterAPI(self)
        metadata = self.metadata()

        if self.config("expand"):
            # replace the tweet source with fully expanded conversations;
            # materialized once so tweets() can be called again below
            tweets = self._expand_tweets(self.tweets())
            self.tweets = lambda : tweets

        if self.config("unique", True):
            seen_tweets = set()
        else:
            seen_tweets = None

        for tweet in self.tweets():

            # GraphQL results wrap the classic tweet data in 'legacy'
            if "legacy" in tweet:
                data = tweet["legacy"]
            else:
                data = tweet

            if seen_tweets is not None:
                if data["id_str"] in seen_tweets:
                    continue
                seen_tweets.add(data["id_str"])

            if not self.retweets and "retweeted_status_id_str" in data:
                self.log.debug("Skipping %s (retweet)", data["id_str"])
                continue
            if not self.quoted and "quoted_by_id_str" in data:
                self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
                continue
            # replies == "self" keeps only replies the user made to
            # their own tweets
            if "in_reply_to_user_id_str" in data and (
                not self.replies or (
                    self.replies == "self" and
                    data["user_id_str"] !=
                    (self._user_obj["rest_id"] if self._user else
                     data["in_reply_to_user_id_str"])
                )
            ):
                self.log.debug("Skipping %s (reply)", data["id_str"])
                continue

            files = []
            if "extended_entities" in data:
                self._extract_media(
                    data, data["extended_entities"]["media"], files)
            if "card" in tweet and self.cards:
                self._extract_card(tweet, files)
            if self.twitpic:
                self._extract_twitpic(data, files)
            # text-only tweets are skipped unless explicitly enabled
            if not files and not self.textonly:
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)
            tdata["count"] = len(files)
            yield Message.Directory, tdata
            for tdata["num"], file in enumerate(files, 1):
                file.update(tdata)
                url = file.pop("url")
                if "extension" not in file:
                    text.nameext_from_url(url, file)
                yield Message.Url, url, file
|
|
|
|
|
|
2022-01-13 15:58:18 +01:00
|
|
|
|
    def _extract_media(self, tweet, entities, files):
        """Append file dicts for all media entities of 'tweet' to 'files'.

        Handles three cases per entity: videos (either delegated to ytdl
        or the highest-bitrate variant), regular images (with size
        fallbacks), and a last-resort plain 'media_url'.
        """
        for media in entities:
            descr = media.get("ext_alt_text")
            width = media["original_info"].get("width", 0)
            height = media["original_info"].get("height", 0)

            if "video_info" in media:
                if self.videos == "ytdl":
                    # let youtube-dl/yt-dlp handle the whole tweet URL
                    files.append({
                        "url": "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"]),
                        "width"      : width,
                        "height"     : height,
                        "extension"  : None,
                        "description": descr,
                    })
                elif self.videos:
                    # pick the variant with the highest bitrate
                    video_info = media["video_info"]
                    variant = max(
                        video_info["variants"],
                        key=lambda v: v.get("bitrate", 0),
                    )
                    files.append({
                        "url"        : variant["url"],
                        "width"      : width,
                        "height"     : height,
                        "bitrate"    : variant.get("bitrate", 0),
                        "duration"   : video_info.get(
                            "duration_millis", 0) / 1000,
                        "description": descr,
                    })
            elif "media_url_https" in media:
                # rewrite ".../name.jpg" into the "?format=jpg&name=<size>"
                # form so alternate sizes can be requested
                url = media["media_url_https"]
                base, _, fmt = url.rpartition(".")
                base += "?format=" + fmt + "&name="
                files.append(text.nameext_from_url(url, {
                    "url"        : base + self._size_image,
                    "width"      : width,
                    "height"     : height,
                    "_fallback"  : self._image_fallback(base),
                    "description": descr,
                }))
            else:
                files.append({"url": media["media_url"]})
|
|
|
|
|
|
2021-10-05 18:58:10 +02:00
|
|
|
|
def _image_fallback(self, base):
|
|
|
|
|
for fmt in self._size_fallback:
|
|
|
|
|
yield base + fmt
|
2020-12-01 11:53:51 +01:00
|
|
|
|
|
2020-10-22 21:33:53 +02:00
|
|
|
|
    def _extract_card(self, tweet, files):
        """Extract media files from a tweet's 'card' attachment.

        Supports 'summary'/'summary_large_image' cards (single preview
        image, largest available size) and 'unified_card' image
        carousels.  Any other card type is optionally handed off to
        ytdl when cards == "ytdl".
        """
        card = tweet["card"]
        if "legacy" in card:
            card = card["legacy"]
        name = card["name"]

        if name in ("summary", "summary_large_image"):
            bvals = card["binding_values"]
            # binding_values can be a list of {key, value} pairs (GraphQL)
            # or already a mapping; normalize to a dict
            if isinstance(bvals, list):
                bvals = {
                    bval["key"]: bval["value"]
                    for bval in card["binding_values"]
                }
            # probe known image keys from largest to smallest;
            # the first hit wins
            for prefix in ("photo_image_full_size_",
                           "summary_photo_image_",
                           "thumbnail_image_"):
                for size in ("original", "x_large", "large", "small"):
                    key = prefix + size
                    if key in bvals:
                        value = bvals[key].get("image_value")
                        if value and "url" in value:
                            # upgrade to preferred size when the URL uses
                            # the "&name=<size>" form
                            base, sep, size = value["url"].rpartition("&name=")
                            if sep:
                                base += sep
                                value["url"] = base + self._size_image
                                value["_fallback"] = self._image_fallback(base)
                            files.append(value)
                            return
        elif name == "unified_card":
            bvals = card["binding_values"]
            if isinstance(bvals, list):
                for bval in card["binding_values"]:
                    if bval["key"] == "unified_card":
                        bval = bval["value"]["string_value"]
                        break
            else:
                bval = bvals["unified_card"]["string_value"]
            # the card payload is JSON serialized into a string value
            data = json.loads(bval)
            if data.get("type") == "image_carousel_website":
                self._extract_media(
                    tweet, data["media_entities"].values(), files)
                return

        # unhandled card type: optionally delegate the tweet URL to ytdl
        if self.cards == "ytdl":
            tweet_id = tweet.get("rest_id") or tweet["id_str"]
            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
            files.append({"url": url})
|
|
|
|
|
|
|
|
|
|
    def _extract_twitpic(self, tweet, files):
        """Resolve twitpic.com links in a tweet's URL entities and
        append the referenced images to 'files'.

        Fetches each twitpic page and reads its 'twitter:image' meta tag;
        unreachable pages (status >= 400) are skipped silently.
        """
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            # '/photos/' pages are galleries, not single images — skip them
            if "//twitpic.com/" in url and "/photos/" not in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if url:
                    files.append({"url": url})
|
2020-06-04 01:22:34 +02:00
|
|
|
|
|
2020-06-06 23:51:54 +02:00
|
|
|
|
    def _transform_tweet(self, tweet):
        """Build a flat metadata dict from a raw (REST or GraphQL) tweet.

        Returns a dict with parsed ids, dates, counts, author/user info,
        hashtags, mentions, and cleaned tweet content.
        """
        # locate the author object, whose shape depends on the API endpoint
        if "author" in tweet:
            author = tweet["author"]
        elif "core" in tweet:
            author = tweet["core"]["user_results"]["result"]
        else:
            author = tweet["user"]
        author = self._transform_user(author)

        # GraphQL responses nest the classic fields under 'legacy'
        if "legacy" in tweet:
            tweet = tweet["legacy"]

        tget = tweet.get
        entities = tweet["entities"]
        tdata = {
            "tweet_id"      : text.parse_int(tweet["id_str"]),
            "retweet_id"    : text.parse_int(
                tget("retweeted_status_id_str")),
            "quote_id"      : text.parse_int(
                tget("quoted_by_id_str")),
            "reply_id"      : text.parse_int(
                tget("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            # 'user' is the account being extracted; for retweets/quotes
            # it can differ from the tweet's actual author
            "user"          : self._user or author,
            "author"        : author,
            "lang"          : tweet["lang"],
            "favorite_count": tget("favorite_count"),
            "quote_count"   : tget("quote_count"),
            "reply_count"   : tget("reply_count"),
            "retweet_count" : tget("retweet_count"),
        }

        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [t["text"] for t in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [{
                "id": text.parse_int(u["id_str"]),
                "name": u["screen_name"],
                "nick": u["name"],
            } for u in mentions]

        # expand shortened t.co links inside the tweet text
        content = text.unescape(tget("full_text") or tget("text") or "")
        urls = entities.get("urls")
        if urls:
            for url in urls:
                content = content.replace(url["url"], url["expanded_url"])
        # drop a trailing t.co link (usually the media link itself)
        txt, _, tco = content.rpartition(" ")
        tdata["content"] = txt if tco.startswith("https://t.co/") else content

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
        if "quoted_by" in tweet:
            tdata["quote_by"] = tweet["quoted_by"]

        return tdata
|
|
|
|
|
|
|
|
|
|
    def _transform_user(self, user):
        """Build a flat metadata dict from a raw user object.

        Results are memoized per user id in self._user_cache, so repeated
        tweets by the same user share one dict.
        """
        uid = user.get("rest_id") or user["id_str"]

        # EAFP cache lookup
        try:
            return self._user_cache[uid]
        except KeyError:
            pass

        # GraphQL responses nest the classic fields under 'legacy'
        if "legacy" in user:
            user = user["legacy"]

        uget = user.get
        entities = user["entities"]

        # insert into the cache before post-processing below
        self._user_cache[uid] = udata = {
            "id"              : text.parse_int(uid),
            "name"            : user["screen_name"],
            "nick"            : user["name"],
            "location"        : uget("location"),
            "date"            : text.parse_datetime(
                uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
            "verified"        : uget("verified", False),
            "profile_banner"  : uget("profile_banner_url", ""),
            # '_normal' suffix marks the downscaled avatar; strip it to
            # get the full-size image URL
            "profile_image"   : uget(
                "profile_image_url_https", "").replace("_normal.", "."),
            "favourites_count": uget("favourites_count"),
            "followers_count" : uget("followers_count"),
            "friends_count"   : uget("friends_count"),
            "listed_count"    : uget("listed_count"),
            "media_count"     : uget("media_count"),
            "statuses_count"  : uget("statuses_count"),
        }

        # expand shortened t.co links inside the profile description
        descr = user["description"]
        urls = entities["description"].get("urls")
        if urls:
            for url in urls:
                descr = descr.replace(url["url"], url["expanded_url"])
        udata["description"] = descr

        if "url" in entities:
            url = entities["url"]["urls"][0]
            udata["url"] = url.get("expanded_url") or url.get("url")

        return udata
|
2020-06-06 23:51:54 +02:00
|
|
|
|
|
2021-03-15 22:55:24 +01:00
|
|
|
|
    def _users_result(self, users):
        """Yield Queue messages for a sequence of user objects.

        The 'users' config option selects which extractor each queued
        URL should be handled by: "timeline" (default), "media",
        "tweets", or a custom format string.
        """
        userfmt = self.config("users")
        if not userfmt or userfmt == "timeline":
            cls = TwitterTimelineExtractor
            fmt = (self.root + "/i/user/{rest_id}").format_map
        elif userfmt == "media":
            cls = TwitterMediaExtractor
            fmt = (self.root + "/id:{rest_id}/media").format_map
        elif userfmt == "tweets":
            cls = TwitterTweetsExtractor
            fmt = (self.root + "/id:{rest_id}/tweets").format_map
        else:
            # custom format string: let pattern matching pick the extractor
            cls = None
            fmt = userfmt.format_map

        for user in users:
            user["_extractor"] = cls
            yield Message.Queue, fmt(user), user
|
2021-03-15 22:55:24 +01:00
|
|
|
|
|
2022-06-12 17:26:51 +02:00
|
|
|
|
def _expand_tweets(self, tweets):
|
|
|
|
|
seen = set()
|
|
|
|
|
for tweet in tweets:
|
|
|
|
|
|
|
|
|
|
if "legacy" in tweet:
|
|
|
|
|
cid = tweet["legacy"]["conversation_id_str"]
|
|
|
|
|
else:
|
|
|
|
|
cid = tweet["conversation_id_str"]
|
|
|
|
|
|
|
|
|
|
if cid not in seen:
|
|
|
|
|
seen.add(cid)
|
|
|
|
|
try:
|
|
|
|
|
yield from self.api.tweet_detail(cid)
|
|
|
|
|
except Exception:
|
|
|
|
|
yield tweet
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
def metadata(self):
|
|
|
|
|
"""Return general metadata"""
|
2019-11-30 21:51:08 +01:00
|
|
|
|
return {}
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
|
|
|
|
    def tweets(self):
        """Yield all relevant tweet objects"""
        # Stub: subclasses override this to supply their tweet source.
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
2019-04-07 23:06:57 +02:00
|
|
|
|
def login(self):
|
2021-01-25 14:52:22 +01:00
|
|
|
|
if not self._check_cookies(self.cookienames):
|
|
|
|
|
username, password = self._get_auth_info()
|
|
|
|
|
if username:
|
|
|
|
|
self._update_cookies(self._login_impl(username, password))
|
2019-04-07 23:06:57 +02:00
|
|
|
|
|
|
|
|
|
    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Perform a username/password login and return session cookies.

        Results are cached per username for ~360 days.  Raises
        AuthenticationError when login fails or the account requires
        two-factor authentication.
        """
        self.log.info("Logging in as %s", username)

        token = util.generate_token()
        # start from a clean session and prime cookies via the login page
        self.session.cookies.clear()
        self.request(self.root + "/login")

        url = self.root + "/sessions"
        cookies = {
            "_mb_tk": token,
        }
        data = {
            "redirect_after_login"      : "/",
            "remember_me"               : "1",
            "authenticity_token"        : token,
            "wfa"                       : "1",
            "ui_metrics"                : "{}",
            "session[username_or_email]": username,
            "session[password]"         : password,
        }
        response = self.request(
            url, method="POST", cookies=cookies, data=data)

        if "/account/login_verification" in response.url:
            raise exception.AuthenticationError(
                "Login with two-factor authentication is not supported")

        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
        }

        # successful login leaves an 'auth_token' cookie behind
        if "/error" in response.url or "auth_token" not in cookies:
            raise exception.AuthenticationError()
        return cookies
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for a Twitter user timeline"""
    subcategory = "timeline"
    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        # suspended account (#2216)
        ("https://twitter.com/realDonaldTrump", {
            "exception": exception.NotFoundError,
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
        ("https://www.twitter.com/id:2976459548"),
        ("https://twitter.com/i/user/2976459548"),
        ("https://twitter.com/intent/user?user_id=2976459548"),
        ("https://fxtwitter.com/supernaturepics"),
        ("https://vxtwitter.com/supernaturepics"),
    )

    def __init__(self, match):
        TwitterExtractor.__init__(self, match)
        # group(2) is set for /i/user/<id> and intent/user?user_id=<id> URLs
        user_id = match.group(2)
        if user_id:
            self.user = "id:" + user_id

    def tweets(self):
        """Yield timeline tweets, then continue via search beyond the
        timeline's reach using the last seen tweet id as max_id."""
        # yield initial batch of (media) tweets
        tweet = None
        for tweet in self._select_tweet_source()(self.user):
            yield tweet
        if tweet is None:
            return

        # build search query
        query = "from:{} max_id:{}".format(
            self._user["name"], tweet["rest_id"])
        if self.retweets:
            query += " include:retweets include:nativeretweets"

        if not self.textonly:
            # try to search for media-only tweets
            tweet = None
            for tweet in self.api.search_adaptive(query + (
                    " (filter:images OR"
                    " filter:native_video OR"
                    " card_name:animated_gif)")):
                yield tweet
            if tweet is not None:
                return

        # yield unfiltered search results
        yield from self.api.search_adaptive(query)

    def _select_tweet_source(self):
        """Pick the API endpoint for the initial timeline batch,
        based on the 'strategy' option (auto/tweets/with_replies/media)."""
        strategy = self.config("strategy")
        if strategy is None or strategy == "auto":
            # media timeline misses retweets and text-only tweets
            if self.retweets or self.textonly:
                return self.api.user_tweets
            else:
                return self.api.user_media
        if strategy == "tweets":
            return self.api.user_tweets
        if strategy == "with_replies":
            return self.api.user_tweets_and_replies
        return self.api.user_media
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2022-05-23 18:23:21 +02:00
|
|
|
|
class TwitterTweetsExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's Tweets timeline"""
    subcategory = "tweets"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/tweets", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/tweets#t"),
        ("https://www.twitter.com/id:2976459548/tweets"),
    )

    def tweets(self):
        # delegate to the UserTweets timeline endpoint
        return self.api.user_tweets(self.user)
|
|
|
|
|
|
|
|
|
|
|
2021-09-10 20:40:43 +02:00
|
|
|
|
class TwitterRepliesExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's timeline including replies"""
    subcategory = "replies"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/with_replies", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/with_replies#t"),
        ("https://www.twitter.com/id:2976459548/with_replies"),
    )

    def tweets(self):
        # delegate to the TweetsAndReplies timeline endpoint
        return self.api.user_tweets_and_replies(self.user)
|
2021-09-10 20:40:43 +02:00
|
|
|
|
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's Media timeline"""
    subcategory = "media"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
        ("https://www.twitter.com/id:2976459548/media"),
    )

    def tweets(self):
        # delegate to the UserMedia timeline endpoint
        return self.api.user_media(self.user)
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2019-10-17 18:34:07 +02:00
|
|
|
|
|
2020-06-16 14:27:22 +02:00
|
|
|
|
class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
    test = ("https://twitter.com/supernaturepics/likes",)

    def metadata(self):
        # record whose likes are being extracted
        return {"user_likes": self.user}

    def tweets(self):
        return self.api.user_likes(self.user)
|
2020-06-16 14:27:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # empty capture group keeps self.user set (bookmarks belong to the
    # logged-in account, not a URL-specified user)
    pattern = BASE_PATTERN + r"/i/bookmarks()"
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        return self.api.user_bookmarks()
|
2020-06-16 14:27:22 +02:00
|
|
|
|
|
|
|
|
|
|
2020-11-05 22:55:38 +01:00
|
|
|
|
class TwitterListExtractor(TwitterExtractor):
    """Extractor for Twitter lists"""
    subcategory = "list"
    # trailing '/?$' keeps this from matching '/members' URLs
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        # self.user holds the numeric list id here
        return self.api.list_latest_tweets_timeline(self.user)
|
2020-11-05 22:55:38 +01:00
|
|
|
|
|
|
|
|
|
|
2020-11-13 06:47:45 +01:00
|
|
|
|
class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for members of a Twitter list"""
    subcategory = "list-members"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        # queues one URL per list member instead of yielding tweets
        self.login()
        return self._users_result(TwitterAPI(self).list_members(self.user))
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
|
|
|
|
|
2021-02-22 18:18:33 +01:00
|
|
|
|
class TwitterFollowingExtractor(TwitterExtractor):
    """Extractor for followed users"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/following"),
        ("https://www.twitter.com/id:2976459548/following"),
    )

    def items(self):
        # queues one URL per followed account instead of yielding tweets
        self.login()
        return self._users_result(TwitterAPI(self).user_following(self.user))
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
|
|
|
|
|
2019-10-16 18:23:10 +02:00
|
|
|
|
class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for Twitter search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        # self.user holds the URL-encoded search query here
        return {"search": text.unquote(self.user)}

    def tweets(self):
        """Yield search results; when the query targets exactly one
        'from:' user, resolve that user for 'user' metadata first."""
        query = text.unquote(self.user)

        user = None
        for item in query.split():
            item = item.strip("()")
            if item.startswith("from:"):
                if user:
                    # multiple 'from:' terms — no single user to attribute
                    user = None
                    break
                else:
                    user = item[5:]

        if user is not None:
            try:
                self._user_obj = user = self.api.user_by_screen_name(user)
            except KeyError:
                raise exception.NotFoundError("user")
            self._user = self._transform_user(user)

        return self.api.search_adaptive(query)
|
2022-01-22 20:55:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TwitterEventExtractor(TwitterExtractor):
    """Extractor for Tweets from a Twitter Event"""
    subcategory = "event"
    directory_fmt = ("{category}", "Events",
                     "{event[id]} {event[short_title]}")
    pattern = BASE_PATTERN + r"/i/events/(\d+)"
    test = ("https://twitter.com/i/events/1484669206993903616", {
        "range": "1-20",
        "count": ">5",
    })

    def metadata(self):
        # self.user holds the numeric event id here
        return {"event": self.api.live_event(self.user)}

    def tweets(self):
        return self.api.live_event_timeline(self.user)
|
2019-10-17 18:34:07 +02:00
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    # group 1: screen name or 'i/web'; group 2: numeric Tweet ID
    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # 'replies' to self (#1254)
        ("https://twitter.com/i/web/status/1424882930803908612", {
            "options": (("replies", "self"),),
            "count": 4,
            "keyword": {"user": {
                "description": "re:business email-- rhettaro.bloom@gmail.com "
                               "patreon- http://patreon.com/Princecanary",
                "url": "http://princecanary.tumblr.com",
            }},
        }),
        ("https://twitter.com/i/web/status/1424898916156284928", {
            "options": (("replies", "self"),),
            "count": 0,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", True),),
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
            "count": 8,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True), ("cards", False)),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # unified_card with image_carousel_website
        ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
            "options": (("cards", True),),
            "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
            "count": 6,
        }),
        # unified_card without type
        ("https://twitter.com/i/web/status/1466183847628865544", {
            "count": 0,
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id"  : 1296296016002547713,
                "retweet_id": 1296296016002547713,
                "date"      : "dt:2020-08-20 04:00:28",
            },
        }),
        # all Tweets from a 'conversation' (#1319)
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "options": (("conversations", True),),
            "count": 5,
        }),
        # retweet with missing media entities (#1555)
        ("https://twitter.com/morino_ya/status/1392763691599237121", {
            "options": (("retweets", True),),
            "count": 4,
        }),
        # deleted quote tweet (#2225)
        ("https://twitter.com/i/web/status/1460044411165888515", {
            "count": 0,
        }),
        # "Misleading" content
        ("https://twitter.com/i/web/status/1486373748911575046", {
            "count": 4,
        }),
        # age-restricted (#2354)
        ("https://twitter.com/mightbecursed/status/1492954264909479936", {
            "options": (("syndication", True),),
            "count": 1,
        }),
        # media alt texts / descriptions (#2617)
        ("https://twitter.com/my0nruri/status/1528379296041299968", {
            "keyword": {"description": "oc"}
        }),
    )

    def __init__(self, match):
        TwitterExtractor.__init__(self, match)
        # the numeric Tweet ID from the second capture group
        self.tweet_id = match.group(2)

    def tweets(self):
        """Return the Tweet(s) this URL refers to.

        With the 'conversations' option enabled, the whole conversation
        is returned; otherwise only the selected Tweet (or the Tweet it
        retweeted) plus any quoted Tweets, followed transitively.
        """
        if self.config("conversations", False):
            return self.api.tweet_detail(self.tweet_id)

        tweets = []
        tweet_id = self.tweet_id
        for tweet in self.api.tweet_detail(tweet_id):
            if tweet["rest_id"] == tweet_id or \
                    tweet.get("_retweet_id_str") == tweet_id:
                tweets.append(tweet)

                # follow the chain of quoted Tweets, if any
                tweet_id = tweet["legacy"].get("quoted_status_id_str")
                if not tweet_id:
                    break
        return tweets
|
2020-01-04 23:46:29 +01:00
|
|
|
|
|
|
|
|
|
|
2021-04-02 02:45:23 +02:00
|
|
|
|
class TwitterImageExtractor(Extractor):
    """Extractor for a single image from a pbs.twimg.com media URL"""
    category = "twitter"
    subcategory = "image"
    # group 1: media ID; group 2: file format/extension
    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
    test = (
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
            "options": (("size", "4096x4096,orig"),),
            "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
        }),
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
    )

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.id, self.fmt = match.groups()
        # borrow TwitterExtractor's size-selection setup without
        # inheriting from it (sets self._size_image and fallbacks)
        TwitterExtractor._init_sizes(self)

    def items(self):
        # base URL; the size name ('orig', '4096x4096', ...) is appended
        base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
            self.id, self.fmt)

        data = {
            "filename": self.id,
            "extension": self.fmt,
            "_fallback": TwitterExtractor._image_fallback(self, base),
        }

        yield Message.Directory, data
        yield Message.Url, base + self._size_image, data
|
2021-04-02 02:45:23 +02:00
|
|
|
|
|
|
|
|
|
|
2020-06-03 20:51:29 +02:00
|
|
|
|
class TwitterAPI():
|
|
|
|
|
|
|
|
|
|
    def __init__(self, extractor):
        """Prepare request headers, default parameters, and auth state.

        Uses the extractor's session cookies to decide between an
        authenticated session ('auth_token' cookie present) and a
        guest session (guest token is requested and stored).
        """
        self.extractor = extractor

        self.root = "https://twitter.com/i/api"
        # static web-client Bearer token; the per-session token
        # headers are filled in further below
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-auth-type": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Referer": "https://twitter.com/",
        }
        # default query parameters for the legacy v1.1/v2 endpoints
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "include_ext_has_nft_avatar": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_ext_alt_text": "true",
            "include_quote_count": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "include_ext_sensitive_media_warning": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
                   "voiceInfo,superFollowMetadata",
        }
        # default variables shared by all GraphQL queries
        self.variables = {
            "includePromotedContent": False,
            "withSuperFollowsUserFields": True,
            "withBirdwatchPivots": False,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": True,
            "withClientEventToken": False,
            "withBirdwatchNotes": False,
            "withVoice": True,
            "withV2Timeline": False,
            "__fs_interactive_text": False,
            "__fs_dont_mention_me_view_api_enabled": False,
        }

        # flag consumed by _process_tombstone for NSFW notices
        self._nsfw_warning = True
        self._syndication = extractor.config("syndication")
        # compact JSON encoder for GraphQL 'variables' parameters
        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode

        cookies = extractor.session.cookies
        cookiedomain = extractor.cookiedomain

        # CSRF token: taken from the 'ct0' cookie by default,
        # otherwise a random token is generated and stored
        csrf = extractor.config("csrf")
        if csrf is None or csrf == "cookies":
            csrf_token = cookies.get("ct0", domain=cookiedomain)
        else:
            csrf_token = None
        if not csrf_token:
            csrf_token = util.generate_token()
            cookies.set("ct0", csrf_token, domain=cookiedomain)
        self.headers["x-csrf-token"] = csrf_token

        if cookies.get("auth_token", domain=cookiedomain):
            # logged in
            self.headers["x-twitter-auth-type"] = "OAuth2Session"
        else:
            # guest
            guest_token = self._guest_token()
            cookies.set("gt", guest_token, domain=cookiedomain)
            self.headers["x-guest-token"] = guest_token
|
|
|
|
|
|
|
|
|
|
def tweet_detail(self, tweet_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"focalTweetId": tweet_id,
|
|
|
|
|
"with_rux_injections": False,
|
|
|
|
|
"withCommunity": True,
|
|
|
|
|
"withQuickPromoteEligibilityTweetFields": True,
|
|
|
|
|
"withBirdwatchNotes": False,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("threaded_conversation_with_injections",))
|
|
|
|
|
|
|
|
|
|
def user_tweets(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/WZT7sCTrLvSOaWOXLDsWbQ/UserTweets"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
"withQuickPromoteEligibilityTweetFields": True,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_tweets_and_replies(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/t4wEKVulW4Mbv1P0kgxTEw/UserTweetsAndReplies"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
"withCommunity": True,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_media(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_likes(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/9MSTt44HoGjVFSg_u3rHDw/Likes"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_bookmarks(self):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/uKP9v_I31k0_VSBmlpq2Xg/Bookmarks"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("bookmark_timeline", "timeline"))
|
|
|
|
|
|
|
|
|
|
def list_latest_tweets_timeline(self, list_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"listId": list_id,
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("list", "tweets_timeline", "timeline"))
|
|
|
|
|
|
2022-01-22 20:55:50 +01:00
|
|
|
|
def search_adaptive(self, query):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/2/search/adaptive.json"
|
2022-01-22 20:55:50 +01:00
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["q"] = query
|
|
|
|
|
params["tweet_search_mode"] = "live"
|
|
|
|
|
params["query_source"] = "typed_query"
|
|
|
|
|
params["pc"] = "1"
|
|
|
|
|
params["spelling_corrections"] = "1"
|
|
|
|
|
return self._pagination_legacy(endpoint, params)
|
|
|
|
|
|
|
|
|
|
def live_event_timeline(self, event_id):
|
|
|
|
|
endpoint = "/2/live_event/timeline/{}.json".format(event_id)
|
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["timeline_id"] = "recap"
|
|
|
|
|
params["urt"] = "true"
|
|
|
|
|
params["get_annotations"] = "true"
|
|
|
|
|
return self._pagination_legacy(endpoint, params)
|
|
|
|
|
|
|
|
|
|
def live_event(self, event_id):
|
|
|
|
|
endpoint = "/1.1/live_event/1/{}/timeline.json".format(event_id)
|
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["count"] = "0"
|
|
|
|
|
params["urt"] = "true"
|
|
|
|
|
return (self._call(endpoint, params)
|
|
|
|
|
["twitter_objects"]["live_events"][event_id])
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2020-11-05 22:55:38 +01:00
|
|
|
|
def list_by_rest_id(self, list_id):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId"
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps({
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"listId": list_id,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
|
|
|
|
})}
|
2020-11-05 22:55:38 +01:00
|
|
|
|
try:
|
|
|
|
|
return self._call(endpoint, params)["data"]["list"]
|
|
|
|
|
except KeyError:
|
|
|
|
|
raise exception.NotFoundError("list")
|
|
|
|
|
|
2021-02-22 18:18:33 +01:00
|
|
|
|
def list_members(self, list_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/snESM0DPs3c7M1SBm4rvVw/ListMembers"
|
2021-02-22 18:18:33 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"listId": list_id,
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"count": 100,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
2021-02-22 18:18:33 +01:00
|
|
|
|
}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
return self._pagination_users(
|
|
|
|
|
endpoint, variables, ("list", "members_timeline", "timeline"))
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
|
|
|
|
def user_following(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/mIwX8GogcobVlRwlgpHNYA/Following"
|
2021-02-22 18:18:33 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"count": 100,
|
2021-02-22 18:18:33 +01:00
|
|
|
|
}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
return self._pagination_users(endpoint, variables)
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
2022-02-01 18:24:03 +01:00
|
|
|
|
def user_by_rest_id(self, rest_id):
|
|
|
|
|
endpoint = "/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId"
|
|
|
|
|
params = {"variables": self._json_dumps({
|
|
|
|
|
"userId": rest_id,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
|
|
|
|
})}
|
|
|
|
|
return self._call(endpoint, params)["data"]["user"]["result"]
|
|
|
|
|
|
2020-06-03 20:51:29 +02:00
|
|
|
|
def user_by_screen_name(self, screen_name):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName"
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps({
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"screen_name": screen_name,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
2022-01-23 01:44:55 +01:00
|
|
|
|
})}
|
2022-01-23 17:31:07 +01:00
|
|
|
|
return self._call(endpoint, params)["data"]["user"]["result"]
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2020-09-08 22:56:52 +02:00
|
|
|
|
    def _user_id_by_screen_name(self, screen_name):
        """Resolve 'screen_name' to a numeric user ID.

        Accepts a regular screen name or an 'id:<number>' string.
        As a side effect, the fetched user object is stored on the
        extractor ('_user_obj' / '_user') for later metadata use.

        Raises NotFoundError when the user cannot be resolved.
        """
        if screen_name.startswith("id:"):
            user_id = screen_name[3:]
            user = self.user_by_rest_id(user_id)

        else:
            # empty placeholder so the checks below are safe
            # even if user_by_screen_name() itself raises KeyError
            user = ()
            try:
                user = self.user_by_screen_name(screen_name)
                user_id = user["rest_id"]
            except KeyError:
                if "unavailable_message" in user:
                    raise exception.NotFoundError("{} ({})".format(
                        user["unavailable_message"].get("text"),
                        user.get("reason")), False)
                else:
                    raise exception.NotFoundError("user")

        extr = self.extractor
        extr._user_obj = user
        extr._user = extr._transform_user(user)

        return user_id
|
2020-09-08 22:56:52 +02:00
|
|
|
|
|
2020-06-18 00:28:38 +02:00
|
|
|
|
@cache(maxage=3600)
|
|
|
|
|
def _guest_token(self):
|
2020-12-28 22:05:48 +01:00
|
|
|
|
root = "https://api.twitter.com"
|
|
|
|
|
endpoint = "/1.1/guest/activate.json"
|
2021-07-01 14:35:53 +02:00
|
|
|
|
return str(self._call(endpoint, None, root, "POST")["guest_token"])
|
2020-06-18 00:28:38 +02:00
|
|
|
|
|
2022-02-02 18:37:19 +01:00
|
|
|
|
    def _call(self, endpoint, params, root=None, method="GET"):
        """Send an API request and return the decoded JSON response.

        Retries when rate-limited (HTTP 429), waiting until the time
        given by 'x-rate-limit-reset' (or 60 seconds as fallback).
        Any other error status raises StopExtraction with the error
        message(s) extracted from the response.
        """
        if root is None:
            root = self.root

        while True:
            response = self.extractor.request(
                root + endpoint, method=method, params=params,
                headers=self.headers, fatal=None)

            # update 'x-csrf-token' header (#1170)
            csrf_token = response.cookies.get("ct0")
            if csrf_token:
                self.headers["x-csrf-token"] = csrf_token

            if response.status_code < 400:
                # success
                return response.json()

            if response.status_code == 429:
                # rate limit exceeded
                until = response.headers.get("x-rate-limit-reset")
                seconds = None if until else 60
                self.extractor.wait(until=until, seconds=seconds)
                continue

            # error: try to extract a meaningful message from the body
            try:
                data = response.json()
                errors = ", ".join(e["message"] for e in data["errors"])
            except ValueError:
                # response body is not JSON
                errors = response.text
            except Exception:
                # unexpected 'errors' structure
                errors = data.get("errors", "")

            raise exception.StopExtraction(
                "%s %s (%s)", response.status_code, response.reason, errors)
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2022-01-22 20:55:50 +01:00
|
|
|
|
    def _pagination_legacy(self, endpoint, params):
        """Yield Tweets from a legacy (v1.1/v2) timeline endpoint.

        Follows 'cursor' pagination, resolves retweets and quoted
        Tweets from the 'globalObjects' lookup tables, and attaches
        'user'/'author' objects to each yielded Tweet dict.
        """
        original_retweets = (self.extractor.retweets == "original")

        while True:
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweet_ids = []
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            # collect tweet IDs and cursor value
            for entry in instr[0]["addEntries"]["entries"]:
                entry_startswith = entry["entryId"].startswith

                if entry_startswith(("tweet-", "sq-I-t-")):
                    tweet_ids.append(
                        entry["content"]["item"]["content"]["tweet"]["id"])

                elif entry_startswith("homeConversation-"):
                    # reversed so Tweets come out in chronological order
                    tweet_ids.extend(
                        entry["content"]["timelineModule"]["metadata"]
                        ["conversationMetadata"]["allTweetIds"][::-1])

                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

                elif entry_startswith("conversationThread-"):
                    # 'entryId' is 'tweet-<id>'; strip the prefix
                    tweet_ids.extend(
                        item["entryId"][6:]
                        for item in entry["content"]["timelineModule"]["items"]
                        if item["entryId"].startswith("tweet-")
                    )

            # process tweets
            for tweet_id in tweet_ids:
                try:
                    tweet = tweets[tweet_id]
                except KeyError:
                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
                    continue

                if "retweeted_status_id_str" in tweet:
                    retweet = tweets.get(tweet["retweeted_status_id_str"])
                    if original_retweets:
                        # yield the retweeted Tweet instead of the retweet
                        if not retweet:
                            continue
                        retweet["retweeted_status_id_str"] = retweet["id_str"]
                        retweet["_retweet_id_str"] = tweet["id_str"]
                        tweet = retweet
                    elif retweet:
                        tweet["author"] = users[retweet["user_id_str"]]
                        # copy media entities missing from the retweet (#1555)
                        if "extended_entities" in retweet and \
                                "extended_entities" not in tweet:
                            tweet["extended_entities"] = \
                                retweet["extended_entities"]
                tweet["user"] = users[tweet["user_id_str"]]
                yield tweet

                if "quoted_status_id_str" in tweet:
                    quoted = tweets.get(tweet["quoted_status_id_str"])
                    if quoted:
                        # copy, since the same Tweet can be quoted repeatedly
                        quoted = quoted.copy()
                        quoted["author"] = users[quoted["user_id_str"]]
                        quoted["quoted_by"] = tweet["user"]["screen_name"]
                        quoted["quoted_by_id_str"] = tweet["id_str"]
                        yield quoted

            # update cursor value
            if "replaceEntry" in instr[-1] :
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            if not cursor or not tweet:
                return
            params["cursor"] = cursor
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
2022-01-21 23:34:41 +01:00
|
|
|
|
    def _pagination_tweets(self, endpoint, variables, path=None):
        """Yield Tweets from a GraphQL timeline endpoint.

        'path' selects where the timeline instructions live inside the
        response (default: the user timeline location). Handles pinned
        Tweets, retweets, quoted Tweets, tombstones, and cursor-based
        pagination. On a missing timeline, tries to give a meaningful
        error (blocked / protected account) before giving up.
        """
        extr = self.extractor
        variables.update(self.variables)
        original_retweets = (extr.retweets == "original")
        pinned_tweet = extr.pinned

        while True:
            params = {"variables": self._json_dumps(variables)}
            data = self._call(endpoint, params)["data"]

            try:
                if path is None:
                    instructions = (data["user"]["result"]["timeline"]
                                    ["timeline"]["instructions"])
                else:
                    instructions = data
                    for key in path:
                        instructions = instructions[key]
                    instructions = instructions["instructions"]

                for instr in instructions:
                    if instr.get("type") == "TimelineAddEntries":
                        entries = instr["entries"]
                        break
                else:
                    raise KeyError()

            except LookupError:
                extr.log.debug(data)

                # no timeline in the response; figure out why
                user = extr._user_obj
                if user:
                    user = user["legacy"]
                    if user.get("blocked_by"):
                        # optionally retry as guest when the logged-in
                        # account is blocked by this user
                        if self.headers["x-twitter-auth-type"] and \
                                extr.config("logout"):
                            guest_token = self._guest_token()
                            extr.session.cookies.set(
                                "gt", guest_token, domain=extr.cookiedomain)
                            extr._cookiefile = None
                            del extr.session.cookies["auth_token"]
                            self.headers["x-guest-token"] = guest_token
                            self.headers["x-twitter-auth-type"] = None
                            extr.log.info("Retrying API request as guest")
                            continue
                        raise exception.AuthorizationError(
                            "{} blocked your account".format(
                                user["screen_name"]))
                    elif user.get("protected"):
                        raise exception.AuthorizationError(
                            "{}'s Tweets are protected".format(
                                user["screen_name"]))

                raise exception.StopExtraction(
                    "Unable to retrieve Tweets from this timeline")

            tweets = []
            tweet = cursor = None

            # handle a pinned Tweet only on the first page
            if pinned_tweet:
                pinned_tweet = False
                if instructions[-1]["type"] == "TimelinePinEntry":
                    tweets.append(instructions[-1]["entry"])

            # collect Tweet entries and the bottom cursor
            for entry in entries:
                esw = entry["entryId"].startswith

                if esw("tweet-"):
                    tweets.append(entry)
                elif esw("homeConversation-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("conversationthread-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("tombstone-"):
                    # normalize tombstones to the regular entry layout
                    item = entry["content"]["itemContent"]
                    item["tweet_results"] = \
                        {"result": {"tombstone": item["tombstoneInfo"]}}
                    tweets.append(entry)
                elif esw("cursor-bottom-"):
                    cursor = entry["content"]
                    if "itemContent" in cursor:
                        cursor = cursor["itemContent"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor.get("value")

            for entry in tweets:
                try:
                    tweet = ((entry.get("content") or entry["item"])
                             ["itemContent"]["tweet_results"]["result"])
                    if "tombstone" in tweet:
                        tweet = self._process_tombstone(
                            entry, tweet["tombstone"])
                        if not tweet:
                            continue
                    if "tweet" in tweet:
                        tweet = tweet["tweet"]
                    legacy = tweet["legacy"]
                except KeyError:
                    extr.log.debug(
                        "Skipping %s (deleted)",
                        (entry.get("entryId") or "").rpartition("-")[2])
                    continue

                if "retweeted_status_result" in legacy:
                    retweet = legacy["retweeted_status_result"]["result"]
                    if original_retweets:
                        # yield the retweeted Tweet instead of the retweet
                        try:
                            retweet["legacy"]["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            retweet["_retweet_id_str"] = tweet["rest_id"]
                            tweet = retweet
                        except KeyError:
                            continue
                    else:
                        try:
                            legacy["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            tweet["author"] = \
                                retweet["core"]["user_results"]["result"]
                            # copy media entities missing from the
                            # retweet (#1555)
                            if "extended_entities" in retweet["legacy"] and \
                                    "extended_entities" not in legacy:
                                legacy["extended_entities"] = \
                                    retweet["legacy"]["extended_entities"]
                        except KeyError:
                            pass

                yield tweet

                if "quoted_status_result" in tweet:
                    try:
                        quoted = tweet["quoted_status_result"]["result"]
                        quoted["legacy"]["quoted_by"] = (
                            tweet["core"]["user_results"]["result"]
                            ["legacy"]["screen_name"])
                        quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
                        yield quoted
                    except KeyError:
                        extr.log.debug(
                            "Skipping quote of %s (deleted)",
                            tweet.get("rest_id"))
                        continue

            if not tweet or not cursor:
                return
            variables["cursor"] = cursor
|
|
|
|
|
|
|
|
|
|
def _pagination_users(self, endpoint, variables, path=None):
|
2022-01-23 01:44:55 +01:00
|
|
|
|
variables.update(self.variables)
|
|
|
|
|
|
2020-11-13 06:47:45 +01:00
|
|
|
|
while True:
|
|
|
|
|
cursor = entry = stop = None
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps(variables)}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
data = self._call(endpoint, params)["data"]
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
2022-01-22 23:09:45 +01:00
|
|
|
|
try:
|
|
|
|
|
if path is None:
|
|
|
|
|
instructions = (data["user"]["result"]["timeline"]
|
|
|
|
|
["timeline"]["instructions"])
|
|
|
|
|
else:
|
|
|
|
|
for key in path:
|
|
|
|
|
data = data[key]
|
|
|
|
|
instructions = data["instructions"]
|
|
|
|
|
except KeyError:
|
|
|
|
|
return
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
|
|
|
|
for instr in instructions:
|
|
|
|
|
if instr["type"] == "TimelineAddEntries":
|
|
|
|
|
for entry in instr["entries"]:
|
|
|
|
|
if entry["entryId"].startswith("user-"):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
user = (entry["content"]["itemContent"]
|
|
|
|
|
["user_results"]["result"])
|
|
|
|
|
if "rest_id" in user:
|
|
|
|
|
yield user
|
2020-11-13 06:47:45 +01:00
|
|
|
|
elif entry["entryId"].startswith("cursor-bottom-"):
|
|
|
|
|
cursor = entry["content"]["value"]
|
|
|
|
|
elif instr["type"] == "TimelineTerminateTimeline":
|
|
|
|
|
if instr["direction"] == "Bottom":
|
|
|
|
|
stop = True
|
|
|
|
|
|
|
|
|
|
if stop or not cursor or not entry:
|
|
|
|
|
return
|
|
|
|
|
variables["cursor"] = cursor
|
2022-03-03 01:51:52 +01:00
|
|
|
|
|
2022-03-31 20:31:58 +02:00
|
|
|
|
def _process_tombstone(self, entry, tombstone):
|
2022-03-03 01:51:52 +01:00
|
|
|
|
text = (tombstone.get("richText") or tombstone["text"])["text"]
|
2022-03-31 20:31:58 +02:00
|
|
|
|
tweet_id = entry["entryId"].rpartition("-")[2]
|
|
|
|
|
|
|
|
|
|
if text.startswith("Age-restricted"):
|
|
|
|
|
if self._syndication:
|
|
|
|
|
return self._syndication_tweet(tweet_id)
|
|
|
|
|
elif self._nsfw_warning:
|
|
|
|
|
self._nsfw_warning = False
|
|
|
|
|
self.extractor.log.warning('"%s"', text)
|
|
|
|
|
|
|
|
|
|
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
|
|
|
|
|
|
|
|
|
|
def _syndication_tweet(self, tweet_id):
|
|
|
|
|
tweet = self.extractor.request(
|
|
|
|
|
"https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json()
|
|
|
|
|
|
|
|
|
|
tweet["user"]["description"] = ""
|
|
|
|
|
tweet["user"]["entities"] = {"description": {}}
|
2022-04-15 20:49:26 +02:00
|
|
|
|
tweet["user_id_str"] = tweet["user"]["id_str"]
|
|
|
|
|
|
|
|
|
|
if tweet["id_str"] != tweet_id:
|
|
|
|
|
tweet["retweeted_status_id_str"] = tweet["id_str"]
|
|
|
|
|
tweet["id_str"] = retweet_id = tweet_id
|
|
|
|
|
else:
|
|
|
|
|
retweet_id = None
|
2022-03-31 20:31:58 +02:00
|
|
|
|
|
|
|
|
|
if "video" in tweet:
|
|
|
|
|
video = tweet["video"]
|
2022-04-11 17:06:10 +02:00
|
|
|
|
video["variants"] = (max(
|
|
|
|
|
(v for v in video["variants"] if v["type"] == "video/mp4"),
|
2022-04-15 20:49:26 +02:00
|
|
|
|
key=lambda v: text.parse_int(
|
|
|
|
|
v["src"].split("/")[-2].partition("x")[0])
|
2022-04-11 17:06:10 +02:00
|
|
|
|
),)
|
2022-03-31 20:31:58 +02:00
|
|
|
|
video["variants"][0]["url"] = video["variants"][0]["src"]
|
|
|
|
|
tweet["extended_entities"] = {"media": [{
|
|
|
|
|
"video_info" : video,
|
|
|
|
|
"original_info": {"width" : 0, "height": 0},
|
|
|
|
|
}]}
|
|
|
|
|
elif "photos" in tweet:
|
|
|
|
|
for p in tweet["photos"]:
|
|
|
|
|
p["media_url_https"] = p["url"]
|
|
|
|
|
p["original_info"] = {
|
|
|
|
|
"width" : p["width"],
|
|
|
|
|
"height": p["height"],
|
|
|
|
|
}
|
|
|
|
|
tweet["extended_entities"] = {"media": tweet["photos"]}
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
"rest_id": tweet["id_str"],
|
|
|
|
|
"legacy" : tweet,
|
|
|
|
|
"user" : tweet["user"],
|
2022-04-15 20:49:26 +02:00
|
|
|
|
"_retweet_id_str": retweet_id,
|
2022-03-31 20:31:58 +02:00
|
|
|
|
}
|