# -*- coding: utf-8 -*-

# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://twitter.com/"""
|
2016-10-06 19:12:07 +02:00
|
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2020-10-15 00:43:26 +02:00
|
|
|
|
from .. import text, util, exception
|
2020-06-10 20:58:42 +02:00
|
|
|
|
from ..cache import cache
|
2020-11-13 06:47:45 +01:00
|
|
|
|
import json
|
2017-02-01 00:53:19 +01:00
|
|
|
|
|
2020-07-13 23:48:42 +02:00
|
|
|
|
# Common URL prefix shared by all patterns below: optional scheme,
# optional "www."/"mobile." subdomain, and the supported hosts
# (twitter.com, fxtwitter.com, vxtwitter.com, nitter.net)
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.|mobile\.)?"
    r"(?:(?:[fv]x)?twitter\.com|nitter\.net)"
)
|
|
|
|
|
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    category = "twitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    # deduplicate by tweet id, retweet id, and file index
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    cookiedomain = ".twitter.com"
    # presence of 'auth_token' is treated as "already logged in" (see login())
    cookienames = ("auth_token",)
    root = "https://twitter.com"
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
    def __init__(self, match):
        """Initialize extractor state from a pattern match and user config.

        match.group(1) is the username (or search/list/event identifier,
        depending on the subclass pattern).
        """
        Extractor.__init__(self, match)
        self.user = match.group(1)
        # user-configurable filters and feature toggles
        self.textonly = self.config("text-tweets", False)
        self.retweets = self.config("retweets", False)
        self.replies = self.config("replies", True)
        self.twitpic = self.config("twitpic", False)
        self.pinned = self.config("pinned", False)
        self.quoted = self.config("quoted", False)
        self.videos = self.config("videos", True)
        self.cards = self.config("cards", False)
        # transformed user metadata / raw user object of the account
        # being extracted; filled in lazily by subclasses
        self._user = self._user_obj = None
        # cache of transformed user dicts, keyed by user id (_transform_user)
        self._user_cache = {}
        self._init_sizes()
|
2018-09-30 18:41:39 +02:00
|
|
|
|
|
2021-11-16 22:57:46 +01:00
|
|
|
|
def _init_sizes(self):
|
2021-10-05 18:58:10 +02:00
|
|
|
|
size = self.config("size")
|
|
|
|
|
if size is None:
|
|
|
|
|
self._size_image = "orig"
|
2021-12-15 23:17:07 +01:00
|
|
|
|
self._size_fallback = ("4096x4096", "large", "medium", "small")
|
2021-10-05 18:58:10 +02:00
|
|
|
|
else:
|
|
|
|
|
if isinstance(size, str):
|
|
|
|
|
size = size.split(",")
|
|
|
|
|
self._size_image = size[0]
|
|
|
|
|
self._size_fallback = size[1:]
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
    def items(self):
        """Generate Directory and Url messages for all matching tweets.

        Logs in (if credentials/cookies are available), fetches tweets via
        TwitterAPI, applies the configured filters (retweets, quotes,
        replies, uniqueness), collects media files per tweet, and yields
        them with transformed metadata.
        """
        self.login()
        self.api = TwitterAPI(self)
        metadata = self.metadata()

        if self.config("expand"):
            # replace the tweet source with fully expanded conversations;
            # materialized once so tweets() can be called again below
            tweets = self._expand_tweets(self.tweets())
            self.tweets = lambda : tweets

        if self.config("unique", True):
            seen_tweets = set()
        else:
            seen_tweets = None

        for tweet in self.tweets():

            # GraphQL results wrap the classic tweet data in 'legacy'
            if "legacy" in tweet:
                data = tweet["legacy"]
            else:
                data = tweet

            if seen_tweets is not None:
                if data["id_str"] in seen_tweets:
                    continue
                seen_tweets.add(data["id_str"])

            if not self.retweets and "retweeted_status_id_str" in data:
                self.log.debug("Skipping %s (retweet)", data["id_str"])
                continue
            if not self.quoted and "quoted_by_id_str" in data:
                self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
                continue
            # replies == "self" keeps only replies the user made to
            # their own tweets
            if "in_reply_to_user_id_str" in data and (
                not self.replies or (
                    self.replies == "self" and
                    data["user_id_str"] !=
                    (self._user_obj["rest_id"] if self._user else
                     data["in_reply_to_user_id_str"])
                )
            ):
                self.log.debug("Skipping %s (reply)", data["id_str"])
                continue

            files = []
            if "extended_entities" in data:
                self._extract_media(
                    data, data["extended_entities"]["media"], files)
            if "card" in tweet and self.cards:
                self._extract_card(tweet, files)
            if self.twitpic:
                self._extract_twitpic(data, files)
            # text-only tweets are skipped unless explicitly enabled
            if not files and not self.textonly:
                continue

            tdata = self._transform_tweet(tweet)
            tdata.update(metadata)
            tdata["count"] = len(files)
            yield Message.Directory, tdata
            for tdata["num"], file in enumerate(files, 1):
                file.update(tdata)
                url = file.pop("url")
                if "extension" not in file:
                    text.nameext_from_url(url, file)
                yield Message.Url, url, file
|
|
|
|
|
|
2022-01-13 15:58:18 +01:00
|
|
|
|
    def _extract_media(self, tweet, entities, files):
        """Append file dicts for all media entities of 'tweet' to 'files'.

        Handles three cases per entity: videos (either delegated to ytdl
        or the highest-bitrate variant), regular images (with size
        fallbacks), and a last-resort plain 'media_url'.
        """
        for media in entities:
            descr = media.get("ext_alt_text")
            width = media["original_info"].get("width", 0)
            height = media["original_info"].get("height", 0)

            if "video_info" in media:
                if self.videos == "ytdl":
                    # let youtube-dl/yt-dlp handle the whole tweet URL
                    files.append({
                        "url": "ytdl:{}/i/web/status/{}".format(
                            self.root, tweet["id_str"]),
                        "width"      : width,
                        "height"     : height,
                        "extension"  : None,
                        "description": descr,
                    })
                elif self.videos:
                    # pick the variant with the highest bitrate
                    video_info = media["video_info"]
                    variant = max(
                        video_info["variants"],
                        key=lambda v: v.get("bitrate", 0),
                    )
                    files.append({
                        "url"        : variant["url"],
                        "width"      : width,
                        "height"     : height,
                        "bitrate"    : variant.get("bitrate", 0),
                        "duration"   : video_info.get(
                            "duration_millis", 0) / 1000,
                        "description": descr,
                    })
            elif "media_url_https" in media:
                # rewrite ".../name.jpg" into the "?format=jpg&name=<size>"
                # form so alternate sizes can be requested
                url = media["media_url_https"]
                base, _, fmt = url.rpartition(".")
                base += "?format=" + fmt + "&name="
                files.append(text.nameext_from_url(url, {
                    "url"        : base + self._size_image,
                    "width"      : width,
                    "height"     : height,
                    "_fallback"  : self._image_fallback(base),
                    "description": descr,
                }))
            else:
                files.append({"url": media["media_url"]})
|
|
|
|
|
|
2021-10-05 18:58:10 +02:00
|
|
|
|
def _image_fallback(self, base):
|
|
|
|
|
for fmt in self._size_fallback:
|
|
|
|
|
yield base + fmt
|
2020-12-01 11:53:51 +01:00
|
|
|
|
|
2020-10-22 21:33:53 +02:00
|
|
|
|
    def _extract_card(self, tweet, files):
        """Extract media files from a tweet's 'card' attachment.

        Supports 'summary'/'summary_large_image' cards (single preview
        image, largest available size) and 'unified_card' image
        carousels.  Any other card type is optionally handed off to
        ytdl when cards == "ytdl".
        """
        card = tweet["card"]
        if "legacy" in card:
            card = card["legacy"]
        name = card["name"]

        if name in ("summary", "summary_large_image"):
            bvals = card["binding_values"]
            # binding_values can be a list of {key, value} pairs (GraphQL)
            # or already a mapping; normalize to a dict
            if isinstance(bvals, list):
                bvals = {
                    bval["key"]: bval["value"]
                    for bval in card["binding_values"]
                }
            # probe known image keys from largest to smallest;
            # the first hit wins
            for prefix in ("photo_image_full_size_",
                           "summary_photo_image_",
                           "thumbnail_image_"):
                for size in ("original", "x_large", "large", "small"):
                    key = prefix + size
                    if key in bvals:
                        value = bvals[key].get("image_value")
                        if value and "url" in value:
                            # upgrade to preferred size when the URL uses
                            # the "&name=<size>" form
                            base, sep, size = value["url"].rpartition("&name=")
                            if sep:
                                base += sep
                                value["url"] = base + self._size_image
                                value["_fallback"] = self._image_fallback(base)
                            files.append(value)
                            return
        elif name == "unified_card":
            bvals = card["binding_values"]
            if isinstance(bvals, list):
                for bval in card["binding_values"]:
                    if bval["key"] == "unified_card":
                        bval = bval["value"]["string_value"]
                        break
            else:
                bval = bvals["unified_card"]["string_value"]
            # the card payload is JSON serialized into a string value
            data = json.loads(bval)
            if data.get("type") == "image_carousel_website":
                self._extract_media(
                    tweet, data["media_entities"].values(), files)
                return

        # unhandled card type: optionally delegate the tweet URL to ytdl
        if self.cards == "ytdl":
            tweet_id = tweet.get("rest_id") or tweet["id_str"]
            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
            files.append({"url": url})
|
|
|
|
|
|
|
|
|
|
    def _extract_twitpic(self, tweet, files):
        """Resolve twitpic.com links in a tweet's URL entities and
        append the referenced images to 'files'.

        Fetches each twitpic page and reads its 'twitter:image' meta tag;
        unreachable pages (status >= 400) are skipped silently.
        """
        for url in tweet["entities"].get("urls", ()):
            url = url["expanded_url"]
            # '/photos/' pages are galleries, not single images — skip them
            if "//twitpic.com/" in url and "/photos/" not in url:
                response = self.request(url, fatal=False)
                if response.status_code >= 400:
                    continue
                url = text.extract(
                    response.text, 'name="twitter:image" value="', '"')[0]
                if url:
                    files.append({"url": url})
|
2020-06-04 01:22:34 +02:00
|
|
|
|
|
2020-06-06 23:51:54 +02:00
|
|
|
|
    def _transform_tweet(self, tweet):
        """Build a flat metadata dict from a raw (REST or GraphQL) tweet.

        Returns a dict with parsed ids, dates, counts, author/user info,
        hashtags, mentions, and cleaned tweet content.
        """
        # locate the author object, whose shape depends on the API endpoint
        if "author" in tweet:
            author = tweet["author"]
        elif "core" in tweet:
            author = tweet["core"]["user_results"]["result"]
        else:
            author = tweet["user"]
        author = self._transform_user(author)

        # GraphQL responses nest the classic fields under 'legacy'
        if "legacy" in tweet:
            tweet = tweet["legacy"]

        tget = tweet.get
        entities = tweet["entities"]
        tdata = {
            "tweet_id"      : text.parse_int(tweet["id_str"]),
            "retweet_id"    : text.parse_int(
                tget("retweeted_status_id_str")),
            "quote_id"      : text.parse_int(
                tget("quoted_by_id_str")),
            "reply_id"      : text.parse_int(
                tget("in_reply_to_status_id_str")),
            "date"          : text.parse_datetime(
                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
            # 'user' is the account being extracted; for retweets/quotes
            # it can differ from the tweet's actual author
            "user"          : self._user or author,
            "author"        : author,
            "lang"          : tweet["lang"],
            "favorite_count": tget("favorite_count"),
            "quote_count"   : tget("quote_count"),
            "reply_count"   : tget("reply_count"),
            "retweet_count" : tget("retweet_count"),
        }

        hashtags = entities.get("hashtags")
        if hashtags:
            tdata["hashtags"] = [t["text"] for t in hashtags]

        mentions = entities.get("user_mentions")
        if mentions:
            tdata["mentions"] = [{
                "id": text.parse_int(u["id_str"]),
                "name": u["screen_name"],
                "nick": u["name"],
            } for u in mentions]

        # expand shortened t.co links inside the tweet text
        content = text.unescape(tget("full_text") or tget("text") or "")
        urls = entities.get("urls")
        if urls:
            for url in urls:
                content = content.replace(url["url"], url["expanded_url"])
        # drop a trailing t.co link (usually the media link itself)
        txt, _, tco = content.rpartition(" ")
        tdata["content"] = txt if tco.startswith("https://t.co/") else content

        if "in_reply_to_screen_name" in tweet:
            tdata["reply_to"] = tweet["in_reply_to_screen_name"]
        if "quoted_by" in tweet:
            tdata["quote_by"] = tweet["quoted_by"]

        return tdata
|
|
|
|
|
|
|
|
|
|
    def _transform_user(self, user):
        """Build a flat metadata dict from a raw user object.

        Results are memoized per user id in self._user_cache, so repeated
        tweets by the same user share one dict.
        """
        uid = user.get("rest_id") or user["id_str"]

        # EAFP cache lookup
        try:
            return self._user_cache[uid]
        except KeyError:
            pass

        # GraphQL responses nest the classic fields under 'legacy'
        if "legacy" in user:
            user = user["legacy"]

        uget = user.get
        entities = user["entities"]

        # insert into the cache before post-processing below
        self._user_cache[uid] = udata = {
            "id"              : text.parse_int(uid),
            "name"            : user["screen_name"],
            "nick"            : user["name"],
            "location"        : uget("location"),
            "date"            : text.parse_datetime(
                uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
            "verified"        : uget("verified", False),
            "profile_banner"  : uget("profile_banner_url", ""),
            # '_normal' suffix marks the downscaled avatar; strip it to
            # get the full-size image URL
            "profile_image"   : uget(
                "profile_image_url_https", "").replace("_normal.", "."),
            "favourites_count": uget("favourites_count"),
            "followers_count" : uget("followers_count"),
            "friends_count"   : uget("friends_count"),
            "listed_count"    : uget("listed_count"),
            "media_count"     : uget("media_count"),
            "statuses_count"  : uget("statuses_count"),
        }

        # expand shortened t.co links inside the profile description
        descr = user["description"]
        urls = entities["description"].get("urls")
        if urls:
            for url in urls:
                descr = descr.replace(url["url"], url["expanded_url"])
        udata["description"] = descr

        if "url" in entities:
            url = entities["url"]["urls"][0]
            udata["url"] = url.get("expanded_url") or url.get("url")

        return udata
|
2020-06-06 23:51:54 +02:00
|
|
|
|
|
2021-03-15 22:55:24 +01:00
|
|
|
|
    def _users_result(self, users):
        """Yield Queue messages for a sequence of user objects.

        The 'users' config option selects which extractor each queued
        URL should be handled by: "timeline" (default), "media",
        "tweets", or a custom format string.
        """
        userfmt = self.config("users")
        if not userfmt or userfmt == "timeline":
            cls = TwitterTimelineExtractor
            fmt = (self.root + "/i/user/{rest_id}").format_map
        elif userfmt == "media":
            cls = TwitterMediaExtractor
            fmt = (self.root + "/id:{rest_id}/media").format_map
        elif userfmt == "tweets":
            cls = TwitterTweetsExtractor
            fmt = (self.root + "/id:{rest_id}/tweets").format_map
        else:
            # custom format string: let pattern matching pick the extractor
            cls = None
            fmt = userfmt.format_map

        for user in users:
            user["_extractor"] = cls
            yield Message.Queue, fmt(user), user
|
2021-03-15 22:55:24 +01:00
|
|
|
|
|
2022-06-12 17:26:51 +02:00
|
|
|
|
def _expand_tweets(self, tweets):
|
|
|
|
|
seen = set()
|
|
|
|
|
for tweet in tweets:
|
|
|
|
|
|
|
|
|
|
if "legacy" in tweet:
|
|
|
|
|
cid = tweet["legacy"]["conversation_id_str"]
|
|
|
|
|
else:
|
|
|
|
|
cid = tweet["conversation_id_str"]
|
|
|
|
|
|
|
|
|
|
if cid not in seen:
|
|
|
|
|
seen.add(cid)
|
|
|
|
|
try:
|
|
|
|
|
yield from self.api.tweet_detail(cid)
|
|
|
|
|
except Exception:
|
|
|
|
|
yield tweet
|
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
def metadata(self):
|
|
|
|
|
"""Return general metadata"""
|
2019-11-30 21:51:08 +01:00
|
|
|
|
return {}
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
|
|
|
|
    def tweets(self):
        """Yield all relevant tweet objects"""
        # Stub: subclasses override this to supply their tweet source.
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
2019-04-07 23:06:57 +02:00
|
|
|
|
def login(self):
|
2021-01-25 14:52:22 +01:00
|
|
|
|
if not self._check_cookies(self.cookienames):
|
|
|
|
|
username, password = self._get_auth_info()
|
|
|
|
|
if username:
|
|
|
|
|
self._update_cookies(self._login_impl(username, password))
|
2019-04-07 23:06:57 +02:00
|
|
|
|
|
|
|
|
|
    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Perform a username/password login and return session cookies.

        Results are cached per username for ~360 days.  Raises
        AuthenticationError when login fails or the account requires
        two-factor authentication.
        """
        self.log.info("Logging in as %s", username)

        token = util.generate_token()
        # start from a clean session and prime cookies via the login page
        self.session.cookies.clear()
        self.request(self.root + "/login")

        url = self.root + "/sessions"
        cookies = {
            "_mb_tk": token,
        }
        data = {
            "redirect_after_login"      : "/",
            "remember_me"               : "1",
            "authenticity_token"        : token,
            "wfa"                       : "1",
            "ui_metrics"                : "{}",
            "session[username_or_email]": username,
            "session[password]"         : password,
        }
        response = self.request(
            url, method="POST", cookies=cookies, data=data)

        if "/account/login_verification" in response.url:
            raise exception.AuthenticationError(
                "Login with two-factor authentication is not supported")

        cookies = {
            cookie.name: cookie.value
            for cookie in self.session.cookies
        }

        # successful login leaves an 'auth_token' cookie behind
        if "/error" in response.url or "auth_token" not in cookies:
            raise exception.AuthenticationError()
        return cookies
|
2018-08-17 20:04:11 +02:00
|
|
|
|
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for a Twitter user timeline"""
    subcategory = "timeline"
    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
    test = (
        ("https://twitter.com/supernaturepics", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        # suspended account (#2216)
        ("https://twitter.com/realDonaldTrump", {
            "exception": exception.NotFoundError,
        }),
        ("https://mobile.twitter.com/supernaturepics?p=i"),
        ("https://www.twitter.com/id:2976459548"),
        ("https://twitter.com/i/user/2976459548"),
        ("https://twitter.com/intent/user?user_id=2976459548"),
        ("https://fxtwitter.com/supernaturepics"),
        ("https://vxtwitter.com/supernaturepics"),
    )

    def __init__(self, match):
        TwitterExtractor.__init__(self, match)
        # group(2) is set for /i/user/<id> and intent/user?user_id=<id> URLs
        user_id = match.group(2)
        if user_id:
            self.user = "id:" + user_id

    def tweets(self):
        """Yield timeline tweets, then continue via search beyond the
        timeline's reach using the last seen tweet id as max_id."""
        # yield initial batch of (media) tweets
        tweet = None
        for tweet in self._select_tweet_source()(self.user):
            yield tweet
        if tweet is None:
            return

        # build search query
        query = "from:{} max_id:{}".format(
            self._user["name"], tweet["rest_id"])
        if self.retweets:
            query += " include:retweets include:nativeretweets"

        if not self.textonly:
            # try to search for media-only tweets
            tweet = None
            for tweet in self.api.search_adaptive(query + (
                    " (filter:images OR"
                    " filter:native_video OR"
                    " card_name:animated_gif)")):
                yield tweet
            if tweet is not None:
                return

        # yield unfiltered search results
        yield from self.api.search_adaptive(query)

    def _select_tweet_source(self):
        """Pick the API endpoint for the initial timeline batch,
        based on the 'strategy' option (auto/tweets/with_replies/media)."""
        strategy = self.config("strategy")
        if strategy is None or strategy == "auto":
            # media timeline misses retweets and text-only tweets
            if self.retweets or self.textonly:
                return self.api.user_tweets
            else:
                return self.api.user_media
        if strategy == "tweets":
            return self.api.user_tweets
        if strategy == "with_replies":
            return self.api.user_tweets_and_replies
        return self.api.user_media
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2022-05-23 18:23:21 +02:00
|
|
|
|
class TwitterTweetsExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's Tweets timeline"""
    subcategory = "tweets"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/tweets", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/tweets#t"),
        ("https://www.twitter.com/id:2976459548/tweets"),
    )

    def tweets(self):
        # delegate to the UserTweets timeline endpoint
        return self.api.user_tweets(self.user)
|
|
|
|
|
|
|
|
|
|
|
2021-09-10 20:40:43 +02:00
|
|
|
|
class TwitterRepliesExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's timeline including replies"""
    subcategory = "replies"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/with_replies", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/with_replies#t"),
        ("https://www.twitter.com/id:2976459548/with_replies"),
    )

    def tweets(self):
        # delegate to the TweetsAndReplies timeline endpoint
        return self.api.user_tweets_and_replies(self.user)
|
2021-09-10 20:40:43 +02:00
|
|
|
|
|
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for Tweets from a user's Media timeline"""
    subcategory = "media"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/media", {
            "range": "1-40",
            "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
        }),
        ("https://mobile.twitter.com/supernaturepics/media#t"),
        ("https://www.twitter.com/id:2976459548/media"),
    )

    def tweets(self):
        # delegate to the UserMedia timeline endpoint
        return self.api.user_media(self.user)
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2019-10-17 18:34:07 +02:00
|
|
|
|
|
2020-06-16 14:27:22 +02:00
|
|
|
|
class TwitterLikesExtractor(TwitterExtractor):
    """Extractor for liked tweets"""
    subcategory = "likes"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
    test = ("https://twitter.com/supernaturepics/likes",)

    def metadata(self):
        # record whose likes are being extracted
        return {"user_likes": self.user}

    def tweets(self):
        return self.api.user_likes(self.user)
|
2020-06-16 14:27:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TwitterBookmarkExtractor(TwitterExtractor):
    """Extractor for bookmarked tweets"""
    subcategory = "bookmark"
    # empty capture group keeps self.user set (bookmarks belong to the
    # logged-in account, not a URL-specified user)
    pattern = BASE_PATTERN + r"/i/bookmarks()"
    test = ("https://twitter.com/i/bookmarks",)

    def tweets(self):
        return self.api.user_bookmarks()
|
2020-06-16 14:27:22 +02:00
|
|
|
|
|
|
|
|
|
|
2020-11-05 22:55:38 +01:00
|
|
|
|
class TwitterListExtractor(TwitterExtractor):
    """Extractor for Twitter lists"""
    subcategory = "list"
    # trailing '/?$' keeps this from matching '/members' URLs
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
    test = ("https://twitter.com/i/lists/784214683683127296", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def tweets(self):
        # self.user holds the numeric list id here
        return self.api.list_latest_tweets_timeline(self.user)
|
2020-11-05 22:55:38 +01:00
|
|
|
|
|
|
|
|
|
|
2020-11-13 06:47:45 +01:00
|
|
|
|
class TwitterListMembersExtractor(TwitterExtractor):
    """Extractor for members of a Twitter list"""
    subcategory = "list-members"
    pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
    test = ("https://twitter.com/i/lists/784214683683127296/members",)

    def items(self):
        # queues one URL per list member instead of yielding tweets
        self.login()
        return self._users_result(TwitterAPI(self).list_members(self.user))
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
|
|
|
|
|
2021-02-22 18:18:33 +01:00
|
|
|
|
class TwitterFollowingExtractor(TwitterExtractor):
    """Extractor for followed users"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
    test = (
        ("https://twitter.com/supernaturepics/following"),
        ("https://www.twitter.com/id:2976459548/following"),
    )

    def items(self):
        # queues one URL per followed account instead of yielding tweets
        self.login()
        return self._users_result(TwitterAPI(self).user_following(self.user))
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
|
|
|
|
|
2019-10-16 18:23:10 +02:00
|
|
|
|
class TwitterSearchExtractor(TwitterExtractor):
    """Extractor for Twitter search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
    test = ("https://twitter.com/search?q=nature", {
        "range": "1-40",
        "count": 40,
        "archive": False,
    })

    def metadata(self):
        # self.user holds the URL-encoded search query here
        return {"search": text.unquote(self.user)}

    def tweets(self):
        """Yield search results; when the query targets exactly one
        'from:' user, resolve that user for 'user' metadata first."""
        query = text.unquote(self.user)

        user = None
        for item in query.split():
            item = item.strip("()")
            if item.startswith("from:"):
                if user:
                    # multiple 'from:' terms — no single user to attribute
                    user = None
                    break
                else:
                    user = item[5:]

        if user is not None:
            try:
                self._user_obj = user = self.api.user_by_screen_name(user)
            except KeyError:
                raise exception.NotFoundError("user")
            self._user = self._transform_user(user)

        return self.api.search_adaptive(query)
|
2022-01-22 20:55:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TwitterEventExtractor(TwitterExtractor):
    """Extractor for Tweets from a Twitter Event"""
    subcategory = "event"
    directory_fmt = ("{category}", "Events",
                     "{event[id]} {event[short_title]}")
    pattern = BASE_PATTERN + r"/i/events/(\d+)"
    test = ("https://twitter.com/i/events/1484669206993903616", {
        "range": "1-20",
        "count": ">5",
    })

    def metadata(self):
        # self.user holds the numeric event id here
        return {"event": self.api.live_event(self.user)}

    def tweets(self):
        return self.api.live_event_timeline(self.user)
|
2019-10-17 18:34:07 +02:00
|
|
|
|
|
2018-08-19 20:36:33 +02:00
|
|
|
|
|
2018-08-17 20:04:11 +02:00
|
|
|
|
class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    # group 1: screen name or 'i/web'; group 2: numeric Tweet ID
    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
    test = (
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
        }),
        # video
        ("https://twitter.com/perrypumas/status/1065692031626829824", {
            "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://twitter.com/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
        }),
        # 'replies' option (#705)
        ("https://twitter.com/i/web/status/1170041925560258560", {
            "options": (("replies", False),),
            "count": 0,
        }),
        # 'replies' to self (#1254)
        ("https://twitter.com/i/web/status/1424882930803908612", {
            "options": (("replies", "self"),),
            "count": 4,
            "keyword": {"user": {
                "description": "re:business email-- rhettaro.bloom@gmail.com "
                               "patreon- http://patreon.com/Princecanary",
                "url": "http://princecanary.tumblr.com",
            }},
        }),
        ("https://twitter.com/i/web/status/1424898916156284928", {
            "options": (("replies", "self"),),
            "count": 0,
        }),
        # "quoted" option (#854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", True),),
            "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
            "count": 8,
        }),
        # quoted tweet (#526, #854)
        ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
            "count": 4,
        }),
        # TwitPic embeds (#579)
        ("https://twitter.com/i/web/status/112900228289540096", {
            "options": (("twitpic", True), ("cards", False)),
            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
            "count": 3,
        }),
        # Nitter tweet (#890)
        ("https://nitter.net/ed1conf/status/1163841619336007680", {
            "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Twitter card (#1005)
        ("https://twitter.com/billboard/status/1306599586602135555", {
            "options": (("cards", True),),
            "pattern": r"https://pbs.twimg.com/card_img/\d+/",
        }),
        # unified_card with image_carousel_website
        ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
            "options": (("cards", True),),
            "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
            "count": 6,
        }),
        # unified_card without type
        ("https://twitter.com/i/web/status/1466183847628865544", {
            "count": 0,
        }),
        # original retweets (#1026)
        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
            "options": (("retweets", "original"),),
            "count": 2,
            "keyword": {
                "tweet_id"  : 1296296016002547713,
                "retweet_id": 1296296016002547713,
                "date"      : "dt:2020-08-20 04:00:28",
            },
        }),
        # all Tweets from a 'conversation' (#1319)
        ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "options": (("conversations", True),),
            "count": 5,
        }),
        # retweet with missing media entities (#1555)
        ("https://twitter.com/morino_ya/status/1392763691599237121", {
            "options": (("retweets", True),),
            "count": 4,
        }),
        # deleted quote tweet (#2225)
        ("https://twitter.com/i/web/status/1460044411165888515", {
            "count": 0,
        }),
        # "Misleading" content
        ("https://twitter.com/i/web/status/1486373748911575046", {
            "count": 4,
        }),
        # age-restricted (#2354)
        ("https://twitter.com/mightbecursed/status/1492954264909479936", {
            "options": (("syndication", True),),
            "count": 1,
        }),
        # media alt texts / descriptions (#2617)
        ("https://twitter.com/my0nruri/status/1528379296041299968", {
            "keyword": {"description": "oc"}
        }),
    )

    def __init__(self, match):
        TwitterExtractor.__init__(self, match)
        # the numeric Tweet ID from the second capture group
        self.tweet_id = match.group(2)

    def tweets(self):
        """Return the Tweet(s) this URL refers to.

        With the 'conversations' option enabled, the whole conversation
        is returned; otherwise only the selected Tweet (or the Tweet it
        retweeted) plus any quoted Tweets, followed transitively.
        """
        if self.config("conversations", False):
            return self.api.tweet_detail(self.tweet_id)

        tweets = []
        tweet_id = self.tweet_id
        for tweet in self.api.tweet_detail(tweet_id):
            if tweet["rest_id"] == tweet_id or \
                    tweet.get("_retweet_id_str") == tweet_id:
                tweets.append(tweet)

                # follow the chain of quoted Tweets, if any
                tweet_id = tweet["legacy"].get("quoted_status_id_str")
                if not tweet_id:
                    break
        return tweets
|
2020-01-04 23:46:29 +01:00
|
|
|
|
|
|
|
|
|
|
2021-04-02 02:45:23 +02:00
|
|
|
|
class TwitterImageExtractor(Extractor):
    """Extractor for a single image from a pbs.twimg.com media URL"""
    category = "twitter"
    subcategory = "image"
    # group 1: media ID; group 2: file format/extension
    pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
    test = (
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
            "options": (("size", "4096x4096,orig"),),
            "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
        }),
        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
    )

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.id, self.fmt = match.groups()
        # borrow TwitterExtractor's size-selection setup without
        # inheriting from it (sets self._size_image and fallbacks)
        TwitterExtractor._init_sizes(self)

    def items(self):
        # base URL; the size name ('orig', '4096x4096', ...) is appended
        base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
            self.id, self.fmt)

        data = {
            "filename": self.id,
            "extension": self.fmt,
            "_fallback": TwitterExtractor._image_fallback(self, base),
        }

        yield Message.Directory, data
        yield Message.Url, base + self._size_image, data
|
2021-04-02 02:45:23 +02:00
|
|
|
|
|
|
|
|
|
|
2020-06-03 20:51:29 +02:00
|
|
|
|
class TwitterAPI():
|
|
|
|
|
|
|
|
|
|
    def __init__(self, extractor):
        """Prepare request headers, default parameters, and auth state.

        Uses the extractor's session cookies to decide between an
        authenticated session ('auth_token' cookie present) and a
        guest session (guest token is requested and stored).
        """
        self.extractor = extractor

        self.root = "https://twitter.com/i/api"
        # static web-client Bearer token; the per-session token
        # headers are filled in further below
        self.headers = {
            "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
                             "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
                             "4FA33AGWWjCpTnA",
            "x-guest-token": None,
            "x-twitter-auth-type": None,
            "x-twitter-client-language": "en",
            "x-twitter-active-user": "yes",
            "x-csrf-token": None,
            "Referer": "https://twitter.com/",
        }
        # default query parameters for the legacy v1.1/v2 endpoints
        self.params = {
            "include_profile_interstitial_type": "1",
            "include_blocking": "1",
            "include_blocked_by": "1",
            "include_followed_by": "1",
            "include_want_retweets": "1",
            "include_mute_edge": "1",
            "include_can_dm": "1",
            "include_can_media_tag": "1",
            "include_ext_has_nft_avatar": "1",
            "skip_status": "1",
            "cards_platform": "Web-12",
            "include_cards": "1",
            "include_ext_alt_text": "true",
            "include_quote_count": "true",
            "include_reply_count": "1",
            "tweet_mode": "extended",
            "include_entities": "true",
            "include_user_entities": "true",
            "include_ext_media_color": "true",
            "include_ext_media_availability": "true",
            "include_ext_sensitive_media_warning": "true",
            "send_error_codes": "true",
            "simple_quoted_tweet": "true",
            "count": "100",
            "cursor": None,
            "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
                   "voiceInfo,superFollowMetadata",
        }
        # default variables shared by all GraphQL queries
        self.variables = {
            "includePromotedContent": False,
            "withSuperFollowsUserFields": True,
            "withBirdwatchPivots": False,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": True,
            "withClientEventToken": False,
            "withBirdwatchNotes": False,
            "withVoice": True,
            "withV2Timeline": False,
            "__fs_interactive_text": False,
            "__fs_dont_mention_me_view_api_enabled": False,
        }

        # flag consumed by _process_tombstone for NSFW notices
        self._nsfw_warning = True
        self._syndication = extractor.config("syndication")
        # compact JSON encoder for GraphQL 'variables' parameters
        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode

        cookies = extractor.session.cookies
        cookiedomain = extractor.cookiedomain

        # CSRF token: taken from the 'ct0' cookie by default,
        # otherwise a random token is generated and stored
        csrf = extractor.config("csrf")
        if csrf is None or csrf == "cookies":
            csrf_token = cookies.get("ct0", domain=cookiedomain)
        else:
            csrf_token = None
        if not csrf_token:
            csrf_token = util.generate_token()
            cookies.set("ct0", csrf_token, domain=cookiedomain)
        self.headers["x-csrf-token"] = csrf_token

        if cookies.get("auth_token", domain=cookiedomain):
            # logged in
            self.headers["x-twitter-auth-type"] = "OAuth2Session"
        else:
            # guest
            guest_token = self._guest_token()
            cookies.set("gt", guest_token, domain=cookiedomain)
            self.headers["x-guest-token"] = guest_token
|
|
|
|
|
|
|
|
|
|
def tweet_detail(self, tweet_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"focalTweetId": tweet_id,
|
|
|
|
|
"with_rux_injections": False,
|
|
|
|
|
"withCommunity": True,
|
|
|
|
|
"withQuickPromoteEligibilityTweetFields": True,
|
|
|
|
|
"withBirdwatchNotes": False,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("threaded_conversation_with_injections",))
|
|
|
|
|
|
|
|
|
|
def user_tweets(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/WZT7sCTrLvSOaWOXLDsWbQ/UserTweets"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
"withQuickPromoteEligibilityTweetFields": True,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_tweets_and_replies(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/t4wEKVulW4Mbv1P0kgxTEw/UserTweetsAndReplies"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
"withCommunity": True,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_media(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_likes(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/9MSTt44HoGjVFSg_u3rHDw/Likes"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(endpoint, variables)
|
|
|
|
|
|
|
|
|
|
def user_bookmarks(self):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/uKP9v_I31k0_VSBmlpq2Xg/Bookmarks"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("bookmark_timeline", "timeline"))
|
|
|
|
|
|
|
|
|
|
def list_latest_tweets_timeline(self, list_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline"
|
2022-01-21 23:34:41 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"listId": list_id,
|
|
|
|
|
"count": 100,
|
|
|
|
|
}
|
|
|
|
|
return self._pagination_tweets(
|
|
|
|
|
endpoint, variables, ("list", "tweets_timeline", "timeline"))
|
|
|
|
|
|
2022-01-22 20:55:50 +01:00
|
|
|
|
def search_adaptive(self, query):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/2/search/adaptive.json"
|
2022-01-22 20:55:50 +01:00
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["q"] = query
|
|
|
|
|
params["tweet_search_mode"] = "live"
|
|
|
|
|
params["query_source"] = "typed_query"
|
|
|
|
|
params["pc"] = "1"
|
|
|
|
|
params["spelling_corrections"] = "1"
|
|
|
|
|
return self._pagination_legacy(endpoint, params)
|
|
|
|
|
|
|
|
|
|
def live_event_timeline(self, event_id):
|
|
|
|
|
endpoint = "/2/live_event/timeline/{}.json".format(event_id)
|
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["timeline_id"] = "recap"
|
|
|
|
|
params["urt"] = "true"
|
|
|
|
|
params["get_annotations"] = "true"
|
|
|
|
|
return self._pagination_legacy(endpoint, params)
|
|
|
|
|
|
|
|
|
|
def live_event(self, event_id):
|
|
|
|
|
endpoint = "/1.1/live_event/1/{}/timeline.json".format(event_id)
|
|
|
|
|
params = self.params.copy()
|
|
|
|
|
params["count"] = "0"
|
|
|
|
|
params["urt"] = "true"
|
|
|
|
|
return (self._call(endpoint, params)
|
|
|
|
|
["twitter_objects"]["live_events"][event_id])
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2020-11-05 22:55:38 +01:00
|
|
|
|
def list_by_rest_id(self, list_id):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId"
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps({
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"listId": list_id,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
|
|
|
|
})}
|
2020-11-05 22:55:38 +01:00
|
|
|
|
try:
|
|
|
|
|
return self._call(endpoint, params)["data"]["list"]
|
|
|
|
|
except KeyError:
|
|
|
|
|
raise exception.NotFoundError("list")
|
|
|
|
|
|
2021-02-22 18:18:33 +01:00
|
|
|
|
def list_members(self, list_id):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/snESM0DPs3c7M1SBm4rvVw/ListMembers"
|
2021-02-22 18:18:33 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"listId": list_id,
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"count": 100,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
2021-02-22 18:18:33 +01:00
|
|
|
|
}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
return self._pagination_users(
|
|
|
|
|
endpoint, variables, ("list", "members_timeline", "timeline"))
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
|
|
|
|
def user_following(self, screen_name):
|
2022-03-02 23:05:31 +01:00
|
|
|
|
endpoint = "/graphql/mIwX8GogcobVlRwlgpHNYA/Following"
|
2021-02-22 18:18:33 +01:00
|
|
|
|
variables = {
|
|
|
|
|
"userId": self._user_id_by_screen_name(screen_name),
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"count": 100,
|
2021-02-22 18:18:33 +01:00
|
|
|
|
}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
return self._pagination_users(endpoint, variables)
|
2021-02-22 18:18:33 +01:00
|
|
|
|
|
2022-02-01 18:24:03 +01:00
|
|
|
|
def user_by_rest_id(self, rest_id):
|
|
|
|
|
endpoint = "/graphql/I5nvpI91ljifos1Y3Lltyg/UserByRestId"
|
|
|
|
|
params = {"variables": self._json_dumps({
|
|
|
|
|
"userId": rest_id,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
|
|
|
|
})}
|
|
|
|
|
return self._call(endpoint, params)["data"]["user"]["result"]
|
|
|
|
|
|
2020-06-03 20:51:29 +02:00
|
|
|
|
def user_by_screen_name(self, screen_name):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName"
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps({
|
2022-01-21 23:34:41 +01:00
|
|
|
|
"screen_name": screen_name,
|
|
|
|
|
"withSafetyModeUserFields": True,
|
|
|
|
|
"withSuperFollowsUserFields": True,
|
2022-01-23 01:44:55 +01:00
|
|
|
|
})}
|
2022-01-23 17:31:07 +01:00
|
|
|
|
return self._call(endpoint, params)["data"]["user"]["result"]
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2020-09-08 22:56:52 +02:00
|
|
|
|
    def _user_id_by_screen_name(self, screen_name):
        """Resolve 'screen_name' to a numeric user ID.

        Accepts a regular screen name or an 'id:<number>' string.
        As a side effect, the fetched user object is stored on the
        extractor ('_user_obj' / '_user') for later metadata use.

        Raises NotFoundError when the user cannot be resolved.
        """
        if screen_name.startswith("id:"):
            user_id = screen_name[3:]
            user = self.user_by_rest_id(user_id)

        else:
            # empty placeholder so the checks below are safe
            # even if user_by_screen_name() itself raises KeyError
            user = ()
            try:
                user = self.user_by_screen_name(screen_name)
                user_id = user["rest_id"]
            except KeyError:
                if "unavailable_message" in user:
                    raise exception.NotFoundError("{} ({})".format(
                        user["unavailable_message"].get("text"),
                        user.get("reason")), False)
                else:
                    raise exception.NotFoundError("user")

        extr = self.extractor
        extr._user_obj = user
        extr._user = extr._transform_user(user)

        return user_id
|
2020-09-08 22:56:52 +02:00
|
|
|
|
|
2020-06-18 00:28:38 +02:00
|
|
|
|
@cache(maxage=3600)
|
|
|
|
|
def _guest_token(self):
|
2020-12-28 22:05:48 +01:00
|
|
|
|
root = "https://api.twitter.com"
|
|
|
|
|
endpoint = "/1.1/guest/activate.json"
|
2021-07-01 14:35:53 +02:00
|
|
|
|
return str(self._call(endpoint, None, root, "POST")["guest_token"])
|
2020-06-18 00:28:38 +02:00
|
|
|
|
|
2022-02-02 18:37:19 +01:00
|
|
|
|
    def _call(self, endpoint, params, root=None, method="GET"):
        """Send an API request and return the decoded JSON response.

        Retries when rate-limited (HTTP 429), waiting until the time
        given by 'x-rate-limit-reset' (or 60 seconds as fallback).
        Any other error status raises StopExtraction with the error
        message(s) extracted from the response.
        """
        if root is None:
            root = self.root

        while True:
            response = self.extractor.request(
                root + endpoint, method=method, params=params,
                headers=self.headers, fatal=None)

            # update 'x-csrf-token' header (#1170)
            csrf_token = response.cookies.get("ct0")
            if csrf_token:
                self.headers["x-csrf-token"] = csrf_token

            if response.status_code < 400:
                # success
                return response.json()

            if response.status_code == 429:
                # rate limit exceeded
                until = response.headers.get("x-rate-limit-reset")
                seconds = None if until else 60
                self.extractor.wait(until=until, seconds=seconds)
                continue

            # error: try to extract a meaningful message from the body
            try:
                data = response.json()
                errors = ", ".join(e["message"] for e in data["errors"])
            except ValueError:
                # response body is not JSON
                errors = response.text
            except Exception:
                # unexpected 'errors' structure
                errors = data.get("errors", "")

            raise exception.StopExtraction(
                "%s %s (%s)", response.status_code, response.reason, errors)
|
2020-06-03 20:51:29 +02:00
|
|
|
|
|
2022-01-22 20:55:50 +01:00
|
|
|
|
    def _pagination_legacy(self, endpoint, params):
        """Yield Tweets from a legacy (v1.1/v2) timeline endpoint.

        Follows 'cursor' pagination, resolves retweets and quoted
        Tweets from the 'globalObjects' lookup tables, and attaches
        'user'/'author' objects to each yielded Tweet dict.
        """
        original_retweets = (self.extractor.retweets == "original")

        while True:
            cursor = tweet = None
            data = self._call(endpoint, params)

            instr = data["timeline"]["instructions"]
            if not instr:
                return
            tweet_ids = []
            tweets = data["globalObjects"]["tweets"]
            users = data["globalObjects"]["users"]

            # collect tweet IDs and cursor value
            for entry in instr[0]["addEntries"]["entries"]:
                entry_startswith = entry["entryId"].startswith

                if entry_startswith(("tweet-", "sq-I-t-")):
                    tweet_ids.append(
                        entry["content"]["item"]["content"]["tweet"]["id"])

                elif entry_startswith("homeConversation-"):
                    # reversed so Tweets come out in chronological order
                    tweet_ids.extend(
                        entry["content"]["timelineModule"]["metadata"]
                        ["conversationMetadata"]["allTweetIds"][::-1])

                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
                    cursor = entry["content"]["operation"]["cursor"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor["value"]

                elif entry_startswith("conversationThread-"):
                    # 'entryId' is 'tweet-<id>'; strip the prefix
                    tweet_ids.extend(
                        item["entryId"][6:]
                        for item in entry["content"]["timelineModule"]["items"]
                        if item["entryId"].startswith("tweet-")
                    )

            # process tweets
            for tweet_id in tweet_ids:
                try:
                    tweet = tweets[tweet_id]
                except KeyError:
                    self.extractor.log.debug("Skipping %s (deleted)", tweet_id)
                    continue

                if "retweeted_status_id_str" in tweet:
                    retweet = tweets.get(tweet["retweeted_status_id_str"])
                    if original_retweets:
                        # yield the retweeted Tweet instead of the retweet
                        if not retweet:
                            continue
                        retweet["retweeted_status_id_str"] = retweet["id_str"]
                        retweet["_retweet_id_str"] = tweet["id_str"]
                        tweet = retweet
                    elif retweet:
                        tweet["author"] = users[retweet["user_id_str"]]
                        # copy media entities missing from the retweet (#1555)
                        if "extended_entities" in retweet and \
                                "extended_entities" not in tweet:
                            tweet["extended_entities"] = \
                                retweet["extended_entities"]
                tweet["user"] = users[tweet["user_id_str"]]
                yield tweet

                if "quoted_status_id_str" in tweet:
                    quoted = tweets.get(tweet["quoted_status_id_str"])
                    if quoted:
                        # copy, since the same Tweet can be quoted repeatedly
                        quoted = quoted.copy()
                        quoted["author"] = users[quoted["user_id_str"]]
                        quoted["quoted_by"] = tweet["user"]["screen_name"]
                        quoted["quoted_by_id_str"] = tweet["id_str"]
                        yield quoted

            # update cursor value
            if "replaceEntry" in instr[-1] :
                cursor = (instr[-1]["replaceEntry"]["entry"]
                          ["content"]["operation"]["cursor"]["value"])

            if not cursor or not tweet:
                return
            params["cursor"] = cursor
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
2022-01-21 23:34:41 +01:00
|
|
|
|
    def _pagination_tweets(self, endpoint, variables, path=None):
        """Yield Tweets from a GraphQL timeline endpoint.

        'path' selects where the timeline instructions live inside the
        response (default: the user timeline location). Handles pinned
        Tweets, retweets, quoted Tweets, tombstones, and cursor-based
        pagination. On a missing timeline, tries to give a meaningful
        error (blocked / protected account) before giving up.
        """
        extr = self.extractor
        variables.update(self.variables)
        original_retweets = (extr.retweets == "original")
        pinned_tweet = extr.pinned

        while True:
            params = {"variables": self._json_dumps(variables)}
            data = self._call(endpoint, params)["data"]

            try:
                if path is None:
                    instructions = (data["user"]["result"]["timeline"]
                                    ["timeline"]["instructions"])
                else:
                    instructions = data
                    for key in path:
                        instructions = instructions[key]
                    instructions = instructions["instructions"]

                for instr in instructions:
                    if instr.get("type") == "TimelineAddEntries":
                        entries = instr["entries"]
                        break
                else:
                    raise KeyError()

            except LookupError:
                extr.log.debug(data)

                # no timeline in the response; figure out why
                user = extr._user_obj
                if user:
                    user = user["legacy"]
                    if user.get("blocked_by"):
                        # optionally retry as guest when the logged-in
                        # account is blocked by this user
                        if self.headers["x-twitter-auth-type"] and \
                                extr.config("logout"):
                            guest_token = self._guest_token()
                            extr.session.cookies.set(
                                "gt", guest_token, domain=extr.cookiedomain)
                            extr._cookiefile = None
                            del extr.session.cookies["auth_token"]
                            self.headers["x-guest-token"] = guest_token
                            self.headers["x-twitter-auth-type"] = None
                            extr.log.info("Retrying API request as guest")
                            continue
                        raise exception.AuthorizationError(
                            "{} blocked your account".format(
                                user["screen_name"]))
                    elif user.get("protected"):
                        raise exception.AuthorizationError(
                            "{}'s Tweets are protected".format(
                                user["screen_name"]))

                raise exception.StopExtraction(
                    "Unable to retrieve Tweets from this timeline")

            tweets = []
            tweet = cursor = None

            # handle a pinned Tweet only on the first page
            if pinned_tweet:
                pinned_tweet = False
                if instructions[-1]["type"] == "TimelinePinEntry":
                    tweets.append(instructions[-1]["entry"])

            # collect Tweet entries and the bottom cursor
            for entry in entries:
                esw = entry["entryId"].startswith

                if esw("tweet-"):
                    tweets.append(entry)
                elif esw("homeConversation-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("conversationthread-"):
                    tweets.extend(entry["content"]["items"])
                elif esw("tombstone-"):
                    # normalize tombstones to the regular entry layout
                    item = entry["content"]["itemContent"]
                    item["tweet_results"] = \
                        {"result": {"tombstone": item["tombstoneInfo"]}}
                    tweets.append(entry)
                elif esw("cursor-bottom-"):
                    cursor = entry["content"]
                    if "itemContent" in cursor:
                        cursor = cursor["itemContent"]
                    if not cursor.get("stopOnEmptyResponse", True):
                        # keep going even if there are no tweets
                        tweet = True
                    cursor = cursor.get("value")

            for entry in tweets:
                try:
                    tweet = ((entry.get("content") or entry["item"])
                             ["itemContent"]["tweet_results"]["result"])
                    if "tombstone" in tweet:
                        tweet = self._process_tombstone(
                            entry, tweet["tombstone"])
                        if not tweet:
                            continue
                    if "tweet" in tweet:
                        tweet = tweet["tweet"]
                    legacy = tweet["legacy"]
                except KeyError:
                    extr.log.debug(
                        "Skipping %s (deleted)",
                        (entry.get("entryId") or "").rpartition("-")[2])
                    continue

                if "retweeted_status_result" in legacy:
                    retweet = legacy["retweeted_status_result"]["result"]
                    if original_retweets:
                        # yield the retweeted Tweet instead of the retweet
                        try:
                            retweet["legacy"]["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            retweet["_retweet_id_str"] = tweet["rest_id"]
                            tweet = retweet
                        except KeyError:
                            continue
                    else:
                        try:
                            legacy["retweeted_status_id_str"] = \
                                retweet["rest_id"]
                            tweet["author"] = \
                                retweet["core"]["user_results"]["result"]
                            # copy media entities missing from the
                            # retweet (#1555)
                            if "extended_entities" in retweet["legacy"] and \
                                    "extended_entities" not in legacy:
                                legacy["extended_entities"] = \
                                    retweet["legacy"]["extended_entities"]
                        except KeyError:
                            pass

                yield tweet

                if "quoted_status_result" in tweet:
                    try:
                        quoted = tweet["quoted_status_result"]["result"]
                        quoted["legacy"]["quoted_by"] = (
                            tweet["core"]["user_results"]["result"]
                            ["legacy"]["screen_name"])
                        quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
                        yield quoted
                    except KeyError:
                        extr.log.debug(
                            "Skipping quote of %s (deleted)",
                            tweet.get("rest_id"))
                        continue

            if not tweet or not cursor:
                return
            variables["cursor"] = cursor
|
|
|
|
|
|
|
|
|
|
def _pagination_users(self, endpoint, variables, path=None):
|
2022-01-23 01:44:55 +01:00
|
|
|
|
variables.update(self.variables)
|
|
|
|
|
|
2020-11-13 06:47:45 +01:00
|
|
|
|
while True:
|
|
|
|
|
cursor = entry = stop = None
|
2022-01-23 01:44:55 +01:00
|
|
|
|
params = {"variables": self._json_dumps(variables)}
|
2022-01-21 23:34:41 +01:00
|
|
|
|
data = self._call(endpoint, params)["data"]
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
2022-01-22 23:09:45 +01:00
|
|
|
|
try:
|
|
|
|
|
if path is None:
|
|
|
|
|
instructions = (data["user"]["result"]["timeline"]
|
|
|
|
|
["timeline"]["instructions"])
|
|
|
|
|
else:
|
|
|
|
|
for key in path:
|
|
|
|
|
data = data[key]
|
|
|
|
|
instructions = data["instructions"]
|
|
|
|
|
except KeyError:
|
|
|
|
|
return
|
2020-11-13 06:47:45 +01:00
|
|
|
|
|
|
|
|
|
for instr in instructions:
|
|
|
|
|
if instr["type"] == "TimelineAddEntries":
|
|
|
|
|
for entry in instr["entries"]:
|
|
|
|
|
if entry["entryId"].startswith("user-"):
|
2022-01-21 23:34:41 +01:00
|
|
|
|
user = (entry["content"]["itemContent"]
|
|
|
|
|
["user_results"]["result"])
|
|
|
|
|
if "rest_id" in user:
|
|
|
|
|
yield user
|
2020-11-13 06:47:45 +01:00
|
|
|
|
elif entry["entryId"].startswith("cursor-bottom-"):
|
|
|
|
|
cursor = entry["content"]["value"]
|
|
|
|
|
elif instr["type"] == "TimelineTerminateTimeline":
|
|
|
|
|
if instr["direction"] == "Bottom":
|
|
|
|
|
stop = True
|
|
|
|
|
|
|
|
|
|
if stop or not cursor or not entry:
|
|
|
|
|
return
|
|
|
|
|
variables["cursor"] = cursor
|
2022-03-03 01:51:52 +01:00
|
|
|
|
|
2022-03-31 20:31:58 +02:00
|
|
|
|
def _process_tombstone(self, entry, tombstone):
|
2022-03-03 01:51:52 +01:00
|
|
|
|
text = (tombstone.get("richText") or tombstone["text"])["text"]
|
2022-03-31 20:31:58 +02:00
|
|
|
|
tweet_id = entry["entryId"].rpartition("-")[2]
|
|
|
|
|
|
|
|
|
|
if text.startswith("Age-restricted"):
|
|
|
|
|
if self._syndication:
|
|
|
|
|
return self._syndication_tweet(tweet_id)
|
|
|
|
|
elif self._nsfw_warning:
|
|
|
|
|
self._nsfw_warning = False
|
|
|
|
|
self.extractor.log.warning('"%s"', text)
|
|
|
|
|
|
|
|
|
|
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
|
|
|
|
|
|
|
|
|
|
def _syndication_tweet(self, tweet_id):
|
|
|
|
|
tweet = self.extractor.request(
|
|
|
|
|
"https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json()
|
|
|
|
|
|
|
|
|
|
tweet["user"]["description"] = ""
|
|
|
|
|
tweet["user"]["entities"] = {"description": {}}
|
2022-04-15 20:49:26 +02:00
|
|
|
|
tweet["user_id_str"] = tweet["user"]["id_str"]
|
|
|
|
|
|
|
|
|
|
if tweet["id_str"] != tweet_id:
|
|
|
|
|
tweet["retweeted_status_id_str"] = tweet["id_str"]
|
|
|
|
|
tweet["id_str"] = retweet_id = tweet_id
|
|
|
|
|
else:
|
|
|
|
|
retweet_id = None
|
2022-03-31 20:31:58 +02:00
|
|
|
|
|
|
|
|
|
if "video" in tweet:
|
|
|
|
|
video = tweet["video"]
|
2022-04-11 17:06:10 +02:00
|
|
|
|
video["variants"] = (max(
|
|
|
|
|
(v for v in video["variants"] if v["type"] == "video/mp4"),
|
2022-04-15 20:49:26 +02:00
|
|
|
|
key=lambda v: text.parse_int(
|
|
|
|
|
v["src"].split("/")[-2].partition("x")[0])
|
2022-04-11 17:06:10 +02:00
|
|
|
|
),)
|
2022-03-31 20:31:58 +02:00
|
|
|
|
video["variants"][0]["url"] = video["variants"][0]["src"]
|
|
|
|
|
tweet["extended_entities"] = {"media": [{
|
|
|
|
|
"video_info" : video,
|
|
|
|
|
"original_info": {"width" : 0, "height": 0},
|
|
|
|
|
}]}
|
|
|
|
|
elif "photos" in tweet:
|
|
|
|
|
for p in tweet["photos"]:
|
|
|
|
|
p["media_url_https"] = p["url"]
|
|
|
|
|
p["original_info"] = {
|
|
|
|
|
"width" : p["width"],
|
|
|
|
|
"height": p["height"],
|
|
|
|
|
}
|
|
|
|
|
tweet["extended_entities"] = {"media": tweet["photos"]}
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
"rest_id": tweet["id_str"],
|
|
|
|
|
"legacy" : tweet,
|
|
|
|
|
"user" : tweet["user"],
|
2022-04-15 20:49:26 +02:00
|
|
|
|
"_retweet_id_str": retweet_id,
|
2022-03-31 20:31:58 +02:00
|
|
|
|
}
|