2022-11-15 11:44:16 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-01-03 15:14:23 +01:00
|
|
|
# Copyright 2022-2023 Mike Fährmann
|
2022-11-15 11:44:16 +01:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for Nitter instances"""
|
|
|
|
|
|
|
|
from .common import BaseExtractor, Message
|
|
|
|
from .. import text
|
2022-11-26 19:56:28 +01:00
|
|
|
import binascii
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors"""
    basecategory = "nitter"
    # one directory per user; files are named and archived by
    # tweet ID plus per-tweet attachment index
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"
|
|
|
|
|
|
|
|
    def __init__(self, match):
        # NOTE(review): set before BaseExtractor.__init__() — presumably the
        # base initialization needs the cookie domain; confirm in
        # common.BaseExtractor before reordering these statements
        self.cookies_domain = self.root.partition("://")[2]
        BaseExtractor.__init__(self, match)

        # the URL patterns capture the user name (or ID selector) in their
        # last group and a numeric user ID in the group after it
        lastindex = match.lastindex
        self.user = match.group(lastindex)
        self.user_id = match.group(lastindex + 1)
        self.user_obj = None  # profile dict; filled lazily by _pagination()
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
def items(self):
|
2022-11-25 19:53:28 +01:00
|
|
|
retweets = self.config("retweets", False)
|
2022-11-24 22:56:01 +01:00
|
|
|
videos = self.config("videos", True)
|
2022-11-25 00:45:32 +01:00
|
|
|
if videos:
|
|
|
|
ytdl = (videos == "ytdl")
|
|
|
|
videos = True
|
2023-07-21 22:38:39 +02:00
|
|
|
self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain)
|
2022-11-24 22:56:01 +01:00
|
|
|
|
2022-11-25 20:50:38 +01:00
|
|
|
for tweet in self.tweets():
|
2022-11-15 11:44:16 +01:00
|
|
|
|
2022-11-25 19:53:28 +01:00
|
|
|
if not retweets and tweet["retweet"]:
|
|
|
|
self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
|
|
|
|
continue
|
|
|
|
|
2022-11-24 22:56:01 +01:00
|
|
|
attachments = tweet.pop("_attach", "")
|
|
|
|
if attachments:
|
|
|
|
files = []
|
|
|
|
append = files.append
|
|
|
|
|
|
|
|
for url in text.extract_iter(
|
|
|
|
attachments, 'href="', '"'):
|
2022-11-26 19:56:28 +01:00
|
|
|
|
2023-03-25 13:09:24 +01:00
|
|
|
if "/i/broadcasts/" in url:
|
|
|
|
self.log.debug(
|
|
|
|
"Skipping unsupported broadcast '%s'", url)
|
|
|
|
continue
|
|
|
|
|
2022-11-26 19:56:28 +01:00
|
|
|
if "/enc/" in url:
|
|
|
|
name = binascii.a2b_base64(url.rpartition(
|
|
|
|
"/")[2]).decode().rpartition("/")[2]
|
|
|
|
else:
|
|
|
|
name = url.rpartition("%2F")[2]
|
|
|
|
|
2022-11-24 22:56:01 +01:00
|
|
|
if url[0] == "/":
|
|
|
|
url = self.root + url
|
2023-03-09 23:30:15 +01:00
|
|
|
file = {"url": url, "_http_retry": _retry_on_404}
|
2022-11-25 00:34:45 +01:00
|
|
|
file["filename"], _, file["extension"] = \
|
|
|
|
name.rpartition(".")
|
|
|
|
append(file)
|
2022-11-24 22:56:01 +01:00
|
|
|
|
|
|
|
if videos and not files:
|
|
|
|
if ytdl:
|
|
|
|
append({
|
|
|
|
"url": "ytdl:{}/i/status/{}".format(
|
|
|
|
self.root, tweet["tweet_id"]),
|
|
|
|
"extension": None,
|
|
|
|
})
|
|
|
|
else:
|
|
|
|
for url in text.extract_iter(
|
|
|
|
attachments, 'data-url="', '"'):
|
2022-11-26 19:56:28 +01:00
|
|
|
|
|
|
|
if "/enc/" in url:
|
|
|
|
name = binascii.a2b_base64(url.rpartition(
|
|
|
|
"/")[2]).decode().rpartition("/")[2]
|
|
|
|
else:
|
|
|
|
name = url.rpartition("%2F")[2]
|
|
|
|
|
2022-11-24 22:56:01 +01:00
|
|
|
if url[0] == "/":
|
|
|
|
url = self.root + url
|
2022-11-25 00:34:45 +01:00
|
|
|
append({
|
|
|
|
"url" : "ytdl:" + url,
|
|
|
|
"filename" : name.rpartition(".")[0],
|
|
|
|
"extension": "mp4",
|
|
|
|
})
|
2023-04-14 19:00:56 +02:00
|
|
|
|
|
|
|
for url in text.extract_iter(
|
|
|
|
attachments, '<source src="', '"'):
|
2023-11-27 17:28:06 +01:00
|
|
|
if url[0] == "/":
|
|
|
|
url = self.root + url
|
2023-04-14 19:00:56 +02:00
|
|
|
append(text.nameext_from_url(url, {"url": url}))
|
|
|
|
|
2022-11-15 11:44:16 +01:00
|
|
|
else:
|
2022-11-24 22:56:01 +01:00
|
|
|
files = ()
|
|
|
|
tweet["count"] = len(files)
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
yield Message.Directory, tweet
|
2022-11-24 22:56:01 +01:00
|
|
|
for tweet["num"], file in enumerate(files, 1):
|
|
|
|
url = file["url"]
|
|
|
|
file.update(tweet)
|
|
|
|
yield Message.Url, url, file
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
    def _tweet_from_html(self, html):
        """Build a tweet dict from a timeline entry's HTML markup.

        'extr' consumes 'html' sequentially, so the order of the extr()
        calls below must match the element order in nitter's markup.
        """
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        extr('<span class="tweet-date', '')  # advance past the date span
        link = extr('href="', '"')  # tweet permalink; carries the ID
        return {
            "author" : author,
            # fall back to the author when no profile has been parsed yet
            "user" : self.user_obj or author,
            "date" : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content": extr('class="tweet-content', "</div").partition(">")[2],
            # raw attachment markup; popped and processed in items()
            "_attach" : extr('class="attachments', 'class="tweet-stats'),
            "comments": text.parse_int(extr(
                'class="icon-comment', '</div>').rpartition(">")[2]),
            "retweets": text.parse_int(extr(
                'class="icon-retweet', '</div>').rpartition(">")[2]),
            "quotes" : text.parse_int(extr(
                'class="icon-quote', '</div>').rpartition(">")[2]),
            "likes" : text.parse_int(extr(
                'class="icon-heart', '</div>').rpartition(">")[2]),
            # True when this entry is a retweet shown on someone's timeline
            "retweet" : 'class="retweet-header' in html,
            "quoted" : False,
        }
|
|
|
|
|
|
|
|
    def _tweet_from_quote(self, html):
        """Build a tweet dict from quoted-tweet HTML markup.

        Mirrors _tweet_from_html(), but quotes use different content
        markers and expose no engagement stats. extr() consumes 'html'
        sequentially, so call order must match the markup.
        """
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        extr('<span class="tweet-date', '')  # advance past the date span
        link = extr('href="', '"')  # tweet permalink; carries the ID
        return {
            "author" : author,
            "user" : self.user_obj or author,
            "date" : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content" : extr('class="quote-text', "</div").partition(">")[2],
            # raw attachment markup; popped and processed in items()
            "_attach" : extr('class="attachments', '''
            </div>'''),
            "retweet" : False,
            "quoted" : True,
        }
|
|
|
|
|
2022-11-25 18:50:04 +01:00
|
|
|
    def _user_from_html(self, html):
        """Build a user/profile dict from a profile page's HTML markup.

        Extraction starts at the profile card ('profile-tabs'); extr()
        consumes 'html' sequentially, so call order must match the markup.
        """
        extr = text.extract_from(html, html.index('class="profile-tabs'))
        banner = extr('class="profile-banner"><a href="', '"')
        # best-effort: the numeric user ID is embedded in the banner URL;
        # fall back to 0 when the URL has an unexpected shape
        try:
            if "/enc/" in banner:
                # base64-encoded upstream URL in the last path component
                uid = binascii.a2b_base64(banner.rpartition(
                    "/")[2]).decode().split("/")[4]
            else:
                uid = banner.split("%2F")[4]
        except Exception:
            uid = 0

        return {
            "id" : uid,
            "profile_banner" : self.root + banner if banner else "",
            "profile_image" : self.root + extr(
                'class="profile-card-avatar" href="', '"'),
            "nick" : extr('title="', '"'),
            "name" : extr('title="@', '"'),
            "description" : extr('<p dir="auto">', '<'),
            "date" : text.parse_datetime(
                extr('class="profile-joindate"><span title="', '"'),
                "%I:%M %p - %d %b %Y"),
            # the four profile-stat-num elements appear in this fixed order
            "statuses_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "friends_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "followers_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "favourites_count": text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "verified" : 'title="Verified account"' in html,
        }
|
|
|
|
|
2022-11-26 11:23:03 +01:00
|
|
|
def _extract_quote(self, html):
|
|
|
|
html, _, quote = html.partition('class="quote')
|
|
|
|
if quote:
|
|
|
|
quote, _, tail = quote.partition('class="tweet-published')
|
|
|
|
return (html + tail, quote)
|
|
|
|
return (html, None)
|
|
|
|
|
2022-11-15 11:44:16 +01:00
|
|
|
    def _pagination(self, path):
        """Yield tweet dicts from a user timeline, following 'show more'.

        'path' is appended to the user's base URL (e.g. "/media").
        """
        quoted = self.config("quoted", False)

        if self.user_id:
            # resolve a numeric user ID to a user name via the
            # /i/user/<id> redirect's Location header
            self.user = self.request(
                "{}/i/user/{}".format(self.root, self.user_id),
                allow_redirects=False,
            ).headers["location"].rpartition("/")[2]
        base_url = url = "{}/{}{}".format(self.root, self.user, path)

        while True:
            tweets_html = self.request(url).text.split(
                '<div class="timeline-item')

            # the first fragment (before any timeline item) holds the
            # profile card; parse it once and cache it
            if self.user_obj is None:
                self.user_obj = self._user_from_html(tweets_html[0])

            for html, quote in map(self._extract_quote, tweets_html[1:]):
                tweet = self._tweet_from_html(html)
                # entries without a parsable date are skipped
                if not tweet["date"]:
                    continue
                yield tweet
                if quoted and quote:
                    yield self._tweet_from_quote(quote)

            # cursor for the next page, if any
            more = text.extr(
                tweets_html[-1], '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = base_url + "?" + text.unescape(more)
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): update() presumably registers instance categories and
# returns the base URL regex — confirm in common.BaseExtractor;
# no default instances are declared here
BASE_PATTERN = NitterExtractor.update({
})

# user selector: "/i/user/<id>" or "/id:<id>" (numeric ID) or a user name
USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
|
|
|
|
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
class NitterTweetsExtractor(NitterExtractor):
    """Extractor for a nitter user's timeline tweets"""
    subcategory = "tweets"
    pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
    example = "https://nitter.net/USER"

    def tweets(self):
        # the default timeline lives at the user's base URL
        return self._pagination("")
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
class NitterRepliesExtractor(NitterExtractor):
    """Extractor for a nitter user's tweets and replies"""
    subcategory = "replies"
    pattern = USER_PATTERN + r"/with_replies"
    example = "https://nitter.net/USER/with_replies"

    def tweets(self):
        return self._pagination("/with_replies")
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
class NitterMediaExtractor(NitterExtractor):
    """Extractor for a nitter user's media timeline"""
    subcategory = "media"
    pattern = USER_PATTERN + r"/media"
    example = "https://nitter.net/USER/media"

    def tweets(self):
        return self._pagination("/media")
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
class NitterSearchExtractor(NitterExtractor):
    """Extractor for a nitter user's search results"""
    subcategory = "search"
    pattern = USER_PATTERN + r"/search"
    example = "https://nitter.net/USER/search"

    def tweets(self):
        return self._pagination("/search")
|
2022-11-15 11:44:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
class NitterTweetExtractor(NitterExtractor):
    """Extractor for nitter tweets"""
    subcategory = "tweet"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"
    # trailing empty group keeps self.user_id falsy for single tweets
    pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
    example = "https://nitter.net/USER/status/12345"

    def tweets(self):
        """Return the requested tweet (plus its quote, when enabled)."""
        # for this pattern, self.user holds the tweet ID (last \d+ group)
        url = "{}/i/status/{}".format(self.root, self.user)
        html = text.extr(self.request(url).text, 'class="main-tweet', '''\
</div>
</div></div></div>''')
        html, quote = self._extract_quote(html)
        tweet = self._tweet_from_html(html)
        if quote and self.config("quoted", False):
            quoted = self._tweet_from_quote(quote)
            # quote inherits the main tweet's user for directory naming
            quoted["user"] = tweet["user"]
            return (tweet, quoted)
        return (tweet,)
|
2023-03-09 23:30:15 +01:00
|
|
|
|
|
|
|
|
|
|
|
def _retry_on_404(response):
|
|
|
|
return response.status_code == 404
|