1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-24 03:32:33 +01:00
gallery-dl/gallery_dl/extractor/nitter.py
Mike Fährmann 5d7435e803
[nitter] extract user IDs from encoded banner URLs
still requires a banner to be present to begin with
2023-04-23 19:13:27 +02:00

495 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Nitter instances"""
from .common import BaseExtractor, Message
from .. import text
import binascii
class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors

    Subclasses supply a tweets() method that returns an iterable of tweet
    dicts (see _tweet_from_html() and _tweet_from_quote()); items() turns
    each of them into one Directory message plus one Url message per file.
    """
    basecategory = "nitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"

    def __init__(self, match):
        """Store the identifiers captured by the subclass URL pattern.

        'user' is read from the last matched capturing group and 'user_id'
        from the group directly after it, so every subclass pattern has to
        end in such a pair of groups.
        """
        # NOTE(review): 'root' is read before BaseExtractor.__init__() is
        # called; confirm it is already set at this point.
        self.cookiedomain = self.root.partition("://")[2]
        BaseExtractor.__init__(self, match)

        lastindex = match.lastindex
        self.user = match.group(lastindex)
        self.user_id = match.group(lastindex + 1)
        # filled in lazily by _pagination() from the first timeline page
        self.user_obj = None

    def items(self):
        """Yield Directory and Url messages for all tweets.

        Config options read here:
          retweets -- include retweets (default: False)
          videos   -- extract videos (default: True); the special value
                      "ytdl" emits a single 'ytdl:' URL of the tweet page
                      instead of the individual stream URLs
        """
        retweets = self.config("retweets", False)
        videos = self.config("videos", True)
        # only consulted when 'videos' is enabled, but always bound so it
        # can be passed to a helper unconditionally
        ytdl = False
        if videos:
            ytdl = (videos == "ytdl")
            videos = True
            # presumably makes the instance expose HLS video data
            self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain)

        for tweet in self.tweets():
            if not retweets and tweet["retweet"]:
                self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
                continue

            # '_attach' is internal scratch data; remove it from metadata
            attachments = tweet.pop("_attach", "")
            if attachments:
                files = self._extract_files(attachments)
                # videos are only looked for when no linked file was found
                if videos and not files:
                    files = self._extract_videos(
                        attachments, tweet["tweet_id"], ytdl)
            else:
                files = ()
            tweet["count"] = len(files)

            yield Message.Directory, tweet
            for tweet["num"], file in enumerate(files, 1):
                url = file["url"]
                file.update(tweet)
                yield Message.Url, url, file

    @staticmethod
    def _name_from_url(url):
        """Return the trailing file name encoded in a nitter media URL."""
        if "/enc/" in url:
            # the last path segment is the base64-encoded original path
            return binascii.a2b_base64(
                url.rpartition("/")[2]).decode().rpartition("/")[2]
        # otherwise the original path is percent-encoded ("%2F" == "/")
        return url.rpartition("%2F")[2]

    def _absolute_url(self, url):
        """Prefix root-relative URLs with the instance root."""
        return self.root + url if url.startswith("/") else url

    def _extract_files(self, attachments):
        """Return file dicts for all 'href' links in an attachment block."""
        files = []
        for url in text.extract_iter(attachments, 'href="', '"'):
            if not url:
                # an empty attribute carries nothing to download
                continue
            if "/i/broadcasts/" in url:
                self.log.debug(
                    "Skipping unsupported broadcast '%s'", url)
                continue

            name = self._name_from_url(url)
            file = {
                "url"        : self._absolute_url(url),
                "_http_retry": _retry_on_404,
            }
            file["filename"], _, file["extension"] = name.rpartition(".")
            files.append(file)
        return files

    def _extract_videos(self, attachments, tweet_id, ytdl):
        """Return file dicts for the videos in an attachment block.

        With 'ytdl' set, a single entry pointing at the tweet page is
        returned. Otherwise one entry is built per 'data-url' attribute
        and per '<source src>' element.
        """
        if ytdl:
            return [{
                "url": "ytdl:{}/i/status/{}".format(self.root, tweet_id),
                "extension": None,
            }]

        files = []
        for url in text.extract_iter(attachments, 'data-url="', '"'):
            if not url:
                continue
            name = self._name_from_url(url)
            files.append({
                "url"      : "ytdl:" + self._absolute_url(url),
                "filename" : name.rpartition(".")[0],
                "extension": "mp4",
            })

        for url in text.extract_iter(attachments, '<source src="', '"'):
            if not url:
                continue
            # root-relative sources need the instance root as well,
            # exactly like the 'href' and 'data-url' values above
            url = self._absolute_url(url)
            files.append(text.nameext_from_url(url, {"url": url}))
        return files

    def _tweet_from_html(self, html):
        """Build a tweet dict from the HTML of one timeline item.

        extr() is used as a sequential cursor over 'html' (the same marker
        is requested more than once for different fields), so the order of
        the calls below must follow the order of the markup.
        """
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        # presumably only moves the cursor to the date element
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            # fall back to the author when no profile has been parsed
            "user"    : self.user_obj or author,
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            # last path segment of the permalink without its '#' fragment
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content": extr('class="tweet-content', "</div").partition(">")[2],
            "_attach" : extr('class="attachments', 'class="tweet-stats'),
            "comments": text.parse_int(extr(
                'class="icon-comment', '</div>').rpartition(">")[2]),
            "retweets": text.parse_int(extr(
                'class="icon-retweet', '</div>').rpartition(">")[2]),
            "quotes"  : text.parse_int(extr(
                'class="icon-quote', '</div>').rpartition(">")[2]),
            "likes"   : text.parse_int(extr(
                'class="icon-heart', '</div>').rpartition(">")[2]),
            "retweet" : 'class="retweet-header' in html,
            "quoted"  : False,
        }

    def _tweet_from_quote(self, html):
        """Build a tweet dict from the HTML of a quoted tweet.

        Same sequential extraction as _tweet_from_html(), but the result
        has no comments/retweets/quotes/likes counters and is flagged
        with "quoted": True.
        """
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        # presumably only moves the cursor to the date element
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            "user"    : self.user_obj or author,
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content" : extr('class="quote-text', "</div").partition(">")[2],
            # NOTE(review): this end marker is whitespace-sensitive (a
            # newline followed by the closing tag); verify its exact
            # leading whitespace against the instance markup.
            "_attach" : extr('class="attachments', '''
</div>'''),
            "retweet" : False,
            "quoted"  : True,
        }

    def _user_from_html(self, html):
        """Build a user dict from the profile part of a timeline page.

        Raises ValueError (from str.index) when 'html' contains no
        'class="profile-tabs' marker.
        """
        extr = text.extract_from(html, html.index('class="profile-tabs'))
        banner = extr('class="profile-banner"><a href="', '"')

        # Best effort: the user ID is the fifth '/'-separated component of
        # the (base64- or percent-encoded) banner path. Without a usable
        # banner the ID is reported as 0.
        try:
            if "/enc/" in banner:
                uid = binascii.a2b_base64(banner.rpartition(
                    "/")[2]).decode().split("/")[4]
            else:
                uid = banner.split("%2F")[4]
        except Exception:
            uid = 0

        return {
            "id"              : uid,
            "profile_banner"  : self.root + banner if banner else "",
            "profile_image"   : self.root + extr(
                'class="profile-card-avatar" href="', '"'),
            "nick"            : extr('title="', '"'),
            "name"            : extr('title="@', '"'),
            "description"     : extr('<p dir="auto">', '<'),
            "date"            : text.parse_datetime(
                extr('class="profile-joindate"><span title="', '"'),
                "%I:%M %p - %d %b %Y"),
            # the four counters share one marker and are read in order
            "statuses_count"  : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "friends_count"   : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "followers_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "favourites_count": text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "verified"        : 'title="Verified account"' in html,
        }

    def _extract_quote(self, html):
        """Split the markup of a quoted tweet out of a tweet's HTML.

        Returns a (tweet_html, quote_html) tuple. 'quote_html' is None
        when 'html' contains no 'class="quote' marker; otherwise it is the
        text between that marker and 'class="tweet-published', and
        'tweet_html' is everything before the former joined with
        everything after the latter.
        """
        html, _, quote = html.partition('class="quote')
        if quote:
            quote, _, tail = quote.partition('class="tweet-published')
            return (html + tail, quote)
        return (html, None)

    def _pagination(self, path):
        """Yield tweet dicts from all pages of '<root>/<user><path>'.

        Quoted tweets are yielded right after the tweet quoting them when
        the 'quoted' option is enabled (default: False).
        """
        quoted = self.config("quoted", False)

        if self.user_id:
            # resolve a numeric ID to a user name by taking the last path
            # segment of the redirect target of /i/user/<id>
            self.user = self.request(
                "{}/i/user/{}".format(self.root, self.user_id),
                allow_redirects=False,
            ).headers["location"].rpartition("/")[2]
        base_url = url = "{}/{}{}".format(self.root, self.user, path)

        while True:
            tweets_html = self.request(url).text.split(
                '<div class="timeline-item')

            # everything before the first timeline item is parsed as the
            # profile, once per extractor
            if self.user_obj is None:
                self.user_obj = self._user_from_html(tweets_html[0])

            for html, quote in map(self._extract_quote, tweets_html[1:]):
                yield self._tweet_from_html(html)
                if quoted and quote:
                    yield self._tweet_from_quote(quote)

            # the "show more" link holds the query string of the next page
            more = text.extr(
                tweets_html[-1], '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = base_url + "?" + text.unescape(more)
# Known nitter instances. Each entry pairs the instance's root URL with a
# regex fragment matching its host name. The value returned by update() is
# used below as the leading part of every extractor pattern.
BASE_PATTERN = NitterExtractor.update({
    "nitter.net": {
        "root": "https://nitter.net",
        "pattern": r"nitter\.net",
    },
    "nitter.lacontrevoie.fr": {
        "root": "https://nitter.lacontrevoie.fr",
        "pattern": r"nitter\.lacontrevoie\.fr",
    },
    "nitter.1d4.us": {
        "root": "https://nitter.1d4.us",
        "pattern": r"nitter\.1d4\.us",
    },
    "nitter.kavin.rocks": {
        "root": "https://nitter.kavin.rocks",
        "pattern": r"nitter\.kavin\.rocks",
    },
    "nitter.unixfox.eu": {
        "root": "https://nitter.unixfox.eu",
        "pattern": r"nitter\.unixfox\.eu",
    },
    "nitter.it": {
        "root": "https://nitter.it",
        "pattern": r"nitter\.it",
    },
})

# User part of a URL: "/i/user/<digits>", "/id:<digits>" or "/<name>".
# The outer group captures the whole segment, the inner group only the
# numeric ID (unset for plain user names).
USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
class NitterTweetsExtractor(NitterExtractor):
    """Extractor for the tweets on a nitter user's main timeline"""
    subcategory = "tweets"
    # user URL, optionally followed by "/tweets"; the path must end there
    # or continue with a query string or fragment
    pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
    test = (
        ("https://nitter.net/supernaturepics", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
            "count": 20,
            "keyword": {
                "author": {
                    "name": "supernaturepics",
                    "nick": "Nature Pictures"
                },
                "comments": int,
                "content": str,
                "count": 1,
                "date": "type:datetime",
                "likes": int,
                "quotes": int,
                "retweets": int,
                "tweet_id": r"re:\d+",
                "user": {
                    "date": "dt:2015-01-12 10:25:00",
                    "description": "The very best nature pictures.",
                    "favourites_count": int,
                    "followers_count": int,
                    "friends_count": int,
                    "id": "2976459548",
                    "name": "supernaturepics",
                    "nick": "Nature Pictures",
                    "profile_banner": "https://nitter.net/pic/https%3A%2F%2Fpb"
                                      "s.twimg.com%2Fprofile_banners%2F2976459"
                                      "548%2F1421058583%2F1500x500",
                    "profile_image": "https://nitter.net/pic/pbs.twimg.com%2Fp"
                                     "rofile_images%2F554585280938659841%2FFLV"
                                     "AlX18.jpeg",
                    "statuses_count": 1568,
                    "verified": False,
                },
            },
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics", {
            "url": "54f4b55f2099dcc248f3fb7bfacf1349e08d8e2d",
            "pattern": r"https://nitter\.lacontrevoie\.fr/pic/orig"
                       r"/media%2FCGMNYZvW0AIVoom\.jpg",
            "range": "1",
        }),
        ("https://nitter.1d4.us/supernaturepics", {
            "range": "1",
            "keyword": {"user": {"id": "2976459548"}},
        }),
        ("https://nitter.kavin.rocks/id:2976459548"),
        ("https://nitter.unixfox.eu/supernaturepics"),
    )

    def tweets(self):
        """Return the paginated tweets of the user's base path."""
        return self._pagination("")
class NitterRepliesExtractor(NitterExtractor):
    """Extractor for a nitter user's 'with_replies' timeline"""
    subcategory = "replies"
    pattern = USER_PATTERN + r"/with_replies"
    test = (
        ("https://nitter.net/supernaturepics/with_replies", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
        ("https://nitter.1d4.us/supernaturepics/with_replies"),
        ("https://nitter.kavin.rocks/id:2976459548/with_replies"),
        ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"),
    )

    def tweets(self):
        """Return the paginated tweets of the '/with_replies' path."""
        return self._pagination("/with_replies")
class NitterMediaExtractor(NitterExtractor):
    """Extractor for a nitter user's 'media' timeline"""
    subcategory = "media"
    pattern = USER_PATTERN + r"/media"
    test = (
        ("https://nitter.net/supernaturepics/media", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.kavin.rocks/id:2976459548/media", {
            "pattern": r"https://nitter\.kavin\.rocks/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/media"),
        ("https://nitter.1d4.us/supernaturepics/media"),
        ("https://nitter.unixfox.eu/i/user/2976459548/media"),
    )

    def tweets(self):
        """Return the paginated tweets of the '/media' path."""
        return self._pagination("/media")
class NitterSearchExtractor(NitterExtractor):
    """Extractor for a nitter user's 'search' page"""
    subcategory = "search"
    pattern = USER_PATTERN + r"/search"
    test = (
        ("https://nitter.net/supernaturepics/search", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/search"),
        ("https://nitter.1d4.us/supernaturepics/search"),
        ("https://nitter.kavin.rocks/id:2976459548/search"),
        ("https://nitter.unixfox.eu/i/user/2976459548/search"),
    )

    def tweets(self):
        """Return the paginated tweets of the '/search' path."""
        return self._pagination("/search")
class NitterTweetExtractor(NitterExtractor):
    """Extractor for a single nitter tweet (status page)"""
    subcategory = "tweet"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"
    # The status ID group ends in an empty group: NitterExtractor.__init__
    # reads the last matched group into 'self.user' and the group after it
    # into 'self.user_id', so here 'self.user' is the tweet ID and
    # 'self.user_id' an empty string.
    pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
    test = (
        ("https://nitter.net/supernaturepics/status/604341487988576256", {
            "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
            "keyword": {
                "comments": 19,
                "content": "Big Wedeene River, Canada",
                "count": 1,
                "date": "dt:2015-05-29 17:40:00",
                "extension": "jpg",
                "filename": "CGMNYZvW0AIVoom",
                "likes": int,
                "num": 1,
                "quotes": 10,
                "retweets": int,
                "tweet_id": "604341487988576256",
                "url": "https://nitter.net/pic/orig"
                       "/media%2FCGMNYZvW0AIVoom.jpg",
                "user": {
                    "name": "supernaturepics",
                    "nick": "Nature Pictures",
                },
            },
        }),
        # 4 images
        ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", {
            "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff",
        }),
        # video
        ("https://nitter.lacontrevoie.fr/i/status/1065692031626829824", {
            "pattern": r"ytdl:https://nitter\.lacontrevoie\.fr/video"
                       r"/[0-9A-F]{10,}/https%3A%2F%2Fvideo.twimg.com%2F"
                       r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
                       r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
            "keyword": {
                "extension": "mp4",
                "filename": "nv8hUQC1R0SjhzcZ",
            },
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYoull be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "Its our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Nitter tweet (#890)
        ("https://nitter.kavin.rocks/ed1conf/status/1163841619336007680", {
            "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
        # Reply to deleted tweet (#403, #838)
        ("https://nitter.unixfox.eu/i/web/status/1170041925560258560", {
            "pattern": r"https://nitter\.unixfox\.eu/pic/orig"
                       r"/media%2FEDzS7VrU0AAFL4_\.jpg",
        }),
        # "quoted" option (#854)
        ("https://nitter.net/StobiesGalaxy/status/1270755918330896395", {
            "options": (("quoted", True),),
            "pattern": r"https://nitter\.net/pic/orig/media%2FEa[KG].+\.jpg",
            "count": 8,
        }),
        # quoted tweet (#526, #854)
        ("https://nitter.1d4.us/StobiesGalaxy/status/1270755918330896395", {
            "pattern": r"https://nitter\.1d4\.us/pic/orig"
                       r"/enc/bWVkaWEvRWFL\w+LmpwZw==",
            "keyword": {"filename": r"re:EaK.{12}"},
            "count": 4,
        }),
        # deleted quote tweet (#2225)
        ("https://nitter.lacontrevoie.fr/i/status/1460044411165888515", {
            "count": 0,
        }),
        # "Misleading" content
        ("https://nitter.lacontrevoie.fr/i/status/1486373748911575046", {
            "count": 4,
        }),
        # age-restricted (#2354)
        ("https://nitter.unixfox.eu/mightbecurse/status/1492954264909479936", {
            "keyword": {"date": "dt:2022-02-13 20:10:00"},
            "count": 1,
        }),
        # broadcast
        ("https://nitter.it/POTUS/status/1639409307878928384", {
            "count": 0,
        })
    )

    def tweets(self):
        """Return the main tweet of the status page as a tuple.

        The tuple also contains the quoted tweet when the page has one
        and the 'quoted' option is enabled (default: False).
        """
        page = self.request(
            "{}/i/status/{}".format(self.root, self.user)).text
        # NOTE(review): the end marker below is whitespace-sensitive;
        # verify its exact leading whitespace against the instance markup.
        main = text.extr(page, 'class="main-tweet', '''\
</div>
</div></div></div>''')

        main, quote_html = self._extract_quote(main)
        tweet = self._tweet_from_html(main)

        # the option is only looked up when there is a quote to report
        if not quote_html or not self.config("quoted", False):
            return (tweet,)

        quoted = self._tweet_from_quote(quote_html)
        # file the quoted tweet under the user of the quoting tweet
        quoted["user"] = tweet["user"]
        return (tweet, quoted)
def _retry_on_404(response):
return response.status_code == 404