gallery-dl/gallery_dl/extractor/nitter.py

# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Nitter instances"""

from .common import BaseExtractor, Message
from .. import text
import binascii


class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors

    Subclasses implement 'tweets()', which returns an iterable of Tweet
    dicts as built by '_tweet_from_html()' and '_tweet_from_quote()'.
    'items()' converts those dicts into Directory and Url messages.
    """
    basecategory = "nitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"

    def __init__(self, match):
        # host part of the instance root; used as cookie domain in items()
        self.cookies_domain = self.root.partition("://")[2]
        BaseExtractor.__init__(self, match)

        # 'lastindex' is the index of the last matched capture group.
        # The patterns in this module nest one more group inside that
        # group, which is why 'lastindex + 1' is a valid group as well.
        lastindex = match.lastindex
        self.user = match.group(lastindex)
        self.user_id = match.group(lastindex + 1)
        # profile metadata; filled in lazily by _pagination()
        self.user_obj = None

    def items(self):
        """Yield a Directory message per Tweet and a Url message per file

        Options read via self.config():
          retweets  -- include retweets (default: False)
          videos    -- True/False, or "ytdl" to hand the whole Tweet URL
                       to ytdl instead of extracting video URLs
        """
        retweets = self.config("retweets", False)
        videos = self.config("videos", True)
        ytdl = False
        if videos:
            ytdl = (videos == "ytdl")
            videos = True
            self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain)

        for tweet in self.tweets():

            if not retweets and tweet["retweet"]:
                self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
                continue

            # '_attach' is raw HTML and must not end up in the metadata
            attachments = tweet.pop("_attach", "")
            if attachments:
                files = self._extract_files(
                    tweet, attachments, videos, ytdl)
            else:
                files = ()
            tweet["count"] = len(files)

            yield Message.Directory, tweet
            # 'num' is written into the shared Tweet dict on each
            # iteration and then copied into the file's metadata
            for tweet["num"], file in enumerate(files, 1):
                url = file["url"]
                file.update(tweet)
                yield Message.Url, url, file

    def _extract_files(self, tweet, attachments, videos, ytdl):
        """Return a list of file dicts found in a Tweet's attachment HTML

        Links ('href') are collected first. Only when none are found and
        'videos' is enabled are video sources collected: either a single
        'ytdl:' URL for the whole Tweet (ytdl=True), or every 'data-url'
        and '<source src' value.
        Empty attribute values are skipped.
        """
        files = []
        append = files.append

        for url in text.extract_iter(attachments, 'href="', '"'):
            if not url:
                # an empty 'href' has neither a name nor a target
                continue

            if "/i/broadcasts/" in url:
                self.log.debug(
                    "Skipping unsupported broadcast '%s'", url)
                continue

            # derive the name from the URL as found in the page,
            # i.e. before the instance root gets prepended
            name = self._filename_from_url(url)
            file = {
                "url": self._absolute_url(url),
                "_http_retry": _retry_on_404,
            }
            file["filename"], _, file["extension"] = \
                name.rpartition(".")
            append(file)

        if files or not videos:
            return files

        if ytdl:
            append({
                "url": "ytdl:{}/i/status/{}".format(
                    self.root, tweet["tweet_id"]),
                "extension": None,
            })
            return files

        for url in text.extract_iter(attachments, 'data-url="', '"'):
            if not url:
                continue
            name = self._filename_from_url(url)
            append({
                "url"      : "ytdl:" + self._absolute_url(url),
                "filename" : name.rpartition(".")[0],
                "extension": "mp4",
            })

        for url in text.extract_iter(attachments, '<source src="', '"'):
            if not url:
                continue
            url = self._absolute_url(url)
            append(text.nameext_from_url(url, {"url": url}))

        return files

    def _absolute_url(self, url):
        """Prepend the instance root to URLs starting with a slash"""
        if url.startswith("/"):
            return self.root + url
        return url

    @staticmethod
    def _filename_from_url(url):
        """Return the filename (including extension) a media URL refers to

        For URLs containing '/enc/', the last path segment is
        base64-decoded and the part after the final '/' of the decoded
        text is used. Otherwise the part after the last '%2F' is used.
        """
        if "/enc/" in url:
            encoded = url.rpartition("/")[2]
            return binascii.a2b_base64(
                encoded).decode().rpartition("/")[2]
        return url.rpartition("%2F")[2]

    def _tweet_from_html(self, html):
        """Build a Tweet dict from the HTML of one timeline item

        The result contains the private '_attach' key holding the raw
        attachment HTML; items() removes it again.
        """
        # The extr() calls are order-dependent: the same 'title="' marker
        # is used for both the nick and the date, so each call appears to
        # continue where the previous one stopped. Do not reorder them.
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        # result unused; presumably only moves the search position
        # to the date element
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            # profile owner when known, else the Tweet's own author
            "user"    : self.user_obj or author,
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            # last path segment of the permalink without its '#fragment'
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content": extr('class="tweet-content', "</div").partition(">")[2],
            "_attach" : extr('class="attachments', 'class="tweet-stats'),
            "comments": text.parse_int(extr(
                'class="icon-comment', '</div>').rpartition(">")[2]),
            "retweets": text.parse_int(extr(
                'class="icon-retweet', '</div>').rpartition(">")[2]),
            "quotes"  : text.parse_int(extr(
                'class="icon-quote', '</div>').rpartition(">")[2]),
            "likes"   : text.parse_int(extr(
                'class="icon-heart', '</div>').rpartition(">")[2]),
            "retweet" : 'class="retweet-header' in html,
            "quoted"  : False,
        }

    def _tweet_from_quote(self, html):
        """Build a Tweet dict from the HTML of a quoted Tweet

        Same layout as the result of _tweet_from_html(), but without the
        comments/retweets/quotes/likes counters and with 'quoted' set
        to True.
        """
        # order-dependent extraction; see _tweet_from_html()
        extr = text.extract_from(html)
        author = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "author"  : author,
            "user"    : self.user_obj or author,
            "date"    : text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content" : extr('class="quote-text', "</div").partition(">")[2],
            # the end marker is a newline plus an indented '</div>'
            "_attach" : extr('class="attachments', '''
                </div>'''),
            "retweet" : False,
            "quoted"  : True,
        }

    def _user_from_html(self, html):
        """Build a user dict from the profile part of a timeline page

        Raises ValueError (from str.index) when 'html' contains no
        'class="profile-tabs' marker.
        """
        extr = text.extract_from(html, html.index('class="profile-tabs'))
        banner = extr('class="profile-banner"><a href="', '"')

        # The user ID is only available as the 5th '/'-separated component
        # of the (possibly base64-encoded) banner URL. Without a usable
        # banner, fall back to 0 instead of failing.
        try:
            if "/enc/" in banner:
                uid = binascii.a2b_base64(banner.rpartition(
                    "/")[2]).decode().split("/")[4]
            else:
                uid = banner.split("%2F")[4]
        except Exception:
            uid = 0

        return {
            "id"              : uid,
            "profile_banner"  : self.root + banner if banner else "",
            "profile_image"   : self.root + extr(
                'class="profile-card-avatar" href="', '"'),
            "nick"            : extr('title="', '"'),
            "name"            : extr('title="@', '"'),
            "description"     : extr('<p dir="auto">', '<'),
            "date"            : text.parse_datetime(
                extr('class="profile-joindate"><span title="', '"'),
                "%I:%M %p - %d %b %Y"),
            # four consecutive 'profile-stat-num' elements, read in order;
            # thousands separators are removed before parsing
            "statuses_count"  : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "friends_count"   : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "followers_count" : text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "favourites_count": text.parse_int(extr(
                'class="profile-stat-num">', '<').replace(",", "")),
            "verified"        : 'title="Verified account"' in html,
        }

    def _extract_quote(self, html):
        """Split a Tweet's HTML into (own HTML, quoted Tweet HTML)

        The quote part runs from the first 'class="quote' up to the next
        'class="tweet-published'; it is cut out of the returned Tweet
        HTML. Returns (html, None) when there is no quote.
        """
        html, _, quote = html.partition('class="quote')
        if quote:
            quote, _, tail = quote.partition('class="tweet-published')
            return (html + tail, quote)
        return (html, None)

    def _pagination(self, path):
        """Yield Tweet dicts from all pages of the timeline at 'path'

        'path' is appended to '<root>/<user>'. With the 'quoted' option
        enabled, a quoted Tweet is yielded right after the Tweet
        quoting it.
        """
        quoted = self.config("quoted", False)

        if self.user_id:
            # Resolve a numeric user ID to a name: the name is taken from
            # the last path segment of the 'location' header returned for
            # '/i/user/<id>'. A response without that header raises here.
            self.user = self.request(
                "{}/i/user/{}".format(self.root, self.user_id),
                allow_redirects=False,
            ).headers["location"].rpartition("/")[2]
        base_url = url = "{}/{}{}".format(self.root, self.user, path)

        while True:
            # element 0 is everything before the first timeline item
            tweets_html = self.request(url).text.split(
                '<div class="timeline-item')

            if self.user_obj is None:
                self.user_obj = self._user_from_html(tweets_html[0])

            for html, quote in map(self._extract_quote, tweets_html[1:]):
                yield self._tweet_from_html(html)
                if quoted and quote:
                    yield self._tweet_from_quote(quote)

            # query string of the 'show more' link, searched for in the
            # last chunk of the page; stop when there is none
            more = text.extr(
                tweets_html[-1], '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = base_url + "?" + text.unescape(more)


# Known Nitter instances. Each entry maps an instance name to its root URL
# and to a regex fragment matching its host name.
# NOTE(review): update() is defined outside this file; its return value is
# used below as the leading part of every URL pattern -- confirm details
# against BaseExtractor.
BASE_PATTERN = NitterExtractor.update({
    "nitter.net": {
        "root": "https://nitter.net",
        "pattern": r"nitter\.net",
    },
    "nitter.1d4.us": {
        "root": "https://nitter.1d4.us",
        "pattern": r"nitter\.1d4\.us",
    },
    "nitter.kavin.rocks": {
        "root": "https://nitter.kavin.rocks",
        "pattern": r"nitter\.kavin\.rocks",
    },
    "nitter.unixfox.eu": {
        "root": "https://nitter.unixfox.eu",
        "pattern": r"nitter\.unixfox\.eu",
    },
    "nitter.it": {
        "root": "https://nitter.it",
        "pattern": r"nitter\.it",
    },
})

# User part of a URL: either '/i/user/<digits>' or '/id:<digits>' (the digits
# are captured by the inner group), or any single path segment.
# NitterExtractor.__init__ reads the outer group as 'user' and the inner
# group as 'user_id'.
USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"


class NitterTweetsExtractor(NitterExtractor):
    """Extractor for the Tweets on a nitter user's main timeline"""
    subcategory = "tweets"
    # user URL, optionally followed by '/tweets'; the URL must end there
    # or continue with a query string or fragment
    pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
    example = "https://nitter.net/USER"

    def tweets(self):
        # empty suffix: paginate '<root>/<user>' itself
        return self._pagination("")


class NitterRepliesExtractor(NitterExtractor):
    """Extractor for a nitter user's '/with_replies' timeline"""
    subcategory = "replies"
    pattern = USER_PATTERN + r"/with_replies"
    example = "https://nitter.net/USER/with_replies"

    def tweets(self):
        # paginate '<root>/<user>/with_replies'
        return self._pagination("/with_replies")


class NitterMediaExtractor(NitterExtractor):
    """Extractor for a nitter user's '/media' timeline"""
    subcategory = "media"
    pattern = USER_PATTERN + r"/media"
    example = "https://nitter.net/USER/media"

    def tweets(self):
        # paginate '<root>/<user>/media'
        return self._pagination("/media")


class NitterSearchExtractor(NitterExtractor):
    """Extractor for a nitter user's '/search' timeline"""
    subcategory = "search"
    pattern = USER_PATTERN + r"/search"
    example = "https://nitter.net/USER/search"

    def tweets(self):
        # paginate '<root>/<user>/search'
        return self._pagination("/search")


class NitterTweetExtractor(NitterExtractor):
    """Extractor for nitter tweets

    Handles '/<user>/status/<id>' and '/i/web/status/<id>' URLs and
    returns the Tweet itself plus, with the 'quoted' option enabled,
    the Tweet it quotes.
    """
    subcategory = "tweet"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"
    # The status ID group is the last matched group, so
    # NitterExtractor.__init__ stores the ID in 'self.user'. The empty
    # '()' inside it provides the 'lastindex + 1' group read as 'user_id'.
    pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
    example = "https://nitter.net/USER/status/12345"

    def tweets(self):
        """Return a tuple with the main Tweet and, optionally, its quote"""
        # 'self.user' holds the status ID here (see 'pattern')
        page = self.request(
            "{}/i/status/{}".format(self.root, self.user)).text
        main = text.extr(page, 'class="main-tweet', '''\
                </div>
              </div></div></div>''')

        main, quote_html = self._extract_quote(main)
        tweet = self._tweet_from_html(main)

        # the option is only looked up when there is a quote at all
        if not quote_html or not self.config("quoted", False):
            return (tweet,)

        quoted = self._tweet_from_quote(quote_html)
        # file the quoted Tweet under the same user as the main Tweet
        quoted["user"] = tweet["user"]
        return (tweet, quoted)


def _retry_on_404(response):
    """Return True if 'response' has HTTP status code 404

    Attached to file metadata as '_http_retry' by NitterExtractor;
    presumably the HTTP downloader retries a download when this
    returns True -- confirm against the downloader.
    """
    status = response.status_code
    return status == 404
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`# -- coding: utf-8 --`

fix 'keywords' in extractor tests (#3491) 2023-01-03 15:14:23 +01:00			`# Copyright 2022-2023 Mike Fährmann`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for Nitter instances"""`

			`from .common import BaseExtractor, Message`
			`from .. import text`
[nitter] handle base64-encoded filenames 2022-11-26 19:56:28 +01:00			`import binascii`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00

			`class NitterExtractor(BaseExtractor):`
			`"""Base class for nitter extractors"""`
			`basecategory = "nitter"`
			`directory_fmt = ("{category}", "{user[name]}")`
			`filename_fmt = "{tweet_id}_{num}.{extension}"`
			`archive_fmt = "{tweet_id}_{num}"`

			`def __init__(self, match):`
consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure 2023-07-21 22:38:39 +02:00			`self.cookies_domain = self.root.partition("://")[2]`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`BaseExtractor.__init__(self, match)`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00
			`lastindex = match.lastindex`
			`self.user = match.group(lastindex)`
			`self.user_id = match.group(lastindex + 1)`
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`self.user_obj = None`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def items(self):`
[nitter] add 'retweets' option (#3278) 2022-11-25 19:53:28 +01:00			`retweets = self.config("retweets", False)`
[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`videos = self.config("videos", True)`
[nitter] set 'hlsPlayback' cookie 2022-11-25 00:45:32 +01:00			`if videos:`
			`ytdl = (videos == "ytdl")`
			`videos = True`
consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure 2023-07-21 22:38:39 +02:00			`self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain)`
[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00
[nitter] fix direct Tweet links 2022-11-25 20:50:38 +01:00			`for tweet in self.tweets():`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
[nitter] add 'retweets' option (#3278) 2022-11-25 19:53:28 +01:00			`if not retweets and tweet["retweet"]:`
			`self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])`
			`continue`

[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`attachments = tweet.pop("_attach", "")`
			`if attachments:`
			`files = []`
			`append = files.append`

			`for url in text.extract_iter(`
			`attachments, 'href="', '"'):`
[nitter] handle base64-encoded filenames 2022-11-26 19:56:28 +01:00
[nitter] skip broadcasts instead of downloading an "Unsupported feature" HTML page 2023-03-25 13:09:24 +01:00			`if "/i/broadcasts/" in url:`
			`self.log.debug(`
			`"Skipping unsupported broadcast '%s'", url)`
			`continue`

[nitter] handle base64-encoded filenames 2022-11-26 19:56:28 +01:00			`if "/enc/" in url:`
			`name = binascii.a2b_base64(url.rpartition(`
			`"/")[2]).decode().rpartition("/")[2]`
			`else:`
			`name = url.rpartition("%2F")[2]`

[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`if url[0] == "/":`
			`url = self.root + url`
[downloader:http] change '_http_retry' to accept a Python function and rename '_http_retry_codes' to '_http_retry' (#3569) 2023-03-09 23:30:15 +01:00			`file = {"url": url, "_http_retry": _retry_on_404}`
[nitter] sanitize filenames (#3294) 2022-11-25 00:34:45 +01:00			`file["filename"], _, file["extension"] = \`
			`name.rpartition(".")`
			`append(file)`
[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00
			`if videos and not files:`
			`if ytdl:`
			`append({`
			`"url": "ytdl:{}/i/status/{}".format(`
			`self.root, tweet["tweet_id"]),`
			`"extension": None,`
			`})`
			`else:`
			`for url in text.extract_iter(`
			`attachments, 'data-url="', '"'):`
[nitter] handle base64-encoded filenames 2022-11-26 19:56:28 +01:00
			`if "/enc/" in url:`
			`name = binascii.a2b_base64(url.rpartition(`
			`"/")[2]).decode().rpartition("/")[2]`
			`else:`
			`name = url.rpartition("%2F")[2]`

[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`if url[0] == "/":`
			`url = self.root + url`
[nitter] sanitize filenames (#3294) 2022-11-25 00:34:45 +01:00			`append({`
			`"url" : "ytdl:" + url,`
			`"filename" : name.rpartition(".")[0],`
			`"extension": "mp4",`
			`})`
[nitter] extract videos from 'source' elements (#3912) 2023-04-14 19:00:56 +02:00
			`for url in text.extract_iter(`
			`attachments, '<source src="', '"'):`
[nitter] fix video extraction 2023-11-27 17:28:06 +01:00			`if url[0] == "/":`
			`url = self.root + url`
[nitter] extract videos from 'source' elements (#3912) 2023-04-14 19:00:56 +02:00			`append(text.nameext_from_url(url, {"url": url}))`

[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`else:`
[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`files = ()`
			`tweet["count"] = len(files)`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`yield Message.Directory, tweet`
[nitter] add 'videos' option (#3279) with the same semantics as for twitter 2022-11-24 22:56:01 +01:00			`for tweet["num"], file in enumerate(files, 1):`
			`url = file["url"]`
			`file.update(tweet)`
			`yield Message.Url, url, file`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def _tweet_from_html(self, html):`
			`extr = text.extract_from(html)`
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`author = {`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`"name": extr('class="fullname" href="/', '"'),`
			`"nick": extr('title="', '"'),`
			`}`
			`extr('<span class="tweet-date', '')`
			`link = extr('href="', '"')`
			`return {`
[nitter] add 'retweets' option (#3278) 2022-11-25 19:53:28 +01:00			`"author" : author,`
			`"user" : self.user_obj or author,`
			`"date" : text.parse_datetime(`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),`
			`"tweet_id": link.rpartition("/")[2].partition("#")[0],`
			`"content": extr('class="tweet-content', "</div").partition(">")[2],`
[nitter] add 'retweets' option (#3278) 2022-11-25 19:53:28 +01:00			`"_attach" : extr('class="attachments', 'class="tweet-stats'),`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`"comments": text.parse_int(extr(`
			`'class="icon-comment', '</div>').rpartition(">")[2]),`
			`"retweets": text.parse_int(extr(`
			`'class="icon-retweet', '</div>').rpartition(">")[2]),`
			`"quotes" : text.parse_int(extr(`
			`'class="icon-quote', '</div>').rpartition(">")[2]),`
			`"likes" : text.parse_int(extr(`
			`'class="icon-heart', '</div>').rpartition(">")[2]),`
[nitter] add 'retweets' option (#3278) 2022-11-25 19:53:28 +01:00			`"retweet" : 'class="retweet-header' in html,`
[nitter] skip broadcasts instead of downloading an "Unsupported feature" HTML page 2023-03-25 13:09:24 +01:00			`"quoted" : False,`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`}`

			`def _tweet_from_quote(self, html):`
			`extr = text.extract_from(html)`
			`author = {`
			`"name": extr('class="fullname" href="/', '"'),`
			`"nick": extr('title="', '"'),`
			`}`
			`extr('<span class="tweet-date', '')`
			`link = extr('href="', '"')`
			`return {`
			`"author" : author,`
			`"user" : self.user_obj or author,`
			`"date" : text.parse_datetime(`
			`extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),`
			`"tweet_id": link.rpartition("/")[2].partition("#")[0],`
[nitter] skip broadcasts instead of downloading an "Unsupported feature" HTML page 2023-03-25 13:09:24 +01:00			`"content" : extr('class="quote-text', "</div").partition(">")[2],`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`"_attach" : extr('class="attachments', '''`
			`</div>'''),`
			`"retweet" : False,`
[nitter] skip broadcasts instead of downloading an "Unsupported feature" HTML page 2023-03-25 13:09:24 +01:00			`"quoted" : True,`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`}`

[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`def _user_from_html(self, html):`
			`extr = text.extract_from(html, html.index('class="profile-tabs'))`
			`banner = extr('class="profile-banner"><a href="', '"')`
[nitter] fix extraction for instances without user banners 2023-03-25 12:50:40 +01:00
			`try:`
[nitter] extract user IDs from encoded banner URLs still requires a banner to be present to begin with 2023-04-23 19:13:27 +02:00			`if "/enc/" in banner:`
			`uid = binascii.a2b_base64(banner.rpartition(`
			`"/")[2]).decode().split("/")[4]`
			`else:`
			`uid = banner.split("%2F")[4]`
[nitter] fix extraction for instances without user banners 2023-03-25 12:50:40 +01:00			`except Exception:`
			`uid = 0`

[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`return {`
[nitter] fix extraction for instances without user banners 2023-03-25 12:50:40 +01:00			`"id" : uid,`
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`"profile_banner" : self.root + banner if banner else "",`
			`"profile_image" : self.root + extr(`
			`'class="profile-card-avatar" href="', '"'),`
			`"nick" : extr('title="', '"'),`
			`"name" : extr('title="@', '"'),`
			`"description" : extr('<p dir="auto">', '<'),`
			`"date" : text.parse_datetime(`
			`extr('class="profile-joindate"><span title="', '"'),`
			`"%I:%M %p - %d %b %Y"),`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`"statuses_count" : text.parse_int(extr(`
			`'class="profile-stat-num">', '<').replace(",", "")),`
			`"friends_count" : text.parse_int(extr(`
			`'class="profile-stat-num">', '<').replace(",", "")),`
			`"followers_count" : text.parse_int(extr(`
			`'class="profile-stat-num">', '<').replace(",", "")),`
			`"favourites_count": text.parse_int(extr(`
			`'class="profile-stat-num">', '<').replace(",", "")),`
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`"verified" : 'title="Verified account"' in html,`
			`}`

[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`def _extract_quote(self, html):`
			`html, _, quote = html.partition('class="quote')`
			`if quote:`
			`quote, _, tail = quote.partition('class="tweet-published')`
			`return (html + tail, quote)`
			`return (html, None)`

[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`def _pagination(self, path):`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`quoted = self.config("quoted", False)`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00
			`if self.user_id:`
			`self.user = self.request(`
			`"{}/i/user/{}".format(self.root, self.user_id),`
			`allow_redirects=False,`
			`).headers["location"].rpartition("/")[2]`
			`base_url = url = "{}/{}{}".format(self.root, self.user, path)`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`while True:`
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`tweets_html = self.request(url).text.split(`
			`'<div class="timeline-item')`

			`if self.user_obj is None:`
			`self.user_obj = self._user_from_html(tweets_html[0])`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`for html, quote in map(self._extract_quote, tweets_html[1:]):`
[nitter] fix direct Tweet links 2022-11-25 20:50:38 +01:00			`yield self._tweet_from_html(html)`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`if quoted and quote:`
			`yield self._tweet_from_quote(quote)`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
[nitter] update 'user' and 'author' 2022-11-25 18:50:04 +01:00			`more = text.extr(`
			`tweets_html[-1], '<div class="show-more"><a href="?', '"')`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`if not more:`
			`return`
			`url = base_url + "?" + text.unescape(more)`


			`BASE_PATTERN = NitterExtractor.update({`
			`"nitter.net": {`
			`"root": "https://nitter.net",`
			`"pattern": r"nitter\.net",`
			`},`
			`"nitter.1d4.us": {`
			`"root": "https://nitter.1d4.us",`
			`"pattern": r"nitter\.1d4\.us",`
			`},`
			`"nitter.kavin.rocks": {`
			`"root": "https://nitter.kavin.rocks",`
			`"pattern": r"nitter\.kavin\.rocks",`
			`},`
			`"nitter.unixfox.eu": {`
			`"root": "https://nitter.unixfox.eu",`
			`"pattern": r"nitter\.unixfox\.eu",`
			`},`
[nitter] support nitter.it (#3819) 2023-03-25 13:29:22 +01:00			`"nitter.it": {`
			`"root": "https://nitter.it",`
			`"pattern": r"nitter\.it",`
			`},`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00			`})`

[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/\|d:)(\d+)\|[^/?#]+)"`

[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`class NitterTweetsExtractor(NitterExtractor):`
			`subcategory = "tweets"`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`pattern = USER_PATTERN + r"(?:/tweets)?(?:$\|\?\|#)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://nitter.net/USER"`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def tweets(self):`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`return self._pagination("")`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00

			`class NitterRepliesExtractor(NitterExtractor):`
			`subcategory = "replies"`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`pattern = USER_PATTERN + r"/with_replies"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://nitter.net/USER/with_replies"`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def tweets(self):`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`return self._pagination("/with_replies")`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00

			`class NitterMediaExtractor(NitterExtractor):`
			`subcategory = "media"`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`pattern = USER_PATTERN + r"/media"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://nitter.net/USER/media"`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def tweets(self):`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`return self._pagination("/media")`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00

			`class NitterSearchExtractor(NitterExtractor):`
			`subcategory = "search"`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`pattern = USER_PATTERN + r"/search"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://nitter.net/USER/search"`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def tweets(self):`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`return self._pagination("/search")`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00

			`class NitterTweetExtractor(NitterExtractor):`
			`"""Extractor for nitter tweets"""`
			`subcategory = "tweet"`
			`directory_fmt = ("{category}", "{user[name]}")`
			`filename_fmt = "{tweet_id}_{num}.{extension}"`
			`archive_fmt = "{tweet_id}_{num}"`
[nitter] support '/i/user/' URLs (#3310) as well as using 'id:<userid>' as username not all nitter instances seem to support '/i/user/' ... 2022-12-04 12:07:19 +01:00			`pattern = BASE_PATTERN + r"/(i/web\|[^/?#]+)/status/(\d+())"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://nitter.net/USER/status/12345"`
[nitter] add extractors for Nitter instances (#2696) 2022-11-15 11:44:16 +01:00
			`def tweets(self):`
			`url = "{}/i/status/{}".format(self.root, self.user)`
[nitter] fix direct Tweet links 2022-11-25 20:50:38 +01:00			`html = text.extr(self.request(url).text, 'class="main-tweet', '''\`
			`</div>`
			`</div></div></div>''')`
[nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field 2022-11-26 11:23:03 +01:00			`html, quote = self._extract_quote(html)`
			`tweet = self._tweet_from_html(html)`
			`if quote and self.config("quoted", False):`
			`quoted = self._tweet_from_quote(quote)`
			`quoted["user"] = tweet["user"]`
			`return (tweet, quoted)`
			`return (tweet,)`
[downloader:http] change '_http_retry' to accept a Python function and rename '_http_retry_codes' to '_http_retry' (#3569) 2023-03-09 23:30:15 +01:00

			`def _retry_on_404(response):`
			`return response.status_code == 404`