# -*- coding: utf-8 -*-

# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://twitter.com/"""
from .common import Extractor, Message
from .. import text
class TwitterExtractor(Extractor):
    """Base class for twitter extractors"""
    category = "twitter"
    directory_fmt = ["{category}", "{user}"]
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{retweet_id}_{num}"
    root = "https://twitter.com"

    def __init__(self, match):
        Extractor.__init__(self)
        # screen name captured by the subclass' URL pattern
        self.user = match.group(1)
        # user options: include retweets / emit video tweets via ytdl
        self.retweets = self.config("retweets", True)
        self.videos = self.config("videos", False)

    def items(self):
        """Yield (Message, ...) tuples for all images and videos"""
        yield Message.Version, 1
        yield Message.Directory, self.metadata()

        for tweet in self.tweets():
            data = self._data_from_tweet(tweet)
            if not self.retweets and data["retweet_id"]:
                continue

            images = text.extract_iter(
                tweet, 'data-image-url="', '"')
            for data["num"], url in enumerate(images, 1):
                text.nameext_from_url(url, data)
                # ':orig' requests the original full-resolution image
                yield Message.Url, url + ":orig", data

            if self.videos and "-videoContainer" in tweet:
                data["num"] = 1
                # delegate video download to youtube-dl via 'ytdl:' scheme
                url = "ytdl:{}/{}/status/{}".format(
                    self.root, data["user"], data["tweet_id"])
                yield Message.Url, url, data

    def metadata(self):
        """Return general metadata"""
        return {"user": self.user}

    def tweets(self):
        """Yield HTML content of all relevant tweets"""

    @staticmethod
    def _data_from_tweet(tweet):
        """Extract metadata fields from a tweet's HTML snippet"""
        data = text.extract_all(tweet, (
            ("tweet_id"  , 'data-tweet-id="'   , '"'),
            ("retweet_id", 'data-retweet-id="' , '"'),
            ("retweeter" , 'data-retweeter="'  , '"'),
            ("user"      , 'data-screen-name="', '"'),
            ("username"  , 'data-name="'       , '"'),
            ("user_id"   , 'data-user-id="'    , '"'),
        ))[0]
        for key in ("tweet_id", "retweet_id", "user_id"):
            data[key] = text.parse_int(data[key])
        data["retweeter"] = data["retweeter"] or ""
        return data

    def _tweets_from_api(self, url):
        """Yield HTML snippets for all tweets from a JSON API endpoint

        Follows pagination via the 'max_position' cursor until the API
        reports no more items.
        """
        params = {
            "include_available_features": "1",
            "include_entities": "1",
            "reset_error_state": "false",
            "lang": "en",
        }
        headers = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "yes",
            "Referer": "{}/{}".format(self.root, self.user)
        }

        while True:
            data = self.request(url, params=params, headers=headers).json()
            if "inner" in data:
                data = data["inner"]

            # track the id of the last tweet on this page; the original
            # code read the loop variable *after* the loop, which raised
            # NameError when 'items_html' contained no tweets and could
            # reuse a stale cursor across pages
            last_id = None
            for tweet in text.extract_iter(
                    data["items_html"], '<div class="tweet ', '\n</li>'):
                last_id = text.extract(tweet, 'data-tweet-id="', '"')[0]
                yield tweet

            # stop when the API has no further pages or no usable cursor
            if not data["has_more_items"] or last_id is None:
                return
            params["max_position"] = last_id
class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/([^/?&#]+)/?$"]
    test = [("https://twitter.com/PicturesEarth", {
        "range": "1-40",
        "url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
        "keyword": "cbae53b6f4ba133078bb13c95dbd3cbb4fa40b9f",
    })]

    def tweets(self):
        # query the timeline endpoint of twitter's internal JSON API
        endpoint = "{}/i/profiles/show/{}/timeline/tweets".format(
            self.root, self.user)
        return self._tweets_from_api(endpoint)
class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/([^/?&#]+)/media(?!\w)"]
    test = [("https://twitter.com/PicturesEarth/media", {
        "range": "1-40",
        "url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
    })]

    def tweets(self):
        # query the media-timeline endpoint of twitter's internal JSON API
        endpoint = "{}/i/profiles/show/{}/media_timeline".format(
            self.root, self.user)
        return self._tweets_from_api(endpoint)
class TwitterTweetExtractor(TwitterExtractor):
    """Extractor for images from individual tweets"""
    subcategory = "tweet"
    pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
               r"/([^/?&#]+)/status/(\d+)"]
    test = [
        ("https://twitter.com/PicturesEarth/status/672897688871018500", {
            "url": "d9e68d41301d2fe382eb27711dea28366be03b1a",
            "keyword": "46c8e739a892000848a8a2184da91346c9cbe4bf",
            "content": "a1f2f04cb2d8df24b1afa7a39910afda23484342",
        }),
        ("https://twitter.com/perrypumas/status/894001459754180609", {
            "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
            "keyword": "7729cd3ff16a5647b0b5ffdec9d428c91eedafbe",
        }),
    ]

    def __init__(self, match):
        TwitterExtractor.__init__(self, match)
        # numeric status id from the second pattern group
        self.tweet_id = match.group(2)

    def metadata(self):
        """Return general metadata"""
        return {"user": self.user, "tweet_id": self.tweet_id}

    def tweets(self):
        # fetch the tweet's page and cut out the single tweet's HTML
        page = self.request("{}/{}/status/{}".format(
            self.root, self.user, self.tweet_id)).text
        snippet = text.extract(
            page, '<div class="tweet ', '<ul class="stats')[0]
        return (snippet,)