[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as "unsupported" URLs, so that they can then be downloaded with youtube-dl. There are a lot of improvements to be made to the current implementation, but it works and does what it is supposed to, even if it is as inefficient as can be ...
2024-11-25 12:12:34 +01:00 · 2018-09-30 18:41:39 +02:00 · 2018-09-30 18:41:39 +02:00 · f8b3b00249
commit f8b3b00249
parent 5507f5ce2e
3 changed files with 24 additions and 8 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -654,6 +654,15 @@ Description Extract images from retweets.
 =========== =====


+extractor.twitter.videos
+------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Output video tweets as unsupported URLs.
+=========== =====
+
+
 extractor.[booru].tags
 ----------------------
 =========== =====
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -110,7 +110,8 @@
        },
        "twitter":
        {
-            "retweets": true
+            "retweets": true,
+            "videos": false
        },
        "booru":
        {
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -9,7 +9,7 @@
 """Extract images from https://twitter.com/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, extractor


 class TwitterExtractor(Extractor):
@@ -24,32 +24,38 @@ class TwitterExtractor(Extractor):
        Extractor.__init__(self)
        self.user = match.group(1)
        self.retweets = self.config("retweets", True)
+        self.videos = self.config("videos", False)
+
+        if self.videos:
+            self._blacklist = extractor.blacklist(("twitter",))

    def items(self):
        yield Message.Version, 1
        yield Message.Directory, self.metadata()

        for tweet in self.tweets():
-            images = list(text.extract_iter(
-                tweet, 'data-image-url="', '"'))
-            if not images:
-                continue
-
            data = self._data_from_tweet(tweet)
            if not self.retweets and data["retweet_id"]:
                continue

+            images = text.extract_iter(
+                tweet, 'data-image-url="', '"')
            for data["num"], url in enumerate(images, 1):
                text.nameext_from_url(url, data)
                yield Message.Url, url + ":orig", data

+            if self.videos and "-videoContainer" in tweet:
+                url = "{}/{}/status/{}".format(
+                    self.root, data["user"], data["tweet_id"])
+                with self._blacklist:
+                    yield Message.Queue, url, data
+
    def metadata(self):
        """Return general metadata"""
        return {"user": self.user}

    def tweets(self):
        """Yield HTML content of all relevant tweets"""
-        return ()

    @staticmethod
    def _data_from_tweet(tweet):