[ytdl] add extractor for sites supported by youtube-dl

(#1680, #878) Can be used by prefixing any URL with 'ytdl:', or by setting 'extractor.ytdl.enabled' to 'true'.
2024-11-22 10:42:34 +01:00 · 2021-07-10 20:47:33 +02:00 · 2021-07-10 20:47:33 +02:00 · 36ac2197db
commit 36ac2197db
parent 64240c8d42
8 changed files with 195 additions and 9 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -1955,6 +1955,72 @@ Description
    Download video files.


+extractor.ytdl.enabled
+----------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Match **all** URLs, even ones without a ``ytdl:`` prefix.
+
+
+extractor.ytdl.format
+---------------------
+Type
+    ``string``
+Default
+    youtube-dl's default, currently ``"bestvideo+bestaudio/best"``
+Description
+    Video `format selection
+    <https://github.com/ytdl-org/youtube-dl#format-selection>`__
+    directly passed to youtube-dl.
+
+
+extractor.ytdl.logging
+----------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    Route youtube-dl's output through gallery-dl's logging system.
+    Otherwise youtube-dl will write its output directly to stdout/stderr.
+
+    Note: Set ``quiet`` and ``no_warnings`` in
+    `extractor.ytdl.raw-options`_ to ``true`` to suppress all output.
+
+
+extractor.ytdl.module
+---------------------
+Type
+    ``string``
+Default
+    ``"youtube_dl"``
+Description
+    Name of the youtube-dl Python module to import.
+
+
+extractor.ytdl.raw-options
+--------------------------
+Type
+    ``object``
+Example
+    .. code:: json
+
+        {
+            "quiet": true,
+            "writesubtitles": true,
+            "merge_output_format": "mkv"
+        }
+
+Description
+    Additional options passed directly to the ``YoutubeDL`` constructor.
+
+    All available options can be found in `youtube-dl's docstrings
+    <https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L138-L318>`__.
+
+
 extractor.[booru].tags
 ----------------------
 Type
@@ -1967,6 +2033,7 @@ Description

    Note: This requires 1 additional HTTP request for each post.

+
 extractor.[booru].notes
 -----------------------
 Type
@@ -1978,6 +2045,7 @@ Description

    Note: This requires 1 additional HTTP request for each post.

+
 extractor.[manga-extractor].chapter-reverse
 -------------------------------------------
 Type
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -287,6 +287,14 @@
            "retweets": true,
            "videos": true
        },
+        "ytdl":
+        {
+            "enabled": false,
+            "format": null,
+            "logging": true,
+            "module": "youtube_dl",
+            "raw-options": null
+        },
        "booru":
        {
            "tags": false,
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -41,7 +41,10 @@ class YoutubeDLDownloader(DownloaderBase):
            "max_filesize": text.parse_bytes(
                self.config("filesize-max"), None),
        }
-        options.update(self.config("raw-options") or {})
+
+        raw_options = self.config("raw-options")
+        if raw_options:
+            options.update(raw_options)

        if self.config("logging", True):
            options["logger"] = self.log
@@ -59,19 +62,22 @@ class YoutubeDLDownloader(DownloaderBase):
            for cookie in self.session.cookies:
                set_cookie(cookie)

-        try:
-            info_dict = self.ytdl.extract_info(url[5:], download=False)
-        except Exception:
-            return False
+        kwdict = pathfmt.kwdict
+        info_dict = kwdict.pop("_ytdl_info_dict", None)
+        if not info_dict:
+            try:
+                info_dict = self.ytdl.extract_info(url[5:], download=False)
+            except Exception:
+                return False

        if "entries" in info_dict:
-            index = pathfmt.kwdict.get("_ytdl_index")
+            index = kwdict.get("_ytdl_index")
            if index is None:
                return self._download_playlist(pathfmt, info_dict)
            else:
                info_dict = info_dict["entries"][index]

-        extra = pathfmt.kwdict.get("_ytdl_extra")
+        extra = kwdict.get("_ytdl_extra")
        if extra:
            info_dict.update(extra)

@@ -121,6 +127,7 @@ class YoutubeDLDownloader(DownloaderBase):


 def compatible_formats(formats):
+    """Returns True if 'formats' are compatible for merge"""
    video_ext = formats[0].get("ext")
    audio_ext = formats[1].get("ext")

--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -143,6 +143,7 @@ modules = [
    "recursive",
    "oauth",
    "test",
+    "ytdl",
 ]


--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for sites supported by youtube-dl"""
+
+from .common import Extractor, Message
+from .. import config
+
+
+class YoutubeDLExtractor(Extractor):
+    """Generic extractor for youtube-dl supported URLs"""
+    category = "ytdl"
+    directory_fmt = ("{category}", "{subcategory}")
+    filename_fmt = "{title}-{id}.{extension}"
+    archive_fmt = "{extractor_key} {id}"
+    ytdl_module = None
+    pattern = r"ytdl:(.*)"
+    test = ("ytdl:https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9",)
+
+    def __init__(self, match):
+        # import youtube_dl module
+        module = self.ytdl_module
+        if not module:
+            name = config.get(("extractor", "ytdl"), "module") or "youtube_dl"
+            module = YoutubeDLExtractor.ytdl_module = __import__(name)
+
+        # find suitable youtube_dl extractor
+        self.ytdl_url = url = match.group(1)
+        for ie in module.extractor.gen_extractor_classes():
+            if ie.suitable(url):
+                self.ytdl_ie = ie
+                break
+
+        # set subcategory to youtube_dl extractor's key
+        self.subcategory = ie.ie_key()
+        Extractor.__init__(self, match)
+
+    def items(self):
+        # construct YoutubeDL object
+        options = {
+            "format": self.config("format"),
+            "socket_timeout": self._timeout,
+            "nocheckcertificate": not self._verify,
+            "proxy": self.session.proxies.get("http"),
+        }
+
+        raw_options = self.config("raw-options")
+        if raw_options:
+            options.update(raw_options)
+        if self.config("logging", True):
+            options["logger"] = self.log
+        options["extract_flat"] = "in_playlist"
+
+        ytdl = self.ytdl_module.YoutubeDL(options)
+
+        # extract youtube_dl info_dict
+        info_dict = ytdl._YoutubeDL__extract_info(
+            self.ytdl_url,
+            ytdl.get_info_extractor(self.ytdl_ie.ie_key()),
+            False, {}, True)
+
+        if "entries" in info_dict:
+            results = self._process_entries(ytdl, info_dict["entries"])
+        else:
+            results = (info_dict,)
+
+        # yield results
+        for info_dict in results:
+            info_dict["extension"] = None
+            info_dict["_ytdl_info_dict"] = info_dict
+
+            url = "ytdl:" + (info_dict.get("url") or
+                             info_dict.get("webpage_url") or
+                             self.ytdl_url)
+
+            yield Message.Directory, info_dict
+            yield Message.Url, url, info_dict
+
+    def _process_entries(self, ytdl, entries):
+        for entry in entries:
+            if entry.get("_type") in ("url", "url_transparent"):
+                info_dict = ytdl.extract_info(
+                    entry["url"], False,
+                    ie_key=entry.get("ie_key"))
+                if "entries" in info_dict:
+                    yield from self._process_entries(
+                        ytdl, info_dict["entries"])
+                else:
+                    yield info_dict
+            else:
+                yield entry
+
+
+if config.get(("extractor", "ytdl"), "enabled"):
+    # make 'ytdl:' prefix optional
+    YoutubeDLExtractor.pattern = r"(?:ytdl:)?(.*)"
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.18.2-dev"
+__version__ = "1.19.0-dev"
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -254,6 +254,7 @@ IGNORE_LIST = (
    "oauth",
    "recursive",
    "test",
+    "ytdl",
 )


--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -147,7 +147,7 @@ class TestExtractorModule(unittest.TestCase):
            return c.capitalize()

        for extr in extractor.extractors():
-            if extr.category not in ("", "oauth"):
+            if extr.category not in ("", "oauth", "ytdl"):
                expected = "{}{}Extractor".format(
                    capitalize(extr.category),
                    capitalize(extr.subcategory),