[ytdl] add extractor for sites supported by youtube-dl

(#1680, #878) Can be used by prefixing any URL with 'ytdl:', or by setting 'extractor.ytdl.enabled' to 'true'.
2024-11-25 12:12:34 +01:00 · 2021-07-10 20:47:33 +02:00 · 2021-07-10 20:47:33 +02:00 · 36ac2197db
commit 36ac2197db
parent 64240c8d42
8 changed files with 195 additions and 9 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -1955,6 +1955,72 @@ Description
    Download video files.
 extractor.ytdl.enabled
 ----------------------
 Type
    ``bool``
 Default
    ``false``
 Description
    Match **all** URLs, even ones without a ``ytdl:`` prefix.
 extractor.ytdl.format
 ---------------------
 Type
    ``string``
 Default
    youtube-dl's default, currently ``"bestvideo+bestaudio/best"``
 Description
    Video `format selection
    <https://github.com/ytdl-org/youtube-dl#format-selection>`__
    directly passed to youtube-dl.
 extractor.ytdl.logging
 ----------------------
 Type
    ``bool``
 Default
    ``true``
 Description
    Route youtube-dl's output through gallery-dl's logging system.
    Otherwise youtube-dl will write its output directly to stdout/stderr.
    Note: Set ``quiet`` and ``no_warnings`` in
    `extractor.ytdl.raw-options`_ to ``true`` to suppress all output.
 extractor.ytdl.module
 ---------------------
 Type
    ``string``
 Default
    ``"youtube_dl"``
 Description
    Name of the youtube-dl Python module to import.
 extractor.ytdl.raw-options
 --------------------------
 Type
    ``object``
 Example
    .. code:: json
        {
            "quiet": true,
            "writesubtitles": true,
            "merge_output_format": "mkv"
        }
 Description
    Additional options passed directly to the ``YoutubeDL`` constructor.
    All available options can be found in `youtube-dl's docstrings
    <https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L138-L318>`__.
 extractor.[booru].tags
 ----------------------
 Type
@ -1967,6 +2033,7 @@ Description
    Note: This requires 1 additional HTTP request for each post.
 extractor.[booru].notes
 -----------------------
 Type
@ -1978,6 +2045,7 @@ Description
    Note: This requires 1 additional HTTP request for each post.
 extractor.[manga-extractor].chapter-reverse
 -------------------------------------------
 Type
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@ -287,6 +287,14 @@
            "retweets": true,
            "videos": true
        },
        "ytdl":
        {
            "enabled": false,
            "format": null,
            "logging": true,
            "module": "youtube_dl",
            "raw-options": null
        },
        "booru":
        {
            "tags": false,
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@ -41,7 +41,10 @@ class YoutubeDLDownloader(DownloaderBase):
            "max_filesize": text.parse_bytes(
                self.config("filesize-max"), None),
        }
-        options.update(self.config("raw-options") or {})
+
        raw_options = self.config("raw-options")
        if raw_options:
            options.update(raw_options)
        if self.config("logging", True):
            options["logger"] = self.log
@ -59,19 +62,22 @@ class YoutubeDLDownloader(DownloaderBase):
            for cookie in self.session.cookies:
                set_cookie(cookie)
-        try:
+        kwdict = pathfmt.kwdict
-            info_dict = self.ytdl.extract_info(url[5:], download=False)
+        info_dict = kwdict.pop("_ytdl_info_dict", None)
-        except Exception:
+        if not info_dict:
-            return False
+            try:
                info_dict = self.ytdl.extract_info(url[5:], download=False)
            except Exception:
                return False
        if "entries" in info_dict:
-            index = pathfmt.kwdict.get("_ytdl_index")
+            index = kwdict.get("_ytdl_index")
            if index is None:
                return self._download_playlist(pathfmt, info_dict)
            else:
                info_dict = info_dict["entries"][index]
-        extra = pathfmt.kwdict.get("_ytdl_extra")
+        extra = kwdict.get("_ytdl_extra")
        if extra:
            info_dict.update(extra)
@ -121,6 +127,7 @@ class YoutubeDLDownloader(DownloaderBase):
 def compatible_formats(formats):
    """Returns True if 'formats' are compatible for merge"""
    video_ext = formats[0].get("ext")
    audio_ext = formats[1].get("ext")
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -143,6 +143,7 @@ modules = [
    "recursive",
    "oauth",
    "test",
    "ytdl",
 ]
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@ -0,0 +1,101 @@
 # -*- coding: utf-8 -*-
 # Copyright 2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extractors for sites supported by youtube-dl"""
 from .common import Extractor, Message
 from .. import config
 class YoutubeDLExtractor(Extractor):
    """Generic extractor for youtube-dl supported URLs"""
    category = "ytdl"
    directory_fmt = ("{category}", "{subcategory}")
    filename_fmt = "{title}-{id}.{extension}"
    archive_fmt = "{extractor_key} {id}"
    # imported youtube-dl module; cached on the class after the first import
    ytdl_module = None
    pattern = r"ytdl:(.*)"
    test = ("ytdl:https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9",)

    def __init__(self, match):
        """Import the youtube-dl module and select its extractor class.

        'match' is the result of matching 'pattern' against the input URL;
        its first group is the URL handed to youtube-dl.
        """
        # import youtube_dl module (only once; the result is shared by
        # all instances through the 'ytdl_module' class attribute)
        module = self.ytdl_module
        if not module:
            import importlib
            name = config.get(("extractor", "ytdl"), "module") or "youtube_dl"
            # import_module() returns the named module itself, even for
            # dotted names, whereas __import__() returns the top-level package
            module = YoutubeDLExtractor.ytdl_module = \
                importlib.import_module(name)

        # find suitable youtube_dl extractor
        self.ytdl_url = url = match.group(1)
        # stays None when no youtube_dl extractor accepts the URL;
        # items() reports this instead of failing on a missing attribute
        self.ytdl_ie = None
        for ie in module.extractor.gen_extractor_classes():
            if ie.suitable(url):
                self.ytdl_ie = ie
                # set subcategory to youtube_dl extractor's key
                self.subcategory = ie.ie_key()
                break

        Extractor.__init__(self, match)

    def items(self):
        """Yield a Directory and a Url message for every extracted video.

        Playlists are expanded through _process_entries(). Each yielded
        URL carries a 'ytdl:' prefix, and its metadata dict contains the
        youtube-dl info_dict itself under '_ytdl_info_dict'.
        """
        if self.ytdl_ie is None:
            self.log.error(
                "No suitable youtube-dl extractor found for '%s'",
                self.ytdl_url)
            return

        # construct YoutubeDL object
        options = {
            "format": self.config("format"),
            "socket_timeout": self._timeout,
            "nocheckcertificate": not self._verify,
            "proxy": self.session.proxies.get("http"),
        }

        raw_options = self.config("raw-options")
        if raw_options:
            options.update(raw_options)
        if self.config("logging", True):
            options["logger"] = self.log
        # set after applying 'raw-options' so that it cannot be overridden
        options["extract_flat"] = "in_playlist"

        ytdl = self.ytdl_module.YoutubeDL(options)

        # extract youtube_dl info_dict
        info_dict = ytdl._YoutubeDL__extract_info(
            self.ytdl_url,
            ytdl.get_info_extractor(self.ytdl_ie.ie_key()),
            False, {}, True)

        # NOTE(review): extraction presumably returns None when youtube-dl
        # is told to ignore errors (e.g. through 'raw-options'); without
        # this guard the membership test below raises a TypeError
        if not info_dict:
            return

        if "entries" in info_dict:
            results = self._process_entries(ytdl, info_dict["entries"])
        else:
            results = (info_dict,)

        # yield results
        for result in results:
            result["extension"] = None
            # pass the already extracted data along under '_ytdl_info_dict'
            # so it is available wherever this metadata dict ends up
            result["_ytdl_info_dict"] = result

            url = "ytdl:" + (result.get("url") or
                             result.get("webpage_url") or
                             self.ytdl_url)

            yield Message.Directory, result
            yield Message.Url, url, result

    def _process_entries(self, ytdl, entries):
        """Recursively yield info_dicts for all 'entries' of a playlist.

        Entries of type 'url' or 'url_transparent' are resolved with
        'ytdl.extract_info()'; nested playlists are flattened.
        All other entries are yielded unchanged.
        """
        for entry in entries:
            if entry.get("_type") in ("url", "url_transparent"):
                info_dict = ytdl.extract_info(
                    entry["url"], False,
                    ie_key=entry.get("ie_key"))
                if not info_dict:
                    # nothing could be extracted for this entry; skip it
                    continue
                if "entries" in info_dict:
                    yield from self._process_entries(
                        ytdl, info_dict["entries"])
                else:
                    yield info_dict
            else:
                yield entry
 if config.get(("extractor", "ytdl"), "enabled"):
    # 'extractor.ytdl.enabled' is set: make the 'ytdl:' prefix optional,
    # which lets this extractor's pattern match any URL.
    # This check runs once, when the module is imported.
    YoutubeDLExtractor.pattern = r"(?:ytdl:)?(.*)"
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-__version__ = "1.18.2-dev"
+__version__ = "1.19.0-dev"
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -254,6 +254,7 @@ IGNORE_LIST = (
    "oauth",
    "recursive",
    "test",
    "ytdl",
 )
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@ -147,7 +147,7 @@ class TestExtractorModule(unittest.TestCase):
            return c.capitalize()
        for extr in extractor.extractors():
-            if extr.category not in ("", "oauth"):
+            if extr.category not in ("", "oauth", "ytdl"):
                expected = "{}{}Extractor".format(
                    capitalize(extr.category),
                    capitalize(extr.subcategory),