1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 04:02:32 +01:00
gallery-dl/gallery_dl/__init__.py

579 lines
19 KiB
Python
Raw Normal View History

2015-04-05 17:15:27 +02:00
# -*- coding: utf-8 -*-
# Copyright 2014-2023 Mike Fährmann
2015-04-05 17:15:27 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
2021-06-12 00:20:59 +02:00
import sys
import logging
from . import version, config, option, output, extractor, job, util, exception
2016-10-04 14:33:50 +02:00
2017-01-30 19:40:15 +01:00
# Package metadata. __version__ is re-exported from the version module so
# there is a single source of truth for the release number.
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014-2023 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
__version__ = version.__version__
2014-10-12 21:56:44 +02:00
def main():
    """Command-line entry point.

    Parses command-line arguments, loads configuration files, applies
    per-invocation option overrides, and then either performs an
    auxiliary action (--update, --list-modules, --list-extractors,
    --clear-cache, --config ...) or runs the selected job type over all
    collected input URLs.

    Returns:
        An integer exit status: 0 on success, job status codes OR-ed
        together otherwise, 1 on EPIPE.
    """
    try:
        parser = option.build_parser()
        args = parser.parse_args()
        log = output.initialize_logging(args.loglevel)

        # configuration
        if args.config_load:
            config.load()
        if args.configs_json:
            config.load(args.configs_json, strict=True)
        if args.configs_yaml:
            import yaml
            config.load(args.configs_yaml, strict=True, loads=yaml.safe_load)
        if args.configs_toml:
            try:
                # stdlib TOML parser (Python 3.11+)
                import tomllib as toml
            except ImportError:
                import toml
            config.load(args.configs_toml, strict=True, loads=toml.loads)
        if not args.colors:
            output.ANSI = False
            config.set((), "colors", False)
            if util.WINDOWS:
                config.set(("output",), "ansi", False)
        if args.filename:
            filename = args.filename
            if filename == "/O":
                # "/O" is shorthand for "keep the original filename"
                # NOTE(review): the scraped source showed "(unknown)" here,
                # matching the page's filename placeholder; the literal
                # "{filename}" was restored -- confirm against upstream.
                filename = "{filename}.{extension}"
            elif filename.startswith("\\f"):
                # "\f" prefix marks the value as a format-string expression
                filename = "\f" + filename[2:]
            config.set((), "filename", filename)
        if args.directory is not None:
            config.set((), "base-directory", args.directory)
            config.set((), "directory", ())
        if args.postprocessors:
            config.set((), "postprocessors", args.postprocessors)
        if args.abort:
            config.set((), "skip", "abort:" + str(args.abort))
        if args.terminate:
            config.set((), "skip", "terminate:" + str(args.terminate))
        if args.cookies_from_browser:
            # syntax: BROWSER[+KEYRING][:PROFILE or ::CONTAINER][/DOMAIN]
            browser, _, profile = args.cookies_from_browser.partition(":")
            browser, _, keyring = browser.partition("+")
            browser, _, domain = browser.partition("/")
            if profile and profile[0] == ":":
                container = profile[1:]
                profile = None
            else:
                profile, _, container = profile.partition("::")
            config.set((), "cookies", (
                browser, profile, keyring, container, domain))
        if args.options_pp:
            config.set((), "postprocessor-options", args.options_pp)
        for opts in args.options:
            # '-o KEY=VALUE' overrides from the command line
            config.set(*opts)
        output.configure_standard_streams()

        # signals
        signals = config.get((), "signals-ignore")
        if signals:
            import signal
            if isinstance(signals, str):
                signals = signals.split(",")
            for signal_name in signals:
                signal_num = getattr(signal, signal_name, None)
                if signal_num is None:
                    log.warning("signal '%s' is not defined", signal_name)
                else:
                    signal.signal(signal_num, signal.SIG_IGN)

        # enable ANSI escape sequences on Windows
        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
            from ctypes import windll, wintypes, byref
            kernel32 = windll.kernel32
            mode = wintypes.DWORD()
            for handle_id in (-11, -12):  # stdout and stderr
                handle = kernel32.GetStdHandle(handle_id)
                kernel32.GetConsoleMode(handle, byref(mode))
                if not mode.value & 0x4:
                    # 0x4 = ENABLE_VIRTUAL_TERMINAL_PROCESSING
                    mode.value |= 0x4
                    kernel32.SetConsoleMode(handle, mode)
            output.ANSI = True

        # filter environment
        filterenv = config.get((), "filters-environment", True)
        if not filterenv:
            util.compile_expression = util.compile_expression_raw

        # format string separator
        separator = config.get((), "format-separator")
        if separator:
            from . import formatter
            formatter._SEPARATOR = separator

        # eval globals
        path = config.get((), "globals")
        if path:
            util.GLOBALS.update(util.import_file(path).__dict__)

        # loglevels
        output.configure_logging(args.loglevel)
        if args.loglevel >= logging.WARNING:
            # quiet mode: suppress download output and progress display
            config.set(("output",), "mode", "null")
            config.set(("downloader",), "progress", None)
        elif args.loglevel <= logging.DEBUG:
            # verbose mode: emit environment diagnostics
            import platform
            import requests

            extra = ""
            if util.EXECUTABLE:
                extra = " - Executable ({})".format(version.__variant__)
            else:
                git_head = util.git_head()
                if git_head:
                    extra = " - Git HEAD: " + git_head

            log.debug("Version %s%s", __version__, extra)
            log.debug("Python %s - %s",
                      platform.python_version(), platform.platform())
            try:
                log.debug("requests %s - urllib3 %s",
                          requests.__version__,
                          requests.packages.urllib3.__version__)
            except AttributeError:
                pass
            log.debug("Configuration Files %s", config._files)

        if args.print_traffic:
            import requests
            requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1

        # extractor modules
        modules = config.get(("extractor",), "modules")
        if modules is not None:
            if isinstance(modules, str):
                modules = modules.split(",")
            extractor.modules = modules

        # external modules
        if args.extractor_sources:
            sources = args.extractor_sources
            # None marks the position of the built-in modules
            sources.append(None)
        else:
            sources = config.get(("extractor",), "module-sources")

        if sources:
            import os
            modules = []

            for source in sources:
                if source:
                    path = util.expand_path(source)
                    try:
                        files = os.listdir(path)
                        modules.append(extractor._modules_path(path, files))
                    except Exception as exc:
                        log.warning("Unable to load modules from %s (%s: %s)",
                                    path, exc.__class__.__name__, exc)
                else:
                    modules.append(extractor._modules_internal())

            if len(modules) > 1:
                import itertools
                extractor._module_iter = itertools.chain(*modules)
            elif not modules:
                extractor._module_iter = ()
            else:
                extractor._module_iter = iter(modules[0])

        if args.update:
            from . import update
            extr = update.UpdateExtractor.from_url("update:" + args.update)
            ujob = update.UpdateJob(extr)
            return ujob.run()

        elif args.list_modules:
            extractor.modules.append("")  # trailing newline after join
            sys.stdout.write("\n".join(extractor.modules))

        elif args.list_extractors is not None:
            write = sys.stdout.write
            fmt = ("{}{}\nCategory: {} - Subcategory: {}"
                   "\nExample : {}\n\n").format

            extractors = extractor.extractors()
            if args.list_extractors:
                fltr = util.build_extractor_filter(
                    args.list_extractors, negate=False)
                extractors = filter(fltr, extractors)

            for extr in extractors:
                write(fmt(
                    extr.__name__,
                    "\n" + extr.__doc__ if extr.__doc__ else "",
                    extr.category, extr.subcategory,
                    extr.example,
                ))

        elif args.clear_cache:
            from . import cache
            log = logging.getLogger("cache")
            cnt = cache.clear(args.clear_cache)

            if cnt is None:
                log.error("Database file not available")
                return 1
            else:
                log.info(
                    "Deleted %d %s from '%s'",
                    cnt, "entry" if cnt == 1 else "entries", cache._path(),
                )

        elif args.config:
            if args.config == "init":
                return config.initialize()
            elif args.config == "status":
                return config.status()
            else:
                return config.open_extern()

        else:
            # default mode: download the given URLs
            input_files = config.get((), "input-files")
            if input_files:
                for input_file in input_files:
                    if isinstance(input_file, str):
                        input_file = (input_file, None)
                    args.input_files.append(input_file)

            if not args.urls and not args.input_files:
                # allow a bare invocation when cookies are being exported
                if args.cookies_from_browser or config.interpolate(
                        ("extractor",), "cookies"):
                    args.urls.append("noop")
                else:
                    parser.error(
                        "The following arguments are required: URL\nUse "
                        "'gallery-dl --help' to get a list of all options.")

            if args.list_urls:
                jobtype = job.UrlJob
                jobtype.maxdepth = args.list_urls
                if config.get(("output",), "fallback", True):
                    jobtype.handle_url = \
                        staticmethod(jobtype.handle_url_fallback)
            elif args.dump_json:
                jobtype = job.DataJob
                jobtype.resolve = args.dump_json - 1
            else:
                jobtype = args.jobtype or job.DownloadJob

            input_manager = InputManager()
            input_manager.log = input_log = logging.getLogger("inputfile")

            # unsupported file logging handler
            handler = output.setup_logging_handler(
                "unsupportedfile", fmt="{message}")
            if handler:
                ulog = job.Job.ulog = logging.getLogger("unsupported")
                ulog.addHandler(handler)
                ulog.propagate = False

            # error file logging handler
            handler = output.setup_logging_handler(
                "errorfile", fmt="{message}", mode="a")
            if handler:
                elog = input_manager.err = logging.getLogger("errorfile")
                elog.addHandler(handler)
                elog.propagate = False

            # collect input URLs
            input_manager.add_list(args.urls)
            if args.input_files:
                for input_file, action in args.input_files:
                    try:
                        path = util.expand_path(input_file)
                        input_manager.add_file(path, action)
                    except Exception as exc:
                        input_log.error(exc)
                        return getattr(exc, "code", 128)

            pformat = config.get(("output",), "progress", True)
            if pformat and len(input_manager.urls) > 1 and \
                    args.loglevel < logging.ERROR:
                input_manager.progress(pformat)

            # process input URLs
            retval = 0
            for url in input_manager:
                try:
                    log.debug("Starting %s for '%s'", jobtype.__name__, url)

                    if isinstance(url, ExtendedUrl):
                        # apply per-URL config options
                        for opts in url.gconfig:
                            config.set(*opts)
                        with config.apply(url.lconfig):
                            status = jobtype(url.value).run()
                    else:
                        status = jobtype(url).run()

                    if status:
                        retval |= status
                        input_manager.error()
                    else:
                        input_manager.success()

                except exception.StopExtraction:
                    pass
                except exception.TerminateExtraction:
                    pass
                except exception.RestartExtraction:
                    log.debug("Restarting '%s'", url)
                    continue
                except exception.NoExtractorError:
                    log.error("Unsupported URL '%s'", url)
                    retval |= 64
                    input_manager.error()

                input_manager.next()
            return retval

        return 0

    except KeyboardInterrupt:
        raise SystemExit("\nKeyboardInterrupt")
    except BrokenPipeError:
        pass
    except OSError as exc:
        import errno
        if exc.errno != errno.EPIPE:
            raise
        # a broken pipe (e.g. piping into 'head') is not a real failure
        return 1
class InputManager():
    """Collects input URLs from the command line and from input files
    and iterates over them, optionally printing a progress line and
    rewriting processed input files ('comment' / 'delete' actions).
    """

    def __init__(self):
        self.urls = []            # queued URLs (str, ExtendedUrl, or tuple)
        self.files = ()           # {path: lines} of loaded input files
        self.log = self.err = None
        self._url = ""
        self._item = None         # (url, path, action, indices) of current entry
        self._index = 0
        self._pformat = None      # bound format_map for progress output

    def add_url(self, url):
        """Queue a single URL"""
        self.urls.append(url)

    def add_list(self, urls):
        """Queue a list of URLs"""
        self.urls += urls

    def add_file(self, path, action=None):
        """Read and queue URLs from an input file.

        Empty lines and lines starting with '#' are ignored.
        A line of the form '-KEY=VALUE' sets a config option for the
        next URL only; '-GKEY=VALUE' sets it Globally, for all
        following URLs. KEY is a dot-separated option name and VALUE
        a JSON-parsable string. Every other line is treated as a URL;
        a trailing ' # comment' is stripped.

        Example input file:

        # settings global options
        -G base-directory = "/tmp/"
        -G skip = false

        # setting local options for the next URL
        -filename="spaces_are_optional.jpg"
        -skip = true

        https://example.org/

        # next URL uses default filename and 'skip' is false.
        https://example.com/index.htm # comment1
        https://example.com/404.htm # comment2
        """
        if path == "-" and not action:
            # read from standard input
            try:
                lines = sys.stdin.readlines()
            except Exception:
                raise exception.InputFileError("stdin is not readable")
            path = None
        else:
            try:
                with open(path, encoding="utf-8") as fp:
                    lines = fp.readlines()
            except Exception as exc:
                raise exception.InputFileError(str(exc))

        if not self.files:
            self.files = {path: lines}
        else:
            self.files[path] = lines

        # resolve the rewrite action for processed lines
        if action == "c":
            action = self._action_comment
        elif action == "d":
            action = self._action_delete
        else:
            action = None

        global_opts = []
        local_opts = []
        indices = []
        sub_comment = None        # lazily-compiled comment stripper
        enqueue = self.urls.append

        for idx, raw in enumerate(lines):
            line = raw.strip()

            if not line or line.startswith("#"):
                # empty line or comment
                continue

            if line.startswith("-"):
                # config spec
                if line[1:2] == "G":
                    target = global_opts
                    line = line[2:]
                else:
                    target = local_opts
                    line = line[1:]
                if action:
                    indices.append(idx)

                key, sep, value = line.partition("=")
                if not sep:
                    raise exception.InputFileError(
                        "Invalid KEY=VALUE pair '%s' on line %s in %s",
                        line, idx+1, path)

                try:
                    value = util.json_loads(value.strip())
                except ValueError as exc:
                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
                    raise exception.InputFileError(
                        "Unable to parse '%s' on line %s in %s",
                        value, idx+1, path)

                parts = key.strip().split(".")
                target.append((parts[:-1], parts[-1], value))
                continue

            # url; drop a trailing ' # ...' comment
            if " #" in line or "\t#" in line:
                if sub_comment is None:
                    import re
                    sub_comment = re.compile(r"\s+#.*").sub
                line = sub_comment("", line)

            if global_opts or local_opts:
                url = ExtendedUrl(line, global_opts, local_opts)
                global_opts = []
                local_opts = []
            else:
                url = line

            if action:
                indices.append(idx)
                enqueue((url, path, action, indices))
                indices = []
            else:
                enqueue(url)

    def progress(self, pformat=True):
        """Enable progress output; True selects the default format"""
        if pformat is True:
            pformat = "[{current}/{total}] {url}\n"
        else:
            pformat += "\n"
        self._pformat = pformat.format_map

    def next(self):
        """Advance to the next queued URL"""
        self._index += 1

    def success(self):
        """Report success; rewrite the current entry's input file"""
        if self._item:
            self._rewrite()

    def error(self):
        """Report failure; log the failed entry to the error logger"""
        if not self.err:
            return
        if self._item:
            url, path, action, indices = self._item
            lines = self.files[path]
            # capture the original lines before the rewrite mutates them
            out = "".join(lines[i] for i in indices)
            if out.endswith("\n"):
                out = out[:-1]
            self._rewrite()
        else:
            out = str(self._url)
        self.err.info(out)

    def _rewrite(self):
        """Apply the current entry's action and write its file back"""
        url, path, action, indices = self._item
        lines = self.files[path]
        action(lines, indices)
        try:
            with open(path, "w", encoding="utf-8") as fp:
                fp.writelines(lines)
        except Exception as exc:
            self.log.warning(
                "Unable to update '%s' (%s: %s)",
                path, exc.__class__.__name__, exc)

    @staticmethod
    def _action_comment(lines, indices):
        # comment out each processed line
        for i in indices:
            lines[i] = "# " + lines[i]

    @staticmethod
    def _action_delete(lines, indices):
        # blank out each processed line
        for i in indices:
            lines[i] = ""

    def __iter__(self):
        self._index = 0
        return self

    def __next__(self):
        # NOTE: the index is advanced by next(), not here, so that a
        # RestartExtraction in the caller re-yields the same URL
        try:
            entry = self.urls[self._index]
        except IndexError:
            raise StopIteration

        if isinstance(entry, tuple):
            self._item = entry
            url = entry[0]
        else:
            self._item = None
            self._url = url = entry

        if self._pformat:
            output.stderr_write(self._pformat({
                "total"  : len(self.urls),
                "current": self._index + 1,
                "url"    : url,
            }))
        return url
class ExtendedUrl():
    """A URL value bundled with global and local config key-value pairs"""
    __slots__ = ("value", "gconfig", "lconfig")

    def __init__(self, url, gconf, lconf):
        self.value, self.gconfig, self.lconfig = url, gconf, lconf

    def __str__(self):
        # stringify as the plain URL
        return self.value