mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-22 10:42:34 +01:00
4da3347d18
with blacklist/whitelist syntax, e.g. --list-extractors pixiv --list-extractors pixiv:user pixiv:work --list-extractors :search
575 lines
19 KiB
Python
575 lines
19 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2014-2023 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
import sys
|
|
import logging
|
|
from . import version, config, option, output, extractor, job, util, exception
|
|
|
|
# Package metadata
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014-2023 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
# the version string is defined in the 'version' submodule
__version__ = version.__version__
|
|
|
|
|
|
def main():
    """Command-line entry point

    Parses the command-line arguments, transfers them into the global
    configuration, and then performs exactly one action:
    update, list modules, list extractors, clear cache, config handling,
    or (the default) running a job for every input URL.

    Returns an integer exit status:
    - the result of the selected sub-action where one is returned
    - the bitwise OR of all non-zero job results when processing URLs,
      with bit 64 set for every unsupported URL
    - 0 after the list/cache actions that do not return on their own
    - 1 after a broken pipe (BrokenPipeError or OSError with EPIPE)

    Raises SystemExit on KeyboardInterrupt and re-raises any OSError
    that is not EPIPE.
    """
    try:
        parser = option.build_parser()
        args = parser.parse_args()
        log = output.initialize_logging(args.loglevel)

        # configuration
        if args.config_load:
            config.load()
        if args.configs_json:
            config.load(args.configs_json, strict=True)
        if args.configs_yaml:
            # imported on demand; only needed when YAML configs are given
            import yaml
            config.load(args.configs_yaml, strict=True, loads=yaml.safe_load)
        if args.configs_toml:
            # prefer the stdlib 'tomllib', fall back to the 'toml' package
            try:
                import tomllib as toml
            except ImportError:
                import toml
            config.load(args.configs_toml, strict=True, loads=toml.loads)
        if not args.colors:
            output.ANSI = False
            config.set((), "colors", False)
            if util.WINDOWS:
                config.set(("output",), "ansi", False)
        if args.filename:
            filename = args.filename
            if filename == "/O":
                # shortcut for "use the original filename"
                filename = "{filename}.{extension}"
            elif filename.startswith("\\f"):
                # turn a literal backslash + 'f' prefix into a form feed
                # NOTE(review): presumably a format-type marker consumed
                # by the formatter module - confirm
                filename = "\f" + filename[2:]
            config.set((), "filename", filename)
        if args.directory is not None:
            config.set((), "base-directory", args.directory)
            config.set((), "directory", ())
        if args.postprocessors:
            config.set((), "postprocessors", args.postprocessors)
        if args.abort:
            config.set((), "skip", "abort:" + str(args.abort))
        if args.terminate:
            config.set((), "skip", "terminate:" + str(args.terminate))
        if args.cookies_from_browser:
            # split 'BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]'
            browser, _, profile = args.cookies_from_browser.partition(":")
            browser, _, keyring = browser.partition("+")
            browser, _, domain = browser.partition("/")
            if profile.startswith(":"):
                # 'BROWSER::CONTAINER' - container without a profile
                container = profile[1:]
                profile = None
            else:
                profile, _, container = profile.partition("::")
            config.set((), "cookies", (
                browser, profile, keyring, container, domain))
        if args.options_pp:
            config.set((), "postprocessor-options", args.options_pp)
        # explicit command-line options are applied last
        # and therefore override everything set above
        for opts in args.options:
            config.set(*opts)

        output.configure_standard_streams()

        # signals
        signals = config.get((), "signals-ignore")
        if signals:
            import signal
            if isinstance(signals, str):
                signals = signals.split(",")
            for signal_name in signals:
                # look up the signal by name, e.g. "SIGTERM"
                signal_num = getattr(signal, signal_name, None)
                if signal_num is None:
                    log.warning("signal '%s' is not defined", signal_name)
                else:
                    signal.signal(signal_num, signal.SIG_IGN)

        # enable ANSI escape sequences on Windows
        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
            from ctypes import windll, wintypes, byref
            kernel32 = windll.kernel32
            mode = wintypes.DWORD()

            for handle_id in (-11, -12):  # stdout and stderr
                handle = kernel32.GetStdHandle(handle_id)
                kernel32.GetConsoleMode(handle, byref(mode))
                # 0x4 == ENABLE_VIRTUAL_TERMINAL_PROCESSING
                if not mode.value & 0x4:
                    mode.value |= 0x4
                    kernel32.SetConsoleMode(handle, mode)

            output.ANSI = True

        # filter environment
        filterenv = config.get((), "filters-environment", True)
        if not filterenv:
            # replace the module-level function for all later callers
            util.compile_expression = util.compile_expression_raw

        # format string separator
        separator = config.get((), "format-separator")
        if separator:
            from . import formatter
            formatter._SEPARATOR = separator

        # eval globals
        path = config.get((), "globals")
        if path:
            # merge every name of the imported file into util.GLOBALS
            util.GLOBALS.update(util.import_file(path).__dict__)

        # loglevels
        output.configure_logging(args.loglevel)
        if args.loglevel >= logging.WARNING:
            # quiet mode: disable regular output and download progress
            config.set(("output",), "mode", "null")
            config.set(("downloader",), "progress", None)
        elif args.loglevel <= logging.DEBUG:
            import platform
            import requests

            extra = ""
            if util.EXECUTABLE:
                extra = " - Executable ({})".format(version.__variant__)
            else:
                git_head = util.git_head()
                if git_head:
                    extra = " - Git HEAD: " + git_head

            log.debug("Version %s%s", __version__, extra)
            log.debug("Python %s - %s",
                      platform.python_version(), platform.platform())
            try:
                log.debug("requests %s - urllib3 %s",
                          requests.__version__,
                          requests.packages.urllib3.__version__)
            except AttributeError:
                # version attributes are optional debug info only
                pass

            log.debug("Configuration Files %s", config._files)

        if args.print_traffic:
            import requests
            requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1

        # extractor modules
        modules = config.get(("extractor",), "modules")
        if modules is not None:
            if isinstance(modules, str):
                modules = modules.split(",")
            extractor.modules = modules

        # external modules
        if args.extractor_sources:
            sources = args.extractor_sources
            # a trailing None selects the internal modules (see loop below)
            sources.append(None)
        else:
            sources = config.get(("extractor",), "module-sources")

        if sources:
            import os
            modules = []

            for source in sources:
                if source:
                    path = util.expand_path(source)
                    try:
                        files = os.listdir(path)
                        modules.append(extractor._modules_path(path, files))
                    except Exception as exc:
                        log.warning("Unable to load modules from %s (%s: %s)",
                                    path, exc.__class__.__name__, exc)
                else:
                    # falsy entry: use the built-in extractor modules
                    modules.append(extractor._modules_internal())

            if len(modules) > 1:
                import itertools
                extractor._module_iter = itertools.chain(*modules)
            elif not modules:
                extractor._module_iter = ()
            else:
                extractor._module_iter = iter(modules[0])

        if args.update:
            from . import update
            extr = update.UpdateExtractor.from_url("update:" + args.update)
            ujob = update.UpdateJob(extr)
            return ujob.run()

        elif args.list_modules:
            # the appended empty string yields a trailing newline
            extractor.modules.append("")
            sys.stdout.write("\n".join(extractor.modules))

        elif args.list_extractors is not None:
            write = sys.stdout.write
            fmt = ("{}{}\nCategory: {} - Subcategory: {}"
                   "\nExample : {}\n\n").format

            extractors = extractor.extractors()
            if args.list_extractors:
                # a non-empty argument list restricts the listed extractors
                fltr = util.build_extractor_filter(
                    args.list_extractors, negate=False)
                extractors = filter(fltr, extractors)

            for extr in extractors:
                write(fmt(
                    extr.__name__,
                    "\n" + extr.__doc__ if extr.__doc__ else "",
                    extr.category, extr.subcategory,
                    extr.example,
                ))

        elif args.clear_cache:
            from . import cache
            log = logging.getLogger("cache")
            cnt = cache.clear(args.clear_cache)

            if cnt is None:
                log.error("Database file not available")
                return 1
            else:
                log.info(
                    "Deleted %d %s from '%s'",
                    cnt, "entry" if cnt == 1 else "entries", cache._path(),
                )

        elif args.config:
            if args.config == "init":
                return config.initialize()
            elif args.config == "status":
                return config.status()
            else:
                return config.open_extern()

        else:
            # input files from the config are added to the command-line ones
            input_files = config.get((), "input-files")
            if input_files:
                for input_file in input_files:
                    if isinstance(input_file, str):
                        # plain path: no rewrite action
                        input_file = (input_file, None)
                    args.input_files.append(input_file)

            if not args.urls and not args.input_files:
                parser.error(
                    "The following arguments are required: URL\n"
                    "Use 'gallery-dl --help' to get a list of all options.")

            # select the job class; options are stored as class attributes
            if args.list_urls:
                jobtype = job.UrlJob
                jobtype.maxdepth = args.list_urls
                if config.get(("output",), "fallback", True):
                    jobtype.handle_url = \
                        staticmethod(jobtype.handle_url_fallback)
            elif args.dump_json:
                jobtype = job.DataJob
                jobtype.resolve = args.dump_json - 1
            else:
                jobtype = args.jobtype or job.DownloadJob

            input_manager = InputManager()
            input_manager.log = input_log = logging.getLogger("inputfile")

            # unsupported file logging handler
            handler = output.setup_logging_handler(
                "unsupportedfile", fmt="{message}")
            if handler:
                ulog = job.Job.ulog = logging.getLogger("unsupported")
                ulog.addHandler(handler)
                ulog.propagate = False

            # error file logging handler
            handler = output.setup_logging_handler(
                "errorfile", fmt="{message}", mode="a")
            if handler:
                elog = input_manager.err = logging.getLogger("errorfile")
                elog.addHandler(handler)
                elog.propagate = False

            # collect input URLs
            input_manager.add_list(args.urls)

            if args.input_files:
                for input_file, action in args.input_files:
                    try:
                        path = util.expand_path(input_file)
                        input_manager.add_file(path, action)
                    except Exception as exc:
                        input_log.error(exc)
                        # use the exception's own exit code if it has one
                        return getattr(exc, "code", 128)

            # progress output is only enabled for more than one URL
            pformat = config.get(("output",), "progress", True)
            if pformat and len(input_manager.urls) > 1 and \
                    args.loglevel < logging.ERROR:
                input_manager.progress(pformat)

            # process input URLs
            retval = 0
            for url in input_manager:
                try:
                    log.debug("Starting %s for '%s'", jobtype.__name__, url)

                    if isinstance(url, ExtendedUrl):
                        # global options stay set for all following URLs,
                        # local options only apply inside the 'with' block
                        for opts in url.gconfig:
                            config.set(*opts)
                        with config.apply(url.lconfig):
                            status = jobtype(url.value).run()
                    else:
                        status = jobtype(url).run()

                    if status:
                        retval |= status
                        input_manager.error()
                    else:
                        input_manager.success()

                except exception.StopExtraction:
                    pass
                except exception.TerminateExtraction:
                    pass
                except exception.RestartExtraction:
                    log.debug("Restarting '%s'", url)
                    # skip input_manager.next() so that the iterator
                    # yields the same URL again
                    continue
                except exception.NoExtractorError:
                    log.error("Unsupported URL '%s'", url)
                    retval |= 64
                    input_manager.error()

                input_manager.next()
            return retval
        return 0

    except KeyboardInterrupt:
        raise SystemExit("\nKeyboardInterrupt")
    except BrokenPipeError:
        pass
    except OSError as exc:
        import errno
        # only a broken pipe is tolerated; anything else is a real error
        if exc.errno != errno.EPIPE:
            raise
    return 1
|
|
|
|
|
|
class InputManager():
    """Collects input URLs from the command line and from input files

    Iterating over an instance yields the current URL, either a 'str'
    or an 'ExtendedUrl'. The position only advances when next() is
    called, so the same URL is yielded again until then.

    Attributes:
        urls:  list of collected entries; plain URLs, or
               (url, path, action, indicies) tuples for URLs from an input
               file that gets rewritten after processing
        files: maps input file paths to their lists of lines
               (an empty tuple until the first file is added)
        log:   logger for debug messages and warnings (set by the caller)
        err:   optional logger receiving the input of failed URLs
    """

    def __init__(self):
        self.urls = []
        self.files = ()
        self.log = self.err = None

        self._url = ""
        self._item = None
        self._index = 0
        self._pformat = None

    def add_url(self, url):
        """Append a single URL"""
        self.urls.append(url)

    def add_list(self, urls):
        """Append all URLs from the list 'urls'"""
        self.urls += urls

    def add_file(self, path, action=None):
        """Process an input file.

        Lines starting with '#' and empty lines will be ignored.
        Lines starting with '-' will be interpreted as a key-value pair
          separated by an '=', where
          'key' is a dot-separated option name and
          'value' is a JSON-parsable string.
          These configuration options will be applied
          while processing the next URL only.
        Lines starting with '-G' are the same as above, except these options
          will be applied for *all* following URLs, i.e. they are Global.
        Everything else will be used as a potential URL.

        A byte order mark at the very beginning of the input is ignored.

        A 'path' of "-" without 'action' reads from stdin.
        An 'action' of "c" (comment out) or "d" (delete) causes the lines
        of a URL to be rewritten in the input file after it got processed;
        any other value disables rewriting.

        Raises exception.InputFileError when the input cannot be read
        or contains an invalid option line.

        Example input file:

        # setting global options
        -G base-directory = "/tmp/"
        -G skip = false

        # setting local options for the next URL
        -filename="spaces_are_optional.jpg"
        -skip = true

        https://example.org/

        # next URL uses default filename and 'skip' is false.
        https://example.com/index.htm # comment1
        https://example.com/404.htm   # comment2
        """
        if path == "-" and not action:
            try:
                lines = sys.stdin.readlines()
            except Exception:
                raise exception.InputFileError("stdin is not readable")
            path = None
        else:
            try:
                with open(path, encoding="utf-8") as fp:
                    lines = fp.readlines()
            except Exception as exc:
                raise exception.InputFileError(str(exc))

        if self.files:
            self.files[path] = lines
        else:
            self.files = {path: lines}

        if action == "c":
            action = self._action_comment
        elif action == "d":
            action = self._action_delete
        else:
            action = None

        gconf = []
        lconf = []
        indicies = []
        strip_comment = None
        append = self.urls.append

        for n, line in enumerate(lines):
            line = line.strip()

            if n == 0 and line.startswith("\ufeff"):
                # The "utf-8" codec keeps a leading byte order mark and
                # str.strip() does not treat it as whitespace. Drop it for
                # parsing only; 'lines' itself stays untouched so that
                # rewriting the file preserves its original content.
                line = line[1:].lstrip()

            if not line or line[0] == "#":
                # empty line or comment
                continue

            elif line[0] == "-":
                # config spec
                if len(line) >= 2 and line[1] == "G":
                    conf = gconf
                    line = line[2:]
                else:
                    conf = lconf
                    line = line[1:]
                if action:
                    # option lines get rewritten together with their URL
                    indicies.append(n)

                key, sep, value = line.partition("=")
                if not sep:
                    raise exception.InputFileError(
                        "Invalid KEY=VALUE pair '%s' on line %s in %s",
                        line, n+1, path)

                try:
                    value = util.json_loads(value.strip())
                except ValueError as exc:
                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
                    raise exception.InputFileError(
                        "Unable to parse '%s' on line %s in %s",
                        value, n+1, path)

                # 'a.b.c' -> path ['a', 'b'] and option name 'c'
                key = key.strip().split(".")
                conf.append((key[:-1], key[-1], value))

            else:
                # url
                if " #" in line or "\t#" in line:
                    if strip_comment is None:
                        # compile lazily; most files have no inline comments
                        import re
                        strip_comment = re.compile(r"\s+#.*").sub
                    line = strip_comment("", line)
                if gconf or lconf:
                    url = ExtendedUrl(line, gconf, lconf)
                    # start fresh lists; the old ones now belong to 'url'
                    gconf = []
                    lconf = []
                else:
                    url = line

                if action:
                    indicies.append(n)
                    append((url, path, action, indicies))
                    indicies = []
                else:
                    append(url)

    def progress(self, pformat=True):
        """Enable progress output before each URL

        'pformat' is either True to use the default format or a format
        string with the fields {current}, {total}, and {url}.
        A newline is appended to custom format strings.
        """
        if pformat is True:
            pformat = "[{current}/{total}] {url}\n"
        else:
            pformat += "\n"
        self._pformat = pformat.format_map

    def next(self):
        """Advance to the next URL"""
        self._index += 1

    def success(self):
        """Mark the current URL as successfully processed"""
        if self._item:
            # only URLs from input files with an action get rewritten
            self._rewrite()

    def error(self):
        """Mark the current URL as failed

        Does nothing unless an 'err' logger is set. Otherwise the failed
        input is logged to 'err': the original input file lines when the
        URL came from a file with a rewrite action (which then also gets
        applied), or the URL itself in all other cases.
        """
        if self.err:
            if self._item:
                url, path, action, indicies = self._item
                lines = self.files[path]
                out = "".join(lines[i] for i in indicies)
                if out and out[-1] == "\n":
                    # the log handler supplies its own line ending
                    out = out[:-1]
                self._rewrite()
            else:
                out = str(self._url)
            self.err.info(out)

    def _rewrite(self):
        """Apply the current item's action and write its file back to disk"""
        url, path, action, indicies = self._item
        lines = self.files[path]
        action(lines, indicies)
        try:
            with open(path, "w", encoding="utf-8") as fp:
                fp.writelines(lines)
        except Exception as exc:
            self.log.warning(
                "Unable to update '%s' (%s: %s)",
                path, exc.__class__.__name__, exc)

    @staticmethod
    def _action_comment(lines, indicies):
        """Comment out all 'lines' at the given 'indicies' in-place"""
        for i in indicies:
            lines[i] = "# " + lines[i]

    @staticmethod
    def _action_delete(lines, indicies):
        """Blank out all 'lines' at the given 'indicies' in-place"""
        for i in indicies:
            # empty strings keep the indices of later lines stable
            lines[i] = ""

    def __iter__(self):
        self._index = 0
        return self

    def __next__(self):
        try:
            url = self.urls[self._index]
        except IndexError:
            raise StopIteration

        if isinstance(url, tuple):
            # entry from an input file with a rewrite action
            self._item = url
            url = url[0]
        else:
            self._item = None
        self._url = url

        if self._pformat:
            output.stderr_write(self._pformat({
                "total"  : len(self.urls),
                "current": self._index + 1,
                "url"    : url,
            }))
        return url
|
|
|
|
|
|
class ExtendedUrl():
    """URL with attached config key-value pairs"""
    # fixed attribute set; instances carry no per-instance __dict__
    __slots__ = ("value", "gconfig", "lconfig")

    def __init__(self, url, gconf, lconf):
        # 'gconf' and 'lconf' are stored as given (no copies are made):
        #   value   - the URL itself
        #   gconfig - global options
        #   lconfig - options local to this URL
        self.value, self.gconfig, self.lconfig = url, gconf, lconf

    def __str__(self):
        # an ExtendedUrl prints as its plain URL
        return self.value
|