1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 04:02:32 +01:00
gallery-dl/gallery_dl/__init__.py

579 lines
19 KiB
Python
Raw Normal View History

2015-04-05 17:15:27 +02:00
# -*- coding: utf-8 -*-
# Copyright 2014-2023 Mike Fährmann
2015-04-05 17:15:27 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
2021-06-12 00:20:59 +02:00
import sys
import logging
from . import version, config, option, output, extractor, job, util, exception
2016-10-04 14:33:50 +02:00
2017-01-30 19:40:15 +01:00
# Package metadata. __version__ is re-exported from the version module so
# there is a single source of truth for the release number.
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014-2023 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
__version__ = version.__version__
2014-10-12 21:56:44 +02:00
def main():
    """Command-line entry point.

    Parses command-line arguments, loads configuration files, applies
    per-invocation option overrides, and then either performs an
    auxiliary action (--update, --list-modules, --list-extractors,
    --clear-cache, --config ...) or runs the selected job type over all
    collected input URLs.

    Returns:
        An integer exit status: 0 on success, job status codes OR-ed
        together otherwise, 1 on EPIPE.
    """
    try:
        parser = option.build_parser()
        args = parser.parse_args()
        log = output.initialize_logging(args.loglevel)

        # configuration
        if args.config_load:
            config.load()
        if args.configs_json:
            config.load(args.configs_json, strict=True)
        if args.configs_yaml:
            import yaml
            config.load(args.configs_yaml, strict=True, loads=yaml.safe_load)
        if args.configs_toml:
            try:
                # stdlib TOML parser (Python 3.11+)
                import tomllib as toml
            except ImportError:
                import toml
            config.load(args.configs_toml, strict=True, loads=toml.loads)
        if not args.colors:
            output.ANSI = False
            config.set((), "colors", False)
            if util.WINDOWS:
                config.set(("output",), "ansi", False)
        if args.filename:
            filename = args.filename
            if filename == "/O":
                # "/O" is shorthand for "keep the original filename"
                # NOTE(review): the scraped source showed "(unknown)" here,
                # matching the page's filename placeholder; the literal
                # "{filename}" was restored -- confirm against upstream.
                filename = "{filename}.{extension}"
            elif filename.startswith("\\f"):
                # "\f" prefix marks the value as a format-string expression
                filename = "\f" + filename[2:]
            config.set((), "filename", filename)
        if args.directory is not None:
            config.set((), "base-directory", args.directory)
            config.set((), "directory", ())
        if args.postprocessors:
            config.set((), "postprocessors", args.postprocessors)
        if args.abort:
            config.set((), "skip", "abort:" + str(args.abort))
        if args.terminate:
            config.set((), "skip", "terminate:" + str(args.terminate))
        if args.cookies_from_browser:
            # syntax: BROWSER[+KEYRING][:PROFILE or ::CONTAINER][/DOMAIN]
            browser, _, profile = args.cookies_from_browser.partition(":")
            browser, _, keyring = browser.partition("+")
            browser, _, domain = browser.partition("/")
            if profile and profile[0] == ":":
                container = profile[1:]
                profile = None
            else:
                profile, _, container = profile.partition("::")
            config.set((), "cookies", (
                browser, profile, keyring, container, domain))
        if args.options_pp:
            config.set((), "postprocessor-options", args.options_pp)
        for opts in args.options:
            # '-o KEY=VALUE' overrides from the command line
            config.set(*opts)
        output.configure_standard_streams()

        # signals
        signals = config.get((), "signals-ignore")
        if signals:
            import signal
            if isinstance(signals, str):
                signals = signals.split(",")
            for signal_name in signals:
                signal_num = getattr(signal, signal_name, None)
                if signal_num is None:
                    log.warning("signal '%s' is not defined", signal_name)
                else:
                    signal.signal(signal_num, signal.SIG_IGN)

        # enable ANSI escape sequences on Windows
        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
            from ctypes import windll, wintypes, byref
            kernel32 = windll.kernel32
            mode = wintypes.DWORD()
            for handle_id in (-11, -12):  # stdout and stderr
                handle = kernel32.GetStdHandle(handle_id)
                kernel32.GetConsoleMode(handle, byref(mode))
                if not mode.value & 0x4:
                    # 0x4 = ENABLE_VIRTUAL_TERMINAL_PROCESSING
                    mode.value |= 0x4
                    kernel32.SetConsoleMode(handle, mode)
            output.ANSI = True

        # filter environment
        filterenv = config.get((), "filters-environment", True)
        if not filterenv:
            util.compile_expression = util.compile_expression_raw

        # format string separator
        separator = config.get((), "format-separator")
        if separator:
            from . import formatter
            formatter._SEPARATOR = separator

        # eval globals
        path = config.get((), "globals")
        if path:
            util.GLOBALS.update(util.import_file(path).__dict__)

        # loglevels
        output.configure_logging(args.loglevel)
        if args.loglevel >= logging.WARNING:
            # quiet mode: suppress download output and progress display
            config.set(("output",), "mode", "null")
            config.set(("downloader",), "progress", None)
        elif args.loglevel <= logging.DEBUG:
            # verbose mode: emit environment diagnostics
            import platform
            import requests

            extra = ""
            if util.EXECUTABLE:
                extra = " - Executable ({})".format(version.__variant__)
            else:
                git_head = util.git_head()
                if git_head:
                    extra = " - Git HEAD: " + git_head

            log.debug("Version %s%s", __version__, extra)
            log.debug("Python %s - %s",
                      platform.python_version(), platform.platform())
            try:
                log.debug("requests %s - urllib3 %s",
                          requests.__version__,
                          requests.packages.urllib3.__version__)
            except AttributeError:
                pass
            log.debug("Configuration Files %s", config._files)

        if args.print_traffic:
            import requests
            requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1

        # extractor modules
        modules = config.get(("extractor",), "modules")
        if modules is not None:
            if isinstance(modules, str):
                modules = modules.split(",")
            extractor.modules = modules

        # external modules
        if args.extractor_sources:
            sources = args.extractor_sources
            # None marks the position of the built-in modules
            sources.append(None)
        else:
            sources = config.get(("extractor",), "module-sources")

        if sources:
            import os
            modules = []

            for source in sources:
                if source:
                    path = util.expand_path(source)
                    try:
                        files = os.listdir(path)
                        modules.append(extractor._modules_path(path, files))
                    except Exception as exc:
                        log.warning("Unable to load modules from %s (%s: %s)",
                                    path, exc.__class__.__name__, exc)
                else:
                    modules.append(extractor._modules_internal())

            if len(modules) > 1:
                import itertools
                extractor._module_iter = itertools.chain(*modules)
            elif not modules:
                extractor._module_iter = ()
            else:
                extractor._module_iter = iter(modules[0])

        if args.update:
            from . import update
            extr = update.UpdateExtractor.from_url("update:" + args.update)
            ujob = update.UpdateJob(extr)
            return ujob.run()

        elif args.list_modules:
            extractor.modules.append("")  # trailing newline after join
            sys.stdout.write("\n".join(extractor.modules))

        elif args.list_extractors is not None:
            write = sys.stdout.write
            fmt = ("{}{}\nCategory: {} - Subcategory: {}"
                   "\nExample : {}\n\n").format

            extractors = extractor.extractors()
            if args.list_extractors:
                fltr = util.build_extractor_filter(
                    args.list_extractors, negate=False)
                extractors = filter(fltr, extractors)

            for extr in extractors:
                write(fmt(
                    extr.__name__,
                    "\n" + extr.__doc__ if extr.__doc__ else "",
                    extr.category, extr.subcategory,
                    extr.example,
                ))

        elif args.clear_cache:
            from . import cache
            log = logging.getLogger("cache")
            cnt = cache.clear(args.clear_cache)

            if cnt is None:
                log.error("Database file not available")
                return 1
            else:
                log.info(
                    "Deleted %d %s from '%s'",
                    cnt, "entry" if cnt == 1 else "entries", cache._path(),
                )

        elif args.config:
            if args.config == "init":
                return config.initialize()
            elif args.config == "status":
                return config.status()
            else:
                return config.open_extern()

        else:
            # default mode: download the given URLs
            input_files = config.get((), "input-files")
            if input_files:
                for input_file in input_files:
                    if isinstance(input_file, str):
                        input_file = (input_file, None)
                    args.input_files.append(input_file)

            if not args.urls and not args.input_files:
                # allow a bare invocation when cookies are being exported
                if args.cookies_from_browser or config.interpolate(
                        ("extractor",), "cookies"):
                    args.urls.append("noop")
                else:
                    parser.error(
                        "The following arguments are required: URL\nUse "
                        "'gallery-dl --help' to get a list of all options.")

            if args.list_urls:
                jobtype = job.UrlJob
                jobtype.maxdepth = args.list_urls
                if config.get(("output",), "fallback", True):
                    jobtype.handle_url = \
                        staticmethod(jobtype.handle_url_fallback)
            elif args.dump_json:
                jobtype = job.DataJob
                jobtype.resolve = args.dump_json - 1
            else:
                jobtype = args.jobtype or job.DownloadJob

            input_manager = InputManager()
            input_manager.log = input_log = logging.getLogger("inputfile")

            # unsupported file logging handler
            handler = output.setup_logging_handler(
                "unsupportedfile", fmt="{message}")
            if handler:
                ulog = job.Job.ulog = logging.getLogger("unsupported")
                ulog.addHandler(handler)
                ulog.propagate = False

            # error file logging handler
            handler = output.setup_logging_handler(
                "errorfile", fmt="{message}", mode="a")
            if handler:
                elog = input_manager.err = logging.getLogger("errorfile")
                elog.addHandler(handler)
                elog.propagate = False

            # collect input URLs
            input_manager.add_list(args.urls)
            if args.input_files:
                for input_file, action in args.input_files:
                    try:
                        path = util.expand_path(input_file)
                        input_manager.add_file(path, action)
                    except Exception as exc:
                        input_log.error(exc)
                        return getattr(exc, "code", 128)

            pformat = config.get(("output",), "progress", True)
            if pformat and len(input_manager.urls) > 1 and \
                    args.loglevel < logging.ERROR:
                input_manager.progress(pformat)

            # process input URLs
            retval = 0
            for url in input_manager:
                try:
                    log.debug("Starting %s for '%s'", jobtype.__name__, url)

                    if isinstance(url, ExtendedUrl):
                        # apply per-URL config options
                        for opts in url.gconfig:
                            config.set(*opts)
                        with config.apply(url.lconfig):
                            status = jobtype(url.value).run()
                    else:
                        status = jobtype(url).run()

                    if status:
                        retval |= status
                        input_manager.error()
                    else:
                        input_manager.success()

                except exception.StopExtraction:
                    pass
                except exception.TerminateExtraction:
                    pass
                except exception.RestartExtraction:
                    log.debug("Restarting '%s'", url)
                    continue
                except exception.NoExtractorError:
                    log.error("Unsupported URL '%s'", url)
                    retval |= 64
                    input_manager.error()

                input_manager.next()
            return retval

        return 0

    except KeyboardInterrupt:
        raise SystemExit("\nKeyboardInterrupt")
    except BrokenPipeError:
        pass
    except OSError as exc:
        import errno
        if exc.errno != errno.EPIPE:
            raise
        # a broken pipe (e.g. piping into 'head') is not a real failure
        return 1
class InputManager():
    """Collects input URLs from the command line and from input files
    and iterates over them, optionally printing a progress line and
    rewriting processed input files ('comment' / 'delete' actions).
    """

    def __init__(self):
        self.urls = []            # queued URLs (str, ExtendedUrl, or tuple)
        self.files = ()           # {path: lines} of loaded input files
        self.log = self.err = None
        self._url = ""
        self._item = None         # (url, path, action, indices) of current entry
        self._index = 0
        self._pformat = None      # bound format_map for progress output

    def add_url(self, url):
        """Queue a single URL"""
        self.urls.append(url)

    def add_list(self, urls):
        """Queue a list of URLs"""
        self.urls += urls

    def add_file(self, path, action=None):
        """Read and queue URLs from an input file.

        Empty lines and lines starting with '#' are ignored.
        A line of the form '-KEY=VALUE' sets a config option for the
        next URL only; '-GKEY=VALUE' sets it Globally, for all
        following URLs. KEY is a dot-separated option name and VALUE
        a JSON-parsable string. Every other line is treated as a URL;
        a trailing ' # comment' is stripped.

        Example input file:

        # settings global options
        -G base-directory = "/tmp/"
        -G skip = false

        # setting local options for the next URL
        -filename="spaces_are_optional.jpg"
        -skip = true

        https://example.org/

        # next URL uses default filename and 'skip' is false.
        https://example.com/index.htm # comment1
        https://example.com/404.htm # comment2
        """
        if path == "-" and not action:
            # read from standard input
            try:
                lines = sys.stdin.readlines()
            except Exception:
                raise exception.InputFileError("stdin is not readable")
            path = None
        else:
            try:
                with open(path, encoding="utf-8") as fp:
                    lines = fp.readlines()
            except Exception as exc:
                raise exception.InputFileError(str(exc))

        if not self.files:
            self.files = {path: lines}
        else:
            self.files[path] = lines

        # resolve the rewrite action for processed lines
        if action == "c":
            action = self._action_comment
        elif action == "d":
            action = self._action_delete
        else:
            action = None

        global_opts = []
        local_opts = []
        indices = []
        sub_comment = None        # lazily-compiled comment stripper
        enqueue = self.urls.append

        for idx, raw in enumerate(lines):
            line = raw.strip()

            if not line or line.startswith("#"):
                # empty line or comment
                continue

            if line.startswith("-"):
                # config spec
                if line[1:2] == "G":
                    target = global_opts
                    line = line[2:]
                else:
                    target = local_opts
                    line = line[1:]
                if action:
                    indices.append(idx)

                key, sep, value = line.partition("=")
                if not sep:
                    raise exception.InputFileError(
                        "Invalid KEY=VALUE pair '%s' on line %s in %s",
                        line, idx+1, path)

                try:
                    value = util.json_loads(value.strip())
                except ValueError as exc:
                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
                    raise exception.InputFileError(
                        "Unable to parse '%s' on line %s in %s",
                        value, idx+1, path)

                parts = key.strip().split(".")
                target.append((parts[:-1], parts[-1], value))
                continue

            # url; drop a trailing ' # ...' comment
            if " #" in line or "\t#" in line:
                if sub_comment is None:
                    import re
                    sub_comment = re.compile(r"\s+#.*").sub
                line = sub_comment("", line)

            if global_opts or local_opts:
                url = ExtendedUrl(line, global_opts, local_opts)
                global_opts = []
                local_opts = []
            else:
                url = line

            if action:
                indices.append(idx)
                enqueue((url, path, action, indices))
                indices = []
            else:
                enqueue(url)

    def progress(self, pformat=True):
        """Enable progress output; True selects the default format"""
        if pformat is True:
            pformat = "[{current}/{total}] {url}\n"
        else:
            pformat += "\n"
        self._pformat = pformat.format_map

    def next(self):
        """Advance to the next queued URL"""
        self._index += 1

    def success(self):
        """Report success; rewrite the current entry's input file"""
        if self._item:
            self._rewrite()

    def error(self):
        """Report failure; log the failed entry to the error logger"""
        if not self.err:
            return
        if self._item:
            url, path, action, indices = self._item
            lines = self.files[path]
            # capture the original lines before the rewrite mutates them
            out = "".join(lines[i] for i in indices)
            if out.endswith("\n"):
                out = out[:-1]
            self._rewrite()
        else:
            out = str(self._url)
        self.err.info(out)

    def _rewrite(self):
        """Apply the current entry's action and write its file back"""
        url, path, action, indices = self._item
        lines = self.files[path]
        action(lines, indices)
        try:
            with open(path, "w", encoding="utf-8") as fp:
                fp.writelines(lines)
        except Exception as exc:
            self.log.warning(
                "Unable to update '%s' (%s: %s)",
                path, exc.__class__.__name__, exc)

    @staticmethod
    def _action_comment(lines, indices):
        # comment out each processed line
        for i in indices:
            lines[i] = "# " + lines[i]

    @staticmethod
    def _action_delete(lines, indices):
        # blank out each processed line
        for i in indices:
            lines[i] = ""

    def __iter__(self):
        self._index = 0
        return self

    def __next__(self):
        # NOTE: the index is advanced by next(), not here, so that a
        # RestartExtraction in the caller re-yields the same URL
        try:
            entry = self.urls[self._index]
        except IndexError:
            raise StopIteration

        if isinstance(entry, tuple):
            self._item = entry
            url = entry[0]
        else:
            self._item = None
            self._url = url = entry

        if self._pformat:
            output.stderr_write(self._pformat({
                "total"  : len(self.urls),
                "current": self._index + 1,
                "url"    : url,
            }))
        return url
class ExtendedUrl():
    """A URL value bundled with global and local config key-value pairs"""
    __slots__ = ("value", "gconfig", "lconfig")

    def __init__(self, url, gconf, lconf):
        self.value, self.gconfig, self.lconfig = url, gconf, lconf

    def __str__(self):
        # stringify as the plain URL
        return self.value