2015-04-05 17:15:27 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-01-26 14:59:24 +01:00
|
|
|
# Copyright 2014-2023 Mike Fährmann
|
2015-04-05 17:15:27 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2021-06-12 00:20:59 +02:00
|
|
|
import sys
|
|
|
|
import logging
|
|
|
|
from . import version, config, option, output, extractor, job, util, exception
|
2016-10-04 14:33:50 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
# Package metadata, exposed as module-level dunder attributes.
__author__ = "Mike Fährmann"
__copyright__ = "Copyright 2014-2023 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
# Re-exported from the sibling 'version' module imported above.
__version__ = version.__version__
|
2018-04-04 17:30:42 +02:00
|
|
|
|
|
|
|
|
2014-10-12 21:56:44 +02:00
|
|
|
def main():
    """Run the gallery-dl command-line interface.

    Parses the command line, loads configuration files and applies
    command-line overrides to the global configuration, then performs
    exactly one action: self-update, list modules, list extractors,
    clear the cache, run a config subcommand, or - by default - run a
    job for every input URL.

    Returns:
        For the default action, the bitwise OR of all non-zero job
        status values, with 64 added for every unsupported URL.
        For '--update' and the config subcommands, whatever the called
        helper returns (not visible here).
        0 after listing modules/extractors or clearing the cache,
        1 if the cache database is unavailable or if writing to a
        closed pipe interrupted the program, and the 'code' attribute
        of the exception (default 128) if an input file cannot be
        processed.

    Raises:
        SystemExit: on KeyboardInterrupt.
        OSError: any OSError whose errno is not EPIPE is re-raised.
    """
    try:
        parser = option.build_parser()
        args = parser.parse_args()
        log = output.initialize_logging(args.loglevel)

        # configuration
        # Default config files first, then explicitly given files, so
        # that later sources can override earlier ones.
        if args.config_load:
            config.load()
        if args.configs_json:
            config.load(args.configs_json, strict=True)
        if args.configs_yaml:
            # imported lazily; only needed when YAML configs are used
            import yaml
            config.load(args.configs_yaml, strict=True, loads=yaml.safe_load)
        if args.configs_toml:
            # prefer the standard-library parser, fall back to the
            # third-party 'toml' package when 'tomllib' is unavailable
            try:
                import tomllib as toml
            except ImportError:
                import toml
            config.load(args.configs_toml, strict=True, loads=toml.loads)
        if not args.colors:
            output.ANSI = False
            config.set((), "colors", False)
            if util.WINDOWS:
                config.set(("output",), "ansi", False)
        if args.filename:
            filename = args.filename
            if filename == "/O":
                # shorthand for the "original" filename format
                filename = "{filename}.{extension}"
            elif filename.startswith("\\f"):
                # replace a literal backslash + 'f' prefix with an
                # actual form feed character
                filename = "\f" + filename[2:]
            config.set((), "filename", filename)
        if args.directory is not None:
            # use the given path as base directory and drop all
            # directory format segments below it
            config.set((), "base-directory", args.directory)
            config.set((), "directory", ())
        if args.postprocessors:
            config.set((), "postprocessors", args.postprocessors)
        # Both options write the same 'skip' key;
        # 'terminate' is applied last and wins if both are given.
        if args.abort:
            config.set((), "skip", "abort:" + str(args.abort))
        if args.terminate:
            config.set((), "skip", "terminate:" + str(args.terminate))
        if args.cookies_from_browser:
            # Split a spec shaped like
            #   BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]
            # into its parts; missing parts end up as empty strings
            # (or None for PROFILE when only a container is given).
            browser, _, profile = args.cookies_from_browser.partition(":")
            browser, _, keyring = browser.partition("+")
            browser, _, domain = browser.partition("/")
            if profile.startswith(":"):
                container = profile[1:]
                profile = None
            else:
                profile, _, container = profile.partition("::")
            config.set((), "cookies", (
                browser, profile, keyring, container, domain))
        if args.options_pp:
            config.set((), "postprocessor-options", args.options_pp)
        # generic option overrides are applied last; each entry holds
        # the positional arguments for config.set()
        for opts in args.options:
            config.set(*opts)

        output.configure_standard_streams()

        # signals
        # 'signals-ignore' is either a list of names or one
        # comma-separated string of attribute names of the 'signal' module
        signals = config.get((), "signals-ignore")
        if signals:
            import signal
            if isinstance(signals, str):
                signals = signals.split(",")
            for signal_name in signals:
                signal_num = getattr(signal, signal_name, None)
                if signal_num is None:
                    log.warning("signal '%s' is not defined", signal_name)
                else:
                    signal.signal(signal_num, signal.SIG_IGN)

        # enable ANSI escape sequences on Windows
        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
            from ctypes import windll, wintypes, byref
            kernel32 = windll.kernel32
            mode = wintypes.DWORD()

            for handle_id in (-11, -12):  # stdout and stderr
                handle = kernel32.GetStdHandle(handle_id)
                kernel32.GetConsoleMode(handle, byref(mode))
                # 0x4 is ENABLE_VIRTUAL_TERMINAL_PROCESSING;
                # only touch the console mode if the flag is missing
                if not mode.value & 0x4:
                    mode.value |= 0x4
                    kernel32.SetConsoleMode(handle, mode)

            output.ANSI = True

        # filter environment
        filterenv = config.get((), "filters-environment", True)
        if not filterenv:
            # swap in the 'raw' expression compiler for the whole process
            util.compile_expression = util.compile_expression_raw

        # format string separator
        separator = config.get((), "format-separator")
        if separator:
            from . import formatter
            formatter._SEPARATOR = separator

        # eval globals
        # merge every module-level name of the given file into util.GLOBALS
        path = config.get((), "globals")
        if path:
            util.GLOBALS.update(util.import_file(path).__dict__)

        # loglevels
        output.configure_logging(args.loglevel)
        if args.loglevel >= logging.WARNING:
            # quiet mode: no regular output and no download progress
            config.set(("output",), "mode", "null")
            config.set(("downloader",), "progress", None)
        elif args.loglevel <= logging.DEBUG:
            # verbose mode: log version and environment information
            import platform
            import requests

            extra = ""
            if util.EXECUTABLE:
                extra = " - Executable ({})".format(version.__variant__)
            else:
                git_head = util.git_head()
                if git_head:
                    extra = " - Git HEAD: " + git_head

            log.debug("Version %s%s", __version__, extra)
            log.debug("Python %s - %s",
                      platform.python_version(), platform.platform())
            try:
                log.debug("requests %s - urllib3 %s",
                          requests.__version__,
                          requests.packages.urllib3.__version__)
            except AttributeError:
                # version attributes are optional; skip this line if
                # one of them does not exist
                pass

            log.debug("Configuration Files %s", config._files)

        if args.print_traffic:
            import requests
            requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1

        # extractor modules
        # list of names or one comma-separated string
        modules = config.get(("extractor",), "modules")
        if modules is not None:
            if isinstance(modules, str):
                modules = modules.split(",")
            extractor.modules = modules

        # external modules
        if args.extractor_sources:
            sources = args.extractor_sources
            # a trailing None entry stands for the internal modules
            # (see the 'else' branch of the loop below)
            sources.append(None)
        else:
            sources = config.get(("extractor",), "module-sources")

        if sources:
            import os
            modules = []

            for source in sources:
                if source:
                    path = util.expand_path(source)
                    try:
                        files = os.listdir(path)
                        modules.append(extractor._modules_path(path, files))
                    except Exception as exc:
                        # an unusable source is reported and skipped
                        log.warning("Unable to load modules from %s (%s: %s)",
                                    path, exc.__class__.__name__, exc)
                else:
                    modules.append(extractor._modules_internal())

            if len(modules) > 1:
                import itertools
                extractor._module_iter = itertools.chain(*modules)
            elif not modules:
                extractor._module_iter = ()
            else:
                extractor._module_iter = iter(modules[0])

        # Exactly one of the following actions is performed.
        if args.update:
            from . import update
            extr = update.UpdateExtractor.from_url("update:" + args.update)
            ujob = update.UpdateJob(extr)
            return ujob.run()

        elif args.list_modules:
            # the appended empty string yields a trailing newline
            extractor.modules.append("")
            sys.stdout.write("\n".join(extractor.modules))

        elif args.list_extractors:
            write = sys.stdout.write
            fmt = ("{}{}\nCategory: {} - Subcategory: {}"
                   "\nExample : {}\n\n").format

            for extr in extractor.extractors():
                write(fmt(
                    extr.__name__,
                    "\n" + extr.__doc__ if extr.__doc__ else "",
                    extr.category, extr.subcategory,
                    extr.example,
                ))

        elif args.clear_cache:
            from . import cache
            # from here on, 'log' refers to the "cache" logger
            log = logging.getLogger("cache")
            cnt = cache.clear(args.clear_cache)

            if cnt is None:
                log.error("Database file not available")
                return 1
            else:
                log.info(
                    "Deleted %d %s from '%s'",
                    cnt, "entry" if cnt == 1 else "entries", cache._path(),
                )

        elif args.config:
            if args.config == "init":
                return config.initialize()
            elif args.config == "status":
                return config.status()
            else:
                return config.open_extern()

        else:
            # default action: run a job for every input URL
            input_files = config.get((), "input-files")
            if input_files:
                for input_file in input_files:
                    # normalize plain paths to (path, action) pairs
                    if isinstance(input_file, str):
                        input_file = (input_file, None)
                    args.input_files.append(input_file)

            if not args.urls and not args.input_files:
                parser.error(
                    "The following arguments are required: URL\n"
                    "Use 'gallery-dl --help' to get a list of all options.")

            # select the job class; note that the options below are
            # stored as attributes on the job class itself
            if args.list_urls:
                jobtype = job.UrlJob
                jobtype.maxdepth = args.list_urls
                if config.get(("output",), "fallback", True):
                    jobtype.handle_url = \
                        staticmethod(jobtype.handle_url_fallback)
            elif args.dump_json:
                jobtype = job.DataJob
                jobtype.resolve = args.dump_json - 1
            else:
                jobtype = args.jobtype or job.DownloadJob

            input_manager = InputManager()
            input_manager.log = input_log = logging.getLogger("inputfile")

            # unsupported file logging handler
            handler = output.setup_logging_handler(
                "unsupportedfile", fmt="{message}")
            if handler:
                ulog = job.Job.ulog = logging.getLogger("unsupported")
                ulog.addHandler(handler)
                # keep these records out of the parent loggers
                ulog.propagate = False

            # error file logging handler
            handler = output.setup_logging_handler(
                "errorfile", fmt="{message}", mode="a")
            if handler:
                elog = input_manager.err = logging.getLogger("errorfile")
                elog.addHandler(handler)
                elog.propagate = False

            # collect input URLs
            input_manager.add_list(args.urls)

            if args.input_files:
                for input_file, action in args.input_files:
                    try:
                        path = util.expand_path(input_file)
                        input_manager.add_file(path, action)
                    except Exception as exc:
                        # abort on the first unusable input file and use
                        # the exception's 'code' (if any) as exit status
                        input_log.error(exc)
                        return getattr(exc, "code", 128)

            # show per-URL progress only for more than one URL and
            # only below loglevel ERROR
            pformat = config.get(("output",), "progress", True)
            if pformat and len(input_manager.urls) > 1 and \
                    args.loglevel < logging.ERROR:
                input_manager.progress(pformat)

            # process input URLs
            retval = 0
            for url in input_manager:
                try:
                    log.debug("Starting %s for '%s'", jobtype.__name__, url)

                    if isinstance(url, ExtendedUrl):
                        # global options stay in effect for all
                        # following URLs; local options only while
                        # the 'with' block is active
                        for opts in url.gconfig:
                            config.set(*opts)
                        with config.apply(url.lconfig):
                            status = jobtype(url.value).run()
                    else:
                        status = jobtype(url).run()

                    if status:
                        retval |= status
                        input_manager.error()
                    else:
                        input_manager.success()

                except exception.StopExtraction:
                    pass
                except exception.TerminateExtraction:
                    pass
                except exception.RestartExtraction:
                    log.debug("Restarting '%s'", url)
                    # 'continue' skips input_manager.next() below,
                    # so the same URL is returned again
                    continue
                except exception.NoExtractorError:
                    log.error("Unsupported URL '%s'", url)
                    retval |= 64
                    input_manager.error()

                # advance to the next URL
                input_manager.next()
            return retval
        return 0

    except KeyboardInterrupt:
        raise SystemExit("\nKeyboardInterrupt")
    except BrokenPipeError:
        pass
    except OSError as exc:
        import errno
        if exc.errno != errno.EPIPE:
            raise
    # only reached after a broken pipe (BrokenPipeError or EPIPE)
    return 1
|
2023-11-14 20:38:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
class InputManager():
    """Ordered collection of input URLs with progress tracking.

    URLs are added directly (add_url, add_list) or read from input
    files (add_file). Iterating over the manager yields the URL at the
    current index; the index only advances when next() is called, so
    an URL can be yielded again by continuing the loop without calling
    next().

    Attributes:
        urls:  list of URL entries; an entry is a string, an
               ExtendedUrl, or - for files with a rewrite action - a
               tuple (url, path, action, line indices)
        files: mapping of file path to its list of lines
               (an empty tuple until the first file has been added)
        log:   logger for warnings and debug messages
        err:   optional logger that receives failed inputs
    """

    def __init__(self):
        self.urls = []
        self.files = ()
        # Default to a real logger instead of None, since add_file() and
        # _rewrite() report problems through it unconditionally.
        self.log = logging.getLogger("inputfile")
        self.err = None

        self._url = ""
        self._item = None
        self._index = 0
        self._pformat = None

    def add_url(self, url):
        """Append a single URL"""
        self.urls.append(url)

    def add_list(self, urls):
        """Append all URLs from 'urls'"""
        self.urls += urls

    def add_file(self, path, action=None):
        """Process an input file.

        Lines starting with '#' and empty lines will be ignored.
        Lines starting with '-' will be interpreted as a key-value pair
          separated by an '=', where
          'key' is a dot-separated option name and
          'value' is a JSON-parsable string.
          These configuration options will be applied
          while processing the next URL only.
        Lines starting with '-G' are the same as above, except these options
          will be applied for *all* following URLs, i.e. they are Global.
        Everything else will be used as a potential URL.

        Example input file:

        # settings global options
        -G base-directory = "/tmp/"
        -G skip = false

        # setting local options for the next URL
        -filename="spaces_are_optional.jpg"
        -skip = true

        https://example.org/

        # next URL uses default filename and 'skip' is false.
        https://example.com/index.htm # comment1
        https://example.com/404.htm   # comment2

        Arguments:
            path:   path of the file to read; "-" reads from stdin,
                    but only if no 'action' is given
            action: "c" to comment out or "d" to delete the lines of an
                    input once it has been processed; any other value
                    leaves the file untouched

        Raises:
            exception.InputFileError: if the input cannot be read or
                contains an invalid option line
        """
        if path == "-" and not action:
            try:
                lines = sys.stdin.readlines()
            except Exception as exc:
                raise exception.InputFileError(
                    "stdin is not readable") from exc
            path = None
        else:
            try:
                with open(path, encoding="utf-8") as fp:
                    lines = fp.readlines()
            except Exception as exc:
                raise exception.InputFileError(str(exc)) from exc

        # keep the original lines around for error() and _rewrite()
        if self.files:
            self.files[path] = lines
        else:
            self.files = {path: lines}

        if action == "c":
            action = self._action_comment
        elif action == "d":
            action = self._action_delete
        else:
            action = None

        gconf = []
        lconf = []
        # line numbers of the option lines belonging to the next URL
        indicies = []
        strip_comment = None
        append = self.urls.append

        for n, line in enumerate(lines):
            line = line.strip()

            if not line or line[0] == "#":
                # empty line or comment
                continue

            elif line[0] == "-":
                # config spec
                if len(line) >= 2 and line[1] == "G":
                    conf = gconf
                    line = line[2:]
                else:
                    conf = lconf
                    line = line[1:]
                    # only local option lines are rewritten
                    # together with their URL
                    if action:
                        indicies.append(n)

                key, sep, value = line.partition("=")
                if not sep:
                    raise exception.InputFileError(
                        "Invalid KEY=VALUE pair '%s' on line %s in %s",
                        line, n+1, path)

                try:
                    value = util.json_loads(value.strip())
                except ValueError as exc:
                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
                    raise exception.InputFileError(
                        "Unable to parse '%s' on line %s in %s",
                        value, n+1, path) from exc

                key = key.strip().split(".")
                conf.append((key[:-1], key[-1], value))

            else:
                # url
                if " #" in line or "\t#" in line:
                    # strip a trailing comment;
                    # the pattern is compiled on first use only
                    if strip_comment is None:
                        import re
                        strip_comment = re.compile(r"\s+#.*").sub
                    line = strip_comment("", line)
                if gconf or lconf:
                    url = ExtendedUrl(line, gconf, lconf)
                    gconf = []
                    lconf = []
                else:
                    url = line

                if action:
                    indicies.append(n)
                    append((url, path, action, indicies))
                    indicies = []
                else:
                    append(url)

    def progress(self, pformat=True):
        """Enable progress output for every URL returned by iteration.

        'pformat' is either True to use the default format or a format
        string with the fields {current}, {total} and {url}.
        """
        if pformat is True:
            pformat = "[{current}/{total}] {url}\n"
        else:
            pformat += "\n"
        self._pformat = pformat.format_map

    def next(self):
        """Advance to the next URL"""
        self._index += 1

    def success(self):
        """Mark the current URL as successfully processed"""
        if self._item:
            self._rewrite()

    def error(self):
        """Mark the current URL as failed.

        Does nothing unless an 'err' logger is set. Otherwise the
        original lines of the current input (or the URL itself if it
        has no rewrite action) are sent to 'err', and the input file
        gets updated the same way as on success.
        """
        if self.err:
            if self._item:
                url, path, action, indicies = self._item
                lines = self.files[path]
                # collect the original lines before they get rewritten
                out = "".join(lines[i] for i in indicies)
                if out and out[-1] == "\n":
                    out = out[:-1]
                self._rewrite()
            else:
                out = str(self._url)
            self.err.info(out)

    def _rewrite(self):
        """Apply the current item's action and write its file back"""
        url, path, action, indicies = self._item
        lines = self.files[path]
        action(lines, indicies)
        try:
            with open(path, "w", encoding="utf-8") as fp:
                fp.writelines(lines)
        except Exception as exc:
            # best effort: a file that cannot be updated
            # must not interrupt processing
            self.log.warning(
                "Unable to update '%s' (%s: %s)",
                path, exc.__class__.__name__, exc)

    @staticmethod
    def _action_comment(lines, indicies):
        """Comment out the given lines"""
        for i in indicies:
            lines[i] = "# " + lines[i]

    @staticmethod
    def _action_delete(lines, indicies):
        """Blank out the given lines"""
        for i in indicies:
            lines[i] = ""

    def __iter__(self):
        self._index = 0
        return self

    def __next__(self):
        try:
            url = self.urls[self._index]
        except IndexError:
            raise StopIteration from None

        if isinstance(url, tuple):
            # entry from a file with a rewrite action
            self._item = url
            url = url[0]
        else:
            self._item = None
        self._url = url

        if self._pformat:
            output.stderr_write(self._pformat({
                "total"  : len(self.urls),
                "current": self._index + 1,
                "url"    : url,
            }))
        return url
|
2023-11-14 20:38:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
class ExtendedUrl:
    """An URL bundled with the config options that accompany it.

    'value' holds the URL itself, while 'gconfig' and 'lconfig' hold
    its global and local option entries respectively.
    """
    __slots__ = ("value", "gconfig", "lconfig")

    def __init__(self, url, gconf, lconf):
        self.value, self.gconfig, self.lconfig = url, gconf, lconf

    def __str__(self):
        # the string form is just the URL itself
        return self.value
|