2015-04-05 16:23:20 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-01-26 14:59:24 +01:00
|
|
|
# Copyright 2015-2023 Mike Fährmann
|
2015-04-05 16:23:20 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
import sys
|
2019-12-18 22:08:53 +01:00
|
|
|
import errno
|
2018-02-01 20:49:41 +01:00
|
|
|
import logging
|
2021-06-04 18:08:08 +02:00
|
|
|
import functools
|
2020-11-18 17:11:55 +01:00
|
|
|
import collections
|
2018-05-20 22:03:57 +02:00
|
|
|
from . import extractor, downloader, postprocessor
|
2022-11-27 16:09:42 +01:00
|
|
|
from . import config, text, util, path, formatter, output, exception, version
|
2015-11-24 19:47:51 +01:00
|
|
|
from .extractor.message import Message
|
2022-05-19 13:24:37 +02:00
|
|
|
from .output import stdout_write
|
2015-04-05 16:23:20 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class Job():
    """Base class for Job types

    A job iterates over the messages produced by one extractor and
    forwards each of them to the matching 'handle_*' method.
    The handlers defined here do nothing; subclasses override them.
    """
    # logger used by _write_unsupported(); nothing is written while None
    ulog = None

    def __init__(self, extr, parent=None):
        """Prepare a job for 'extr'

        extr   -- extractor object, or a URL string that gets resolved
                  with extractor.find()
        parent -- optional Job object this job was spawned from

        Raises exception.NoExtractorError if 'extr' ends up being falsy.
        """
        if isinstance(extr, str):
            extr = extractor.find(extr)
        if not extr:
            raise exception.NoExtractorError()

        self.extractor = extr
        self.pathfmt = None
        self.kwdict = {}
        self.status = 0

        # list of (category, subcategory) tuples stored as extr._cfgpath
        cfgpath = []
        if parent:
            # extend the chain of parent categories, unless this
            # extractor's category is already part of it
            if extr.category == parent.extractor.category or \
                    extr.category in parent.parents:
                parents = parent.parents
            else:
                parents = parent.parents + (parent.extractor.category,)

            if parents:
                # "parent>child" entries first, plain category last
                for category in parents:
                    cat = "{}>{}".format(category, extr.category)
                    cfgpath.append((cat, extr.subcategory))
                cfgpath.append((extr.category, extr.subcategory))
                self.parents = parents
            else:
                self.parents = ()
        else:
            self.parents = ()

        if extr.basecategory:
            if not cfgpath:
                cfgpath.append((extr.category, extr.subcategory))
            cfgpath.append((extr.basecategory, extr.subcategory))

        if cfgpath:
            # more than one lookup location:
            # switch the extractor to its '_config_shared*' functions
            extr._cfgpath = cfgpath
            extr.config = extr._config_shared
            extr.config_accumulate = extr._config_shared_accumulate

        actions = extr.config("actions")
        if actions:
            from .actions import parse
            self._logger_actions = parse(actions)
            # shadow the method on this instance only
            self._wrap_logger = self._wrap_logger_actions

        path_proxy = output.PathfmtProxy(self)
        self._logger_extra = {
            "job"      : self,
            "extractor": extr,
            "path"     : path_proxy,
            "keywords" : output.KwdictProxy(self),
        }
        extr.log = self._wrap_logger(extr.log)
        extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)

        # data from parent job
        if parent:
            pextr = parent.extractor

            # transfer (sub)category
            if pextr.config("category-transfer", pextr.categorytransfer):
                extr._cfgpath = pextr._cfgpath
                extr.category = pextr.category
                extr.subcategory = pextr.subcategory

        # names of the keyword-dict fields to fill with extra metadata;
        # a falsy value leaves the respective field out
        self.metadata_url = extr.config2("metadata-url", "url-metadata")
        self.metadata_http = extr.config2("metadata-http", "http-metadata")
        metadata_path = extr.config2("metadata-path", "path-metadata")
        metadata_version = extr.config2("metadata-version", "version-metadata")
        metadata_extractor = extr.config2(
            "metadata-extractor", "extractor-metadata")

        if metadata_path:
            self.kwdict[metadata_path] = path_proxy
        if metadata_extractor:
            self.kwdict[metadata_extractor] = extr
        if metadata_version:
            self.kwdict[metadata_version] = {
                "version"         : version.__version__,
                "is_executable"   : util.EXECUTABLE,
                "current_git_head": util.git_head()
            }
        # user-supplied metadata
        kwdict = extr.config("keywords")
        if kwdict:
            self.kwdict.update(kwdict)

    def run(self):
        """Execute or run the job

        Returns 'self.status', which has bits set for every kind of
        error encountered (0 if there were none).
        TerminateExtraction, RestartExtraction, and everything that is
        not an 'Exception' subclass get passed on to the caller.
        """
        extractor = self.extractor
        log = extractor.log
        msg = None

        self._init()

        # sleep before extractor start
        sleep = util.build_duration_func(
            extractor.config("sleep-extractor"))
        if sleep:
            extractor.sleep(sleep(), "extractor")

        try:
            for msg in extractor:
                self.dispatch(msg)
        except exception.StopExtraction as exc:
            if exc.message:
                log.error(exc.message)
            self.status |= exc.code
        except (exception.TerminateExtraction, exception.RestartExtraction):
            raise
        except exception.GalleryDLException as exc:
            log.error("%s: %s", exc.__class__.__name__, exc)
            self.status |= exc.code
        except OSError as exc:
            log.error("Unable to download data: %s: %s",
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 128
        except Exception as exc:
            log.error(("An unexpected error occurred: %s - %s. "
                       "Please run gallery-dl again with the --verbose flag, "
                       "copy its output and report this issue on "
                       "https://github.com/mikf/gallery-dl/issues ."),
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 1
        except BaseException:
            self.status |= 1
            raise
        else:
            # 'msg' is still None if the extractor yielded nothing at all
            if msg is None:
                log.info("No results for %s", extractor.url)
        finally:
            # always finalize the extractor,
            # even when job finalization itself raises an exception
            try:
                self.handle_finalize()
            finally:
                extractor.finalize()

        return self.status

    def dispatch(self, msg):
        """Call the appropriate message handler

        'msg' is a sequence whose first element is a Message type;
        messages of any other type than Url, Directory, or Queue
        are ignored.
        """
        if msg[0] == Message.Url:
            _, url, kwdict = msg
            if self.metadata_url:
                kwdict[self.metadata_url] = url
            if self.pred_url(url, kwdict):
                self.update_kwdict(kwdict)
                self.handle_url(url, kwdict)

        elif msg[0] == Message.Directory:
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif msg[0] == Message.Queue:
            _, url, kwdict = msg
            if self.metadata_url:
                kwdict[self.metadata_url] = url
            if self.pred_queue(url, kwdict):
                self.handle_queue(url, kwdict)

    def handle_url(self, url, kwdict):
        """Handle Message.Url"""

    def handle_directory(self, kwdict):
        """Handle Message.Directory"""

    def handle_queue(self, url, kwdict):
        """Handle Message.Queue"""

    def handle_finalize(self):
        """Handle job finalization"""

    def update_kwdict(self, kwdict):
        """Update 'kwdict' with additional metadata"""
        extr = self.extractor
        kwdict["category"] = extr.category
        kwdict["subcategory"] = extr.subcategory
        if self.metadata_http:
            # drop any existing value stored under the HTTP metadata key
            kwdict.pop(self.metadata_http, None)
        if self.kwdict:
            # job-level entries override entries of the same name
            kwdict.update(self.kwdict)

    def _init(self):
        """Initialize the extractor and build the message predicates"""
        self.extractor.initialize()
        self.pred_url = self._prepare_predicates("image", True)
        self.pred_queue = self._prepare_predicates("chapter", False)

    def _prepare_predicates(self, target, skip=True):
        """Build a predicate from the '<target>-*' config options

        target -- option name prefix ("image" and "chapter" in _init())
        skip   -- allow fast-forwarding with extractor.skip()
                  to the start of a configured range
        """
        predicates = []

        if self.extractor.config(target + "-unique"):
            predicates.append(util.UniquePredicate())

        pfilter = self.extractor.config(target + "-filter")
        if pfilter:
            try:
                pred = util.FilterPredicate(pfilter, target)
            except (SyntaxError, ValueError, TypeError) as exc:
                # an invalid filter gets reported and then ignored
                self.extractor.log.warning(exc)
            else:
                predicates.append(pred)

        prange = self.extractor.config(target + "-range")
        if prange:
            try:
                pred = util.RangePredicate(prange)
            except ValueError as exc:
                self.extractor.log.warning(
                    "invalid %s range: %s", target, exc)
            else:
                # only fast-forward when no filter is configured
                if skip and pred.lower > 1 and not pfilter:
                    pred.index += self.extractor.skip(pred.lower - 1)
                predicates.append(pred)

        return util.build_predicate(predicates)

    def get_logger(self, name):
        """Return the logger called 'name', wrapped for this job"""
        return self._wrap_logger(logging.getLogger(name))

    def _wrap_logger(self, logger):
        """Wrap 'logger' in an output.LoggerAdapter bound to this job"""
        return output.LoggerAdapter(logger, self)

    def _wrap_logger_actions(self, logger):
        """Like _wrap_logger(), but using output.LoggerAdapterActions

        Replaces _wrap_logger() when the 'actions' option is set.
        """
        return output.LoggerAdapterActions(logger, self)

    def _write_unsupported(self, url):
        """Send 'url' to the 'ulog' logger, if there is one"""
        if self.ulog:
            self.ulog.info(url)
|
2017-05-27 16:16:57 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class DownloadJob(Job):
    """Download images into appropriate directory/filename locations"""

    def __init__(self, url, parent=None):
        """Set up download state; most of it gets filled by initialize()

        A child job shares the set of visited URLs with its parent.
        """
        Job.__init__(self, url, parent)
        self.log = self.get_logger("download")
        self.fallback = None
        self.archive = None
        self.sleep = None
        self.hooks = ()
        self.downloaders = {}
        self.out = output.select()
        self.visited = parent.visited if parent else set()
        self._extractor_filter = None
        self._skipcnt = 0
        # defaults for the values handle_skip() reads;
        # initialize() only assigns them when 'skip' is enabled
        self._skipexc = None
        self._skipmax = 0

    def handle_url(self, url, kwdict):
        """Download the resource specified in 'url'"""
        hooks = self.hooks
        pathfmt = self.pathfmt
        archive = self.archive

        # prepare download
        pathfmt.set_filename(kwdict)

        if "prepare" in hooks:
            for callback in hooks["prepare"]:
                callback(pathfmt)

        # skip files the archive reports as known
        if archive and archive.check(kwdict):
            pathfmt.fix_extension()
            self.handle_skip()
            return

        # the path can only be checked in advance when the filename
        # extension is known and no HTTP metadata is requested
        if pathfmt.extension and not self.metadata_http:
            pathfmt.build_path()

            if pathfmt.exists():
                if archive:
                    archive.add(kwdict)
                self.handle_skip()
                return

        if "prepare-after" in hooks:
            for callback in hooks["prepare-after"]:
                callback(pathfmt)

        if self.sleep:
            self.extractor.sleep(self.sleep(), "download")

        # download from URL
        if not self.download(url):

            # use fallback URLs if available/enabled
            fallback = kwdict.get("_fallback", ()) if self.fallback else ()
            for num, url in enumerate(fallback, 1):
                util.remove_file(pathfmt.temppath)
                self.log.info("Trying fallback URL #%d", num)
                if self.download(url):
                    break
            else:
                # download failed
                self.status |= 4
                self.log.error("Failed to download %s",
                               pathfmt.filename or url)
                return

        # an empty 'temppath' after a successful download() call
        # gets treated like a skipped file
        if not pathfmt.temppath:
            if archive:
                archive.add(kwdict)
            self.handle_skip()
            return

        # run post processors
        if "file" in hooks:
            for callback in hooks["file"]:
                callback(pathfmt)

        # download succeeded
        pathfmt.finalize()
        self.out.success(pathfmt.path)
        self._skipcnt = 0
        if archive:
            archive.add(kwdict)
        if "after" in hooks:
            for callback in hooks["after"]:
                callback(pathfmt)

    def handle_directory(self, kwdict):
        """Set and create the target directory for downloads"""
        if not self.pathfmt:
            # first directory message: run the delayed initialization
            self.initialize(kwdict)
        else:
            # close off the previous directory before switching
            if "post-after" in self.hooks:
                for callback in self.hooks["post-after"]:
                    callback(self.pathfmt)
            self.pathfmt.set_directory(kwdict)
        if "post" in self.hooks:
            for callback in self.hooks["post"]:
                callback(self.pathfmt)

    def handle_queue(self, url, kwdict):
        """Run a child job for 'url', unless it has been visited before

        URLs without a usable extractor are passed to
        _write_unsupported().
        """
        if url in self.visited:
            return
        self.visited.add(url)

        cls = kwdict.get("_extractor")
        if cls:
            extr = cls.from_url(url)
        else:
            extr = extractor.find(url)
            if extr:
                # the filter is built once and then reused
                if self._extractor_filter is None:
                    self._extractor_filter = self._build_extractor_filter()
                if not self._extractor_filter(extr):
                    extr = None

        if extr:
            job = self.__class__(extr, self)
            pfmt = self.pathfmt
            pextr = self.extractor

            if pfmt and pextr.config("parent-directory"):
                extr._parentdir = pfmt.directory
            else:
                extr._parentdir = pextr._parentdir

            pmeta = pextr.config2("parent-metadata", "metadata-parent")
            if pmeta:
                if isinstance(pmeta, str):
                    # store parent metadata in its own field called 'pmeta'
                    data = self.kwdict.copy()
                    if kwdict:
                        data.update(kwdict)
                    job.kwdict[pmeta] = data
                else:
                    # merge parent metadata into the child's metadata
                    if self.kwdict:
                        job.kwdict.update(self.kwdict)
                    if kwdict:
                        job.kwdict.update(kwdict)

            # run the child job again for as long as
            # it raises RestartExtraction
            while True:
                try:
                    if pextr.config("parent-skip"):
                        # share the skip counter with the child job
                        job._skipcnt = self._skipcnt
                        status = job.run()
                        self._skipcnt = job._skipcnt
                    else:
                        status = job.run()

                    if status:
                        self.status |= status
                        # child job reported an error:
                        # download the next fallback URL directly
                        if "_fallback" in kwdict and self.fallback:
                            fallback = kwdict["_fallback"] = \
                                iter(kwdict["_fallback"])
                            try:
                                url = next(fallback)
                            except StopIteration:
                                pass
                            else:
                                text.nameext_from_url(url, kwdict)
                                if url.startswith("ytdl:"):
                                    kwdict["extension"] = ""
                                self.handle_url(url, kwdict)
                    break
                except exception.RestartExtraction:
                    pass

        else:
            self._write_unsupported(url)

    def handle_finalize(self):
        """Close the archive and run the finalization hooks"""
        if self.archive:
            self.archive.close()

        pathfmt = self.pathfmt
        if pathfmt:
            hooks = self.hooks
            if "post-after" in hooks:
                for callback in hooks["post-after"]:
                    callback(pathfmt)

            self.extractor.cookies_store()

            if "finalize" in hooks:
                for callback in hooks["finalize"]:
                    callback(pathfmt)
            # 'self.status' is non-zero if any error got recorded
            if self.status:
                if "finalize-error" in hooks:
                    for callback in hooks["finalize-error"]:
                        callback(pathfmt)
            else:
                if "finalize-success" in hooks:
                    for callback in hooks["finalize-success"]:
                        callback(pathfmt)

    def handle_skip(self):
        """Report the current file as skipped

        Raises the exception selected by the 'skip' option once the
        skip counter reaches its limit.
        """
        pathfmt = self.pathfmt
        self.out.skip(pathfmt.path)
        if "skip" in self.hooks:
            for callback in self.hooks["skip"]:
                callback(pathfmt)
        if self._skipexc:
            self._skipcnt += 1
            if self._skipcnt >= self._skipmax:
                raise self._skipexc()

    def download(self, url):
        """Download 'url'

        Returns the downloader's result, or False if there is no
        downloader for the URL's scheme or an OSError occurred.
        An OSError with errno ENOSPC is passed on to the caller.
        """
        scheme = url.partition(":")[0]
        downloader = self.get_downloader(scheme)
        if downloader:
            try:
                return downloader.download(url, self.pathfmt)
            except OSError as exc:
                if exc.errno == errno.ENOSPC:
                    raise
                self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return False
        self._write_unsupported(url)
        return False

    def get_downloader(self, scheme):
        """Return a downloader suitable for 'scheme'

        Results, including None for unsupported or disabled schemes,
        are cached in 'self.downloaders'.
        """
        try:
            return self.downloaders[scheme]
        except KeyError:
            pass

        cls = downloader.find(scheme)
        if cls and config.get(("downloader", cls.scheme), "enabled", True):
            instance = cls(self)
        else:
            instance = None
            self.log.error("'%s:' URLs are not supported/enabled", scheme)

        # "http" and "https" share one downloader instance
        if cls and cls.scheme == "http":
            self.downloaders["http"] = self.downloaders["https"] = instance
        else:
            self.downloaders[scheme] = instance
        return instance

    def initialize(self, kwdict=None):
        """Delayed initialization of PathFormat, etc."""
        extr = self.extractor
        cfg = extr.config

        pathfmt = self.pathfmt = path.PathFormat(extr)
        if kwdict:
            pathfmt.set_directory(kwdict)

        self.sleep = util.build_duration_func(cfg("sleep"))
        self.fallback = cfg("fallback", True)
        if not cfg("download", True):
            # monkey-patch method to do nothing and always return True
            self.download = pathfmt.fix_extension

        archive = cfg("archive")
        if archive:
            archive = util.expand_path(archive)
            archive_format = (cfg("archive-prefix", extr.category) +
                              cfg("archive-format", extr.archive_fmt))
            archive_pragma = (cfg("archive-pragma"))
            try:
                # an archive path containing "{" is a format string
                if "{" in archive:
                    archive = formatter.parse(archive).format_map(kwdict)
                self.archive = util.DownloadArchive(
                    archive, archive_format, archive_pragma)
            except Exception as exc:
                # continue without an archive
                extr.log.warning(
                    "Failed to open download archive at '%s' (%s: %s)",
                    archive, exc.__class__.__name__, exc)
            else:
                extr.log.debug("Using download archive '%s'", archive)

        skip = cfg("skip", True)
        if skip:
            self._skipexc = None
            if skip == "enumerate":
                pathfmt.check_file = pathfmt._enum_file
            elif isinstance(skip, str):
                # "<action>:<count>"
                skip, _, smax = skip.partition(":")
                if skip == "abort":
                    self._skipexc = exception.StopExtraction
                elif skip == "terminate":
                    self._skipexc = exception.TerminateExtraction
                elif skip == "exit":
                    self._skipexc = SystemExit
                self._skipmax = text.parse_int(smax)
        else:
            # monkey-patch methods to always return False
            pathfmt.exists = lambda x=None: False
            if self.archive:
                self.archive.check = pathfmt.exists

        if not cfg("postprocess", True):
            return

        postprocessors = extr.config_accumulate("postprocessors")
        if postprocessors:
            # post processors add their callbacks via register_hooks()
            self.hooks = collections.defaultdict(list)

            pp_log = self.get_logger("postprocessor")
            pp_conf = config.get((), "postprocessor") or {}
            pp_opts = cfg("postprocessor-options")
            pp_list = []

            for pp_dict in postprocessors:
                # a string refers to an entry in the global
                # "postprocessor" section or is a module name
                if isinstance(pp_dict, str):
                    pp_dict = pp_conf.get(pp_dict) or {"name": pp_dict}
                if pp_opts:
                    # copy to keep the configured dict unmodified
                    pp_dict = pp_dict.copy()
                    pp_dict.update(pp_opts)

                # "whitelist" takes precedence over "blacklist"
                clist = pp_dict.get("whitelist")
                if clist is not None:
                    negate = False
                else:
                    clist = pp_dict.get("blacklist")
                    negate = True
                if clist and not util.build_extractor_filter(
                        clist, negate)(extr):
                    continue

                name = pp_dict.get("name")
                pp_cls = postprocessor.find(name)
                if not pp_cls:
                    pp_log.warning("module '%s' not found", name)
                    continue
                try:
                    pp_obj = pp_cls(self, pp_dict)
                except Exception as exc:
                    # a failing post processor gets left out
                    pp_log.error("'%s' initialization failed: %s: %s",
                                 name, exc.__class__.__name__, exc)
                    pp_log.debug("", exc_info=True)
                else:
                    pp_list.append(pp_obj)

            if pp_list:
                extr.log.debug("Active postprocessor modules: %s", pp_list)
                if "init" in self.hooks:
                    for callback in self.hooks["init"]:
                        callback(pathfmt)

    def register_hooks(self, hooks, options=None):
        """Add the callbacks in 'hooks' to this job's hooks

        hooks   -- dict mapping hook names to callbacks
        options -- optional dict; a "filter" expression in it makes
                   the callbacks conditional (see _call_hook())
        """
        expr = options.get("filter") if options else None

        if expr:
            condition = util.compile_expression(expr)
            for hook, callback in hooks.items():
                self.hooks[hook].append(functools.partial(
                    self._call_hook, callback, condition))
        else:
            for hook, callback in hooks.items():
                self.hooks[hook].append(callback)

    @staticmethod
    def _call_hook(callback, condition, pathfmt):
        """Call 'callback' if 'condition' holds for pathfmt's kwdict"""
        if condition(pathfmt.kwdict):
            callback(pathfmt)

    def _build_extractor_filter(self):
        """Build the filter for extractors spawned by handle_queue()

        Uses the "whitelist" option if it is set and "blacklist"
        otherwise; without either, this extractor's own category
        serves as blacklist.
        """
        clist = self.extractor.config("whitelist")
        if clist is not None:
            negate = False
            special = None
        else:
            clist = self.extractor.config("blacklist")
            negate = True
            special = util.SPECIAL_EXTRACTORS
            if clist is None:
                clist = (self.extractor.category,)

        return util.build_extractor_filter(clist, negate, special)
|
2020-09-10 22:54:10 +02:00
|
|
|
|
2015-11-13 01:02:49 +01:00
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
class SimulationJob(DownloadJob):
    """Simulate the extraction process without downloading anything"""

    def handle_url(self, url, kwdict):
        """Add 'kwdict' to the archive and report its filename as skipped"""
        # use "jpg" when the filename extension is empty
        if not kwdict["extension"]:
            kwdict["extension"] = "jpg"

        sleep = self.sleep
        if sleep:
            self.extractor.sleep(sleep(), "download")

        archive = self.archive
        if archive:
            archive.add(kwdict)

        filename = self.pathfmt.build_filename(kwdict)
        self.out.skip(filename)

    def handle_directory(self, kwdict):
        """Run the delayed initialization on the first directory message"""
        if self.pathfmt:
            return
        self.initialize()
|
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class KeywordJob(Job):
    """Print available keywords"""

    def __init__(self, url, parent=None):
        """Initialize the job and read the 'output.private' setting"""
        Job.__init__(self, url, parent)
        # when true, print_kwdict() also lists keys starting with "_"
        self.private = config.get(("output",), "private")

    def handle_url(self, url, kwdict):
        """Print the keywords of a file URL and stop the extraction

        Raises exception.StopExtraction after the keywords were printed.
        """
        stdout_write("\nKeywords for filenames and --filter:\n"
                     "------------------------------------\n")

        # store the result of a HEAD request under the configured key
        if self.metadata_http and url.startswith("http"):
            kwdict[self.metadata_http] = util.extract_headers(
                self.extractor.request(url, method="HEAD"))

        self.print_kwdict(kwdict)
        raise exception.StopExtraction()

    def handle_directory(self, kwdict):
        """Print the keywords available for directory names"""
        stdout_write("Keywords for directory names:\n"
                     "-----------------------------\n")
        self.print_kwdict(kwdict)

    def handle_queue(self, url, kwdict):
        """Print the keywords of a queued URL and stop the extraction

        When 'kwdict' names an extractor class under "_extractor", a
        nested KeywordJob is run for it as well.

        Raises exception.StopExtraction at the end in every case.
        """
        extr = None
        if "_extractor" in kwdict:
            extr = kwdict["_extractor"].from_url(url)

        if not util.filter_dict(kwdict):
            # nothing left to show after filtering
            self.extractor.log.info(
                "This extractor only spawns other extractors "
                "and does not provide any metadata on its own.")

            if extr:
                self.extractor.log.info(
                    "Showing results for '%s' instead:\n", url)
                KeywordJob(extr, self).run()
            else:
                self.extractor.log.info(
                    "Try 'gallery-dl -K \"%s\"' instead.", url)
        else:
            stdout_write("Keywords for --chapter-filter:\n"
                         "------------------------------\n")
            self.print_kwdict(kwdict)
            if extr or self.extractor.categorytransfer:
                stdout_write("\n")
                KeywordJob(extr or url, self).run()
        raise exception.StopExtraction()

    def print_kwdict(self, kwdict, prefix="", markers=None):
        """Print key-value pairs in 'kwdict' with formatting

        kwdict:  dict whose entries get written to sys.stdout
        prefix:  text put in front of every key; non-empty only for
                 nested dicts, where it ends in "['"
        markers: set of id() values of the dicts currently being
                 printed, used to detect circular references
        """
        write = sys.stdout.write
        suffix = "']" if prefix else ""

        markerid = id(kwdict)
        if markers is None:
            markers = {markerid}
        elif markerid in markers:
            # prefix[:-2] strips the trailing "['" of the nested prefix
            write("{}\n <circular reference>\n".format(prefix[:-2]))
            return  # ignore circular reference
        else:
            markers.add(markerid)

        for key, value in sorted(kwdict.items()):
            # startswith() instead of key[0], so that an empty-string
            # key does not raise an IndexError
            if key.startswith("_") and not self.private:
                continue
            key = prefix + key + suffix

            if isinstance(value, dict):
                self.print_kwdict(value, key + "['", markers)

            elif isinstance(value, list):
                if not value:
                    pass
                elif isinstance(value[0], dict):
                    # only the first element of a list of dicts is shown
                    self.print_kwdict(value[0], key + "[N]['", markers)
                else:
                    # right-align indices to the width of the list length
                    fmt = (" {:>%s} {}\n" % len(str(len(value)))).format
                    write(key + "[N]\n")
                    for idx, val in enumerate(value):
                        write(fmt(idx, val))

            else:
                # string or number
                write("{}\n {}\n".format(key, value))

        # this dict is done; it may legitimately appear again elsewhere
        markers.remove(markerid)
|
|
|
|
|
2015-12-10 02:14:28 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class UrlJob(Job):
    """Write download URLs to stdout"""
    maxdepth = 1

    def __init__(self, url, parent=None, depth=1):
        """Create a job at nesting level 'depth'"""
        Job.__init__(self, url, parent)
        self.depth = depth
        # once 'maxdepth' is reached, queued URLs are printed
        # like file URLs instead of being followed
        if self.maxdepth <= depth:
            self.handle_queue = self.handle_url

    @staticmethod
    def handle_url(url, _):
        """Print 'url' on a line of its own"""
        line = url + "\n"
        stdout_write(line)

    @staticmethod
    def handle_url_fallback(url, kwdict):
        """Print 'url' followed by its "_fallback" URLs, if there are any"""
        stdout_write(url + "\n")
        for alternative in kwdict.get("_fallback", ()):
            stdout_write("| " + alternative + "\n")

    def handle_queue(self, url, kwdict):
        """Run a nested job of the same class for a queued URL"""
        extr_cls = kwdict.get("_extractor")
        extr = extr_cls.from_url(url) if extr_cls else extractor.find(url)

        if not extr:
            self._write_unsupported(url)
            return
        self.status |= self.__class__(extr, self, self.depth + 1).run()
|
2017-05-23 11:48:00 +02:00
|
|
|
|
2015-12-12 01:16:02 +01:00
|
|
|
|
2021-03-02 23:59:34 +01:00
|
|
|
class InfoJob(Job):
    """Show an extractor's default values and its configured settings"""

    def run(self):
        """Print category names and format settings; always returns 0"""
        extr = self.extractor

        if extr.basecategory:
            self._print_multi(
                "Category / Subcategory / Basecategory",
                extr.category, extr.subcategory, extr.basecategory)
        else:
            self._print_multi(
                "Category / Subcategory", extr.category, extr.subcategory)

        # (title, option name, extractor attribute holding the default)
        settings = (
            ("Filename format", "filename", "filename_fmt"),
            ("Directory format", "directory", "directory_fmt"),
            ("Archive format", "archive-format", "archive_fmt"),
            ("Request interval", "sleep-request", "request_interval"),
        )
        for title, optname, attribute in settings:
            self._print_config(title, optname, getattr(extr, attribute))

        return 0

    def _print_multi(self, title, *values):
        """Print 'title' and all 'values', each one passed to json_dumps"""
        dumped = " / ".join(util.json_dumps(value) for value in values)
        stdout_write("{}\n {}\n\n".format(title, dumped))

    def _print_config(self, title, optname, value):
        """Print the config value of 'optname' next to its default 'value'"""
        custom = self.extractor.config(optname, util.SENTINEL)

        if custom is util.SENTINEL:
            # config() handed back the sentinel: show the default only,
            # and only when it is not empty
            if value:
                stdout_write(
                    "{} (default):\n {}\n\n".format(
                        title, util.json_dumps(value)))
            return

        stdout_write(
            "{} (custom):\n {}\n{} (default):\n {}\n\n".format(
                title, util.json_dumps(custom),
                title, util.json_dumps(value)))
|
2021-03-02 23:59:34 +01:00
|
|
|
|
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
class DataJob(Job):
|
|
|
|
"""Collect extractor results and dump them"""
|
|
|
|
|
2018-11-15 14:24:18 +01:00
|
|
|
def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
    """Set up the output file, the result list, and the metadata filter"""
    # NOTE: the 'file' default is the sys.stdout object that existed
    # when this function was defined
    Job.__init__(self, url, parent)

    self.data = []
    self.file = file
    self.ascii = config.get(("output",), "ascii", ensure_ascii)

    # with 'output.private' enabled, metadata gets copied unfiltered
    if config.get(("output",), "private"):
        self.filter = dict.copy
    else:
        self.filter = util.filter_dict
|
2019-11-21 16:57:39 +01:00
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
def run(self):
    """Collect all extractor messages and dump them to 'self.file'

    Errors raised while collecting are recorded as a
    (exception name, message) entry instead of being propagated.
    Always returns 0.
    """
    self._init()

    extractor = self.extractor
    sleep = util.build_duration_func(
        extractor.config("sleep-extractor"))
    if sleep:
        extractor.sleep(sleep(), "extractor")

    # collect data
    try:
        for msg in extractor:
            self.dispatch(msg)
    except exception.StopExtraction:
        pass
    except Exception as exc:
        self.data.append((exc.__class__.__name__, str(exc)))
    except BaseException:
        # e.g. KeyboardInterrupt: keep going, so that everything
        # collected up to this point still gets dumped below
        pass

    # convert numbers to string
    if config.get(("output",), "num-to-str", False):
        for msg in self.data:
            # error entries end in a plain string instead of a
            # metadata dict and must not be passed to transform_dict()
            if isinstance(msg[-1], dict):
                util.transform_dict(msg[-1], util.number_to_string)

    # dump to 'file'
    try:
        util.dump_json(self.data, self.file, self.ascii, 2)
        self.file.flush()
    except Exception:
        # best effort: errors while writing the output are ignored
        pass

    return 0
|
2017-11-18 17:35:57 +01:00
|
|
|
|
2019-02-13 13:22:11 +01:00
|
|
|
def handle_url(self, url, kwdict):
    """Record a Url message with 'url' and its filtered metadata"""
    entry = (Message.Url, url, self.filter(kwdict))
    self.data.append(entry)
|
2017-11-18 17:35:57 +01:00
|
|
|
|
2019-02-13 13:22:11 +01:00
|
|
|
def handle_directory(self, kwdict):
    """Record a Directory message with its filtered metadata"""
    entry = (Message.Directory, self.filter(kwdict))
    self.data.append(entry)
|
2017-11-18 17:35:57 +01:00
|
|
|
|
2019-02-13 13:22:11 +01:00
|
|
|
def handle_queue(self, url, kwdict):
    """Record a Queue message with 'url' and its filtered metadata"""
    entry = (Message.Queue, url, self.filter(kwdict))
    self.data.append(entry)
|