# -*- coding: utf-8 -*-

# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

import sys
import errno
import logging
import functools
import collections

from . import (
    extractor,
    downloader,
    postprocessor,
    archive,
    config,
    exception,
    formatter,
    output,
    path,
    text,
    util,
    version,
)
from .extractor.message import Message

stdout_write = output.stdout_write
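
# A typical entry point constructs a Job subclass from a URL or an extractor
# instance and runs it, e.g. (illustrative):
#
#   status = DownloadJob("https://example.org/gallery/123").run()
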
class Job():
    """Base class for Job types"""
    ulog = None
    _logger_adapter = output.LoggerAdapter

    def __init__(self, extr, parent=None):
        if isinstance(extr, str):
            extr = extractor.find(extr)
        if not extr:
            raise exception.NoExtractorError()

        self.extractor = extr
        self.pathfmt = None
        self.status = 0
        self.kwdict = {}
        self.kwdict_eval = False

        cfgpath = []
        if parent:
            if extr.category == parent.extractor.category or \
                    extr.category in parent.parents:
                parents = parent.parents
            else:
                parents = parent.parents + (parent.extractor.category,)

            if parents:
                for category in parents:
                    cat = "{}>{}".format(category, extr.category)
                    cfgpath.append((cat, extr.subcategory))
                cfgpath.append((extr.category, extr.subcategory))
                self.parents = parents
            else:
                self.parents = ()
        else:
            self.parents = ()

        if extr.basecategory:
            if not cfgpath:
                cfgpath.append((extr.category, extr.subcategory))
            cfgpath.append((extr.basecategory, extr.subcategory))

        if cfgpath:
            extr._cfgpath = cfgpath
            extr.config = extr._config_shared
            extr.config_accumulate = extr._config_shared_accumulate
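
        # Each (category, subcategory) pair in 'cfgpath' is a config lookup
        # path shared between parent and child jobs; for a child of another
        # category, extra "parent>child" paths are added as well, e.g. a
        # hypothetical ("reddit>imgur", "image") entry for an imgur job
        # spawned from a reddit extractor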
        actions = extr.config("actions")
        if actions:
            from .actions import LoggerAdapter, parse
            self._logger_adapter = LoggerAdapter
            self._logger_actions = parse(actions)

        path_proxy = output.PathfmtProxy(self)
        self._logger_extra = {
            "job"      : self,
            "extractor": extr,
            "path"     : path_proxy,
            "keywords" : output.KwdictProxy(self),
        }
        extr.log = self._wrap_logger(extr.log)
        extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)

        # data from parent job
        if parent:
            pextr = parent.extractor

            # transfer (sub)category
            if pextr.config("category-transfer", pextr.categorytransfer):
                extr._cfgpath = pextr._cfgpath
                extr.category = pextr.category
                extr.subcategory = pextr.subcategory

        self.metadata_url = extr.config2("metadata-url", "url-metadata")
        self.metadata_http = extr.config2("metadata-http", "http-metadata")
        metadata_path = extr.config2("metadata-path", "path-metadata")
        metadata_version = extr.config2("metadata-version", "version-metadata")
        metadata_extractor = extr.config2(
            "metadata-extractor", "extractor-metadata")

        if metadata_path:
            self.kwdict[metadata_path] = path_proxy
        if metadata_extractor:
            self.kwdict[metadata_extractor] = extr
        if metadata_version:
            self.kwdict[metadata_version] = {
                "version"         : version.__version__,
                "is_executable"   : util.EXECUTABLE,
                "current_git_head": util.git_head()
            }

        # user-supplied metadata
        kwdict = extr.config("keywords")
        if kwdict:
            if extr.config("keywords-eval"):
                self.kwdict_eval = []
                for key, value in kwdict.items():
                    if isinstance(value, str):
                        fmt = formatter.parse(value, None, util.identity)
                        self.kwdict_eval.append((key, fmt.format_map))
                    else:
                        self.kwdict[key] = value
            else:
                self.kwdict.update(kwdict)
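
        # With "keywords-eval" enabled, string values in "keywords" act as
        # format strings that update_kwdict() re-evaluates per file, e.g. a
        # hypothetical entry "keywords": {"bkey": "{category}_{subcategory}"}
        # yields a per-file value instead of a constant string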

    def run(self):
        """Run the job and return its exit status"""
        extractor = self.extractor
        log = extractor.log
        msg = None

        self._init()

        # sleep before extractor start
        sleep = util.build_duration_func(
            extractor.config("sleep-extractor"))
        if sleep:
            extractor.sleep(sleep(), "extractor")
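
        # Any exception below is folded into 'self.status' as a bit flag:
        # 1 for unexpected errors, 128 for OS errors, and the 'code' of
        # GalleryDLException subclasses; DownloadJob additionally sets 4
        # for failed downloads. The final value is returned as exit status.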
        try:
            for msg in extractor:
                self.dispatch(msg)
        except exception.StopExtraction as exc:
            if exc.message:
                log.error(exc.message)
            self.status |= exc.code
        except (exception.TerminateExtraction, exception.RestartExtraction):
            raise
        except exception.GalleryDLException as exc:
            log.error("%s: %s", exc.__class__.__name__, exc)
            log.debug("", exc_info=exc)
            self.status |= exc.code
        except OSError as exc:
            log.error("Unable to download data: %s: %s",
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=exc)
            self.status |= 128
        except Exception as exc:
            log.error(("An unexpected error occurred: %s - %s. "
                       "Please run gallery-dl again with the --verbose flag, "
                       "copy its output and report this issue on "
                       "https://github.com/mikf/gallery-dl/issues ."),
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=exc)
            self.status |= 1
        except BaseException:
            self.status |= 1
            raise
        else:
            if msg is None:
                log.info("No results for %s", extractor.url)
        finally:
            self.handle_finalize()
            extractor.finalize()

        return self.status
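
    # Extractor messages are plain tuples:
    #   (Message.Url, url, kwdict)
    #   (Message.Directory, kwdict)
    #   (Message.Queue, url, kwdict)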

    def dispatch(self, msg):
        """Call the appropriate message handler"""
        if msg[0] == Message.Url:
            _, url, kwdict = msg
            if self.metadata_url:
                kwdict[self.metadata_url] = url
            if self.pred_url(url, kwdict):
                self.update_kwdict(kwdict)
                self.handle_url(url, kwdict)

        elif msg[0] == Message.Directory:
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif msg[0] == Message.Queue:
            _, url, kwdict = msg
            if self.metadata_url:
                kwdict[self.metadata_url] = url
            if self.pred_queue(url, kwdict):
                self.handle_queue(url, kwdict)

    def handle_url(self, url, kwdict):
        """Handle Message.Url"""

    def handle_directory(self, kwdict):
        """Handle Message.Directory"""

    def handle_queue(self, url, kwdict):
        """Handle Message.Queue"""

    def handle_finalize(self):
        """Handle job finalization"""

    def update_kwdict(self, kwdict):
        """Update 'kwdict' with additional metadata"""
        extr = self.extractor
        kwdict["category"] = extr.category
        kwdict["subcategory"] = extr.subcategory
        if self.metadata_http:
            kwdict.pop(self.metadata_http, None)
        if self.kwdict:
            kwdict.update(self.kwdict)
        if self.kwdict_eval:
            for key, valuegen in self.kwdict_eval:
                kwdict[key] = valuegen(kwdict)

    def _init(self):
        self.extractor.initialize()
        self.pred_url = self._prepare_predicates("image", True)
        self.pred_queue = self._prepare_predicates("chapter", False)
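
    # Predicates are built from per-target options (values illustrative):
    #   "image-unique": true            -> util.UniquePredicate
    #   "image-range" : "1-5"           -> util.RangePredicate
    #   "image-filter": "width >= 1000" -> util.FilterPredicate, a Python
    #                                      expression evaluated per kwdict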

    def _prepare_predicates(self, target, skip=True):
        predicates = []

        if self.extractor.config(target + "-unique"):
            predicates.append(util.UniquePredicate())

        pfilter = self.extractor.config(target + "-filter")
        if pfilter:
            try:
                pred = util.FilterPredicate(pfilter, target)
            except (SyntaxError, ValueError, TypeError) as exc:
                self.extractor.log.warning(exc)
            else:
                predicates.append(pred)

        prange = self.extractor.config(target + "-range")
        if prange:
            try:
                pred = util.RangePredicate(prange)
            except ValueError as exc:
                self.extractor.log.warning(
                    "invalid %s range: %s", target, exc)
            else:
                if skip and pred.lower > 1 and not pfilter:
                    pred.index += self.extractor.skip(pred.lower - 1)
                predicates.append(pred)

        return util.build_predicate(predicates)

    def get_logger(self, name):
        return self._wrap_logger(logging.getLogger(name))

    def _wrap_logger(self, logger):
        return self._logger_adapter(logger, self)

    def _write_unsupported(self, url):
        if self.ulog:
            self.ulog.info(url)


class DownloadJob(Job):
    """Download images into appropriate directory/filename locations"""

    def __init__(self, url, parent=None):
        Job.__init__(self, url, parent)
        self.log = self.get_logger("download")
        self.fallback = None
        self.archive = None
        self.sleep = None
        self.hooks = ()
        self.downloaders = {}
        self.out = output.select()
        self.visited = parent.visited if parent else set()
        self._extractor_filter = None
        self._skipcnt = 0
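
    # Postprocessor callbacks are grouped by event name in 'self.hooks'.
    # Events dispatched by this class: "init", "prepare", "prepare-after",
    # "file", "after", "skip", "error", "post", "post-after", "finalize",
    # "finalize-error", and "finalize-success".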

    def handle_url(self, url, kwdict):
        """Download the resource specified in 'url'"""
        hooks = self.hooks
        pathfmt = self.pathfmt
        archive = self.archive

        # prepare download
        pathfmt.set_filename(kwdict)

        if "prepare" in hooks:
            for callback in hooks["prepare"]:
                callback(pathfmt)

        if archive and archive.check(kwdict):
            pathfmt.fix_extension()
            self.handle_skip()
            return

        if pathfmt.extension and not self.metadata_http:
            pathfmt.build_path()

            if pathfmt.exists():
                if archive and self._archive_write_skip:
                    archive.add(kwdict)
                self.handle_skip()
                return

        if "prepare-after" in hooks:
            for callback in hooks["prepare-after"]:
                callback(pathfmt)

        if kwdict.pop("_file_recheck", False) and pathfmt.exists():
            if archive and self._archive_write_skip:
                archive.add(kwdict)
            self.handle_skip()
            return

        if self.sleep:
            self.extractor.sleep(self.sleep(), "download")

        # download from URL
        if not self.download(url):

            # use fallback URLs if available/enabled
            fallback = kwdict.get("_fallback", ()) if self.fallback else ()
            for num, url in enumerate(fallback, 1):
                util.remove_file(pathfmt.temppath)
                self.log.info("Trying fallback URL #%d", num)
                if self.download(url):
                    break
            else:
                # download failed
                self.status |= 4
                self.log.error("Failed to download %s",
                               pathfmt.filename or url)
                if "error" in hooks:
                    for callback in hooks["error"]:
                        callback(pathfmt)
                return

        if not pathfmt.temppath:
            if archive and self._archive_write_skip:
                archive.add(kwdict)
            self.handle_skip()
            return

        # run post processors
        if "file" in hooks:
            for callback in hooks["file"]:
                callback(pathfmt)

        # download succeeded
        pathfmt.finalize()
        self.out.success(pathfmt.path)
        self._skipcnt = 0
        if archive and self._archive_write_file:
            archive.add(kwdict)
        if "after" in hooks:
            for callback in hooks["after"]:
                callback(pathfmt)

    def handle_directory(self, kwdict):
        """Set and create the target directory for downloads"""
        if not self.pathfmt:
            self.initialize(kwdict)
        else:
            if "post-after" in self.hooks:
                for callback in self.hooks["post-after"]:
                    callback(self.pathfmt)
            self.pathfmt.set_directory(kwdict)
        if "post" in self.hooks:
            for callback in self.hooks["post"]:
                callback(self.pathfmt)

    def handle_queue(self, url, kwdict):
        if url in self.visited:
            return
        self.visited.add(url)

        cls = kwdict.get("_extractor")
        if cls:
            extr = cls.from_url(url)
        else:
            extr = extractor.find(url)
            if extr:
                if self._extractor_filter is None:
                    self._extractor_filter = self._build_extractor_filter()
                if not self._extractor_filter(extr):
                    extr = None

        if extr:
            job = self.__class__(extr, self)
            pfmt = self.pathfmt
            pextr = self.extractor

            if pfmt and pextr.config("parent-directory"):
                extr._parentdir = pfmt.directory
            else:
                extr._parentdir = pextr._parentdir

            pmeta = pextr.config2("parent-metadata", "metadata-parent")
            if pmeta:
                if isinstance(pmeta, str):
                    data = self.kwdict.copy()
                    if kwdict:
                        data.update(kwdict)
                    job.kwdict[pmeta] = data
                else:
                    if self.kwdict:
                        job.kwdict.update(self.kwdict)
                    if kwdict:
                        job.kwdict.update(kwdict)

            while True:
                try:
                    if pextr.config("parent-skip"):
                        job._skipcnt = self._skipcnt
                        status = job.run()
                        self._skipcnt = job._skipcnt
                    else:
                        status = job.run()

                    if status:
                        self.status |= status
                        # retry via fallback URL for any status bits other
                        # than FormatError (32) and OSError (128)
                        if (status & 95 and
                                "_fallback" in kwdict and self.fallback):
                            fallback = kwdict["_fallback"] = \
                                iter(kwdict["_fallback"])
                            try:
                                url = next(fallback)
                            except StopIteration:
                                pass
                            else:
                                text.nameext_from_url(url, kwdict)
                                if url.startswith("ytdl:"):
                                    kwdict["extension"] = ""
                                self.handle_url(url, kwdict)
                    break
                except exception.RestartExtraction:
                    pass

        else:
            self._write_unsupported(url)

    def handle_finalize(self):
        if self.archive:
            if not self.status:
                self.archive.finalize()
            self.archive.close()

        pathfmt = self.pathfmt
        if pathfmt:
            hooks = self.hooks
            if "post-after" in hooks:
                for callback in hooks["post-after"]:
                    callback(pathfmt)

            self.extractor.cookies_store()

            if "finalize" in hooks:
                for callback in hooks["finalize"]:
                    callback(pathfmt)
            if self.status:
                if "finalize-error" in hooks:
                    for callback in hooks["finalize-error"]:
                        callback(pathfmt)
            else:
                if "finalize-success" in hooks:
                    for callback in hooks["finalize-success"]:
                        callback(pathfmt)

    def handle_skip(self):
        pathfmt = self.pathfmt
        if "skip" in self.hooks:
            for callback in self.hooks["skip"]:
                callback(pathfmt)
        self.out.skip(pathfmt.path)

        if self._skipexc:
            if not self._skipftr or self._skipftr(pathfmt.kwdict):
                self._skipcnt += 1
                if self._skipcnt >= self._skipmax:
                    raise self._skipexc()
            else:
                self._skipcnt = 0

    def download(self, url):
        """Download 'url'"""
        scheme = url.partition(":")[0]
        downloader = self.get_downloader(scheme)
        if downloader:
            try:
                return downloader.download(url, self.pathfmt)
            except OSError as exc:
                if exc.errno == errno.ENOSPC:
                    raise
                self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return False
        self._write_unsupported(url)
        return False

    def get_downloader(self, scheme):
        """Return a downloader suitable for 'scheme'"""
        try:
            return self.downloaders[scheme]
        except KeyError:
            pass

        cls = downloader.find(scheme)
        if cls and config.get(("downloader", cls.scheme), "enabled", True):
            instance = cls(self)
        else:
            instance = None
            self.log.error("'%s:' URLs are not supported/enabled", scheme)

        if cls and cls.scheme == "http":
            self.downloaders["http"] = self.downloaders["https"] = instance
        else:
            self.downloaders[scheme] = instance
        return instance

    def initialize(self, kwdict=None):
        """Delayed initialization of PathFormat, etc."""
        extr = self.extractor
        cfg = extr.config

        pathfmt = self.pathfmt = path.PathFormat(extr)
        if kwdict:
            pathfmt.set_directory(kwdict)

        self.sleep = util.build_duration_func(cfg("sleep"))
        self.fallback = cfg("fallback", True)
        if not cfg("download", True):
            # monkey-patch method to do nothing and always return True
            self.download = pathfmt.fix_extension

        archive_path = cfg("archive")
        if archive_path:
            archive_path = util.expand_path(archive_path)
            archive_format = (cfg("archive-prefix", extr.category) +
                              cfg("archive-format", extr.archive_fmt))
            archive_pragma = cfg("archive-pragma")
            try:
                if "{" in archive_path:
                    archive_path = formatter.parse(
                        archive_path).format_map(kwdict)
                if cfg("archive-mode") == "memory":
                    archive_cls = archive.DownloadArchiveMemory
                else:
                    archive_cls = archive.DownloadArchive
                self.archive = archive_cls(
                    archive_path, archive_format, archive_pragma)
            except Exception as exc:
                extr.log.warning(
                    "Failed to open download archive at '%s' (%s: %s)",
                    archive_path, exc.__class__.__name__, exc)
            else:
                extr.log.debug("Using download archive '%s'", archive_path)
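
        # "archive-event" selects when an entry is written to the download
        # archive: a list or comma-separated string containing "file"
        # (after a successful download; the default) and/or "skip" (when an
        # already present file is skipped), e.g. "archive-event": "file,skip"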
        events = cfg("archive-event")
        if events is None:
            self._archive_write_file = True
            self._archive_write_skip = False
        else:
            if isinstance(events, str):
                events = events.split(",")
            self._archive_write_file = ("file" in events)
            self._archive_write_skip = ("skip" in events)
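
        # "skip" accepts true/false, "enumerate", or "abort"/"terminate"/
        # "exit", optionally with a ":N" suffix; e.g. "skip": "abort:3"
        # raises StopExtraction after 3 consecutive skipped files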
        skip = cfg("skip", True)
        if skip:
            self._skipexc = None
            if skip == "enumerate":
                pathfmt.check_file = pathfmt._enum_file
            elif isinstance(skip, str):
                skip, _, smax = skip.partition(":")
                if skip == "abort":
                    self._skipexc = exception.StopExtraction
                elif skip == "terminate":
                    self._skipexc = exception.TerminateExtraction
                elif skip == "exit":
                    self._skipexc = SystemExit
                self._skipmax = text.parse_int(smax)

            skip_filter = cfg("skip-filter")
            if skip_filter:
                self._skipftr = util.compile_expression(skip_filter)
            else:
                self._skipftr = None
        else:
            # monkey-patch methods to always return False
            pathfmt.exists = lambda x=None: False
            if self.archive:
                self.archive.check = pathfmt.exists

        if not cfg("postprocess", True):
            return

        postprocessors = extr.config_accumulate("postprocessors")
        if postprocessors:
            self.hooks = collections.defaultdict(list)

            pp_log = self.get_logger("postprocessor")
            pp_conf = config.get((), "postprocessor") or {}
            pp_opts = cfg("postprocessor-options")
            pp_list = []

            for pp_dict in postprocessors:
                if isinstance(pp_dict, str):
                    pp_dict = pp_conf.get(pp_dict) or {"name": pp_dict}
                if pp_opts:
                    pp_dict = pp_dict.copy()
                    pp_dict.update(pp_opts)

                clist = pp_dict.get("whitelist")
                if clist is not None:
                    negate = False
                else:
                    clist = pp_dict.get("blacklist")
                    negate = True
                if clist and not util.build_extractor_filter(
                        clist, negate)(extr):
                    continue

                name = pp_dict.get("name")
                pp_cls = postprocessor.find(name)
                if not pp_cls:
                    pp_log.warning("module '%s' not found", name)
                    continue
                try:
                    pp_obj = pp_cls(self, pp_dict)
                except Exception as exc:
                    pp_log.error("'%s' initialization failed: %s: %s",
                                 name, exc.__class__.__name__, exc)
                    pp_log.debug("", exc_info=exc)
                else:
                    pp_list.append(pp_obj)

            if pp_list:
                extr.log.debug("Active postprocessor modules: %s", pp_list)
                if "init" in self.hooks:
                    for callback in self.hooks["init"]:
                        callback(pathfmt)
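
    # 'options' here is presumably the postprocessor's config dict; its
    # "filter" option wraps each hook callback in a condition compiled from
    # a Python expression evaluated against the current file's kwdict, e.g.
    # (illustrative) {"name": "exec", "filter": "extension == 'mp4'", ...}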

    def register_hooks(self, hooks, options=None):
        expr = options.get("filter") if options else None

        if expr:
            condition = util.compile_expression(expr)
            for hook, callback in hooks.items():
                self.hooks[hook].append(functools.partial(
                    self._call_hook, callback, condition))
        else:
            for hook, callback in hooks.items():
                self.hooks[hook].append(callback)

    @staticmethod
    def _call_hook(callback, condition, pathfmt):
        if condition(pathfmt.kwdict):
            callback(pathfmt)
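
    # Queued URLs resolved via extractor.find() are filtered by category:
    # "whitelist" lists allowed categories, "blacklist" rejected ones; with
    # neither set, the extractor's own category is blacklisted (except for
    # util.SPECIAL_EXTRACTORS), presumably to avoid requeueing loops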

    def _build_extractor_filter(self):
        clist = self.extractor.config("whitelist")
        if clist is not None:
            negate = False
            special = None
        else:
            clist = self.extractor.config("blacklist")
            negate = True
            special = util.SPECIAL_EXTRACTORS
            if clist is None:
                clist = (self.extractor.category,)

        return util.build_extractor_filter(clist, negate, special)


class SimulationJob(DownloadJob):
    """Simulate the extraction process without downloading anything"""

    def handle_url(self, url, kwdict):
        if not kwdict["extension"]:
            kwdict["extension"] = "jpg"
        if self.sleep:
            self.extractor.sleep(self.sleep(), "download")
        if self.archive and self._archive_write_skip:
            self.archive.add(kwdict)
        self.out.skip(self.pathfmt.build_filename(kwdict))

    def handle_directory(self, kwdict):
        if not self.pathfmt:
            self.initialize()


class KeywordJob(Job):
    """Print available keywords"""

    def __init__(self, url, parent=None):
        Job.__init__(self, url, parent)
        self.private = config.get(("output",), "private")

    def handle_url(self, url, kwdict):
        stdout_write("\nKeywords for filenames and --filter:\n"
                     "------------------------------------\n")

        if self.metadata_http and url.startswith("http"):
            kwdict[self.metadata_http] = util.extract_headers(
                self.extractor.request(url, method="HEAD"))

        self.print_kwdict(kwdict)
        raise exception.StopExtraction()

    def handle_directory(self, kwdict):
        stdout_write("Keywords for directory names:\n"
                     "-----------------------------\n")
        self.print_kwdict(kwdict)

    def handle_queue(self, url, kwdict):
        extr = None
        if "_extractor" in kwdict:
            extr = kwdict["_extractor"].from_url(url)

        if not util.filter_dict(kwdict):
            self.extractor.log.info(
                "This extractor only spawns other extractors "
                "and does not provide any metadata on its own.")

            if extr:
                self.extractor.log.info(
                    "Showing results for '%s' instead:\n", url)
                KeywordJob(extr, self).run()
            else:
                self.extractor.log.info(
                    "Try 'gallery-dl -K \"%s\"' instead.", url)
        else:
            stdout_write("Keywords for --chapter-filter:\n"
                         "------------------------------\n")
            self.print_kwdict(kwdict)
            if extr or self.extractor.categorytransfer:
                stdout_write("\n")
                KeywordJob(extr or url, self).run()
        raise exception.StopExtraction()
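
    # 'markers' tracks the ids of dicts already on the current recursion
    # path so self-referencing metadata prints as <circular reference>
    # instead of recursing forever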
    def print_kwdict(self, kwdict, prefix="", markers=None):
        """Print key-value pairs in 'kwdict' with formatting"""
        write = sys.stdout.write
        suffix = "']" if prefix else ""

        markerid = id(kwdict)
        if markers is None:
            markers = {markerid}
        elif markerid in markers:
            write("{}\n  <circular reference>\n".format(prefix[:-2]))
            return  # ignore circular reference
        else:
            markers.add(markerid)

        for key, value in sorted(kwdict.items()):
            if key[0] == "_" and not self.private:
                continue
            key = prefix + key + suffix

            if isinstance(value, dict):
                self.print_kwdict(value, key + "['", markers)

            elif isinstance(value, list):
                if not value:
                    pass
                elif isinstance(value[0], dict):
                    self.print_kwdict(value[0], key + "[N]['", markers)
                else:
                    fmt = ("  {:>%s} {}\n" % len(str(len(value)))).format
                    write(key + "[N]\n")
                    for idx, val in enumerate(value):
                        write(fmt(idx, val))

            else:
                # string or number
                write("{}\n  {}\n".format(key, value))

        markers.remove(markerid)


class UrlJob(Job):
    """Print download urls"""
    maxdepth = 1
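
    # once 'depth' reaches 'maxdepth', queued URLs are printed as-is
    # instead of spawning a child UrlJob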

    def __init__(self, url, parent=None, depth=1):
        Job.__init__(self, url, parent)
        self.depth = depth
        if depth >= self.maxdepth:
            self.handle_queue = self.handle_url

    @staticmethod
    def handle_url(url, _):
        stdout_write(url + "\n")

    @staticmethod
    def handle_url_fallback(url, kwdict):
        stdout_write(url + "\n")
        if "_fallback" in kwdict:
            for url in kwdict["_fallback"]:
                stdout_write("| " + url + "\n")

    def handle_queue(self, url, kwdict):
        cls = kwdict.get("_extractor")
        if cls:
            extr = cls.from_url(url)
        else:
            extr = extractor.find(url)

        if extr:
            self.status |= self.__class__(extr, self, self.depth + 1).run()
        else:
            self._write_unsupported(url)


class InfoJob(Job):
    """Print extractor defaults and settings"""

    def run(self):
        ex = self.extractor
        pm = self._print_multi
        pc = self._print_config

        if ex.basecategory:
            pm("Category / Subcategory / Basecategory",
               ex.category, ex.subcategory, ex.basecategory)
        else:
            pm("Category / Subcategory", ex.category, ex.subcategory)

        pc("Filename format", "filename", ex.filename_fmt)
        pc("Directory format", "directory", ex.directory_fmt)
        pc("Archive format", "archive-format", ex.archive_fmt)
        pc("Request interval", "sleep-request", ex.request_interval)

        return 0

    def _print_multi(self, title, *values):
        stdout_write("{}\n  {}\n\n".format(
            title, " / ".join(map(util.json_dumps, values))))

    def _print_config(self, title, optname, value):
        optval = self.extractor.config(optname, util.SENTINEL)
        if optval is not util.SENTINEL:
            stdout_write(
                "{} (custom):\n  {}\n{} (default):\n  {}\n\n".format(
                    title, util.json_dumps(optval),
                    title, util.json_dumps(value)))
        elif value:
            stdout_write(
                "{} (default):\n  {}\n\n".format(
                    title, util.json_dumps(value)))


class DataJob(Job):
    """Collect extractor results and dump them"""
    resolve = False
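
    # 'resolve' is the number of Message.Queue levels to follow and replace
    # with their actual results; True maps to 128, i.e. effectively unlimited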

    def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True,
                 resolve=False):
        Job.__init__(self, url, parent)
        self.file = file
        self.data = []
        self.ascii = config.get(("output",), "ascii", ensure_ascii)
        self.resolve = 128 if resolve is True else (resolve or self.resolve)

        private = config.get(("output",), "private")
        self.filter = dict.copy if private else util.filter_dict

        if self.resolve > 0:
            self.handle_queue = self.handle_queue_resolve

    def run(self):
        self._init()

        extractor = self.extractor
        sleep = util.build_duration_func(
            extractor.config("sleep-extractor"))
        if sleep:
            extractor.sleep(sleep(), "extractor")

        # collect data
        try:
            for msg in extractor:
                self.dispatch(msg)
        except exception.StopExtraction:
            pass
        except Exception as exc:
            self.data.append((exc.__class__.__name__, str(exc)))
        except BaseException:
            pass

        # convert numbers to string
        if config.get(("output",), "num-to-str", False):
            for msg in self.data:
                util.transform_dict(msg[-1], util.number_to_string)

        if self.file:
            # dump to 'file'
            try:
                util.dump_json(self.data, self.file, self.ascii, 2)
                self.file.flush()
            except Exception:
                pass

        return 0

    def handle_url(self, url, kwdict):
        self.data.append((Message.Url, url, self.filter(kwdict)))

    def handle_directory(self, kwdict):
        self.data.append((Message.Directory, self.filter(kwdict)))

    def handle_queue(self, url, kwdict):
        self.data.append((Message.Queue, url, self.filter(kwdict)))

    def handle_queue_resolve(self, url, kwdict):
        cls = kwdict.get("_extractor")
        if cls:
            extr = cls.from_url(url)
        else:
            extr = extractor.find(url)

        if not extr:
            return self.data.append((Message.Queue, url, self.filter(kwdict)))

        job = self.__class__(extr, self, None, self.ascii, self.resolve - 1)
        job.data = self.data
        job.run()