2015-04-05 16:23:20 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2021-01-31 02:12:37 +01:00
|
|
|
# Copyright 2015-2021 Mike Fährmann
|
2015-04-05 16:23:20 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
import sys
|
2021-03-02 23:59:34 +01:00
|
|
|
import json
|
2017-12-04 17:06:17 +01:00
|
|
|
import time
|
2019-12-18 22:08:53 +01:00
|
|
|
import errno
|
2018-02-01 20:49:41 +01:00
|
|
|
import logging
|
2021-02-15 21:58:33 +01:00
|
|
|
import operator
|
2021-06-04 18:08:08 +02:00
|
|
|
import functools
|
2020-11-18 17:11:55 +01:00
|
|
|
import collections
|
2018-05-20 22:03:57 +02:00
|
|
|
from . import extractor, downloader, postprocessor
|
2018-10-13 17:21:55 +02:00
|
|
|
from . import config, text, util, output, exception
|
2015-11-24 19:47:51 +01:00
|
|
|
from .extractor.message import Message
|
2015-04-05 16:23:20 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class Job():
    """Base class for Job-types

    A job wraps a single extractor, iterates over the messages it yields
    and forwards each one to the matching 'handle_*' method. The handlers
    defined here do nothing; subclasses override the ones they need.
    """
    # logger used by _write_unsupported(); nothing is recorded while None
    ulog = None

    def __init__(self, extr, parent=None):
        """Set up the job for 'extr'

        extr:   an extractor object, or a string which gets resolved
                through extractor.find()
        parent: optional parent job whose extractor settings are reused

        Raises exception.NoExtractorError if 'extr' ends up being falsy.
        """
        if isinstance(extr, str):
            extr = extractor.find(extr)
        if not extr:
            raise exception.NoExtractorError()

        self.extractor = extr
        self.pathfmt = None
        self.kwdict = {}
        self.status = 0

        # extra data handed to every logger created by _wrap_logger()
        self._logger_extra = {
            "job": self,
            "extractor": extr,
            "path": output.PathfmtProxy(self),
            "keywords": output.KwdictProxy(self),
        }
        extr.log = self._wrap_logger(extr.log)
        extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)

        # data from parent job
        if parent:
            pextr = parent.extractor

            # transfer (sub)category
            if pextr.config("category-transfer", pextr.categorytransfer):
                extr._cfgpath = pextr._cfgpath
                extr.category = pextr.category
                extr.subcategory = pextr.subcategory

            # reuse connection adapters
            extr.session.adapters = pextr.session.adapters

        # user-supplied metadata
        kwdict = self.extractor.config("keywords")
        if kwdict:
            self.kwdict.update(kwdict)

        # predicates
        self.pred_url = self._prepare_predicates("image", True)
        self.pred_queue = self._prepare_predicates("chapter", False)

    def run(self):
        """Execute or run the job

        Dispatches every message of the extractor, converts exceptions
        into bits of 'self.status', always calls handle_finalize() and
        returns the accumulated status value.
        """
        sleep = self.extractor.config("sleep-extractor")
        if sleep:
            time.sleep(sleep)

        # bound before 'try' so that every handler below can rely on it
        log = self.extractor.log
        try:
            for msg in self.extractor:
                self.dispatch(msg)
        except exception.StopExtraction as exc:
            if exc.message:
                log.error(exc.message)
            self.status |= exc.code
        except exception.TerminateExtraction:
            # must reach the caller; only 'finally' runs for it here
            raise
        except exception.GalleryDLException as exc:
            log.error("%s: %s", exc.__class__.__name__, exc)
            self.status |= exc.code
        except OSError as exc:
            log.error("Unable to download data: %s: %s",
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 128
        except Exception as exc:
            log.error(("An unexpected error occurred: %s - %s. "
                       "Please run gallery-dl again with the --verbose flag, "
                       "copy its output and report this issue on "
                       "https://github.com/mikf/gallery-dl/issues ."),
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 1
        except BaseException:
            # e.g. KeyboardInterrupt: mark as failed and propagate
            self.status |= 1
            raise
        finally:
            self.handle_finalize()
        return self.status

    def dispatch(self, msg):
        """Call the appropriate message handler

        'msg' is a sequence whose first element is a Message type.
        Raises RuntimeError for a Message.Version other than 1.
        """
        if msg[0] == Message.Url:
            _, url, kwds = msg
            if self.pred_url(url, kwds):
                self.update_kwdict(kwds)
                self.handle_url(url, kwds)

        elif msg[0] == Message.Directory:
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif msg[0] == Message.Queue:
            _, url, kwds = msg
            if self.pred_queue(url, kwds):
                self.handle_queue(url, kwds)

        elif msg[0] == Message.Version:
            if msg[1] != 1:
                # a plain string cannot be raised in Python 3
                # (TypeError); wrap the message in a real exception
                raise RuntimeError(
                    "unsupported message-version ({}, {})".format(
                        self.extractor.category, msg[1]
                    )
                )
            # TODO: support for multiple message versions

    def handle_url(self, url, kwdict):
        """Handle Message.Url"""

    def handle_directory(self, kwdict):
        """Handle Message.Directory"""

    def handle_queue(self, url, kwdict):
        """Handle Message.Queue"""

    def handle_finalize(self):
        """Handle job finalization"""

    def update_kwdict(self, kwdict):
        """Update 'kwdict' with additional metadata

        Sets "category" and "subcategory" from the extractor and then
        applies the job's own 'kwdict', which therefore takes precedence.
        """
        extr = self.extractor
        kwdict["category"] = extr.category
        kwdict["subcategory"] = extr.subcategory
        if self.kwdict:
            kwdict.update(self.kwdict)

    def _prepare_predicates(self, target, skip=True):
        """Build the predicate for 'target' ("image" or "chapter")

        Reads the '<target>-unique', '<target>-filter' and
        '<target>-range' extractor options. Invalid filter or range
        values are reported as warnings and otherwise ignored.
        """
        predicates = []

        if self.extractor.config(target + "-unique"):
            predicates.append(util.UniquePredicate())

        pfilter = self.extractor.config(target + "-filter")
        if pfilter:
            try:
                pred = util.FilterPredicate(pfilter, target)
            except (SyntaxError, ValueError, TypeError) as exc:
                self.extractor.log.warning(exc)
            else:
                predicates.append(pred)

        prange = self.extractor.config(target + "-range")
        if prange:
            try:
                pred = util.RangePredicate(prange)
            except ValueError as exc:
                self.extractor.log.warning(
                    "invalid %s range: %s", target, exc)
            else:
                # let the extractor skip ahead itself, but only when no
                # filter is active
                if skip and pred.lower > 1 and not pfilter:
                    pred.index += self.extractor.skip(pred.lower - 1)
                predicates.append(pred)

        return util.build_predicate(predicates)

    def get_logger(self, name):
        """Return the logger called 'name', wrapped by _wrap_logger()"""
        return self._wrap_logger(logging.getLogger(name))

    def _wrap_logger(self, logger):
        """Wrap 'logger' in a LoggerAdapter carrying this job's extra data"""
        return output.LoggerAdapter(logger, self._logger_extra)

    def _write_unsupported(self, url):
        """Record 'url' through 'ulog' if such a logger is set"""
        if self.ulog:
            self.ulog.info(url)
|
2017-05-27 16:16:57 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class DownloadJob(Job):
    """Download images into appropriate directory/filename locations"""

    def __init__(self, url, parent=None):
        """Create a download job for 'url', optionally as child of 'parent'

        A child job shares the parent's set of visited URLs.
        """
        Job.__init__(self, url, parent)
        self.log = self.get_logger("download")
        self.blacklist = None
        self.archive = None
        self.sleep = None
        self.hooks = ()
        self.downloaders = {}
        self.out = output.select()
        self.visited = parent.visited if parent else set()
        self._skipcnt = 0
        # defaults for handle_skip(); initialize() only assigns these
        # when the "skip" option is enabled, yet handle_skip() can also
        # be reached with skipping disabled
        self._skipexc = None
        self._skipmax = 0

    def handle_url(self, url, kwdict):
        """Download the resource specified in 'url'"""
        hooks = self.hooks
        pathfmt = self.pathfmt
        archive = self.archive

        # prepare download
        pathfmt.set_filename(kwdict)

        if "prepare" in hooks:
            for callback in hooks["prepare"]:
                callback(pathfmt)

        if archive and archive.check(kwdict):
            pathfmt.fix_extension()
            self.handle_skip()
            return

        if pathfmt.exists():
            if archive:
                archive.add(kwdict)
            self.handle_skip()
            return

        if self.sleep:
            time.sleep(self.sleep)

        # download from URL
        if not self.download(url):

            # use fallback URLs if available
            for num, url in enumerate(kwdict.get("_fallback", ()), 1):
                util.remove_file(pathfmt.temppath)
                self.log.info("Trying fallback URL #%d", num)
                if self.download(url):
                    break
            else:
                # download failed
                self.status |= 4
                self.log.error("Failed to download %s",
                               pathfmt.filename or url)
                return

        # download() reported success without leaving a temporary path:
        # treat the file as skipped
        if not pathfmt.temppath:
            if archive:
                archive.add(kwdict)
            self.handle_skip()
            return

        # run post processors
        if "file" in hooks:
            for callback in hooks["file"]:
                callback(pathfmt)

        # download succeeded
        pathfmt.finalize()
        self.out.success(pathfmt.path, 0)
        self._skipcnt = 0
        if archive:
            archive.add(kwdict)
        if "after" in hooks:
            for callback in hooks["after"]:
                callback(pathfmt)

    def handle_directory(self, kwdict):
        """Set and create the target directory for downloads"""
        if not self.pathfmt:
            self.initialize(kwdict)
        else:
            self.pathfmt.set_directory(kwdict)
        if "post" in self.hooks:
            for callback in self.hooks["post"]:
                callback(self.pathfmt)

    def handle_queue(self, url, kwdict):
        """Run a child job for 'url' unless it was visited before

        URLs without a usable extractor go to _write_unsupported().
        """
        if url in self.visited:
            return
        self.visited.add(url)

        cls = kwdict.get("_extractor")
        if cls:
            extr = cls.from_url(url)
        else:
            extr = extractor.find(url)
            # the blacklist only applies to extractors found by URL
            if extr:
                if self.blacklist is None:
                    self.blacklist = self._build_blacklist()
                if extr.category in self.blacklist:
                    extr = None

        if extr:
            job = self.__class__(extr, self)
            pfmt = self.pathfmt
            pextr = self.extractor

            if pfmt and pextr.config("parent-directory"):
                extr._parentdir = pfmt.directory
            else:
                extr._parentdir = pextr._parentdir

            if pextr.config("parent-metadata"):
                if self.kwdict:
                    job.kwdict.update(self.kwdict)
                if kwdict:
                    job.kwdict.update(kwdict)

            if pextr.config("parent-skip"):
                # share the skip counter with the child job
                job._skipcnt = self._skipcnt
                self.status |= job.run()
                self._skipcnt = job._skipcnt
            else:
                self.status |= job.run()
        else:
            self._write_unsupported(url)

    def handle_finalize(self):
        """Close the archive, store cookies and run "finalize" hooks"""
        pathfmt = self.pathfmt
        if self.archive:
            self.archive.close()
        if pathfmt:
            self.extractor._store_cookies()
            if "finalize" in self.hooks:
                status = self.status
                for callback in self.hooks["finalize"]:
                    callback(pathfmt, status)

    def handle_skip(self):
        """Report a skipped file and enforce the configured skip limit"""
        pathfmt = self.pathfmt
        self.out.skip(pathfmt.path)
        if "skip" in self.hooks:
            for callback in self.hooks["skip"]:
                callback(pathfmt)
        if self._skipexc:
            self._skipcnt += 1
            if self._skipcnt >= self._skipmax:
                raise self._skipexc()

    def download(self, url):
        """Download 'url'

        Returns the downloader's result, or False when no downloader is
        available or an OSError other than ENOSPC occurred.
        """
        scheme = url.partition(":")[0]
        dl = self.get_downloader(scheme)
        if dl:
            try:
                return dl.download(url, self.pathfmt)
            except OSError as exc:
                # a full disk is not recoverable: let it propagate
                if exc.errno == errno.ENOSPC:
                    raise
                self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return False
        self._write_unsupported(url)
        return False

    def get_downloader(self, scheme):
        """Return a downloader suitable for 'scheme'

        Results, including None for unsupported or disabled schemes, are
        cached in 'self.downloaders'.
        """
        try:
            return self.downloaders[scheme]
        except KeyError:
            pass

        cls = downloader.find(scheme)
        if cls and config.get(("downloader", cls.scheme), "enabled", True):
            instance = cls(self)
        else:
            instance = None
            self.log.error("'%s:' URLs are not supported/enabled", scheme)

        if cls and cls.scheme == "http":
            # one instance serves both "http" and "https"
            self.downloaders["http"] = self.downloaders["https"] = instance
        else:
            self.downloaders[scheme] = instance
        return instance

    def initialize(self, kwdict=None):
        """Delayed initialization of PathFormat, etc."""
        cfg = self.extractor.config
        pathfmt = self.pathfmt = util.PathFormat(self.extractor)
        if kwdict:
            pathfmt.set_directory(kwdict)

        self.sleep = cfg("sleep")
        if not cfg("download", True):
            # monkey-patch method to do nothing and always return True
            self.download = pathfmt.fix_extension

        archive = cfg("archive")
        if archive:
            path = util.expand_path(archive)
            try:
                if "{" in path:
                    path = util.Formatter(path).format_map(kwdict)
                self.archive = util.DownloadArchive(path, self.extractor)
            except Exception as exc:
                self.extractor.log.warning(
                    "Failed to open download archive at '%s' ('%s: %s')",
                    path, exc.__class__.__name__, exc)
            else:
                self.extractor.log.debug("Using download archive '%s'", path)

        skip = cfg("skip", True)
        if skip:
            self._skipexc = None
            if skip == "enumerate":
                pathfmt.check_file = pathfmt._enum_file
            elif isinstance(skip, str):
                # "<action>:<count>"
                skip, _, smax = skip.partition(":")
                if skip == "abort":
                    self._skipexc = exception.StopExtraction
                elif skip == "terminate":
                    self._skipexc = exception.TerminateExtraction
                elif skip == "exit":
                    self._skipexc = sys.exit
                self._skipmax = text.parse_int(smax)
        else:
            # monkey-patch methods to always return False
            pathfmt.exists = lambda x=None: False
            if self.archive:
                self.archive.check = pathfmt.exists

        postprocessors = self.extractor.config_accumulate("postprocessors")
        if postprocessors:
            self.hooks = collections.defaultdict(list)
            pp_log = self.get_logger("postprocessor")
            pp_list = []
            category = self.extractor.category
            basecategory = self.extractor.basecategory

            pp_conf = config.get((), "postprocessor") or {}
            for pp_dict in postprocessors:
                # a string refers to a named entry in "postprocessor",
                # or is itself the module name
                if isinstance(pp_dict, str):
                    pp_dict = pp_conf.get(pp_dict) or {"name": pp_dict}

                whitelist = pp_dict.get("whitelist")
                if whitelist and category not in whitelist and \
                        basecategory not in whitelist:
                    continue

                blacklist = pp_dict.get("blacklist")
                if blacklist and (
                        category in blacklist or basecategory in blacklist):
                    continue

                name = pp_dict.get("name")
                pp_cls = postprocessor.find(name)
                if not pp_cls:
                    pp_log.warning("module '%s' not found", name)
                    continue
                try:
                    pp_obj = pp_cls(self, pp_dict)
                except Exception as exc:
                    pp_log.error("'%s' initialization failed: %s: %s",
                                 name, exc.__class__.__name__, exc)
                else:
                    pp_list.append(pp_obj)

            if pp_list:
                self.extractor.log.debug(
                    "Active postprocessor modules: %s", pp_list)
                if "init" in self.hooks:
                    for callback in self.hooks["init"]:
                        callback(pathfmt)

    def register_hooks(self, hooks, options=None):
        """Add the callbacks in 'hooks' (event name -> callable)

        If 'options' contains a "filter" expression, each callback only
        runs when that expression evaluates truthy for the current
        'pathfmt.kwdict'.
        """
        expr = options.get("filter") if options else None

        if expr:
            condition = util.compile_expression(expr)
            for hook, callback in hooks.items():
                self.hooks[hook].append(functools.partial(
                    self._call_hook, callback, condition))
        else:
            for hook, callback in hooks.items():
                self.hooks[hook].append(callback)

    @staticmethod
    def _call_hook(callback, condition, pathfmt, *args):
        """Call 'callback' if 'condition' holds for 'pathfmt.kwdict'

        Extra positional arguments are passed through, since "finalize"
        hooks are invoked with an additional status argument.
        """
        if condition(pathfmt.kwdict):
            callback(pathfmt, *args)

    def _build_blacklist(self):
        """Return the set of extractor categories to ignore for child jobs"""
        wlist = self.extractor.config("whitelist")
        if wlist is not None:
            if isinstance(wlist, str):
                wlist = wlist.split(",")

            # build a set of all categories
            blist = set()
            add = blist.add
            update = blist.update
            get = operator.itemgetter(0)

            for extr in extractor._list_classes():
                category = extr.category
                if category:
                    add(category)
                else:
                    update(map(get, extr.instances))

            # remove whitelisted categories
            blist.difference_update(wlist)
            return blist

        blist = self.extractor.config("blacklist")
        if blist is not None:
            if isinstance(blist, str):
                blist = blist.split(",")
            blist = set(blist)
        else:
            # by default only the job's own category is blacklisted
            blist = {self.extractor.category}

        blist |= util.SPECIAL_EXTRACTORS
        return blist
|
|
|
|
|
2015-11-13 01:02:49 +01:00
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
class SimulationJob(DownloadJob):
    """Simulate the extraction process without downloading anything"""

    def handle_url(self, url, kwdict):
        """Pretend to process 'url': report it as skipped, never fetch it"""
        # fall back to "jpg" when the extension is empty
        if not kwdict["extension"]:
            kwdict["extension"] = "jpg"

        pathfmt = self.pathfmt
        pathfmt.set_filename(kwdict)
        self.out.skip(pathfmt.path)

        delay = self.sleep
        if delay:
            time.sleep(delay)

        archive = self.archive
        if archive:
            archive.add(kwdict)

    def handle_directory(self, kwdict):
        """Run the delayed initialization once; 'kwdict' is not used"""
        if self.pathfmt:
            return
        self.initialize()
|
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class KeywordJob(Job):
    """Print available keywords"""

    def handle_url(self, url, kwdict):
        """Print the keywords of the first file, then stop the extraction"""
        print("\nKeywords for filenames and --filter:")
        print("------------------------------------")
        self.print_kwdict(kwdict)
        raise exception.StopExtraction()

    def handle_directory(self, kwdict):
        """Print the keywords usable in directory names"""
        print("Keywords for directory names:")
        print("-----------------------------")
        self.print_kwdict(kwdict)

    def handle_queue(self, url, kwdict):
        """Print keywords of a queued URL or defer to a child KeywordJob

        Always ends by raising exception.StopExtraction.
        """
        if "_extractor" in kwdict:
            extr = kwdict["_extractor"].from_url(url)
        else:
            extr = None

        if util.filter_dict(kwdict):
            print("Keywords for --chapter-filter:")
            print("------------------------------")
            self.print_kwdict(kwdict)
            if extr or self.extractor.categorytransfer:
                print()
                KeywordJob(extr or url, self).run()
        else:
            # nothing left after filtering: no metadata of its own
            self.extractor.log.info(
                "This extractor only spawns other extractors "
                "and does not provide any metadata on its own.")

            if extr:
                self.extractor.log.info(
                    "Showing results for '%s' instead:\n", url)
                KeywordJob(extr, self).run()
            else:
                self.extractor.log.info(
                    "Try 'gallery-dl -K \"%s\"' instead.", url)

        raise exception.StopExtraction()

    @staticmethod
    def print_kwdict(kwdict, prefix=""):
        """Print key-value pairs in 'kwdict' with formatting

        Keys starting with "_" are left out. Nested dicts are printed
        recursively with a 'key[subkey]' style name.
        """
        closing = "]" if prefix else ""

        for name, value in sorted(kwdict.items()):
            if name[0] == "_":
                continue
            label = prefix + name + closing

            if isinstance(value, dict):
                KeywordJob.print_kwdict(value, label + "[")
                continue

            if not isinstance(value, list):
                # string or number
                print(label, "\n ", value, sep="")
                continue

            # lists: show the first element's keys if it is a dict,
            # otherwise list every element
            if value and isinstance(value[0], dict):
                KeywordJob.print_kwdict(value[0], label + "[][")
                continue

            print(label, "[]", sep="")
            for item in value:
                print(" -", item)
|
2015-12-10 02:14:28 +01:00
|
|
|
|
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class UrlJob(Job):
    """Print download urls"""
    # queued URLs are only followed while 'depth' stays below this value
    maxdepth = 1

    def __init__(self, url, parent=None, depth=1):
        """Create the job; 'depth' is the nesting level of this job"""
        Job.__init__(self, url, parent)
        self.depth = depth

        # at maximum depth, queued URLs get printed instead of followed
        if self.maxdepth <= depth:
            self.handle_queue = self.handle_url

    @staticmethod
    def handle_url(url, _):
        """Print 'url'"""
        print(url)

    @staticmethod
    def handle_url_fallback(url, kwdict):
        """Print 'url' followed by its "_fallback" URLs, if there are any"""
        print(url)
        for fallback in kwdict.get("_fallback", ()):
            print("|", fallback)

    def handle_queue(self, url, kwdict):
        """Run a child job one level deeper for the queued 'url'"""
        extr_class = kwdict.get("_extractor")
        extr = extr_class.from_url(url) if extr_class else extractor.find(url)

        if not extr:
            self._write_unsupported(url)
            return

        child = self.__class__(extr, self, self.depth + 1)
        self.status |= child.run()
|
2017-05-23 11:48:00 +02:00
|
|
|
|
2015-12-12 01:16:02 +01:00
|
|
|
|
2021-03-02 23:59:34 +01:00
|
|
|
class InfoJob(Job):
    """Print extractor defaults and settings"""

    def run(self):
        """Print category names and format settings; always returns 0"""
        extr = self.extractor

        if not extr.basecategory:
            self._print_multi(
                "Category / Subcategory", extr.category, extr.subcategory)
        else:
            self._print_multi(
                "Category / Subcategory / Basecategory",
                extr.category, extr.subcategory, extr.basecategory)

        self._print_config("Filename format", "filename", extr.filename_fmt)
        self._print_config(
            "Directory format", "directory", extr.directory_fmt)
        self._print_config(
            "Archive format", "archive-format", extr.archive_fmt)
        self._print_config(
            "Request interval", "sleep-request", extr.request_interval)

        return 0

    def _print_multi(self, title, *values):
        """Print 'title' and the JSON-encoded 'values' joined by ' / '"""
        encoded = " / ".join(json.dumps(v) for v in values)
        print(title, "\n ", encoded, sep="")

    def _print_config(self, title, optname, value):
        """Print the custom and/or default setting for option 'optname'

        A custom value is shown when the option lookup returns something
        other than util.SENTINEL; without one, the default is only shown
        when it is truthy.
        """
        custom = self.extractor.config(optname, util.SENTINEL)
        has_custom = custom is not util.SENTINEL

        if has_custom:
            print(title, "(custom):\n ", json.dumps(custom))
        if has_custom or value:
            print(title, "(default):\n ", json.dumps(value))
|
|
|
|
|
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
class DataJob(Job):
    """Collect extractor results and dump them"""

    def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
        """Create the job

        file:         object the collected data is written to by run()
        ensure_ascii: passed to config.get() as the fallback for the
                      "ascii" option in the "output" section
        """
        Job.__init__(self, url, parent)
        self.file = file
        self.data = []
        self.ascii = config.get(("output",), "ascii", ensure_ascii)

        # metadata passes through util.filter_dict unless "private" is set
        private = config.get(("output",), "private")
        self.filter = util.identity if private else util.filter_dict

    def run(self):
        """Collect all messages, dump them as JSON and return 0"""
        sleep = self.extractor.config("sleep-extractor")
        if sleep:
            time.sleep(sleep)

        # collect data
        try:
            for msg in self.extractor:
                self.dispatch(msg)
        except exception.StopExtraction:
            pass
        except Exception as exc:
            # record the error as a (name, message) entry
            self.data.append((exc.__class__.__name__, str(exc)))
        except BaseException:
            # deliberately swallowed so that what was collected so far
            # still gets dumped below
            pass

        # convert numbers to string
        if config.get(("output",), "num-to-str", False):
            for msg in self.data:
                # error entries end in a plain string, not a metadata
                # dict, and must not be handed to transform_dict()
                if isinstance(msg[-1], dict):
                    util.transform_dict(msg[-1], util.number_to_string)

        # dump to 'file'
        try:
            util.dump_json(self.data, self.file, self.ascii, 2)
            self.file.flush()
        except Exception:
            # best effort: a failing output stream is ignored here
            pass

        return 0

    def handle_url(self, url, kwdict):
        """Store a (Message.Url, url, metadata) entry"""
        self.data.append((Message.Url, url, self.filter(kwdict)))

    def handle_directory(self, kwdict):
        """Store a (Message.Directory, metadata) entry"""
        self.data.append((Message.Directory, self.filter(kwdict)))

    def handle_queue(self, url, kwdict):
        """Store a (Message.Queue, url, metadata) entry"""
        self.data.append((Message.Queue, url, self.filter(kwdict)))
|