2015-04-05 16:23:20 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2020-01-18 00:30:57 +01:00
|
|
|
# Copyright 2015-2020 Mike Fährmann
|
2015-04-05 16:23:20 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
import sys
|
2017-12-04 17:06:17 +01:00
|
|
|
import time
|
2019-12-18 22:08:53 +01:00
|
|
|
import errno
|
2018-02-01 20:49:41 +01:00
|
|
|
import logging
|
2018-05-20 22:03:57 +02:00
|
|
|
from . import extractor, downloader, postprocessor
|
2018-10-13 17:21:55 +02:00
|
|
|
from . import config, text, util, output, exception
|
2015-11-24 19:47:51 +01:00
|
|
|
from .extractor.message import Message
|
2015-04-05 16:23:20 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class Job():
    """Base class for Job-types"""
    # logger for unsupported URLs; stays None unless assigned elsewhere
    ulog = None

    def __init__(self, extr, parent=None):
        """Set up a job for 'extr'.

        'extr' is either an extractor object or a URL string, which gets
        resolved with extractor.find(). Raises exception.NoExtractorError
        if no extractor is available. If a 'parent' job is given, settings
        of its extractor are carried over to the new one.
        """
        if isinstance(extr, str):
            extr = extractor.find(extr)
        if not extr:
            raise exception.NoExtractorError()

        self.extractor = extr
        self.pathfmt = None

        # extra values handed to every logger wrapped by this job
        self._logger_extra = {
            "job": self,
            "extractor": extr,
            "path": output.PathfmtProxy(self),
            "keywords": output.KwdictProxy(self),
        }
        extr.log = self._wrap_logger(extr.log)
        extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)

        self.status = 0
        self.pred_url = self._prepare_predicates("image", True)
        self.pred_queue = self._prepare_predicates("chapter", False)

        if parent:
            pextr = parent.extractor

            # transfer (sub)category
            if pextr.config("category-transfer", pextr.categorytransfer):
                extr.category = pextr.category
                extr.subcategory = pextr.subcategory

            # transfer parent directory
            extr._parentdir = pextr._parentdir

            # reuse connection adapters
            extr.session.adapters = pextr.session.adapters

        # user-supplied metadata
        self.userkwds = self.extractor.config("keywords")

    def run(self):
        """Execute or run the job

        Iterates over all extractor messages and dispatches them.
        Errors are logged and folded into 'self.status', which is
        returned; exceptions not derived from Exception are re-raised
        after updating the status. handle_finalize() always runs.
        """
        try:
            log = self.extractor.log
            for msg in self.extractor:
                self.dispatch(msg)
        except exception.StopExtraction as exc:
            if exc.message:
                log.error(exc.message)
            self.status |= exc.code
        except exception.GalleryDLException as exc:
            log.error("%s: %s", exc.__class__.__name__, exc)
            self.status |= exc.code
        except OSError as exc:
            log.error("Unable to download data: %s: %s",
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 128
        except Exception as exc:
            log.error(("An unexpected error occurred: %s - %s. "
                       "Please run gallery-dl again with the --verbose flag, "
                       "copy its output and report this issue on "
                       "https://github.com/mikf/gallery-dl/issues ."),
                      exc.__class__.__name__, exc)
            log.debug("", exc_info=True)
            self.status |= 1
        except BaseException:
            # e.g. KeyboardInterrupt / SystemExit: record failure, propagate
            self.status |= 1
            raise
        finally:
            self.handle_finalize()
        return self.status

    def dispatch(self, msg):
        """Call the appropriate message handler"""
        if msg[0] == Message.Url:
            _, url, kwds = msg
            if self.pred_url(url, kwds):
                self.update_kwdict(kwds)
                self.handle_url(url, kwds)

        elif msg[0] == Message.Directory:
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif msg[0] == Message.Queue:
            _, url, kwds = msg
            if self.pred_queue(url, kwds):
                self.handle_queue(url, kwds)

        elif msg[0] == Message.Urllist:
            _, urls, kwds = msg
            # only the first URL is checked against the predicates
            if self.pred_url(urls[0], kwds):
                self.update_kwdict(kwds)
                self.handle_urllist(urls, kwds)

        elif msg[0] == Message.Metadata:
            self.update_kwdict(msg[1])
            self.handle_metadata(msg[1])

        elif msg[0] == Message.Version:
            if msg[1] != 1:
                # a plain string cannot be raised in Python 3;
                # wrap the message in a real exception
                raise RuntimeError(
                    "unsupported message-version ({}, {})".format(
                        self.extractor.category, msg[1]))
            # TODO: support for multiple message versions

    def handle_url(self, url, kwdict):
        """Handle Message.Url"""

    def handle_urllist(self, urls, kwdict):
        """Handle Message.Urllist"""
        self.handle_url(urls[0], kwdict)

    def handle_directory(self, kwdict):
        """Handle Message.Directory"""

    def handle_metadata(self, kwdict):
        """Handle Message.Metadata"""

    def handle_queue(self, url, kwdict):
        """Handle Message.Queue"""

    def handle_finalize(self):
        """Handle job finalization"""

    def update_kwdict(self, kwdict):
        """Update 'kwdict' with additional metadata"""
        extr = self.extractor
        kwdict["category"] = extr.category
        kwdict["subcategory"] = extr.subcategory
        if self.userkwds:
            kwdict.update(self.userkwds)

    def _prepare_predicates(self, target, skip=True):
        """Build the predicate for 'target' from its config options.

        Reads the '<target>-unique', '<target>-filter' and
        '<target>-range' options. Invalid filter or range values are
        logged as warnings and ignored.
        """
        predicates = []

        if self.extractor.config(target + "-unique"):
            predicates.append(util.UniquePredicate())

        pfilter = self.extractor.config(target + "-filter")
        if pfilter:
            try:
                pred = util.FilterPredicate(pfilter, target)
            except (SyntaxError, ValueError, TypeError) as exc:
                self.extractor.log.warning(exc)
            else:
                predicates.append(pred)

        prange = self.extractor.config(target + "-range")
        if prange:
            try:
                pred = util.RangePredicate(prange)
            except ValueError as exc:
                self.extractor.log.warning(
                    "invalid %s range: %s", target, exc)
            else:
                # let the extractor skip ahead, but only when no filter
                # expression is active
                if skip and pred.lower > 1 and not pfilter:
                    pred.index += self.extractor.skip(pred.lower - 1)
                predicates.append(pred)

        return util.build_predicate(predicates)

    def get_logger(self, name):
        """Return the logger called 'name', wrapped for this job"""
        return self._wrap_logger(logging.getLogger(name))

    def _wrap_logger(self, logger):
        """Wrap 'logger' in an adapter carrying this job's extra values"""
        return output.LoggerAdapter(logger, self._logger_extra)

    def _write_unsupported(self, url):
        """Write 'url' to the unsupported-URL logger, if one is set"""
        if self.ulog:
            self.ulog.info(url)
|
2017-05-27 16:16:57 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class DownloadJob(Job):
    """Download images into appropriate directory/filename locations"""

    def __init__(self, url, parent=None):
        """Create a download job; share 'visited' URLs with 'parent'"""
        Job.__init__(self, url, parent)
        self.log = self.get_logger("download")
        self.archive = None
        self.sleep = None
        self.downloaders = {}
        self.postprocessors = None
        self.out = output.select()

        # Defaults for the skip bookkeeping. initialize() only assigns
        # these when skipping is enabled, yet handle_skip() can still be
        # reached afterwards and must not fail on a missing attribute.
        self._skipexc = None
        self._skipcnt = 0
        self._skipmax = 0

        if parent:
            self.visited = parent.visited
            pfmt = parent.pathfmt
            if pfmt and parent.extractor.config("parent-directory"):
                self.extractor._parentdir = pfmt.directory
        else:
            self.visited = set()

    def handle_url(self, url, kwdict, fallback=None):
        """Download the resource specified in 'url'

        'fallback' is an optional iterable of alternative URLs, tried
        in order when downloading 'url' fails.
        """
        postprocessors = self.postprocessors
        pathfmt = self.pathfmt
        archive = self.archive

        # prepare download
        pathfmt.set_filename(kwdict)

        if postprocessors:
            for pp in postprocessors:
                pp.prepare(pathfmt)

        if pathfmt.exists(archive):
            self.handle_skip()
            return

        if self.sleep:
            time.sleep(self.sleep)

        # download from URL
        if not self.download(url):

            # use fallback URLs if available
            for num, url in enumerate(fallback or (), 1):
                util.remove_file(pathfmt.temppath)
                self.log.info("Trying fallback URL #%d", num)
                if self.download(url):
                    break
            else:
                # download failed
                self.status |= 4
                self.log.error("Failed to download %s",
                               pathfmt.filename or url)
                return

        if not pathfmt.temppath:
            self.handle_skip()
            return

        # run post processors
        if postprocessors:
            for pp in postprocessors:
                pp.run(pathfmt)

        # download succeeded
        pathfmt.finalize()
        self.out.success(pathfmt.path, 0)
        if archive:
            archive.add(kwdict)
        if postprocessors:
            for pp in postprocessors:
                pp.run_after(pathfmt)
        # a successful download resets the consecutive-skip counter
        self._skipcnt = 0

    def handle_urllist(self, urls, kwdict):
        """Download the first URL in 'urls'; use the rest as fallbacks"""
        fallback = iter(urls)
        url = next(fallback)
        self.handle_url(url, kwdict, fallback)

    def handle_directory(self, kwdict):
        """Set and create the target directory for downloads"""
        if not self.pathfmt:
            self.initialize(kwdict)
        else:
            self.pathfmt.set_directory(kwdict)

    def handle_metadata(self, kwdict):
        """Run postprocessors with metadata from 'kwdict'"""
        postprocessors = self.postprocessors

        if postprocessors:
            kwdict["extension"] = "metadata"
            pathfmt = self.pathfmt
            pathfmt.set_filename(kwdict)
            for pp in postprocessors:
                pp.run_metadata(pathfmt)

    def handle_queue(self, url, kwdict):
        """Run a child job of the same class for 'url' (once per URL)"""
        if url in self.visited:
            return
        self.visited.add(url)

        if "_extractor" in kwdict:
            extr = kwdict["_extractor"].from_url(url)
        else:
            extr = extractor.find(url)
        if extr:
            self.status |= self.__class__(extr, self).run()
        else:
            self._write_unsupported(url)

    def handle_finalize(self):
        """Close the archive, store cookies, run final postprocessors"""
        pathfmt = self.pathfmt
        if self.archive:
            self.archive.close()
        if pathfmt:
            self.extractor._store_cookies()
            if self.postprocessors:
                status = self.status
                for pp in self.postprocessors:
                    pp.run_final(pathfmt, status)

    def handle_skip(self):
        """Report a skipped file; raise the configured skip exception
        once the skip counter reaches its limit"""
        self.out.skip(self.pathfmt.path)
        if self._skipexc:
            self._skipcnt += 1
            if self._skipcnt >= self._skipmax:
                raise self._skipexc()

    def download(self, url):
        """Download 'url'"""
        scheme = url.partition(":")[0]
        downloader = self.get_downloader(scheme)
        if downloader:
            try:
                return downloader.download(url, self.pathfmt)
            except OSError as exc:
                # running out of disk space is not recoverable here
                if exc.errno == errno.ENOSPC:
                    raise
                self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return False
        self._write_unsupported(url)
        return False

    def get_downloader(self, scheme):
        """Return a downloader suitable for 'scheme'"""
        try:
            return self.downloaders[scheme]
        except KeyError:
            pass

        cls = downloader.find(scheme)
        if cls and config.get(("downloader", cls.scheme), "enabled", True):
            instance = cls(self)
        else:
            instance = None
            self.log.error("'%s:' URLs are not supported/enabled", scheme)

        # cache the result (including None) for later lookups
        if cls and cls.scheme == "http":
            self.downloaders["http"] = self.downloaders["https"] = instance
        else:
            self.downloaders[scheme] = instance
        return instance

    def initialize(self, kwdict=None):
        """Delayed initialization of PathFormat, etc."""
        # named 'cfg' so the module-level 'config' is not shadowed
        cfg = self.extractor.config
        pathfmt = self.pathfmt = util.PathFormat(self.extractor)
        if kwdict:
            pathfmt.set_directory(kwdict)

        self.sleep = cfg("sleep")
        if not cfg("download", True):
            self.download = pathfmt.fix_extension

        skip = cfg("skip", True)
        if skip:
            self._skipexc = None
            if skip == "enumerate":
                pathfmt.check_file = pathfmt._enum_file
            elif isinstance(skip, str):
                # "<action>:<count>", e.g. "abort:5"
                skip, _, smax = skip.partition(":")
                if skip == "abort":
                    self._skipexc = exception.StopExtraction
                elif skip == "exit":
                    self._skipexc = sys.exit
                self._skipcnt = 0
                self._skipmax = text.parse_int(smax)
        else:
            # skipping disabled: treat every file as not yet existing
            pathfmt.exists = lambda x=None: False

        archive = cfg("archive")
        if archive:
            path = util.expand_path(archive)
            try:
                self.archive = util.DownloadArchive(path, self.extractor)
            except Exception as exc:
                self.extractor.log.warning(
                    "Failed to open download archive at '%s' ('%s: %s')",
                    path, exc.__class__.__name__, exc)
            else:
                self.extractor.log.debug("Using download archive '%s'", path)

        postprocessors = cfg("postprocessors")
        if postprocessors:
            pp_log = self.get_logger("postprocessor")
            pp_list = []
            category = self.extractor.category

            for pp_dict in postprocessors:
                whitelist = pp_dict.get("whitelist")
                blacklist = pp_dict.get("blacklist")
                if (whitelist and category not in whitelist or
                        blacklist and category in blacklist):
                    continue
                name = pp_dict.get("name")
                pp_cls = postprocessor.find(name)
                if not pp_cls:
                    pp_log.warning("module '%s' not found", name)
                    continue
                try:
                    pp_obj = pp_cls(self, pp_dict)
                except Exception as exc:
                    pp_log.error("'%s' initialization failed: %s: %s",
                                 name, exc.__class__.__name__, exc)
                else:
                    pp_list.append(pp_obj)

            if pp_list:
                self.postprocessors = pp_list
                self.extractor.log.debug(
                    "Active postprocessor modules: %s", pp_list)
|
2015-04-08 01:51:48 +02:00
|
|
|
|
2015-11-13 01:02:49 +01:00
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
class SimulationJob(DownloadJob):
    """Simulate the extraction process without downloading anything"""

    def handle_url(self, url, kwdict, fallback=None):
        """Report the target path as skipped and record it in the archive"""
        self.pathfmt.set_filename(kwdict)
        self.out.skip(self.pathfmt.path)
        if self.sleep:
            time.sleep(self.sleep)
        if self.archive:
            self.archive.add(kwdict)

    def handle_directory(self, kwdict):
        """Run the delayed initialization once; 'kwdict' is not used"""
        if not self.pathfmt:
            self.initialize()
|
|
|
|
|
2018-05-25 16:07:18 +02:00
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class KeywordJob(Job):
    """Print available keywords"""

    def handle_url(self, url, kwdict):
        """Print the keywords of the first file, then stop extraction"""
        print("\nKeywords for filenames and --filter:")
        print("------------------------------------")
        self.print_kwdict(kwdict)
        raise exception.StopExtraction()

    def handle_directory(self, kwdict):
        """Print the keywords available for directory names"""
        print("Keywords for directory names:")
        print("-----------------------------")
        self.print_kwdict(kwdict)

    def handle_queue(self, url, kwdict):
        """Print chapter keywords or descend into the queued extractor,
        then stop extraction"""
        extr = None
        if "_extractor" in kwdict:
            extr = kwdict["_extractor"].from_url(url)

        if not util.filter_dict(kwdict):
            self.extractor.log.info(
                "This extractor only spawns other extractors "
                "and does not provide any metadata on its own.")

            if extr:
                self.extractor.log.info(
                    "Showing results for '%s' instead:\n", url)
                KeywordJob(extr, self).run()
            else:
                self.extractor.log.info(
                    "Try 'gallery-dl -K \"%s\"' instead.", url)
        else:
            print("Keywords for --chapter-filter:")
            print("------------------------------")
            self.print_kwdict(kwdict)
            if extr or self.extractor.categorytransfer:
                print()
                KeywordJob(extr or url, self).run()
        raise exception.StopExtraction()

    @staticmethod
    def print_kwdict(kwdict, prefix=""):
        """Print key-value pairs in 'kwdict' with formatting"""
        suffix = "]" if prefix else ""
        for key, value in sorted(kwdict.items()):
            # keys starting with an underscore are not printed;
            # startswith() also copes with an empty-string key
            if key.startswith("_"):
                continue
            key = prefix + key + suffix

            if isinstance(value, dict):
                KeywordJob.print_kwdict(value, key + "[")

            elif isinstance(value, list):
                if value and isinstance(value[0], dict):
                    # only the first element's structure is shown
                    KeywordJob.print_kwdict(value[0], key + "[][")
                else:
                    print(key, "[]", sep="")
                    for val in value:
                        print(" -", val)

            else:
                # string or number
                print(key, "\n ", value, sep="")
|
2015-12-10 02:14:28 +01:00
|
|
|
|
|
|
|
|
2015-12-12 00:11:05 +01:00
|
|
|
class UrlJob(Job):
    """Print download urls"""
    # queue messages are resolved by child jobs until this depth is reached
    maxdepth = 1

    def __init__(self, url, parent=None, depth=1):
        """Create a URL-printing job at recursion level 'depth'"""
        Job.__init__(self, url, parent)
        self.depth = depth
        if depth >= self.maxdepth:
            # at maximum depth, print queued URLs instead of following them
            self.handle_queue = self.handle_url

    @staticmethod
    def handle_url(url, _):
        """Print 'url'"""
        print(url)

    @staticmethod
    def handle_urllist(urls, _):
        """Print all 'urls'; every URL after the first is prefixed"""
        prefix = ""
        for url in urls:
            print(prefix, url, sep="")
            prefix = "| "

    def handle_queue(self, url, _):
        """Run a child UrlJob for 'url'; log it as unsupported on failure"""
        try:
            UrlJob(url, self, self.depth + 1).run()
        except exception.NoExtractorError:
            self._write_unsupported(url)
|
2017-05-23 11:48:00 +02:00
|
|
|
|
2015-12-12 01:16:02 +01:00
|
|
|
|
2017-04-12 18:43:41 +02:00
|
|
|
class DataJob(Job):
    """Collect extractor results and dump them"""

    def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
        """Create a data-collecting job that dumps its results to 'file'"""
        Job.__init__(self, url, parent)
        self.file = file
        self.data = []
        self.ascii = config.get(("output",), "ascii", ensure_ascii)

        # with the 'private' output option set, kwdicts are stored as-is
        private = config.get(("output",), "private")
        self.filter = (lambda x: x) if private else util.filter_dict

    def run(self):
        """Collect all extractor messages and dump them as JSON

        Errors during extraction are appended to the collected data as
        (exception-name, message) tuples. Always returns 0.
        """
        # collect data
        try:
            for msg in self.extractor:
                self.dispatch(msg)
        except exception.StopExtraction:
            pass
        except Exception as exc:
            self.data.append((exc.__class__.__name__, str(exc)))
        except BaseException:
            # deliberately swallowed so that the data collected
            # so far still gets dumped below
            pass

        # convert numbers to string
        if config.get(("output",), "num-to-str", False):
            for msg in self.data:
                # error entries end in a plain string, not a kwdict
                if isinstance(msg[-1], dict):
                    util.transform_dict(msg[-1], util.number_to_string)

        # dump to 'file'
        try:
            util.dump_json(self.data, self.file, self.ascii, 2)
            self.file.flush()
        except Exception:
            # best effort: failures while writing the output are ignored
            pass

        return 0

    def handle_url(self, url, kwdict):
        """Record a Message.Url entry"""
        self.data.append((Message.Url, url, self.filter(kwdict)))

    def handle_urllist(self, urls, kwdict):
        """Record a Message.Urllist entry"""
        self.data.append((Message.Urllist, list(urls), self.filter(kwdict)))

    def handle_directory(self, kwdict):
        """Record a Message.Directory entry"""
        self.data.append((Message.Directory, self.filter(kwdict)))

    def handle_metadata(self, kwdict):
        """Record a Message.Metadata entry"""
        self.data.append((Message.Metadata, self.filter(kwdict)))

    def handle_queue(self, url, kwdict):
        """Record a Message.Queue entry"""
        self.data.append((Message.Queue, url, self.filter(kwdict)))
|