1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 10:42:34 +01:00

rework and extend input file processing (#4732)

- add 2 command-line options to modify input file contents
  - -I/--input-file-comment
  - -x/--input-file-delete
- implement InputManager class
- move code from util.py to __init__.py
  (mainly to avoid import cycles)
This commit is contained in:
Mike Fährmann 2023-11-14 20:38:11 +01:00
parent 17e710c4bf
commit 4700051562
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
5 changed files with 286 additions and 141 deletions

View File

@ -6,8 +6,6 @@
## General Options:
-h, --help Print this help message and exit
--version Print program version and exit
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-f, --filename FORMAT Filename format string for downloaded files
('/O' for "original" filenames)
-d, --destination PATH Target location for file downloads
@ -19,6 +17,16 @@
--clear-cache MODULE Delete cached login sessions, cookies, etc. for
MODULE (ALL to delete everything)
## Input Options:
-i, --input-file FILE Download URLs found in FILE ('-' for stdin).
More than one --input-file can be specified
-I, --input-file-comment FILE
Download URLs found in FILE. Comment them out
after they were downloaded successfully.
-x, --input-file-delete FILE
Download URLs found in FILE. Delete them after
they were downloaded successfully.
## Output Options:
-q, --quiet Activate quiet mode
-v, --verbose Print various debugging information

View File

@ -18,19 +18,6 @@ __email__ = "mike_faehrmann@web.de"
__version__ = version.__version__
def progress(urls, pformat):
    """Yield each URL from 'urls' while emitting a progress line to stderr.

    'pformat' is either True (use the default "[current/total] url"
    template) or a custom format string understood by str.format_map.
    """
    template = "[{current}/{total}] {url}\n" if pformat is True else pformat + "\n"
    total = len(urls)

    for index, url in enumerate(urls, 1):
        output.stderr_write(template.format_map(
            {"total": total, "current": index, "url": url}))
        yield url
def main():
try:
parser = option.build_parser()
@ -224,7 +211,7 @@ def main():
return config.initialize()
else:
if not args.urls and not args.inputfiles:
if not args.urls and not args.input_files:
parser.error(
"The following arguments are required: URL\n"
"Use 'gallery-dl --help' to get a list of all options.")
@ -238,22 +225,6 @@ def main():
else:
jobtype = args.jobtype or job.DownloadJob
urls = args.urls
if args.inputfiles:
for inputfile in args.inputfiles:
try:
if inputfile == "-":
if sys.stdin:
urls += util.parse_inputfile(sys.stdin, log)
else:
log.warning(
"input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
urls += util.parse_inputfile(file, log)
except OSError as exc:
log.warning("input file: %s", exc)
# unsupported file logging handler
handler = output.setup_logging_handler(
"unsupportedfile", fmt="{message}")
@ -263,25 +234,44 @@ def main():
ulog.propagate = False
job.Job.ulog = ulog
# collect input URLs
input_manager = InputManager()
input_manager.log = input_log = logging.getLogger("inputfile")
input_manager.add_list(args.urls)
if args.input_files:
for input_file, action in args.input_files:
try:
path = util.expand_path(input_file)
input_manager.add_file(path, action)
except Exception as exc:
input_log.error(exc)
return getattr(exc, "code", 128)
pformat = config.get(("output",), "progress", True)
if pformat and len(urls) > 1 and args.loglevel < logging.ERROR:
urls = progress(urls, pformat)
else:
urls = iter(urls)
if pformat and len(input_manager.urls) > 1 and \
args.loglevel < logging.ERROR:
input_manager.progress(pformat)
# process input URLs
retval = 0
url = next(urls, None)
while url is not None:
for url in input_manager:
try:
log.debug("Starting %s for '%s'", jobtype.__name__, url)
if isinstance(url, util.ExtendedUrl):
if isinstance(url, ExtendedUrl):
for opts in url.gconfig:
config.set(*opts)
with config.apply(url.lconfig):
retval |= jobtype(url.value).run()
status = jobtype(url.value).run()
else:
retval |= jobtype(url).run()
status = jobtype(url).run()
if status:
retval |= status
else:
input_manager.success()
except exception.TerminateExtraction:
pass
except exception.RestartExtraction:
@ -291,8 +281,7 @@ def main():
log.error("Unsupported URL '%s'", url)
retval |= 64
url = next(urls, None)
input_manager.next()
return retval
except KeyboardInterrupt:
@ -304,3 +293,206 @@ def main():
if exc.errno != errno.EPIPE:
raise
return 1
class InputManager():
    """Collect input URLs and iterate over them.

    URLs are queued from the command line (add_url / add_list) and from
    input files (add_file).  A file registered with an action
    ("c" = comment, "d" = delete) is rewritten after each successful
    download, applying that action to the lines belonging to the
    processed URL.

    Note: 'self.log' is expected to be assigned by the caller
    (main() sets it to the "inputfile" logger) before add_file()
    or success() can emit messages.
    """

    def __init__(self):
        # queued URLs: plain strings, ExtendedUrl instances, or
        # (url, path, action, indices) tuples for actionable file entries
        self.urls = []
        # path -> cached list of file lines, used when rewriting files
        # (fix: was initialized as an empty tuple and lazily swapped for
        # a dict in add_file; initialize as a dict directly)
        self.files = {}
        self._index = 0       # position of the current URL in self.urls
        self._current = None  # (url, path, action, indices) of current URL
        self._pformat = None  # bound str.format_map for progress output

    def add_url(self, url):
        """Queue a single URL"""
        self.urls.append(url)

    def add_list(self, urls):
        """Queue a list of URLs"""
        self.urls += urls

    def add_file(self, path, action=None):
        """Process an input file.

        Lines starting with '#' and empty lines will be ignored.

        Lines starting with '-' will be interpreted as a key-value pair
        separated by an '=', where
        'key' is a dot-separated option name and
        'value' is a JSON-parsable string.
        These configuration options will be applied
        while processing the next URL only.

        Lines starting with '-G' are the same as above, except these
        options will be applied for *all* following URLs,
        i.e. they are Global.

        Everything else will be used as a potential URL.

        Example input file:

            # setting global options
            -G base-directory = "/tmp/"
            -G skip = false

            # setting local options for the next URL
            -filename="spaces_are_optional.jpg"
            -skip = true
            https://example.org/

            # next URL uses default filename and 'skip' is false.
            https://example.com/index.htm # comment1
            https://example.com/404.htm   # comment2
        """
        if path == "-" and not action:
            try:
                lines = sys.stdin.readlines()
            except Exception:
                raise exception.InputFileError("stdin is not readable")
            path = None
        else:
            # NOTE(review): '-' combined with an action falls through to
            # open("-") and raises InputFileError -- presumably intended,
            # since stdin cannot be rewritten afterwards; confirm.
            try:
                with open(path, encoding="utf-8") as fp:
                    lines = fp.readlines()
            except Exception as exc:
                raise exception.InputFileError(str(exc))

        # cache the raw lines so success() can rewrite the file later
        self.files[path] = lines

        if action == "c":
            action = self._action_comment
        elif action == "d":
            action = self._action_delete
        else:
            action = None

        gconf = []            # global config options (-G)
        lconf = []            # local config options for the next URL
        indices = []          # line numbers belonging to the next URL
        strip_comment = None  # lazily compiled trailing-comment stripper
        append = self.urls.append

        for n, line in enumerate(lines):
            line = line.strip()

            if not line or line[0] == "#":
                # empty line or comment
                continue

            elif line[0] == "-":
                # config spec
                if len(line) >= 2 and line[1] == "G":
                    conf = gconf
                    line = line[2:]
                else:
                    conf = lconf
                    line = line[1:]
                if action:
                    # option lines belong to the following URL and are
                    # commented/deleted together with it
                    indices.append(n)

                key, sep, value = line.partition("=")
                if not sep:
                    raise exception.InputFileError(
                        "Invalid KEY=VALUE pair '%s' on line %s in %s",
                        line, n+1, path)

                try:
                    value = util.json_loads(value.strip())
                except ValueError as exc:
                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
                    raise exception.InputFileError(
                        "Unable to parse '%s' on line %s in %s",
                        value, n+1, path)

                key = key.strip().split(".")
                conf.append((key[:-1], key[-1], value))

            else:
                # url
                if " #" in line or "\t#" in line:
                    if strip_comment is None:
                        import re
                        strip_comment = re.compile(r"\s+#.*").sub
                    line = strip_comment("", line)

                if gconf or lconf:
                    url = ExtendedUrl(line, gconf, lconf)
                    gconf = []
                    lconf = []
                else:
                    url = line

                if action:
                    indices.append(n)
                    append((url, path, action, indices))
                    indices = []
                else:
                    append(url)

    def progress(self, pformat=True):
        """Enable a progress indicator for the following iteration"""
        if pformat is True:
            pformat = "[{current}/{total}] {url}\n"
        else:
            pformat += "\n"
        self._pformat = pformat.format_map

    def next(self):
        """Advance to the next URL.

        Called explicitly by the consumer after each URL, not by
        __next__() -- presumably so a restarted extraction re-yields
        the same URL; confirm against the RestartExtraction handler.
        """
        self._index += 1

    def success(self):
        """Apply the current URL's file action after a successful download.

        No-op for URLs without an attached (path, action, indices) entry.
        """
        if self._current:
            url, path, action, indices = self._current
            lines = self.files[path]
            action(lines, indices)
            try:
                with open(path, "w", encoding="utf-8") as fp:
                    fp.writelines(lines)
            except Exception as exc:
                self.log.warning(
                    "Unable to update '%s' (%s: %s)",
                    path, exc.__class__.__name__, exc)

    @staticmethod
    def _action_comment(lines, indices):
        # comment out the given line numbers
        for i in indices:
            lines[i] = "# " + lines[i]

    @staticmethod
    def _action_delete(lines, indices):
        # blank out the given line numbers
        for i in indices:
            lines[i] = ""

    def __iter__(self):
        self._index = 0
        return self

    def __next__(self):
        """Return the URL at the current index (without advancing it)"""
        try:
            item = self.urls[self._index]
        except IndexError:
            raise StopIteration

        if isinstance(item, tuple):
            # (url, path, action, indices) entry from an input file
            self._current = item
            item = item[0]
        else:
            self._current = None

        if self._pformat:
            output.stderr_write(self._pformat({
                "total"  : len(self.urls),
                "current": self._index + 1,
                "url"    : item,
            }))
        return item
class ExtendedUrl():
    """A URL value bundled with its global and local config options"""
    __slots__ = ("value", "gconfig", "lconfig")

    def __init__(self, url, gconf, lconf):
        self.value, self.gconfig, self.lconfig = url, gconf, lconf

    def __str__(self):
        return self.value

View File

@ -21,6 +21,7 @@ Exception
| +-- FilenameFormatError
| +-- DirectoryFormatError
+-- FilterError
+-- InputFileError
+-- NoExtractorError
+-- StopExtraction
+-- TerminateExtraction
@ -99,6 +100,15 @@ class FilterError(GalleryDLException):
code = 32
class InputFileError(GalleryDLException):
    """Error when parsing input file"""
    code = 32

    def __init__(self, message, *args):
        # supports lazy '%'-style formatting, logging-style:
        # InputFileError("bad pair '%s' on line %s", pair, lineno)
        if args:
            message = message % args
        GalleryDLException.__init__(self, message)
class NoExtractorError(GalleryDLException):
    """No extractor can handle the given URL"""
    # exit-status value for this error class; matches the 'retval |= 64'
    # used by main() for unsupported URLs
    code = 64

View File

@ -59,6 +59,12 @@ class OptionAction(argparse.Action):
namespace.options_pp[key] = value
class InputfileAction(argparse.Action):
    """Collect input file paths together with their post-download action.

    Each occurrence appends a (path, const) pair to 'input_files', where
    'const' identifies the action attached to the option
    (-i: None, -I: "c", -x: "d").
    """

    def __call__(self, parser, namespace, value, option_string=None):
        namespace.input_files.append((value, self.const))
class Formatter(argparse.HelpFormatter):
"""Custom HelpFormatter class to customize help output"""
def __init__(self, prog):
@ -100,12 +106,6 @@ def build_parser():
action="version", version=version.__version__,
help="Print program version and exit",
)
general.add_argument(
"-i", "--input-file",
dest="inputfiles", metavar="FILE", action="append",
help=("Download URLs found in FILE ('-' for stdin). "
"More than one --input-file can be specified"),
)
general.add_argument(
"-f", "--filename",
dest="filename", metavar="FORMAT",
@ -149,6 +149,32 @@ def build_parser():
"(ALL to delete everything)",
)
input = parser.add_argument_group("Input Options")
input.add_argument(
"urls",
metavar="URL", nargs="*",
help=argparse.SUPPRESS,
)
input.add_argument(
"-i", "--input-file",
dest="input_files", metavar="FILE", action=InputfileAction, const=None,
default=[],
help=("Download URLs found in FILE ('-' for stdin). "
"More than one --input-file can be specified"),
)
input.add_argument(
"-I", "--input-file-comment",
dest="input_files", metavar="FILE", action=InputfileAction, const="c",
help=("Download URLs found in FILE. "
"Comment them out after they were downloaded successfully."),
)
input.add_argument(
"-x", "--input-file-delete",
dest="input_files", metavar="FILE", action=InputfileAction, const="d",
help=("Download URLs found in FILE. "
"Delete them after they were downloaded successfully."),
)
output = parser.add_argument_group("Output Options")
output.add_argument(
"-q", "--quiet",
@ -534,10 +560,4 @@ def build_parser():
help="Additional '<key>=<value>' post processor options",
)
parser.add_argument(
"urls",
metavar="URL", nargs="*",
help=argparse.SUPPRESS,
)
return parser

View File

@ -487,82 +487,6 @@ CODES = {
}
def parse_inputfile(file, log):
    """Filter and process strings from an input file.

    Lines starting with '#' and empty lines will be ignored.

    Lines starting with '-' will be interpreted as a key-value pair
    separated by an '=', where 'key' is a dot-separated option name and
    'value' is a JSON-parsable value.  These configuration options will
    be applied while processing the next URL.

    Lines starting with '-G' are the same as above, except these options
    will be applied for *all* following URLs, i.e. they are Global.

    Everything else will be used as a potential URL.

    Example input file:

        # setting global options
        -G base-directory = "/tmp/"
        -G skip = false

        # setting local options for the next URL
        -filename="spaces_are_optional.jpg"
        -skip = true
        https://example.org/

        # next URL uses default filename and 'skip' is false.
        https://example.com/index.htm # comment1
        https://example.com/404.htm   # comment2
    """
    gconf = []            # global config options (-G lines)
    lconf = []            # local config options for the next URL
    strip_comment = None  # lazily compiled trailing-comment stripper

    for entry in file:
        entry = entry.strip()

        if not entry or entry[0] == "#":
            # blank line or comment
            continue

        if entry[0] == "-":
            # config spec; '-G' selects the global option list
            if len(entry) >= 2 and entry[1] == "G":
                conf, spec = gconf, entry[2:]
            else:
                conf, spec = lconf, entry[1:]

            key, sep, value = spec.partition("=")
            if not sep:
                log.warning("input file: invalid <key>=<value> pair: %s", spec)
                continue

            try:
                value = json_loads(value.strip())
            except ValueError as exc:
                log.warning("input file: unable to parse '%s': %s", value, exc)
                continue

            path = key.strip().split(".")
            conf.append((path[:-1], path[-1], value))
        else:
            # URL; drop a trailing whitespace-separated '#' comment
            if " #" in entry or "\t#" in entry:
                if strip_comment is None:
                    strip_comment = re.compile(r"\s+#.*").sub
                entry = strip_comment("", entry)

            if gconf or lconf:
                yield ExtendedUrl(entry, gconf, lconf)
                gconf = []
                lconf = []
            else:
                yield entry
class CustomNone():
"""None-style type that supports more operations than regular None"""
__slots__ = ()
@ -873,15 +797,6 @@ class FilterPredicate():
raise exception.FilterError(exc)
class ExtendedUrl():
    """A URL value bundled with its global and local config options"""

    def __init__(self, url, gconf, lconf):
        self.value = url
        self.gconfig = gconf
        self.lconfig = lconf

    def __str__(self):
        return self.value
class DownloadArchive():
def __init__(self, path, format_string, pragma=None,