Merge branch 'config' into loader

2024-11-22 02:32:33 +01:00 · 2015-10-05 17:46:04 +02:00 · 2015-10-05 17:46:04 +02:00 · e23aaa4298
commit e23aaa4298
parent 26bb9d62de 5ae3dd84ba
35 changed files with 532 additions and 351 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,57 @@
+# Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
 build/
-dist/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
--- a/18
+++ b/18
@ -1,18 +0,0 @@
-[pixiv]
-username = XXXXX
-password = XXXXX
-
-[exhentai-cookies]
-ipb_member_id = XXXXX
-ipb_pass_hash = XXXXX
-
-[nijie-cookies]
-NIJIEIJIEID = XXXXX
-nijie_email_hash = XXXXX
-nijie_login_hash = XXXXX
-
-[danbooru]
-regex0 = d(?:anbooru)?[.:-_](\w.+)
-
-[gelbooru]
-regex0 = g(?:elbooru)?[.:-_](\w.+)
--- a/config.json
+++ b/config.json
@ -0,0 +1,39 @@
+{
+    "base-directory": "/tmp/",
+    "extractor":
+    {
+        "pixiv":
+        {
+            "directory": ["{category}", "{artist-id}"],
+            "username": "XXX",
+            "password": "XXX"
+        },
+        "nijie":
+        {
+            "cookies":
+            {
+                "NIJIEIJIEID": "XXX",
+                "nijie_email_hash": "XXX",
+                "nijie_login_hash": "XXX"
+            }
+        },
+        "4chan":
+        {
+            "directory": ["{category}", "{board}", "{thread} - {title}"]
+        },
+        "danbooru":
+        {
+            "pattern": ["d(?:anbooru)?[.:_-](\\w.+)"],
+            "filename": "{category}_{id:>07}_{md5}.{extension}"
+        },
+        "gelbooru":
+        {
+            "pattern": ["g(?:elbooru)?[.:_-](\\w.+)"],
+            "filename": "{category}_{id:>07}_{md5}.{extension}"
+        },
+        "e621":
+        {
+            "pattern": ["e(?:621)?[.:_-](\\w.+)"]
+        }
+    }
+}
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@ -1,23 +0,0 @@
-Metadata-Version: 1.1
-Name: gallery-dl
-Version: 0.2
-Summary: gallery- and image downloader
-Home-page: https://github.com/mikf/gallery-dl
-Author: Mike Fährmann
-Author-email: mike_faehrmann@web.de
-License: GPLv2
-Description: download image galleries from several image hosting platforms
-Platform: UNKNOWN
-Classifier: Development Status :: 3 - Alpha
-Classifier: Environment :: Console
-Classifier: Intended Audience :: End Users/Desktop
-Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
-Classifier: Operating System :: POSIX
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.2
-Classifier: Programming Language :: Python :: 3.3
-Classifier: Programming Language :: Python :: 3.4
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
-Classifier: Topic :: Multimedia
-Classifier: Topic :: Multimedia :: Graphics
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@ -1,35 +0,0 @@
-setup.py
-bin/gallery-dl
-gallery_dl/__init__.py
-gallery_dl/download.py
-gallery_dl.egg-info/PKG-INFO
-gallery_dl.egg-info/SOURCES.txt
-gallery_dl.egg-info/dependency_links.txt
-gallery_dl.egg-info/entry_points.txt
-gallery_dl.egg-info/requires.txt
-gallery_dl.egg-info/top_level.txt
-gallery_dl/downloader/__init__.py
-gallery_dl/downloader/common.py
-gallery_dl/downloader/http.py
-gallery_dl/downloader/https.py
-gallery_dl/downloader/text.py
-gallery_dl/extractor/3dbooru.py
-gallery_dl/extractor/4chan.py
-gallery_dl/extractor/8chan.py
-gallery_dl/extractor/__init__.py
-gallery_dl/extractor/batoto.py
-gallery_dl/extractor/booru.py
-gallery_dl/extractor/common.py
-gallery_dl/extractor/danbooru.py
-gallery_dl/extractor/e621.py
-gallery_dl/extractor/exhentai.py
-gallery_dl/extractor/gelbooru.py
-gallery_dl/extractor/imagebam.py
-gallery_dl/extractor/imgbox.py
-gallery_dl/extractor/imgchili.py
-gallery_dl/extractor/mangareader.py
-gallery_dl/extractor/nijie.py
-gallery_dl/extractor/pixiv.py
-gallery_dl/extractor/redhawkscans.py
-gallery_dl/extractor/sankaku.py
-gallery_dl/extractor/yandere.py
--- a/gallery_dl.egg-info/dependency_links.txt
+++ b/gallery_dl.egg-info/dependency_links.txt
@ -1 +0,0 @@
-
--- a/gallery_dl.egg-info/entry_points.txt
+++ b/gallery_dl.egg-info/entry_points.txt
@ -1,3 +0,0 @@
-[console_scripts]
-gallery-dl = gallery_dl:main
-
--- a/gallery_dl.egg-info/requires.txt
+++ b/gallery_dl.egg-info/requires.txt
@ -1 +0,0 @@
-requests >= 2.0
--- a/gallery_dl.egg-info/top_level.txt
+++ b/gallery_dl.egg-info/top_level.txt
@ -1 +0,0 @@
-gallery_dl
--- a/gallery_dl/init.py
+++ b/gallery_dl/init.py
@ -17,9 +17,7 @@ __email__      = "mike_faehrmann@web.de"
 import os
 import sys
 import argparse
-import configparser
-
-from .download import DownloadManager
+from . import config, download

 def parse_cmdline_options():
    parser = argparse.ArgumentParser(
@ -41,18 +39,10 @@ def parse_cmdline_options():
    )
    return parser.parse_args()

-def parse_config_file(path):
-    config = configparser.ConfigParser(
-        interpolation=None,
-    )
-    config.optionxform = lambda opt: opt
-    config.read(os.path.expanduser(path))
-    return config
-
 def main():
+    config.load()
    opts = parse_cmdline_options()
-    conf = parse_config_file(opts.config)
-    dlmgr = DownloadManager(opts, conf)
+    dlmgr = download.DownloadManager(opts)

    try:
        for url in opts.urls:
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Global configuration module"""
+
+import sys
+import json
+import os.path
+import platform
+
+# --------------------------------------------------------------------
+# public interface
+
+def load(*files):
+    """Load JSON configuration files
+
+    Reads every path in 'files' (or the platform's default locations if
+    none are given) in order and merges its top-level keys into the
+    global configuration. Missing files are skipped silently; files
+    that cannot be decoded or parsed are reported on stderr and skipped.
+    """
+    configfiles = files or _default_configs
+    for conf in configfiles:
+        path = os.path.expanduser(conf)
+        try:
+            # read as UTF-8 instead of depending on the locale's encoding
+            with open(path, encoding="utf-8") as file:
+                confdict = json.load(file)
+            _config.update(confdict)
+        except FileNotFoundError:
+            continue
+        except ValueError as exception:
+            # json raises ValueError on malformed input; on Python 3.5+
+            # json.JSONDecodeError is a subclass of it, so this also
+            # works on interpreters that lack the JSONDecodeError name
+            print("Error while loading '", path, "':", sep="", file=sys.stderr)
+            print(exception, file=sys.stderr)
+
+def clear():
+    """Reset the configuration to an empty state"""
+    global _config
+    _config = {}
+
+def get(keys, default=None):
+    """Return the value found at the nested path 'keys'
+
+    'keys' is a sequence of keys that is followed from the top of the
+    configuration downwards. 'default' is returned if a key is missing
+    or if an intermediate value cannot be indexed with the next key
+    (for example a string where a nested object was expected).
+    """
+    conf = _config
+    try:
+        for k in keys:
+            conf = conf[k]
+        return conf
+    except (KeyError, TypeError, AttributeError):
+        # TypeError: a str, number or list was indexed with a str key
+        return default
+
+def interpolate(keys, default=None):
+    """Return the value at the nested path 'keys', with inherited fallback
+
+    While descending, every level is checked for an entry named
+    keys[-1]; the one found at the deepest level reached replaces
+    'default'. That fallback is returned if the full path cannot be
+    followed.
+    """
+    node = _config
+    try:
+        for key in keys:
+            default = node.get(keys[-1], default)
+            node = node[key]
+    except (KeyError, AttributeError):
+        return default
+    return node
+
+def set(keys, value):
+    """Store 'value' at the nested path 'keys' for this session
+
+    Levels named by keys[:-1] that do not exist yet are created as
+    empty dictionaries; the last key receives 'value'.
+    """
+    node = _config
+    for key in keys[:-1]:
+        try:
+            node = node[key]
+        except KeyError:
+            # create the missing level and continue inside it
+            node[key] = child = {}
+            node = child
+    node[keys[-1]] = value
+
+
+# --------------------------------------------------------------------
+# internals
+
+# global configuration dictionary; filled by load() and set()
+_config = {}
+
+# files read by load() when it is called without arguments, in this
+# order; load() merges with dict.update(), so a top-level key from a
+# later file replaces the same key from an earlier one
+if platform.system() == "Windows":
+    _default_configs = [
+        r"~\.config\gallery-dl.conf",
+        r"~\.gallery-dl.conf",
+    ]
+else:
+    _default_configs = [
+        "/etc/gallery-dl.conf",
+        "~/.config/gallery/config.json",
+        "~/.config/gallery-dl.conf",
+        "~/.gallery-dl.conf",
+    ]
--- a/gallery_dl/download.py
+++ b/gallery_dl/download.py
@ -11,12 +11,12 @@ import sys
 import importlib
 from . import extractor
 from .extractor.common import Message
+from . import config

 class DownloadManager():

-    def __init__(self, opts, config):
+    def __init__(self, opts):
        self.opts = opts
-        self.config = config
        self.modules = {}

    def add(self, url):
@ -36,7 +36,7 @@ class DownloadManager():
        if self.opts.dest:
            return self.opts.dest
        else:
-            return self.config.get("general", "destination", fallback="/tmp/")
+            return config.get(("base-directory",), default="/tmp/")


 class DownloadJob():
@ -48,16 +48,14 @@ class DownloadJob():
            return
        self.directory = mngr.get_base_directory()
        self.downloaders = {}
-        self.filename_fmt = mngr.config.get(
-            self.info["category"], "filename",
-            fallback=self.info["filename"]
+        self.filename_fmt = config.get(
+            ("extractor", self.info["category"], "filename"),
+            default=self.info["filename"]
+        )
+        segments = config.get(
+            ("extractor", self.info["category"], "directory"),
+            default=self.info["directory"]
        )
-        try:
-            segments = mngr.config.get(
-                self.info["category"], "directory"
-            ).split("/")
-        except Exception:
-            segments = self.info["directory"]
        self.directory_fmt = os.path.join(*segments)

    def run(self):
@ -112,13 +110,11 @@ class DownloadJob():
        scheme = url[:pos] if pos != -1 else "http"
        if scheme == "https":
            scheme = "http"
-
        downloader = self.downloaders.get(scheme)
        if downloader is None:
            module = self.mngr.get_downloader_module(scheme)
            downloader = module.Downloader()
            self.downloaders[scheme] = downloader
-
        return downloader

    @staticmethod
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@ -22,8 +22,8 @@ info = {

 class ThreeDeeBooruExtractor(JSONBooruExtractor):

-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "http://behoimi.org/post/index.json"
        self.headers = {
            "Referer": "http://behoimi.org/post/show/",
--- a/gallery_dl/extractor/4chan.py
+++ b/gallery_dl/extractor/4chan.py
@ -8,65 +8,25 @@

 """Extract image- and video-urls from threads on https://www.4chan.org/"""

-from .common import SequentialExtractor, Message
-from urllib.parse import unquote
-import re
+from .chan import ChanExtractor

 info = {
    "category": "4chan",
    "extractor": "FourChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
-    "filename": "{timestamp}-{name}",
+    "directory": ["{category}", "{board}-{thread}"],
+    "filename": "{tim}-{filename}{ext}",
    "pattern": [
        r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
    ],
 }

-class FourChanExtractor(SequentialExtractor):
+class FourChanExtractor(ChanExtractor):

-    url_fmt = "https://boards.4chan.org/{0}/res/{1}.html"
-    regex = (
-        r'<a (?:title="(?P<orig_name>[^"]+)" )?href="'
-        r'(?P<url>//i.4cdn.org/[^/]+/(?P<timestamp>\d+)\.(?P<extension>[^"]+))'
-        r'" target="_blank">(?P<name>[^<]+)</a> '
-        r'\((?P<size>[^,]+), (?P<width>\d+)x(?P<height>\d+)\)'
-    )
+    api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
+    file_url = "https://i.4cdn.org/{board}/{tim}{ext}"

-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.match = match
-        self.metadata = None
-
-    def items(self):
-        yield Message.Version, 1
-
-        url = self.url_fmt.format(*self.match.groups())
-        text = self.request(url).text
-        self.metadata = self.get_job_metadata(text)
-
-        yield Message.Directory, self.metadata
-        for match in re.finditer(self.regex, text):
-            yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
-
-    def get_job_metadata(self, text):
-        """Collect metadata for extractor-job"""
-        board, thread_id = self.match.groups()
-        title, _ = self.extract(text, '"description" content="', ' - &quot;/')
-        return {
-            "category": info["category"],
-            "board": board,
-            "thread-id": thread_id,
-            "title": unquote(title),
-        }
-
-    def get_file_metadata(self, match):
-        """Collect metadata for a downloadable file"""
-        data = self.metadata
-        data.update(match.groupdict(default=""))
-        data["name"] = unquote(data["orig_name"] or data["name"])
-        return data
-
-    @staticmethod
-    def get_file_url(match):
-        """Extract download-url from 'match'"""
-        return "https:" + match.group("url")
+    def __init__(self, match):
+        ChanExtractor.__init__(
+            self, info["category"],
+            match.group(1), match.group(2)
+        )
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@ -8,65 +8,25 @@

 """Extract image- and video-urls from threads on https://8ch.net/"""

-from .common import SequentialExtractor, Message
-from urllib.parse import unquote
-import re
+from .chan import ChanExtractor

 info = {
    "category": "8chan",
    "extractor": "InfinityChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
-    "filename": "{timestamp}-{name}",
+    "directory": ["{category}", "{board}-{thread}"],
+    "filename": "{tim}-{filename}{ext}",
    "pattern": [
-        r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*",
+        r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
    ],
 }

-class InfinityChanExtractor(SequentialExtractor):
+class InfinityChanExtractor(ChanExtractor):

-    url_base = "https://8ch.net"
-    url_fmt = url_base + "/{board}/res/{thread-id}.html"
-    regex = (
-        r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?'
-        r'<span class="postfilename"( title="([^"]+)")?>([^<]+)<'
-    )
-
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.match = match
-
-    def items(self):
-        yield Message.Version, 1
-
-        metadata = self.get_job_metadata()
-        yield Message.Directory, metadata
-
-        url = self.url_fmt.format(**metadata)
-        text = self.request(url).text
-        for match in re.finditer(self.regex, text):
-            yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
-
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        board, _, thread_id = self.match.group(1).split("/")
-        return {
-            "category": info["category"],
-            "board": board,
-            "thread-id": thread_id,
-        }
-
-    @staticmethod
-    def get_file_metadata(match):
-        """Collect metadata for a downloadable file"""
-        return {
-            "timestamp": match.group(2),
-            "name": unquote(match.group(4) or match.group(5)),
-        }
-
-    def get_file_url(self, match):
-        """Extract download-url from 'match'"""
-        url = match.group(1)
-        if url.startswith("/"):
-            url = self.url_base + url
-        return url
+    api_url = "https://8ch.net/{board}/res/{thread}.json"
+    file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"

+    def __init__(self, match):
+        ChanExtractor.__init__(
+            self, info["category"],
+            match.group(1), match.group(2)
+        )
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@ -8,10 +8,8 @@

 """Extract manga pages from http://bato.to/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url, unescape
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re

@ -29,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):

    url_base = "http://bato.to/read/_/"

-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
        self.chapter_id = match.group(1)

    def items(self):
@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor):
    def get_page_metadata(self, page_url):
        """Collect next url and metadata for one manga-page"""
        page = self.request(page_url).text
-        _    , pos = self.extract(page, 'selected="selected"', '')
-        title, pos = self.extract(page, ': ', '<', pos)
-        _    , pos = self.extract(page, 'selected="selected"', '', pos)
-        trans, pos = self.extract(page, '>', '<', pos)
-        _    , pos = self.extract(page, '<div id="full_image"', '', pos)
-        image, pos = self.extract(page, '<img src="', '"', pos)
-        url  , pos = self.extract(page, '<a href="', '"', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '')
+        title, pos = text.extract(page, ': ', '<', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '', pos)
+        trans, pos = text.extract(page, '>', '<', pos)
+        _    , pos = text.extract(page, '<div id="full_image"', '', pos)
+        image, pos = text.extract(page, '<img src="', '"', pos)
+        url  , pos = text.extract(page, '<a href="', '"', pos)
        mmatch = re.search(
            r"<title>(.+) - (?:vol (\d+) )?"
            r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>",
@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
            r"(.+) - ([^ ]+)",
            trans
        )
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
        name, ext = os.path.splitext(filename)
        return url, {
            "category": info["category"],
            "chapter-id": self.chapter_id,
-            "manga": unescape(mmatch.group(1)),
+            "manga": text.unescape(mmatch.group(1)),
            "volume": mmatch.group(2) or "",
            "chapter": mmatch.group(3),
            "page": mmatch.group(4),
            "group": tmatch.group(1),
            "language": tmatch.group(2),
-            "title": unescape(title),
+            "title": text.unescape(title),
            "image-url": image,
            "name": name,
            "extension": ext[1:],
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@ -8,23 +8,21 @@

 """Base classes for extractors for danbooru and co"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import xml.etree.ElementTree as ET
 import json
 import os.path
 import urllib.parse

-
 class BooruExtractor(SequentialExtractor):

    api_url = ""

-    def __init__(self, match, config, info):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match, info):
+        SequentialExtractor.__init__(self)
        self.info = info
-        self.tags = urllib.parse.unquote(match.group(1))
+        self.tags = text.unquote(match.group(1))
        self.page = "page"
        self.params = {"tags": self.tags}
        self.headers = {}
@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
    def get_file_metadata(self, data):
        """Collect metadata for a downloadable file"""
        data["category"] = self.info["category"]
-        data["name"] = urllib.parse.unquote(
-            filename_from_url(self.get_file_url(data))
+        data["name"] = text.unquote(
+            text.filename_from_url(self.get_file_url(data))
        )
        data["extension"] = os.path.splitext(data["name"])[1][1:]
        return data
--- a/gallery_dl/extractor/chan.py
+++ b/gallery_dl/extractor/chan.py
@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for different Futaba Channel boards"""
+
+from .common import SequentialExtractor, Message
+from .. import text
+
+class ChanExtractor(SequentialExtractor):
+    """Base class for extractors that read a thread from a JSON API
+
+    Subclasses set 'api_url' and 'file_url'. Both are str.format()
+    templates: 'api_url' is filled from the job metadata (category,
+    board, thread), 'file_url' from each post merged with that metadata.
+    """
+
+    api_url = ""
+    file_url = ""
+
+    def __init__(self, category, board, thread):
+        SequentialExtractor.__init__(self)
+        self.metadata = {
+            "category": category,
+            "board": board,
+            "thread": thread,
+        }
+
+    def items(self):
+        """Yield the version, directory and one url message per file"""
+        yield Message.Version, 1
+        posts = self.request(self.api_url.format(**self.metadata)).json()["posts"]
+        self.metadata["title"] = self.get_thread_title(posts[0])
+        yield Message.Directory, self.metadata
+        for post in posts:
+            # skip posts that carry no 'filename' entry
+            if "filename" not in post:
+                continue
+            post.update(self.metadata)
+            yield Message.Url, self.file_url.format(**post), post
+            # entries of 'extra_files' overwrite the per-file keys of the
+            # post before the file url is formatted again
+            for file in post.get("extra_files", ()):
+                post.update(file)
+                yield Message.Url, self.file_url.format(**post), post
+
+    @staticmethod
+    def get_thread_title(post):
+        """Return the thread title taken from its first post
+
+        Uses the subject ('sub') if present; otherwise the comment
+        ('com') is passed through text.remove_html() and cut to 50
+        characters. A post without a comment is treated as having an
+        empty one instead of raising KeyError.
+        """
+        if "sub" in post:
+            return post["sub"]
+        return text.remove_html(post.get("com", ""))[:50]
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@ -12,7 +12,7 @@ import time
 import queue
 import requests
 import threading
-import html.parser
+from .. import config


 class Message():
@ -44,36 +44,18 @@ class Extractor():
            "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
        )

-    @staticmethod
-    def extract(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos) + len(begin)
-            last = txt.index(end, first)
-            return txt[first:last], last+len(end)
-        except ValueError:
-            return None, pos
-
-    @staticmethod
-    def extract_all(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos)
-            last = txt.index(end, first + len(begin)) + len(end)
-            return txt[first:last], last
-        except ValueError:
-            return None, pos
-

 class SequentialExtractor(Extractor):

-    def __init__(self, _):
+    def __init__(self):
        Extractor.__init__(self)


 class AsynchronousExtractor(Extractor):

-    def __init__(self, config):
+    def __init__(self):
        Extractor.__init__(self)
-        queue_size = int(config.get("general", "queue-size", fallback=5))
+        queue_size = int(config.get(("queue-size",), default=5))
        self.__queue = queue.Queue(maxsize=queue_size)
        self.__thread = threading.Thread(target=self.async_items, daemon=True)

@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):

        # everything ok -- proceed to download
        return r
-
-def filename_from_url(url):
-    pos = url.rfind("/")
-    return url[pos+1:]
-
-unescape = html.parser.HTMLParser().unescape
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@ -22,6 +22,6 @@ info = {

 class DanbooruExtractor(JSONBooruExtractor):

-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://danbooru.donmai.us/posts.json"
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@ -23,6 +23,6 @@ info = {

 class E621Extractor(JSONBooruExtractor):

-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://e621.net/post/index.json"
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@ -22,8 +22,8 @@ info = {

 class GelbooruExtractor(XMLBooruExtractor):

-    def __init__(self, match, config):
-        XMLBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        XMLBooruExtractor.__init__(self, match, info)
        self.api_url = "http://gelbooru.com/"
        self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}

--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@ -8,9 +8,8 @@

 """Extract images from galleries at http://www.imagebam.com/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import text

 info = {
    "category": "imagebam",
@ -26,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):

    url_base = "http://www.imagebam.com"

-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
        self.match = match
        self.num = 0
        self.metadata = {}
@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
        done = False
        while not done:
            # get current page
-            text = self.request(self.url_base + next_url).text
+            page = self.request(self.url_base + next_url).text

            # get url for next page
-            next_url, pos = self.extract(text, "<a class='buttonblue' href='", "'")
+            next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")

            # if the following text isn't "><span>next image" we are done
-            if not text.startswith("><span>next image", pos):
+            if not page.startswith("><span>next image", pos):
                done = True

            # get image url
-            img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos)
+            img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)

            yield Message.Url, img_url, self.get_file_metadata(img_url)

    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
        gallery_key = self.match.group(2)
-        text = self.request(self.url_base + "/gallery/" + gallery_key).text
-        _    , pos = self.extract(text, "<img src='/img/icons/photos.png'", "")
-        title, pos = self.extract(text, "'> ", " <", pos)
-        count, pos = self.extract(text, "'>", " images", pos)
-        url  , pos = self.extract(text, "<a href='http://www.imagebam.com", "'", pos)
+        page = self.request(self.url_base + "/gallery/" + gallery_key).text
+        _    , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
+        title, pos = text.extract(page, "'> ", " <", pos)
+        count, pos = text.extract(page, "'>", " images", pos)
+        url  , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
        return {
            "category": info["category"],
            "key": gallery_key,
@ -77,5 +76,5 @@ class ImagebamExtractor(AsynchronousExtractor):
        self.num += 1
        data = self.metadata.copy()
        data["num"] = self.num
-        data["name"] = filename_from_url(url)
+        data["name"] = text.filename_from_url(url)
        return data
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@ -9,6 +9,7 @@
 """Extract images from galleries at http://imgbox.com/"""

 from .common import AsynchronousExtractor, Message
+from .. import text
 import re

 info = {
@ -25,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):

    url_base = "http://imgbox.com"

-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
        self.key = match.group(1)
        self.metadata = {}

@ -36,8 +37,8 @@ class ImgboxExtractor(AsynchronousExtractor):
        yield Message.Version, 1
        yield Message.Directory, self.metadata
        for match in re.finditer(r'<a href="([^"]+)"><img alt="', page):
-            text = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            imgpage = self.request(self.url_base + match.group(1)).text
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
@ -51,16 +52,16 @@ class ImgboxExtractor(AsynchronousExtractor):
            "count": match.group(4),
        }

-    def get_file_metadata(self, text):
+    def get_file_metadata(self, page):
        """Collect metadata for a downloadable file"""
        data = self.metadata.copy()
-        data["num"]      , pos = self.extract(text, '</a> &nbsp; ', ' of ')
-        data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos)
-        data["name"]     , pos = self.extract(text, ' title="', '"', pos)
+        data["num"]      , pos = text.extract(page, '</a> &nbsp; ', ' of ')
+        data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
+        data["name"]     , pos = text.extract(page, ' title="', '"', pos)
        return data

-    def get_file_url(self, text):
+    def get_file_url(self, page):
        """Extract download-url"""
        base = "http://i.imgbox.com/"
-        path, _ = self.extract(text, base, '"')
+        path, _ = text.extract(page, base, '"')
        return base + path
--- a/gallery_dl/extractor/imgchili.py
+++ b/gallery_dl/extractor/imgchili.py
@ -8,9 +8,8 @@

 """Extract images from albums at http://imgchili.net/"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import re

 info = {
@ -25,8 +24,8 @@ info = {

 class ImgchiliExtractor(SequentialExtractor):

-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
        self.match = match
        self.num = 0

@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
-        title = self.extract(page, "<h1>", "</h1>")[0]
+        title = text.extract(page, "<h1>", "</h1>")[0]
        return {
            "category": info["category"],
            "title": title,
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@ -8,10 +8,8 @@

 """Extract manga pages from http://www.mangareader.net/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import unescape, filename_from_url
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re

@ -30,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):

    url_base = "http://www.mangareader.net"

-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
        self.part = match.group(1)

    def items(self):
@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
    def get_page_metadata(self, page_url):
        """Collect next url, image-url and metadata for one manga-page"""
        page = self.request(page_url).text
-        extr = self.extract
+        extr = text.extract
        width = None
        descr, pos = extr(page, '<meta name="description" content="', '"')
        test , pos = extr(page, "document['pu']", '', pos)
@ -62,13 +60,13 @@ class MangaReaderExtractor(AsynchronousExtractor):
            width , pos = extr(page, '<img id="img" width="', '"', pos)
            height, pos = extr(page, ' height="', '"', pos)
        image, pos = extr(page, ' src="', '"', pos)
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
        name, ext = os.path.splitext(filename)
        match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr)

        return self.url_base + url, image, {
            "category": info["category"],
-            "manga": unescape(match.group(1)),
+            "manga": text.unescape(match.group(1)),
            "chapter": match.group(2),
            "page": match.group(3),
            "width": width,
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@ -8,9 +8,8 @@

 """Extract images from https://nijie.info/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import config, text
 import re

 info = {
@ -27,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):

    popup_url = "https://nijie.info/view_popup.php?id="

-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
        self.artist_id = match.group(1)
        self.artist_url = (
            "https://nijie.info/members_illust.php?id="
@ -37,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
        self.session.headers["Referer"] = self.artist_url
        self.session.cookies["R18"] = "1"
        self.session.cookies["nijie_referer"] = "nijie.info"
-        self.session.cookies.update(config["nijie-cookies"])
+        self.session.cookies.update(
+            config.get(("extractor", info["category"], "cookies"))
+        )

    def items(self):
        data = self.get_job_metadata()
@ -56,19 +57,20 @@ class NijieExtractor(AsynchronousExtractor):
        }

    def get_image_ids(self):
-        text = self.request(self.artist_url).text
+        """Collect all image-ids for a specific artist"""
+        page = self.request(self.artist_url).text
        regex = r'<a href="/view\.php\?id=(\d+)"'
-        return [m.group(1) for m in re.finditer(regex, text)]
+        return [m.group(1) for m in re.finditer(regex, page)]

    def get_image_data(self, image_id):
        """Get URL and metadata for images specified by 'image_id'"""
-        text = self.request(self.popup_url + image_id).text
-        matches = re.findall('<img src="([^"]+)"', text)
+        page = self.request(self.popup_url + image_id).text
+        matches = re.findall('<img src="([^"]+)"', page)
        for index, url in enumerate(matches):
            yield "https:" + url, {
                "count": len(matches),
                "index": index,
                "image-id": image_id,
-                "name" : filename_from_url(url),
+                "name" : text.filename_from_url(url),
                "extension": url[url.rfind(".")+1:],
            }
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@ -8,8 +8,8 @@

 """Extract images and ugoira from http://www.pixiv.net/"""

-from .common import SequentialExtractor
-from .common import Message
+from .common import SequentialExtractor, Message
+from .. import config, text
 import re
 import json

@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.config = config
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
        self.artist_id = match.group(1)
        self.api = PixivAPI(self.session)

    def items(self):
        self.api.login(
-            self.config.get("pixiv", "username"),
-            self.config.get("pixiv", "password"),
+            config.get(("extractor", "pixiv", "username")),
+            config.get(("extractor", "pixiv", "password")),
        )
        metadata = self.get_job_metadata()

@ -84,9 +83,9 @@ class PixivExtractor(SequentialExtractor):

    def get_works(self):
        """Yield all work-items for a pixiv-member"""
-        page = 1
+        pagenum = 1
        while True:
-            data = self.api.user_works(self.artist_id, page)
+            data = self.api.user_works(self.artist_id, pagenum)
            for work in data["response"]:
                url = work["image_urls"]["large"]
                work["num"] = ""
@ -96,17 +95,17 @@ class PixivExtractor(SequentialExtractor):
            pinfo = data["pagination"]
            if pinfo["current"] == pinfo["pages"]:
                return
-            page = pinfo["next"]
+            pagenum = pinfo["next"]

    def parse_ugoira(self, data):
        """Parse ugoira data"""
        # get illust page
-        text = self.request(
+        page = self.request(
            self.illust_url, params={"illust_id": data["id"]},
        ).text

        # parse page
-        frames, _ = self.extract(text, ',"frames":[', ']')
+        frames, _ = text.extract(page, ',"frames":[', ']')

        # build url
        url = re.sub(
@ -146,7 +145,7 @@ class PixivAPI():
        self.session = session
        self.session.headers.update({
            "Referer": "http://www.pixiv.net/",
-            "User-Agent": "PixivIOSApp/5.1.1",
+            "User-Agent": "PixivIOSApp/5.8.0",
            # "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU",
        })

--- a/gallery_dl/extractor/redhawkscans.py
+++ b/gallery_dl/extractor/redhawkscans.py
@ -8,9 +8,8 @@

 """Extract manga pages from http://manga.redhawkscans.com/"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import unescape
+from .common import SequentialExtractor, Message
+from .. import text
 import os.path
 import json
 import re
@ -29,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):

    url_base = "https://manga.redhawkscans.com/reader/read/"

-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
        self.part = match.group(1)

    def items(self):
@ -50,16 +49,16 @@ class RedHawkScansExtractor(SequentialExtractor):
        response = self.request(self.url_base + self.part)
        response.encoding = "utf-8"
        page = response.text
-        _        , pos = self.extract(page, '<h1 class="tbtitle dnone">', '')
-        manga    , pos = self.extract(page, 'title="', '"', pos)
-        chapter  , pos = self.extract(page, '">', '</a>', pos)
-        json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos)
+        _        , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
+        manga    , pos = text.extract(page, 'title="', '"', pos)
+        chapter  , pos = text.extract(page, '">', '</a>', pos)
+        json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
        match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
        return {
            "category": info["category"],
-            "manga": unescape(manga),
+            "manga": text.unescape(manga),
            "chapter": match.group(2) or match.group(1),
            "chapter-minor": match.group(3) or "",
            "language": "English",
-            "title": unescape(match.group(4) or ""),
+            "title": text.unescape(match.group(4) or ""),
        }, json.loads(json_data)
--- a/gallery_dl/extractor/yandere.py
+++ b/gallery_dl/extractor/yandere.py
@ -22,6 +22,6 @@ info = {

 class YandereExtractor(JSONBooruExtractor):

-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://yande.re/post.json"
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html.parser
+import urllib.parse
+import platform
+
+def remove_html(text):
+    """Replace html-tags with spaces and collapse whitespace runs"""
+    return " ".join(re.compile("<[^>]+?>").sub(" ", text).split())
+
+def filename_from_url(url):
+    """Return the last path-segment of an url to use as a filename"""
+    try:
+        path = urllib.parse.urlparse(url).path
+    except ValueError:
+        # urlparse() rejects e.g. malformed IPv6 hosts: keep the url as it is
+        return url
+    return path.rpartition("/")[2]  # whole path (no query) if it has no "/"
+
+def clean_path_windows(path):
+    """Replace characters illegal in a Windows path-segment with '_'"""
+    return re.compile(r'[<>:"\\/|?*]').sub("_", path)
+
+def clean_path_posix(path):
+    """Replace every '/' in a path-segment with '_' (Posix)"""
+    return "_".join(path.split("/"))
+
+def extract(txt, begin, end, pos=0):
+    """Return (text between 'begin' and 'end', end-position) or (None, pos)"""
+    start = txt.find(begin, pos)
+    stop = txt.find(end, start + len(begin)) if start >= 0 else -1
+    if stop < 0:
+        return None, pos
+    return txt[start+len(begin):stop], stop + len(end)
+
+def extract_all(txt, begin, end, pos=0):
+    """Return (text from 'begin' through 'end', end-position) or (None, pos)"""
+    start = txt.find(begin, pos)
+    stop = txt.find(end, start + len(begin)) if start >= 0 else -1
+    if stop < 0:
+        return None, pos
+    return txt[start:stop+len(end)], stop + len(end)
+
+clean_path = (  # path-cleaner matching the platform this runs on
+    clean_path_windows if platform.system() == "Windows"
+    else clean_path_posix
+)
+
+unquote = urllib.parse.unquote
+# HTMLParser.unescape() is gone since Python 3.9; html.unescape replaces it
+unescape = html.unescape
--- a/setup.py
+++ b/setup.py
@ -46,4 +46,5 @@ setup(
        "Topic :: Multimedia",
        "Topic :: Multimedia :: Graphics",
    ],
+    test_suite='test',
 )
--- a/test/__init__.py
+++ b/test/__init__.py
--- a/test/test_config.py
+++ b/test/test_config.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.config as config
+import os
+import tempfile
+
+class TestConfig(unittest.TestCase):  # exercises gallery_dl.config
+
+    def setUp(self):  # write a small JSON config to a temp file and load it
+        fd, self._configfile = tempfile.mkstemp()
+        with os.fdopen(fd, "w") as file:
+            file.write('{"a": "1", "b": {"c": "text"}}')
+        config.load(self._configfile)
+
+    def tearDown(self):  # call config.clear() and delete the temp file
+        config.clear()
+        os.remove(self._configfile)
+
+    def test_get(self):  # key-path lookup; missing paths: None or the default
+        self.assertEqual(config.get(["a"]), "1")
+        self.assertEqual(config.get(["b", "c"]), "text")
+        self.assertEqual(config.get(["d"]), None)
+        self.assertEqual(config.get(["e", "f", "g"], 123), 123)
+
+    def test_set(self):  # set an existing and a new key-path, then read back
+        config.set(["b", "c"], [1, 2, 3])
+        config.set(["e", "f", "g"], value=234)
+        self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
+        self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+    def test_interpolate(self):  # expects fallback to the key at outer levels
+        self.assertEqual(config.interpolate(["a"]), "1")
+        self.assertEqual(config.interpolate(["b", "a"]), "1")
+        self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
+        self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
+        config.set(["d"], 123)  # from here on a top-level "d" exists
+        self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
+        self.assertEqual(config.interpolate(["d", "d"], "2"), 123)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/test_text.py
+++ b/test/test_text.py
@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.text as text
+
+class TestText(unittest.TestCase):  # checks string helpers of gallery_dl.text
+
+    def test_remove_html(self):  # every case must reduce to the same text
+        cases = (
+            "Hello World.",
+            " Hello  World. ",
+            "Hello<br/>World.",
+            "<div><span class='a'>Hello</span><strong>World.</strong></div>"
+        )
+        result = "Hello World."
+        for case in cases:
+            self.assertEqual(text.remove_html(case), result)
+
+    def test_filename_from_url(self):  # every url form yields "filename.ext"
+        cases = (
+            "http://example.org/v2/filename.ext",
+            "http://example.org/v2/filename.ext?param=value#fragment",
+            "example.org/filename.ext",
+            "/filename.ext",
+            "filename.ext",
+        )
+        result = "filename.ext"
+        for case in cases:
+            self.assertEqual(text.filename_from_url(case), result)
+
+    def test_clean_path(self):  # input -> (expected Windows, expected Posix)
+        cases = {
+            "Hello World." : ("Hello World.", "Hello World."),
+            "Hello/World/.": ("Hello_World_.", "Hello_World_."),
+            r'<Hello>:|"World\*?': (
+                '_Hello____World___', r'<Hello>:|"World\*?'
+            ),
+        }
+        for case, result in cases.items():
+            self.assertEqual(text.clean_path_windows(case), result[0])
+            self.assertEqual(text.clean_path_posix  (case), result[1])
+
+if __name__ == '__main__':
+    unittest.main()