Merge branch 'config' into loader

2024-11-25 12:12:34 +01:00 · 2015-10-05 17:46:04 +02:00 · 2015-10-05 17:46:04 +02:00 · e23aaa4298
commit e23aaa4298
parent 26bb9d62de 5ae3dd84ba
35 changed files with 532 additions and 351 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,57 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 env/
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 *.egg-info/
 .installed.cfg
 *.egg
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *,cover
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 target/
--- a/18
+++ b/18
@ -1,18 +0,0 @@
 [pixiv]
 username = XXXXX
 password = XXXXX
 [exhentai-cookies]
 ipb_member_id = XXXXX
 ipb_pass_hash = XXXXX
 [nijie-cookies]
 NIJIEIJIEID = XXXXX
 nijie_email_hash = XXXXX
 nijie_login_hash = XXXXX
 [danbooru]
 regex0 = d(?:anbooru)?[.:-_](\w.+)
 [gelbooru]
 regex0 = g(?:elbooru)?[.:-_](\w.+)
--- a/config.json
+++ b/config.json
@ -0,0 +1,39 @@
 {
    "base-directory": "/tmp/",
    "extractor":
    {
        "pixiv":
        {
            "directory": ["{category}", "{artist-id}"],
            "username": "XXX",
            "password": "XXX"
        },
        "nijie":
        {
            "cookies":
            {
                "NIJIEIJIEID": "XXX",
                "nijie_email_hash": "XXX",
                "nijie_login_hash": "XXX"
            }
        },
        "4chan":
        {
            "directory": ["{category}", "{board}", "{thread} - {title}"]
        },
        "danbooru":
        {
            "pattern": ["d(?:anbooru)?[.:-_](\\w.+)"],
            "filename": "{category}_{id:>07}_{md5}.{extension}"
        },
        "gelbooru":
        {
            "pattern": ["g(?:elbooru)?[.:-_](\\w.+)"],
            "filename": "{category}_{id:>07}_{md5}.{extension}"
        },
        "e621":
        {
            "pattern": ["e(?:621)?[.:-_](\\w.+)"]
        }
    }
 }
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@ -1,23 +0,0 @@
 Metadata-Version: 1.1
 Name: gallery-dl
 Version: 0.2
 Summary: gallery- and image downloader
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
 Author-email: mike_faehrmann@web.de
 License: GPLv2
 Description: download image galleries from several image hosting platforms
 Platform: UNKNOWN
 Classifier: Development Status :: 3 - Alpha
 Classifier: Environment :: Console
 Classifier: Intended Audience :: End Users/Desktop
 Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
 Classifier: Operating System :: POSIX
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.2
 Classifier: Programming Language :: Python :: 3.3
 Classifier: Programming Language :: Python :: 3.4
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Multimedia
 Classifier: Topic :: Multimedia :: Graphics
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@ -1,35 +0,0 @@
 setup.py
 bin/gallery-dl
 gallery_dl/__init__.py
 gallery_dl/download.py
 gallery_dl.egg-info/PKG-INFO
 gallery_dl.egg-info/SOURCES.txt
 gallery_dl.egg-info/dependency_links.txt
 gallery_dl.egg-info/entry_points.txt
 gallery_dl.egg-info/requires.txt
 gallery_dl.egg-info/top_level.txt
 gallery_dl/downloader/__init__.py
 gallery_dl/downloader/common.py
 gallery_dl/downloader/http.py
 gallery_dl/downloader/https.py
 gallery_dl/downloader/text.py
 gallery_dl/extractor/3dbooru.py
 gallery_dl/extractor/4chan.py
 gallery_dl/extractor/8chan.py
 gallery_dl/extractor/__init__.py
 gallery_dl/extractor/batoto.py
 gallery_dl/extractor/booru.py
 gallery_dl/extractor/common.py
 gallery_dl/extractor/danbooru.py
 gallery_dl/extractor/e621.py
 gallery_dl/extractor/exhentai.py
 gallery_dl/extractor/gelbooru.py
 gallery_dl/extractor/imagebam.py
 gallery_dl/extractor/imgbox.py
 gallery_dl/extractor/imgchili.py
 gallery_dl/extractor/mangareader.py
 gallery_dl/extractor/nijie.py
 gallery_dl/extractor/pixiv.py
 gallery_dl/extractor/redhawkscans.py
 gallery_dl/extractor/sankaku.py
 gallery_dl/extractor/yandere.py
--- a/gallery_dl.egg-info/dependency_links.txt
+++ b/gallery_dl.egg-info/dependency_links.txt
@ -1 +0,0 @@
--- a/gallery_dl.egg-info/entry_points.txt
+++ b/gallery_dl.egg-info/entry_points.txt
@ -1,3 +0,0 @@
 [console_scripts]
 gallery-dl = gallery_dl:main
--- a/gallery_dl.egg-info/requires.txt
+++ b/gallery_dl.egg-info/requires.txt
@ -1 +0,0 @@
 requests >= 2.0
--- a/gallery_dl.egg-info/top_level.txt
+++ b/gallery_dl.egg-info/top_level.txt
@ -1 +0,0 @@
 gallery_dl
--- a/gallery_dl/init.py
+++ b/gallery_dl/init.py
@ -17,9 +17,7 @@ __email__      = "mike_faehrmann@web.de"
 import os
 import sys
 import argparse
-import configparser
+from . import config, download
 from .download import DownloadManager
 def parse_cmdline_options():
    parser = argparse.ArgumentParser(
@ -41,18 +39,10 @@ def parse_cmdline_options():
    )
    return parser.parse_args()
 def parse_config_file(path):
    """Read the INI-style configuration file at 'path'

    Interpolation is disabled and option names are kept exactly as
    written in the file.  The populated ConfigParser object is returned.
    """
    parser = configparser.ConfigParser(interpolation=None)
    # identity transform: hand every option name back unchanged
    parser.optionxform = lambda opt: opt
    parser.read(os.path.expanduser(path))
    return parser
 def main():
    config.load()
    opts = parse_cmdline_options()
-    conf = parse_config_file(opts.config)
+    dlmgr = download.DownloadManager(opts)
    dlmgr = DownloadManager(opts, conf)
    try:
        for url in opts.urls:
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@ -0,0 +1,88 @@
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Global configuration module"""
 import sys
 import json
 import os.path
 import platform
 # --------------------------------------------------------------------
 # public interface
 def load(*files):
    """Load JSON configuration files and merge them into the global config

    Every path in 'files' (or, if none are given, every entry of
    '_default_configs') is user-expanded, parsed as JSON and merged into
    '_config' with dict.update(), so top-level keys of a later file
    replace those of an earlier one.  Missing files are skipped silently;
    files containing invalid JSON are reported on stderr and skipped.
    """
    configfiles = files or _default_configs
    for conf in configfiles:
        try:
            path = os.path.expanduser(conf)
            # JSON text is UTF-8; do not depend on the locale's preferred
            # encoding, which differs between platforms
            with open(path, encoding="utf-8") as file:
                confdict = json.load(file)
            _config.update(confdict)
        except FileNotFoundError:
            continue
        except json.decoder.JSONDecodeError as exception:
            print("Error while loading '", path, "':", sep="", file=sys.stderr)
            print(exception, file=sys.stderr)
 def clear():
    """Reset the configuration to an empty state

    The module-level '_config' name is rebound to a new dictionary; the
    previous dictionary object itself is left untouched.
    """
    global _config
    _config = {}
 def get(keys, default=None):
    """Get the value at path 'keys' or a default value if it doesn't exist

    'keys' is a sequence of keys leading through the nested configuration
    dictionaries, e.g. ("extractor", "pixiv", "username").  'default' is
    returned when a key on that path is missing, or when the path runs
    through a value that cannot be indexed with the next key.
    """
    conf = _config
    try:
        for k in keys:
            conf = conf[k]
        return conf
    except (KeyError, IndexError, TypeError, AttributeError):
        # TypeError/IndexError: the path tried to descend into a string,
        # number or list; a malformed path means "not set", not a crash
        return default
 def interpolate(keys, default=None):
    """Get the value at path 'keys' with fallback to outer levels

    While descending along 'keys', every dictionary on the way is checked
    for an entry named like the last key; the deepest such entry replaces
    'default'.  That fallback is returned if the full path does not exist.
    """
    node = _config
    fallback = default
    try:
        for name in keys:
            # remember a same-named setting from this level before going deeper
            fallback = node.get(keys[-1], fallback)
            node = node[name]
    except (KeyError, AttributeError):
        return fallback
    return node
 def set(keys, value):
    """Store 'value' at path 'keys' in the in-memory configuration

    Dictionaries missing along the path are created on the way down; the
    last element of 'keys' names the entry that receives 'value'.
    """
    node = _config
    for name in keys[:-1]:
        try:
            child = node[name]
        except KeyError:
            # create the missing intermediate level and continue inside it
            child = node[name] = {}
        node = child
    node[keys[-1]] = value
 # --------------------------------------------------------------------
 # internals
 # merged content of all configuration files loaded so far; filled by
 # load(), modified by set() and replaced by clear()
 _config = {}
 # configuration files read by load() when it is called without arguments;
 # they are merged in list order, so later entries override earlier ones
 if platform.system() == "Windows":
    _default_configs = [
        r"~\.config\gallery-dl.conf",
        r"~\.gallery-dl.conf",
    ]
 else:
    _default_configs = [
        "/etc/gallery-dl.conf",
        "~/.config/gallery/config.json",
        "~/.config/gallery-dl.conf",
        "~/.gallery-dl.conf",
    ]
--- a/gallery_dl/download.py
+++ b/gallery_dl/download.py
@ -11,12 +11,12 @@ import sys
 import importlib
 from . import extractor
 from .extractor.common import Message
 from . import config
 class DownloadManager():
-    def __init__(self, opts, config):
+    def __init__(self, opts):
        self.opts = opts
        self.config = config
        self.modules = {}
    def add(self, url):
@ -36,7 +36,7 @@ class DownloadManager():
        if self.opts.dest:
            return self.opts.dest
        else:
-            return self.config.get("general", "destination", fallback="/tmp/")
+            return config.get(("base-directory",), default="/tmp/")
 class DownloadJob():
@ -48,16 +48,14 @@ class DownloadJob():
            return
        self.directory = mngr.get_base_directory()
        self.downloaders = {}
-        self.filename_fmt = mngr.config.get(
+        self.filename_fmt = config.get(
-            self.info["category"], "filename",
+            ("extractor", self.info["category"], "filename"),
-            fallback=self.info["filename"]
+            default=self.info["filename"]
        )
        segments = config.get(
            ("extractor", self.info["category"], "directory"),
            default=self.info["directory"]
        )
        try:
            segments = mngr.config.get(
                self.info["category"], "directory"
            ).split("/")
        except Exception:
            segments = self.info["directory"]
        self.directory_fmt = os.path.join(*segments)
    def run(self):
@ -112,13 +110,11 @@ class DownloadJob():
        scheme = url[:pos] if pos != -1 else "http"
        if scheme == "https":
            scheme = "http"
        downloader = self.downloaders.get(scheme)
        if downloader is None:
            module = self.mngr.get_downloader_module(scheme)
            downloader = module.Downloader()
            self.downloaders[scheme] = downloader
        return downloader
    @staticmethod
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@ -22,8 +22,8 @@ info = {
 class ThreeDeeBooruExtractor(JSONBooruExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        JSONBooruExtractor.__init__(self, match, config, info)
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "http://behoimi.org/post/index.json"
        self.headers = {
            "Referer": "http://behoimi.org/post/show/",
--- a/gallery_dl/extractor/4chan.py
+++ b/gallery_dl/extractor/4chan.py
@ -8,65 +8,25 @@
 """Extract image- and video-urls from threads on https://www.4chan.org/"""
-from .common import SequentialExtractor, Message
+from .chan import ChanExtractor
 from urllib.parse import unquote
 import re
 info = {
    "category": "4chan",
    "extractor": "FourChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
+    "directory": ["{category}", "{board}-{thread}"],
-    "filename": "{timestamp}-{name}",
+    "filename": "{tim}-{filename}{ext}",
    "pattern": [
        r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
    ],
 }
-class FourChanExtractor(SequentialExtractor):
+class FourChanExtractor(ChanExtractor):
-    url_fmt = "https://boards.4chan.org/{0}/res/{1}.html"
+    api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
-    regex = (
+    file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
        r'<a (?:title="(?P<orig_name>[^"]+)" )?href="'
        r'(?P<url>//i.4cdn.org/[^/]+/(?P<timestamp>\d+)\.(?P<extension>[^"]+))'
        r'" target="_blank">(?P<name>[^<]+)</a> '
        r'\((?P<size>[^,]+), (?P<width>\d+)x(?P<height>\d+)\)'
    )
-    def __init__(self, match, config):
+    def __init__(self, match):
-        SequentialExtractor.__init__(self, config)
+        ChanExtractor.__init__(
-        self.match = match
+            self, info["category"],
-        self.metadata = None
+            match.group(1), match.group(2)
-
+        )
    def items(self):
        """Yield the version, directory and url messages for one thread"""
        yield Message.Version, 1
        page = self.request(self.url_fmt.format(*self.match.groups())).text
        self.metadata = self.get_job_metadata(page)
        yield Message.Directory, self.metadata
        # one url message per file link found by the class regex
        for found in re.finditer(self.regex, page):
            file_url = self.get_file_url(found)
            yield Message.Url, file_url, self.get_file_metadata(found)
    def get_job_metadata(self, text):
        """Collect metadata for extractor-job

        Board and thread id come from the URL match, the title from the
        page's "description" meta content.  The title is an empty string
        if that marker cannot be found in 'text'.
        """
        board, thread_id = self.match.groups()
        title, _ = self.extract(text, '"description" content="', ' - &quot;/')
        return {
            "category": info["category"],
            "board": board,
            "thread-id": thread_id,
            # extract() gives None when the markers are missing, which
            # unquote() cannot handle
            "title": unquote(title or ""),
        }
    def get_file_metadata(self, match):
        """Collect metadata for a downloadable file

        Returns a new dictionary holding the job metadata plus all named
        groups of 'match' (unmatched groups become ""); "name" is the
        URL-unquoted original filename if one was matched, otherwise the
        displayed name.
        """
        # work on a copy: updating self.metadata in place would leak one
        # file's fields into the job metadata and into every dictionary
        # that was returned earlier
        data = self.metadata.copy()
        data.update(match.groupdict(default=""))
        data["name"] = unquote(data["orig_name"] or data["name"])
        return data
    @staticmethod
    def get_file_url(match):
        """Return the download url: "https:" plus the 'url' group of 'match'"""
        scheme_less = match.group("url")
        return "https:" + scheme_less
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@ -8,65 +8,25 @@
 """Extract image- and video-urls from threads on https://8ch.net/"""
-from .common import SequentialExtractor, Message
+from .chan import ChanExtractor
 from urllib.parse import unquote
 import re
 info = {
    "category": "8chan",
    "extractor": "InfinityChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
+    "directory": ["{category}", "{board}-{thread}"],
-    "filename": "{timestamp}-{name}",
+    "filename": "{tim}-{filename}{ext}",
    "pattern": [
-        r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*",
+        r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
    ],
 }
-class InfinityChanExtractor(SequentialExtractor):
+class InfinityChanExtractor(ChanExtractor):
-    url_base = "https://8ch.net"
+    api_url = "https://8ch.net/{board}/res/{thread}.json"
-    url_fmt = url_base + "/{board}/res/{thread-id}.html"
+    file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
    regex = (
        r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?'
        r'<span class="postfilename"( title="([^"]+)")?>([^<]+)<'
    )
    def __init__(self, match, config):
        """Keep the URL match object; 'config' is passed to the base class"""
        SequentialExtractor.__init__(self, config)
        self.match = match
    def items(self):
        """Yield the version, directory and url messages for one thread"""
        yield Message.Version, 1
        job = self.get_job_metadata()
        yield Message.Directory, job
        # the thread page is only requested after the directory message
        page = self.request(self.url_fmt.format(**job)).text
        for found in re.finditer(self.regex, page):
            file_url = self.get_file_url(found)
            yield Message.Url, file_url, self.get_file_metadata(found)
    def get_job_metadata(self):
        """Build the job metadata from the first group of the URL match

        That group is split at "/" into exactly three parts, of which the
        first is the board and the last the thread id.
        """
        board, _middle, thread_id = self.match.group(1).split("/")
        metadata = {"category": info["category"]}
        metadata["board"] = board
        metadata["thread-id"] = thread_id
        return metadata
    @staticmethod
    def get_file_metadata(match):
        """Build the metadata of one file from its regex match

        Group 2 supplies the timestamp; the name is group 4 if it matched
        something, otherwise group 5, with URL quoting removed.
        """
        timestamp = match.group(2)
        raw_name = match.group(4) or match.group(5)
        return {"timestamp": timestamp, "name": unquote(raw_name)}
    def get_file_url(self, match):
        """Return the url in group 1 of 'match', prefixed with 'url_base'
        if it starts with a slash"""
        path = match.group(1)
        return self.url_base + path if path.startswith("/") else path
    def __init__(self, match):
        """Initialize the base class with this module's category and the
        first two groups of 'match'"""
        ChanExtractor.__init__(
            self, info["category"],
            match.group(1), match.group(2)
        )
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@ -8,10 +8,8 @@
 """Extract manga pages from http://bato.to/"""
-from .common import AsynchronousExtractor
+from .common import AsynchronousExtractor, Message
-from .common import Message
+from .. import text
 from .common import filename_from_url, unescape
 from urllib.parse import unquote
 import os.path
 import re
@ -29,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):
    url_base = "http://bato.to/read/_/"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        AsynchronousExtractor.__init__(self, config)
+        AsynchronousExtractor.__init__(self)
        self.chapter_id = match.group(1)
    def items(self):
@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor):
    def get_page_metadata(self, page_url):
        """Collect next url and metadata for one manga-page"""
        page = self.request(page_url).text
-        _    , pos = self.extract(page, 'selected="selected"', '')
+        _    , pos = text.extract(page, 'selected="selected"', '')
-        title, pos = self.extract(page, ': ', '<', pos)
+        title, pos = text.extract(page, ': ', '<', pos)
-        _    , pos = self.extract(page, 'selected="selected"', '', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '', pos)
-        trans, pos = self.extract(page, '>', '<', pos)
+        trans, pos = text.extract(page, '>', '<', pos)
-        _    , pos = self.extract(page, '<div id="full_image"', '', pos)
+        _    , pos = text.extract(page, '<div id="full_image"', '', pos)
-        image, pos = self.extract(page, '<img src="', '"', pos)
+        image, pos = text.extract(page, '<img src="', '"', pos)
-        url  , pos = self.extract(page, '<a href="', '"', pos)
+        url  , pos = text.extract(page, '<a href="', '"', pos)
        mmatch = re.search(
            r"<title>(.+) - (?:vol (\d+) )?"
            r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>",
@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
            r"(.+) - ([^ ]+)",
            trans
        )
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
        name, ext = os.path.splitext(filename)
        return url, {
            "category": info["category"],
            "chapter-id": self.chapter_id,
-            "manga": unescape(mmatch.group(1)),
+            "manga": text.unescape(mmatch.group(1)),
            "volume": mmatch.group(2) or "",
            "chapter": mmatch.group(3),
            "page": mmatch.group(4),
            "group": tmatch.group(1),
            "language": tmatch.group(2),
-            "title": unescape(title),
+            "title": text.unescape(title),
            "image-url": image,
            "name": name,
            "extension": ext[1:],
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@ -8,23 +8,21 @@
 """Base classes for extractors for danbooru and co"""
-from .common import SequentialExtractor
+from .common import SequentialExtractor, Message
-from .common import Message
+from .. import text
 from .common import filename_from_url
 import xml.etree.ElementTree as ET
 import json
 import os.path
 import urllib.parse
 class BooruExtractor(SequentialExtractor):
    api_url = ""
-    def __init__(self, match, config, info):
+    def __init__(self, match, info):
-        SequentialExtractor.__init__(self, config)
+        SequentialExtractor.__init__(self)
        self.info = info
-        self.tags = urllib.parse.unquote(match.group(1))
+        self.tags = text.unquote(match.group(1))
        self.page = "page"
        self.params = {"tags": self.tags}
        self.headers = {}
@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
    def get_file_metadata(self, data):
        """Collect metadata for a downloadable file"""
        data["category"] = self.info["category"]
-        data["name"] = urllib.parse.unquote(
+        data["name"] = text.unquote(
-            filename_from_url(self.get_file_url(data))
+            text.filename_from_url(self.get_file_url(data))
        )
        data["extension"] = os.path.splitext(data["name"])[1][1:]
        return data
--- a/gallery_dl/extractor/chan.py
+++ b/gallery_dl/extractor/chan.py
@ -0,0 +1,47 @@
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Base classes for extractors for different Futaba Channel boards"""
 from .common import SequentialExtractor, Message
 from .. import text
 class ChanExtractor(SequentialExtractor):
    """Base class for extractors that read a thread from a JSON API

    Subclasses provide two format strings: 'api_url', which is formatted
    with the job metadata (category, board, thread), and 'file_url', which
    is formatted with the fields of a single post.
    """
    api_url = ""
    file_url = ""
    def __init__(self, category, board, thread):
        """Store category, board and thread as the job metadata"""
        SequentialExtractor.__init__(self)
        self.metadata = {
            "category": category,
            "board": board,
            "thread": thread,
        }
    def items(self):
        """Yield the version, directory and url messages for the thread"""
        yield Message.Version, 1
        posts = self.request(self.api_url.format(**self.metadata)).json()["posts"]
        self.metadata["title"] = self.get_thread_title(posts[0])
        yield Message.Directory, self.metadata
        for post in posts:
            # only posts with a "filename" entry produce a download
            # (presumably the ones with an attached file)
            if "filename" not in post:
                continue
            post.update(self.metadata)
            yield Message.Url, self.file_url.format(**post), post
            if "extra_files" in post:
                for file in post["extra_files"]:
                    # the extra file's fields are overlaid on the same
                    # post dictionary before it is yielded again
                    post.update(file)
                    yield Message.Url, self.file_url.format(**post), post
    @staticmethod
    def get_thread_title(post):
        """Return thread title from first post

        The subject ("sub") is used when present; otherwise the first 50
        characters of the comment ("com") after text.remove_html().  A
        post with neither field yields an empty title.
        """
        if "sub" in post:
            return post["sub"]
        # the comment field may be absent as well; without this check a
        # thread opened with a bare file fails with a KeyError
        if "com" not in post:
            return ""
        return text.remove_html(post["com"])[:50]
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@ -12,7 +12,7 @@ import time
 import queue
 import requests
 import threading
-import html.parser
+from .. import config
 class Message():
@ -44,36 +44,18 @@ class Extractor():
            "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
        )
    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return the text between 'begin' and 'end' and the index after 'end'

        The search starts at index 'pos'.  If either marker is not found,
        the result is (None, pos).
        """
        start = txt.find(begin, pos)
        if start < 0:
            return None, pos
        start += len(begin)
        stop = txt.find(end, start)
        if stop < 0:
            return None, pos
        return txt[start:stop], stop + len(end)
    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Return the text from 'begin' through 'end' and the index after it

        Unlike extract(), the returned text includes both markers.  The
        search starts at index 'pos'; if either marker is not found, the
        result is (None, pos).
        """
        start = txt.find(begin, pos)
        if start < 0:
            return None, pos
        stop = txt.find(end, start + len(begin))
        if stop < 0:
            return None, pos
        stop += len(end)
        return txt[start:stop], stop
 class SequentialExtractor(Extractor):
-    def __init__(self, _):
+    def __init__(self):
        Extractor.__init__(self)
 class AsynchronousExtractor(Extractor):
-    def __init__(self, config):
+    def __init__(self):
        Extractor.__init__(self)
-        queue_size = int(config.get("general", "queue-size", fallback=5))
+        queue_size = int(config.get(("queue-size",), default=5))
        self.__queue = queue.Queue(maxsize=queue_size)
        self.__thread = threading.Thread(target=self.async_items, daemon=True)
@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):
        # everything ok -- proceed to download
        return r
 def filename_from_url(url):
    """Return the part of 'url' after its last "/" (all of it if there is none)"""
    return url.rpartition("/")[2]
 # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
 # prefer html.unescape() and fall back only on interpreters that lack it
 try:
    unescape = html.unescape
 except AttributeError:
    unescape = html.parser.HTMLParser().unescape
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@ -22,6 +22,6 @@ info = {
 class DanbooruExtractor(JSONBooruExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        JSONBooruExtractor.__init__(self, match, config, info)
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://danbooru.donmai.us/posts.json"
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@ -23,6 +23,6 @@ info = {
 class E621Extractor(JSONBooruExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        JSONBooruExtractor.__init__(self, match, config, info)
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://e621.net/post/index.json"
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@ -22,8 +22,8 @@ info = {
 class GelbooruExtractor(XMLBooruExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        XMLBooruExtractor.__init__(self, match, config, info)
+        XMLBooruExtractor.__init__(self, match, info)
        self.api_url = "http://gelbooru.com/"
        self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@ -8,9 +8,8 @@
 """Extract images from galleries at http://www.imagebam.com/"""
-from .common import AsynchronousExtractor
+from .common import AsynchronousExtractor, Message
-from .common import Message
+from .. import text
 from .common import filename_from_url
 info = {
    "category": "imagebam",
@ -26,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
    url_base = "http://www.imagebam.com"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        AsynchronousExtractor.__init__(self, config)
+        AsynchronousExtractor.__init__(self)
        self.match = match
        self.num = 0
        self.metadata = {}
@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
        done = False
        while not done:
            # get current page
-            text = self.request(self.url_base + next_url).text
+            page = self.request(self.url_base + next_url).text
            # get url for next page
-            next_url, pos = self.extract(text, "<a class='buttonblue' href='", "'")
+            next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")
            # if the following text isn't "><span>next image" we are done
-            if not text.startswith("><span>next image", pos):
+            if not page.startswith("><span>next image", pos):
                done = True
            # get image url
-            img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos)
+            img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
            yield Message.Url, img_url, self.get_file_metadata(img_url)
    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
        gallery_key = self.match.group(2)
-        text = self.request(self.url_base + "/gallery/" + gallery_key).text
+        page = self.request(self.url_base + "/gallery/" + gallery_key).text
-        _    , pos = self.extract(text, "<img src='/img/icons/photos.png'", "")
+        _    , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
-        title, pos = self.extract(text, "'> ", " <", pos)
+        title, pos = text.extract(page, "'> ", " <", pos)
-        count, pos = self.extract(text, "'>", " images", pos)
+        count, pos = text.extract(page, "'>", " images", pos)
-        url  , pos = self.extract(text, "<a href='http://www.imagebam.com", "'", pos)
+        url  , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
        return {
            "category": info["category"],
            "key": gallery_key,
@ -77,5 +76,5 @@ class ImagebamExtractor(AsynchronousExtractor):
        self.num += 1
        data = self.metadata.copy()
        data["num"] = self.num
-        data["name"] = filename_from_url(url)
+        data["name"] = text.filename_from_url(url)
        return data
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@ -9,6 +9,7 @@
 """Extract images from galleries at http://imgbox.com/"""
 from .common import AsynchronousExtractor, Message
 from .. import text
 import re
 info = {
@ -25,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):
    url_base = "http://imgbox.com"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        AsynchronousExtractor.__init__(self, config)
+        AsynchronousExtractor.__init__(self)
        self.key = match.group(1)
        self.metadata = {}
@ -36,8 +37,8 @@ class ImgboxExtractor(AsynchronousExtractor):
        yield Message.Version, 1
        yield Message.Directory, self.metadata
        for match in re.finditer(r'<a href="([^"]+)"><img alt="', page):
-            text = self.request(self.url_base + match.group(1)).text
+            imgpage = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)
    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
@ -51,16 +52,16 @@ class ImgboxExtractor(AsynchronousExtractor):
            "count": match.group(4),
        }
-    def get_file_metadata(self, text):
+    def get_file_metadata(self, page):
        """Collect metadata for a downloadable file"""
        data = self.metadata.copy()
-        data["num"]      , pos = self.extract(text, '</a> &nbsp; ', ' of ')
+        data["num"]      , pos = text.extract(page, '</a> &nbsp; ', ' of ')
-        data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos)
+        data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
-        data["name"]     , pos = self.extract(text, ' title="', '"', pos)
+        data["name"]     , pos = text.extract(page, ' title="', '"', pos)
        return data
-    def get_file_url(self, text):
+    def get_file_url(self, page):
        """Extract download-url"""
        base = "http://i.imgbox.com/"
-        path, _ = self.extract(text, base, '"')
+        path, _ = text.extract(page, base, '"')
        return base + path
--- a/gallery_dl/extractor/imgchili.py
+++ b/gallery_dl/extractor/imgchili.py
@ -8,9 +8,8 @@
 """Extract images from albums at http://imgchili.net/"""
-from .common import SequentialExtractor
+from .common import SequentialExtractor, Message
-from .common import Message
+from .. import text
 from .common import filename_from_url
 import re
 info = {
@ -25,8 +24,8 @@ info = {
 class ImgchiliExtractor(SequentialExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        SequentialExtractor.__init__(self, config)
+        SequentialExtractor.__init__(self)
        self.match = match
        self.num = 0
@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):
    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
-        title = self.extract(page, "<h1>", "</h1>")[0]
+        title = text.extract(page, "<h1>", "</h1>")[0]
        return {
            "category": info["category"],
            "title": title,
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@ -8,10 +8,8 @@
 """Extract manga pages from http://www.mangareader.net/"""
-from .common import AsynchronousExtractor
+from .common import AsynchronousExtractor, Message
-from .common import Message
+from .. import text
 from .common import unescape, filename_from_url
 from urllib.parse import unquote
 import os.path
 import re
@ -30,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
    url_base = "http://www.mangareader.net"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        AsynchronousExtractor.__init__(self, config)
+        AsynchronousExtractor.__init__(self)
        self.part = match.group(1)
    def items(self):
@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
    def get_page_metadata(self, page_url):
        """Collect next url, image-url and metadata for one manga-page"""
        page = self.request(page_url).text
-        extr = self.extract
+        extr = text.extract
        width = None
        descr, pos = extr(page, '<meta name="description" content="', '"')
        test , pos = extr(page, "document['pu']", '', pos)
@ -62,13 +60,13 @@ class MangaReaderExtractor(AsynchronousExtractor):
            width , pos = extr(page, '<img id="img" width="', '"', pos)
            height, pos = extr(page, ' height="', '"', pos)
        image, pos = extr(page, ' src="', '"', pos)
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
        name, ext = os.path.splitext(filename)
        match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr)
        return self.url_base + url, image, {
            "category": info["category"],
-            "manga": unescape(match.group(1)),
+            "manga": text.unescape(match.group(1)),
            "chapter": match.group(2),
            "page": match.group(3),
            "width": width,
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@ -8,9 +8,8 @@
 """Extract images from https://nijie.info/"""
-from .common import AsynchronousExtractor
+from .common import AsynchronousExtractor, Message
-from .common import Message
+from .. import config, text
 from .common import filename_from_url
 import re
 info = {
@ -27,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):
    popup_url = "https://nijie.info/view_popup.php?id="
-    def __init__(self, match, config):
+    def __init__(self, match):
-        AsynchronousExtractor.__init__(self, config)
+        AsynchronousExtractor.__init__(self)
        self.artist_id = match.group(1)
        self.artist_url = (
            "https://nijie.info/members_illust.php?id="
@ -37,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
        self.session.headers["Referer"] = self.artist_url
        self.session.cookies["R18"] = "1"
        self.session.cookies["nijie_referer"] = "nijie.info"
-        self.session.cookies.update(config["nijie-cookies"])
+        self.session.cookies.update(
            config.get(("extractor", info["category"], "cookies"))
        )
    def items(self):
        data = self.get_job_metadata()
@ -56,19 +57,20 @@ class NijieExtractor(AsynchronousExtractor):
        }
    def get_image_ids(self):
-        text = self.request(self.artist_url).text
+        """Collect all image-ids for a specific artist"""
        page = self.request(self.artist_url).text
        regex = r'<a href="/view\.php\?id=(\d+)"'
-        return [m.group(1) for m in re.finditer(regex, text)]
+        return [m.group(1) for m in re.finditer(regex, page)]
    def get_image_data(self, image_id):
        """Get URL and metadata for images specified by 'image_id'"""
-        text = self.request(self.popup_url + image_id).text
+        page = self.request(self.popup_url + image_id).text
-        matches = re.findall('<img src="([^"]+)"', text)
+        matches = re.findall('<img src="([^"]+)"', page)
        for index, url in enumerate(matches):
            yield "https:" + url, {
                "count": len(matches),
                "index": index,
                "image-id": image_id,
-                "name" : filename_from_url(url),
+                "name" : text.filename_from_url(url),
                "extension": url[url.rfind(".")+1:],
            }
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@ -8,8 +8,8 @@
 """Extract images and ugoira from http://www.pixiv.net/"""
-from .common import SequentialExtractor
+from .common import SequentialExtractor, Message
-from .common import Message
+from .. import config, text
 import re
 import json
@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        SequentialExtractor.__init__(self, config)
+        SequentialExtractor.__init__(self)
        self.config = config
        self.artist_id = match.group(1)
        self.api = PixivAPI(self.session)
    def items(self):
        self.api.login(
-            self.config.get("pixiv", "username"),
+            config.get(("extractor", "pixiv", "username")),
-            self.config.get("pixiv", "password"),
+            config.get(("extractor", "pixiv", "password")),
        )
        metadata = self.get_job_metadata()
@ -84,9 +83,9 @@ class PixivExtractor(SequentialExtractor):
    def get_works(self):
        """Yield all work-items for a pixiv-member"""
-        page = 1
+        pagenum = 1
        while True:
-            data = self.api.user_works(self.artist_id, page)
+            data = self.api.user_works(self.artist_id, pagenum)
            for work in data["response"]:
                url = work["image_urls"]["large"]
                work["num"] = ""
@ -96,17 +95,17 @@ class PixivExtractor(SequentialExtractor):
            pinfo = data["pagination"]
            if pinfo["current"] == pinfo["pages"]:
                return
-            page = pinfo["next"]
+            pagenum = pinfo["next"]
    def parse_ugoira(self, data):
        """Parse ugoira data"""
        # get illust page
-        text = self.request(
+        page = self.request(
            self.illust_url, params={"illust_id": data["id"]},
        ).text
        # parse page
-        frames, _ = self.extract(text, ',"frames":[', ']')
+        frames, _ = text.extract(page, ',"frames":[', ']')
        # build url
        url = re.sub(
@ -146,7 +145,7 @@ class PixivAPI():
        self.session = session
        self.session.headers.update({
            "Referer": "http://www.pixiv.net/",
-            "User-Agent": "PixivIOSApp/5.1.1",
+            "User-Agent": "PixivIOSApp/5.8.0",
            # "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU",
        })
--- a/gallery_dl/extractor/redhawkscans.py
+++ b/gallery_dl/extractor/redhawkscans.py
@ -8,9 +8,8 @@
 """Extract manga pages from http://manga.redhawkscans.com/"""
-from .common import SequentialExtractor
+from .common import SequentialExtractor, Message
-from .common import Message
+from .. import text
 from .common import unescape
 import os.path
 import json
 import re
@ -29,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):
    url_base = "https://manga.redhawkscans.com/reader/read/"
-    def __init__(self, match, config):
+    def __init__(self, match):
-        SequentialExtractor.__init__(self, config)
+        SequentialExtractor.__init__(self)
        self.part = match.group(1)
    def items(self):
@ -50,16 +49,16 @@ class RedHawkScansExtractor(SequentialExtractor):
        response = self.request(self.url_base + self.part)
        response.encoding = "utf-8"
        page = response.text
-        _        , pos = self.extract(page, '<h1 class="tbtitle dnone">', '')
+        _        , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
-        manga    , pos = self.extract(page, 'title="', '"', pos)
+        manga    , pos = text.extract(page, 'title="', '"', pos)
-        chapter  , pos = self.extract(page, '">', '</a>', pos)
+        chapter  , pos = text.extract(page, '">', '</a>', pos)
-        json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos)
+        json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
        match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
        return {
            "category": info["category"],
-            "manga": unescape(manga),
+            "manga": text.unescape(manga),
            "chapter": match.group(2) or match.group(1),
            "chapter-minor": match.group(3) or "",
            "language": "English",
-            "title": unescape(match.group(4) or ""),
+            "title": text.unescape(match.group(4) or ""),
        }, json.loads(json_data)
--- a/gallery_dl/extractor/yandere.py
+++ b/gallery_dl/extractor/yandere.py
@ -22,6 +22,6 @@ info = {
 class YandereExtractor(JSONBooruExtractor):
-    def __init__(self, match, config):
+    def __init__(self, match):
-        JSONBooruExtractor.__init__(self, match, config, info)
+        JSONBooruExtractor.__init__(self, match, info)
        self.api_url = "https://yande.re/post.json"
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@ -0,0 +1,60 @@
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Collection of functions that work in strings/text"""
 import re
 import html.parser
 import urllib.parse
 import platform
 def remove_html(text):
    """Replace html-tags in 'text' by spaces and normalize all whitespace"""
    # every tag becomes a space so adjacent words do not get glued together
    without_tags = re.sub("<[^>]+?>", " ", text)
    # split() without arguments drops leading/trailing/repeated whitespace
    words = without_tags.split()
    return " ".join(words)
 def filename_from_url(url):
    """Extract the last part of an url to use as a filename

    Only the path component of 'url' is considered, so a query string or
    fragment never ends up in the result. The unmodified 'url' is returned
    if it cannot be parsed or if its path component is empty.
    """
    try:
        path = urllib.parse.urlparse(url).path
    except ValueError:
        # urlparse() rejects some malformed urls (e.g. invalid IPv6 hosts)
        return url
    if "/" not in path:
        # slash-less input such as "name.ext?a=b": the parsed path already
        # is the filename; use the whole url only if there is no path at all
        return path or url
    return path.rpartition("/")[2]
 def clean_path_windows(path):
    """Replace characters that are illegal in a Windows path-segment by '_'"""
    illegal = re.compile(r'[<>:"\\/|?*]')
    return illegal.sub("_", path)
 def clean_path_posix(path):
    """Replace the path separator '/' in a path-segment by '_' (Posix)"""
    pieces = path.split("/")
    return "_".join(pieces)
 def extract(txt, begin, end, pos=0):
    """Return the text between 'begin' and 'end' and the index after 'end'

    The search starts at index 'pos' of 'txt'. If one of the two
    delimiters cannot be found, the result is (None, pos).
    """
    start = txt.find(begin, pos)
    if start < 0:
        return None, pos
    # skip the opening delimiter itself
    start += len(begin)
    stop = txt.find(end, start)
    if stop < 0:
        return None, pos
    return txt[start:stop], stop + len(end)
 def extract_all(txt, begin, end, pos=0):
    """Return the text from 'begin' up to and including 'end'

    Both delimiters are part of the returned string. The second element
    of the result is the index right after 'end'. The search starts at
    index 'pos' of 'txt'; if one of the two delimiters cannot be found,
    the result is (None, pos).
    """
    start = txt.find(begin, pos)
    if start < 0:
        return None, pos
    # 'end' must occur after the complete opening delimiter
    stop = txt.find(end, start + len(begin))
    if stop < 0:
        return None, pos
    stop += len(end)
    return txt[start:stop], stop
 # select the path-cleaning function matching the current platform
 if platform.system() == "Windows":
    clean_path = clean_path_windows
 else:
    clean_path = clean_path_posix
 # module-level aliases for stdlib helpers
 unquote = urllib.parse.unquote
 # HTMLParser.unescape() is deprecated and was removed in Python 3.9;
 # html.unescape() is its replacement. The old method is kept as a
 # fallback for interpreters that do not provide html.unescape() yet.
 try:
    unescape = html.unescape
 except AttributeError:
    unescape = html.parser.HTMLParser().unescape
--- a/setup.py
+++ b/setup.py
@ -46,4 +46,5 @@ setup(
        "Topic :: Multimedia",
        "Topic :: Multimedia :: Graphics",
    ],
    test_suite='test',
 )
--- a/test/init.py
+++ b/test/init.py
--- a/test/test_config.py
+++ b/test/test_config.py
@ -0,0 +1,49 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 import unittest
 import gallery_dl.config as config
 import os
 import tempfile
 class TestConfig(unittest.TestCase):
    """Tests for reading, writing and interpolating configuration values"""
    def setUp(self):
        """Write a small JSON document to a temporary file and load it"""
        handle, self._configfile = tempfile.mkstemp()
        with os.fdopen(handle, "w") as configfile:
            configfile.write('{"a": "1", "b": {"c": "text"}}')
        config.load(self._configfile)
    def tearDown(self):
        """Reset the configuration and delete the temporary file"""
        config.clear()
        os.remove(self._configfile)
    def test_get(self):
        """get() is expected to return stored values or the given default"""
        stored = (
            (["a"], "1"),
            (["b", "c"], "text"),
            (["d"], None),
        )
        for keys, value in stored:
            self.assertEqual(config.get(keys), value)
        self.assertEqual(config.get(["e", "f", "g"], 123), 123)
    def test_set(self):
        """set() is expected to overwrite values and create new key-paths"""
        config.set(["b", "c"], [1, 2, 3])
        config.set(["e", "f", "g"], value=234)
        updated = (
            (["b", "c"], [1, 2, 3]),
            (["e", "f", "g"], 234),
        )
        for keys, value in updated:
            self.assertEqual(config.get(keys), value)
    def test_interpolate(self):
        """Check interpolate() results before and after setting ["d"]"""
        initial = (
            ((["a"],), "1"),
            ((["b", "a"],), "1"),
            ((["b", "c"], "2"), "text"),
            ((["b", "d"], "2"), "2"),
        )
        for args, value in initial:
            self.assertEqual(config.interpolate(*args), value)
        # once a top-level "d" exists, both lookups are expected to find it
        config.set(["d"], 123)
        for keys in (["b", "d"], ["d", "d"]):
            self.assertEqual(config.interpolate(keys, "2"), 123)
 # run the tests of this module when it is executed as a script
 if __name__ == '__main__':
    unittest.main()
--- a/test/test_text.py
+++ b/test/test_text.py
@ -0,0 +1,51 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 import unittest
 import gallery_dl.text as text
 class TestText(unittest.TestCase):
    """Tests for the string helper functions of gallery_dl.text"""
    def test_remove_html(self):
        """All inputs are expected to reduce to the same plain sentence"""
        expected = "Hello World."
        inputs = [
            "Hello World.",
            " Hello  World. ",
            "Hello<br/>World.",
            "<div><span class='a'>Hello</span><strong>World.</strong></div>",
        ]
        for markup in inputs:
            self.assertEqual(text.remove_html(markup), expected)
    def test_filename_from_url(self):
        """All url variants are expected to yield the same filename"""
        expected = "filename.ext"
        urls = [
            "http://example.org/v2/filename.ext",
            "http://example.org/v2/filename.ext?param=value#fragment",
            "example.org/filename.ext",
            "/filename.ext",
            "filename.ext",
        ]
        for url in urls:
            self.assertEqual(text.filename_from_url(url), expected)
    def test_clean_path(self):
        """Compare the Windows and Posix variants on the same inputs"""
        # (input, expected Windows result, expected Posix result)
        table = [
            ("Hello World.", "Hello World.", "Hello World."),
            ("Hello/World/.", "Hello_World_.", "Hello_World_."),
            (
                r'<Hello>:|"World\*?',
                '_Hello____World___',
                r'<Hello>:|"World\*?',
            ),
        ]
        for segment, windows, posix in table:
            self.assertEqual(text.clean_path_windows(segment), windows)
            self.assertEqual(text.clean_path_posix(segment), posix)
 # run the tests of this module when it is executed as a script
 if __name__ == '__main__':
    unittest.main()
		`@ -1,3 +0,0 @@`
			`[console_scripts]`
			`gallery-dl = gallery_dl:main`