move and rework extractor-loading code

2024-11-25 12:12:34 +01:00 · 2015-06-28 22:53:52 +02:00 · 2015-06-28 22:53:52 +02:00 · 26bb9d62de
commit 26bb9d62de
parent af3b44762c
2 changed files with 55 additions and 66 deletions
--- a/gallery_dl/download.py
+++ b/gallery_dl/download.py
@ -8,9 +8,8 @@
 import os
 import sys
 import re
 import importlib
-
+from . import extractor
 from .extractor.common import Message
 class DownloadManager():
@ -19,7 +18,6 @@ class DownloadManager():
        self.opts = opts
        self.config = config
        self.modules = {}
        self.extractors = ExtractorFinder(config)
    def add(self, url):
        job = DownloadJob(self, url)
@ -45,7 +43,7 @@ class DownloadJob():
    def __init__(self, mngr, url):
        self.mngr = mngr
-        self.extractor, self.info = mngr.extractors.get_for_url(url)
+        self.extractor, self.info = extractor.find(url, mngr.config)
        if self.extractor is None:
            return
        self.directory = mngr.get_base_directory()
@ -140,65 +138,3 @@ class DownloadJob():
        if tries == 0:
            print("\r", end="")
        print("\r\033[1;32m", path, "\033[0m", sep="")
 class ExtractorFinder():
     """Locate and instantiate the extractor responsible for a URL.

     Candidate patterns come from two places, checked in this order:
     options whose name starts with "regex" in the configuration, then
     the "pattern" list of the info-dictionary found in each file of the
     "extractor" directory next to this module.
     """

     def __init__(self, config):
         # Mapping of category name -> mapping of option name -> value;
         # it is also passed on unchanged to the extractor constructor.
         self.config = config

     def get_for_url(self, url):
         """Get an extractor-instance suitable for 'url'

         Returns an (extractor, info) tuple, where 'info' is the
         info-dictionary of the imported extractor module.  If no pattern
         matches, a notice is printed and (None, None) is returned.
         """
         name, match = self.find_pattern_match(url)
         if not match:
             print("no suitable extractor found")
             return None, None
         module = importlib.import_module(".extractor." + name, __package__)
         # The info-dictionary names the class to instantiate
         klass = getattr(module, module.info["extractor"])
         return klass(match, self.config), module.info

     def find_pattern_match(self, url):
         """Find a pattern, that matches 'url', and return the (category,match) tuple

         Patterns from the configuration take precedence over the ones
         declared by the extractor files.  Returns (None, None) if no
         pattern matches.
         """
         for category in self.config:
             for key, value in self.config[category].items():
                 if key.startswith("regex"):
                     match = re.match(value, url)
                     if match:
                         return category, match
         for category, info in self.extractor_metadata():
             for pattern in info["pattern"]:
                 match = re.match(pattern, url)
                 if match:
                     return category, match
         return None, None

     def extractor_metadata(self):
         """Yield all extractor-name, -metadata tuples

         The name is the directory entry without its file extension;
         entries without a readable info-dictionary are skipped.
         """
         path = os.path.join(os.path.dirname(__file__), "extractor")
         for name in os.listdir(path):
             extractor_path = os.path.join(path, name)
             info = self.get_info_dict(extractor_path)
             if info is not None:
                 yield os.path.splitext(name)[0], info

     @staticmethod
     def get_info_dict(extractor_path):
         """Get info-/metadata-dictionary for an extractor

         Scans the first 30 lines of the file for a line starting with
         "info =", collects the text up to and including the next line
         starting with "}" and evaluates it as a Python expression.

         Returns None if the path cannot be read as UTF-8 text (missing
         file, directory, binary file, ...) or if no "info =" line is
         found within the first 30 lines.
         """
         try:
             # Decode explicitly as UTF-8 (the default encoding of Python
             # source files) instead of relying on the locale's encoding.
             with open(extractor_path, encoding="utf-8") as file:
                 for _ in range(30):
                     line = next(file)
                     if line.startswith("info ="):
                         break
                 else:
                     return None
                 # Drop the leading "info =" and keep the rest of the line
                 info = [line[6:]]
                 for line in file:
                     info.append(line)
                     if line.startswith("}"):
                         break
         except (StopIteration, OSError, UnicodeDecodeError):
             # Shorter than expected, unreadable or not text at all:
             # treat the entry as "not an extractor" instead of crashing.
             return None
         # NOTE(review): eval() executes whatever expression the file
         # contains; acceptable only while these files are trusted project
         # sources.  Consider ast.literal_eval() if the info-dictionaries
         # are guaranteed to be plain literals.
         return eval("".join(info))
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -1 +1,54 @@
 # -*- coding: utf-8 -*-
 # Copyright 2015 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 import re
 import importlib
 # Names of the extractor submodules, imported relative to this package.
 # _list_patterns() walks this list front to back and find() returns the
 # first pattern that matches, so earlier entries take precedence.
 modules = [
    "pixiv",
    "exhentai",
    "gelbooru",
    "3dbooru",
    "4chan",
    "8chan",
    "batoto",
    "danbooru",
    "e621",
    "imagebam",
    "imgbox",
    "imgchili",
    "mangareader",
    "nijie",
    "redhawkscans",
    "yandere",
 ]
 def find(url, config):
     """Find extractor suitable for handling the given url

     Every known pattern is tried against the start of 'url' (re.match).
     For the first hit, an (extractor, info) tuple is returned: the
     extractor class of the matching module instantiated with the match
     object and 'config', plus that module's info-dictionary.

     Returns (None, None) if no pattern matches, so callers can always
     unpack the result into two names.
     """
     for pattern, module, klass in _list_patterns():
         match = re.match(pattern, url)
         if match:
             return klass(match, config), module.info
     return None, None
 # --------------------------------------------------------------------
 # internals
 # (pattern, module, klass) tuples collected so far by _list_patterns()
 _cache = []
 # Module-level iterator over 'modules', shared by all _list_patterns()
 # calls; each module name is handed out by it at most once
 _module_iter = iter(modules)
 def _list_patterns():
     """Yield all available (pattern, module, klass) tuples

     Tuples already stored in '_cache' are yielded first.  After that the
     remaining module names are taken from the shared '_module_iter' and
     imported one at a time, so a module is only imported if the consumer
     keeps iterating past everything that came before it.
     """
     for entry in _cache:
         yield entry
     for module_name in _module_iter:
         module = importlib.import_module("."+module_name, __package__)
         klass = getattr(module, module.info["extractor"])
         # Cache every pattern of this module before yielding the first
         # one: the module name is already consumed from '_module_iter',
         # so if the consumer stops at an early pattern, any pattern not
         # yet cached would be lost for all later calls.
         etuples = [
             (pattern, module, klass)
             for pattern in module.info["pattern"]
         ]
         _cache.extend(etuples)
         for etuple in etuples:
             yield etuple