gallery-dl/gallery_dl/extractor/__init__.py

# -*- coding: utf-8 -*-

# Copyright 2015-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

import re
import importlib

# Names of the extractor submodules of this package.
#
# _list_patterns() imports them lazily, one after another and in exactly
# this order, and find() returns the first extractor whose pattern matches.
# The position of a name in this list therefore decides which extractor
# wins if patterns from different modules match the same URL.
modules = [
    "pixiv",
    "2chan",
    "3dbooru",
    "4chan",
    "4plebs",
    "8chan",
    "archivedmoe",
    "archiveofsins",
    "b4k",
    "batoto",
    "danbooru",
    "desuarchive",
    "deviantart",
    "dokireader",
    "dynastyscans",
    "e621",
    "exhentai",
    "fallenangels",
    "fireden",
    "flickr",
    "gelbooru",
    "gfycat",
    "gomanga",
    "hbrowse",
    "hentai2read",
    "hentaifoundry",
    "hentaihere",
    "hitomi",
    "imagebam",
    "imagefap",
    "imgbox",
    "imgchili",
    "imgth",
    "imgur",
    "jaiminisbox",
    "khinsider",
    "kireicake",
    "kissmanga",
    "konachan",
    "loveisover",
    "luscious",
    "mangafox",
    "mangahere",
    "mangapanda",
    "mangapark",
    "mangareader",
    "mangastream",
    "mangazuki",
    "nhentai",
    "nijie",
    "nyafuu",
    "pawoo",
    "pinterest",
    "powermanga",
    "readcomiconline",
    "rebeccablacktech",
    "reddit",
    "rule34",
    "safebooru",
    "sankaku",
    "seaotterscans",
    "seiga",
    "senmanga",
    "sensescans",
    "spectrumnexus",
    "thebarchive",
    "tumblr",
    "twitter",
    "warosu",
    "whatisthisimnotgoodwithcomputers",
    "worldthree",
    "yandere",
    "yeet",
    # NOTE(review): the remaining entries are not in alphabetical order and
    # look like generic/utility modules kept last on purpose, so that
    # site-specific patterns are tried first -- confirm before reordering
    "imagehosts",
    "directlink",
    "recursive",
    "oauth",
    "test",
]


def find(url):
    """Return an extractor instance for 'url' or None if nothing matches"""
    for regex, cls in _list_patterns():
        result = regex.match(url)
        if not result:
            continue
        # matching extractors of a blacklisted category are passed over
        if cls.category in _blacklist:
            continue
        # first hit wins; the class is instantiated with its match object
        return cls(result)
    return None


def extractors():
    """Return a list of all available extractor classes, sorted by name"""
    # a class shows up once per pattern it defines; the set removes repeats
    unique = {cls for _, cls in _list_patterns()}
    return sorted(unique, key=lambda cls: cls.__name__)


class blacklist():
    """Context Manager to blacklist extractor modules

    While the context is active, the given categories are part of the
    module-level '_blacklist' that find() consults.  Leaving the context
    removes only the entries that this context added, so an enclosing
    blacklist context that is still active keeps its own categories.
    """
    def __init__(self, categories):
        # iterable of category names to block
        self.categories = categories
        # one list of added names per active __enter__ call
        self._added = []

    def __enter__(self):
        # materialize once: 'categories' may be a one-shot iterable, and
        # __exit__ has to know exactly which names were added here
        added = list(self.categories)
        self._added.append(added)
        _blacklist.extend(added)

    def __exit__(self, etype, value, traceback):
        added = self._added.pop() if self._added else ()
        for category in added:
            try:
                # removes a single occurrence, so a duplicate contributed
                # by another active context stays in place
                _blacklist.remove(category)
            except ValueError:
                # the entry is already gone (e.g. the list was cleared
                # externally); nothing left to undo for this name
                pass

# --------------------------------------------------------------------
# internals

# (compiled pattern, extractor class) tuples of every module that
# _list_patterns() has imported so far
_cache = []
# categories that find() currently skips; filled and emptied by the
# 'blacklist' context manager
_blacklist = []
# single shared iterator over 'modules': a name consumed from it is not
# handed out again, and its patterns are served from _cache afterwards
_module_iter = iter(modules)


def _list_patterns():
    """Generate (compiled pattern, extractor class) tuples of all modules"""
    # first replay whatever earlier calls have already discovered
    for entry in _cache:
        yield entry

    # then continue importing where the shared module iterator stopped
    for name in _module_iter:
        mod = importlib.import_module("." + name, __package__)

        found = []
        for cls in _get_classes(mod):
            for regex in cls.pattern:
                found.append((re.compile(regex), cls))

        # store the results before handing any of them out: a consumer may
        # stop iterating at the first hit, and the module is not visited
        # a second time
        _cache.extend(found)
        for entry in found:
            yield entry


def _get_classes(module):
    """Return a list of all extractor classes in a module"""
    return [
        klass for klass in module.__dict__.values() if (
            hasattr(klass, "pattern") and klass.__module__ == module.__name__
        )
    ]
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`# -- coding: utf-8 --`
initial commit 2014-10-12 21:56:44 +02:00
[seaotterscans] add extractor 2017-04-07 13:20:35 +02:00			`# Copyright 2015-2017 Mike Fährmann`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`import re`
			`import importlib`

			`modules = [`
			`"pixiv",`
[2chan] add thread extractor 2017-07-14 08:44:31 +02:00			`"2chan",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"3dbooru",`
			`"4chan",`
[4plebs] add thread extractor (#18) 2017-07-03 16:43:04 +02:00			`"4plebs",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"8chan",`
[archivedmoe] add thread extractor 2017-07-14 13:25:53 +02:00			`"archivedmoe",`
[archiveofsins] add thread extractor 2017-07-15 13:23:04 +02:00			`"archiveofsins",`
[foolfuuka] add support for more sites (#18) - https://arch.b4k.co - https://archive.whatisthisimnotgoodwithcomputers.com - https://archive.yeet.net Notes: - The name "whatisthisimnotgoodwithcomputers" is way too long ... - archive.yeet.net is out of date and also blocked by 4chan servers - newest threads are 2 weeks old - using "https://archive.yeet.net" as Referer header results in "403 Forbidden" when accessing 4chan 2017-09-16 21:11:44 +02:00			`"b4k",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"batoto",`
			`"danbooru",`
[desuarchive] add thread extractor 2017-07-11 17:14:50 +02:00			`"desuarchive",`
[deviantart] add to extractor list 2015-10-05 20:29:48 +02:00			`"deviantart",`
[dokireader] add chapter extractor 2016-09-26 21:58:18 +02:00			`"dokireader",`
[dynastyscans] add chapter extractor 2016-09-22 17:20:57 +02:00			`"dynastyscans",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"e621",`
[doujinmode] add extractor 2016-08-04 18:08:48 +02:00			`"exhentai",`
[fallenangels] add chapter extractor 2017-02-06 20:05:58 +01:00			`"fallenangels",`
[fireden] add thread extractor 2017-07-15 14:51:58 +02:00			`"fireden",`
[flickr] add image extractor 2017-05-30 17:43:02 +02:00			`"flickr",`
[doujinmode] add extractor 2016-08-04 18:08:48 +02:00			`"gelbooru",`
[gfycat] add image extractor 2017-05-28 17:09:54 +02:00			`"gfycat",`
[gomanga] add chapter extractor 2017-01-10 00:05:08 +01:00			`"gomanga",`
[hbrowse] add extractor 2015-11-15 01:30:26 +01:00			`"hbrowse",`
[hentai2read] add extractor 2016-02-19 15:24:49 +01:00			`"hentai2read",`
[hentaifoundry] add extractor 2015-11-14 03:19:44 +01:00			`"hentaifoundry",`
[hentaihere] add manga- and chapter-extractors 2016-10-05 09:20:03 +02:00			`"hentaihere",`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00			`"hitomi",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"imagebam",`
[imagefap] add extractor 2016-08-09 14:05:12 +02:00			`"imagefap",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"imgbox",`
			`"imgchili",`
add extractor 'imgth' 2015-10-28 23:26:47 +01:00			`"imgth",`
[imgur] add to list of extractors 2015-10-12 22:34:45 +02:00			`"imgur",`
[jaiminisbox] add extractor 2016-12-29 16:41:08 +01:00			`"jaiminisbox",`
[khinsider] add extractor 2016-04-20 08:34:44 +02:00			`"khinsider",`
[kireicake] add extractor 2017-04-07 11:41:48 +02:00			`"kireicake",`
[kissmanga] re-enable module 2017-04-05 12:16:23 +02:00			`"kissmanga",`
[konachan] add extractor 2015-11-06 13:24:43 +01:00			`"konachan",`
[loveisover] add thread extractor 2017-07-14 11:17:47 +02:00			`"loveisover",`
[luscious] add extractor 2016-08-01 15:36:56 +02:00			`"luscious",`
[mangafox] add chapter extractor 2017-01-14 19:39:21 +01:00			`"mangafox",`
[mangahere] add chapter-extractor 2015-11-26 03:06:08 +01:00			`"mangahere",`
[mangapanda] add extractor 2015-11-08 00:02:37 +01:00			`"mangapanda",`
[mangapark] add chapter extractor 2015-12-08 22:29:34 +01:00			`"mangapark",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"mangareader",`
[mangastream] add extractor 2015-11-08 00:03:14 +01:00			`"mangastream",`
[mangazuki] add chapter extractor 2017-07-19 17:20:03 +02:00			`"mangazuki",`
add extractor 'nhentai' 2015-10-28 12:08:27 +01:00			`"nhentai",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"nijie",`
[nyafuu] add thread extractor (#18) 2017-07-08 17:16:41 +02:00			`"nyafuu",`
[pawoo] add extractors for accounts and statuses https://pawoo.net is a Mastodon[1] instance hosted by Pixiv [1] https://github.com/tootsuite/mastodon 2017-04-19 10:17:43 +02:00			`"pawoo",`
[pinterest] add pin extractor 2016-09-02 19:11:16 +02:00			`"pinterest",`
add extractor 'powermanga' 2015-10-08 20:43:52 +02:00			`"powermanga",`
[readcomiconline] add comic-issue and comic extractor 2016-11-14 18:29:45 +01:00			`"readcomiconline",`
[rebeccablacktech] add thread extractor 2017-07-23 15:33:55 +02:00			`"rebeccablacktech",`
[reddit] add subreddit- and submission-extractor - these extractors scan submissions and their comments for (external) URLs and defer them to other extractors - (#15) 2017-05-23 09:38:50 +02:00			`"reddit",`
[rule34] add tag- and image-extractor 2016-09-17 18:12:37 +02:00			`"rule34",`
[safebooru] add extractor 2015-11-06 13:52:40 +01:00			`"safebooru",`
[sankaku] re-enable extractor 2015-11-09 02:29:33 +01:00			`"sankaku",`
[seaotterscans] add extractor 2017-04-07 13:20:35 +02:00			`"seaotterscans",`
[seiga] add extractor 2016-08-09 16:36:30 +02:00			`"seiga",`
[senmanga] add chapter extractor 2016-08-02 17:42:22 +02:00			`"senmanga",`
[sensescans] add chapter extractor 2016-10-25 15:25:25 +02:00			`"sensescans",`
[spectrumnexus] add extractor 2015-11-13 00:21:50 +01:00			`"spectrumnexus",`
[thebarchive] add thread extractor 2017-07-23 15:45:17 +02:00			`"thebarchive",`
[tumblr] add extractor 2016-02-20 11:29:10 +01:00			`"tumblr",`
[twitter] add extractor 2016-10-06 19:12:07 +02:00			`"twitter",`
[warosu] add thread extractor 2017-08-18 19:52:58 +02:00			`"warosu",`
[foolfuuka] add support for more sites (#18) - https://arch.b4k.co - https://archive.whatisthisimnotgoodwithcomputers.com - https://archive.yeet.net Notes: - The name "whatisthisimnotgoodwithcomputers" is way too long ... - archive.yeet.net is out of date and also blocked by 4chan servers - newest threads are 2 weeks old - using "https://archive.yeet.net" as Referer header results in "403 Forbidden" when accessing 4chan 2017-09-16 21:11:44 +02:00			`"whatisthisimnotgoodwithcomputers",`
[worldthree] add chapter extractor 2016-10-26 23:10:41 +02:00			`"worldthree",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`"yandere",`
[foolfuuka] add support for more sites (#18) - https://arch.b4k.co - https://archive.whatisthisimnotgoodwithcomputers.com - https://archive.yeet.net Notes: - The name "whatisthisimnotgoodwithcomputers" is way too long ... - archive.yeet.net is out of date and also blocked by 4chan servers - newest threads are 2 weeks old - using "https://archive.yeet.net" as Referer header results in "403 Forbidden" when accessing 4chan 2017-09-16 21:11:44 +02:00			`"yeet",`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`"imagehosts",`
support direct image links 2017-05-24 12:51:18 +02:00			`"directlink",`
rename 'generic' to 'recursive' 2016-10-01 15:54:27 +02:00			`"recursive",`
[oauth] add the 'extractor.oauth.browser' option enables/disables the use of webbrowser.open() during OAuth authorization 2017-06-20 16:06:14 +02:00			`"oauth",`
add utility extractor that runs test-URLs 2016-12-10 00:01:00 +01:00			`"test",`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`]`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
adjust loader to new config-interface 2015-10-05 17:52:50 +02:00			`def find(url):`
testing environment for extractor results 2015-12-12 15:58:07 +01:00			`"""Find suitable extractor for the given url"""`
remove 'info' parameter 2015-11-21 03:12:36 +01:00			`for pattern, klass in _list_patterns():`
precompile regular expressions 2016-08-23 16:36:39 +02:00			`match = pattern.match(url)`
implement context-manager to blacklist extractors 2017-05-24 12:32:44 +02:00			`if match and klass.category not in _blacklist:`
restructure info-parameters 2015-11-21 00:30:31 +01:00			`return klass(match)`
			`return None`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
testing environment for extractor results 2015-12-12 15:58:07 +01:00			`def extractors():`
			`"""Yield all available extractor classes"""`
ensure extractors() returns each extractor only once 2015-12-13 04:34:15 +01:00			`return sorted(`
			`set(klass for _, klass in _list_patterns()),`
			`key=lambda x: x.__name__`
			`)`
testing environment for extractor results 2015-12-12 15:58:07 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
implement context-manager to blacklist extractors 2017-05-24 12:32:44 +02:00			`class blacklist():`
			`"""Context Manager to blacklist extractor modules"""`
smaller fixes and "security" measures - move the OAuthSession class into util.py - block special extractors for reddit and recursive - ignore 'only matching' tests for testresults script 2017-06-16 21:01:40 +02:00			`def __init__(self, categories):`
implement context-manager to blacklist extractors 2017-05-24 12:32:44 +02:00			`self.categories = categories`

			`def __enter__(self):`
			`_blacklist.extend(self.categories)`

			`def __exit__(self, etype, value, traceback):`
			`_blacklist.clear()`


move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`# --------------------------------------------------------------------`
			`# internals`

			`_cache = []`
implement context-manager to blacklist extractors 2017-05-24 12:32:44 +02:00			`_blacklist = []`
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`_module_iter = iter(modules)`

code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`def _list_patterns():`
bugfix: add patterns to cache before returning them 2015-11-27 01:42:40 +01:00			`"""Yield all available (pattern, class) tuples"""`
precompile regular expressions 2016-08-23 16:36:39 +02:00			`yield from _cache`
enable user-specified patterns 2015-10-05 18:10:18 +02:00
move and rework extractor-loading code 2015-06-28 22:53:52 +02:00			`for module_name in _module_iter:`
			`module = importlib.import_module("."+module_name, __package__)`
bugfix: add patterns to cache before returning them 2015-11-27 01:42:40 +01:00			`tuples = [`
precompile regular expressions 2016-08-23 16:36:39 +02:00			`(re.compile(pattern), klass)`
bugfix: add patterns to cache before returning them 2015-11-27 01:42:40 +01:00			`for klass in _get_classes(module)`
			`for pattern in klass.pattern`
			`]`
			`_cache.extend(tuples)`
precompile regular expressions 2016-08-23 16:36:39 +02:00			`yield from tuples`
allow multiple extractors per module 2015-11-20 19:54:07 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
allow multiple extractors per module 2015-11-20 19:54:07 +01:00			`def _get_classes(module):`
			`"""Return a list of all extractor classes in a module"""`
			`return [`
			`klass for klass in module.__dict__.values() if (`
restructure info-parameters 2015-11-21 00:30:31 +01:00			`hasattr(klass, "pattern") and klass.__module__ == module.__name__`
allow multiple extractors per module 2015-11-20 19:54:07 +01:00			`)`
			`]`