
remove 'extractor.blacklist' context manager

Mike Fährmann 2020-09-11 13:11:46 +02:00
parent c78aa17506
commit 3918b69677
7 changed files with 12 additions and 78 deletions

View File

@@ -1081,16 +1081,6 @@ Description Controls how to handle redirects to CAPTCHA pages.
 =========== =====
 
 
-extractor.recursive.blacklist
------------------------------
-=========== =====
-Type        ``list`` of ``strings``
-Default     ``["directlink", "oauth", "recursive", "test"]``
-Description A list of extractor categories which should be ignored when using
-            the ``recursive`` extractor.
-=========== =====
-
-
 extractor.reddit.comments
 -------------------------
 =========== =====

View File

@@ -119,10 +119,6 @@
         {
             "captcha": "stop"
         },
-        "recursive":
-        {
-            "blacklist": ["directlink", "oauth", "recursive", "test"]
-        },
         "reddit":
         {
             "comments": 0,

View File

@@ -140,7 +140,7 @@ def find(url):
     """Find a suitable extractor for the given URL"""
     for cls in _list_classes():
         match = cls.pattern.match(url)
-        if match and cls not in _blacklist:
+        if match:
             return cls(match)
     return None
@@ -169,26 +169,10 @@ def extractors():
     )
 
 
-class blacklist():
-    """Context Manager to blacklist extractor modules"""
-    def __init__(self, categories, extractors=None):
-        self.extractors = extractors or []
-        for cls in _list_classes():
-            if cls.category in categories:
-                self.extractors.append(cls)
-
-    def __enter__(self):
-        _blacklist.update(self.extractors)
-
-    def __exit__(self, etype, value, traceback):
-        _blacklist.clear()
-
-
 # --------------------------------------------------------------------
 # internals
 
 _cache = []
-_blacklist = set()
 _module_iter = iter(modules)
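For context, a minimal usage sketch of find() after this change; the URL and the DirectlinkExtractor class name are taken from the test file at the end of this commit, and the exact output is an assumption:

```python
from gallery_dl import extractor

# find() now simply returns an instance of the first registered extractor
# class whose pattern matches the URL, or None -- no blacklist set is consulted
ex = extractor.find("https://example.org/file.jpg")
print(type(ex).__name__ if ex else None)  # expected: "DirectlinkExtractor"
```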

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
 """Extractors for https://www.plurk.com/"""
 
 from .common import Extractor, Message
-from .. import text, extractor, exception
+from .. import text, exception
 import datetime
 import time
 import json
@@ -23,12 +23,9 @@ class PlurkExtractor(Extractor):
     def items(self):
         urls = self._urls_ex if self.config("comments", False) else self._urls
 
         yield Message.Version, 1
-        with extractor.blacklist(("plurk",)):
-            for plurk in self.plurks():
-                for url in urls(plurk):
-                    yield Message.Queue, url, plurk
+        for plurk in self.plurks():
+            for url in urls(plurk):
+                yield Message.Queue, url, plurk
 
     def plurks(self):
         """Return an iterable with all relevant 'plurk' objects"""

View File

@@ -9,7 +9,6 @@
 """Recursive extractor"""
 
 from .common import Extractor, Message
-from .. import extractor, util
 import requests
 import re
@@ -23,17 +22,12 @@
     })
 
     def items(self):
-        blist = self.config(
-            "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
-
         self.session.mount("file://", FileAdapter())
         page = self.request(self.url.partition(":")[2]).text
         del self.session.adapters["file://"]
 
         yield Message.Version, 1
-        with extractor.blacklist(blist):
-            for match in re.finditer(r"https?://[^\s\"']+", page):
-                yield Message.Queue, match.group(0), {}
+        for match in re.finditer(r"https?://[^\s\"']+", page):
+            yield Message.Queue, match.group(0), {}
 
 
 class FileAdapter(requests.adapters.BaseAdapter):
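A quick sketch of the URL-queueing behaviour this hunk keeps: the regex is taken verbatim from the diff, while the page string is a made-up example.

```python
import re

# the recursive extractor queues every http(s) URL found in the fetched page;
# the character class stops at whitespace and at both quote characters
page = '<a href="https://example.org/a.jpg">link</a> plain http://example.org/b'
urls = [m.group(0) for m in re.finditer(r"https?://[^\s\"']+", page)]
print(urls)  # ['https://example.org/a.jpg', 'http://example.org/b']
```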

View File

@@ -9,7 +9,7 @@
 """Extract images from https://www.tumblr.com/"""
 
 from .common import Extractor, Message
-from .. import text, oauth, extractor, exception
+from .. import text, oauth, exception
 from datetime import datetime, timedelta
 import re
@@ -128,12 +128,9 @@ class TumblrExtractor(Extractor):
             if self.external:  # external links
                 post["extension"] = None
-                with extractor.blacklist(("tumblr",)):
-                    for key in ("permalink_url", "url"):
-                        url = post.get(key)
-                        if url:
-                            yield Message.Queue, url, post
-                            break
+                url = post.get("permalink_url") or post.get("url")
+                if url:
+                    yield Message.Queue, url, post
 
     def posts(self):
         """Return an iterable containing all relevant posts"""

View File

@@ -75,30 +75,6 @@ class TestExtractorModule(unittest.TestCase):
         self.assertEqual(classes[0], FakeExtractor)
         self.assertIsInstance(extractor.find(uri), FakeExtractor)
 
-    def test_blacklist(self):
-        link_uri = "https://example.org/file.jpg"
-        test_uri = "test:"
-        fake_uri = "fake:"
-
-        self.assertIsInstance(extractor.find(link_uri), DirectlinkExtractor)
-        self.assertIsInstance(extractor.find(test_uri), Extractor)
-        self.assertIsNone(extractor.find(fake_uri))
-
-        with extractor.blacklist(["directlink"]):
-            self.assertIsNone(extractor.find(link_uri))
-            self.assertIsInstance(extractor.find(test_uri), Extractor)
-            self.assertIsNone(extractor.find(fake_uri))
-
-        with extractor.blacklist([], [DirectlinkExtractor, FakeExtractor]):
-            self.assertIsNone(extractor.find(link_uri))
-            self.assertIsInstance(extractor.find(test_uri), Extractor)
-            self.assertIsNone(extractor.find(fake_uri))
-
-        with extractor.blacklist(["test"], [DirectlinkExtractor]):
-            self.assertIsNone(extractor.find(link_uri))
-            self.assertIsNone(extractor.find(test_uri))
-            self.assertIsNone(extractor.find(fake_uri))
-
     def test_from_url(self):
         for uri in self.VALID_URIS:
             cls = extractor.find(uri).__class__