gallery-dl/test/test_extractor.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2018-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

import sys
import unittest
import string

from gallery_dl import extractor
from gallery_dl.extractor.common import Extractor, Message
from gallery_dl.extractor.directlink import DirectlinkExtractor as DLExtractor


class FakeExtractor(Extractor):
    """Stub extractor for the registration tests in this module.

    It is not part of gallery_dl itself; the tests register it through
    extractor.add() / extractor.add_module() and then look it up with
    URIs beginning with "fake:".
    """
    category = "fake"
    subcategory = "test"
    pattern = "fake:"

    def items(self):
        """Yield a version message followed by one URL message."""
        messages = (
            (Message.Version, 1),
            (Message.Url, "text:foobar", {}),
        )
        yield from messages


class TestExtractor(unittest.TestCase):
    """Tests for the lookup/registration helpers of gallery_dl.extractor."""

    # URIs for which extractor.find() must return an Extractor instance
    VALID_URIS = (
        "https://example.org/file.jpg",
        "tumblr:foobar",
        "oauth:flickr",
        "test:pixiv:",
        "recursive:https://example.org/document.html",
    )

    # strings for which find() / from_url() must return None
    _NOT_FOUND_URIS = ("", "/tmp/file.ext")

    # non-string inputs for which find() / from_url() must raise TypeError
    _INVALID_URIS = (None, [], {}, 123, b"test:")

    def setUp(self):
        """Reset the extractor module's cache and module iterator.

        Without this, a class registered by one test (e.g. FakeExtractor
        in test_add) would still be found by the tests that assert that
        "fake:" URIs are unknown.
        """
        extractor._cache.clear()
        extractor._module_iter = iter(extractor.modules)

    def test_find(self):
        """find() returns an extractor, None, or raises TypeError"""
        for uri in self.VALID_URIS:
            self.assertIsInstance(extractor.find(uri), Extractor, uri)

        # pass the offending input as message so a failure names it
        for not_found in self._NOT_FOUND_URIS:
            self.assertIsNone(extractor.find(not_found), repr(not_found))

        for invalid in self._INVALID_URIS:
            with self.assertRaises(TypeError, msg=repr(invalid)):
                extractor.find(invalid)

    def test_add(self):
        """add() makes a single extractor class available to find()"""
        uri = "fake:foobar"
        self.assertIsNone(extractor.find(uri))

        extractor.add(FakeExtractor)
        self.assertIsInstance(extractor.find(uri), FakeExtractor)

    def test_add_module(self):
        """add_module() registers and returns a module's extractor classes"""
        uri = "fake:foobar"
        self.assertIsNone(extractor.find(uri))

        # this test module itself serves as the module to be added;
        # FakeExtractor is the only class expected to be picked up from it
        classes = extractor.add_module(sys.modules[__name__])
        self.assertEqual(len(classes), 1)
        self.assertEqual(classes[0].pattern, FakeExtractor.pattern)
        self.assertEqual(classes[0], FakeExtractor)
        self.assertIsInstance(extractor.find(uri), FakeExtractor)

    def test_blacklist(self):
        """blacklist() hides the listed names and classes from find()"""
        link_uri = "https://example.org/file.jpg"
        test_uri = "test:"
        fake_uri = "fake:"

        # baseline without any blacklist
        self.assertIsInstance(extractor.find(link_uri), DLExtractor)
        self.assertIsInstance(extractor.find(test_uri), Extractor)
        self.assertIsNone(extractor.find(fake_uri))

        # blacklist by name only
        with extractor.blacklist(["directlink"]):
            self.assertIsNone(extractor.find(link_uri))
            self.assertIsInstance(extractor.find(test_uri), Extractor)
            self.assertIsNone(extractor.find(fake_uri))

        # blacklist by class only
        with extractor.blacklist([], [DLExtractor, FakeExtractor]):
            self.assertIsNone(extractor.find(link_uri))
            self.assertIsInstance(extractor.find(test_uri), Extractor)
            self.assertIsNone(extractor.find(fake_uri))

        # blacklist by name and class combined
        with extractor.blacklist(["test"], [DLExtractor]):
            self.assertIsNone(extractor.find(link_uri))
            self.assertIsNone(extractor.find(test_uri))
            self.assertIsNone(extractor.find(fake_uri))

    def test_from_url(self):
        """from_url() returns an instance of its class, None, or raises"""
        for uri in self.VALID_URIS:
            cls = extractor.find(uri).__class__
            extr = cls.from_url(uri)
            self.assertIs(type(extr), cls, uri)
            self.assertIsInstance(extr, Extractor, uri)

        for not_found in self._NOT_FOUND_URIS:
            self.assertIsNone(
                FakeExtractor.from_url(not_found), repr(not_found))

        for invalid in self._INVALID_URIS:
            with self.assertRaises(TypeError, msg=repr(invalid)):
                FakeExtractor.from_url(invalid)

    def test_unique_pattern_matches(self):
        """Every test URL must be matched by exactly one extractor pattern"""
        # collect testcase URLs together with the extractor they belong to
        test_urls = [
            (testcase[0], extr)
            for extr in extractor.extractors()
            for testcase in extr._get_tests()
        ]

        # iterate over all testcase URLs
        for url, extr1 in test_urls:
            matches = []

            # ... and apply all regex patterns to each one
            # NOTE(review): relies on extractor._cache holding every class
            # with a compiled 'pattern' after the extractors() loop above
            # -- confirm against gallery_dl.extractor
            for extr2 in extractor._cache:

                # skip DirectlinkExtractor pattern if it isn't tested
                if extr1 != DLExtractor and extr2 == DLExtractor:
                    continue

                match = extr2.pattern.match(url)
                if match:
                    matches.append(match)

            # fail if more or less than 1 match happened
            if len(matches) > 1:
                patterns = "".join(
                    "\n- " + match.re.pattern for match in matches)
                self.fail(
                    "'{}' gets matched by more than one pattern:{}".format(
                        url, patterns))

            if not matches:
                self.fail("'{}' isn't matched by any pattern".format(url))

    def test_docstrings(self):
        """ensure docstring uniqueness"""
        # docstring -> first extractor class seen with it; one pass over
        # all extractors instead of comparing every pair of them
        seen = {}
        for extr in extractor.extractors():
            doc = extr.__doc__
            if not doc:
                continue
            first = seen.setdefault(doc, extr)
            if first is not extr:
                self.fail("duplicate docstring {!r}: {} <-> {}".format(
                    doc, first, extr))

    def test_names(self):
        """Ensure extractor classes are named CategorySubcategoryExtractor"""
        def capitalize(c):
            # "foo-bar" -> "FooBar"
            if "-" in c:
                return string.capwords(c.replace("-", " ")).replace(" ", "")
            # "foo.bar" -> "Foobar"
            return c.replace(".", "").capitalize()

        # categories whose class-name prefix differs from the category
        # string (these all start with a digit); None skips the check
        mapping = {
            "2chan"  : "futaba",
            "3dbooru": "threedeebooru",
            "4chan"  : "fourchan",
            "4plebs" : "fourplebs",
            "8chan"  : "infinitychan",
            "oauth"  : None,
        }

        for extr in extractor.extractors():
            category = mapping.get(extr.category, extr.category)
            if category:
                expected = "{}{}Extractor".format(
                    capitalize(category),
                    capitalize(extr.subcategory),
                )
                self.assertEqual(expected, extr.__name__)


# run all tests of this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`#!/usr/bin/env python3`
			`# -*- coding: utf-8 -*-`

add generalized extractors for Mastodon instances (#144) Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path. Example: { "extractor": { "mastodon": { "pawoo.net": { ... }, "mastodon.xyz": { ... }, "tabletop.social": { ... }, ... } } } Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'. An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overwritten as necessary. 2019-01-19 14:28:59 +01:00			`# Copyright 2018-2019 Mike Fährmann`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`import sys`
			`import unittest`
[test:extractor] small fixes and improvements 2018-08-15 20:39:13 +02:00			`import string`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00
allow for simplified test data structures Instead of a strict list of (URL, RESULTS)-tuples, extractor result tests can now be a single (URL, RESULTS)-tuple, if it's just one test, and "only matching" tests can now be a simple string. 2019-02-06 17:24:44 +01:00			`from gallery_dl import extractor`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`from gallery_dl.extractor.common import Extractor, Message`
			`from gallery_dl.extractor.directlink import DirectlinkExtractor as DLExtractor`


			`class FakeExtractor(Extractor):`
			`category = "fake"`
			`subcategory = "test"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = "fake:"`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00
			`def items(self):`
			`yield Message.Version, 1`
			`yield Message.Url, "text:foobar", {}`


			`class TestExtractor(unittest.TestCase):`
implement alternative constructor for extractors 2019-02-09 14:39:38 +01:00			`VALID_URIS = (`
			`"https://example.org/file.jpg",`
			`"tumblr:foobar",`
			`"oauth:flickr",`
			`"test:pixiv:",`
			`"recursive:https://example.org/document.html",`
			`)`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00
			`def setUp(self):`
			`extractor._cache.clear()`
			`extractor._module_iter = iter(extractor.modules)`

			`def test_find(self):`
implement alternative constructor for extractors 2019-02-09 14:39:38 +01:00			`for uri in self.VALID_URIS:`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`result = extractor.find(uri)`
			`self.assertIsInstance(result, Extractor, uri)`

			`for not_found in ("", "/tmp/file.ext"):`
			`self.assertIsNone(extractor.find(not_found))`

			`for invalid in (None, [], {}, 123, b"test:"):`
			`with self.assertRaises(TypeError):`
			`extractor.find(invalid)`

			`def test_add(self):`
			`uri = "fake:foobar"`
			`self.assertIsNone(extractor.find(uri))`

			`extractor.add(FakeExtractor)`
			`self.assertIsInstance(extractor.find(uri), FakeExtractor)`

			`def test_add_module(self):`
			`uri = "fake:foobar"`
			`self.assertIsNone(extractor.find(uri))`

update handling of extractor URL patterns When loading extractor classes during 'extractor.find(…)', their 'pattern' attribute will be replaced with a compiled version of itself. 2019-02-08 20:08:16 +01:00			`classes = extractor.add_module(sys.modules[__name__])`
			`self.assertEqual(len(classes), 1)`
			`self.assertEqual(classes[0].pattern, FakeExtractor.pattern)`
			`self.assertEqual(classes[0], FakeExtractor)`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`self.assertIsInstance(extractor.find(uri), FakeExtractor)`

			`def test_blacklist(self):`
			`link_uri = "https://example.org/file.jpg"`
			`test_uri = "test:"`
			`fake_uri = "fake:"`

			`self.assertIsInstance(extractor.find(link_uri), DLExtractor)`
			`self.assertIsInstance(extractor.find(test_uri), Extractor)`
			`self.assertIsNone(extractor.find(fake_uri))`

			`with extractor.blacklist(["directlink"]):`
			`self.assertIsNone(extractor.find(link_uri))`
			`self.assertIsInstance(extractor.find(test_uri), Extractor)`
			`self.assertIsNone(extractor.find(fake_uri))`

			`with extractor.blacklist([], [DLExtractor, FakeExtractor]):`
			`self.assertIsNone(extractor.find(link_uri))`
			`self.assertIsInstance(extractor.find(test_uri), Extractor)`
			`self.assertIsNone(extractor.find(fake_uri))`

			`with extractor.blacklist(["test"], [DLExtractor]):`
			`self.assertIsNone(extractor.find(link_uri))`
			`self.assertIsNone(extractor.find(test_uri))`
			`self.assertIsNone(extractor.find(fake_uri))`

implement alternative constructor for extractors 2019-02-09 14:39:38 +01:00			`def test_from_url(self):`
			`for uri in self.VALID_URIS:`
			`cls = extractor.find(uri).__class__`
			`extr = cls.from_url(uri)`
			`self.assertIs(type(extr), cls)`
			`self.assertIsInstance(extr, Extractor)`

			`for not_found in ("", "/tmp/file.ext"):`
			`self.assertIsNone(FakeExtractor.from_url(not_found))`

			`for invalid in (None, [], {}, 123, b"test:"):`
			`with self.assertRaises(TypeError):`
			`FakeExtractor.from_url(invalid)`

[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`def test_unique_pattern_matches(self):`
			`test_urls = []`

			`# collect testcase URLs`
			`for extr in extractor.extractors():`
allow for simplified test data structures Instead of a strict list of (URL, RESULTS)-tuples, extractor result tests can now be a single (URL, RESULTS)-tuple, if it's just one test, and "only matching" tests can now be a simple string. 2019-02-06 17:24:44 +01:00			`for testcase in extr._get_tests():`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`test_urls.append((testcase[0], extr))`

			`# iterate over all testcase URLs`
			`for url, extr1 in test_urls:`
			`matches = []`

			`# ... and apply all regex patterns to each one`
update handling of extractor URL patterns When loading extractor classes during 'extractor.find(…)', their 'pattern' attribute will be replaced with a compiled version of itself. 2019-02-08 20:08:16 +01:00			`for extr2 in extractor._cache:`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00
			`# skip DirectlinkExtractor pattern if it isn't tested`
			`if extr1 != DLExtractor and extr2 == DLExtractor:`
			`continue`

update handling of extractor URL patterns When loading extractor classes during 'extractor.find(…)', their 'pattern' attribute will be replaced with a compiled version of itself. 2019-02-08 20:08:16 +01:00			`match = extr2.pattern.match(url)`
[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00			`if match:`
			`matches.append(match)`

			`# fail if more or less than 1 match happened`
			`if len(matches) > 1:`
			`msg = "'{}' gets matched by more than one pattern:".format(url)`
			`for match in matches:`
			`msg += "\n- "`
			`msg += match.re.pattern`
			`self.fail(msg)`

			`if len(matches) < 1:`
			`msg = "'{}' isn't matched by any pattern".format(url)`
			`self.fail(msg)`

fix extractor docstrings 2018-04-18 18:01:43 +02:00			`def test_docstrings(self):`
			`"""ensure docstring uniqueness"""`
			`for extr1 in extractor.extractors():`
			`for extr2 in extractor.extractors():`
			`if extr1 != extr2 and extr1.__doc__ and extr2.__doc__:`
			`self.assertNotEqual(`
			`extr1.__doc__,`
			`extr2.__doc__,`
			`"{} <-> {}".format(extr1, extr2),`
			`)`

fix extractor names 2018-04-18 18:06:30 +02:00			`def test_names(self):`
			`"""Ensure extractor classes are named CategorySubcategoryExtractor"""`
[test:extractor] small fixes and improvements 2018-08-15 20:39:13 +02:00			`def capitalize(c):`
			`if "-" in c:`
			`return string.capwords(c.replace("-", " ")).replace(" ", "")`
add generalized extractors for Mastodon instances (#144) Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path. Example: { "extractor": { "mastodon": { "pawoo.net": { ... }, "mastodon.xyz": { ... }, "tabletop.social": { ... }, ... } } } Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'. An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overwritten as necessary. 2019-01-19 14:28:59 +01:00			`if "." in c:`
			`c = c.replace(".", "")`
[test:extractor] small fixes and improvements 2018-08-15 20:39:13 +02:00			`return c.capitalize()`

fix extractor names 2018-04-18 18:06:30 +02:00			`mapping = {`
			`"2chan" : "futaba",`
			`"3dbooru": "threedeebooru",`
			`"4chan" : "fourchan",`
			`"4plebs" : "fourplebs",`
			`"8chan" : "infinitychan",`
			`"oauth" : None,`
			`}`

			`for extr in extractor.extractors():`
			`category = mapping.get(extr.category, extr.category)`
			`if category:`
			`expected = "{}{}Extractor".format(`
[test:extractor] small fixes and improvements 2018-08-15 20:39:13 +02:00			`capitalize(category),`
			`capitalize(extr.subcategory),`
fix extractor names 2018-04-18 18:06:30 +02:00			`)`
			`self.assertEqual(expected, extr.__name__)`

[test] add unit tests for extractor module functions 2018-03-24 17:24:34 +01:00
			`if __name__ == "__main__":`
			`unittest.main()`