From 82d020804de938bb7e87bd6bbc4961757b892cd0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 11 May 2022 21:24:44 +0530 Subject: [PATCH] [extractor] Use classmethod/property where possible and refactor lazy extractors accordingly. This reduces the need to create extractor instances --- devscripts/lazy_load_template.py | 24 ++-- devscripts/make_lazy_extractors.py | 184 ++++++++++++++++------------- devscripts/make_supportedsites.py | 4 +- supportedsites.md | 2 +- yt_dlp/__init__.py | 16 ++- yt_dlp/extractor/__init__.py | 14 ++- yt_dlp/extractor/common.py | 58 ++++----- yt_dlp/extractor/drtv.py | 1 + yt_dlp/extractor/testurl.py | 43 ++----- yt_dlp/extractor/youtube.py | 8 +- yt_dlp/utils.py | 1 + 11 files changed, 188 insertions(+), 167 deletions(-) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index e4b4f5825..6d9b27742 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,30 +1,28 @@ +import importlib +import random import re -from ..utils import bug_reports_message, write_string +from ..utils import bug_reports_message, classproperty, write_string class LazyLoadMetaClass(type): def __getattr__(cls, name): - if '_real_class' not in cls.__dict__: + # "is_suitable" requires "_TESTS". 
However, they bloat the lazy_extractors + if '_real_class' not in cls.__dict__ and name not in ('is_suitable', 'get_testcases'): write_string( 'WARNING: Falling back to normal extractor since lazy extractor ' - f'{cls.__name__} does not have attribute {name}{bug_reports_message()}') - return getattr(cls._get_real_class(), name) + f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n') + return getattr(cls.real_class, name) class LazyLoadExtractor(metaclass=LazyLoadMetaClass): - _module = None - _WORKING = True - - @classmethod - def _get_real_class(cls): + @classproperty + def real_class(cls): if '_real_class' not in cls.__dict__: - mod = __import__(cls._module, fromlist=(cls.__name__,)) - cls._real_class = getattr(mod, cls.__name__) + cls._real_class = getattr(importlib.import_module(cls._module), cls.__name__) return cls._real_class def __new__(cls, *args, **kwargs): - real_cls = cls._get_real_class() - instance = real_cls.__new__(real_cls) + instance = cls.real_class.__new__(cls.real_class) instance.__init__(*args, **kwargs) return instance diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 6dc8fed90..8ddc54b9b 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -1,101 +1,125 @@ #!/usr/bin/env python3 import os +import optparse import sys from inspect import getsource sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'yt_dlp/extractor/lazy_extractors.py' -if os.path.exists(lazy_extractors_filename): - os.remove(lazy_extractors_filename) -# Block plugins from loading -plugins_dirname = 'ytdlp_plugins' -plugins_blocked_dirname = 'ytdlp_plugins_blocked' -if os.path.exists(plugins_dirname): - os.rename(plugins_dirname, plugins_blocked_dirname) - -from yt_dlp.extractor import _ALL_CLASSES -from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor - -if 
os.path.exists(plugins_blocked_dirname): - os.rename(plugins_blocked_dirname, plugins_dirname) - -with open('devscripts/lazy_load_template.py', encoding='utf-8') as f: - module_template = f.read() - -CLASS_PROPERTIES = ['ie_key', 'working', '_match_valid_url', 'suitable', '_match_id', 'get_temp_id'] -module_contents = [ - module_template, - *[getsource(getattr(InfoExtractor, k)) for k in CLASS_PROPERTIES], - '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] - -ie_template = ''' +NO_ATTR = object() +STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE'] +CLASS_METHODS = [ + 'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', +] +IE_TEMPLATE = ''' class {name}({bases}): - _module = '{module}' + _module = {module!r} ''' +with open('devscripts/lazy_load_template.py', encoding='utf-8') as f: + MODULE_TEMPLATE = f.read() -def get_base_name(base): - if base is InfoExtractor: - return 'LazyLoadExtractor' - elif base is SearchInfoExtractor: - return 'LazyLoadSearchExtractor' - else: - return base.__name__ +def main(): + parser = optparse.OptionParser(usage='%prog [OUTFILE.py]') + args = parser.parse_args()[1] or ['yt_dlp/extractor/lazy_extractors.py'] + if len(args) != 1: + parser.error('Expected only an output filename') + + lazy_extractors_filename = args[0] + if os.path.exists(lazy_extractors_filename): + os.remove(lazy_extractors_filename) + + _ALL_CLASSES = get_all_ies() # Must be before import + + from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + + DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) + module_src = '\n'.join(( + MODULE_TEMPLATE, + ' _module = None', + *extra_ie_code(DummyInfoExtractor), + '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', + *build_ies(_ALL_CLASSES, (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), + )) + + with open(lazy_extractors_filename, 'wt', encoding='utf-8') 
as f: + f.write(f'{module_src}\n') -def build_lazy_ie(ie, name): - s = ie_template.format( - name=name, - bases=', '.join(map(get_base_name, ie.__bases__)), - module=ie.__module__) +def get_all_ies(): + PLUGINS_DIRNAME = 'ytdlp_plugins' + BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked' + if os.path.exists(PLUGINS_DIRNAME): + os.rename(PLUGINS_DIRNAME, BLOCKED_DIRNAME) + try: + from yt_dlp.extractor import _ALL_CLASSES + finally: + if os.path.exists(BLOCKED_DIRNAME): + os.rename(BLOCKED_DIRNAME, PLUGINS_DIRNAME) + return _ALL_CLASSES + + +def extra_ie_code(ie, base=None): + for var in STATIC_CLASS_PROPERTIES: + val = getattr(ie, var) + if val != (getattr(base, var) if base else NO_ATTR): + yield f' {var} = {val!r}' + yield '' + + for name in CLASS_METHODS: + f = getattr(ie, name) + if not base or f.__func__ != getattr(base, name).__func__: + yield getsource(f) + + +def build_ies(ies, bases, attr_base): + names = [] + for ie in sort_ies(ies, bases): + yield build_lazy_ie(ie, ie.__name__, attr_base) + if ie in ies: + names.append(ie.__name__) + + yield f'\n_ALL_CLASSES = [{", ".join(names)}]' + + +def sort_ies(ies, ignored_bases): + """find the correct sorting and add the required base classes so that subclasses can be correctly created""" + classes, returned_classes = ies[:-1], set() + assert ies[-1].__name__ == 'GenericIE', 'Last IE must be GenericIE' + while classes: + for c in classes[:]: + bases = set(c.__bases__) - {object, *ignored_bases} + restart = False + for b in bases: + if b not in classes and b not in returned_classes: + assert b.__name__ != 'GenericIE', 'Cannot inherit from GenericIE' + classes.insert(0, b) + restart = True + if restart: + break + if bases <= returned_classes: + yield c + returned_classes.add(c) + classes.remove(c) + break + yield ies[-1] + + +def build_lazy_ie(ie, name, attr_base): + bases = ', '.join({ + 'InfoExtractor': 'LazyLoadExtractor', + 'SearchInfoExtractor': 'LazyLoadSearchExtractor', + }.get(base.__name__, base.__name__) for 
base in ie.__bases__) + + s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases) valid_url = getattr(ie, '_VALID_URL', None) if not valid_url and hasattr(ie, '_make_valid_url'): valid_url = ie._make_valid_url() if valid_url: s += f' _VALID_URL = {valid_url!r}\n' - if not ie._WORKING: - s += ' _WORKING = False\n' - if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: - s += f'\n{getsource(ie.suitable)}' - return s + return s + '\n'.join(extra_ie_code(ie, attr_base)) -# find the correct sorting and add the required base classes so that subclasses -# can be correctly created -classes = _ALL_CLASSES[:-1] -ordered_cls = [] -while classes: - for c in classes[:]: - bases = set(c.__bases__) - {object, InfoExtractor, SearchInfoExtractor} - stop = False - for b in bases: - if b not in classes and b not in ordered_cls: - if b.__name__ == 'GenericIE': - exit() - classes.insert(0, b) - stop = True - if stop: - break - if all(b in ordered_cls for b in bases): - ordered_cls.append(c) - classes.remove(c) - break -ordered_cls.append(_ALL_CLASSES[-1]) - -names = [] -for ie in ordered_cls: - name = ie.__name__ - src = build_lazy_ie(ie, name) - module_contents.append(src) - if ie in _ALL_CLASSES: - names.append(name) - -module_contents.append( - '\n_ALL_CLASSES = [{}]'.format(', '.join(names))) - -module_src = '\n'.join(module_contents) + '\n' - -with open(lazy_extractors_filename, 'wt', encoding='utf-8') as f: - f.write(module_src) +if __name__ == '__main__': + main() diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 5531fec4d..d8c53c5e1 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -5,7 +5,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from yt_dlp.extractor import list_extractors +from yt_dlp.extractor import list_extractor_classes def main(): @@ -14,7 +14,7 @@ def main(): if len(args) != 1: parser.error('Expected an output filename') - 
out = '\n'.join(ie.description() for ie in list_extractors(None) if ie.IE_DESC is not False) + out = '\n'.join(ie.description() for ie in list_extractor_classes() if ie.IE_DESC is not False) with open(args[0], 'w', encoding='utf-8') as outf: outf.write(f'# Supported sites\n{out}\n') diff --git a/supportedsites.md b/supportedsites.md index 31bd27768..7663c09d4 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -431,7 +431,6 @@ # Supported sites - **gem.cbc.ca**: [cbcgem] - **gem.cbc.ca:live** - **gem.cbc.ca:playlist** - - **generic**: Generic downloader that works on some sites - **Gettr** - **GettrStreaming** - **Gfycat** @@ -1553,3 +1552,4 @@ # Supported sites - **zingmp3:album** - **zoom** - **Zype** + - **generic**: Generic downloader that works on some sites diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 924604631..0a8bf37b6 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -11,7 +11,7 @@ from .compat import compat_getpass, compat_os_name, compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .downloader import FileDownloader -from .extractor import list_extractors +from .extractor import GenericIE, list_extractor_classes from .extractor.adobepass import MSO_INFO from .extractor.common import InfoExtractor from .options import parseOpts @@ -76,14 +76,20 @@ def get_urls(urls, batchfile, verbose): def print_extractor_information(opts, urls): out = '' if opts.list_extractors: - for ie in list_extractors(opts.age_limit): + urls = dict.fromkeys(urls, False) + for ie in list_extractor_classes(opts.age_limit): out += ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n' - out += ''.join(f' {url}\n' for url in filter(ie.suitable, urls)) + if ie == GenericIE: + matched_urls = [url for url, matched in urls.items() if not matched] + else: + matched_urls = tuple(filter(ie.suitable, urls.keys())) + urls.update(dict.fromkeys(matched_urls, True)) + out += ''.join(f' {url}\n' for url in matched_urls) 
elif opts.list_extractor_descriptions: _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') out = '\n'.join( ie.description(markdown=False, search_examples=_SEARCHES) - for ie in list_extractors(opts.age_limit) if ie.working() and ie.IE_DESC is not False) + '\n' + for ie in list_extractor_classes(opts.age_limit) if ie.working() and ie.IE_DESC is not False) elif opts.ap_list_mso: out = 'Supported TV Providers:\n%s\n' % render_table( ['mso', 'mso name'], @@ -862,7 +868,7 @@ def main(argv=None): sys.exit(f'\nERROR: {e}') -from .extractor import gen_extractors +from .extractor import gen_extractors, list_extractors __all__ = [ 'main', 'YoutubeDL', diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index 506ffe87c..afd3d05ac 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -37,11 +37,17 @@ def gen_extractors(): return [klass() for klass in gen_extractor_classes()] -def list_extractors(age_limit): +def list_extractor_classes(age_limit=None): """Return a list of extractors that are suitable for the given age, sorted by extractor name""" - return sorted(filter( - lambda ie: ie.is_suitable(age_limit), - gen_extractors()), key=lambda ie: ie.IE_NAME.lower()) + yield from sorted(filter( + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, # noqa: F405 + gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) + yield GenericIE # noqa: F405 + + +def list_extractors(age_limit=None): + """Return a list of extractor instances that are suitable for the given age, sorted by extractor name""" + return [ie() for ie in list_extractor_classes(age_limit)] def get_info_extractor(ie_name): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 23d57ddaf..e2460b36a 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -40,6 +40,7 @@ age_restricted, base_url, bug_reports_message, + 
classproperty, clean_html, determine_ext, determine_protocol, @@ -710,9 +711,9 @@ def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" return cls.__name__[:-2] - @property - def IE_NAME(self): - return type(self).__name__[:-2] + @classproperty + def IE_NAME(cls): + return cls.__name__[:-2] @staticmethod def __can_accept_status_code(err, expected_status): @@ -3624,56 +3625,57 @@ def _apply_first_set_cookie_header(self, url_handle, cookie): self._set_cookie(domain, cookie, value) break - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) + @classmethod + def get_testcases(cls, include_onlymatching=False): + t = getattr(cls, '_TEST', None) if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ + assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS' tests = [t] else: - tests = getattr(self, '_TESTS', []) + tests = getattr(cls, '_TESTS', []) for t in tests: if not include_onlymatching and t.get('only_matching', False): continue - t['name'] = type(self).__name__[:-len('IE')] + t['name'] = cls.ie_key() yield t - def is_suitable(self, age_limit): + @classmethod + def is_suitable(cls, age_limit): """ Test whether the extractor is generally suitable for the given age limit (i.e. 
pornographic sites are not, all others usually are) """ any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): + for tc in cls.get_testcases(include_onlymatching=False): if tc.get('playlist', []): tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) + is_restricted = age_restricted(tc.get('info_dict', {}).get('age_limit'), age_limit) if not is_restricted: return True any_restricted = any_restricted or is_restricted return not any_restricted - def description(self, *, markdown=True, search_examples=None): + @classmethod + def description(cls, *, markdown=True, search_examples=None): """Description of the extractor""" desc = '' - if self._NETRC_MACHINE: + if cls._NETRC_MACHINE: if markdown: - desc += f' [{self._NETRC_MACHINE}]' + desc += f' [{cls._NETRC_MACHINE}]' else: - desc += f' [{self._NETRC_MACHINE}]' - if self.IE_DESC is False: + desc += f' [{cls._NETRC_MACHINE}]' + if cls.IE_DESC is False: desc += ' [HIDDEN]' - elif self.IE_DESC: - desc += f' {self.IE_DESC}' - if self.SEARCH_KEY: - desc += f'; "{self.SEARCH_KEY}:" prefix' + elif cls.IE_DESC: + desc += f' {cls.IE_DESC}' + if cls.SEARCH_KEY: + desc += f'; "{cls.SEARCH_KEY}:" prefix' if search_examples: _COUNTS = ('', '5', '10', 'all') - desc += f' (Example: "{self.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' - if not self.working(): + desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' + if not cls.working(): desc += ' (**Currently broken**)' if markdown else ' (Currently broken)' - name = f' - **{self.IE_NAME}**' if markdown else self.IE_NAME + name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME return f'{name}:{desc}' if desc else name def extract_subtitles(self, *args, **kwargs): @@ -3849,6 +3851,6 @@ def _search_results(self, query): """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by 
subclasses') - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY + @classproperty + def SEARCH_KEY(cls): + return cls._SEARCH_KEY diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py index 74c40efd9..708b72fae 100644 --- a/yt_dlp/extractor/drtv.py +++ b/yt_dlp/extractor/drtv.py @@ -18,6 +18,7 @@ url_or_none, ) + class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py index 140fa4a96..32cae429e 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -8,55 +8,36 @@ class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' + _VALID_URL = r'test(?:url)?:(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?$' def _real_extract(self, url): - from ..extractor import gen_extractors + from ..extractor import gen_extractor_classes - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - extractor_id = mobj.group('extractor') - all_extractors = gen_extractors() + extractor_id, num = self._match_valid_url(url).group('extractor', 'num') rex = re.compile(extractor_id, flags=re.IGNORECASE) - matching_extractors = [ - e for e in all_extractors if rex.search(e.IE_NAME)] + matching_extractors = [e for e in gen_extractor_classes() if rex.search(e.IE_NAME)] if len(matching_extractors) == 0: - raise ExtractorError( - 'No extractors matching %r found' % extractor_id, - expected=True) + raise ExtractorError('No extractors matching {extractor_id!r} found', expected=True) elif len(matching_extractors) > 1: - # Is it obvious which one to pick? 
- try: + try: # Check for exact match extractor = next( ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower()) except StopIteration: raise ExtractorError( - ('Found multiple matching extractors: %s' % - ' '.join(ie.IE_NAME for ie in matching_extractors)), + 'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors), expected=True) else: extractor = matching_extractors[0] - num_str = mobj.group('num') - num = int(num_str) if num_str else 0 - - testcases = [] - t = getattr(extractor, '_TEST', None) - if t: - testcases.append(t) - testcases.extend(getattr(extractor, '_TESTS', [])) - + testcases = tuple(extractor.get_testcases(True)) try: - tc = testcases[num] + tc = testcases[int(num or 0)] except IndexError: raise ExtractorError( - ('Test case %d not found, got only %d tests' % - (num, len(testcases))), - expected=True) + f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True) - self.to_screen('Test URL: %s' % tc['url']) - - return self.url_result(tc['url'], video_id=video_id) + self.to_screen(f'Test URL: {tc["url"]}') + return self.url_result(tc['url']) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 907b079ec..97c0a2f15 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -31,6 +31,7 @@ NO_DEFAULT, ExtractorError, bug_reports_message, + classproperty, clean_html, datetime_from_str, dict_get, @@ -5781,16 +5782,17 @@ def _real_extract(self, url): class YoutubeFeedsInfoExtractor(InfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME property. + Subclasses must re-define the _FEED_NAME property. 
""" _LOGIN_REQUIRED = True + _FEED_NAME = 'feeds' def _real_initialize(self): YoutubeBaseInfoExtractor._check_login_required(self) - @property + @classproperty def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME + return f'youtube:{self._FEED_NAME}' def _real_extract(self, url): return self.url_result( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index ba73c2191..82eb30af6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5321,6 +5321,7 @@ def merge_headers(*dicts): class classproperty: def __init__(self, f): + functools.update_wrapper(self, f) self.f = f def __get__(self, _, cls):