diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c411efb5a..d38822db6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -216,6 +216,9 @@ BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .bbcmaestro import ( + BBCMaestroComIE, +) from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, diff --git a/yt_dlp/extractor/bbcmaestro.py b/yt_dlp/extractor/bbcmaestro.py new file mode 100644 index 000000000..704de0c1a --- /dev/null +++ b/yt_dlp/extractor/bbcmaestro.py @@ -0,0 +1,122 @@ +import re + +from .common import InfoExtractor +from ..utils import orderedSet, smuggle_url, unsmuggle_url + + +class BBCMaestroComIE(InfoExtractor): + _VALID_URL = ( + r'https?://(?:www\.)?bbcmaestro\.com/courses/(?P[^?]+)' + ) + _TESTS = [{ + 'url': 'https://www.bbcmaestro.com/courses/julia-donaldson/writing-children-s-picture-books/trailer', + 'info_dict': { + 'id': 'julia-donaldson/writing-children-s-picture-books/trailer', + 'ext': 'mp4', + 'title': 'Course trailer', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _do_extract_video(self, url, webpage, video_id): + if '/lessons/' not in url: + title = 'Course trailer' + else: + title = self._html_search_regex( + r']*lesson[^>]*title[^>]*>\s*(.+?)\s*', + webpage, + name='title', + flags=re.RegexFlag.S, + ) + + m3u8_url = self._html_search_regex( + r']+src="?\'?(\S+\.m3u8)', + webpage, + 'video URL', + ) + formats = [] + if m3u8_url: + formats = self._extract_m3u8_formats( + m3u8_url=m3u8_url, + video_id=video_id, + ext='mp4', + m3u8_id='hls', + fatal=False, + ) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + def _do_extract_playlist(self, url, webpage): + # Twitter Title usually: - | + twitter_title = self._html_search_meta( + ['twitter:title'], + webpage, + fatal=True, + ) + playlist_title = ( + twitter_title + .split('-', maxsplit=1)[-1] + .replace('|', '-') + ) + + url_without_query_parameters = url.split('?', maxsplit=1)[0] + self.write_debug(f'url_without_query_parameters: {url_without_query_parameters}') + playlist_id = self._search_regex( + pattern=r'.*/courses/([^/]+/[^/]+).*', + string=url_without_query_parameters, + name='Playlist ID (from URL)', + fatal=True, + ) + self.write_debug(f'playlist_id: {playlist_id}') + entries = [ + self.url_result( + smuggle_url( + f'https://www.bbcmaestro.com/courses/{playlist_id}/lessons/{item_video_id}', + {'forcevideo': True}, + ), + ie=BBCMaestroComIE.ie_key(), + ) + for item_video_id in orderedSet(re.findall( + r'href=[^>]*/courses/' + re.escape(playlist_id) + r'/lessons/([^?]+)', + webpage, + )) + ] + # self.write_debug('entries: %r' % entries) + return self.playlist_result( + entries=entries, + playlist_id=playlist_id, + playlist_title=playlist_title, + ) + + def _check_login_provided(self): + if not self._cookies_passed: + self.raise_login_required('Login details are needed to download this content', method='cookies') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self.write_debug(f'Extracting from: {url}') + video_id = self._match_id(url) + self.write_debug(f'Video ID: {video_id}') + webpage = self._download_webpage(url, video_id) + + is_private_course_content = ('/lessons/' in url) + is_login_required = is_private_course_content + if is_login_required: + # Note: We can only download the course trailer without login + self._check_login_provided() + + is_playlist = ( + is_private_course_content + and not smuggled_data.get('forcevideo') + and not self.get_param('noplaylist') + ) + + if is_playlist: + return self._do_extract_playlist(url, webpage=webpage) + return self._do_extract_video(url, webpage=webpage, video_id=video_id)