From c0edd1c19483bfc2ca120cf6893984968860fbce Mon Sep 17 00:00:00 2001
From: zer0-delta <172248380+zer0-delta@users.noreply.github.com>
Date: Wed, 12 Jun 2024 05:37:34 +0100
Subject: [PATCH] Added BBC Maestro extractor

---
 yt_dlp/extractor/_extractors.py |   3 +
 yt_dlp/extractor/bbcmaestro.py  | 131 ++++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 yt_dlp/extractor/bbcmaestro.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index e9cd38a65..68a51ea2e 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -216,6 +216,9 @@
     BBCCoUkIPlayerGroupIE,
     BBCCoUkPlaylistIE,
 )
+from .bbcmaestro import (
+    BBCMaestroComIE,
+)
 from .beatbump import (
     BeatBumpPlaylistIE,
     BeatBumpVideoIE,
diff --git a/yt_dlp/extractor/bbcmaestro.py b/yt_dlp/extractor/bbcmaestro.py
new file mode 100644
index 000000000..df3f74bf1
--- /dev/null
+++ b/yt_dlp/extractor/bbcmaestro.py
@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..utils import orderedSet, smuggle_url, unsmuggle_url
+from .common import InfoExtractor
+
+
+class BBCMaestroComIE(InfoExtractor):
+    """Extractor for bbcmaestro.com course trailers and lessons.
+
+    A trailer URL yields a single video and needs no login. A ``/lessons/``
+    URL needs cookies and, unless ``noplaylist`` is set or the URL was
+    smuggled with ``forcevideo``, expands into a playlist of every lesson
+    linked from the page.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?bbcmaestro\.com/courses/(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'https://www.bbcmaestro.com/courses/julia-donaldson/writing-children-s-picture-books/trailer',
+        'info_dict': {
+            'id': 'julia-donaldson/writing-children-s-picture-books/trailer',
+            'ext': 'mp4',
+            'title': 'Course trailer',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    # NOTE(review): the element carrying the lesson title is assumed to be a
+    # heading; the second pattern accepts any element as a fallback. Confirm
+    # against the live markup.
+    _LESSON_TITLE_RES = (
+        r'<(?P<tag>h[1-6])\b[^>]*lesson[^>]*title[^>]*>\s*(?P<title>.+?)\s*</(?P=tag)>',
+        r'<(?P<tag>\w+)\b[^>]*lesson[^>]*title[^>]*>\s*(?P<title>.+?)\s*</(?P=tag)>',
+    )
+
+    def _do_extract_video(self, url, webpage, video_id):
+        """Return the info dict for a single trailer or lesson page.
+
+        Trailers get a fixed title; lesson titles are scraped from the page.
+        Formats come from the HLS manifest found in a ``src`` attribute.
+        """
+        if '/lessons/' in url:
+            title = self._html_search_regex(
+                self._LESSON_TITLE_RES, webpage, 'title',
+                flags=re.DOTALL, group='title')
+        else:
+            title = 'Course trailer'
+
+        # The search is fatal by default: a page without a manifest raises
+        # here, so no "manifest missing" branch is needed below.
+        m3u8_url = self._html_search_regex(
+            r'<\w+\b[^>]*\bsrc=["\']?(?P<url>[^"\'\s>]+\.m3u8)',
+            webpage, 'video URL', group='url')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+    def _do_extract_playlist(self, url, webpage):
+        """Return a playlist of every lesson linked from a course page.
+
+        The playlist id is the two path segments following ``/courses/``.
+        Entries are smuggled with ``forcevideo`` so that re-entering this
+        extractor returns the lesson itself instead of the playlist again.
+        """
+        # twitter:title presumably looks like "<a> - <b> | <c>": keep what
+        # follows the first "-" and turn the "|" separator into "-".
+        twitter_title = self._html_search_meta(
+            ['twitter:title'], webpage, fatal=True)
+        playlist_title = (
+            twitter_title
+            .split('-', maxsplit=1)[-1]
+            .replace('|', '-')
+            .strip()
+        )
+
+        playlist_id = self._search_regex(
+            r'/courses/([^/?#]+/[^/?#]+)', url, 'Playlist ID (from URL)')
+        self.write_debug(f'playlist_id: {playlist_id!r}')
+
+        # Stop at quotes, whitespace and ">" as well as "?" so an href
+        # without a query string cannot swallow the markup that follows it.
+        lesson_ids = orderedSet(re.findall(
+            r'href=[^>]*/courses/' + re.escape(playlist_id) + r'/lessons/([^?#"\'\s>]+)',
+            webpage))
+        entries = [
+            self.url_result(
+                smuggle_url(
+                    f'https://www.bbcmaestro.com/courses/{playlist_id}/lessons/{lesson_id}',
+                    {'forcevideo': True}),
+                ie=BBCMaestroComIE.ie_key())
+            for lesson_id in lesson_ids
+        ]
+        return self.playlist_result(
+            entries, playlist_id=playlist_id, playlist_title=playlist_title)
+
+    def _check_login_provided(self):
+        """Raise a login-required error unless cookies were passed."""
+        if not self._cookies_passed:
+            self.raise_login_required(
+                'Login details are needed to download this content', method='cookies')
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        self.write_debug(f'Extracting from: {url!r}')
+        video_id = self._match_id(url)
+        self.write_debug(f'Video ID: {video_id!r}')
+
+        # Only the course trailer can be downloaded without login, so fail
+        # before requesting a lesson page.
+        is_lesson = '/lessons/' in url
+        if is_lesson:
+            self._check_login_provided()
+
+        webpage = self._download_webpage(url, video_id)
+
+        is_playlist = (
+            is_lesson
+            and not smuggled_data.get('forcevideo')
+            and not self.get_param('noplaylist')
+        )
+        if is_playlist:
+            return self._do_extract_playlist(url, webpage=webpage)
+        return self._do_extract_video(url, webpage=webpage, video_id=video_id)