From 9c634ef857f49229f60e9fd01678134a29f3af8a Mon Sep 17 00:00:00 2001
From: coletdjnz
Date: Wed, 5 Jan 2022 08:48:17 +0000
Subject: [PATCH] [MainStreaming] Add extractor (#2180)

Closes #1183, https://github.com/ytdl-org/youtube-dl/issues/29615
Authored by: coletdjnz
---
Review notes (this section is not part of the commit message):
  * Restored the named groups and the iframe pattern that had been stripped
    from the regexes in mainstreaming.py, and escaped the dot in "msvdn\.net".
  * _extract_urls now always returns a list; _playlist_entries tolerates a
    missing playlist and skips items without a content ID; the page-config
    fallback raises ExtractorError instead of a bare KeyError.
  * The line counts below were updated for these changes. The commit id in
    the first line and the blob id on the mainstreaming.py index line still
    name the originally committed content.
  * The author address on the From: line is missing and has to be restored
    before this mail can be fed to git-am.

 yt_dlp/extractor/extractors.py    |   1 +
 yt_dlp/extractor/generic.py       |  21 ++-
 yt_dlp/extractor/mainstreaming.py | 238 ++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+), 1 deletion(-)
 create mode 100644 yt_dlp/extractor/mainstreaming.py

diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 02236bf67..268647e7f 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -756,6 +756,7 @@
     MailRuMusicIE,
     MailRuMusicSearchIE,
 )
+from .mainstreaming import MainStreamingIE
 from .malltv import MallTVIE
 from .mangomolo import (
     MangomoloVideoIE,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index baff0280f..c834daddb 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -137,6 +137,7 @@
 from .wimtv import WimTVIE
 from .tvp import TVPEmbedIE
 from .blogger import BloggerIE
+from .mainstreaming import MainStreamingIE
 from .gfycat import GfycatIE
 
 
@@ -2384,6 +2385,19 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20211113'
             }
         },
+        {
+            # MainStreaming player
+            'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
+            'info_dict': {
+                'id': 'EUlZfGWkGpOd',
+                'title': 'La Settimana ',
+                'description': '03 Ottobre ore 02:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 1512
+            }
+        },
         {
             # Multiple gfycat iframe embeds
             'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
@@ -2411,7 +2425,6 @@ class GenericIE(InfoExtractor):
             },
             'playlist_count': 9
         }
-        #
     ]
 
     def report_following_redirect(self, new_url):
@@ -3600,10 +3613,16 @@ def _real_extract(self, url):
         if tvp_urls:
             return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
 
+        # Look for MainStreaming embeds
+        mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
+        if mainstreaming_urls:
+            return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
+
         # Look for Gfycat Embeds
         gfycat_urls = GfycatIE._extract_urls(webpage)
         if gfycat_urls:
             return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries:
diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py
new file mode 100644
index 000000000..0f349a7a3
--- /dev/null
+++ b/yt_dlp/extractor/mainstreaming.py
@@ -0,0 +1,238 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+    traverse_obj,
+    try_get,
+    urljoin
+)
+
+
+class MainStreamingIE(InfoExtractor):
+    """Extractor for MainStreaming player URLs on *.msvdn.net.
+
+    Matches the embed, amp_embed and content URL forms and returns a live stream
+    (contentType 20), a playlist (31) or what appears to be a regular video (10).
+    """
+
+    # <host> is captured without a leading "webtools"/"webtools-" prefix
+    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+    IE_DESC = 'MainStreaming Player'
+
+    _TESTS = [
+        {
+            # Live stream offline, has alternative content id
+            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
+            'info_dict': {
+                'id': '53EN6GxbWaJC',
+                'title': 'Diretta homepage 2021-12-31 12:00',
+                'description': '',
+                'live_status': 'was_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'expected_warnings': [
+                'Ignoring alternative content ID: WDAF1KOWUpH3',
+                'MainStreaming said: Live event is OFFLINE'
+            ],
+            'skip': 'live stream offline'
+        }, {
+            # playlist
+            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
+            'info_dict': {
+                'id': 'WDAF1KOWUpH3',
+                'title': 'Playlist homepage',
+            },
+            'playlist_mincount': 2
+        }, {
+            # livestream
+            'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
+            'info_dict': {
+                'id': 'tDoFkZD3T1Lw',
+                'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'live_status': 'is_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'skip': 'live stream'
+        }, {
+            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
+            'info_dict': {
+                'id': 'EUlZfGWkGpOd',
+                'title': 'La Settimana ',
+                'description': '03 Ottobre ore 02:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 1512
+            }
+        }, {
+            # video without webtools- prefix
+            'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
+            'info_dict': {
+                'id': 'MfuWmzL2lGkA',
+                'title': 'TG Mattina',
+                'description': '06 Ottobre ore 08:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 789.04
+            }
+        }, {
+            # always-on livestream with DVR
+            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
+            'info_dict': {
+                'id': 'HVvPMzy',
+                'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+                'description': 'canale all news',
+                'live_status': 'is_live',
+                'ext': 'mp4',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }, {
+            # no host
+            'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
+            'only_matching': True
+        }, {
+            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
+            'only_matching': True
+        }, {
+            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
+            'only_matching': True
+        }
+    ]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        """Return the URLs of all MainStreaming iframes found in webpage (possibly an empty list)."""
+        return [
+            mobj.group('url') for mobj in re.finditer(
+                r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)]
+
+    def _playlist_entries(self, host, playlist_content):
+        """Yield a url-type result for every playlist item that carries a content ID."""
+        for entry in playlist_content or ():
+            content_id = entry.get('contentID')
+            if not content_id:
+                # Without an ID no embed URL can be built for the entry
+                continue
+            yield {
+                '_type': 'url',
+                'ie_key': MainStreamingIE.ie_key(),
+                'id': content_id,
+                'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
+                'title': entry.get('title'),
+                'url': f'https://{host}/embed/{content_id}'
+            }
+
+    @staticmethod
+    def _get_webtools_host(host):
+        """Return host with a "webtools" prefix added unless it already starts with one."""
+        if not host.startswith('webtools'):
+            host = 'webtools' + ('-' if not host.startswith('.') else '') + host
+        return host
+
+    def _get_webtools_base_url(self, host):
+        """Return the webtools base URL for host, prefixed with self.http_scheme()."""
+        return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
+
+    def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
+        """Download JSON from path below /api/v2/ on the webtools host; non-fatal by default."""
+        # JSON API, does not appear to be documented
+        return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
+
+    def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
+        """Download JSON from path on the webtools host; non-fatal by default."""
+        # webtools docs: https://webtools.msvdn.net/
+        return self._download_json(
+            urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
+
+    def _real_extract(self, url):
+        """Return the info dict or playlist result for url; raise ExtractorError if no content info is found."""
+        host, video_id = self._match_valid_url(url).groups()
+        content_info = try_get(
+            self._call_api(
+                host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
+        # Fallback: read the player config embedded in the page itself
+        if not content_info:
+            webpage = self._download_webpage(url, video_id)
+            player_config = self._parse_json(
+                self._search_regex(
+                    r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
+                    default='{}', flags=re.DOTALL),
+                video_id, transform_source=js_to_json, fatal=False) or {}
+            content_info = player_config.get('contentInfo')
+            if not content_info:
+                raise ExtractorError('Unable to extract MainStreaming content info')
+
+        host = content_info.get('host') or host
+        video_id = content_info.get('contentID') or video_id
+        title = content_info.get('title')
+        description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
+        live_status = 'not_live'
+        if content_info.get('drmEnabled'):
+            self.report_drm(video_id)
+
+        alternative_content_id = content_info.get('alternativeContentID')
+        if alternative_content_id:
+            self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
+
+        content_type = int_or_none(content_info.get('contentType'))
+        format_base_url = None
+        formats = []
+        subtitles = {}
+        # Live content
+        if content_type == 20:
+            dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
+            format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
+            live_status = 'is_live'
+            heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
+            if heartbeat.get('heartBeatUp') is False:
+                self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
+                live_status = 'was_live'
+
+        # Playlist
+        elif content_type == 31:
+            return self.playlist_result(
+                self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
+        # Normal video content?
+        elif content_type == 10:
+            format_base_url = f'https://{host}/vod/{video_id}/%s'
+            # Progressive format
+            # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
+            # however it seems to be the same as original.mp4?
+            formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
+        else:
+            self.raise_no_formats(f'Unknown content type {content_type}')
+
+        if format_base_url:
+            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+                format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
+            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+                format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
+
+            subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
+            formats.extend(m3u8_formats + mpd_formats)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'live_status': live_status,
+            'duration': parse_duration(content_info.get('duration')),
+            'tags': content_info.get('tags'),
+            'subtitles': subtitles,
+            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
+        }