diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index f902daacf..8a409461a 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -6,11 +6,11 @@ class CloudflareStreamIE(InfoExtractor): _SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?' _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' - _EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo=' - _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' + _EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video=' + _ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+' _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P{_ID_RE})' _EMBED_REGEX = [ - rf']+\bsrc=(["\'])(?P(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1', + rf']+\bsrc=(["\'])(?P(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1', rf']+\bsrc=["\'](?Phttps?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})', ] _TESTS = [{ @@ -24,6 +24,14 @@ class CloudflareStreamIE(InfoExtractor): 'params': { 'skip_download': 'm3u8', }, + }, { + 'url': 'https://watch.cloudflarestream.com/embed/sdk-iframe-integration.fla9.latest.js?video=0e8e040aec776862e1d632a699edf59e', + 'info_dict': { + 'id': '0e8e040aec776862e1d632a699edf59e', + 'ext': 'mp4', + 'title': '0e8e040aec776862e1d632a699edf59e', + 'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg', + }, }, { 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', 'only_matching': True, @@ -36,6 +44,9 @@ class CloudflareStreamIE(InfoExtractor): }, { 'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe', 'only_matching': True, + }, { + 'url': 'https://watch.cloudflarestream.com/eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJraWQiOiJmYTA0YjViMzQ2NDkwYTM5NWJiNzQ1NWFhZTA2YzYwZSIsInN1YiI6Ijg4ZDQxMDhhMzY0MjA3M2VhYmFhZjg3ZGExODJkMjYzIiwiZXhwIjoxNjAwNjA5MzE5fQ.xkRJwLGkt0nZ%5F0BlPiwU7iW4pqb4lKkznbKfAhGg0tGcxSS6ZBA3lcTUwu7W%2DyCFbnAl%2Dhqk3Fn%5FqeQS%5FQydP27qTHpB9iIFFsMtk1tqzGZV5v4yrYDnwLSKzEKvVd6QwJnfABtxH2JdpSNuWlMUiVXFxGWgjOw6QeTNDDklTQYXV%5FNLV7sErSn5CeOPeRRkdXb%2D8ip%5FVOcfk1nDsFoOo4fctFtGP0wYMyY5ae8nhhatydHwevuvJCcEvEfh%2D4qjq9mCZOodevmtSQ4YWmggf4BxtWnDWYrGW8Otp6oqezrR8oY4%2DbKdV6PaqBj49aJdcls6xK7PmM8%5Fvjy3xfm0Mg', + 'only_matching': True, }] _WEBPAGE_TESTS = [{ 'url': 'https://upride.cc/incident/shoulder-pass-at-light/', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e5efd08b4..f63bd7825 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2222,6 +2222,11 @@ def build_stream_name(): 'quality': quality, 'has_drm': has_drm, } + + # YouTube-specific + if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'): + f['language'] = yt_audio_content_id.split('.')[0] + resolution = last_stream_inf.get('RESOLUTION') if resolution: mobj = re.search(r'(?P\d+)[xX](?P\d+)', resolution) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 1d1e0770a..a3ca291fc 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -621,6 +621,9 @@ def parse_graphql_video(video): 'url': playable_url, }) extract_dash_manifest(video, formats) + if not formats: + # Do not append false positive entry w/o any formats + return automatic_captions, subtitles = {}, {} is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 48934fc6b..c3505b14f 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -30,6 +30,7 @@ try_call, try_get, url_or_none, + urlencode_postdata, ) @@ -43,8 +44,8 @@ class TikTokBaseIE(InfoExtractor): 'iid': None, # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme 'app_name': 'musical_ly', - 'app_version': '34.1.2', - 'manifest_app_version': '2023401020', + 'app_version': '35.1.3', + 'manifest_app_version': '2023501030', # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 'aid': '0', } @@ -114,7 +115,7 @@ def _get_universal_data(self, webpage, display_id): 'universal data', display_id, end_pattern=r'', default={}), ('__DEFAULT_SCOPE__', {dict})) or {} - def _call_api_impl(self, ep, query, video_id, fatal=True, + def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) @@ -125,7 +126,8 @@ def _call_api_impl(self, ep, query, video_id, fatal=True, fatal=fatal, note=note, errnote=errnote, headers={ 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', - }, query=query) + **(headers or {}), + }, query=query, data=data) def _build_api_query(self, query): return filter_dict({ @@ -174,7 +176,7 @@ def _build_api_query(self, query): 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), }) - def _call_api(self, ep, query, video_id, fatal=True, + def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): if not self._APP_INFO and not self._get_next_app_info(): message = 'No working app info is available' @@ -187,9 +189,11 @@ def _call_api(self, ep, query, video_id, fatal=True, max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO for count in itertools.count(1): self.write_debug(str(self._APP_INFO)) - real_query = self._build_api_query(query) + real_query = self._build_api_query(query or {}) try: - return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote) + return self._call_api_impl( + ep, video_id, query=real_query, data=data, headers=headers, + fatal=fatal, note=note, errnote=errnote) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: message = str(e.cause or e.msg) @@ -204,12 +208,13 @@ def _call_api(self, ep, query, video_id, fatal=True, raise def _extract_aweme_app(self, aweme_id): - feed_list = self._call_api( - 'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', - errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + aweme_detail = traverse_obj( + self._call_api('multi/aweme/detail', aweme_id, data=urlencode_postdata({ + 'aweme_ids': f'[{aweme_id}]', + 'request_source': '0', + }), headers={'X-Argus': ''}), ('aweme_details', 0, {dict})) if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) def _extract_web_data_and_status(self, url, video_id, fatal=True): @@ -1037,7 +1042,8 @@ def _entries(self, list_id, display_id): for retry in self.RetryManager(): try: post_list = self._call_api( - self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}', + self._API_ENDPOINT, display_id, query=query, + note=f'Downloading video list page {page}', errnote='Unable to download video list') except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4291875ee..158dc7822 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3797,6 +3797,8 @@ def _needs_live_processing(self, live_status, duration): def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 + PREFERRED_LANG_VALUE = 10 + original_language = None itags, stream_ids = collections.defaultdict(set), [] itag_qualities, res_qualities = {}, {0: None} q = qualities([ @@ -3845,6 +3847,13 @@ def build_fragments(f): itag_qualities[itag] = quality if height: res_qualities[height] = quality + + is_default = audio_track.get('audioIsDefault') + is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower() + language_code = audio_track.get('id', '').split('.')[0] + if language_code and is_default: + original_language = language_code + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment # (adding `&sq=0` to the URL) and parsing emsg box to determine the # number of fragment that would subsequently requested with (`&sq=N`) @@ -3870,7 +3879,6 @@ def build_fragments(f): continue query = parse_qs(fmt_url) - throttled = False if query.get('n'): try: decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) @@ -3884,20 +3892,16 @@ def build_fragments(f): f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n') if player_url: self.report_warning( - f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}' f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) self.write_debug(e, only_once=True) else: self.report_warning( - 'Cannot decrypt nsig without player_url: You may experience throttling for some formats', + 'Cannot decrypt nsig without player_url: Some formats may be missing', video_id=video_id, only_once=True) - throttled = True + continue tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) - language_preference = ( - 10 if audio_track.get('audioIsDefault') and 10 - else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 - else -1) format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 @@ -3924,17 +3928,15 @@ def build_fragments(f): 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( - join_nonempty(audio_track.get('displayName'), - language_preference > 0 and ' (default)', delim=''), + join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''), name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN', + is_damaged and 'DAMAGED', is_broken and 'BROKEN', (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 - 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1) - + (100 if 'Premium' in name else 0)), + 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 'audio_channels': fmt.get('audioChannels'), 'height': height, @@ -3944,9 +3946,8 @@ def build_fragments(f): 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': join_nonempty(audio_track.get('id', '').split('.')[0], - 'desc' if language_preference < -1 else '') or None, - 'language_preference': language_preference, + 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, + 'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } @@ -4007,6 +4008,10 @@ def process_manifest_format(f, proto, client_name, itag): elif itag: f['format_id'] = itag + if original_language and f.get('language') == original_language: + f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') + f['language_preference'] = PREFERRED_LANG_VALUE + if f.get('source_preference') is None: f['source_preference'] = -1 @@ -4351,7 +4356,7 @@ def is_bad_format(fmt): 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'live_status': live_status, 'release_timestamp': live_start_time, - '_format_sort_fields': ( # source_preference is lower for throttled/potentially damaged formats + '_format_sort_fields': ( # source_preference is lower for potentially damaged formats 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto'), }