Merge branch 'yt-dlp:master' into cleanup/2024-06

2024-07-03 02:35:28 +02:00 · 2024-06-21 18:40:30 -05:00 · 2024-06-21 18:40:30 -05:00 · c0225a68da
commit c0225a68da
parent dae1faada9 8ca1d57ed0
5 changed files with 63 additions and 33 deletions
--- a/yt_dlp/extractor/cloudflarestream.py
+++ b/yt_dlp/extractor/cloudflarestream.py
@ -6,11 +6,11 @@
 class CloudflareStreamIE(InfoExtractor):
    _SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?'
    _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
-    _EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo='
-    _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
+    _EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video='
+    _ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+'
    _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
    _EMBED_REGEX = [
-        rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1',
+        rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1',
        rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
    ]
    _TESTS = [{
@ -24,6 +24,14 @@ class CloudflareStreamIE(InfoExtractor):
        'params': {
            'skip_download': 'm3u8',
        },
+    }, {
+        'url': 'https://watch.cloudflarestream.com/embed/sdk-iframe-integration.fla9.latest.js?video=0e8e040aec776862e1d632a699edf59e',
+        'info_dict': {
+            'id': '0e8e040aec776862e1d632a699edf59e',
+            'ext': 'mp4',
+            'title': '0e8e040aec776862e1d632a699edf59e',
+            'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
+        },
    }, {
        'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
        'only_matching': True,
@ -36,6 +44,9 @@ class CloudflareStreamIE(InfoExtractor):
    }, {
        'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe',
        'only_matching': True,
+    }, {
+        'url': 'https://watch.cloudflarestream.com/eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJraWQiOiJmYTA0YjViMzQ2NDkwYTM5NWJiNzQ1NWFhZTA2YzYwZSIsInN1YiI6Ijg4ZDQxMDhhMzY0MjA3M2VhYmFhZjg3ZGExODJkMjYzIiwiZXhwIjoxNjAwNjA5MzE5fQ.xkRJwLGkt0nZ%5F0BlPiwU7iW4pqb4lKkznbKfAhGg0tGcxSS6ZBA3lcTUwu7W%2DyCFbnAl%2Dhqk3Fn%5FqeQS%5FQydP27qTHpB9iIFFsMtk1tqzGZV5v4yrYDnwLSKzEKvVd6QwJnfABtxH2JdpSNuWlMUiVXFxGWgjOw6QeTNDDklTQYXV%5FNLV7sErSn5CeOPeRRkdXb%2D8ip%5FVOcfk1nDsFoOo4fctFtGP0wYMyY5ae8nhhatydHwevuvJCcEvEfh%2D4qjq9mCZOodevmtSQ4YWmggf4BxtWnDWYrGW8Otp6oqezrR8oY4%2DbKdV6PaqBj49aJdcls6xK7PmM8%5Fvjy3xfm0Mg',
+        'only_matching': True,
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://upride.cc/incident/shoulder-pass-at-light/',
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -2222,6 +2222,11 @@ def build_stream_name():
                        'quality': quality,
                        'has_drm': has_drm,
                    }
+
+                    # YouTube-specific
+                    if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
+                        f['language'] = yt_audio_content_id.split('.')[0]
+
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@ -621,6 +621,9 @@ def parse_graphql_video(video):
                                'url': playable_url,
                            })
                    extract_dash_manifest(video, formats)
+                    if not formats:
+                        # Do not append false positive entry w/o any formats
+                        return

                    automatic_captions, subtitles = {}, {}
                    is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@ -30,6 +30,7 @@
    try_call,
    try_get,
    url_or_none,
+    urlencode_postdata,
 )


@ -43,8 +44,8 @@ class TikTokBaseIE(InfoExtractor):
        'iid': None,
        # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
        'app_name': 'musical_ly',
-        'app_version': '34.1.2',
-        'manifest_app_version': '2023401020',
+        'app_version': '35.1.3',
+        'manifest_app_version': '2023501030',
        # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
        'aid': '0',
    }
@ -114,7 +115,7 @@ def _get_universal_data(self, webpage, display_id):
            'universal data', display_id, end_pattern=r'</script>', default={}),
            ('__DEFAULT_SCOPE__', {dict})) or {}

-    def _call_api_impl(self, ep, query, video_id, fatal=True,
+    def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                       note='Downloading API JSON', errnote='Unable to download API page'):
        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
@ -125,7 +126,8 @@ def _call_api_impl(self, ep, query, video_id, fatal=True,
            fatal=fatal, note=note, errnote=errnote, headers={
                'User-Agent': self._APP_USER_AGENT,
                'Accept': 'application/json',
-            }, query=query)
+                **(headers or {}),
+            }, query=query, data=data)

    def _build_api_query(self, query):
        return filter_dict({
@ -174,7 +176,7 @@ def _build_api_query(self, query):
            'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
        })

-    def _call_api(self, ep, query, video_id, fatal=True,
+    def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                  note='Downloading API JSON', errnote='Unable to download API page'):
        if not self._APP_INFO and not self._get_next_app_info():
            message = 'No working app info is available'
@ -187,9 +189,11 @@ def _call_api(self, ep, query, video_id, fatal=True,
        max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
        for count in itertools.count(1):
            self.write_debug(str(self._APP_INFO))
-            real_query = self._build_api_query(query)
+            real_query = self._build_api_query(query or {})
            try:
-                return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
+                return self._call_api_impl(
+                    ep, video_id, query=real_query, data=data, headers=headers,
+                    fatal=fatal, note=note, errnote=errnote)
            except ExtractorError as e:
                if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                    message = str(e.cause or e.msg)
@ -204,12 +208,13 @@ def _call_api(self, ep, query, video_id, fatal=True,
                raise

    def _extract_aweme_app(self, aweme_id):
-        feed_list = self._call_api(
-            'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed',
-            errnote='Unable to download video feed').get('aweme_list') or []
-        aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
+        aweme_detail = traverse_obj(
+            self._call_api('multi/aweme/detail', aweme_id, data=urlencode_postdata({
+                'aweme_ids': f'[{aweme_id}]',
+                'request_source': '0',
+            }), headers={'X-Argus': ''}), ('aweme_details', 0, {dict}))
        if not aweme_detail:
-            raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
+            raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id)
        return self._parse_aweme_video_app(aweme_detail)

    def _extract_web_data_and_status(self, url, video_id, fatal=True):
@ -1037,7 +1042,8 @@ def _entries(self, list_id, display_id):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
-                        self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}',
+                        self._API_ENDPOINT, display_id, query=query,
+                        note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -3797,6 +3797,8 @@ def _needs_live_processing(self, live_status, duration):

    def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
        CHUNK_SIZE = 10 << 20
+        PREFERRED_LANG_VALUE = 10
+        original_language = None
        itags, stream_ids = collections.defaultdict(set), []
        itag_qualities, res_qualities = {}, {0: None}
        q = qualities([
@ -3845,6 +3847,13 @@ def build_fragments(f):
                    itag_qualities[itag] = quality
                if height:
                    res_qualities[height] = quality
+
+            is_default = audio_track.get('audioIsDefault')
+            is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower()
+            language_code = audio_track.get('id', '').split('.')[0]
+            if language_code and is_default:
+                original_language = language_code
+
            # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
            # (adding `&sq=0` to the URL) and parsing emsg box to determine the
            # number of fragment that would subsequently requested with (`&sq=N`)
@ -3870,7 +3879,6 @@ def build_fragments(f):
                    continue

            query = parse_qs(fmt_url)
-            throttled = False
            if query.get('n'):
                try:
                    decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0])
@ -3884,20 +3892,16 @@ def build_fragments(f):
                                          f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n')
                    if player_url:
                        self.report_warning(
-                            f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}'
+                            f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}'
                            f'         n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True)
                        self.write_debug(e, only_once=True)
                    else:
                        self.report_warning(
-                            'Cannot decrypt nsig without player_url: You may experience throttling for some formats',
+                            'Cannot decrypt nsig without player_url: Some formats may be missing',
                            video_id=video_id, only_once=True)
-                    throttled = True
+                    continue

            tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
-            language_preference = (
-                10 if audio_track.get('audioIsDefault') and 10
-                else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
-                else -1)
            format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)}))
            # Some formats may have much smaller duration than others (possibly damaged during encoding)
            # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
@ -3924,17 +3928,15 @@ def build_fragments(f):
                'filesize': int_or_none(fmt.get('contentLength')),
                'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}',
                'format_note': join_nonempty(
-                    join_nonempty(audio_track.get('displayName'),
-                                  language_preference > 0 and ' (default)', delim=''),
+                    join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''),
                    name, fmt.get('isDrc') and 'DRC',
                    try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                    try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
+                    is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                    (self.get_param('verbose') or all_formats) and client_name,
                    delim=', '),
                # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
-                'source_preference': ((-10 if throttled else -5 if itag == '22' else -1)
-                                      + (100 if 'Premium' in name else 0)),
+                'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0),
                'fps': fps if fps > 1 else None,  # For some formats, fps is wrongly returned as 1
                'audio_channels': fmt.get('audioChannels'),
                'height': height,
@ -3944,9 +3946,8 @@ def build_fragments(f):
                'filesize_approx': filesize_from_tbr(tbr, format_duration),
                'url': fmt_url,
                'width': int_or_none(fmt.get('width')),
-                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
-                                          'desc' if language_preference < -1 else '') or None,
-                'language_preference': language_preference,
+                'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None,
+                'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1,
                # Strictly de-prioritize broken, damaged and 3gp formats
                'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
            }
@ -4007,6 +4008,10 @@ def process_manifest_format(f, proto, client_name, itag):
            elif itag:
                f['format_id'] = itag

+            if original_language and f.get('language') == original_language:
+                f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
+                f['language_preference'] = PREFERRED_LANG_VALUE
+
            if f.get('source_preference') is None:
                f['source_preference'] = -1

@ -4351,7 +4356,7 @@ def is_bad_format(fmt):
            'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
            'live_status': live_status,
            'release_timestamp': live_start_time,
-            '_format_sort_fields': (  # source_preference is lower for throttled/potentially damaged formats
+            '_format_sort_fields': (  # source_preference is lower for potentially damaged formats
                'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto'),
        }