From 14a14335b280766fbf5a469ae26836d6c1fe450a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 8 Jun 2023 18:58:49 +0530 Subject: [PATCH] [extractor/youtube] Misc cleanup Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 93 ++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6e7485c03..1b1266360 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -292,6 +292,7 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() class YoutubeBaseInfoExtractor(InfoExtractor): @@ -791,17 +792,23 @@ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. + @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, } label_map = { @@ -809,13 +816,13 @@ def _extract_badges(self, renderer: dict): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer')): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -823,11 +830,12 @@ def _extract_badges(self, renderer: dict): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -1020,8 +1028,7 @@ def _extract_video(self, renderer): overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) - badges = self._extract_badges(renderer) - + badges = self._extract_badges(traverse_obj(renderer, 'badges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1079,7 +1086,7 @@ def _extract_video(self, renderer): needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, - 'live_status': live_status + 'live_status': live_status, } @@ -1332,6 +1339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1415,6 +1423,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_url': 'https://www.youtube.com/@thewitcher', 'uploader_id': '@thewitcher', + 'comment_count': int, + 'heatmap': 'count:100', }, }, { @@ -1894,6 +1904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1955,6 +1966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Vsauce', 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', + 'comment_count': int, }, 'params': { 'skip_download': True, @@ -2147,6 +2159,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -2227,6 +2240,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'CBS Mornings', 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', + 'comment_count': int, } }, { @@ -2297,6 +2311,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'colinfurze', 'uploader_url': 'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'format': '17', # 3gp format available on android @@ -2342,6 +2358,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SciShow', 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2370,6 +2388,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', } }, { # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date @@ -2398,6 +2417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', }, 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { @@ -2428,6 +2448,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Quackity', 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', + 'comment_count': int, + 'heatmap': 'count:100', } }, { # continuous livestream. Microformat upload date should be preferred. @@ -2594,6 +2616,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'MrBeast', 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -2655,6 +2679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'heatmap': 'count:100', }, }, { @@ -2684,6 +2709,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.webp', 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, @@ -2720,6 +2747,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Christopher Sykes', 'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries', 'uploader_id': '@ChristopherSykesDocumentaries', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -3312,10 +3340,9 @@ def _extract_comment(self, comment_renderer, parent=None): if comment_abr is not None: info['is_favorited'] = 'creatorHeart' in comment_abr - comment_ab_icontype = traverse_obj( - comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType')) - if comment_ab_icontype is not None: - info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE') + badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) + if self._has_badge(badges, BadgeType.VERIFIED): + info['author_is_verified'] = True is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') if is_pinned: @@ -4481,7 +4508,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + badges = self._extract_badges(traverse_obj(vpir, 'badges')) is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or get_first(video_details, 'isPrivate', expected_type=bool)) @@ -4554,13 +4581,14 @@ def _extract_channel_renderer(self, renderer): channel_id = self.ucid_or_none(renderer['channelId']) title = self._get_text(renderer, 'title') channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) - # As of 2023-03-01 YouTube doesn't use the channel handles on these renderers yet. - # However we can expect them to change that in the future. channel_handle = self.handle_from_url( traverse_obj(renderer, ( 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')), {str}), get_all=False)) + if not channel_handle: + # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search + channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) return { '_type': 'url', 'url': channel_url, @@ -4573,9 +4601,15 @@ def _extract_channel_renderer(self, renderer): 'title': title, 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. + # However, in feed/channels this is set correctly to the subscriber count + 'channel_follower_count': traverse_obj( + renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), - 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'playlist_count': ( + # videoCountText may be the subscriber count + self._get_count(renderer, 'videoCountText') + if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), } @@ -5100,7 +5134,7 @@ def _extract_availability(self, data): playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} player_header_privacy = playlist_header_renderer.get('privacy') - badges = self._extract_badges(sidebar_renderer) + badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge privacy_setting_icon = get_first( @@ -5350,7 +5384,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], - 'channel_follower_count': int + 'channel_follower_count': int, }, }, { 'note': 'playlists, singlepage', @@ -5690,7 +5724,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'AlTsmyW4auo', # This will keep changing + 'id': 'hGkQjiJLjWQ', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -6202,7 +6236,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': str, 'uploader': str, 'uploader_url': str, - 'uploader_id': str + 'uploader_id': str, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6865,12 +6899,14 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', 'title': 'Kurzgesagt – In a Nutshell', 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', - 'playlist_count': int, # XXX: should have a way of saying > 1 + # No longer available for search as it is set to the handle. + # 'playlist_count': int, 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', 'thumbnails': list, 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_follower_count': int, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -7134,6 +7170,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'live_status': 'not_live', 'channel_follower_count': int, 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', } }] @@ -7194,6 +7232,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'},