From e389d172b6f42e4f332ae679dc48543fb7b9b61d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 12 Mar 2023 14:46:09 +0530 Subject: [PATCH 01/97] Fix 2a23d92d9ec44a0168079e38bcf3d383e5c4c7bb Closes #6517 --- yt_dlp/extractor/youtube.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4165d795c..d7cd0dc62 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3630,6 +3630,7 @@ def _needs_live_processing(self, live_status, duration): return live_status def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): + CHUNK_SIZE = 10 << 20 itags, stream_ids = collections.defaultdict(set), [] itag_qualities, res_qualities = {}, {0: None} q = qualities([ @@ -3642,6 +3643,13 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...)) all_formats = self._configuration_arg('include_duplicate_formats') + def build_fragments(f): + return LazyList({ + 'url': update_url_query(f['url'], { + 'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, f["filesize"])}' + }) + } for range_start in range(0, f['filesize'], CHUNK_SIZE)) + for fmt in streaming_formats: if fmt.get('targetDurationSec'): continue @@ -3771,17 +3779,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l if single_stream and dct.get('ext'): dct['container'] = dct['ext'] + '_dash' - CHUNK_SIZE = 10 << 20 if dct['filesize']: yield { **dct, 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], 'protocol': 'http_dash_segments', - 'fragments': LazyList({ - 'url': update_url_query(dct['url'], { - 'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}' - }) - } for range_start in range(0, dct['filesize'], CHUNK_SIZE)) + 'fragments': build_fragments(dct), } if not all_formats: continue From 0181b9a1b31db3fde943f7cd3fe9662f23bff292 Mon Sep 17 00:00:00 2001 From: Ha Tien Loi Date: Sun, 12 Mar 2023 23:34:22 +0700 Subject: [PATCH 02/97] [extractor/thesun] Update `_VALID_URL` (#6522) Authored by: hatienl0i261299 Closes #6479 --- yt_dlp/extractor/thesun.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/thesun.py b/yt_dlp/extractor/thesun.py index ba5848283..5edcf1cc1 100644 --- a/yt_dlp/extractor/thesun.py +++ b/yt_dlp/extractor/thesun.py @@ -5,15 +5,22 @@ class TheSunIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?the-?sun(\.co\.uk|\.com)/[^/]+/(?P\d+)' + _TESTS = [{ 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/', 'info_dict': { 'id': '2261604', 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf', }, 'playlist_count': 2, - } + }, { + 'url': 'https://www.the-sun.com/entertainment/7611415/1000lb-sisters-fans-rip-amy-dangerous-health-decision/', + 'info_dict': { + 'id': '7611415', + 'title': 'md5:e0b9b976f79dc770e5c80f22f40bb844', + }, + 'playlist_count': 1, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): From 026435714cb7c39613a0d7d2acd15d3823b78d94 Mon Sep 17 00:00:00 2001 From: Ha Tien Loi Date: Mon, 13 Mar 2023 00:20:40 +0700 Subject: [PATCH 03/97] [extractor/LastFM] Rewrite playlist extraction (#6379) Authored by: hatienl0i261299, pukkandan Closes #5975 --- yt_dlp/extractor/lastfm.py | 43 ++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/lastfm.py b/yt_dlp/extractor/lastfm.py index f14198cfd..67103352e 100644 --- a/yt_dlp/extractor/lastfm.py +++ b/yt_dlp/extractor/lastfm.py @@ -1,33 +1,24 @@ +import itertools import re from .common import InfoExtractor -from ..utils import int_or_none, format_field +from ..utils import int_or_none, parse_qs, traverse_obj class LastFMPlaylistBaseIE(InfoExtractor): def _entries(self, url, playlist_id): - webpage = self._download_webpage(url, playlist_id) - start_page_number = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) or 1 - last_page_number = int_or_none(self._search_regex( - r'>(\d+)[^<]*[^<]*]+class="pagination-next', webpage, 'last_page', default=None)) - - for page_number in range(start_page_number, (last_page_number or start_page_number) + 1): + single_page = traverse_obj(parse_qs(url), ('page', -1, {int_or_none})) + for page in itertools.count(single_page or 1): webpage = self._download_webpage( - url, playlist_id, - note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')), - query={'page': page_number}) - page_entries = [ - self.url_result(player_url, 'Youtube') - for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage)) - ] - - for e in page_entries: - yield e + url, playlist_id, f'Downloading page {page}', query={'page': page}) + videos = re.findall(r'data-youtube-url="([^"]+)"', webpage) + yield from videos + if single_page or not videos: + return def _real_extract(self, url): playlist_id = self._match_id(url) - return self.playlist_result(self._entries(url, playlist_id), playlist_id) + return self.playlist_from_matches(self._entries(url, playlist_id), playlist_id, ie='Youtube') class LastFMPlaylistIE(LastFMPlaylistBaseIE): @@ -37,7 +28,7 @@ class LastFMPlaylistIE(LastFMPlaylistBaseIE): 'info_dict': { 'id': 'Oasis', }, - 'playlist_count': 11, + 'playlist_mincount': 11, }, { 'url': 'https://www.last.fm/music/Oasis', 'only_matching': True, @@ -73,6 +64,18 @@ class LastFMUserIE(LastFMPlaylistBaseIE): 'id': '12319471', }, 'playlist_count': 30, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760?page=3', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_count': 32, }] From 1e3c2b6ec28d7ab5e31341fa93c47b65be4fbff4 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sun, 12 Mar 2023 19:38:27 +0200 Subject: [PATCH 04/97] [extractor/medaltv] Fix clips (#6502) Closes #6489 Authored by: xenova --- yt_dlp/extractor/medaltv.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index 82be823b8..9e57ee21a 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -8,12 +8,12 @@ float_or_none, int_or_none, str_or_none, - traverse_obj, + traverse_obj ) class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?Pgames/[^/?#&]+/clips)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K', 'md5': '6930f8972914b6b9fdc2bb3918098ba0', @@ -80,25 +80,14 @@ class MedalTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - path = self._match_valid_url(url).group('path') webpage = self._download_webpage(url, video_id) - next_data = self._search_json( - ']*__NEXT_DATA__[^>]*>', webpage, + hydration_data = self._search_json( + r']*>[^<]*\bhydrationData\s*=', webpage, 'next data', video_id, end_pattern='', fatal=False) - build_id = next_data.get('buildId') - if not build_id: - raise ExtractorError( - 'Could not find build ID.', video_id=video_id) - - locale = next_data.get('locale', 'en') - - api_response = self._download_json( - f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id) - - clip = traverse_obj(api_response, ('pageProps', 'clip')) or {} + clip = traverse_obj(hydration_data, ('clips', ...), get_all=False) if not clip: raise ExtractorError( 'Could not find video information.', video_id=video_id) @@ -152,7 +141,7 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None): # Necessary because the id of the author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. - author = traverse_obj(api_response, ('pageProps', 'profile')) or {} + author = traverse_obj(hydration_data, ('profiles', ...), get_all=False) or {} author_id = str_or_none(author.get('userId')) author_url = format_field(author_id, None, 'https://medal.tv/users/%s') From 80ea6d3dea8483cddd39fc89b5ee1fc06670c33c Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sun, 12 Mar 2023 14:02:17 -0400 Subject: [PATCH 05/97] [extractor/Parler] Rewrite extractor (#6446) Authored by: JChris246 Closes #6068 --- yt_dlp/extractor/parler.py | 94 +++++++++++++++----------------------- 1 file changed, 37 insertions(+), 57 deletions(-) diff --git a/yt_dlp/extractor/parler.py b/yt_dlp/extractor/parler.py index 68a60bc84..2af805e7f 100644 --- a/yt_dlp/extractor/parler.py +++ b/yt_dlp/extractor/parler.py @@ -1,13 +1,14 @@ +import functools + from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( clean_html, - format_field, int_or_none, strip_or_none, traverse_obj, unified_timestamp, - urlencode_postdata, + urljoin, ) @@ -24,7 +25,7 @@ class ParlerIE(InfoExtractor): 'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg', 'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7', 'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197', - 'timestamp': 1659744000, + 'timestamp': 1659785481, 'upload_date': '20220806', 'uploader': 'Tulsi Gabbard', 'uploader_id': 'TulsiGabbard', @@ -34,78 +35,57 @@ class ParlerIE(InfoExtractor): 'repost_count': int, }, }, - { - 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'md5': '11687e2f5bb353682cee338d181422ed', - 'info_dict': { - 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'ext': 'mp4', - 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg', - 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'description': 'This man should run for office', - 'timestamp': 1659657600, - 'upload_date': '20220805', - 'uploader': 'Benny Johnson', - 'uploader_id': 'BennyJohnson', - 'uploader_url': 'https://parler.com/BennyJohnson', - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { 'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5', 'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4', 'info_dict': { 'id': 'r5vkSaz8PxQ', 'ext': 'mp4', - 'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp', - 'title': 'Tom MacDonald Names Reaction', - 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', - 'upload_date': '20220716', - 'duration': 1267, - 'uploader': 'Mahesh Chookolingo', - 'uploader_id': 'maheshchookolingo', - 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo', - 'channel': 'Mahesh Chookolingo', - 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', - 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', - 'categories': ['Entertainment'], - 'tags': list, - 'availability': 'public', 'live_status': 'not_live', - 'view_count': int, 'comment_count': int, + 'duration': 1267, 'like_count': int, 'channel_follower_count': int, - 'age_limit': 0, + 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', + 'upload_date': '20220716', + 'thumbnail': 'https://i.ytimg.com/vi/r5vkSaz8PxQ/maxresdefault.jpg', + 'tags': 'count:17', + 'availability': 'public', + 'categories': ['Entertainment'], 'playable_in_embed': True, + 'channel': 'Who Knows What! With Mahesh & Friends', + 'title': 'Tom MacDonald Names Reaction', + 'uploader': 'Who Knows What! With Mahesh & Friends', + 'uploader_id': '@maheshchookolingo', + 'age_limit': 0, + 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', + 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/@maheshchookolingo', }, }, ] def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id, - data=urlencode_postdata({'uuid': video_id}))['data'][0] - primary = data['primary'] - - embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False) - if embed: - return self.url_result(embed[0], YoutubeIE) + data = self._download_json(f'https://api.parler.com/v0/public/parleys/{video_id}', + video_id)['data'] + if data.get('link'): + return self.url_result(data['link'], YoutubeIE) return { 'id': video_id, - 'url': traverse_obj(primary, ('video_data', 'videoSrc')), - 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')), - 'title': '', - 'description': strip_or_none(clean_html(primary.get('full_body'))) or None, - 'timestamp': unified_timestamp(primary.get('date_created')), - 'uploader': strip_or_none(primary.get('name')), - 'uploader_id': strip_or_none(primary.get('username')), - 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'), - 'view_count': int_or_none(primary.get('view_count')), - 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))), - 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))), + 'title': strip_or_none(data.get('title')) or '', + **traverse_obj(data, { + 'url': ('video', 'videoSrc'), + 'thumbnail': ('video', 'thumbnailUrl'), + 'description': ('body', {clean_html}), + 'timestamp': ('date_created', {unified_timestamp}), + 'uploader': ('user', 'name', {strip_or_none}), + 'uploader_id': ('user', 'username', {str}), + 'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}), + 'view_count': ('views', {int_or_none}), + 'comment_count': ('total_comments', {int_or_none}), + 'repost_count': ('echos', {int_or_none}), + }) } From cf9fd52fabe71d6e7c30d3ea525029ffa561fc9c Mon Sep 17 00:00:00 2001 From: Chris Caruso Date: Sun, 12 Mar 2023 11:07:34 -0700 Subject: [PATCH 06/97] [extractor/jwplatform] Update `_extract_embed_urls` (#6383) Authored by: carusocr --- yt_dlp/extractor/jwplatform.py | 37 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index c94968943..bc47aa6d3 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -8,14 +8,16 @@ class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'md5': '3aa16e4f6860e6e78b7df5829519aed3', 'info_dict': { 'id': 'nPripu9l', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Big Buck Bunny Trailer', 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', 'upload_date': '20081127', 'timestamp': 1227796140, + 'duration': 32.0, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720', } }, { 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', @@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor): }, }, { # Player url not surrounded by quotes - 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip', 'info_dict': { - 'id': 'R10NQdhY', - 'title': 'Playgirl', + 'id': 'jUxh5uin', + 'title': 'Klassenfahrt', 'ext': 'mp4', - 'upload_date': '20220624', - 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', - 'timestamp': 1656064800, - 'description': 'BRD 1966, Will Tremper', - 'duration': 5146.0, + 'upload_date': '20230109', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720', + 'timestamp': 1673270298, + 'description': '', + 'duration': 5193.0, }, 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }, { + # iframe src attribute includes backslash before URL string + 'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion', + 'info_dict': { + 'id': 'QD3gsexj', + 'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios', + 'ext': 'mp4', + 'upload_date': '20230127', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720', + 'timestamp': 1674862986, + 'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4', + 'duration': 263.0, + }, }] @classmethod @@ -57,7 +72,7 @@ def _extract_embed_urls(cls, url, webpage): # is used by hyland.com # if we find