From c91ac833ea99b00506e470a44cf930e4e23378c9 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sun, 4 Jun 2023 16:04:47 +0800 Subject: [PATCH 01/74] [extractor/acast] Support embeds (#7212) Authored by: pabs3 --- yt_dlp/extractor/acast.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/acast.py b/yt_dlp/extractor/acast.py index f2f828f8e..427d04c31 100644 --- a/yt_dlp/extractor/acast.py +++ b/yt_dlp/extractor/acast.py @@ -40,28 +40,33 @@ def _call_api(self, path, video_id, query=None): class ACastIE(ACastBaseIE): IE_NAME = 'acast' - _VALID_URL = r'''(?x) + _VALID_URL = r'''(?x: https?:// (?: (?:(?:embed|www)\.)?acast\.com/| play\.acast\.com/s/ ) - (?P[^/]+)/(?P[^/#?]+) - ''' + (?P[^/]+)/(?P[^/#?"]+) + )''' + _EMBED_REGEX = [rf'(?x)]+\bsrc=[\'"](?P{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', + 'description': 'md5:013959207e05011ad14a222cf22278cc', 'timestamp': 1477346700, 'upload_date': '20161024', 'duration': 2766, - 'creator': 'Anton Berg & Martin Johnson', + 'creator': 'Third Ear Studio', 'series': 'Spår', 'episode': '2. 
Raggarmordet - Röster ur det förflutna', + 'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg', + 'episode_number': 2, + 'display_id': '2.raggarmordet-rosterurdetforflutna', + 'season_number': 4, + 'season': 'Season 4', } }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', @@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE): 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government', + 'info_dict': { + 'id': '646c68fb21fbf20011e9c651', + 'ext': 'mp3', + 'creator': 'The Australian National University', + 'display_id': 'can-labor-be-a-long-form-government', + 'duration': 2618, + 'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg', + 'title': 'Can Labor be a long-form government?', + 'episode': 'Can Labor be a long-form government?', + 'upload_date': '20230523', + 'series': 'Democracy Sausage with Mark Kenny', + 'timestamp': 1684826362, + 'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16', + } + }] def _real_extract(self, url): channel, display_id = self._match_valid_url(url).groups() From 12037d8b0a578fcc78a5c8f98964e48ee6060e25 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 06:10:30 -0500 Subject: [PATCH 02/74] [extractor/substack] Fix extraction (#7218) Closes #7155 Authored by: bashonly --- yt_dlp/extractor/substack.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index fa3826388..3782ceed1 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -2,7 +2,7 @@ import urllib.parse from .common import InfoExtractor -from ..utils import str_or_none, traverse_obj +from ..utils import 
js_to_json, str_or_none, traverse_obj class SubstackIE(InfoExtractor): @@ -14,7 +14,7 @@ class SubstackIE(InfoExtractor): 'id': '47660949', 'ext': 'mp4', 'title': 'I MADE A VLOG', - 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 'description': 'md5:9248af9a759321e1027226f988f54d96', 'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', 'uploader': 'Maybe Baby', 'uploader_id': '33628', @@ -77,7 +77,9 @@ def _real_extract(self, url): display_id, username = self._match_valid_url(url).group('id', 'username') webpage = self._download_webpage(url, display_id) - webpage_info = self._search_json(r']*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + webpage_info = self._parse_json(self._search_json( + r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) post_type = webpage_info['post']['type'] formats, subtitles = [], {} From 971d901d129403e875a04dd92109507a03fbc070 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 07:03:44 -0500 Subject: [PATCH 03/74] [extractor/tencent] Fix fatal metadata extraction (#7219) Closes #7177 Authored by: bashonly --- yt_dlp/extractor/tencent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py index 42a2175b0..6618ea4e6 100644 --- a/yt_dlp/extractor/tencent.py +++ b/yt_dlp/extractor/tencent.py @@ -163,11 +163,9 @@ class VQQBaseIE(TencentBaseIE): _REFERER = 'v.qq.com' def _get_webpage_metadata(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'(?s)]*>[^<]*window\.__pinia\s*=\s*([^<]+)', - webpage, 'pinia data', fatal=False), - video_id, transform_source=js_to_json, fatal=False) + return self._search_json( + r']*>[^<]*window\.__(?:pinia|PINIA__)\s*=', + webpage, 'pinia data', video_id, transform_source=js_to_json, fatal=False) class 
VQQVideoIE(VQQBaseIE): @@ -176,7 +174,7 @@ class VQQVideoIE(VQQBaseIE): _TESTS = [{ 'url': 'https://v.qq.com/x/page/q326831cny0.html', - 'md5': '84568b3722e15e9cd023b5594558c4a7', + 'md5': 'b11c9cb781df710d686b950376676e2a', 'info_dict': { 'id': 'q326831cny0', 'ext': 'mp4', @@ -187,7 +185,7 @@ class VQQVideoIE(VQQBaseIE): }, }, { 'url': 'https://v.qq.com/x/page/o3013za7cse.html', - 'md5': 'cc431c4f9114a55643893c2c8ebf5592', + 'md5': 'a1bcf42c6d28c189bd2fe2d468abb287', 'info_dict': { 'id': 'o3013za7cse', 'ext': 'mp4', @@ -208,6 +206,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '鸡毛飞上天', 'format_id': r're:^shd', }, + 'skip': '404', }, { 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', 'md5': 'fadd10bf88aec3420f06f19ee1d24c5b', @@ -220,6 +219,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '青年理工工作者生活研究所', 'format_id': r're:^shd', }, + 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to China 'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html', From 5ee9a7d6e18ceea956e831994cf11c423979354f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 07:15:09 -0500 Subject: [PATCH 04/74] [extractor/sverigesradio] Support slug URLs (#7220) Closes #7145 Authored by: bashonly --- yt_dlp/extractor/sverigesradio.py | 62 +++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/sverigesradio.py b/yt_dlp/extractor/sverigesradio.py index 65da615d0..01a07b399 100644 --- a/yt_dlp/extractor/sverigesradio.py +++ b/yt_dlp/extractor/sverigesradio.py @@ -1,8 +1,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, + get_element_by_id, + get_element_html_by_class, int_or_none, str_or_none, + traverse_obj, + url_or_none, ) @@ -21,7 +26,15 @@ class SverigesRadioBaseIE(InfoExtractor): } def _real_extract(self, url): - audio_id = self._match_id(url) + audio_id, display_id = 
self._match_valid_url(url).group('id', 'slug') + if not audio_id: + webpage = self._download_webpage(url, display_id) + audio_id = ( + traverse_obj( + get_element_html_by_class('audio-button', webpage), + ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False) + or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId']) + query = { 'id': audio_id, 'type': self._AUDIO_TYPE, @@ -30,7 +43,6 @@ def _real_extract(self, url): item = self._download_json( self._BASE_URL + 'audiometadata', audio_id, 'Downloading audio JSON metadata', query=query)['items'][0] - title = item['subtitle'] query['format'] = 'iis' urls = [] @@ -61,18 +73,20 @@ def _real_extract(self, url): return { 'id': audio_id, - 'title': title, 'formats': formats, - 'series': item.get('title'), - 'duration': int_or_none(item.get('duration')), - 'thumbnail': item.get('displayimageurl'), - 'description': item.get('description'), + **traverse_obj(item, { + 'title': 'subtitle', + 'series': 'title', + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('displayimageurl', {url_or_none}), + 'description': 'description', + }), } class SverigesRadioPublicationIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:publication' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P[0-9]+)|/(?P[\w-]+))' _TESTS = [{ 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546', 'md5': '6a4917e1923fccb080e5a206a5afa542', @@ -85,6 +99,18 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): 'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas', + 'md5': 'f8a914ad50f491bb74eed403ab4bfef6', + 'info_dict': { + 'id': '8360345', + 
'ext': 'm4a', + 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas', + 'series': 'Radiosporten', + 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f', + 'duration': 72, + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887', 'only_matching': True, @@ -94,8 +120,8 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): class SverigesRadioEpisodeIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:episode' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P\d+)|(?P[\w-]+))(?:$|[#?])' + _TESTS = [{ 'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300', 'md5': '20dc4d8db24228f846be390b0c59a07c', 'info_dict': { @@ -106,6 +132,18 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE): 'title': 'Metoo och valen', 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', 'thumbnail': r're:^https?://.*\.jpg', - } - } + }, + }, { + 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023', + 'md5': 'ce17fb82520a8033dbb846993d5589fe', + 'info_dict': { + 'id': '2160416', + 'ext': 'm4a', + 'title': 'P4 Live med First Aid Kit', + 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d', + 'thumbnail': r're:^https?://.*\.jpg', + 'series': 'P4 Live', + 'duration': 5640, + }, + }] _AUDIO_TYPE = 'episode' From 97d60ad8cd6c99f01e463a9acfce8693aff2a609 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 08:37:59 -0500 Subject: [PATCH 05/74] [extractor/foxnews] Fix extractors (#7222) Closes #6050 Authored by: bashonly --- yt_dlp/extractor/amp.py | 9 +++-- yt_dlp/extractor/foxnews.py | 77 +++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/amp.py b/yt_dlp/extractor/amp.py index 
b0cbd775c..0d259c549 100644 --- a/yt_dlp/extractor/amp.py +++ b/yt_dlp/extractor/amp.py @@ -5,6 +5,7 @@ int_or_none, mimetype2ext, parse_iso8601, + strip_jsonp, unified_timestamp, url_or_none, ) @@ -15,7 +16,7 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _extract_feed_info(self, url): feed = self._download_json( url, None, 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed') + 'Unable to download Akamai AMP feed', transform_source=strip_jsonp) item = feed.get('channel', {}).get('item') if not item: raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) @@ -73,8 +74,10 @@ def get_media_node(name, default=None): media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py index 52172aace..6aa63614e 100644 --- a/yt_dlp/extractor/foxnews.py +++ b/yt_dlp/extractor/foxnews.py @@ -7,8 +7,37 @@ class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ + { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 
'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 
'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, ] @classmethod @@ -67,10 +89,10 @@ def _extract_embed_urls(cls, url, webpage): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ def _real_extract(self, url): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! 
- Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, From 4815d35c191e7d375b94492a6486dd2ba43a8954 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 08:49:10 -0500 Subject: [PATCH 
06/74] [extractor/sonyliv] Fix login with token (#7223) Authored by: bashonly --- yt_dlp/extractor/sonyliv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py index aaad420f1..5ebe20df7 100644 --- a/yt_dlp/extractor/sonyliv.py +++ b/yt_dlp/extractor/sonyliv.py @@ -10,6 +10,8 @@ from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, + try_call, try_get, ) @@ -77,8 +79,10 @@ def _perform_login(self, username, password): self._HEADERS['device_id'] = self._get_device_id() self._HEADERS['content-type'] = 'application/json' - if username.lower() == 'token' and len(password) > 1198: + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._HEADERS['authorization'] = password + self.report_login() + return elif len(username) != 10 or not username.isdigit(): raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}') From 7bc92517463f5766e9d9b92c3823b5cf403c0e3d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:07:13 -0500 Subject: [PATCH 07/74] [extractor/shemaroome] Pass `stream_key` header to downloader (#7224) Closes #7133 Authored by: bashonly --- yt_dlp/extractor/shemaroome.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py index 7a78c6e05..ec9938b8c 100644 --- a/yt_dlp/extractor/shemaroome.py +++ b/yt_dlp/extractor/shemaroome.py @@ -73,7 +73,10 @@ def _real_extract(self, url): key = bytes_to_intlist(compat_b64decode(data_json['key'])) iv = [0] * 16 m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') - formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + headers = {'stream_key': data_json['stream_key']} + formats, m3u8_subs = 
self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers) + for fmt in formats: + fmt['http_headers'] = headers release_date = self._html_search_regex( (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), From 7f8ddebbb51c9fd4a347306332a718ba41b371b8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:19:16 -0500 Subject: [PATCH 08/74] [extractor/hotstar] Support `/shows/` URLs (#7225) Closes #6463 Authored by: bashonly --- yt_dlp/extractor/hotstar.py | 40 +++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index cea1812f1..591e23b8a 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -83,7 +83,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?Pmovies|sports|episode|(?Ptv))/ + (?Pmovies|sports|episode|(?Ptv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? 
[^/?#]+/ @@ -122,6 +122,25 @@ class HotStarIE(HotStarBaseIE): 'episode': 'Janhvi Targets Suman', 'episode_number': 8, } + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843', + 'info_dict': { + 'id': '1000282843', + 'ext': 'mp4', + 'title': 'Anupama, Anuj Share a Moment', + 'season': 'Chapter 1', + 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0', + 'timestamp': 1678149000, + 'channel': 'StarPlus', + 'series': 'Anupama', + 'season_number': 1, + 'season_id': 7399, + 'upload_date': '20230307', + 'episode': 'Anupama, Anuj Share a Moment', + 'episode_number': 853, + 'duration': 1272, + 'channel_id': 3, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -139,6 +158,7 @@ class HotStarIE(HotStarBaseIE): 'sports': 'match', 'episode': 'episode', 'tv': 'episode', + 'shows': 'episode', None: 'content', } @@ -304,13 +324,16 @@ def _real_extract(self, url): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { 'id': '3_2_26', }, 'playlist_mincount': 20, + }, { + 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'only_matching': True, }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, @@ -327,7 +350,7 @@ def _real_extract(self, url): class HotStarSeasonIE(HotStarBaseIE): IE_NAME = 'hotstar:season' - _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P\w+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P\w+)' _TESTS = [{ 'url': 
'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', 'info_dict': { @@ -346,6 +369,9 @@ class HotStarSeasonIE(HotStarBaseIE): 'id': '8208', }, 'playlist_mincount': 19, + }, { + 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/', + 'only_matching': True, }] def _real_extract(self, url): @@ -356,7 +382,7 @@ def _real_extract(self, url): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P\d+))/?(?:[#?]|$)' + _VALID_URL = r'(?Phttps?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -375,6 +401,12 @@ class HotStarSeriesIE(HotStarBaseIE): 'id': '435', }, 'playlist_mincount': 267, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/', + 'info_dict': { + 'id': '1260022017', + }, + 'playlist_mincount': 940, }] def _real_extract(self, url): From c2a1bdb00931969193f2a31ea27b9c66a07aaec2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:28:40 -0500 Subject: [PATCH 09/74] [extractor/tiktok] Extract 1080p adaptive formats (#7228) Closes #7109 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 63708229e..49035e971 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -62,7 +62,7 @@ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; 
Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -79,11 +79,11 @@ def _build_api_query(self, query, app_version, manifest_app_version): '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', - 'device_type': 'Pixel 4', + 'device_type': 'Pixel 7', 'device_platform': 'android', - 'resolution': '1080*1920', + 'resolution': '1080*2400', 'dpi': 420, - 'os_version': '10', + 'os_version': '13', 'os_api': '29', 'carrier_region': 'US', 'sys_region': 'US', @@ -624,6 +624,32 @@ class TikTokIE(TikTokBaseIE): 'thumbnails': 'count:3', }, 'expected_warnings': ['Unable to find video in feed'], + }, { + # 1080p format + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', + 'md5': '982512017a8a917124d5a08c8ae79621', + 'info_dict': { + 'id': '7107337212743830830', + 'ext': 'mp4', + 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok', + 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! 
@musicontiktok', + 'uploader': 'tatemcrae', + 'uploader_id': '86328792343818240', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'creator': 't8', + 'artist': 't8', + 'track': 'original sound', + 'upload_date': '20220609', + 'timestamp': 1654805899, + 'duration': 150, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + 'params': {'format': 'bytevc1_1080p_808907-0'}, }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', From ee0ed0338df328cd986f97315c8162b5a151476d Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 5 Jun 2023 10:40:48 -0500 Subject: [PATCH 10/74] [extractor/zdf] Fix formats extraction Closes #7238, Closes #7240 Authored by: bashonly --- yt_dlp/extractor/zdf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index c863c46ed..c04d51b7e 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -24,7 +24,7 @@ class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'uhd') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') def _call_api(self, url, video_id, item, api_token=None, referrer=None): headers = {} @@ -61,6 +61,9 @@ def _extract_format(self, video_id, formats, format_urls, meta): elif mime_type == 'application/f4m+xml' or ext == 'f4m': new_formats = self._extract_f4m_formats( update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) + elif ext == 'mpd': + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) else: f = parse_codecs(meta.get('mimeCodec')) if not f and meta.get('type'): From 
59d9fe08312bbb76ee26238d207a8ca35410a48d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 5 Jun 2023 10:52:45 -0500 Subject: [PATCH 11/74] [extractor/mgtv] Fix formats extraction (#7234) Closes #7008 Authored by: bashonly --- yt_dlp/extractor/mgtv.py | 65 ++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index edc92b371..06edcb396 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -1,17 +1,17 @@ import base64 import time +import urllib.error import uuid from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) from ..utils import ( ExtractorError, int_or_none, + parse_resolution, + traverse_obj, try_get, url_or_none, + urljoin, ) @@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15588271.html', 'info_dict': { 'id': '15588271', 'ext': 'mp4', - 'title': '春日迟迟再出发 沉浸版', + 'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫', 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', 'thumbnail': r're:^https?://.+\.jpg', 'duration': 4026, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/333652/7329822.html', 'info_dict': { @@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'duration': 2656, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15591647.html', 'only_matching': True, @@ -64,6 +67,13 @@ class MGTVIE(InfoExtractor): 'only_matching': True, }] + _RESOLUTIONS = { + '标清': ('480p', '854x480'), + '高清': ('540p', '960x540'), + '超清': ('720p', '1280x720'), + '蓝光': ('1080p', '1920x1080'), + } + def _real_extract(self, url): video_id = self._match_id(url) tk2 = base64.urlsafe_b64encode( @@ -76,55 +86,60 @@ def _real_extract(self, url): 
'type': 'pch5' }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: error = self._parse_json(e.cause.read().decode(), None) if error.get('code') == 40005: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(error['msg'], expected=True) raise - info = api_data['info'] - title = info['title'].strip() + stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ - 'pm2': api_data['atc']['pm2'], 'tk2': tk2, + 'pm2': api_data['atc']['pm2'], 'video_id': video_id, + 'type': 'pch5', 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] - stream_domain = stream_data['stream_domain'][0] + stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False) formats = [] - for idx, stream in enumerate(stream_data['stream']): - stream_path = stream.get('url') - if not stream_path: - continue - format_data = self._download_json( - stream_domain + stream_path, video_id, - note=f'Download video info for format #{idx}') - format_url = format_data.get('info') + for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))): + stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str) + resolution = traverse_obj( + self._RESOLUTIONS, (stream_name, 1 if stream.get('scale') == '16:9' else 0)) + format_url = traverse_obj(self._download_json( + urljoin(stream_domain, stream['url']), video_id, fatal=False, + note=f'Downloading video info for format {resolution or stream_name}'), + ('info', {url_or_none})) if not format_url: continue tbr = int_or_none(stream.get('filebitrate') or self._search_regex( r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ - 'format_id': compat_str(tbr or idx), - 'url': url_or_none(format_url), + 'format_id': 
str(tbr or idx), + 'url': format_url, 'ext': 'mp4', 'tbr': tbr, + 'vcodec': stream.get('videoFormat'), + 'acodec': stream.get('audioFormat'), + **parse_resolution(resolution), 'protocol': 'm3u8_native', 'http_headers': { 'Referer': url, }, - 'format_note': stream.get('name'), + 'format_note': stream_name, }) return { 'id': video_id, - 'title': title, 'formats': formats, - 'description': info.get('desc'), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('thumb'), + **traverse_obj(api_data, ('info', { + 'title': ('title', {str.strip}), + 'description': ('desc', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumb', {url_or_none}), + })), 'subtitles': self.extract_subtitles(video_id, stream_domain), } From c2b801fea59628d5c873e06a0727fbf2051bbd1f Mon Sep 17 00:00:00 2001 From: stanoarn <74262064+stanoarn@users.noreply.github.com> Date: Wed, 7 Jun 2023 22:18:06 +0200 Subject: [PATCH 12/74] [extractor/rozhlas] `MujRozhlas`: Add extractor (#7129) Authored by: stanoarn --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rozhlas.py | 164 ++++++++++++++++++++++++++++---- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7120fd37d..f54024211 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1625,6 +1625,7 @@ from .rozhlas import ( RozhlasIE, RozhlasVltavaIE, + MujRozhlasIE, ) from .rte import RteIE, RteRadioIE from .rtlnl import ( diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py index 5cc664e00..5f83d42e8 100644 --- a/yt_dlp/extractor/rozhlas.py +++ b/yt_dlp/extractor/rozhlas.py @@ -1,10 +1,15 @@ +import itertools +import urllib.error + from .common import InfoExtractor from ..utils import ( + ExtractorError, extract_attributes, int_or_none, remove_start, str_or_none, traverse_obj, + unified_timestamp, url_or_none, ) @@ -51,7 +56,40 @@ def _real_extract(self, url): } -class 
RozhlasVltavaIE(InfoExtractor): +class RozhlasBaseIE(InfoExtractor): + def _extract_formats(self, entry, audio_id): + formats = [] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, audio_id) + try: + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 429: + retry.error = e.cause + else: + self.report_warning(e.msg) + + return formats + + +class RozhlasVltavaIE(RozhlasBaseIE): _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', @@ -168,33 +206,14 @@ class RozhlasVltavaIE(InfoExtractor): }] def _extract_video(self, entry): - formats = [] audio_id = entry['meta']['ga']['contentId'] - for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): - ext = audio.get('variant') - if ext == 'dash': - formats.extend(self._extract_mpd_formats( - audio['url'], audio_id, mpd_id=ext, fatal=False)) - elif ext == 'hls': - formats.extend(self._extract_m3u8_formats( - audio['url'], audio_id, 'm4a', m3u8_id=ext, fatal=False)) - else: - formats.append({ - 'url': audio['url'], - 'ext': ext, - 'format_id': ext, - 'abr': int_or_none(audio.get('bitrate')), - 'acodec': ext, - 'vcodec': 'none', - }) - chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) return { 'id': audio_id, 'chapter': traverse_obj(entry, ('meta', 
'ga', 'contentNameShort')) if chapter_number else None, 'chapter_number': chapter_number, - 'formats': formats, + 'formats': self._extract_formats(entry, audio_id), **traverse_obj(entry, { 'title': ('meta', 'ga', 'contentName'), 'description': 'title', @@ -219,3 +238,106 @@ def _real_extract(self, url): 'title': traverse_obj(data, ('series', 'title')), 'entries': map(self._extract_video, data['playlist']), } + + +class MujRozhlasIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # single episode extraction + 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', + 'md5': '6f8fd68663e64936623e67c152a669e0', + 'info_dict': { + 'id': '10739193', + 'ext': 'mp3', + 'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli', + 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', + 'timestamp': 1684915200, + 'modified_timestamp': 1684922446, + 'series': 'Vykopávky', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', + 'channel_id': 'radio-wave', + 'upload_date': '20230524', + 'modified_date': '20230524', + }, + }, { + # serial extraction + 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b', + 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. 
O pramenech a genezi Babičky', + 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be', + }, + }, { + # show extraction + 'url': 'https://www.mujrozhlas.cz/nespavci', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '09db9b37-d0f4-368c-986a-d3439f741f08', + 'title': 'Nespavci', + 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', + }, + }] + + def _call_api(self, path, item_id, msg='API JSON'): + return self._download_json( + f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id, + note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data'] + + def _extract_audio_entry(self, entry): + audio_id = entry['meta']['ga']['contentId'] + + return { + 'id': audio_id, + 'formats': self._extract_formats(entry['attributes'], audio_id), + **traverse_obj(entry, { + 'title': ('attributes', 'title'), + 'description': ('attributes', 'description'), + 'episode_number': ('attributes', 'part'), + 'series': ('attributes', 'mirroredShow', 'title'), + 'chapter': ('attributes', 'mirroredSerial', 'title'), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + 'timestamp': ('attributes', 'since', {unified_timestamp}), + 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}), + 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}), + }) + } + + def _entries(self, api_url, playlist_id): + for page in itertools.count(1): + episodes = self._download_json( + api_url, playlist_id, note=f'Downloading episodes page {page}', + errnote=f'Failed to download episodes page {page}', fatal=False) + for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])): + yield self._extract_audio_entry(episode) + api_url = traverse_obj(episodes, ('links', 'next', {url_or_none})) + if not api_url: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id) + + 
entity = info['siteEntityBundle'] + + if entity == 'episode': + return self._extract_audio_entry(self._call_api( + 'episodes', info['contentId'], 'episode info API JSON')) + + elif entity in ('show', 'serial'): + playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId'] + data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON') + api_url = data['relationships']['episodes']['links']['related'] + return self.playlist_result( + self._entries(api_url, playlist_id), playlist_id, + **traverse_obj(data, ('attributes', { + 'title': 'title', + 'description': 'description', + }))) + + else: + # `entity == 'person'` not implemented yet by API, ref: + # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation + raise ExtractorError(f'Unsupported entity type "{entity}"') From 14a14335b280766fbf5a469ae26836d6c1fe450a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 8 Jun 2023 18:58:49 +0530 Subject: [PATCH 13/74] [extractor/youtube] Misc cleanup Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 93 ++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6e7485c03..1b1266360 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -292,6 +292,7 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() class YoutubeBaseInfoExtractor(InfoExtractor): @@ -791,17 +792,23 @@ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. 
+ @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, } label_map = { @@ -809,13 +816,13 @@ def _extract_badges(self, renderer: dict): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer')): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -823,11 +830,12 @@ def _extract_badges(self, renderer: dict): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -1020,8 +1028,7 @@ def _extract_video(self, renderer): overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 
'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) - badges = self._extract_badges(renderer) - + badges = self._extract_badges(traverse_obj(renderer, 'badges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1079,7 +1086,7 @@ def _extract_video(self, renderer): needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, - 'live_status': live_status + 'live_status': live_status, } @@ -1332,6 +1339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1415,6 +1423,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_url': 'https://www.youtube.com/@thewitcher', 'uploader_id': '@thewitcher', + 'comment_count': int, + 'heatmap': 'count:100', }, }, { @@ -1894,6 +1904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1955,6 +1966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Vsauce', 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', + 'comment_count': int, }, 'params': { 'skip_download': True, @@ -2147,6 +2159,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -2227,6 +2240,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'CBS Mornings', 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', + 'comment_count': int, } }, { @@ -2297,6 +2311,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'colinfurze', 'uploader_url': 'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'format': '17', # 3gp format available on android @@ -2342,6 +2358,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SciShow', 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2370,6 +2388,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', } }, { # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date @@ -2398,6 +2417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', }, 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { @@ -2428,6 +2448,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Quackity', 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', + 'comment_count': int, + 'heatmap': 'count:100', } }, { # continuous livestream. Microformat upload date should be preferred. 
@@ -2594,6 +2616,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'MrBeast', 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -2655,6 +2679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'heatmap': 'count:100', }, }, { @@ -2684,6 +2709,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.webp', 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, @@ -2720,6 +2747,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Christopher Sykes', 'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries', 'uploader_id': '@ChristopherSykesDocumentaries', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -3312,10 +3340,9 @@ def _extract_comment(self, comment_renderer, parent=None): if comment_abr is not None: info['is_favorited'] = 'creatorHeart' in comment_abr - comment_ab_icontype = traverse_obj( - comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType')) - if comment_ab_icontype is not None: - info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE') + badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) + if self._has_badge(badges, BadgeType.VERIFIED): + info['author_is_verified'] = True is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') if is_pinned: @@ -4481,7 +4508,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - badges = 
self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + badges = self._extract_badges(traverse_obj(vpir, 'badges')) is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or get_first(video_details, 'isPrivate', expected_type=bool)) @@ -4554,13 +4581,14 @@ def _extract_channel_renderer(self, renderer): channel_id = self.ucid_or_none(renderer['channelId']) title = self._get_text(renderer, 'title') channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) - # As of 2023-03-01 YouTube doesn't use the channel handles on these renderers yet. - # However we can expect them to change that in the future. channel_handle = self.handle_from_url( traverse_obj(renderer, ( 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')), {str}), get_all=False)) + if not channel_handle: + # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search + channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) return { '_type': 'url', 'url': channel_url, @@ -4573,9 +4601,15 @@ def _extract_channel_renderer(self, renderer): 'title': title, 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. 
+ # However, in feed/channels this is set correctly to the subscriber count + 'channel_follower_count': traverse_obj( + renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), - 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'playlist_count': ( + # videoCountText may be the subscriber count + self._get_count(renderer, 'videoCountText') + if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), } @@ -5100,7 +5134,7 @@ def _extract_availability(self, data): playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} player_header_privacy = playlist_header_renderer.get('privacy') - badges = self._extract_badges(sidebar_renderer) + badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge privacy_setting_icon = get_first( @@ -5350,7 +5384,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], - 'channel_follower_count': int + 'channel_follower_count': int, }, }, { 'note': 'playlists, singlepage', @@ -5690,7 +5724,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'AlTsmyW4auo', # This will keep changing + 'id': 'hGkQjiJLjWQ', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -6202,7 +6236,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': str, 'uploader': str, 'uploader_url': str, - 'uploader_id': str + 'uploader_id': str, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6865,12 +6899,14 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'description': 
'md5:4ae48dfa9505ffc307dad26342d06bfc', 'title': 'Kurzgesagt – In a Nutshell', 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', - 'playlist_count': int, # XXX: should have a way of saying > 1 + # No longer available for search as it is set to the handle. + # 'playlist_count': int, 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', 'thumbnails': list, 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_follower_count': int, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -7134,6 +7170,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'live_status': 'not_live', 'channel_follower_count': int, 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', } }] @@ -7194,6 +7232,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, From 8213ce28a485e200f6a7e1af1434a987c8e702bd Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Thu, 8 Jun 2023 19:50:05 +1200 Subject: [PATCH 14/74] [extractor/youtube] Extract `channel_is_verified` (#7213) Authored by: coletdjnz --- README.md | 1 + yt_dlp/extractor/common.py | 1 + yt_dlp/extractor/youtube.py | 38 +++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/README.md b/README.md index 3d89c0af9..ce555c66f 100644 --- a/README.md +++ b/README.md @@ -1292,6 +1292,7 @@ # OUTPUT TEMPLATE - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel - `channel_follower_count` (numeric): Number of followers of the channel + - `channel_is_verified` (boolean): Whether the channel is verified on the platform - `location` (string): Physical location where the video was filmed - `duration` (numeric): Length of the video in seconds - 
`duration_string` (string): Length of the video (HH:mm:ss) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index fa46a5240..ca2164a5d 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -286,6 +286,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1b1266360..47ad1da76 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -803,12 +803,15 @@ def _extract_badges(self, badge_list: list): 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED } label_map = { @@ -817,6 +820,8 @@ def _extract_badges(self, badge_list: list): 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED } badges = [] @@ -1029,6 +1034,7 @@ def _extract_video(self, renderer): renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) badges = self._extract_badges(traverse_obj(renderer, 'badges')) + owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) 
navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1087,6 +1093,7 @@ def _extract_video(self, renderer): is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, 'live_status': live_status, + 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None } @@ -1424,6 +1431,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@thewitcher', 'uploader_id': '@thewitcher', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, }, @@ -1454,6 +1462,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', 'uploader_id': '@FlyingKitty900', 'comment_count': int, + 'channel_is_verified': True, }, }, { @@ -1587,6 +1596,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Olympics', 'uploader_url': 'https://www.youtube.com/@Olympics', 'uploader_id': '@Olympics', + 'channel_is_verified': True, }, 'params': { 'skip_download': 'requires avconv', @@ -1904,6 +1914,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -1967,6 +1978,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', 'comment_count': int, + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -2159,6 +2171,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -2241,6 +2254,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', 'comment_count': int, + 'channel_is_verified': True, } }, { @@ -2312,6 +2326,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -2359,6 +2374,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { @@ -2449,6 +2465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', } }, @@ -2617,6 +2634,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, @@ -2679,6 +2697,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'channel_is_verified': True, 'heatmap': 'count:100', }, }, @@ -2710,6 +2729,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -4483,6 +4503,9 @@ def process_language(container, base_url, lang_code, sub_name, query): info['artist'] = mrr_contents_text elif mrr_title == 'Song': info['track'] = mrr_contents_text + owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 
'videoOwnerRenderer', 'badges'))) + if self._has_badge(owner_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True info.update({ 'uploader': info.get('channel'), @@ -4611,6 +4634,8 @@ def _extract_channel_renderer(self, renderer): self._get_count(renderer, 'videoCountText') if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), + 'channel_is_verified': True if self._has_badge( + self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None, } def _grid_entries(self, grid_renderer): @@ -5026,6 +5051,10 @@ def _get_uncropped(url): 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) + + channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False)) + if self._has_badge(channel_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True # Playlist stats is a text runs array containing [video count, view count, last updated]. # last updated or (view count and last updated) may be missing. 
playlist_stats = get_first( @@ -5385,6 +5414,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], 'channel_follower_count': int, + 'channel_is_verified': True, }, }, { 'note': 'playlists, singlepage', @@ -5561,6 +5591,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', @@ -5748,6 +5779,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@SkyNews', 'uploader_id': '@SkyNews', 'uploader': 'Sky News', + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -6237,6 +6269,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': str, 'uploader_url': str, 'uploader_id': str, + 'channel_is_verified': bool, # this will keep changing } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6272,6 +6305,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'PewDiePie', 'uploader_url': 'https://www.youtube.com/@PewDiePie', 'uploader_id': '@PewDiePie', + 'channel_is_verified': True, } }], 'params': {'extract_flat': True}, @@ -6290,6 +6324,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, 'playlist_count': 0, }, { @@ -6324,6 +6359,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'description': 'I make music', 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', 'channel_follower_count': int, + 'channel_is_verified': True, }, 'playlist_mincount': 10, }] @@ -6906,6 +6942,7 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a 
Nutshell', + 'channel_is_verified': True, 'channel_follower_count': int, } }], @@ -7232,6 +7269,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], From 44c0d66442b568d9e1359e669d8b029b08a77fa7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:36:09 -0500 Subject: [PATCH 15/74] [extractor/lbry] Extract original quality formats (#7257) Closes #7251 Authored by: bashonly --- yt_dlp/extractor/lbry.py | 129 ++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index b5def1e07..23d3daf13 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -1,8 +1,8 @@ import functools import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, @@ -12,7 +12,10 @@ int_or_none, mimetype2ext, parse_qs, + traverse_obj, try_get, + url_or_none, + urlhandle_detect_ext, urljoin, ) @@ -52,38 +55,25 @@ def _permanent_url(self, url, claim_name, claim_id): '/%s:%s' % (claim_name, claim_id)) def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) + stream_type = traverse_obj(stream, ('value', 'stream_type', {str})) + + info = traverse_obj(stream, { + 'title': ('value', 
'title', {str}), + 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}), + 'description': ('value', 'description', {str}), + 'license': ('value', 'license', {str}), + 'timestamp': ('timestamp', {int_or_none}), + 'release_timestamp': ('value', 'release_time', {int_or_none}), + 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'duration': ('value', stream_type, 'duration', {int_or_none}), + 'channel': ('signing_channel', 'value', 'title', {str}), + 'channel_id': ('signing_channel', 'claim_id', {str}), + }) + + channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) + if channel_name and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) return info @@ -186,6 +176,28 @@ class LBRYIE(LBRYBaseIE): 'license': 'None', }, 'params': {'skip_download': True} + }, { + # original quality format w/higher resolution than HLS formats + 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4', + 'md5': '305b0b3b369bde1b984961f005b67193', + 'info_dict': { + 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', + 'ext': 'mp4', + 'title': 
'Biotechnological Invasion of Skin (April 2023)', + 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'channel': 'Wicked Truths', + 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'timestamp': 1685790036, + 'upload_date': '20230603', + 'release_timestamp': 1685617473, + 'release_date': '20230601', + 'duration': 1063, + 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp', + 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'], + 'license': 'None', + 'protocol': 'https', # test for direct mp4 download + }, }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -221,41 +233,64 @@ def _real_extract(self, url): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) + display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') headers = {'Referer': 'https://odysee.com/'} - if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + + formats = [] + stream_type = traverse_obj(result, ('value', 'stream_type', {str})) + + if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + + # GET request returns original video/audio file if available + ext = urlhandle_detect_ext(self._request_webpage( + streaming_url, display_id, 'Checking for original quality', headers=headers)) + if ext != 'm3u8': + formats.append({ + 'url': streaming_url, + 'format_id': 'original', + 'quality': 1, + **traverse_obj(result, ('value', { + 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))), + 'filesize': ('source', 
'size', {int_or_none}), + 'width': ('video', 'width', {int_or_none}), + 'height': ('video', 'height', {int_or_none}), + }), get_all=False), + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + # HEAD request returns redirect response to m3u8 URL if available final_url = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, note='Downloading streaming redirect url info').geturl() + elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('VideoURL') + final_url = live_data.get('VideoURL') # Upcoming videos may still give VideoURL if not live_data.get('Live'): - streaming_url = final_url = None + final_url = None self.raise_no_formats('This stream is not live', True, claim_id) + else: raise UnsupportedError(url) - info = self._parse_stream(result, url) if determine_ext(final_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - else: - info['url'] = streaming_url + formats.extend(self._extract_m3u8_formats( + final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers)) + return { - **info, + **self._parse_stream(result, url), 'id': claim_id, - 'title': result['value']['title'], + 'formats': formats, 'is_live': is_live, 'http_headers': headers, } @@ -299,14 +334,12 @@ def _fetch_page(self, claim_id, url, params, page): if not (stream_claim_name and stream_claim_id): continue - info = self._parse_stream(item, url) - info.update({ + yield { + **self._parse_stream(item, url), '_type': 'url', 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - 
yield info + } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') From d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:47:13 -0500 Subject: [PATCH 16/74] [extractor/twitter] Add login support (#7258) Closes #6951 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 213 +++++++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4624ce503..f854d9c4a 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE -from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -30,11 +29,67 @@ class TwitterBaseIE(InfoExtractor): + _NETRC_MACHINE = 'twitter' _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} _guest_token = None + _flow_token = None + + _LOGIN_INIT_DATA = json.dumps({ + 'input_flow_data': { + 'flow_context': { + 'debug_overrides': {}, + 'start_location': { + 'location': 'unknown' + } + } + }, + 'subtask_versions': { + 'action_list': 2, + 'alert_dialog': 1, + 'app_download_cta': 1, + 'check_logged_in_account': 1, + 'choice_selection': 3, + 'contacts_live_sync_permission_prompt': 0, + 'cta': 7, + 'email_verification': 2, + 'end_flow': 1, + 'enter_date': 1, + 'enter_email': 2, + 'enter_password': 5, + 'enter_phone': 2, + 'enter_recaptcha': 1, + 'enter_text': 5, + 'enter_username': 2, + 'generic_urt': 3, + 'in_app_notification': 1, + 
'interest_picker': 3, + 'js_instrumentation': 1, + 'menu_dialog': 1, + 'notifications_permission_prompt': 2, + 'open_account': 2, + 'open_home_timeline': 1, + 'open_link': 1, + 'phone_verification': 4, + 'privacy_options': 1, + 'security_key': 3, + 'select_avatar': 4, + 'select_banner': 2, + 'settings_list': 7, + 'show_code': 1, + 'sign_up': 2, + 'sign_up_review': 4, + 'tweet_selection_urt': 1, + 'update_users': 1, + 'upload_media': 1, + 'user_recommendations_list': 4, + 'user_recommendations_urt': 1, + 'wait_spinner': 3, + 'web_modal': 1 + } + }, separators=(',', ':')).encode() def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -86,18 +141,151 @@ def _search_dimensions_in_video_url(a_format, video_url): 'height': int(m.group('height')), }) - @functools.cached_property + @property def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _call_api(self, path, video_id, query={}, graphql=False): - cookies = self._get_cookies(self._API_BASE) + def _fetch_guest_token(self, headers, display_id): + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if not self._guest_token: + raise ExtractorError('Could not retrieve guest token') + + def _set_base_headers(self): headers = self._AUTH.copy() + csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) + if csrf_token: + headers['x-csrf-token'] = csrf_token + return headers - csrf_cookie = cookies.get('ct0') - if csrf_cookie: - headers['x-csrf-token'] = csrf_cookie.value + def _call_login_api(self, note, headers, query={}, data=None): + response = self._download_json( + f'{self._API_BASE}onboarding/task.json', None, note, + headers=headers, query=query, data=data, expected_status=400) + error = traverse_obj(response, ('errors', 0, 'message', {str})) + if error: + 
raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True) + elif traverse_obj(response, 'status') != 'success': + raise ExtractorError('Login was unsuccessful') + subtask = traverse_obj( + response, ('subtasks', ..., 'subtask_id', {str}), get_all=False) + if not subtask: + raise ExtractorError('Twitter API did not return next login subtask') + + self._flow_token = response['flow_token'] + + return subtask + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage('https://twitter.com/', None, 'Requesting cookies') + headers = self._set_base_headers() + self._fetch_guest_token(headers, None) + headers.update({ + 'content-type': 'application/json', + 'x-guest-token': self._guest_token, + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + 'Referer': 'https://twitter.com/', + 'Origin': 'https://twitter.com', + }) + + def build_login_json(*subtask_inputs): + return json.dumps({ + 'flow_token': self._flow_token, + 'subtask_inputs': subtask_inputs + }, separators=(',', ':')).encode() + + def input_dict(subtask_id, text): + return { + 'subtask_id': subtask_id, + 'enter_text': { + 'text': text, + 'link': 'next_link' + } + } + + next_subtask = self._call_login_api( + 'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA) + + while not self.is_logged_in: + if next_subtask == 'LoginJsInstrumentationSubtask': + next_subtask = self._call_login_api( + 'Submitting JS instrumentation response', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'js_instrumentation': { + 'response': '{}', + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterUserIdentifierSSO': + next_subtask = self._call_login_api( + 'Submitting username', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'settings_list': { + 'setting_responses': [{ + 'key': 'user_identifier', + 'response_data': { + 'text_data': { + 'result': username + } + } + 
}], + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterAlternateIdentifierSubtask': + next_subtask = self._call_login_api( + 'Submitting alternate identifier', headers, + data=build_login_json(input_dict(next_subtask, self._get_tfa_info( + 'one of username, phone number or email that was not used as --username')))) + + elif next_subtask == 'LoginEnterPassword': + next_subtask = self._call_login_api( + 'Submitting password', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'enter_password': { + 'password': password, + 'link': 'next_link' + } + })) + + elif next_subtask == 'AccountDuplicationCheck': + next_subtask = self._call_login_api( + 'Submitting account duplication check', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'check_logged_in_account': { + 'link': 'AccountDuplicationCheck_false' + } + })) + + elif next_subtask == 'LoginTwoFactorAuthChallenge': + next_subtask = self._call_login_api( + 'Submitting 2FA token', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('two-factor authentication token')))) + + elif next_subtask == 'LoginAcid': + next_subtask = self._call_login_api( + 'Submitting confirmation code', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + + elif next_subtask == 'LoginSuccessSubtask': + raise ExtractorError('Twitter API did not grant auth token cookie') + + else: + raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"') + + self.report_login() + + def _call_api(self, path, video_id, query={}, graphql=False): + headers = self._set_base_headers() if self.is_logged_in: headers.update({ 'x-twitter-auth-type': 'OAuth2Session', @@ -106,15 +294,10 @@ def _call_api(self, path, video_id, query={}, graphql=False): }) for first_attempt in (True, False): - if not self.is_logged_in and not self._guest_token: - headers.pop('x-guest-token', None) - self._guest_token = 
traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', video_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if self._guest_token: + if not self.is_logged_in: + if not self._guest_token: + self._fetch_guest_token(headers, video_id) headers['x-guest-token'] = self._guest_token - elif not self.is_logged_in: - raise ExtractorError('Could not retrieve guest token') allowed_status = {400, 401, 403, 404} if graphql else {403} result = self._download_json( From 4f7b11cc1c1cebf598107e00cd7295588ed484da Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 Jun 2023 15:43:22 -0500 Subject: [PATCH 17/74] [extractor/voot] Fix extractor (#7227) Closes #6715 Authored by: bashonly --- yt_dlp/extractor/voot.py | 177 ++++++++++++++++++++++++++------------- 1 file changed, 119 insertions(+), 58 deletions(-) diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py index b709b74e2..dd41647aa 100644 --- a/yt_dlp/extractor/voot.py +++ b/yt_dlp/extractor/voot.py @@ -1,14 +1,86 @@ +import json +import time +import urllib.error +import uuid + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + float_or_none, int_or_none, + jwt_decode_hs256, + parse_age_limit, + traverse_obj, + try_call, try_get, - unified_timestamp, + unified_strdate, ) -class VootIE(InfoExtractor): +class VootBaseIE(InfoExtractor): + _NETRC_MACHINE = 'voot' + _GEO_BYPASS = False + _LOGIN_HINT = 'Log in with "-u -p ", or use "-u token -p " to login with auth token.' 
+ _TOKEN = None + _EXPIRY = 0 + _API_HEADERS = {'Origin': 'https://www.voot.com', 'Referer': 'https://www.voot.com/'} + + def _perform_login(self, username, password): + if self._TOKEN and self._EXPIRY: + return + + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): + VootBaseIE._TOKEN = password + VootBaseIE._EXPIRY = jwt_decode_hs256(password)['exp'] + self.report_login() + + # Mobile number as username is not supported + elif not username.isdigit(): + check_username = self._download_json( + 'https://userauth.voot.com/usersV3/v3/checkUser', None, data=json.dumps({ + 'type': 'email', + 'email': username + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Checking username', expected_status=403) + if not traverse_obj(check_username, ('isExist', {bool})): + if traverse_obj(check_username, ('status', 'code', {int})) == 9999: + self.raise_geo_restricted(countries=['IN']) + raise ExtractorError('Incorrect username', expected=True) + auth_token = traverse_obj(self._download_json( + 'https://userauth.voot.com/usersV3/v3/login', None, data=json.dumps({ + 'type': 'traditional', + 'deviceId': str(uuid.uuid4()), + 'deviceBrand': 'PC/MAC', + 'data': { + 'email': username, + 'password': password + } + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Logging in', expected_status=400), ('data', 'authToken', {dict})) + if not auth_token: + raise ExtractorError('Incorrect password', expected=True) + VootBaseIE._TOKEN = auth_token['accessToken'] + VootBaseIE._EXPIRY = auth_token['expirationTime'] + + else: + raise ExtractorError(self._LOGIN_HINT, expected=True) + + def _check_token_expiry(self): + if int(time.time()) >= self._EXPIRY: + raise ExtractorError('Access token has expired', expected=True) + + def _real_initialize(self): + if not self._TOKEN: + 
self.raise_login_required(self._LOGIN_HINT, method=None) + self._check_token_expiry() + + +class VootIE(VootBaseIE): _VALID_URL = r'''(?x) (?: voot:| @@ -20,27 +92,25 @@ class VootIE(InfoExtractor): ) (?P\d{3,}) ''' - _GEO_COUNTRIES = ['IN'] _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '0_8ledb18o', + 'id': '441353', 'ext': 'mp4', - 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'title': 'Is this the end of Kamini?', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 'timestamp': 1472162937, + 'timestamp': 1472103000, 'upload_date': '20160825', 'series': 'Ishq Ka Rang Safed', 'season_number': 1, 'episode': 'Is this the end of Kamini?', 'episode_number': 340, - 'view_count': int, - 'like_count': int, + 'release_date': '20160825', + 'season': 'Season 1', + 'age_limit': 13, + 'duration': 1146.0, }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', 'only_matching': True, @@ -55,59 +125,50 @@ class VootIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) media_info = self._download_json( - 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, - query={ - 'platform': 'Web', - 'pId': 2, - 'mediaId': video_id, - }) + 'https://psapi.voot.com/jio/voot/v1/voot-web/content/query/asset-details', video_id, + query={'ids': f'include:{video_id}', 'responseType': 'common'}, headers={'accesstoken': self._TOKEN}) - status_code = try_get(media_info, lambda x: x['status']['code'], int) - if status_code != 0: - raise ExtractorError(media_info['status']['message'], expected=True) + try: + m3u8_url = self._download_json( + 'https://vootapi.media.jio.com/playback/v1/playbackrights', video_id, + 'Downloading playback JSON', data=b'{}', 
headers={ + **self.geo_verification_headers(), + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + 'platform': 'androidwebdesktop', + 'vootid': video_id, + 'voottoken': self._TOKEN, + })['m3u8'] + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 400: + self._check_token_expiry() + raise - media = media_info['assets'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._remove_duplicate_formats(formats) - entry_id = media['EntryId'] - title = media['MediaName'] - formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, - video_id, 'mp4', m3u8_id='hls') - - description, series, season_number, episode, episode_number = [None] * 5 - - for meta in try_get(media, lambda x: x['Metas'], list) or []: - key, value = meta.get('Key'), meta.get('Value') - if not key or not value: - continue - if key == 'ContentSynopsis': - description = value - elif key == 'RefSeriesTitle': - series = value - elif key == 'RefSeriesSeason': - season_number = int_or_none(value) - elif key == 'EpisodeMainTitle': - episode = value - elif key == 'EpisodeNo': - episode_number = int_or_none(value) return { - 'extractor_key': 'Kaltura', - 'id': entry_id, - 'title': title, - 'description': description, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'timestamp': unified_timestamp(media.get('CreationDate')), - 'duration': int_or_none(media.get('Duration')), - 'view_count': int_or_none(media.get('ViewCounter')), - 'like_count': int_or_none(media.get('like_counter')), - 'formats': formats, + 'id': video_id, + # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p + 'formats': traverse_obj(formats, ( + lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)), + 'http_headers': self._API_HEADERS, + 
**traverse_obj(media_info, ('result', 0, { + 'title': ('fullTitle', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('showName', {str}), + 'season_number': ('season', {int_or_none}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('uploadTime', {int_or_none}), + 'release_date': ('telecastDate', {unified_strdate}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('duration', {float_or_none}), + })), } -class VootSeriesIE(InfoExtractor): +class VootSeriesIE(VootBaseIE): _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P\d{3,})' _TESTS = [{ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', From b4a252fba81f53631c07ca40ce7583f5d19a8a36 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 Jun 2023 17:49:12 -0500 Subject: [PATCH 18/74] [jsinterp] Fix division (#7279) * Fixes nsig decryption for Youtube JS player `8c7583ff` Authored by: bashonly --- test/test_jsinterp.py | 7 +++++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4d44e6efe..b01477e6f 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -28,6 +28,13 @@ def test_basic(self): def test_calc(self): self._test('function f(a){return 2*a+1;}', 7, args=[3]) + def test_div(self): + jsi = JSInterpreter('function f(a, b){return a / b;}') + self.assertTrue(math.isnan(jsi.call_function('f', 0, 0))) + self.assertTrue(math.isnan(jsi.call_function('f', JS_Undefined, 1))) + self.assertTrue(math.isinf(jsi.call_function('f', 2, 0))) + self.assertEqual(jsi.call_function('f', 0, 3), 0) + def test_empty_return(self): self._test('function f(){return; y()}', None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 13120d97f..01f09de88 100644 --- a/test/test_youtube_signature.py +++ 
b/test/test_youtube_signature.py @@ -150,6 +150,10 @@ 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w', ), + ( + 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', + '1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 7c7940efd..d6d555733 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -44,7 +44,7 @@ def wrapped(a, b): def _js_div(a, b): - if JS_Undefined in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a or b): return float('nan') return (a or 0) / b if b else float('inf') From f8ae441501596733e2b967430471643a1d7cacb8 Mon Sep 17 00:00:00 2001 From: DataGhost Date: Sun, 11 Jun 2023 17:17:26 +0200 Subject: [PATCH 19/74] [extractor/Dumpert] Fix m3u8 and support new URL pattern (#6091) Authored by: DataGhost, pukkandan Closes #5032 --- yt_dlp/extractor/dumpert.py | 49 +++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) mode change 100644 => 100755 yt_dlp/extractor/dumpert.py diff --git a/yt_dlp/extractor/dumpert.py b/yt_dlp/extractor/dumpert.py old mode 100644 new mode 100755 index 010c2d092..0cf84263c --- a/yt_dlp/extractor/dumpert.py +++ b/yt_dlp/extractor/dumpert.py @@ -1,12 +1,17 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, qualities, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P[0-9]+[/_][0-9a-zA-Z]+)' + _VALID_URL = r'''(?x) + (?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: + /(?:mediabase|embed|item)/| + (?:/toppers|/latest|/?)\?selectedId= + )(?P[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor): 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': 
r're:^https?://.*\.jpg$', + 'duration': 9, + 'view_count': int, + 'like_count': int, } }, { 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', @@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor): }, { 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/item/100031688_b317a185', + 'info_dict': { + 'id': '100031688/b317a185', + 'ext': 'mp4', + 'title': 'Epic schijnbeweging', + 'description': '

Die zag je niet eh

', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'duration': 12, + 'view_count': int, + 'like_count': int, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,18 +66,23 @@ def _real_extract(self, url): title = item['title'] media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') - quality = qualities(['flv', 'mobile', 'tablet', '720p']) + quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p']) formats = [] for variant in media.get('variants', []): uri = variant.get('uri') if not uri: continue version = variant.get('version') - formats.append({ - 'url': uri, - 'format_id': version, - 'quality': quality(version), - }) + preference = quality(version) + if determine_ext(uri) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', m3u8_id=version, quality=preference)) + else: + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': preference, + }) thumbnails = [] stills = item.get('stills') or {} From 1a2eb5bda51d8b7a78a65acebf72a0dcf9da196b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 11 Jun 2023 12:06:34 -0500 Subject: [PATCH 20/74] [extractor/odnoklassniki] Fix formats extraction (#7217) Closes #2959, Closes #4462, Closes #7201 Authored by: bashonly --- yt_dlp/extractor/odnoklassniki.py | 56 ++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 4b73eed37..0d0ad0bb8 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -1,3 +1,5 @@ +import urllib.parse + from .common 
import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -7,6 +9,7 @@ ) from ..utils import ( ExtractorError, + HEADRequest, float_or_none, int_or_none, qualities, @@ -15,6 +18,7 @@ unescapeHTML, unified_strdate, unsmuggle_url, + url_or_none, urlencode_postdata, ) @@ -41,7 +45,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -65,13 +69,14 @@ class OdnoklassnikiIE(InfoExtractor): 'title': str, 'uploader': str, }, + 'skip': 'vk extractor error', }, { - # metadata in JSON + # metadata in JSON, webm_dash with Firefox UA 'url': 'http://ok.ru/video/20079905452', - 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', + 'md5': '8f477d8931c531374a3e36daec617b2c', 'info_dict': { 'id': '20079905452', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Культура меняет нас (прекрасный ролик!))', 'thumbnail': str, 'duration': 100, @@ -81,10 +86,14 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'params': { + 'format': 'bv[ext=webm]', + 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'}, + }, }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': 'f8c951122516af72e6e6ffdd3c41103b', + 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -98,10 +107,11 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': 0, 'start_time': 5, }, + 'params': {'skip_download': 'm3u8'}, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'https://ok.ru/video/3952212382174', - 'md5': '91749d0bd20763a28d083fa335bbd37a', + 'md5': '5fb5f83ce16cb212d6bf887282b5da53', 'info_dict': { 'id': '5axVgHHDBvU', 'ext': 'mp4', @@ 
-116,7 +126,7 @@ class OdnoklassnikiIE(InfoExtractor): 'live_status': 'not_live', 'view_count': int, 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', - 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'uploader_url': 'https://www.youtube.com/@MrKewlkid94', 'channel_follower_count': int, 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', @@ -145,7 +155,6 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { - # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { @@ -153,8 +162,8 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', }, - 'skip': 'HTTP Error 400', }, { 'note': 'subtitles', 'url': 'https://ok.ru/video/4249587550747', @@ -226,6 +235,14 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Site no longer embeds', }] + def _clear_cookies(self, cdn_url): + # Direct http downloads will fail if CDN cookies are set + # so we need to reset them after each format extraction + if self._get_cookies('https://notarealsubdomain.mycdn.me/'): + self.cookiejar.clear(domain='.mycdn.me') + if self._get_cookies(cdn_url): + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + @classmethod def _extract_embed_urls(cls, url, webpage): for x in super()._extract_embed_urls(url, webpage): @@ -364,14 +381,22 @@ def _extract_desktop(self, url): formats = [{ 'url': f['url'], 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] + 'format_id': f.get('name'), + } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))] - m3u8_url = metadata.get('hlsManifestUrl') + 
m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) + + for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]: + mpd_url = metadata.get(mpd_key) + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id=mpd_id, fatal=False)) + self._clear_cookies(mpd_url) dash_manifest = metadata.get('metadataEmbedded') if dash_manifest: @@ -390,6 +415,7 @@ def _extract_desktop(self, url): if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) rtmp_url = metadata.get('rtmpUrl') if rtmp_url: formats.append({ @@ -423,6 +449,10 @@ def _extract_mobile(self, url): r'data-video="(.+?)"', webpage, 'json data') json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + redirect_url = self._request_webpage(HEADRequest( + json_data['videoSrc']), video_id, 'Requesting download URL').geturl() + self._clear_cookies(redirect_url) + return { 'id': video_id, 'title': json_data.get('videoName'), @@ -430,7 +460,7 @@ def _extract_mobile(self, url): 'thumbnail': json_data.get('videoPosterSrc'), 'formats': [{ 'format_id': 'mobile', - 'url': json_data.get('videoSrc'), + 'url': redirect_url, 'ext': 'mp4', }] } From 9d7fde89a40360396f0baa2ee8bf507f92108b32 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 11 Jun 2023 12:15:05 -0500 Subject: [PATCH 21/74] [extractor/zee5] Fix extraction of new content (#7280) Authored by: bashonly --- yt_dlp/extractor/zee5.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index a64eb9ed0..b4734cc8f 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -1,14 +1,16 @@ import json -import 
random -import string +import time +import uuid from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, parse_age_limit, str_or_none, + try_call, try_get, unified_strdate, unified_timestamp, @@ -94,12 +96,12 @@ class Zee5IE(InfoExtractor): 'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973', 'only_matching': True }] - _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' - _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') + _DEVICE_ID = str(uuid.uuid4()) _USER_TOKEN = None _LOGIN_HINT = 'Use "--username " to login using otp or "--username token" and "--password " to login using user token.' _NETRC_MACHINE = 'zee5' _GEO_COUNTRIES = ['IN'] + _USER_COUNTRY = None def _perform_login(self, username, password): if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None: @@ -118,11 +120,16 @@ def _perform_login(self, username, password): self._USER_TOKEN = otp_verify_json.get('token') if not self._USER_TOKEN: raise ExtractorError(otp_request_json['message'], expected=True) - elif username.lower() == 'token' and len(password) > 1198: + elif username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._USER_TOKEN = password else: raise ExtractorError(self._LOGIN_HINT, expected=True) + token = jwt_decode_hs256(self._USER_TOKEN) + if token.get('exp', 0) <= int(time.time()): + raise ExtractorError('User token has expired', expected=True) + self._USER_COUNTRY = token.get('current_country') + def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') access_token_request = self._download_json( @@ -137,8 +144,13 @@ def _real_extract(self, url): data['X-Z5-Guest-Token'] = self._DEVICE_ID 
json_data = self._download_json( - self._DETAIL_API_URL.format(video_id, self._DEVICE_ID), - video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) + 'https://spapi.zee5.com/singlePlayback/getDetails/secure', video_id, query={ + 'content_id': video_id, + 'device_id': self._DEVICE_ID, + 'platform_name': 'desktop_web', + 'country': self._USER_COUNTRY or self.get_param('geo_bypass_country') or 'IN', + 'check_parental_control': False, + }, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) asset_data = json_data['assetDetails'] show_data = json_data.get('showDetails', {}) if 'premium' in asset_data['business_type']: From ab6057ec80aa75db6303b8206916d00c376c622c Mon Sep 17 00:00:00 2001 From: puc9 <51006296+puc9@users.noreply.github.com> Date: Sun, 11 Jun 2023 11:57:59 -0700 Subject: [PATCH 22/74] [extractor/tiktok] Fix resolution extraction (#7237) Authored by: puc9 --- yt_dlp/extractor/tiktok.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 49035e971..9c6d74007 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -218,8 +218,8 @@ def mp3_meta(url): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: - known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height')) - known_resolutions[res].setdefault('width', add_meta.get('width')) + known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) + known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) parsed_meta.update(known_resolutions.get(res, {})) add_meta.setdefault('height', int_or_none(res[:-1])) return [{ From 8790ea7b2536332777bce68590386b1aa935fac7 Mon Sep 17 00:00:00 2001 From: linsui <36977733+linsui@users.noreply.github.com> Date: Mon, 12 Jun 2023 08:02:50 +0000 Subject: [PATCH 23/74] 
[extractor/ximalaya] Sort playlist entries (#7292) Authored by: linsui --- yt_dlp/extractor/ximalaya.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index ff18ba697..3d5e6cf90 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -158,7 +158,7 @@ def _fetch_page(self, playlist_id, page_idx): return self._download_json( 'https://www.ximalaya.com/revision/album/v1/getTracksList', playlist_id, note=f'Downloading tracks list page {page_idx}', - query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data'] + query={'albumId': playlist_id, 'pageNum': page_idx})['data'] def _get_entries(self, page_data): for e in page_data['tracks']: From 345b4c0aedd9d19898ce00d5cef35fe0d277a052 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 12 Jun 2023 14:12:09 -0400 Subject: [PATCH 24/74] [extractor/zaiko] Add extractor (#7254) Closes #7196 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/zaiko.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 yt_dlp/extractor/zaiko.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f54024211..921b7dee9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2441,6 +2441,7 @@ from .youporn import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE +from .zaiko import ZaikoIE from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py new file mode 100644 index 000000000..59fc64c5a --- /dev/null +++ b/yt_dlp/extractor/zaiko.py @@ -0,0 +1,92 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + str_or_none, + traverse_obj, + unescapeHTML, + url_or_none, +) + + +class ZaikoIE(InfoExtractor): + 
_VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P\d+)/stream(?:/\d+)+' + _TESTS = [{ + 'url': 'https://zaiko.io/event/324868/stream/20571/20571', + 'info_dict': { + 'id': '324868', + 'ext': 'mp4', + 'title': 'ZAIKO STREAMING TEST', + 'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)', + 'uploader_id': '454', + 'uploader': 'ZAIKO ZERO', + 'release_timestamp': 1583809200, + 'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+', + 'release_date': '20200310', + 'categories': ['Tech House'], + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _parse_vue_element_attr(self, name, string, video_id): + page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) + attrs = {} + for key, value in extract_attributes(page_elem).items(): + if key.startswith(':'): + attrs[key[1:]] = self._parse_json( + value, video_id, transform_source=unescapeHTML, fatal=False) + return attrs + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + final_url = urlh.geturl() + if 'zaiko.io/login' in final_url: + self.raise_login_required() + elif '/_buy/' in final_url: + raise ExtractorError('Your account does not have tickets to this event', expected=True) + stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id) + + player_page = self._download_webpage( + stream_meta['stream-access']['video_source'], video_id, + 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) + player_meta = self._parse_vue_element_attr('player', player_page, video_id) + status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + live_status, msg, expected = { + 'vod': ('was_live', 'No VOD stream URL was found', False), + 'archiving': ('post_live', 'Event VOD is still being processed', True), + 'deleting': ('post_live', 'This event has ended', True), + 'deleted': ('post_live', 'This event has ended', True), + 
'error': ('post_live', 'This event has ended', True), + 'disconnected': ('post_live', 'Stream has been disconnected', True), + 'live_to_disconnected': ('post_live', 'Stream has been disconnected', True), + 'live': ('is_live', 'No livestream URL found was found', False), + 'waiting': ('is_upcoming', 'Live event has not yet started', True), + 'cancelled': ('not_live', 'Event has been cancelled', True), + }.get(status) or ('not_live', f'Unknown event status "{status}"', False) + + stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( + stream_url, video_id, live=True, fatal=False) if stream_url else [] + if not formats: + self.raise_no_formats(msg, expected=expected) + + return { + 'id': video_id, + 'formats': formats, + 'live_status': live_status, + **traverse_obj(stream_meta, { + 'title': ('event', 'name', {str}), + 'uploader': ('profile', 'name', {str}), + 'uploader_id': ('profile', 'id', {str_or_none}), + 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), + 'categories': ('event', 'genres', ..., {lambda x: x or None}), + }), + **traverse_obj(player_meta, ('initial_event_info', { + 'alt_title': ('title', {str}), + 'thumbnail': ('poster_url', {url_or_none}), + })), + } From cab94a0cd8b6d3fffed5a6faff030274adbed182 Mon Sep 17 00:00:00 2001 From: Cyberes <64224601+Cyberes@users.noreply.github.com> Date: Mon, 12 Jun 2023 21:23:17 -0600 Subject: [PATCH 25/74] [extractor/funker530] Add extractor (#7291) Authored by: Cyberes --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/funker530.py | 79 +++++++++++++++++++++++++++++++++ yt_dlp/extractor/rumble.py | 2 +- 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/funker530.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 921b7dee9..69c7a9e90 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -664,6 +664,7 @@ 
FunimationShowIE, ) from .funk import FunkIE +from .funker530 import Funker530IE from .fusion import FusionIE from .fuyintv import FuyinTVIE from .gab import ( diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py new file mode 100644 index 000000000..ba5ab7d4e --- /dev/null +++ b/yt_dlp/extractor/funker530.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', 
+ 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)) + } diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 98f660f8b..82f3f0f8c 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - r'