From 06d5041f7415980061acb7f71778dfcdb4fb5466 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Thu, 13 Jun 2024 16:15:05 -0700 Subject: [PATCH] CNNArticleIE Extractor: Update to recognize modern video links CNNArticleIE currently matches these URLs but fails to extract the video. Update CNNArticleIE to work with the way most CNN video links are now embedded, and update the tests to include some of these links. The content URL found on the page can simply be handed to the generic extractor. Also update the regex to capture a date or text before the /video/ subcategory, as many video links are structured this way. Remove the old test, whose article no longer has any media available. Example URLs: Landing pages: https://www.cnn.com/videos https://www.cnn.com/videos/ https://edition.cnn.com/videos Specific Videos: https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid https://www.cnn.com/2024/06/11/style/video/king-charles-portrait-vandalized-activists-foster-intl-digvid --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/cnn.py | 68 ++++++++++++++++----------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0f599c9db..88022bf44 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -397,7 +397,6 @@ from .cnn import ( CNNIE, CNNArticleIE, - CNNBlogsIE, CNNIndonesiaIE, ) from .comedycentral import ( diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py index fe7615a89..b7fbc6f05 100644 --- a/yt_dlp/extractor/cnn.py +++ b/yt_dlp/extractor/cnn.py @@ -99,48 +99,48 @@ def _real_extract(self, url): }) -class CNNBlogsIE(InfoExtractor): - _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' - _TEST = { - 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', - 'md5': 
'3e56f97b0b6ffb4b79f4ea0749551084', - 'info_dict': { - 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', - 'ext': 'mp4', - 'title': 'Criminalizing journalism?', - 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', - 'upload_date': '20140209', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return self.url_result(cnn_url, CNNIE.ie_key()) - - class CNNArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' - _TEST = { - 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', - 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', + + _TESTS = [{ + 'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl', 'info_dict': { - 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', + 'id': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k', 'ext': 'mp4', - 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', - 'upload_date': '20141221', + 'direct': True, + 'title': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k', + 'timestamp': 1717148749.0, + 'upload_date': '20240531', }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } + }, { + 'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid', + 'info_dict': { + 'id': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k', + 'ext': 'mp4', + 'direct': True, + 'title': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k', + 
'timestamp': 1718158370.0, + 'upload_date': '20240612', + }, + }, { + 'url': 'https://www.cnn.com/2024/06/11/style/video/king-charles-portrait-vandalized-activists-foster-intl-digvid', + 'info_dict': { + 'id': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k', + 'ext': 'mp4', + 'direct': True, + 'title': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k', + 'timestamp': 1718116155.0, + 'upload_date': '20240611', + }, + }] def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) + cnn_url = self._search_regex(r'"@type":"VideoObject","contentUrl":"(.*?)"', webpage, 'content URL') + if (cnn_url): + return self.url_result(cnn_url, 'Generic') + else: + return self.url_result(url, CNNIE.ie_key()) class CNNIndonesiaIE(InfoExtractor):