1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-06-29 00:52:21 +02:00

Compare commits

...

3 Commits

Author SHA1 Message Date
kylegustavo
ee28622f07
Merge 06d5041f74 into f3411af12e 2024-06-25 21:54:09 +02:00
megumin
f3411af12e
[ie/matchtv] Fix extractor (#10190)
Authored by: megumintyan
2024-06-25 00:49:09 +02:00
Kyle Gonsalves
06d5041f74 CNNArticleIE Extractor: Update to recognize modern video links
CNNArticleIE currently matches with URLs but fails to extract video.
Update CNNArticleIE to work with the way most CNN video
links are embedded, and update the tests to include some of these
links. The type of URL for this can just use the default extractor.
Also update the regex to capture a date or text before
the /video/ subcategory, as many video links are structured
this way. Remove the old test, whose article no longer has
any available media.

Example URLs:

Landing pages:
https://www.cnn.com/videos
https://www.cnn.com/videos/
https://edition.cnn.com/videos

Specific Videos:
https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl
https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid
https://www.cnn.com/2024/06/11/style/video/king-charles-portrait-vandalized-activists-foster-intl-digvid
2024-06-14 17:54:53 -07:00
3 changed files with 46 additions and 63 deletions

View File

@ -397,7 +397,6 @@
from .cnn import (
CNNIE,
CNNArticleIE,
CNNBlogsIE,
CNNIndonesiaIE,
)
from .comedycentral import (

View File

@ -99,48 +99,48 @@ def _real_extract(self, url):
})
class CNNBlogsIE(InfoExtractor):
    """Extractor for pages hosted on ``<name>.blogs.cnn.com``.

    The blog page itself is only scanned for a ``data-url`` attribute; the
    value found there is handed over to ``CNNIE`` for the real extraction.
    """

    _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
    _TEST = {
        'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
        'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
        'info_dict': {
            'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
            'ext': 'mp4',
            'title': 'Criminalizing journalism?',
            'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
            'upload_date': '20140209',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
        'add_ie': ['CNN'],
    }

    def _real_extract(self, url):
        """Return a URL result pointing at the video referenced by the blog page.

        The page is downloaded using the URL's basename as its identifier,
        the first ``data-url="..."`` value is pulled out of the markup, and
        that value is delegated to the ``CNNIE`` extractor.
        """
        page_html = self._download_webpage(url, url_basename(url))
        embedded_url = self._html_search_regex(
            r'data-url="(.+?)"', page_html, 'cnn url')
        return self.url_result(embedded_url, CNNIE.ie_key())
class CNNArticleIE(InfoExtractor):
    """Extractor for CNN article pages that embed a video.

    Matches ``cnn.com`` URLs (optionally on the ``edition.`` or ``www.``
    host) whose path does not start with ``video/`` or ``videos/``.
    """

    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
    _TESTS = [{
        'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
        'info_dict': {
            'id': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k',
            'ext': 'mp4',
            'direct': True,
            'title': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k',
            'timestamp': 1717148749.0,
            'upload_date': '20240531',
        },
    }, {
        'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
        'info_dict': {
            'id': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k',
            'ext': 'mp4',
            'direct': True,
            'title': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k',
            'timestamp': 1718158370.0,
            'upload_date': '20240612',
        },
    }, {
        'url': 'https://www.cnn.com/2024/06/11/style/video/king-charles-portrait-vandalized-activists-foster-intl-digvid',
        'info_dict': {
            'id': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k',
            'ext': 'mp4',
            'direct': True,
            'title': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k',
            'timestamp': 1718116155.0,
            'upload_date': '20240611',
        },
    }]

    def _real_extract(self, url):
        """Resolve an article page to the video it embeds.

        The page is downloaded and searched for the ``contentUrl`` that
        directly follows a ``"@type":"VideoObject"`` marker. When one is
        found it is delegated to the ``Generic`` extractor; otherwise the
        original URL is passed on to ``CNNIE``.
        """
        webpage = self._download_webpage(url, url_basename(url))
        # default=None makes a missing match yield None instead of raising,
        # which is what allows the CNNIE fallback below to be reached at all.
        content_url = self._search_regex(
            r'"@type":"VideoObject","contentUrl":"(.*?)"',
            webpage, 'content URL', default=None)
        if content_url:
            return self.url_result(content_url, 'Generic')
        # NOTE(review): this forwards the article URL unchanged; confirm that
        # CNNIE accepts URLs of this shape.
        return self.url_result(url, CNNIE.ie_key())
class CNNIndonesiaIE(InfoExtractor):

View File

@ -1,51 +1,35 @@
import random
from .common import InfoExtractor
from ..utils import xpath_text
class MatchTVIE(InfoExtractor):
    """Extractor for the Match TV live channel.

    Accepts either the ``matchtv.ru/on-air`` page or the
    ``video.matchtv.ru/iframe/channel/106`` player iframe.
    """

    _VALID_URL = [
        r'https?://matchtv\.ru/on-air/?(?:$|[?#])',
        r'https?://video\.matchtv\.ru/iframe/channel/106/?(?:$|[?#])',
    ]
    _TESTS = [{
        'url': 'http://matchtv.ru/on-air/',
        'info_dict': {
            'id': 'matchtv-live',
            'ext': 'mp4',
            'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'live_status': 'is_live',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://video.matchtv.ru/iframe/channel/106',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the live stream.

        The matched ``url`` is not consulted: the player iframe page is
        always the one downloaded. Its ``data-config`` attribute supplies a
        config URL, which is turned into an HLS playlist URL.
        """
        # Fixed identifier: there is a single live channel.
        video_id = 'matchtv-live'
        webpage = self._download_webpage(
            'https://video.matchtv.ru/iframe/channel/106', video_id)
        config_url = self._html_search_regex(
            r'data-config="config=(https?://[^?"]+)[?"]', webpage, 'video URL')
        # Swap the /feed/ path segment for /media/ and append the playlist
        # suffix to obtain the m3u8 URL.
        video_url = config_url.replace('/feed/', '/media/') + '.m3u8'
        return {
            'id': video_id,
            'title': 'Матч ТВ - Прямой эфир',
            'is_live': True,
            'formats': self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True),
        }