Merge 06d5041f74 into f3411af12e

[ie/matchtv] Fix extractor (#10190)
Authored by: megumintyan
2024-06-29 00:52:21 +02:00 · 2024-06-25 21:54:09 +02:00 · 2024-06-25 00:49:09 +02:00 · 2024-06-14 17:54:53 -07:00
3 changed files with 46 additions and 63 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -397,7 +397,6 @@
 from .cnn import (
    CNNIE,
    CNNArticleIE,
-    CNNBlogsIE,
    CNNIndonesiaIE,
 )
 from .comedycentral import (
--- a/yt_dlp/extractor/cnn.py
+++ b/yt_dlp/extractor/cnn.py
@@ -99,48 +99,48 @@ def _real_extract(self, url):
            })


-class CNNBlogsIE(InfoExtractor):
-    _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
-    _TEST = {
-        'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
-        'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
-        'info_dict': {
-            'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
-            'ext': 'mp4',
-            'title': 'Criminalizing journalism?',
-            'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
-            'upload_date': '20140209',
-        },
-        'expected_warnings': ['Failed to download m3u8 information'],
-        'add_ie': ['CNN'],
-    }
-
-    def _real_extract(self, url):
-        webpage = self._download_webpage(url, url_basename(url))
-        cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
-        return self.url_result(cnn_url, CNNIE.ie_key())
-
-
 class CNNArticleIE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
-    _TEST = {
-        'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
-        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
+
+    _TESTS = [{
+        'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
        'info_dict': {
-            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
+            'id': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k',
            'ext': 'mp4',
-            'title': 'Obama: Cyberattack not an act of war',
-            'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
-            'upload_date': '20141221',
+            'direct': True,
+            'title': 'jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl-1553374-1920x1080_8000k',
+            'timestamp': 1717148749.0,
+            'upload_date': '20240531',
        },
-        'expected_warnings': ['Failed to download m3u8 information'],
-        'add_ie': ['CNN'],
-    }
+    }, {
+        'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
+        'info_dict': {
+            'id': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k',
+            'ext': 'mp4',
+            'direct': True,
+            'title': 'inmates-vote-jail-nevada-murray-dnt-ac360-digvid-1563291-1920x1080_8000k',
+            'timestamp': 1718158370.0,
+            'upload_date': '20240612',
+        },
+    }, {
+        'url': 'https://www.cnn.com/2024/06/11/style/video/king-charles-portrait-vandalized-activists-foster-intl-digvid',
+        'info_dict': {
+            'id': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k',
+            'ext': 'mp4',
+            'direct': True,
+            'title': 'king-charles-portrait-vandalized-activists-foster-intl-digvid-1562674-1920x1080_8000k',
+            'timestamp': 1718116155.0,
+            'upload_date': '20240611',
+        },
+    }]

    def _real_extract(self, url):
        webpage = self._download_webpage(url, url_basename(url))
-        cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
-        return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
+        cnn_url = self._search_regex(r'"@type":"VideoObject","contentUrl":"(.*?)"', webpage, 'content URL')
+        if (cnn_url):
+            return self.url_result(cnn_url, 'Generic')
+        else:
+            return self.url_result(url, CNNIE.ie_key())


 class CNNIndonesiaIE(InfoExtractor):
--- a/yt_dlp/extractor/matchtv.py
+++ b/yt_dlp/extractor/matchtv.py
@@ -1,51 +1,35 @@
-import random
-
 from .common import InfoExtractor
-from ..utils import xpath_text


 class MatchTVIE(InfoExtractor):
-    _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+    _VALID_URL = [
+        r'https?://matchtv\.ru/on-air/?(?:$|[?#])',
+        r'https?://video\.matchtv\.ru/iframe/channel/106/?(?:$|[?#])',
+    ]
    _TESTS = [{
-        'url': 'http://matchtv.ru/#live-player',
+        'url': 'http://matchtv.ru/on-air/',
        'info_dict': {
            'id': 'matchtv-live',
-            'ext': 'flv',
+            'ext': 'mp4',
            'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
-            'is_live': True,
+            'live_status': 'is_live',
        },
        'params': {
            'skip_download': True,
        },
    }, {
-        'url': 'http://matchtv.ru/on-air/',
+        'url': 'https://video.matchtv.ru/iframe/channel/106',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = 'matchtv-live'
-        video_url = self._download_json(
-            'http://player.matchtv.ntvplus.tv/player/smil', video_id,
-            query={
-                'ts': '',
-                'quality': 'SD',
-                'contentId': '561d2c0df7159b37178b4567',
-                'sign': '',
-                'includeHighlights': '0',
-                'userId': '',
-                'sessionId': random.randint(1, 1000000000),
-                'contentType': 'channel',
-                'timeShift': '0',
-                'platform': 'portal',
-            },
-            headers={
-                'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
-            })['data']['videoUrl']
-        f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
-        formats = self._extract_f4m_formats(f4m_url, video_id)
+        webpage = self._download_webpage('https://video.matchtv.ru/iframe/channel/106', video_id)
+        video_url = self._html_search_regex(
+            r'data-config="config=(https?://[^?"]+)[?"]', webpage, 'video URL').replace('/feed/', '/media/') + '.m3u8'
        return {
            'id': video_id,
            'title': 'Матч ТВ - Прямой эфир',
            'is_live': True,
-            'formats': formats,
+            'formats': self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True),
        }