[mdr] Add support for modern URLs (Fixes #2775)

2024-11-07 19:52:40 +01:00 · 2014-04-21 06:25:21 +02:00 · 2014-04-21 06:25:21 +02:00 · e8f2025edf
commit e8f2025edf
parent a4eb9578af
1 changed files with 11 additions and 8 deletions
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@ -1,3 +1,5 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
@ -7,9 +9,13 @@
 class MDRIE(InfoExtractor):
-    _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*'
+    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
    # No tests, MDR regularly deletes its videos
    _TEST = {
        'url': 'http://www.mdr.de/fakt/video189002.html',
        'only_matching': True,
    }
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
@ -19,9 +25,9 @@ def _real_extract(self, url):
        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title')
+        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
        xmlurl = self._search_regex(
-            r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL')
+            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
        doc = self._download_xml(domain + xmlurl, video_id)
        formats = []
@ -41,7 +47,7 @@ def _real_extract(self, url):
            if vbr_el is None:
                format.update({
                    'vcodec': 'none',
-                    'format_id': u'%s-%d' % (media_type, abr),
+                    'format_id': '%s-%d' % (media_type, abr),
                })
            else:
                vbr = int(vbr_el.text) // 1000
@ -49,12 +55,9 @@ def _real_extract(self, url):
                    'vbr': vbr,
                    'width': int(a.find('frameWidth').text),
                    'height': int(a.find('frameHeight').text),
-                    'format_id': u'%s-%d' % (media_type, vbr),
+                    'format_id': '%s-%d' % (media_type, vbr),
                })
            formats.append(format)
        if not formats:
            raise ExtractorError(u'Could not find any valid formats')
        self._sort_formats(formats)
        return {