[heise] Fix description, thumbnail and format ID

2024-11-02 09:12:40 +01:00 · 2014-11-04 23:14:16 +01:00 · 2014-11-04 23:14:16 +01:00 · 711ede6e1b
commit 711ede6e1b
parent a32f253112
2 changed files with 21 additions and 14 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -404,7 +404,7 @@ def playlist_result(entries, playlist_id=None, playlist_title=None):
            video_info['title'] = playlist_title
        return video_info

-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
@@ -425,8 +425,11 @@ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True,
            _name = name

        if mobj:
-            # return the first matching group
-            return next(g for g in mobj.groups() if g is not None)
+            if group is None:
+                # return the first matching group
+                return next(g for g in mobj.groups() if g is not None)
+            else:
+                return mobj.group(group)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
@@ -436,11 +439,11 @@ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True,
                'please report this issue on http://yt-dl.org/bug' % _name)
            return None

-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
-        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
@@ -534,9 +537,9 @@ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
-                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=fatal, **kwargs)
+                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@@ -3,7 +3,7 @@

 from .common import InfoExtractor
 from ..utils import (
-    get_meta_content,
+    determine_ext,
    int_or_none,
    parse_iso8601,
 )
@@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor):
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
            ),
-            'format_id': 'mp4_720',
+            'format_id': 'mp4_720p',
            'timestamp': 1411812600,
            'upload_date': '20140927',
            'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
-            'thumbnail': 're:https?://.*\.jpg$',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
        }
    }

@@ -49,11 +49,12 @@ def _real_extract(self, url):
        info = {
            'id': video_id,
            'thumbnail': self._og_search_thumbnail(webpage),
-            'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+            'timestamp': parse_iso8601(
+                self._html_search_meta('date', webpage)),
            'description': self._og_search_description(webpage),
        }

-        title = get_meta_content('fulltitle', webpage)
+        title = self._html_search_meta('fulltitle', webpage)
        if title:
            info['title'] = title
        else:
@@ -64,9 +65,12 @@ def _real_extract(self, url):
            label = source_node.attrib['label']
            height = int_or_none(self._search_regex(
                r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+            video_url = source_node.attrib['file']
+            ext = determine_ext(video_url, '')
            formats.append({
-                'url': source_node.attrib['file'],
+                'url': video_url,
                'format_note': label,
+                'format_id': '%s_%s' % (ext, label),
                'height': height,
            })
        self._sort_formats(formats)