From a00d781b730b052d8a6486a03854ca4122389af8 Mon Sep 17 00:00:00 2001 From: Felix S Date: Sun, 18 Apr 2021 02:09:48 +0200 Subject: [PATCH] [elonet] Use common code for subtitle extraction --- yt_dlp/extractor/elonet.py | 82 ++++++++------------------------------ 1 file changed, 17 insertions(+), 65 deletions(-) diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py index 3647c0a9c..eefba4e24 100644 --- a/yt_dlp/extractor/elonet.py +++ b/yt_dlp/extractor/elonet.py @@ -1,9 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re -import tempfile from .common import InfoExtractor from ..utils import ( @@ -12,12 +10,12 @@ try_get, ) from ..compat import compat_str -from ..downloader.hls import HlsFD class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ + # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { @@ -27,62 +25,17 @@ class ElonetIE(InfoExtractor): 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', }, - } - - def _download_m3u8_chunked_subtitle(self, chunklist_url): - """ - Download VTT subtitles from pieces in manifest URL. - Return a string containing joined chunks with extra headers removed. 
- """ - with tempfile.NamedTemporaryFile(delete=True) as outfile: - fname = outfile.name - hlsdl = HlsFD(self._downloader, {}) - hlsdl.download(compat_str(fname), {"url": chunklist_url}) - with open(fname, 'r') as fin: - # Remove (some) headers - fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read()) - os.remove(fname) - return "WEBVTT\n\n" + fdata - - def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url): - """ - Parse subtitles from HLS / m3u8 manifest. - """ - subtitles = {} - baseurl = m3u8_url[:m3u8_url.rindex('/') + 1] - for line in m3u8_doc.split('\n'): - if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line: - lang = self._search_regex( - r'LANGUAGE="(.+?)"', line, 'lang', default=False) - uri = self._search_regex( - r'URI="(.+?)"', line, 'uri', default=False) - if lang and uri: - data = self._download_m3u8_chunked_subtitle(baseurl + uri) - subtitles[lang] = [{'ext': 'vtt', 'data': data}] - return subtitles - - def _parse_mpd_subtitles(self, mpd_doc): - """ - Parse subtitles from MPD manifest. - """ - ns = '{urn:mpeg:dash:schema:mpd:2011}' - subtitles = {} - for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)): - lang = aset.attrib.get('lang', 'unk') - url = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns)).text - subtitles[lang] = [{'ext': 'vtt', 'url': url}] - return subtitles - - def _get_subtitles(self, fmt, doc, url): - if fmt == 'm3u8': - subs = self._parse_m3u8_subtitles(doc, url) - elif fmt == 'mpd': - subs = self._parse_mpd_subtitles(doc) - else: - self.report_warning( - "Cannot download subtitles from '%s' streams." % (fmt)) - subs = {} - return subs + }, { + # DASH with subtitles + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', + 'info_dict': { + 'id': '116539', + 'ext': 'mp4', + 'title': 'Minulla on tiikeri', + 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. 
Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', + 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', + } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -101,8 +54,8 @@ def _real_extract(self, url): self._parse_json(json_s, video_id), lambda x: x[0]["src"], compat_str) formats = [] + subtitles = {} if re.search(r'\.m3u8\??', src): - fmt = 'm3u8' res = self._download_webpage_handle( # elonet servers have certificate problems src.replace('https:', 'http:'), video_id, @@ -111,11 +64,10 @@ def _real_extract(self, url): if res: doc, urlh = res url = urlh.geturl() - formats = self._parse_m3u8_formats(doc, url) + formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) for f in formats: f['ext'] = 'mp4' elif re.search(r'\.mpd\??', src): - fmt = 'mpd' res = self._download_xml_handle( src, video_id, note='Downloading MPD manifest', @@ -123,7 +75,7 @@ def _real_extract(self, url): if res: doc, urlh = res url = base_url(urlh.geturl()) - formats = self._parse_mpd_formats(doc, mpd_base_url=url) + formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) else: raise ExtractorError("Unknown streaming format") @@ -133,5 +85,5 @@ def _real_extract(self, url): 'description': description, 'thumbnail': thumbnail, 'formats': formats, - 'subtitles': self.extract_subtitles(fmt, doc, url), + 'subtitles': subtitles, }