From 8dc831f7150bcd2cd07629fb41764778b85a4455 Mon Sep 17 00:00:00 2001
From: Ashish Gupta <39122144+Ashish0804@users.noreply.github.com>
Date: Sat, 25 Sep 2021 16:55:33 +0530
Subject: [PATCH] [LinkedInLearning] Add subtitles (#1077)

Authored by: Ashish0804
Closes #1072
---
 yt_dlp/extractor/linkedin.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index 6d54d638a..f47d59a38 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+from itertools import zip_longest
 import re
 
 from .common import InfoExtractor
@@ -8,6 +9,8 @@
     ExtractorError,
     float_or_none,
     int_or_none,
+    srt_subtitles_timecode,
+    try_get,
     urlencode_postdata,
     urljoin,
 )
@@ -86,6 +89,23 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
         },
     }
 
+    def json2srt(self, transcript_lines, duration=None):
+        """Build SRT subtitle text from a list of transcript line dicts.
+
+        Each cue starts at its 'transcriptStartAt' / 1000 (presumably ms -> s)
+        and ends where the next cue starts; the last cue ends at `duration`
+        when that is truthy, otherwise one second after its own start.
+        """
+        srt_data = ''
+        for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])):
+            start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption']
+            end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1
+            # A blank line must separate consecutive SRT cues
+            srt_data += '%d\n%s --> %s\n%s\n\n' % (
+                line + 1, srt_subtitles_timecode(start_time),
+                srt_subtitles_timecode(end_time), caption)
+        return srt_data
+
     def _real_extract(self, url):
         course_slug, video_slug = self._match_valid_url(url).groups()
 
@@ -101,6 +121,7 @@ def _real_extract(self, url):
                     formats.append({
                         'format_id': 'progressive-%dp' % height,
                         'url': progressive_url,
+                        'ext': 'mp4',
                         'height': height,
                         'width': width,
                         'source_preference': 1,
@@ -128,6 +149,14 @@ def _real_extract(self, url):
         # However, unless someone can confirm this, the old
         # behaviour is being kept as-is
         self._sort_formats(formats, ('res', 'source_preference'))
+        subtitles = {}
+        duration = int_or_none(video_data.get('durationInSeconds'))
+        transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list)
+        if transcript_lines:
+            subtitles['en'] = [{
+                'ext': 'srt',
+                'data': self.json2srt(transcript_lines, duration)
+            }]
 
         return {
             'id': self._get_video_id(video_data, course_slug, video_slug),
@@ -135,7 +164,8 @@ def _real_extract(self, url):
             'formats': formats,
             'thumbnail': video_data.get('defaultThumbnail'),
             'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
-            'duration': int_or_none(video_data.get('durationInSeconds')),
+            'duration': duration,
+            'subtitles': subtitles,
         }
 
 