From 2e5d60b7db7020b726cd54ee4cad8f2afbd1479d Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 21 Feb 2013 20:51:35 +0100 Subject: [PATCH 01/14] Removed conversion from youtube closed caption format to srt since youtube api supports the 'srt' format --- test/test_youtube_subtitles.py | 4 ++-- youtube_dl/InfoExtractors.py | 24 ++++-------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 5d3566a35..ff09ea459 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -43,7 +43,7 @@ def test_youtube_subtitles(self): DL.params['writesubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - self.assertEqual(md5(info_dict[0]['subtitles']), 'c3228550d59116f3c29fba370b55d033') + self.assertEqual(md5(info_dict[0]['subtitles']), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_it(self): DL = FakeDownloader() @@ -51,7 +51,7 @@ def test_youtube_subtitles_it(self): DL.params['subtitleslang'] = 'it' IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - self.assertEqual(md5(info_dict[0]['subtitles']), '132a88a0daf8e1520f393eb58f1f646a') + self.assertEqual(md5(info_dict[0]['subtitles']), '164a51f16f260476a05b50fe4c2f161d') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d3c3ac264..e3998fbe8 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -228,23 +228,6 @@ def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - def _extract_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) @@ -268,15 +251,16 @@ def _extract_subtitles(self, video_id): 'lang': srt_lang, 'name': srt_lang_list[srt_lang].encode('utf-8'), 'v': video_id, + 'fmt': 'srt', }) url = 'http://www.youtube.com/api/timedtext?' + params try: - srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8') + srt = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: + if not srt: return (u'WARNING: Did not fetch video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) + return (None, srt) def _print_formats(self, formats): print('Available formats:') From cdb130b09a16865b81fd34d19b74fa634d45cad7 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 21 Feb 2013 22:12:36 +0100 Subject: [PATCH 02/14] Added new option '--only-srt' to download only the subtitles of a video Improved option '--srt-lang' - it shows the argument in case of missing subtitles - added language suffix for non-english languages (e.g. video.it.srt) --- test/test_youtube_subtitles.py | 7 +++++++ youtube_dl/FileDownloader.py | 5 +++++ youtube_dl/InfoExtractors.py | 7 ++++++- youtube_dl/__init__.py | 4 ++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index ff09ea459..77c275b75 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -53,5 +53,12 @@ def test_youtube_subtitles_it(self): info_dict = IE.extract('QRS8MkLhQmM') self.assertEqual(md5(info_dict[0]['subtitles']), '164a51f16f260476a05b50fe4c2f161d') + def test_youtube_onlysubtitles(self): + DL = FakeDownloader() + DL.params['onlysubtitles'] = True + IE = YoutubeIE(DL) + info_dict = IE.extract('QRS8MkLhQmM') + self.assertEqual(md5(info_dict[0]['subtitles']), '4cd9278a35ba2305f47354ee13472260') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 53c2d1dce..487c9dadb 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -79,6 +79,7 @@ class FileDownloader(object): writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file writesubtitles: Write the video subtitles to a .srt file + onlysubtitles: Downloads only the subtitles of the video subtitleslang: Language of the subtitles to download test: Download only first bytes to test the downloader. keepvideo: Keep the video file after post-processing @@ -443,9 +444,13 @@ def process_info(self, info_dict): # that way it will silently go on when used with unsupporting IE try: srtfn = filename.rsplit('.', 1)[0] + u'.srt' + if self.params.get('subtitleslang', False): + srtfn = filename.rsplit('.', 1)[0] + u'.' + self.params['subtitleslang'] + u'.srt' self.report_writesubtitles(srtfn) with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: srtfile.write(info_dict['subtitles']) + if self.params.get('onlysubtitles', False): + return except (OSError, IOError): self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) return diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index e3998fbe8..51b263383 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -228,6 +228,7 @@ def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _extract_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) @@ -246,7 +247,7 @@ def _extract_subtitles(self, video_id): else: srt_lang = list(srt_lang_list.keys())[0] if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) + return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None) params = compat_urllib_parse.urlencode({ 'lang': srt_lang, 'name': srt_lang_list[srt_lang].encode('utf-8'), @@ -483,6 +484,10 @@ def _real_extract(self, url): # closed captions video_subtitles = None + if self._downloader.params.get('subtitleslang', False): + self._downloader.params['writesubtitles'] = True + if self._downloader.params.get('onlysubtitles', False): + self._downloader.params['writesubtitles'] = True if self._downloader.params.get('writesubtitles', False): (srt_error, video_subtitles) = self._extract_subtitles(video_id) if srt_error: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 23e3c2ac2..ababeac87 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -176,6 +176,9 @@ def _find_term_columns(): video_format.add_option('--write-srt', action='store_true', dest='writesubtitles', help='write video closed captions to a .srt file (currently youtube only)', default=False) + video_format.add_option('--only-srt', + action='store_true', dest='onlysubtitles', + help='downloads only the subtitles of the video (currently youtube only)', default=False) video_format.add_option('--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the closed captions to download (optional) use IETF language tags like \'en\'') @@ -450,6 +453,7 @@ def _real_main(): 'writedescription': opts.writedescription, 'writeinfojson': opts.writeinfojson, 'writesubtitles': opts.writesubtitles, + 'onlysubtitles': opts.onlysubtitles, 'subtitleslang': opts.subtitleslang, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), From 47dcd621c0ee23018ba306ee7f6ba6e338ef06da Mon Sep 17 00:00:00 2001 From: Tim Douglas Date: Wed, 6 Mar 2013 12:46:45 -0500 Subject: [PATCH 03/14] Escapist videos are acutally .mp4, not .flv --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 7ce84fe79..6b03bf307 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -2557,7 +2557,7 @@ def _real_extract(self, url): 'uploader': showName, 'upload_date': None, 'title': showName, - 'ext': 'flv', + 'ext': 'mp4', 'thumbnail': imgUrl, 'description': description, 'player_url': playerUrl, From e5edd51de458d52f3824e6d8fc7c0713659694a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 8 Mar 2013 20:12:05 +0100 Subject: [PATCH 04/14] Clear up error messages (#734) --- youtube_dl/FileDownloader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 57f741c30..3b2adf84b 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -372,8 +372,11 @@ def prepare_filename(self, info_dict): filename = self.params['outtmpl'] % template_dict return filename - except (ValueError, KeyError) as err: - self.trouble(u'ERROR: invalid system charset or erroneous output template') + except KeyError as err: + self.trouble(u'ERROR: Erroneous output template') + return None + except ValueError as err: + self.trouble(u'ERROR: Insufficient system charset ' + repr(preferredencoding())) return None def _match_entry(self, info_dict): From 3d3423574d35a0fe71062f21dd57ada02a5225b4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 8 Mar 2013 20:47:06 +0100 Subject: [PATCH 05/14] Fix Unicode handling GenericIE (Fixes #734) --- test/tests.json | 6 ++++++ youtube_dl/InfoExtractors.py | 10 +++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test/tests.json b/test/tests.json index 7af3c2892..e4ea0b41e 100644 --- a/test/tests.json +++ b/test/tests.json @@ -293,5 +293,11 @@ "info_dict": { "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" } + }, + { + "name": "Generic", + "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", + "file": "13601338388002.mp4", + "md5": "85b90ccc9d73b4acd9138d3af4c27f89" } ] diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6b03bf307..baba4a9a2 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1281,7 +1281,8 @@ def __init__(self, downloader=None): def report_download_webpage(self, video_id): """Report webpage download.""" - self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') + if not self._downloader.params.get('test', False): + self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) def report_extraction(self, video_id): @@ -1351,13 +1352,8 @@ def _real_extract(self, url): if self._test_redirect(url): return video_id = url.split('/')[-1] - request = compat_urllib_request.Request(url) try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(url, video_id) except ValueError as err: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here From 8c42c506cdaab6f8e1cc65a2f3f2f756188a68fe Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 12 Mar 2013 00:10:05 +0100 Subject: [PATCH 06/14] Add configuration to -v output --- youtube_dl/__init__.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3983e2f0e..8a7aab7ac 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -274,12 +274,20 @@ def _find_term_columns(): xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: - userConf = os.path.join(xdg_config_home, 'youtube-dl.conf') + userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') else: - userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') - argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:] + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') + systemConf = _readOptions('/etc/youtube-dl.conf') + userConf = _readOptions(userConfFile) + commandLineConf = sys.argv[1:] + argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) + if opts.verbose: + print(u'[debug] System config: ' + repr(systemConf)) + print(u'[debug] User config: ' + repr(userConf)) + print(u'[debug] Command-line args: ' + repr(commandLineConf)) + return parser, opts, args def _real_main(): From e32b06e977447f6be78c02c66f35f609f81331ce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 12 Mar 2013 01:08:54 +0100 Subject: [PATCH 07/14] Spiegel IE --- test/tests.json | 9 ++++++++ youtube_dl/InfoExtractors.py | 43 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/test/tests.json b/test/tests.json index e4ea0b41e..fd9d33332 100644 --- a/test/tests.json +++ b/test/tests.json @@ -299,5 +299,14 @@ "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", "file": "13601338388002.mp4", "md5": "85b90ccc9d73b4acd9138d3af4c27f89" + }, + { + "name": "Spiegel", + "url": "http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html", + "file": "1259285.mp4", + "md5": "2c2754212136f35fb4b19767d242f66e", + "info_dict": { + "title": "Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv" + } } ] diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index baba4a9a2..44b4c4376 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -126,8 +126,14 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): """ Returns the data of the page as a string """ urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' webpage_bytes = urlh.read() - return webpage_bytes.decode('utf-8', 'replace') + return webpage_bytes.decode(encoding, 'replace') class YoutubeIE(InfoExtractor): @@ -4090,6 +4096,40 @@ def _real_extract(self, url): } return [info] +class SpiegelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:\.html)?$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'
(.*?)
', webpage) + if not m: + raise ExtractorError(u'Cannot find title') + video_title = unescapeHTML(m.group(1)) + + xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_code = self._download_webpage(xml_url, video_id, + note=u'Downloading XML', errnote=u'Failed to download XML') + + idoc = xml.etree.ElementTree.fromstring(xml_code) + last_type = idoc[-1] + filename = last_type.findall('./filename')[0].text + duration = float(last_type.findall('./duration')[0].text) + + video_url = 'http://video2.spiegel.de/flash/' + filename + video_ext = filename.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': video_title, + 'duration': duration, + } + return [info] + + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4138,6 +4178,7 @@ def gen_extractors(): KeekIE(), TEDIE(), MySpassIE(), + SpiegelIE(), GenericIE() ] From ae608b8076497d70e2a95e5e939c1fb31e2dde53 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Fri, 22 Feb 2013 02:52:55 +0100 Subject: [PATCH 08/14] Added new option '--all-srt' to download all the subtitles of a video. Only works in youtube for the moment. --- test/parameters.json | 6 ++- test/test_youtube_subtitles.py | 31 ++++++++++++--- youtube_dl/FileDownloader.py | 28 ++++++++++--- youtube_dl/InfoExtractors.py | 73 ++++++++++++++++++++++++---------- youtube_dl/__init__.py | 4 ++ 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index 8215d25c5..0d4bd644c 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -36,5 +36,7 @@ "verbose": true, "writedescription": false, "writeinfojson": true, - "writesubtitles": false -} \ No newline at end of file + "writesubtitles": false, + "onlysubtitles": false, + "allsubtitles": false +} diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 77c275b75..3b5a53fca 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -38,27 +38,48 @@ def download(self, x): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): + def setUp(self): + DL = FakeDownloader() + DL.params['allsubtitles'] = False + DL.params['writesubtitles'] = False + + def test_youtube_no_subtitles(self): + DL = FakeDownloader() + DL.params['writesubtitles'] = False + IE = YoutubeIE(DL) + info_dict = IE.extract('QRS8MkLhQmM') + subtitles = info_dict[0]['subtitles'] + self.assertEqual(subtitles, None) def test_youtube_subtitles(self): DL = FakeDownloader() DL.params['writesubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - self.assertEqual(md5(info_dict[0]['subtitles']), '4cd9278a35ba2305f47354ee13472260') - + sub = info_dict[0]['subtitles'][0] + self.assertEqual(md5(sub[2]), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_it(self): DL = FakeDownloader() DL.params['writesubtitles'] = True DL.params['subtitleslang'] = 'it' IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - self.assertEqual(md5(info_dict[0]['subtitles']), '164a51f16f260476a05b50fe4c2f161d') - + sub = info_dict[0]['subtitles'][0] + self.assertEqual(md5(sub[2]), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_onlysubtitles(self): DL = FakeDownloader() + DL.params['writesubtitles'] = True DL.params['onlysubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - self.assertEqual(md5(info_dict[0]['subtitles']), '4cd9278a35ba2305f47354ee13472260') + sub = info_dict[0]['subtitles'][0] + self.assertEqual(md5(sub[2]), '4cd9278a35ba2305f47354ee13472260') + def test_youtube_allsubtitles(self): + DL = FakeDownloader() + DL.params['allsubtitles'] = True + IE = YoutubeIE(DL) + info_dict = IE.extract('QRS8MkLhQmM') + subtitles = info_dict[0]['subtitles'] + self.assertEqual(len(subtitles), 12) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 487c9dadb..e496b8a8d 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -80,6 +80,7 @@ class FileDownloader(object): writeinfojson: Write the video description to a .info.json file writesubtitles: Write the video subtitles to a .srt file onlysubtitles: Downloads only the subtitles of the video + allsubtitles: Downloads all the subtitles of the video subtitleslang: Language of the subtitles to download test: Download only first bytes to test the downloader. keepvideo: Keep the video file after post-processing @@ -442,18 +443,33 @@ def process_info(self, info_dict): if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE + subtitle = info_dict['subtitles'][0] + (srt_error, srt_lang, srt) = subtitle try: - srtfn = filename.rsplit('.', 1)[0] + u'.srt' - if self.params.get('subtitleslang', False): - srtfn = filename.rsplit('.', 1)[0] + u'.' + self.params['subtitleslang'] + u'.srt' + srtfn = filename.rsplit('.', 1)[0] + u'.' + srt_lang + u'.srt' self.report_writesubtitles(srtfn) with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: - srtfile.write(info_dict['subtitles']) - if self.params.get('onlysubtitles', False): - return + srtfile.write(srt) except (OSError, IOError): self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) return + if self.params.get('onlysubtitles', False): + return + + if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: + subtitles = info_dict['subtitles'] + for subtitle in subtitles: + (srt_error, srt_lang, srt) = subtitle + try: + srtfn = filename.rsplit('.', 1)[0] + u'.' + srt_lang + u'.srt' + self.report_writesubtitles(srtfn) + with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: + srtfile.write(srt) + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return + if self.params.get('onlysubtitles', False): + return if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 51b263383..a220de80a 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -216,6 +216,10 @@ def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + def report_video_subtitles_request(self, video_id, lang): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang)) + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) @@ -228,9 +232,7 @@ def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - - def _extract_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) + def _get_available_subtitles(self, video_id): request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') @@ -240,19 +242,15 @@ def _extract_subtitles(self, video_id): srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) if not srt_lang_list: return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None) + return srt_lang_list + + def _request_subtitle(self, str_lang, str_name, video_id, format = 'srt'): + self.report_video_subtitles_request(video_id, str_lang) params = compat_urllib_parse.urlencode({ - 'lang': srt_lang, - 'name': srt_lang_list[srt_lang].encode('utf-8'), + 'lang': str_lang, + 'name': str_name, 'v': video_id, - 'fmt': 'srt', + 'fmt': format, }) url = 'http://www.youtube.com/api/timedtext?' + params try: @@ -261,7 +259,32 @@ def _extract_subtitles(self, video_id): return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) if not srt: return (u'WARNING: Did not fetch video subtitles', None) - return (None, srt) + return (None, str_lang, srt) + + def _extract_subtitle(self, video_id): + self.report_video_subtitles_download(video_id) + srt_lang_list = self._get_available_subtitles(video_id) + + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' + else: + srt_lang = list(srt_lang_list.keys())[0] + if not srt_lang in srt_lang_list: + return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None) + + sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id) + return [sub] + + def _extract_all_subtitles(self, video_id): + self.report_video_subtitles_download(video_id) + srt_lang_list = self._get_available_subtitles(video_id) + subs = [] + for srt_lang in srt_lang_list: + sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id) + subs.append(sub) + return subs def _print_formats(self, formats): print('Available formats:') @@ -484,14 +507,20 @@ def _real_extract(self, url): # closed captions video_subtitles = None - if self._downloader.params.get('subtitleslang', False): - self._downloader.params['writesubtitles'] = True - if self._downloader.params.get('onlysubtitles', False): - self._downloader.params['writesubtitles'] = True + if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) + video_subtitles = self._extract_subtitle(video_id) + if video_subtitles: + (srt_error, srt_lang, srt) = video_subtitles[0] + if srt_error: + self._downloader.trouble(srt_error) + + if self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_all_subtitles(video_id) + for video_subtitle in video_subtitles: + (srt_error, srt_lang, srt) = video_subtitle + if srt_error: + self._downloader.trouble(srt_error) if 'length_seconds' not in video_info: self._downloader.trouble(u'WARNING: unable to extract video duration') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ababeac87..20a22a4d1 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -179,6 +179,9 @@ def _find_term_columns(): video_format.add_option('--only-srt', action='store_true', dest='onlysubtitles', help='downloads only the subtitles of the video (currently youtube only)', default=False) + video_format.add_option('--all-srt', + action='store_true', dest='allsubtitles', + help='downloads all the available subtitles of the video (currently youtube only)', default=False) video_format.add_option('--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the closed captions to download (optional) use IETF language tags like \'en\'') @@ -454,6 +457,7 @@ def _real_main(): 'writeinfojson': opts.writeinfojson, 'writesubtitles': opts.writesubtitles, 'onlysubtitles': opts.onlysubtitles, + 'allsubtitles': opts.allsubtitles, 'subtitleslang': opts.subtitleslang, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), From 553d097442ad5ee62d227de2e2703a2377dcf40f Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Fri, 22 Feb 2013 03:13:28 +0100 Subject: [PATCH 09/14] Refactor subtitle options from srt to the more generic 'sub'. In order to be more consistent with different subtitle formats. From: * --write-srt to --write-sub * --only-srt to --only-sub * --all-srt to --all-subs * --srt-lang to --sub-lang' Refactored also all the mentions of srt for sub in all the source code. --- youtube_dl/FileDownloader.py | 26 +++++++------- youtube_dl/InfoExtractors.py | 68 ++++++++++++++++++------------------ youtube_dl/__init__.py | 14 ++++---- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index e496b8a8d..4549dd464 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -78,7 +78,7 @@ class FileDownloader(object): updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - writesubtitles: Write the video subtitles to a .srt file + writesubtitles: Write the video subtitles to a file (default=srt) onlysubtitles: Downloads only the subtitles of the video allsubtitles: Downloads all the subtitles of the video subtitleslang: Language of the subtitles to download @@ -291,9 +291,9 @@ def report_writedescription(self, descfn): """ Report that the description file is being written """ self.to_screen(u'[info] Writing video description to: ' + descfn) - def report_writesubtitles(self, srtfn): + def report_writesubtitles(self, sub_filename): """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) + self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ @@ -444,12 +444,12 @@ def process_info(self, info_dict): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitle = info_dict['subtitles'][0] - (srt_error, srt_lang, srt) = subtitle + (sub_error, sub_lang, sub) = subtitle try: - srtfn = filename.rsplit('.', 1)[0] + u'.' + srt_lang + u'.srt' - self.report_writesubtitles(srtfn) - with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: - srtfile.write(srt) + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.srt' + self.report_writesubtitles(sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) except (OSError, IOError): self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) return @@ -459,12 +459,12 @@ def process_info(self, info_dict): if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: subtitles = info_dict['subtitles'] for subtitle in subtitles: - (srt_error, srt_lang, srt) = subtitle + (sub_error, sub_lang, sub) = subtitle try: - srtfn = filename.rsplit('.', 1)[0] + u'.' + srt_lang + u'.srt' - self.report_writesubtitles(srtfn) - with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: - srtfile.write(srt) + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.srt' + self.report_writesubtitles(sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) except (OSError, IOError): self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) return diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index a220de80a..e078bb083 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -47,7 +47,7 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. + subtitles: The subtitle file contents. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -235,56 +235,56 @@ def report_rtmp_download(self): def _get_available_subtitles(self, video_id): request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: return (u'WARNING: video has no closed captions', None) - return srt_lang_list + return sub_lang_list - def _request_subtitle(self, str_lang, str_name, video_id, format = 'srt'): - self.report_video_subtitles_request(video_id, str_lang) + def _request_subtitle(self, sub_lang, sub_name, video_id, format = 'srt'): + self.report_video_subtitles_request(video_id, sub_lang) params = compat_urllib_parse.urlencode({ - 'lang': str_lang, - 'name': str_name, + 'lang': sub_lang, + 'name': sub_name, 'v': video_id, 'fmt': format, }) url = 'http://www.youtube.com/api/timedtext?' + params try: - srt = compat_urllib_request.urlopen(url).read().decode('utf-8') + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt: + if not sub: return (u'WARNING: Did not fetch video subtitles', None) - return (None, str_lang, srt) + return (None, sub_lang, sub) def _extract_subtitle(self, video_id): self.report_video_subtitles_download(video_id) - srt_lang_list = self._get_available_subtitles(video_id) + sub_lang_list = self._get_available_subtitles(video_id) if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None) + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None) - sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id) - return [sub] + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id) + return [subtitle] def _extract_all_subtitles(self, video_id): self.report_video_subtitles_download(video_id) - srt_lang_list = self._get_available_subtitles(video_id) - subs = [] - for srt_lang in srt_lang_list: - sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id) - subs.append(sub) - return subs + sub_lang_list = self._get_available_subtitles(video_id) + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id) + subtitles.append(subtitle) + return subtitles def _print_formats(self, formats): print('Available formats:') @@ -511,16 +511,16 @@ def _real_extract(self, url): if self._downloader.params.get('writesubtitles', False): video_subtitles = self._extract_subtitle(video_id) if video_subtitles: - (srt_error, srt_lang, srt) = video_subtitles[0] - if srt_error: - self._downloader.trouble(srt_error) + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.trouble(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) for video_subtitle in video_subtitles: - (srt_error, srt_lang, srt) = video_subtitle - if srt_error: - self._downloader.trouble(srt_error) + (sub_error, sub_lang, sub) = video_subtitle + if sub_error: + self._downloader.trouble(sub_error) if 'length_seconds' not in video_info: self._downloader.trouble(u'WARNING: unable to extract video duration') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 20a22a4d1..495b5ac41 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -173,18 +173,18 @@ def _find_term_columns(): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') - video_format.add_option('--write-srt', + video_format.add_option('--write-sub', action='store_true', dest='writesubtitles', - help='write video closed captions to a .srt file (currently youtube only)', default=False) - video_format.add_option('--only-srt', + help='write subtitle file (currently youtube only)', default=False) + video_format.add_option('--only-sub', action='store_true', dest='onlysubtitles', - help='downloads only the subtitles of the video (currently youtube only)', default=False) - video_format.add_option('--all-srt', + help='downloads only the subtitles (no video)', default=False) + video_format.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video (currently youtube only)', default=False) - video_format.add_option('--srt-lang', + video_format.add_option('--sub-lang', action='store', dest='subtitleslang', metavar='LANG', - help='language of the closed captions to download (optional) use IETF language tags like \'en\'') + help='language of the subtitles to download (optional) use IETF language tags like \'en\'') verbosity.add_option('-q', '--quiet', action='store_true', dest='quiet', help='activates quiet mode', default=False) From 9e62bc443996c1950de0841997c76d110cb77c6e Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Fri, 22 Feb 2013 03:53:54 +0100 Subject: [PATCH 10/14] Added new option '--sub-format' to choose the format of the subtitles to downloade (defaut=srt) --- test/parameters.json | 1 + test/test_youtube_subtitles.py | 10 +++++++++- youtube_dl/FileDownloader.py | 9 ++++++--- youtube_dl/InfoExtractors.py | 11 ++++++----- youtube_dl/__init__.py | 4 ++++ 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index 0d4bd644c..750b1c96e 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -29,6 +29,7 @@ "simulate": false, "skip_download": false, "subtitleslang": null, + "subtitlesformat": "srt", "test": true, "updatetime": true, "usenetrc": false, diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 3b5a53fca..94adc4555 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -42,7 +42,7 @@ def setUp(self): DL = FakeDownloader() DL.params['allsubtitles'] = False DL.params['writesubtitles'] = False - + DL.params['subtitlesformat'] = 'srt' def test_youtube_no_subtitles(self): DL = FakeDownloader() DL.params['writesubtitles'] = False @@ -80,6 +80,14 @@ def test_youtube_allsubtitles(self): info_dict = IE.extract('QRS8MkLhQmM') subtitles = info_dict[0]['subtitles'] self.assertEqual(len(subtitles), 12) + def test_youtube_subtitles_format(self): + DL = FakeDownloader() + DL.params['writesubtitles'] = True + DL.params['subtitlesformat'] = 'sbv' + IE = YoutubeIE(DL) + info_dict = IE.extract('QRS8MkLhQmM') + sub = info_dict[0]['subtitles'][0] + self.assertEqual(md5(sub[2]), '13aeaa0c245a8bed9a451cb643e3ad8b') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 4549dd464..a041e1219 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -78,9 +78,10 @@ class FileDownloader(object): updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - writesubtitles: Write the video subtitles to a file (default=srt) + writesubtitles: Write the video subtitles to a file onlysubtitles: Downloads only the subtitles of the video allsubtitles: Downloads all the subtitles of the video + subtitlesformat: Subtitle format [sbv/srt] (default=srt) subtitleslang: Language of the subtitles to download test: Download only first bytes to test the downloader. keepvideo: Keep the video file after post-processing @@ -445,8 +446,9 @@ def process_info(self, info_dict): # that way it will silently go on when used with unsupporting IE subtitle = info_dict['subtitles'][0] (sub_error, sub_lang, sub) = subtitle + sub_format = self.params.get('subtitlesformat') try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.srt' + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) @@ -458,10 +460,11 @@ def process_info(self, info_dict): if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: subtitles = info_dict['subtitles'] + sub_format = self.params.get('subtitlesformat') for subtitle in subtitles: (sub_error, sub_lang, sub) = subtitle try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.srt' + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index e078bb083..62522bb6c 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -244,7 +244,7 @@ def _get_available_subtitles(self, video_id): return (u'WARNING: video has no closed captions', None) return sub_lang_list - def _request_subtitle(self, sub_lang, sub_name, video_id, format = 'srt'): + def _request_subtitle(self, sub_lang, sub_name, video_id, format): self.report_video_subtitles_request(video_id, sub_lang) params = compat_urllib_parse.urlencode({ 'lang': sub_lang, @@ -264,7 +264,7 @@ def _request_subtitle(self, sub_lang, sub_name, video_id, format = 'srt'): def _extract_subtitle(self, video_id): self.report_video_subtitles_download(video_id) sub_lang_list = self._get_available_subtitles(video_id) - + sub_format = self._downloader.params.get('subtitlesformat') if self._downloader.params.get('subtitleslang', False): sub_lang = self._downloader.params.get('subtitleslang') elif 'en' in sub_lang_list: @@ -274,15 +274,16 @@ def _extract_subtitle(self, video_id): if not sub_lang in sub_lang_list: return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None) - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id) + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) return [subtitle] def _extract_all_subtitles(self, video_id): self.report_video_subtitles_download(video_id) sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') subtitles = [] for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id) + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) subtitles.append(subtitle) return subtitles @@ -505,7 +506,7 @@ def _real_extract(self, url): else: video_description = '' - # closed captions + # subtitles video_subtitles = None if self._downloader.params.get('writesubtitles', False): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 495b5ac41..914d030a3 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -182,6 +182,9 @@ def _find_term_columns(): video_format.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video (currently youtube only)', default=False) + video_format.add_option('--sub-format', + action='store', dest='subtitlesformat', metavar='LANG', + help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt') video_format.add_option('--sub-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') @@ -458,6 +461,7 @@ def _real_main(): 'writesubtitles': opts.writesubtitles, 'onlysubtitles': opts.onlysubtitles, 'allsubtitles': opts.allsubtitles, + 'subtitlesformat': opts.subtitlesformat, 'subtitleslang': opts.subtitleslang, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), From 2a4093eaf3af07fa0a74926ce09cb49aba73017e Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Fri, 22 Feb 2013 04:50:05 +0100 Subject: [PATCH 11/14] Added new option '--list-subs' to show the available subtitle languages --- test/parameters.json | 3 ++- test/test_youtube_subtitles.py | 7 +++++++ youtube_dl/FileDownloader.py | 1 + youtube_dl/InfoExtractors.py | 26 +++++++++++++++++++------- youtube_dl/__init__.py | 4 ++++ 5 files changed, 33 insertions(+), 8 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index 750b1c96e..96998b5c3 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -39,5 +39,6 @@ "writeinfojson": true, "writesubtitles": false, "onlysubtitles": false, - "allsubtitles": false + "allsubtitles": false, + "listssubtitles": false } diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 94adc4555..30f2246dd 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -43,6 +43,7 @@ def setUp(self): DL.params['allsubtitles'] = False DL.params['writesubtitles'] = False DL.params['subtitlesformat'] = 'srt' + DL.params['listsubtitles'] = False def test_youtube_no_subtitles(self): DL = FakeDownloader() DL.params['writesubtitles'] = False @@ -88,6 +89,12 @@ def test_youtube_subtitles_format(self): info_dict = IE.extract('QRS8MkLhQmM') sub = info_dict[0]['subtitles'][0] self.assertEqual(md5(sub[2]), '13aeaa0c245a8bed9a451cb643e3ad8b') + def test_youtube_list_subtitles(self): + DL = FakeDownloader() + DL.params['listsubtitles'] = True + IE = YoutubeIE(DL) + info_dict = IE.extract('QRS8MkLhQmM') + self.assertEqual(info_dict, None) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index a041e1219..164d25e54 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -81,6 +81,7 @@ class FileDownloader(object): writesubtitles: Write the video subtitles to a file onlysubtitles: Downloads only the subtitles of the video allsubtitles: Downloads all the subtitles of the video + listsubtitles: Lists all available subtitles for the video subtitlesformat: Subtitle format [sbv/srt] (default=srt) subtitleslang: Language of the subtitles to download test: Download only first bytes to test the downloader. diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 62522bb6c..ff1fab773 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -214,11 +214,16 @@ def report_video_info_webpage_download(self, video_id): def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id) - def report_video_subtitles_request(self, video_id, lang): + def report_video_subtitles_request(self, video_id, sub_lang, format): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang)) + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" @@ -233,6 +238,7 @@ def report_rtmp_download(self): self._downloader.to_screen(u'[youtube] RTMP download detected') def _get_available_subtitles(self, video_id): + self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') @@ -241,11 +247,15 @@ def _get_available_subtitles(self, video_id): sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) if not sub_lang_list: - return (u'WARNING: video has no closed captions', None) + return (u'WARNING: video doesn\'t have download', None) return sub_lang_list + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + def _request_subtitle(self, sub_lang, sub_name, video_id, format): - self.report_video_subtitles_request(video_id, sub_lang) + self.report_video_subtitles_request(video_id, sub_lang, format) params = compat_urllib_parse.urlencode({ 'lang': sub_lang, 'name': sub_name, @@ -262,7 +272,6 @@ def _request_subtitle(self, sub_lang, sub_name, video_id, format): return (None, sub_lang, sub) def _extract_subtitle(self, video_id): - self.report_video_subtitles_download(video_id) sub_lang_list = self._get_available_subtitles(video_id) sub_format = self._downloader.params.get('subtitlesformat') if self._downloader.params.get('subtitleslang', False): @@ -278,7 +287,6 @@ def _extract_subtitle(self, video_id): return [subtitle] def _extract_all_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) sub_lang_list = self._get_available_subtitles(video_id) sub_format = self._downloader.params.get('subtitlesformat') subtitles = [] @@ -523,6 +531,10 @@ def _real_extract(self, url): if sub_error: self._downloader.trouble(sub_error) + if self._downloader.params.get('listsubtitles', False): + sub_lang_list = self._list_available_subtitles(video_id) + return + if 'length_seconds' not in video_info: self._downloader.trouble(u'WARNING: unable to extract video duration') video_duration = '' diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 914d030a3..e5a7469af 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -182,6 +182,9 @@ def _find_term_columns(): video_format.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video (currently youtube only)', default=False) + video_format.add_option('--list-subs', + action='store_true', dest='listsubtitles', + help='lists all available subtitles for the video (currently youtube only)', default=False) video_format.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='LANG', help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt') @@ -461,6 +464,7 @@ def _real_main(): 'writesubtitles': opts.writesubtitles, 'onlysubtitles': opts.onlysubtitles, 'allsubtitles': opts.allsubtitles, + 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, 'subtitleslang': opts.subtitleslang, 'matchtitle': decodeOption(opts.matchtitle), From c0ba10467457a58e7198b58793f3c4683b1c3ec7 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Sat, 23 Feb 2013 16:24:59 +0100 Subject: [PATCH 12/14] Fixed typo in error message when no subtitles were available. --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index ff1fab773..ab8bd2104 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -247,7 +247,7 @@ def _get_available_subtitles(self, video_id): sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) if not sub_lang_list: - return (u'WARNING: video doesn\'t have download', None) + return (u'WARNING: video doesn\'t have subtitles', None) return sub_lang_list def _list_available_subtitles(self, video_id): From b9fc428494b22623529d364387b8693cc3cb1503 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 20 Mar 2013 11:29:07 +0100 Subject: [PATCH 13/14] add '--write-srt' and '--srt-lang' aliases for backwards compatibility --- youtube_dl/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e5a7469af..c4f64893d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -173,7 +173,7 @@ def _find_term_columns(): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') - video_format.add_option('--write-sub', + video_format.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', help='write subtitle file (currently youtube only)', default=False) video_format.add_option('--only-sub', @@ -188,7 +188,7 @@ def _find_term_columns(): video_format.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='LANG', help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt') - video_format.add_option('--sub-lang', + video_format.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') From f10b2a9c14db686e7f9b7d050f41b26d5cc35e01 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 20 Mar 2013 12:13:52 +0100 Subject: [PATCH 14/14] fix KeekIE --- youtube_dl/InfoExtractors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 14fd644a2..835428f32 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3986,11 +3986,11 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) m = re.search(r'[\s\n]+

(?P\w+)

', webpage) - uploader = unescapeHTML(m.group('uploader')) + m = re.search(r'
[\S\s]+?

(?P.+?)

', webpage) + uploader = clean_html(m.group('uploader')) info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'mp4', 'title': title, 'thumbnail': thumbnail,