From 3951a7faae1c3916827d01b1e242bb12a9cdadec Mon Sep 17 00:00:00 2001 From: geauxlo <66712139+geauxlo@users.noreply.github.com> Date: Wed, 10 Jun 2020 06:38:32 +0000 Subject: [PATCH] Prefer API to scraping HTML when possible Also changed instances of `var is None` to `var == None`, and replaced `var.replace('http%3A', 'http:')` with a regex --- youtube_dl/extractor/screencast.py | 52 +++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 69a0d01f3..d52d46cc3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -13,6 +15,8 @@ class ScreencastIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' + _API_URL = 'https://www.screencast.com/api/external/oembed?url=%s&format=json' + _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ -60,13 +64,32 @@ class ScreencastIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + + # The info JSON given by the API has a thumbnail URL, + # but it's inferior to the webpage's thumbnail. + # It also has no video description, so we + # definitely still need to get the webpage. + + info = self._download_json( + self._API_URL % url, video_id, + 'Downloading video info JSON') + + video_url = info.get('url') + if video_url != None: + video_url_raw = compat_urllib_request.quote(video_url) + video_url = re.sub(r'^(?Phttps|http)%3A', + lambda match: '%s:' % match.group('proto'), + video_url_raw) + + title = info.get('title') webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'http|https)%3A', + lambda match: '%s:' % match.group('proto'), + video_url_raw) - if video_url is None: + if video_url == None: video_meta = self._html_search_meta( 'og:video', webpage, default=None) if video_meta: @@ -90,28 +115,31 @@ def _real_extract(self, url): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) - if video_url is None: + if video_url == None: video_url = self._html_search_regex( r'MediaContentUrl["\']\s*:(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video url', default=None, group='url') - if video_url is None: + if video_url == None: video_url = self._html_search_meta( 'og:video', webpage, default=None) - if video_url is None: + if video_url == None: raise ExtractorError('Cannot find video') - title = self._og_search_title(webpage, default=None) - if title is None: + if title == None: + title = self._og_search_title(webpage, default=None) + + if title == None: title = self._html_search_regex( [r'Title: ([^<]+)', r'class="tabSeperator">>(.+?)<', r'([^<]+)'], webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) - if description is None: + if description == None: description = self._html_search_meta('description', webpage) return {