
[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

Philipp Hagemeister 2014-10-26 17:05:44 +01:00
parent 488447455d
commit 23be51d8ce
2 changed files with 32 additions and 28 deletions

youtube_dl/extractor/common.py

@@ -242,7 +242,6 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
 
     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns a tuple (page content as string, URL handle) """
-
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
@@ -251,6 +250,10 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         if urlh is False:
             assert not fatal
             return False
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        return (content, urlh)
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -309,7 +312,7 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
             msg += ' Visit %s for more details' % blocked_iframe
             raise ExtractorError(msg, expected=True)
 
-        return (content, urlh)
+        return content
 
     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the data of the page as a string """

youtube_dl/extractor/generic.py

@@ -503,14 +503,14 @@ def _real_extract(self, url):
         self.to_screen('%s: Requesting header' % video_id)
 
         head_req = HEADRequest(url)
-        response = self._request_webpage(
+        head_response = self._request_webpage(
             head_req, video_id,
             note=False, errnote='Could not send HEAD request to %s' % url,
             fatal=False)
 
-        if response is not False:
+        if head_response is not False:
             # Check for redirect
-            new_url = response.geturl()
+            new_url = head_response.geturl()
             if url != new_url:
                 self.report_following_redirect(new_url)
                 if force_videoid:
@@ -518,34 +518,35 @@ def _real_extract(self, url):
                         new_url, {'force_videoid': force_videoid})
                 return self.url_result(new_url)
 
-            # Check for direct link to a video
-            content_type = response.headers.get('Content-Type', '')
-            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
-            if m:
-                upload_date = response.headers.get('Last-Modified')
-                if upload_date:
-                    upload_date = unified_strdate(upload_date)
-                return {
-                    'id': video_id,
-                    'title': os.path.splitext(url_basename(url))[0],
-                    'formats': [{
-                        'format_id': m.group('format_id'),
-                        'url': url,
-                        'vcodec': 'none' if m.group('type') == 'audio' else None
-                    }],
-                    'upload_date': upload_date,
-                }
+        full_response = None
+        if head_response is False:
+            full_response = self._request_webpage(url, video_id)
+            head_response = full_response
+
+        # Check for direct link to a video
+        content_type = head_response.headers.get('Content-Type', '')
+        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        if m:
+            upload_date = unified_strdate(
+                head_response.headers.get('Last-Modified'))
+            return {
+                'id': video_id,
+                'title': os.path.splitext(url_basename(url))[0],
+                'formats': [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }],
+                'upload_date': upload_date,
+            }
 
         if not self._downloader.params.get('test', False) and not is_intentional:
             self._downloader.report_warning('Falling back on generic information extractor.')
 
-        try:
-            webpage = self._download_webpage(url, video_id)
-        except ValueError:
-            # since this is the last-resort InfoExtractor, if
-            # this error is thrown, it'll be thrown here
-            raise ExtractorError('Failed to download URL: %s' % url)
+        if full_response:
+            webpage = self._webpage_read_content(full_response, url, video_id)
+        else:
+            webpage = self._download_webpage(url, video_id)
 
         self.report_extraction(video_id)
 
         # Is it an RSS feed?
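The generic.py hunk is the actual fix for #4032: when a server does not implement HEAD, head_response is False, so the extractor falls back to a normal GET, keeps it in full_response, and reuses that single response both for the Content-Type check and, via _webpage_read_content, as the page body if the URL turns out not to be a direct audio/video link. A rough standalone sketch of the same fallback pattern, with a hypothetical probe() helper and plain urllib standing in for youtube-dl's request machinery:

import urllib.error
import urllib.request


def probe(url):
    # Try a cheap HEAD first; some servers answer it with an error or
    # drop the connection (the situation behind #4032).
    head_req = urllib.request.Request(url, method='HEAD')
    full_response = None
    try:
        head_response = urllib.request.urlopen(head_req)
    except urllib.error.URLError:
        # Fall back to a full GET and reuse it for everything below.
        full_response = urllib.request.urlopen(url)
        head_response = full_response

    content_type = head_response.headers.get('Content-Type', '')
    if content_type.startswith(('audio/', 'video/')):
        # Direct link to a media file: nothing to parse.
        return {'url': url, 'content_type': content_type}

    # Not a direct link: reuse the fallback GET body if we already have
    # it, otherwise fetch the page now.
    if full_response is not None:
        webpage = full_response.read().decode('utf-8', errors='replace')
    else:
        webpage = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
    return {'url': url, 'webpage_length': len(webpage)}

Reusing the fallback GET keeps the no-HEAD case down to a single request instead of fetching the page once for the header check and again for the body.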