1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-11-17 00:22:34 +01:00

Combined extractors and refactored embed URL extraction

This commit is contained in:
Sean Ellingham 2024-06-21 19:51:00 +01:00
parent 3ae4eba8c4
commit 51df76a206
2 changed files with 57 additions and 53 deletions

View File

@ -2314,11 +2314,7 @@
) )
from .vidlii import VidLiiIE from .vidlii import VidLiiIE
from .vidly import VidlyIE from .vidly import VidlyIE
from .vidyard import ( from .vidyard import VidyardIE
VidyardEmbedIE,
VidyardPlayIE,
VidyardWatchIE,
)
from .viewlift import ( from .viewlift import (
ViewLiftEmbedIE, ViewLiftEmbedIE,
ViewLiftIE, ViewLiftIE,

View File

@ -1,8 +1,14 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj from ..utils import (
extract_attributes,
int_or_none,
traverse_obj,
)
class VidyardBaseInfoExtractor(InfoExtractor): class VidyardBaseIE(InfoExtractor):
def _get_formats_and_subtitles(self, video_source, video_id): def _get_formats_and_subtitles(self, video_source, video_id):
video_source = video_source or {} video_source = video_source or {}
@ -36,33 +42,13 @@ def _get_direct_subtitles(self, caption_json):
def _webpage_url(self, url, video_id): def _webpage_url(self, url, video_id):
return url return url
def _real_extract(self, url):
video_id = self._match_valid_url(url).group('id')
webpage = self._download_webpage(self._webpage_url(url, video_id), video_id)
json_data = self._download_json( class VidyardIE(VidyardBaseIE):
f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload']['chapters'][0] _VALID_URL = [
r'https?://(?:[\w-]+\.hubs|share)\.vidyard\.com/watch/(?P<id>[\w-]+)',
formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id) r'https?://embed\.vidyard\.com/share/(?P<id>[\w-]+)',
self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles) r'https?://play\.vidyard\.com/(?P<id>[\w-]+)\.html',
]
return {
'id': str(json_data['videoId']),
'title': json_data.get('name') or self._og_search_title(webpage, default=None) or self._html_extract_title(webpage),
'description': json_data.get('description') or self._og_search_description(webpage, default=None),
'duration': int_or_none(json_data.get('seconds')),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': thumbnail_url}
for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))],
'http_headers': {
'referer': 'https://play.vidyard.com/',
},
}
class VidyardWatchIE(VidyardBaseInfoExtractor):
_VALID_URL = r'https?://(?:[\w-]+\.hubs|share)\.vidyard\.com/watch/(?P<id>[\w-]+)'
_TESTS = [ _TESTS = [
{ {
'url': 'https://vyexample03.hubs.vidyard.com/watch/oTDMPlUv--51Th455G5u7Q', 'url': 'https://vyexample03.hubs.vidyard.com/watch/oTDMPlUv--51Th455G5u7Q',
@ -86,12 +72,6 @@ class VidyardWatchIE(VidyardBaseInfoExtractor):
'duration': 41, 'duration': 41,
}, },
}, },
]
class VidyardEmbedIE(VidyardBaseInfoExtractor):
_VALID_URL = r'https?://embed\.vidyard\.com/share/(?P<id>[\w-]+)'
_TESTS = [
{ {
'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q', 'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q',
'info_dict': { 'info_dict': {
@ -103,12 +83,6 @@ class VidyardEmbedIE(VidyardBaseInfoExtractor):
'duration': 99, 'duration': 99,
}, },
}, },
]
class VidyardPlayIE(VidyardBaseInfoExtractor):
_VALID_URL = r'https?://play\.vidyard\.com/(?P<id>[\w-]+)\.[\w]+'
_TESTS = [
{ {
# URL of iframe embed src # URL of iframe embed src
'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html', 'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html',
@ -121,8 +95,8 @@ class VidyardPlayIE(VidyardBaseInfoExtractor):
}, },
}, },
{ {
# URL of inline/lightbox embed src # URL of iframe embed src (protocol relative URL)
'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.jpg', 'url': '//play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html?',
'info_dict': { 'info_dict': {
'id': '9281009', 'id': '9281009',
'ext': 'mp4', 'ext': 'mp4',
@ -134,13 +108,11 @@ class VidyardPlayIE(VidyardBaseInfoExtractor):
] ]
_EMBED_REGEX = [ _EMBED_REGEX = [
# iframe embed # iframe embed
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//play\.vidyard\.com/[\w-]+.html)\1', r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//play\.vidyard\.com/[\w-]+.\w+)\1',
# inline/lightbox embed
r'<img[^>]+?src=(["\'])(?P<url>(?:https?:)?//play\.vidyard\.com/[\w-]+.jpg)\1',
] ]
_WEBPAGE_TESTS = [ _WEBPAGE_TESTS = [
{ {
# URL containing embedded video # URL containing inline/lightbox embedded video
'url': 'https://resources.altium.com/p/2-the-extreme-importance-of-pc-board-stack-up', 'url': 'https://resources.altium.com/p/2-the-extreme-importance-of-pc-board-stack-up',
'info_dict': { 'info_dict': {
'id': '3225198', 'id': '3225198',
@ -152,5 +124,41 @@ class VidyardPlayIE(VidyardBaseInfoExtractor):
}, },
] ]
def _webpage_url(self, url, video_id): @classmethod
return f'https://play.vidyard.com/{video_id}.html' def _extract_embed_urls(cls, url, webpage):
# Handle protocol-less embed URLs
for embed_url in super()._extract_embed_urls(url, webpage):
if embed_url.startswith('//'):
embed_url = f'https:{embed_url}'
yield embed_url
# Extract inline/lightbox embeds
for embed_elm in re.findall(r'(<img[^>]+class=(["\'])(?:[^>"\']* )?vidyard-player-embed(?: [^>"\']*)?\2[^>]+[^>]*>)', webpage):
embed = extract_attributes(embed_elm[0]) or {}
uuid = embed.get('data-uuid')
if uuid:
yield f'https://play.vidyard.com/{uuid}.html'
def _real_extract(self, url):
video_id = self._match_valid_url(url).group('id')
webpage = self._download_webpage(self._webpage_url(url, video_id), video_id)
json_data = self._download_json(
f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload']['chapters'][0]
formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id)
self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles)
return {
'id': str(json_data['videoId']),
'title': json_data.get('name') or self._og_search_title(webpage, default=None) or self._html_extract_title(webpage),
'description': json_data.get('description') or self._og_search_description(webpage, default=None),
'duration': int_or_none(json_data.get('seconds')),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': thumbnail_url}
for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))],
'http_headers': {
'referer': 'https://play.vidyard.com/',
},
}