From 3462ffa8929d2a40588669578ca912d57a0da1bb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 22 Nov 2020 03:51:09 +0530 Subject: [PATCH] Implemented all Youtube Feeds (ytfav, ytwatchlater, ytsubs, ythistory, ytrec) and SearchURL --- docs/supportedsites.md | 5 +- test/test_all_urls.py | 5 +- youtube_dlc/YoutubeDL.py | 2 +- youtube_dlc/extractor/extractors.py | 1 - youtube_dlc/extractor/youtube.py | 338 ++++++++++++++++------------ 5 files changed, 202 insertions(+), 149 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0481f7db9..860766f20 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1146,7 +1146,7 @@ # Supported sites - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists @@ -1154,11 +1154,10 @@ # Supported sites - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:tab**: YouTube.com tab - - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication) - **Zapiks** - **Zaq1** - **Zattoo** diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a44cf7549..4784c633f 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -35,6 +35,9 @@ def test_youtube_playlist_matching(self): assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') + assertTab('https://www.youtube.com/AsapSCIENCE') + assertTab('https://www.youtube.com/embedded') + assertTab('https://www.youtube.com/feed') # Own channel's home page assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') @@ -47,7 +50,7 @@ def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) - self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index bf02192eb..3c2970d9f 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -832,7 +832,7 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in try: try: temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) - except (AssertionError, IndexError): + except (AssertionError, IndexError, AttributeError): temp_id = None if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): self.to_screen("[%s] %s: has already been recorded in archive" % ( diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 9e832450a..ee52492dc 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1514,7 +1514,6 @@ YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, - YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index bbd9b2c4c..3f3f9c58b 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -34,7 +34,6 @@ get_element_by_id, int_or_none, mimetype2ext, - orderedSet, parse_codecs, parse_count, parse_duration, @@ -64,11 +63,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _RESERVED_NAMES = ( + r'course|embed|watch|w|results|storefront|' + r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' + r'feed/(watch_later|history|subscriptions|library|trending|recommended)') + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -2495,7 +2499,13 @@ def decrypt_sig(mobj): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P[^/?#&]+)' + # (?x)^ will cause warning in LiveIE. So I cant split this into multiple lines using ''' + _VALID_URL = ( + r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/' + r'(?:(?!(%s)([/#?]|$))|' + r'(?:channel|c|user)/|' + r'(?:playlist|watch)\?.*?\blist=)' + r'(?P[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2692,8 +2702,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeLiveIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) + IGNORE = (YoutubeLiveIE,) + return ( + False if any(ie.suitable(url) for ie in IGNORE) + else super(YoutubeTabIE, cls).suitable(url)) def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -2808,6 +2820,26 @@ def _playlist_entries(self, video_list_renderer): continue yield self._extract_video(renderer) + def _itemSection_entries(self, item_sect_renderer): + for content in item_sect_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('videoRenderer', {}) + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + def _rich_entries(self, rich_grid_renderer): + renderer = try_get( + rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) + video_id = renderer.get('videoId') + if not video_id: + return + yield self._extract_video(renderer) + def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') if video_id: @@ -2899,49 +2931,67 @@ def _extract_continuation(cls, renderer): } def _entries(self, tab, identity_token): - continuation = None - slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry - if not continuation: - continuation = self._extract_continuation(is_renderer) + def extract_entries(parent_renderer): + slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): + continue + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + renderer = slr_content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(is_renderer) + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) + + continuation_list = [None] # Python 2 doesnot support nonlocal + parent_renderer = ( + try_get(tab, lambda x: x['sectionListRenderer'], dict) + or try_get(tab, lambda x: x['richGridRenderer'], dict) or {}) + if parent_renderer: + for entry in extract_entries(parent_renderer): + yield entry + + continuation = continuation_list[0] headers = { 'x-youtube-client-name': '1', @@ -2953,6 +3003,8 @@ def _entries(self, tab, identity_token): for page_num in itertools.count(1): if not continuation: break + if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES: + break browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d' % page_num, @@ -2984,6 +3036,13 @@ def _entries(self, tab, identity_token): yield entry continuation = self._extract_continuation(continuation_renderer) continue + continuation_renderer = continuation_contents.get('sectionListContinuation') + if continuation_renderer: + continuation_list = [None] + for entry in extract_entries(continuation_renderer): + yield entry + continuation = continuation_list[0] + continue continuation_items = try_get( response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) @@ -2998,7 +3057,12 @@ def _entries(self, tab, identity_token): yield entry continuation = self._extract_continuation(video_list_renderer) continue - + renderer = continuation_item.get('itemSectionRenderer') + if renderer: + for entry in self._itemSection_entries(renderer): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @staticmethod @@ -3036,6 +3100,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + playlist_id = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3050,6 +3115,8 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): title = renderer.get('title') description = None playlist_id = item_id + if playlist_id is None: + return None playlist = self.playlist_result( self._entries(selected_tab['content'], identity_token), playlist_id=playlist_id, playlist_title=title, @@ -3214,7 +3281,7 @@ def _real_extract(self, url): class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?Phttps?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P[^/]+))/live' + _VALID_URL = r'(?P%s)/live' % YoutubeTabIE._VALID_URL IE_NAME = 'youtube:live' _TESTS = [{ @@ -3361,12 +3428,42 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeSearchURLIE(InfoExtractor): + IE_DESC = 'YouTube.com search URLs' + IE_NAME = 'youtube:search_url' + _PARAM_REGEX = r'' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P[^#&]+)(?:[^#]*?&sp=(?P[^#&]+))?' + _MAX_RESULTS = 100 + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_urllib_parse_unquote_plus(mobj.group('query')) + IE = YoutubeSearchIE(self._downloader) + IE._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2') + self._downloader.to_screen(IE._SEARCH_PARAMS) + IE._MAX_RESULTS = self._MAX_RESULTS + return IE._get_n_results(query, self._MAX_RESULTS) + + +class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True + _TESTS = [] + + # _MAX_PAGES = 5 @property def IE_NAME(self): @@ -3375,50 +3472,39 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + def _shelf_entries(self, shelf_renderer): + renderer = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict) + if not renderer: + return + for entry in self._grid_entries(renderer): + yield entry - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + selected_tab = self._extract_selected_tab(tabs) + return self.playlist_result( + self._entries(selected_tab['content'], identity_token), + playlist_title=self._PLAYLIST_TITLE) def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + item_id = self._FEED_NAME + url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + # Failed to recognize + raise ExtractorError('Unable to recognize feed page') -class YoutubeWatchLaterIE(InfoExtractor): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' + _FEED_NAME = 'watchlater' _TESTS = [{ 'url': 'https://www.youtube.com/feed/watch_later', @@ -3429,25 +3515,33 @@ class YoutubeWatchLaterIE(InfoExtractor): }] def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist + return self.url_result('WL', ie=YoutubePlaylistIE.ie_key()) + + +class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)' + _VALID_URL = r':ytfav(?:ou?rite)s?' + _FEED_NAME = 'favourites' + + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result('LL', ie=YoutubePlaylistIE.ie_key()) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' @@ -3525,40 +3619,9 @@ def _real_extract(self, url): expected=True) -# Old extractors. Are these cases handled elsewhere? - -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _process_json_dict(self, obj, videos, c): - if "videoId" in obj: - videos.append(obj) - return - - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) - - -class YoutubeShowIE(InfoExtractor): +# Do Youtube show urls even exist anymore? I couldn't find any +r''' +class YoutubeShowIE(YoutubeTabIE): IE_DESC = 'YouTube.com (multi-season) shows' _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P[^?#]*)' IE_NAME = 'youtube:show' @@ -3575,15 +3638,4 @@ def _real_extract(self, url): playlist_id = self._match_id(url) return super(YoutubeShowIE, self)._real_extract( 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') +'''