From c62ecf0d90a8cee7e4462a3e7aa31338aa6798bf Mon Sep 17 00:00:00 2001 From: MinePlayersPE Date: Sun, 26 Dec 2021 05:41:38 +0700 Subject: [PATCH] [BiliIntl] Fix extractor (#2077) Closes #1744 Authored by: MinePlayersPE --- yt_dlp/extractor/bilibili.py | 140 ++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 60 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index e019ec6a8..2cb01ff83 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -19,14 +19,15 @@ parse_iso8601, traverse_obj, try_get, + parse_count, smuggle_url, srt_subtitles_timecode, str_or_none, - str_to_int, strip_jsonp, unified_timestamp, unsmuggle_url, urlencode_postdata, + url_or_none, OnDemandPagedList ) @@ -722,10 +723,10 @@ def _real_extract(self, url): class BiliIntlBaseIE(InfoExtractor): - _API_URL = 'https://api.bili{}/intl/gateway{}' + _API_URL = 'https://api.bilibili.tv/intl/gateway' - def _call_api(self, type, endpoint, id): - return self._download_json(self._API_URL.format(type, endpoint), id)['data'] + def _call_api(self, endpoint, *args, **kwargs): + return self._download_json(self._API_URL + endpoint, *args, **kwargs)['data'] def json2srt(self, json): data = '\n\n'.join( @@ -733,29 +734,40 @@ def json2srt(self, json): for i, line in enumerate(json['body'])) return data - def _get_subtitles(self, type, ep_id): - sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) + def _get_subtitles(self, ep_id): + sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id) subtitles = {} - for sub in sub_json.get('subtitles', []): + for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') if not sub_url: continue - sub_data = self._download_json(sub_url, ep_id, fatal=False) + sub_data = self._download_json( + sub_url, ep_id, errnote='Unable to download subtitles', fatal=False, + note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') if not sub_data: continue - subtitles.setdefault(sub.get('key', 'en'), []).append({ + subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ 'ext': 'srt', 'data': self.json2srt(sub_data) }) return subtitles - def _get_formats(self, type, ep_id): - video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) - if not video_json: - self.raise_login_required(method='cookies') + def _get_formats(self, ep_id): + video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id, + note='Downloading video formats', errnote='Unable to download video formats') + if video_json.get('code'): + if video_json['code'] in (10004004, 10004005, 10023006): + self.raise_login_required(method='cookies') + elif video_json['code'] == 10004001: + self.raise_geo_restricted() + elif video_json.get('message') and str(video_json['code']) != video_json['message']: + raise ExtractorError( + f'Unable to download video formats: {self.IE_NAME} said: {video_json["message"]}', expected=True) + else: + raise ExtractorError('Unable to download video formats') video_json = video_json['playurl'] formats = [] - for vid in video_json.get('video', []): + for vid in video_json.get('video') or []: video_res = vid.get('video_resource') or {} video_info = vid.get('stream_info') or {} if not video_res.get('url'): @@ -771,7 +783,7 @@ def _get_formats(self, type, ep_id): 'vcodec': video_res.get('codecs'), 'filesize': video_res.get('size'), }) - for aud in video_json.get('audio_resource', []): + for aud in video_json.get('audio_resource') or []: if not aud.get('url'): continue formats.append({ @@ -786,85 +798,93 @@ def _get_formats(self, type, ep_id): self._sort_formats(formats) return formats - def _extract_ep_info(self, type, episode_data, ep_id): + def _extract_ep_info(self, episode_data, ep_id): return { 'id': ep_id, - 'title': episode_data.get('long_title') or episode_data['title'], + 'title': episode_data.get('title_display') or episode_data['title'], 'thumbnail': episode_data.get('cover'), - 'episode_number': str_to_int(episode_data.get('title')), - 'formats': self._get_formats(type, ep_id), - 'subtitles': self._get_subtitles(type, ep_id), + 'episode_number': int_or_none(self._search_regex( + r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)), + 'formats': self._get_formats(ep_id), + 'subtitles': self._get_subtitles(ep_id), 'extractor_key': BiliIntlIE.ie_key(), } class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?Pbili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P\d+)/(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613/341736', 'info_dict': { 'id': '341736', 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', + 'title': 'E2 - The First Night', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 2, - }, - 'params': { - 'format': 'bv', - }, + } + }, { + 'url': 'https://www.bilibili.tv/en/play/1033760/11005006', + 'info_dict': { + 'id': '11005006', + 'ext': 'mp4', + 'title': 'E3 - Who?', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 3, + } }, { 'url': 'https://www.biliintl.com/en/play/34613/341736', - 'info_dict': { - 'id': '341736', - 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', - 'episode_number': 2, - }, - 'params': { - 'format': 'bv', - }, + 'only_matching': True, }] def _real_extract(self, url): - type, season_id, id = self._match_valid_url(url).groups() - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id) - episode_data = next( - episode for episode in data_json.get('episodes', []) - if str(episode.get('ep_id')) == id) - return self._extract_ep_info(type, episode_data, id) + season_id, video_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + # Bstation layout + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), video_id, fatal=False) or {} + episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + + if not episode_data: + # Non-Bstation layout, read through episode list + season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) + episode_data = next( + episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) + if str(episode.get('episode_id')) == video_id) + return self._extract_ep_info(episode_data, video_id) class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?Pbili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P\d+)$' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P\d+)$' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, 'info_dict': { 'id': '34613', + 'title': 'Fly Me to the Moon', + 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', + 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'view_count': int, }, 'params': { 'skip_download': True, - 'format': 'bv', }, }, { 'url': 'https://www.biliintl.com/en/play/34613', - 'playlist_mincount': 15, - 'info_dict': { - 'id': '34613', - }, - 'params': { - 'skip_download': True, - 'format': 'bv', - }, + 'only_matching': True, }] - def _entries(self, id, type): - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id) - for episode in data_json.get('episodes', []): - episode_id = str(episode.get('ep_id')) - yield self._extract_ep_info(type, episode, episode_id) + def _entries(self, series_id): + series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): + episode_id = str(episode.get('episode_id')) + yield self._extract_ep_info(episode, episode_id) def _real_extract(self, url): - type, id = self._match_valid_url(url).groups() - return self.playlist_result(self._entries(id, type), playlist_id=id) + series_id = self._match_id(url) + series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} + return self.playlist_result( + self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'), + categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), + thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))