import base64 import functools import hashlib import itertools import math import re import time import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from ..dependencies import Cryptodome from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, bool_or_none, filter_dict, float_or_none, format_field, int_or_none, join_nonempty, make_archive_id, merge_dicts, mimetype2ext, parse_count, parse_qs, qualities, smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, try_call, unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, variadic, ) class BilibiliBaseIE(InfoExtractor): _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') def extract_formats(self, play_info): format_names = { r['quality']: traverse_obj(r, 'new_description', 'display_desc') for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) } audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict})) flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) if flac_audio: audios.append(flac_audio) formats = [{ 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), 'acodec': traverse_obj(audio, ('codecs', {str.lower})), 'vcodec': 'none', 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), 'filesize': int_or_none(audio.get('size')), 'format_id': str_or_none(audio.get('id')), } for audio in audios] formats.extend({ 'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'), 'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')), 'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')), 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'vcodec': video.get('codecs'), 'acodec': 'none' if audios else None, 'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))), 'tbr': float_or_none(video.get('bandwidth'), scale=1000), 'filesize': int_or_none(video.get('size')), 'quality': int_or_none(video.get('id')), 'format_id': traverse_obj( video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1), ('id', {str_or_none}), get_all=False), 'format': format_names.get(video.get('id')), } for video in traverse_obj(play_info, ('dash', 'video', ...))) missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality'))) if missing_formats: self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; ' f'you have to login or become premium member to download them. {self._login_hint()}') return formats def json2srt(self, json_data): srt_data = '' for idx, line in enumerate(json_data.get('body') or []): srt_data += (f'{idx + 1}\n' f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n' f'{line["content"]}\n\n') return srt_data def _get_subtitles(self, video_id, aid, cid): subtitles = { 'danmaku': [{ 'ext': 'xml', 'url': f'https://comment.bilibili.com/{cid}.xml', }] } video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id) for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) }) return subtitles def _get_chapters(self, aid, cid): chapters = aid and cid and self._download_json( 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, note='Extracting chapters', fatal=False) return traverse_obj(chapters, ('data', 'view_points', ..., { 'title': 'content', 'start_time': 'from', 'end_time': 'to', })) or None def _get_comments(self, aid): for idx in itertools.count(1): replies = traverse_obj( self._download_json( f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685', aid, note=f'Extracting comments from page {idx}', fatal=False), ('data', 'replies')) if not replies: return for children in map(self._get_all_children, replies): yield from children def _get_all_children(self, reply): yield { 'author': traverse_obj(reply, ('member', 'uname')), 'author_id': traverse_obj(reply, ('member', 'mid')), 'id': reply.get('rpid'), 'text': traverse_obj(reply, ('content', 'message')), 'timestamp': reply.get('ctime'), 'parent': reply.get('parent') or 'root', } for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children def _get_episodes_from_season(self, ss_id, url): season_info = self._download_json( 'https://api.bilibili.com/pgc/web/season/section', ss_id, note='Downloading season info', query={'season_id': ss_id}, headers={'Referer': url, **self.geo_verification_headers()}) for entry in traverse_obj(season_info, ( 'result', 'main_section', 'episodes', lambda _, v: url_or_none(v['share_url']) and v['id'])): yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') class BiliBiliIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', 'info_dict': { 'id': 'BV13x41117TL', 'title': '阿滴英文|英文歌分享#6 "Closer', 'ext': 'mp4', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', 'uploader_id': '65880958', 'uploader': '阿滴英文', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'duration': 554.117, 'tags': list, 'comment_count': int, 'upload_date': '20170301', 'timestamp': 1488353834, 'like_count': int, 'view_count': int, }, }, { # old av URL version 'url': 'http://www.bilibili.com/video/av1074402/', 'info_dict': { 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', 'ext': 'mp4', 'uploader': '菊子桑', 'uploader_id': '156160', 'id': 'BV11x411K7CN', 'title': '【金坷垃】金泡沫', 'duration': 308.36, 'upload_date': '20140420', 'timestamp': 1397983878, 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', 'like_count': int, 'comment_count': int, 'view_count': int, 'tags': list, }, 'params': {'skip_download': True}, }, { 'note': 'Anthology', 'url': 'https://www.bilibili.com/video/BV1bK411W797', 'info_dict': { 'id': 'BV1bK411W797', 'title': '物语中的人物是如何吐槽自己的OP的' }, 'playlist_count': 18, 'playlist': [{ 'info_dict': { 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'tags': 'count:11', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', 'uploader_id': '150259984', 'like_count': int, 'comment_count': int, 'upload_date': '20200516', 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, } }] }, { 'note': 'Specific page of Anthology', 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1', 'info_dict': { 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'tags': 'count:11', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', 'uploader_id': '150259984', 'like_count': int, 'comment_count': int, 'upload_date': '20200516', 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, } }, { 'note': 'video has subtitles', 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', 'info_dict': { 'id': 'BV12N4y1M7rh', 'ext': 'mp4', 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', 'tags': list, 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'duration': 313.557, 'upload_date': '20220709', 'uploader': '小夫太渴', 'timestamp': 1657347907, 'uploader_id': '1326814124', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'subtitles': 'count:2' }, 'params': {'listsubtitles': True}, }, { 'url': 'https://www.bilibili.com/video/av8903802/', 'info_dict': { 'id': 'BV13x41117TL', 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'upload_date': '20170301', 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', 'timestamp': 1488353834, 'uploader_id': '65880958', 'uploader': '阿滴英文', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'duration': 554.117, 'tags': list, 'comment_count': int, 'view_count': int, 'like_count': int, }, 'params': { 'skip_download': True, }, }, { 'note': 'video has chapter', 'url': 'https://www.bilibili.com/video/BV1vL411G7N7/', 'info_dict': { 'id': 'BV1vL411G7N7', 'ext': 'mp4', 'title': '如何为你的B站视频添加进度条分段', 'timestamp': 1634554558, 'upload_date': '20211018', 'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d', 'tags': list, 'uploader': '爱喝咖啡的当麻', 'duration': 669.482, 'uploader_id': '1680903', 'chapters': 'count:6', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, }, { 'note': 'video redirects to festival page', 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', 'info_dict': { 'id': 'BV1wP4y1P72h', 'ext': 'mp4', 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', 'timestamp': 1643947497, 'upload_date': '20220204', 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', 'uploader': '叨叨冯聊音乐', 'duration': 246.719, 'uploader_id': '528182630', 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, }, { 'note': 'newer festival video', 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', 'info_dict': { 'id': 'BV1ay4y1d77f', 'ext': 'mp4', 'title': '【崩坏3新春剧场】为特别的你送上祝福!', 'timestamp': 1674273600, 'upload_date': '20230121', 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', 'uploader': '果蝇轰', 'duration': 1111.722, 'uploader_id': '8469526', 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) is_festival = 'videoData' not in initial_state if is_festival: video_data = initial_state['videoInfo'] else: play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] video_data = initial_state['videoData'] video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, note='Extracting videos in anthology'), 'data', expected_type=list) or [] is_anthology = len(page_list_json) > 1 part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) if is_anthology and not part_id and self._yes_playlist(video_id, video_id): return self.playlist_from_matches( page_list_json, video_id, title, ie=BiliBiliIE, getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: part_id = part_id or 1 title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') festival_info = {} if is_festival: play_info = self._download_json( 'https://api.bilibili.com/x/player/playurl', video_id, query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, note='Extracting festival video formats')['data'] festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), 'like_count': ('videoStatus', 'like', {int_or_none}), 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), }, get_all=False) return { **traverse_obj(initial_state, { 'uploader': ('upData', 'name'), 'uploader_id': ('upData', 'mid', {str_or_none}), 'like_count': ('videoData', 'stat', 'like', {int_or_none}), 'tags': ('tags', ..., 'tag_name'), 'thumbnail': ('videoData', 'pic', {url_or_none}), }), **festival_info, **traverse_obj(video_data, { 'description': 'desc', 'timestamp': ('pubdate', {int_or_none}), 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), 'comment_count': ('stat', 'reply', {int_or_none}), }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), 'subtitles': self.extract_subtitles(video_id, aid, cid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } class BiliBiliBangumiIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?Pep\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { 'id': '267851', 'ext': 'mp4', 'series': '鬼灭之刃', 'series_id': '4358', 'season': '鬼灭之刃', 'season_id': '26801', 'season_number': 1, 'episode': '残酷', 'episode_id': '267851', 'episode_number': 1, 'title': '1 残酷', 'duration': 1425.256, 'timestamp': 1554566400, 'upload_date': '20190406', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' }] def _real_extract(self, url): video_id = self._match_id(url) episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') headers = {'Referer': url, **self.geo_verification_headers()} play_info = self._download_json( 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, headers=headers) premium_only = play_info.get('code') == -10403 play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} formats = self.extract_formats(play_info) if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') bangumi_info = self._download_json( 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', query={'ep_id': episode_id}, headers=headers)['result'] episode_number, episode_info = next(( (idx, ep) for idx, ep in enumerate(traverse_obj( bangumi_info, ('episodes', ..., {dict})), 1) if str_or_none(ep.get('id')) == episode_id), (1, {})) season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), None) aid = episode_info.get('aid') return { 'id': video_id, 'formats': formats, **traverse_obj(bangumi_info, { 'series': ('series', 'series_title', {str}), 'series_id': ('series', 'series_id', {str_or_none}), 'thumbnail': ('square_cover', {url_or_none}), }), 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), 'episode': episode_info.get('long_title'), 'episode_id': episode_id, 'episode_number': int_or_none(episode_info.get('title')) or episode_number, 'season_id': str_or_none(season_id), 'season_number': season_number, 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), '__post_extractor': self.extract_comments(aid), 'http_headers': headers, } class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { 'id': '24097891', }, 'playlist_mincount': 25, }] def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) ss_id = self._search_json( r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) class BiliBiliBangumiSeasonIE(BilibiliBaseIE): _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { 'id': '26801' }, 'playlist_mincount': 26 }] def _real_extract(self, url): ss_id = self._match_id(url) return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): def _extract_playlist(self, fetch_page, get_metadata, get_entries): first_page = fetch_page(0) metadata = get_metadata(first_page) paged_list = InAdvancePagedList( lambda idx: get_entries(fetch_page(idx) if idx else first_page), metadata['page_count'], metadata['page_size']) return metadata, paged_list class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)(?P