modified: yt_dlp/extractor/bilibili.py

2024-10-04 16:37:09 +02:00 · 2024-06-30 20:53:08 +12:00 · 2024-06-30 20:53:08 +12:00 · ddca238423
commit ddca238423
parent 72fac58401
1 changed files with 50 additions and 28 deletions
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@ -1530,38 +1530,44 @@ class BiliBiliSearchAllIE(SearchInfoExtractor):

    def _search_results(self, query):
        headers = self.geo_verification_headers()
+        page_size = 50
        live_room_prefix = 'https://live.bilibili.com/'
        bili_user_prefix = 'https://space.bilibili.com/'
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
        for page_num in itertools.count(1):
+            query_params = {
+                'keyword': query,
+                'page': page_num,
+                'dynamic_offset': (page_num - 1) * page_size,
+                'platform': 'pc',
+            }
+            api_url = r'https://api.bilibili.com/x/web-interface/search/all/v2'
            try:
                search_all_result = self._download_json(
-                    r'https://api.bilibili.com/x/web-interface/search/all/v2',
-                    video_id=query, query={
-                        'keyword': query,
-                        'page': page_num,
-                    }, headers=headers)
+                    api_url, video_id=query, query=query_params, headers=headers)
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                    raise ExtractorError('Request is blocked by server (-412).', expected=True)
+                raise
            status_code = search_all_result['code']
            if status_code == -400:
                raise ExtractorError('Invalid request (-400).', expected=True)
            result_list = search_all_result['data'].get('result')
-            if result_list is None:
+            if not result_list:
                self.write_debug(f'Response: {search_all_result}')
                raise ExtractorError(f'Result not found in the response ({status_code}).',
                                     expected=True)
            for result_type_dict in result_list:
                for result_data in result_type_dict['data']:
-                    if result_data['type'] == 'video':
+                    result_type = result_data.get('type')
+                    if result_type == 'video':
                        yield self.url_result(result_data['arcurl'])
-                    elif result_data['type'] == 'live_room':
+                    elif result_type == 'live_room':
                        yield self.url_result(live_room_prefix + str(result_data['roomid']))
-                    elif result_data['type'] in ['media_ft', 'media_bangumi']:
+                    elif result_type in ['media_ft', 'media_bangumi']:
                        yield self.url_result(result_data['url'])
-                    elif result_data['type'] == 'bili_user':
+                    elif result_type == 'bili_user':
                        yield self.url_result(bili_user_prefix + str(result_data['mid']))


@ -2315,7 +2321,7 @@ class BiliBiliSearchPageIE(BilibiliBaseIE):
    _VALID_URL = r'https?://search\.bilibili\.com/(?P<type>all|video|bangumi|pgc|live|upuser).*'
    _TESTS = [{
        'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8',
-        'playlist_count': 20,
+        'playlist_count': 36,
        'info_dict': {
            'id': 'yt - dlp 下载器',
            'title': 'yt - dlp 下载器',
@ -2330,7 +2336,7 @@ class BiliBiliSearchPageIE(BilibiliBaseIE):
        'skip': 'geo-restricted',
    }, {
        'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72',
-        'playlist_mincount': 5,
+        'playlist_count': 4,
        'info_dict': {
            'id': '让子弹飞',
            'title': '让子弹飞',
@ -2338,15 +2344,20 @@ class BiliBiliSearchPageIE(BilibiliBaseIE):
    }]

    def _real_extract(self, url):
+        live_room_prefix = 'https://live.bilibili.com/'
+        bili_user_prefix = 'https://space.bilibili.com/'
        headers = self.geo_verification_headers()
        entries = []
        params = parse_qs(url)
-        query = {}
+        query = {
+            'platform': 'pc',
+            'page_size': 36,
+        }
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
        search_type = self._match_valid_url(url).group('type')
        raw_playlist_id = traverse_obj(params, ('keyword', 0))
-        if raw_playlist_id is None:
+        if not raw_playlist_id:
            raise ExtractorError('Please specify the keyword to search for!', expected=True)
        playlist_id = urllib.parse.unquote_plus(raw_playlist_id)
        search_type_mapping = {
@ -2355,6 +2366,7 @@ def _real_extract(self, url):
            'pgc': 'media_ft',
            'live': 'live_room',
            'upuser': 'bili_user',
+            'all': 'video',  # 'all' search calls video search after page 1
        }
        valid_params = [
            'keyword',
@ -2362,7 +2374,7 @@ def _real_extract(self, url):
            'order',
            'duration',
            'tids',
-            'search_type',
+            'search_type',  # Only when searching for live_room or live_user
            'order_sort',
            'user_type',
        ]
@ -2370,11 +2382,17 @@ def _real_extract(self, url):
            param_value = traverse_obj(params, (valid_param, 0))
            if param_value is not None:
                query[valid_param] = param_value
-        live_room_prefix = 'https://live.bilibili.com/'
-        bili_user_prefix = 'https://space.bilibili.com/'
+        page_num = int(query.get('page', 1))
+        param_offset = int_or_none(traverse_obj(params, ('o', 0)))
+        if page_num == 1:
+            query['dynamic_offset'] = 0
+        elif param_offset is not None:
+            query['dynamic_offset'] = param_offset
+        else:
+            query['dynamic_offset'] = query['page_size'] * (page_num - 1)
        if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user':
            raise ExtractorError('Live users are not downloadable!', expected=True)
-        if search_type == 'all':
+        if search_type == 'all' and page_num == 1:
            try:
                search_all_result = self._download_json(
                    r'https://api.bilibili.com/x/web-interface/search/all/v2',
@ -2387,18 +2405,20 @@ def _real_extract(self, url):
            if status_code == -400:
                raise ExtractorError('Invalid request (-400).', expected=True)
            result_list = search_all_result['data'].get('result')
-            if result_list is None:
+            if not result_list:
+                self.write_debug(f'Response: {search_all_result}')
                raise ExtractorError(f'Result not found in the response ({status_code}).',
                                     expected=True)
            for result_type_dict in result_list:
                for result_data in result_type_dict['data']:
-                    if result_data['type'] == 'video':
+                    result_type = result_data.get('type')
+                    if result_type == 'video':
                        entries.append(self.url_result(result_data['arcurl']))
-                    elif result_data['type'] == 'live_room':
+                    elif result_type == 'live_room':
                        entries.append(self.url_result(live_room_prefix + str(result_data['roomid'])))
-                    elif result_data['type'] in ['media_ft', 'media_bangumi']:
+                    elif result_type in ['media_ft', 'media_bangumi']:
                        entries.append(self.url_result(result_data['url']))
-                    elif result_data['type'] == 'bili_user':
+                    elif result_type == 'bili_user':
                        entries.append(self.url_result(bili_user_prefix + str(result_data['mid'])))
        else:
            try:
@ -2416,18 +2436,20 @@ def _real_extract(self, url):
            if status_code == -400:
                raise ExtractorError('Invalid request (-400).')
            result_list = search_type_result['data'].get('result')
-            if result_list is None:
+            if not result_list:
+                self.write_debug(f'Response: {search_type_result}')
                raise ExtractorError(
                    f'Result not found in the response ({status_code}). '
                    'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
            for result_data in result_list:
-                if result_data['type'] == 'video':
+                result_type = result_data.get('type')
+                if result_type == 'video':
                    entries.append(self.url_result(result_data['arcurl']))
-                elif result_data['type'] == 'live_room':
+                elif result_type == 'live_room':
                    entries.append(self.url_result(live_room_prefix + str(result_data['roomid'])))
-                elif result_data['type'] in ['media_ft', 'media_bangumi']:
+                elif result_type in ['media_ft', 'media_bangumi']:
                    entries.append(self.url_result(result_data['url']))
-                elif result_data['type'] == 'bili_user':
+                elif result_type == 'bili_user':
                    entries.append(self.url_result(bili_user_prefix + str(result_data['mid'])))

        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id)