From 9652bca1bd02f6bc1b8cb1e186f2ccbf32225561 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Sep 2023 19:38:09 -0500 Subject: [PATCH] [ie/web.archive:vlive] Remove extractor (#8132) Closes #8122 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/archiveorg.py | 235 -------------------------------- yt_dlp/extractor/naver.py | 2 +- 3 files changed, 1 insertion(+), 237 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4fed6d66a..bf0c67542 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -122,7 +122,6 @@ from .archiveorg import ( ArchiveOrgIE, YoutubeWebArchiveIE, - VLiveWebArchiveIE, ) from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 2541cd6fd..a0b26ac5a 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -3,7 +3,6 @@ import urllib.parse from .common import InfoExtractor -from .naver import NaverBaseIE from .youtube import YoutubeBaseInfoExtractor, YoutubeIE from ..compat import compat_urllib_parse_unquote from ..networking import HEADRequest @@ -947,237 +946,3 @@ def _real_extract(self, url): if not info.get('title'): info['title'] = video_id return info - - -class VLiveWebArchiveIE(InfoExtractor): - IE_NAME = 'web.archive:vlive' - IE_DESC = 'web.archive.org saved vlive videos' - _VALID_URL = r'''(?x) - (?:https?://)?web\.archive\.org/ - (?:web/)?(?:(?P[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional - (?:https?(?::|%3[Aa])//)?(?: - (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P[0-9]+) # VLive URL - ) - ''' - _TESTS = [{ - 'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - 'uploader_url': None, - 'uploader': None, - 'upload_date': '20150817', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1439816449, - 'like_count': int, - 'channel': 'Girl\'s Day', - 'channel_id': 'FDF27', - 'comment_count': int, - 'release_timestamp': 1439818140, - 'release_date': '20150817', - 'duration': 1014, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937', - 'info_dict': { - 'id': '16937', - 'ext': 'mp4', - 'title': '첸백시 걍방', - 'creator': 'EXO', - 'view_count': int, - 'subtitles': 'mincount:12', - 'uploader_id': 'muploader_j', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20161112', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1478923074, - 'like_count': int, - 'channel': 'EXO', - 'channel_id': 'F94BD', - 'comment_count': int, - 'release_timestamp': 1478924280, - 'release_date': '20161112', - 'duration': 906, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870', - 'info_dict': { - 'id': '101870', - 'ext': 'mp4', - 'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)', - 'creator': 'Dispatch', - 'view_count': int, - 'subtitles': 'mincount:6', - 'uploader_id': 'V__FRA08071', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20181130', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1543601327, - 'like_count': int, - 'channel': 'Dispatch', - 'channel_id': 'C796F3', - 'comment_count': int, - 'release_timestamp': 1543601040, - 'release_date': '20181130', - 'duration': 279, - }, - 'params': { - 'skip_download': True, - }, - }] - - # The wayback machine has special timestamp and "mode" values: - # timestamp: - # 1 = the first capture - # 2 = the last capture - # mode: - # id_ = Identity - perform no alterations of the original resource, return it as it was archived. - _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/' - - def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs): - for retry in self.RetryManager(): - try: - return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 404: - raise ExtractorError('Page was not archived', expected=True) - retry.error = e - continue - - def _download_archived_json(self, url, video_id, **kwargs): - page = self._download_archived_page(url, video_id, **kwargs) - if not page: - raise ExtractorError('Page was not archived', expected=True) - else: - return self._parse_json(page, video_id) - - def _extract_formats_from_m3u8(self, m3u8_url, params, video_id): - m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False) - if not m3u8_doc: - return - - # M3U8 document should be changed to archive domain - m3u8_doc = m3u8_doc.splitlines() - url_base = m3u8_url.rsplit('/', 1)[0] - first_segment = None - for i, line in enumerate(m3u8_doc): - if not line.startswith('#'): - m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}' - first_segment = first_segment or m3u8_doc[i] - - # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870 - urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False, - fatal=False, note='Check first segment availablity') - if urlh: - formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id) - if subtitles: - self._report_ignoring_subs('m3u8') - return formats - - # Closely follows the logic of the ArchiveTeam grab script - # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua - def _real_extract(self, url): - video_id, url_date = self._match_valid_url(url).group('id', 'date') - - webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date) - - player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id) - user_country = traverse_obj(player_info, ('common', 'userCountry')) - - main_script_url = self._search_regex(r' 1: - self.report_warning('Multiple streams found. Only the first stream will be downloaded.') - stream = streams[0] - - max_stream = max( - stream.get('videos') or [], - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_stream is not None: - params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'} - formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or [] - - # For parts of the project MP4 files were archived - max_video = max( - traverse_obj(vod_data, ('videos', 'list', ...)), - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_video is not None: - video_url = self._WAYBACK_BASE_URL + max_video.get('source') - urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False, - fatal=False, note='Check video availablity') - if urlh: - formats.append({'url': video_url}) - - return { - 'id': video_id, - 'formats': formats, - **traverse_obj(player_info, ('postDetail', 'post', { - 'title': ('officialVideo', 'title', {str}), - 'creator': ('author', 'nickname', {str}), - 'channel': ('channel', 'channelName', {str}), - 'channel_id': ('channel', 'channelCode', {str}), - 'duration': ('officialVideo', 'playTime', {int_or_none}), - 'view_count': ('officialVideo', 'playCount', {int_or_none}), - 'like_count': ('officialVideo', 'likeCount', {int_or_none}), - 'comment_count': ('officialVideo', 'commentCount', {int_or_none}), - 'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}), - 'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}), - })), - **traverse_obj(vod_data, ('meta', { - 'uploader_id': ('user', 'id', {str}), - 'uploader': ('user', 'name', {str}), - 'uploader_url': ('user', 'url', {url_or_none}), - 'thumbnail': ('cover', 'source', {url_or_none}), - }), expected_type=lambda x: x or None), - **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]), - } diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index d79caf5f3..2d8459b02 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE + @staticmethod # NB: Used in WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)):