From f0d785d3ed59e879a69f69f3c9334754f11747e0 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Fri, 7 Jan 2022 11:03:02 +0000 Subject: [PATCH] [youtube:tab] Extract more playlist metadata (#2069) * Add fields modified_date, modified_timestamp * Add field playlist_count * [youtube:tab] Extract view_count, playlist_count, modified_date Authored by: coletdjnz, pukkandan --- README.md | 5 +++- yt_dlp/YoutubeDL.py | 17 ++++++++---- yt_dlp/extractor/common.py | 16 ++++++++--- yt_dlp/extractor/youtube.py | 53 +++++++++++++++++++++++++------------ 4 files changed, 65 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index db559c83e..b40f5c693 100644 --- a/README.md +++ b/README.md @@ -1120,8 +1120,10 @@ # OUTPUT TEMPLATE - `creator` (string): The creator of the video - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date (YYYYMMDD) - - `release_date` (string): The date (YYYYMMDD) when the video was released - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released + - `release_date` (string): The date (YYYYMMDD) when the video was released + - `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified + - `modified_date` (string): The date (YYYYMMDD) when the video was last modified - `uploader_id` (string): Nickname or id of the video uploader - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel @@ -1167,6 +1169,7 @@ # OUTPUT TEMPLATE - `video_autonumber` (numeric): Number that will be increased with each video - `n_entries` (numeric): Total number of extracted items in the playlist - `playlist` (string): Name or id of the playlist that contains the video + - `playlist_count` (numeric): Total number of items in the playlist. 
May not be known if entire playlist is not extracted - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according the final index - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 463251789..dff4b17b3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1636,14 +1636,15 @@ def iter_playlistitems(format): playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) ie_entries = ie_result['entries'] - msg = ( - 'Downloading %d videos' if not isinstance(ie_entries, list) - else 'Collected %d videos; downloading %%d of them' % len(ie_entries)) - if isinstance(ie_entries, list): + playlist_count = len(ie_entries) + msg = f'Collected {playlist_count} videos; downloading %d of them' + ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count + def get_entry(i): return ie_entries[i - 1] else: + msg = 'Downloading %d videos' if not isinstance(ie_entries, (PagedList, LazyList)): ie_entries = LazyList(ie_entries) @@ -1652,7 +1653,7 @@ def get_entry(i): lambda self, i: ie_entries[i - 1] )(self, i) - entries = [] + entries, broken = [], False items = playlistitems if playlistitems is not None else itertools.count(playliststart) for i in items: if i == 0: @@ -1674,6 +1675,7 @@ def get_entry(i): if entry is not None: self._match_entry(entry, incomplete=True, silent=True) except (ExistingVideoReached, RejectedVideoReached): + broken = True break ie_result['entries'] = entries @@ -1684,6 +1686,9 @@ def get_entry(i): if entry is not None] n_entries = len(entries) + if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend): + ie_result['playlist_count'] = n_entries + if not playlistitems and (playliststart != 1 or playlistend): playlistitems = list(range(playliststart, 
playliststart + n_entries)) ie_result['requested_entries'] = playlistitems @@ -1733,6 +1738,7 @@ def get_entry(i): extra = { 'n_entries': n_entries, '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), + 'playlist_count': ie_result.get('playlist_count'), 'playlist_index': playlist_index, 'playlist_autonumber': i, 'playlist': playlist, @@ -2331,6 +2337,7 @@ def sanitize_numeric_fields(info): for ts_key, date_key in ( ('timestamp', 'upload_date'), ('release_timestamp', 'release_date'), + ('modified_timestamp', 'modified_date'), ): if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: # Working around out-of-range timestamp values (e.g. negative ones on Windows, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 79f53c9c2..7c83991ea 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -243,11 +243,16 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. + If not explicitly set, calculated from timestamp + release_timestamp: UNIX timestamp of the moment the video was released. + If it is not clear whether to use timestamp or this, use the former + release_date: The date (YYYYMMDD) when the video was released. + If not explicitly set, calculated from release_timestamp + modified_timestamp: UNIX timestamp of the moment the video was last modified. + modified_date: The date (YYYYMMDD) when the video was last modified. + If not explicitly set, calculated from modified_timestamp uploader_id: Nickname or id of the video uploader. 
uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. @@ -383,6 +388,11 @@ class InfoExtractor(object): Additionally, playlists can have "id", "title", and any other relevent attributes with the same semantics as videos (see above). + It can also have the following optional fields: + + playlist_count: The total number of videos in a playlist. If not given, + YoutubeDL tries to calculate it from "entries" + _type "multi_video" indicates that there are multiple videos that form a single show, for examples multiple acts of an opera or TV episode. diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 65d59802b..d266a36c6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -62,6 +62,7 @@ try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -667,6 +668,14 @@ def _get_text(data, *path_list, max_runs=None): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + @staticmethod def _extract_thumbnails(data, *path_list): """ @@ -695,12 +704,15 @@ def _extract_thumbnails(data, *path_list): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' """ - mobj = re.search(r'(?P