From 7d7aaa135da2073c9598ec82978e87c1d8ebcb74 Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 11:56:08 +0200 Subject: [PATCH 1/7] extract missing channel follower count for facebook --- yt_dlp/extractor/facebook.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index b76407a5c..e536fcf65 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -481,6 +481,32 @@ def _extract_from_url(self, url, video_id): webpage = self._download_webpage( url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) + def extract_follower_count(webpage): + post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( + r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] + post = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + + with open('post.json', 'w') as f: + json.dump(post, f) + + followers = get_first(post, ('user', 'profile_header_renderer', 'user', 'profile_social_context', 'content', ..., 'text', + lambda k, v: k == 'text' and isinstance(v, str) and v.endswith('followers'))) or None + if not isinstance(followers, str): + return None + + matches = re.search(r"(\d+)([K|M])?", followers) + if matches is None: + return None + + count = int(matches[1]) + unit = matches[2] + if unit == "K": + count *= 1_000 + elif unit == "M": + count *= 1_000_000 + return count + def extract_metadata(webpage): post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] @@ -504,6 +530,8 @@ def extract_metadata(webpage): or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) or get_first(post, ('event', 'event_creator', {dict})) or {}) + uploader_profile = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., {dict})) or {} + profile_url = uploader_profile.get('profile_url') uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( @@ -529,6 +557,7 @@ def extract_metadata(webpage): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), + 'profile_url': profile_url, } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -711,6 +740,11 @@ def parse_attachment(attachment, key='media'): video_info = entries[0] if entries else {'id': video_id} webpage_info = extract_metadata(webpage) + if profile_url := webpage_info.get('profile_url'): + profile_page = self._download_webpage(profile_url, None) + follower_count = extract_follower_count(profile_page) + webpage_info['channel_follower_count'] = follower_count + del webpage_info['profile_url'] # honor precise duration in video info if video_info.get('duration'): webpage_info['duration'] = video_info['duration'] From 30647d27def29d6d5bccf7f0ce6ab55cba253972 Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 12:08:01 +0200 Subject: [PATCH 2/7] remove debug dump --- yt_dlp/extractor/facebook.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index e536fcf65..a735c44c8 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -487,9 +487,6 @@ def extract_follower_count(webpage): post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - with open('post.json', 'w') as f: - json.dump(post, f) - followers = get_first(post, ('user', 'profile_header_renderer', 'user', 'profile_social_context', 'content', ..., 'text', lambda k, v: k == 'text' and isinstance(v, str) and v.endswith('followers'))) or None if not isinstance(followers, str): From 8f95c7a3b11ed1601aa430badc90535420775889 Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 13:07:25 +0200 Subject: [PATCH 3/7] remove unnecessary var def --- yt_dlp/extractor/facebook.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a735c44c8..4b94fbbce 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -527,8 +527,7 @@ def extract_metadata(webpage): or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) or get_first(post, ('event', 'event_creator', {dict})) or {}) - uploader_profile = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., {dict})) or {} - profile_url = uploader_profile.get('profile_url') + profile_url = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., 'profile_url')) or None uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( From 0c0a631e992d9ed987e36b713934c7f60c8b404f Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 13:26:53 +0200 Subject: [PATCH 4/7] remove profile url shenanigans --- yt_dlp/extractor/facebook.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 4b94fbbce..a5a0b5c06 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -542,6 +542,7 @@ def extract_metadata(webpage): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None + follower_count = extract_follower_count(self._download_webpage(profile_url, None)) info_dict = { 'description': description, 'uploader': uploader, @@ -553,7 +554,7 @@ def extract_metadata(webpage): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), - 'profile_url': profile_url, + 'channel_follower_count': follower_count, } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -736,11 +737,6 @@ def parse_attachment(attachment, key='media'): video_info = entries[0] if entries else {'id': video_id} webpage_info = extract_metadata(webpage) - if profile_url := webpage_info.get('profile_url'): - profile_page = self._download_webpage(profile_url, None) - follower_count = extract_follower_count(profile_page) - webpage_info['channel_follower_count'] = follower_count - del webpage_info['profile_url'] # honor precise duration in video info if video_info.get('duration'): webpage_info['duration'] = video_info['duration'] From 6d20f7dc69ab49b58d1a6ecec1cd10a7efa6a08c Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 13:29:04 +0200 Subject: [PATCH 5/7] move profile_url closer to its usage --- yt_dlp/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a5a0b5c06..364f8ca4b 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -527,7 +527,6 @@ def extract_metadata(webpage): or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) or get_first(post, ('event', 'event_creator', {dict})) or {}) - profile_url = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., 'profile_url')) or None uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( @@ -542,6 +541,7 @@ def extract_metadata(webpage): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None + profile_url = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., 'profile_url')) or None follower_count = extract_follower_count(self._download_webpage(profile_url, None)) info_dict = { 'description': description, From 25f44777c0b7f54a1948eb162a46dcb8703b494a Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 13:43:44 +0200 Subject: [PATCH 6/7] apply formatting --- yt_dlp/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 364f8ca4b..2eb74e797 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -488,7 +488,7 @@ def extract_follower_count(webpage): ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] followers = get_first(post, ('user', 'profile_header_renderer', 'user', 'profile_social_context', 'content', ..., 'text', - lambda k, v: k == 'text' and isinstance(v, str) and v.endswith('followers'))) or None + lambda k, v: k == 'text' and isinstance(v, str) and v.endswith('followers'))) or None if not isinstance(followers, str): return None From c1589ea3fe3edaeb643118f7b8d425da97597ea5 Mon Sep 17 00:00:00 2001 From: "sebastian.kondraciuk" Date: Mon, 27 May 2024 13:52:03 +0200 Subject: [PATCH 7/7] remove or None --- yt_dlp/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 2eb74e797..fb3e36954 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -541,7 +541,7 @@ def extract_metadata(webpage): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None - profile_url = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., 'profile_url')) or None + profile_url = get_first(post, ('attachments', ..., 'media', 'creation_story', 'comet_sections', 'actor_photo', 'story', 'actors', ..., 'profile_url')) follower_count = extract_follower_count(self._download_webpage(profile_url, None)) info_dict = { 'description': description,