From 6857df609b60859e2864aadc61a869689d5ad2d0 Mon Sep 17 00:00:00 2001 From: WolfganP <2248211+WolfganP@users.noreply.github.com> Date: Sun, 8 Nov 2020 14:07:12 +0000 Subject: [PATCH] ITV BTCC new pages' URL update (articles instead of races) Not my changes, but from @franhp that didn't get merged into yt-dl in time It supports BTCC new pages' schema from 2019 onward (/articles/ instead of /races/) --- youtube_dlc/extractor/itv.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py index ad2f4eca5..9817745e8 100644 --- a/youtube_dlc/extractor/itv.py +++ b/youtube_dlc/extractor/itv.py @@ -20,6 +20,7 @@ merge_dicts, parse_duration, smuggle_url, + try_get, url_or_none, xpath_with_ns, xpath_element, @@ -280,12 +281,12 @@ def extract_subtitle(sub_url): class ITVBTCCIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P[^/?#&]+)' _TEST = { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', 'info_dict': { - 'id': 'btcc-2018-all-the-action-from-brands-hatch', - 'title': 'BTCC 2018: All the action from Brands Hatch', + 'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', }, - 'playlist_mincount': 9, + 'playlist_mincount': 12, } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' @@ -294,6 +295,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) + json_map = try_get(self._parse_json(self._html_search_regex( + '(?s)]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)', webpage, 'json_map'), playlist_id), + lambda x: x['props']['pageProps']['article']['body']['content']) or [] + + # Discard empty objects + video_ids = [] + for video in json_map: + if video['data'].get('id'): + video_ids.append(video['data']['id']) +
entries = [ self.url_result( smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { @@ -305,7 +316,7 @@ def _real_extract(self, url): 'referrer': url, }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + for video_id in video_ids] title = self._og_search_title(webpage, fatal=False)