From d71eb83b057d4933c3a0c655951ea4ad7a36c132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Diego=20Fernando=20Rodr=C3=ADguez=20Var=C3=B3n?=
Date: Thu, 19 Nov 2020 23:51:43 -0500
Subject: [PATCH] Extract embedded youtube and twitter videos

---
 youtube_dlc/extractor/tmz.py | 50 ++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/youtube_dlc/extractor/tmz.py b/youtube_dlc/extractor/tmz.py
index a2f100922..aee2273b8 100644
--- a/youtube_dlc/extractor/tmz.py
+++ b/youtube_dlc/extractor/tmz.py
@@ -1,7 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_attribute,
+)
 
 
 class TMZIE(InfoExtractor):
@@ -97,11 +103,55 @@ class TMZIE(InfoExtractor):
                 "upload_date": "20201031",
             },
         },
+        {
+            "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/",
+            "info_dict": {
+                "id": "Dddb6IGe-ws",
+                "ext": "mp4",
+                "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing",
+                "uploader": "ESNEWS",
+                "description": "md5:49675bc58883ccf80474b8aa701e1064",
+                "upload_date": "20201101",
+                "uploader_id": "ESNEWS",
+            },
+        },
+        {
+            "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/",
+            "info_dict": {
+                "id": "1329450007125225473",
+                "ext": "mp4",
+                "title": "TheMacLife - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.",
+                "uploader": "TheMacLife",
+                "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69",
+                "upload_date": "20201119",
+                "uploader_id": "Maclifeofficial",
+                "timestamp": 1605800556,
+            },
+        },
     ]
 
     def _real_extract(self, url):
         webpage = self._download_webpage(url, url)
         jsonld = self._search_json_ld(webpage, url)
+        if not jsonld or "url" not in jsonld:
+            # try to extract from YouTube Player API
+            # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions
+            match_obj = re.search(r'\.cueVideoById\(\s*(?P<quote>[\'"])(?P<id>.*?)(?P=quote)', webpage)
+            if match_obj:
+                res = self.url_result(match_obj.group("id"))
+                return res
+            # try to extract from twitter
+            blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage)
+            if blockquote_el:
+                matches = re.findall(
+                    r'<a[^>]+href=\s*(?P<quote>[\'"])(?P<link>.*?)(?P=quote)',
+                    blockquote_el)
+                if matches:
+                    for _, match in matches:
+                        if "/status/" in match:
+                            res = self.url_result(match)
+                            return res
+            raise ExtractorError("No video found!")
         if id not in jsonld:
             jsonld["id"] = url
         return jsonld