yt-dlp/yt_dlp/extractor/rumble.py

import itertools
import re

from .common import InfoExtractor
from ..compat import compat_str, compat_HTTPError
from ..utils import (
    determine_ext,
    int_or_none,
    parse_iso8601,
    try_get,
    unescapeHTML,
    ExtractorError,
)


class RumbleEmbedIE(InfoExtractor):
    """Extractor for Rumble embed player URLs (``rumble.com/embed/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
        }
    }, {
        'url': 'https://rumble.com/embed/vslb7v',
        'md5': '7418035de1a30a178b8af34dc2b6a52b',
        'info_dict': {
            'id': 'vslb7v',
            'ext': 'mp4',
            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
            'timestamp': 1645142135,
            'upload_date': '20220217',
            'channel_url': 'https://rumble.com/c/CyberTechNews',
            'channel': 'CTNews',
            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
            'duration': 901,
        }
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return all Rumble embed URLs referenced in *webpage*.

        A URL is picked up when it is the quoted ``src`` of a ``<script>`` or
        ``<iframe>`` tag, or the quoted value of an ``embedUrl`` key. The
        result is a list of matched URL strings in document order.
        """
        embed_re = (
            r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)'
            % RumbleEmbedIE._VALID_URL)
        found = []
        for match in re.finditer(embed_re, webpage):
            found.append(match.group('url'))
        return found

    @staticmethod
    def _formats_from_ua(ua_map):
        """Yield one format dict per media URL found in *ua_map*.

        Each key of *ua_map* is used as the format height. For each value,
        positions 0 and 1 are probed for a string URL, and the entry two
        positions further on is probed for a ``bitrate`` field.
        """
        for height, ua in ua_map.items():
            for idx in (0, 1):
                media_url = try_get(ua, lambda x: x[idx], compat_str)
                if not media_url:
                    continue
                ext = determine_ext(media_url)
                fmt = {
                    'ext': ext,
                    'format_id': '%s-%sp' % (ext, height),
                    'height': int_or_none(height),
                    'url': media_url,
                }
                # Bitrate metadata, when present, sits two slots after the URL.
                bitrate = try_get(ua, lambda x: x[idx + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                yield fmt

    def _real_extract(self, url):
        """Fetch the embedJS metadata for *url* and build the info dict.

        Raises ``KeyError`` if the response has no ``title`` entry, since the
        title is accessed by subscript.
        """
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        # The title is looked up before formats so a missing title fails first.
        title = unescapeHTML(video['title'])

        formats = list(self._formats_from_ua(video.get('ua') or {}))
        self._sort_formats(formats)

        author = video.get('author') or {}

        info = {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author.get('name'),
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
        }
        return info


class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for Rumble ``/c/<name>`` and ``/user/<name>`` pages."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_count': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Lazily yield a URL result for every video link on each listing page.

        Pages are requested as ``<url>?page=N`` starting from N=1. Iteration
        stops at the first page whose download fails with HTTP 404; any other
        download error is re-raised.
        """
        # NOTE(review): the only stop condition is a 404 response; a server
        # that answers 200 with an empty page would keep this loop going.
        for page_num in itertools.count(1):
            page_url = f'{url}?page={page_num}'
            try:
                webpage = self._download_webpage(
                    page_url, playlist_id, note='Downloading page %d' % page_num)
            except ExtractorError as err:
                cause = err.cause
                if not isinstance(cause, compat_HTTPError) or cause.code != 404:
                    raise
                return
            video_paths = re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage)
            for path in video_paths:
                yield self.url_result('https://rumble.com' + path)

    def _real_extract(self, url):
        """Build a playlist result from the channel/user URL matched by *url*."""
        mobj = self._match_valid_url(url)
        # Use the matched base URL so pagination is appended to it rather
        # than to whatever trailing text the input URL carried.
        base_url = mobj.group('url')
        playlist_id = mobj.group('id')
        return self.playlist_result(
            self.entries(base_url, playlist_id), playlist_id=playlist_id)
[Rumble] Add RumbleChannelIE (#1088) Authored by: Ashish0804 2021-09-27 23:01:23 +02:00			`import itertools`
[rumble] Add support for video page (Closes #80) 2021-02-15 15:37:03 +01:00			`import re`

Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00			`from .common import InfoExtractor`
[Rumble] Add RumbleChannelIE (#1088) Authored by: Ashish0804 2021-09-27 23:01:23 +02:00			`from ..compat import compat_str, compat_HTTPError`
Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00			`from ..utils import (`
			`determine_ext,`
			`int_or_none,`
			`parse_iso8601,`
			`try_get,`
[rumble] unescape title 2022-03-16 21:07:04 +01:00			`unescapeHTML,`
[Rumble] Add RumbleChannelIE (#1088) Authored by: Ashish0804 2021-09-27 23:01:23 +02:00			`ExtractorError,`
Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00			`)`


			`class RumbleEmbedIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'`
			`_TESTS = [{`
			`'url': 'https://rumble.com/embed/v5pv5f',`
			`'md5': '36a18a049856720189f30977ccbb2c34',`
			`'info_dict': {`
			`'id': 'v5pv5f',`
			`'ext': 'mp4',`
			`'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',`
			`'timestamp': 1571611968,`
			`'upload_date': '20191020',`
			`}`
[rumble] unescape title 2022-03-16 21:07:04 +01:00			`}, {`
			`'url': 'https://rumble.com/embed/vslb7v',`
			`'md5': '7418035de1a30a178b8af34dc2b6a52b',`
			`'info_dict': {`
			`'id': 'vslb7v',`
			`'ext': 'mp4',`
			`'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',`
			`'timestamp': 1645142135,`
			`'upload_date': '20220217',`
			`'channel_url': 'https://rumble.com/c/CyberTechNews',`
			`'channel': 'CTNews',`
			`'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',`
			`'duration': 901,`
			`}`
Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00			`}, {`
			`'url': 'https://rumble.com/embed/ufe9n.v5pv5f',`
			`'only_matching': True,`
			`}]`

[rumble] Add support for video page (Closes #80) 2021-02-15 15:37:03 +01:00			`@staticmethod`
			`def _extract_urls(webpage):`
			`return [`
			`mobj.group('url')`
			`for mobj in re.finditer(`
			`r'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s:\s)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL,`
			`webpage)]`

Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
			`video = self._download_json(`
			`'https://rumble.com/embedJS/', video_id,`
			`query={'request': 'video', 'v': video_id})`
[rumble] unescape title 2022-03-16 21:07:04 +01:00			`title = unescapeHTML(video['title'])`
Update to release 2020.11.24 except youtube and skyit extractors 2020-11-23 22:03:08 +01:00
			`formats = []`
			`for height, ua in (video.get('ua') or {}).items():`
			`for i in range(2):`
			`f_url = try_get(ua, lambda x: x[i], compat_str)`
			`if f_url:`
			`ext = determine_ext(f_url)`
			`f = {`
			`'ext': ext,`
			`'format_id': '%s-%sp' % (ext, height),`
			`'height': int_or_none(height),`
			`'url': f_url,`
			`}`
			`bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])`
			`if bitrate:`
			`f['tbr'] = int_or_none(bitrate)`
			`formats.append(f)`
			`self._sort_formats(formats)`

			`author = video.get('author') or {}`

			`return {`
			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
			`'thumbnail': video.get('i'),`
			`'timestamp': parse_iso8601(video.get('pubDate')),`
			`'channel': author.get('name'),`
			`'channel_url': author.get('url'),`
			`'duration': int_or_none(video.get('duration')),`
			`}`
[Rumble] Add RumbleChannelIE (#1088) Authored by: Ashish0804 2021-09-27 23:01:23 +02:00

			`class RumbleChannelIE(InfoExtractor):`
			`_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'`

			`_TESTS = [{`
			`'url': 'https://rumble.com/c/Styxhexenhammer666',`
			`'playlist_mincount': 1160,`
			`'info_dict': {`
			`'id': 'Styxhexenhammer666',`
			`},`
			`}, {`
			`'url': 'https://rumble.com/user/goldenpoodleharleyeuna',`
			`'playlist_count': 4,`
			`'info_dict': {`
			`'id': 'goldenpoodleharleyeuna',`
			`},`
			`}]`

			`def entries(self, url, playlist_id):`
			`for page in itertools.count(1):`
			`try:`
			`webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)`
			`except ExtractorError as e:`
			`if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:`
			`break`
			`raise`
			`for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):`
			`yield self.url_result('https://rumble.com' + video_url)`

			`def _real_extract(self, url):`
			`url, playlist_id = self._match_valid_url(url).groups()`
			`return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)`