yt-dlp/yt_dlp/extractor/bitchute.py

import itertools
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    GeoRestrictedError,
    orderedSet,
    unified_strdate,
    urlencode_postdata,
)


class BitChuteIE(InfoExtractor):
    """Extractor for single BitChute videos (video, embed and torrent URLs)."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
        'md5': '7e427d7ed7af5a75b5855705ec750e2b',
        'info_dict': {
            # Must equal the ID captured from the URL above, because
            # _real_extract() returns exactly that value as 'id'.
            'id': 'UGlrF9o9b-Q',
            'ext': 'mp4',
            'title': 'This is the first video on #BitChute !',
            'description': 'md5:a0337e7b1fe39e32336974af8173a034',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20170103',
        },
    }, {
        'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
        'only_matching': True,
    }, {
        'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the video page for ``url`` and build its info dict.

        Format URLs are taken from ``addWebSeed(...)`` calls and ``as=`` query
        parameters found in the page; if there are none, HTML5 media entries
        are parsed instead.

        Raises:
            GeoRestrictedError: no media was found and the page title is
                'Video Unavailable'.
            ExtractorError: no media was found for any other reason (the page
                title, or 'Cannot find video', is used as the message).
        """
        video_id = self._match_id(url)

        # Always fetch the canonical /video/ page, whichever URL form matched.
        # A browser-like User-Agent is sent explicitly with the request.
        webpage = self._download_webpage(
            f'https://www.bitchute.com/video/{video_id}', video_id, headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
            })

        # Title lookup order: #video-title element or <title>, then the
        # 'description' meta tag, then the OpenGraph description.
        title = self._html_search_regex(
            (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
            webpage, 'title', default=None) or self._html_search_meta(
            'description', webpage, 'title',
            default=None) or self._og_search_description(webpage)

        # Quoted first argument of every addWebSeed(...) call in the page ...
        format_urls = [
            mobj.group('url') for mobj in re.finditer(
                r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)]
        # ... followed by every http(s) URL passed as an `as=` parameter.
        format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))

        # orderedSet() drops duplicate URLs while keeping first-seen order.
        formats = [
            {'url': format_url}
            for format_url in orderedSet(format_urls)]

        if not formats:
            entries = self._parse_html5_media_entries(
                url, webpage, video_id)
            if not entries:
                error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
                if error == 'Video Unavailable':
                    raise GeoRestrictedError(error)
                raise ExtractorError(error, expected=True)
            formats = entries[0]['formats']

        self._check_formats(formats, video_id)
        if not formats:
            # raise_no_formats() raises on its own. It must not be prefixed
            # with `raise`: when it only reports a warning and returns, the
            # result is None and `raise None` would fail with a TypeError.
            self.raise_no_formats(
                'Video is unavailable', expected=True, video_id=video_id)
        self._sort_formats(formats)

        # The metadata lookups below are non-fatal: a missing field must not
        # abort extraction of an otherwise playable video.
        description = self._html_search_regex(
            r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
            webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(
            webpage, default=None) or self._html_search_meta(
            'twitter:image:src', webpage, 'thumbnail')
        uploader = self._html_search_regex(
            (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
             r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
            webpage, 'uploader', fatal=False)

        # The publish line is matched as "... at HH:MM UTC on <date>."; only
        # the date part is captured and normalised.
        upload_date = unified_strdate(self._search_regex(
            r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
            webpage, 'upload date', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': upload_date,
            'formats': formats,
        }


class BitChuteChannelIE(InfoExtractor):
    """Playlist extractor that lists every video of a BitChute channel."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.bitchute.com/channel/victoriaxrave/',
        'playlist_mincount': 185,
        'info_dict': {
            'id': 'victoriaxrave',
        },
    }

    # Fixed value sent both as the `csrfmiddlewaretoken` form field and as the
    # `csrftoken` cookie of every listing request.
    _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'

    def _entries(self, channel_id):
        """Yield a URL result for each video of ``channel_id``, page by page.

        The channel's ``extend/`` endpoint is queried repeatedly with a growing
        offset. Iteration ends on the first response that reports
        ``success: false``, carries no HTML, or contains no video links.
        """
        channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
        listing_url = '%sextend/' % channel_url
        seen = 0  # number of videos listed so far; sent as the next offset

        for page_num in itertools.count(1):
            # Payload and headers are rebuilt for every request so that each
            # call receives its own fresh objects.
            payload = urlencode_postdata({
                'csrfmiddlewaretoken': self._TOKEN,
                'name': '',
                'offset': seen,
            })
            request_headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': channel_url,
                'X-Requested-With': 'XMLHttpRequest',
                'Cookie': 'csrftoken=%s' % self._TOKEN,
            }
            response = self._download_json(
                listing_url, channel_id,
                'Downloading channel page %d' % page_num,
                data=payload, headers=request_headers)

            page_html = response.get('html')
            if response.get('success') is False or not page_html:
                return

            page_video_ids = re.findall(
                r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
                page_html)
            if not page_video_ids:
                return

            # Advance the offset by what this page actually contained.
            seen += len(page_video_ids)
            for video_id in page_video_ids:
                video_url = 'https://www.bitchute.com/video/%s' % video_id
                yield self.url_result(
                    video_url, ie=BitChuteIE.ie_key(), video_id=video_id)

    def _real_extract(self, url):
        """Return a playlist whose ID is the channel ID matched from ``url``."""
        channel_id = self._match_id(url)
        entries = self._entries(channel_id)
        return self.playlist_result(entries, playlist_id=channel_id)
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`import itertools`
			`import re`

			`from .common import InfoExtractor`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`from ..utils import (`
[bitchute] Fix error for geoblocking Closes #26564. 2020-09-11 23:31:44 +02:00			`ExtractorError,`
			`GeoRestrictedError,`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`orderedSet,`
[bitchute] Extract upload date (closes #22990) (#23193) 2019-11-26 18:20:39 +01:00			`unified_strdate,`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`urlencode_postdata,`
			`)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00

			`class BitChuteIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video\|embed\|torrent/[^/]+)/(?P<id>[^/?#&]+)'`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 03:23:25 +02:00			`_EMBED_REGEX = [rf'<(?:script\|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`_TESTS = [{`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',`
			`'md5': '7e427d7ed7af5a75b5855705ec750e2b',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'info_dict': {`
			`'id': 'szoMrox2JEI',`
			`'ext': 'mp4',`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'title': 'This is the first video on #BitChute !',`
			`'description': 'md5:a0337e7b1fe39e32336974af8173a034',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'uploader': 'BitChute',`
			`'upload_date': '20170103',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`},`
			`}, {`
			`'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',`
			`'only_matching': True,`
			`}, {`
			`'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',`
			`'only_matching': True,`
			`}]`

			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`

			`webpage = self._download_webpage(`
[bitchute] Fix extraction by pass custom User-Agent 2018-08-27 17:04:56 +02:00			`'https://www.bitchute.com/video/%s' % video_id, video_id, headers={`
			`'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',`
			`})`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00
[bitchute] use _html_search_regex for title extraction 2018-11-18 16:15:10 +01:00			`title = self._html_search_regex(`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),`
			`webpage, 'title', default=None) or self._html_search_meta(`
			`'description', webpage, 'title',`
			`default=None) or self._og_search_description(webpage)`

[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`format_urls = []`
			`for mobj in re.finditer(`
			`r'addWebSeed\s\(\s(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):`
			`format_urls.append(mobj.group('url'))`
			`format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))`

[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`formats = [`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`{'url': format_url}`
			`for format_url in orderedSet(format_urls)]`
[bitchute] Extract HTML5 formats (closes #21306) 2019-06-07 17:58:19 +02:00
			`if not formats:`
[bitchute] Fix error for geoblocking Closes #26564. 2020-09-11 23:31:44 +02:00			`entries = self._parse_html5_media_entries(`
			`url, webpage, video_id)`
			`if not entries:`
			`error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')`
			`if error == 'Video Unavailable':`
			`raise GeoRestrictedError(error)`
[extractor/bitchute] Mark errors as expected Closes #4685 2022-08-20 01:22:25 +02:00			`raise ExtractorError(error, expected=True)`
[bitchute] Fix error for geoblocking Closes #26564. 2020-09-11 23:31:44 +02:00			`formats = entries[0]['formats']`
[bitchute] Extract HTML5 formats (closes #21306) 2019-06-07 17:58:19 +02:00
[bitchute] Check formats (#18833) 2019-01-12 21:57:31 +01:00			`self._check_formats(formats, video_id)`
[extractor/bitchute] Mark errors as expected Closes #4685 2022-08-20 01:22:25 +02:00			`if not formats:`
			`raise self.raise_no_formats('Video is unavailable', expected=True, video_id=video_id)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`self._sort_formats(formats)`

			`description = self._html_search_regex(`
			`r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',`
			`webpage, 'description', fatal=False)`
			`thumbnail = self._og_search_thumbnail(`
			`webpage, default=None) or self._html_search_meta(`
			`'twitter:image:src', webpage, 'thumbnail')`
			`uploader = self._html_search_regex(`
[bitchute] Fix uploader extraction (#21076) 2019-05-22 22:51:50 +02:00			`(r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',`
			`r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),`
			`webpage, 'uploader', fatal=False)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00
[bitchute] Extract upload date (closes #22990) (#23193) 2019-11-26 18:20:39 +01:00			`upload_date = unified_strdate(self._search_regex(`
			`r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',`
			`webpage, 'upload date', fatal=False))`

[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`return {`
			`'id': video_id,`
			`'title': title,`
			`'description': description,`
			`'thumbnail': thumbnail,`
			`'uploader': uploader,`
[bitchute] Extract upload date (closes #22990) (#23193) 2019-11-26 18:20:39 +01:00			`'upload_date': upload_date,`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'formats': formats,`
			`}`


			`class BitChuteChannelIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'`
			`_TEST = {`
			`'url': 'https://www.bitchute.com/channel/victoriaxrave/',`
			`'playlist_mincount': 185,`
			`'info_dict': {`
			`'id': 'victoriaxrave',`
			`},`
			`}`

			`_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'`

			`def _entries(self, channel_id):`
			`channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`offset = 0`
			`for page_num in itertools.count(1):`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`data = self._download_json(`
			`'%sextend/' % channel_url, channel_id,`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`'Downloading channel page %d' % page_num,`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`data=urlencode_postdata({`
			`'csrfmiddlewaretoken': self._TOKEN,`
			`'name': '',`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`'offset': offset,`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`}), headers={`
			`'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',`
			`'Referer': channel_url,`
			`'X-Requested-With': 'XMLHttpRequest',`
			`'Cookie': 'csrftoken=%s' % self._TOKEN,`
			`})`
			`if data.get('success') is False:`
			`break`
			`html = data.get('html')`
			`if not html:`
			`break`
			`video_ids = re.findall(`
			`r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',`
			`html)`
			`if not video_ids:`
			`break`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`offset += len(video_ids)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`for video_id in video_ids:`
			`yield self.url_result(`
			`'https://www.bitchute.com/video/%s' % video_id,`
			`ie=BitChuteIE.ie_key(), video_id=video_id)`

			`def _real_extract(self, url):`
			`channel_id = self._match_id(url)`
			`return self.playlist_result(`
			`self._entries(channel_id), playlist_id=channel_id)`