yt-dlp/yt_dlp/extractor/mit.py

import re
import json

from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
    clean_html,
    ExtractorError,
    get_element_by_id,
)


class TechTVMITIE(InfoExtractor):
    """Extractor for video pages on techtv.mit.edu.

    Both ``/videos/<id>`` and ``/embeds/<id>`` URLs are accepted; the numeric
    id is used to fetch the canonical ``/videos/<id>`` page, from which the
    embedded player configuration (``ipadUrl``, ``bitrates``, ``playlist``)
    is read.
    """

    IE_NAME = 'techtv.mit.edu'
    _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
        'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
        'info_dict': {
            'id': '25418',
            'ext': 'mp4',
            'title': 'MIT DNA and Protein Sets',
            'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for a single techtv.mit.edu video.

        The base URL and the ``bitrates`` list are required: extraction
        fails if either cannot be found. Per-format metadata (label, width,
        bitrate) and the thumbnail are optional and are left as ``None``
        when the page does not provide them.
        """
        video_id = self._match_id(url)
        raw_page = self._download_webpage(
            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
        # The element-by-id lookups below run on a copy of the page with all
        # HTML comments removed; the player configuration is read from the
        # unmodified page.
        clean_page = re.sub(r'<!--.*?-->', '', raw_page, flags=re.S)

        base_url = self._proto_relative_url(self._search_regex(
            r'ipadUrl: \'(.+?cloudfront\.net/)', raw_page, 'base url'), 'http:')
        formats_json = self._search_regex(
            r'bitrates: (\[.+?\])', raw_page, 'video formats')

        formats = []
        for f in json.loads(formats_json):
            # Each entry's 'url' is treated as '<ext>:<path>'; the path part
            # is appended to the base URL.
            ext, _, path = f['url'].partition(':')
            label = f.get('label')
            formats.append({
                'format_id': label,
                'url': base_url + path,
                'ext': ext,
                'format': label,
                # Optional metadata: a missing key must not abort extraction.
                'width': f.get('width'),
                'vbr': f.get('bitrate'),
            })

        title = get_element_by_id('edit-title', clean_page)
        description = clean_html(get_element_by_id('edit-description', clean_page))
        # The thumbnail is optional, so a failed match only produces a
        # warning instead of failing the whole extraction.
        thumbnail = self._search_regex(
            r'playlist:.*?url: \'(.+?)\'',
            raw_page, 'thumbnail', fatal=False, flags=re.DOTALL)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
        }


class OCWMITIE(InfoExtractor):
    """Extractor for ocw.mit.edu course pages that embed a YouTube video.

    The page metadata supplies the title and description; the actual media
    is delegated to the YouTube extractor through a ``url_transparent``
    result.
    """

    IE_NAME = 'ocw.mit.edu'
    _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
    _BASE_URL = 'http://ocw.mit.edu/'

    _TESTS = [
        {
            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
            'info_dict': {
                'id': 'EObHWIEKGjA',
                'ext': 'webm',
                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
                'upload_date': '20121109',
                'uploader_id': 'MIT',
                'uploader': 'MIT OpenCourseWare',
            }
        },
        {
            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
            'info_dict': {
                'id': '7K1sB05pE0A',
                'ext': 'mp4',
                'title': 'Session 1: Introduction to Derivatives',
                'upload_date': '20090818',
                'uploader_id': 'MIT',
                'uploader': 'MIT OpenCourseWare',
                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
            }
        }
    ]

    # JavaScript embed helpers searched for in the page, in priority order:
    #   ocw_embed_chapter_media(container_id, media_url, provider, page_url,
    #                           image_url, start, stop, captions_file)
    #   ocw_embed_media(container_id, media_url, provider, page_url,
    #                   image_url, captions_file)
    # In both signatures the media URL is the second argument.
    _EMBED_FUNCTIONS = ('ocw_embed_chapter_media', 'ocw_embed_media')

    def _find_media_url(self, webpage):
        """Return the media URL passed to the first embed helper call found.

        Raises ExtractorError when no embed call is present, or when the
        call found has fewer than two arguments.
        """
        for func_name in self._EMBED_FUNCTIONS:
            call = re.search(r'%s\((.+?)\)' % func_name, webpage)
            if not call:
                continue
            # Drop the quotes around the JS string arguments, then split the
            # argument list on commas (optionally followed by one space).
            args = re.split(r', ?', re.sub(r'[\'"]', '', call.group(1)))
            if len(args) < 2:
                raise ExtractorError(
                    'Malformed %s() call: media URL argument is missing' % func_name)
            return args[1]
        raise ExtractorError('Unable to find embedded YouTube video.')

    def _real_extract(self, url):
        """Resolve an OCW course page to its embedded YouTube video."""
        mobj = self._match_valid_url(url)
        topic = mobj.group('topic')

        webpage = self._download_webpage(url, topic)
        title = self._html_search_meta('WT.cg_s', webpage)
        description = self._html_search_meta('Description', webpage)

        yt = self._find_media_url(webpage)
        video_id = YoutubeIE.extract_id(yt)

        # url_transparent: the YouTube extractor handles `yt`, while the
        # title and description found on this page are returned alongside it.
        return {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
            'description': description,
            'url': yt,
            'ie_key': 'Youtube',
        }
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00			`import re`
			`import json`

			`from .common import InfoExtractor`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`from .youtube import YoutubeIE`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 12:24:42 +01:00			`from ..utils import (`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00			`clean_html,`
[mit] Add import 2014-02-26 00:41:13 +01:00			`ExtractorError,`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00			`get_element_by_id,`
			`)`


			`class TechTVMITIE(InfoExtractor):`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`IE_NAME = 'techtv.mit.edu'`
[mit] Modernize 2014-12-17 00:04:24 +01:00			`_VALID_URL = r'https?://techtv\.mit\.edu/(?:videos\|embeds)/(?P<id>\d+)'`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00
			`_TEST = {`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',`
[techtvmit] Update test 2015-09-06 06:32:42 +02:00			`'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`'info_dict': {`
			`'id': '25418',`
			`'ext': 'mp4',`
[techtvmit] Update test 2015-09-06 06:32:42 +02:00			`'title': 'MIT DNA and Protein Sets',`
			`'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00			`},`
			`}`

			`def _real_extract(self, url):`
[mit] Modernize 2014-12-17 00:04:24 +01:00			`video_id = self._match_id(url)`
Fix MIT extractor for Python 2.6 The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing. 2013-08-28 21:00:59 +02:00			`raw_page = self._download_webpage(`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00			`'http://techtv.mit.edu/videos/%s' % video_id, video_id)`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00
[techtvmit] Fix extraction 2015-09-06 06:28:40 +02:00			`base_url = self._proto_relative_url(self._search_regex(`
			`r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`formats_json = self._search_regex(`
			`r'bitrates: (\[.+?\])', raw_page, 'video formats')`
[mit] Add support for multiple formats 2013-12-24 12:38:08 +01:00			`formats_mit = json.loads(formats_json)`
			`formats = [`
			`{`
			`'format_id': f['label'],`
			`'url': base_url + f['url'].partition(':')[2],`
			`'ext': f['url'].partition(':')[0],`
			`'format': f['label'],`
			`'width': f['width'],`
			`'vbr': f['bitrate'],`
			`}`
			`for f in formats_mit`
			`]`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00
Fix MIT extractor for Python 2.6 The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing. 2013-08-28 21:00:59 +02:00			`title = get_element_by_id('edit-title', clean_page)`
			`description = clean_html(get_element_by_id('edit-description', clean_page))`
[mit] Modernize 2014-02-26 00:06:31 +01:00			`thumbnail = self._search_regex(`
			`r'playlist:.*?url: \'(.+?)\'',`
			`raw_page, 'thumbnail', flags=re.DOTALL)`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00
[mit] Modernize 2014-02-26 00:06:31 +01:00			`return {`
			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
			`'description': description,`
			`'thumbnail': thumbnail,`
			`}`
Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu 2013-08-28 12:51:22 +02:00

Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00			`class OCWMITIE(InfoExtractor):`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`IE_NAME = 'ocw.mit.edu'`
Add support for https for all extractors as preventive and future-proof measure 2016-03-21 16:36:32 +01:00			`_VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`_BASE_URL = 'http://ocw.mit.edu/'`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00
			`_TESTS = [`
			`{`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',`
			`'info_dict': {`
			`'id': 'EObHWIEKGjA',`
update tests related to the change in youtube http format sorting the change was done in 82156fdbf0913c75181484dcc813565713bf78e9 2016-03-05 21:52:24 +01:00			`'ext': 'webm',`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',`
			`'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',`
[mit] Amend test definitions 2015-01-07 11:45:19 +01:00			`'upload_date': '20121109',`
			`'uploader_id': 'MIT',`
			`'uploader': 'MIT OpenCourseWare',`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00			`}`
			`},`
			`{`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',`
			`'info_dict': {`
			`'id': '7K1sB05pE0A',`
			`'ext': 'mp4',`
			`'title': 'Session 1: Introduction to Derivatives',`
[mit] Amend test definitions 2015-01-07 11:45:19 +01:00			`'upload_date': '20090818',`
			`'uploader_id': 'MIT',`
			`'uploader': 'MIT OpenCourseWare',`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00			`}`
			`}`
			`]`

			`def _real_extract(self, url):`
[extractor] Common function `_match_valid_url` 2021-08-19 03:41:24 +02:00			`mobj = self._match_valid_url(url)`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`topic = mobj.group('topic')`

			`webpage = self._download_webpage(url, topic)`
			`title = self._html_search_meta('WT.cg_s', webpage)`
			`description = self._html_search_meta('Description', webpage)`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00
			`# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)`
			`embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)`
			`if embed_chapter_media:`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00			`metadata = re.split(r', ?', metadata)`
			`yt = metadata[1]`
			`else:`
			`# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)`
			`embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)`
			`if embed_media:`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`metadata = re.sub(r'[\'"]', '', embed_media.group(1))`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00			`metadata = re.split(r', ?', metadata)`
			`yt = metadata[1]`
			`else:`
			`raise ExtractorError('Unable to find embedded YouTube video.')`
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`video_id = YoutubeIE.extract_id(yt)`
Add support for ocw.mit.edu video lectures 2014-02-25 21:44:34 +01:00
[mit] Fix ocw tests 2014-02-26 00:29:45 +01:00			`return {`
			`'_type': 'url_transparent',`
			`'id': video_id,`
			`'title': title,`
			`'description': description,`
			`'url': yt,`
			`'ie_key': 'Youtube',`
			`}`