Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-11-20 01:42:50 +01:00)

Commit 308f430075: Merge branch 'yt-dlp:master' into cleanup/2024-06

50 .github/workflows/build.yml (vendored)
@ -237,27 +237,43 @@ jobs:
macos:
needs: process
if: inputs.macos
permissions:
contents: read
actions: write # For cleaning up cache
runs-on: macos-12

steps:
- uses: actions/checkout@v4
# NB: Building universal2 does not work with python from actions/setup-python

- name: Restore cached requirements
id: restore-cache
uses: actions/cache/restore@v4
env:
SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1
with:
path: |
~/yt-dlp-build-venv
key: cache-reqs-${{ github.job }}

- name: Install Requirements
run: |
brew install coreutils
python3 devscripts/install_deps.py --user -o --include build
python3 -m venv ~/yt-dlp-build-venv
source ~/yt-dlp-build-venv/bin/activate
python3 devscripts/install_deps.py -o --include build
python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt
# We need to ignore wheels otherwise we break universal2 builds
python3 -m pip install -U --user --no-binary :all: -r requirements.txt
python3 -m pip install -U --no-binary :all: -r requirements.txt
# We need to fuse our own universal2 wheels for curl_cffi
python3 -m pip install -U --user delocate
python3 -m pip install -U delocate
mkdir curl_cffi_whls curl_cffi_universal2
python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt
for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do
python3 -m pip download \
--only-binary=:all: \
--platform "${platform}" \
--pre -d curl_cffi_whls \
-d curl_cffi_whls \
-r requirements.txt
done
( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite
@ -274,9 +290,10 @@ jobs:
)
python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2
python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2
cd curl_cffi_universal2
for wheel in ./*cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done
python3 -m pip install -U --user ./*cffi*.whl
for wheel in curl_cffi_universal2/*cffi*.whl; do
mv -n -- "${wheel}" "${wheel/x86_64/universal2}"
done
python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl

- name: Prepare
run: |
@ -284,6 +301,7 @@ jobs:
python3 devscripts/make_lazy_extractors.py
- name: Build
run: |
source ~/yt-dlp-build-venv/bin/activate
python3 -m bundle.pyinstaller --target-architecture universal2 --onedir
(cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .)
python3 -m bundle.pyinstaller --target-architecture universal2
@ -307,6 +325,24 @@ jobs:
dist/yt-dlp_macos.zip
compression-level: 0

- name: Cleanup cache
if: steps.restore-cache.outputs.cache-hit == 'true'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
cache_key: cache-reqs-${{ github.job }}
repository: ${{ github.repository }}
branch: ${{ github.ref }}
run: |
gh extension install actions/gh-actions-cache
gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm

- name: Cache requirements
uses: actions/cache/save@v4
with:
path: |
~/yt-dlp-build-venv
key: cache-reqs-${{ github.job }}

macos_legacy:
needs: process
if: inputs.macos_legacy
3 .github/workflows/release-master.yml (vendored)
@ -24,6 +24,7 @@ jobs:
source: master
permissions:
contents: write
packages: write
packages: write # For package cache
actions: write # For cleaning up cache
id-token: write # mandatory for trusted publishing
secrets: inherit

3 .github/workflows/release-nightly.yml (vendored)
@ -37,6 +37,7 @@ jobs:
source: nightly
permissions:
contents: write
packages: write
packages: write # For package cache
actions: write # For cleaning up cache
id-token: write # mandatory for trusted publishing
secrets: inherit

1 .github/workflows/release.yml (vendored)
@ -229,6 +229,7 @@ jobs:
permissions:
contents: read
packages: write # For package cache
actions: write # For cleaning up cache
secrets:
GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }}
@ -1779,8 +1779,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.)
* `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off

#### generic
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Note that if the stream has an HLS AES-128 key, then the query parameters will be passed to the key URI as well, unless the `key_query` extractor-arg is passed, or unless an external key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
* `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live`
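For illustration, the `generic` extractor-args documented above can also be supplied through yt-dlp's Python API via the `extractor_args` option (the CLI equivalent would be e.g. `--extractor-args "generic:key_query=keytoken=xyz789"`). A minimal sketch, not part of this commit, using placeholder URL and token values:

```python
import yt_dlp

# Placeholder URL and tokens; each extractor-arg value must be a list of strings
opts = {
    'extractor_args': {
        'generic': {
            'fragment_query': ['token=abc123'],  # appended to every fragment URL
            'key_query': ['keytoken=xyz789'],    # appended to the AES-128 key URI instead
        },
    },
}

with yt_dlp.YoutubeDL(opts) as ydl:
    ydl.download(['https://example.com/stream.m3u8'])
```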
@ -582,8 +582,9 @@ class YoutubeDL:
|
||||
'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
|
||||
'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
|
||||
'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
|
||||
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
|
||||
'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
|
||||
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
|
||||
'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
|
||||
'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
|
||||
}
|
||||
_deprecated_multivalue_fields = {
|
||||
'album_artist': 'album_artists',
|
||||
|
@ -108,7 +108,7 @@ def supports(cls, info_dict):
|
||||
return all((
|
||||
not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES,
|
||||
'+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES,
|
||||
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'),
|
||||
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'),
|
||||
all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')),
|
||||
))
|
||||
|
||||
|
@ -160,10 +160,12 @@ def is_ad_fragment_end(s):
|
||||
extra_state = ctx.setdefault('extra_state', {})
|
||||
|
||||
format_index = info_dict.get('format_index')
|
||||
extra_query = None
|
||||
extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
|
||||
if extra_param_to_segment_url:
|
||||
extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
|
||||
extra_segment_query = None
|
||||
if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'):
|
||||
extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url)
|
||||
extra_key_query = None
|
||||
if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'):
|
||||
extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url)
|
||||
i = 0
|
||||
media_sequence = 0
|
||||
decrypt_info = {'METHOD': 'NONE'}
|
||||
@ -190,8 +192,8 @@ def is_ad_fragment_end(s):
|
||||
if frag_index <= ctx['fragment_index']:
|
||||
continue
|
||||
frag_url = urljoin(man_url, line)
|
||||
if extra_query:
|
||||
frag_url = update_url_query(frag_url, extra_query)
|
||||
if extra_segment_query:
|
||||
frag_url = update_url_query(frag_url, extra_segment_query)
|
||||
|
||||
fragments.append({
|
||||
'frag_index': frag_index,
|
||||
@ -212,8 +214,8 @@ def is_ad_fragment_end(s):
|
||||
frag_index += 1
|
||||
map_info = parse_m3u8_attributes(line[11:])
|
||||
frag_url = urljoin(man_url, map_info.get('URI'))
|
||||
if extra_query:
|
||||
frag_url = update_url_query(frag_url, extra_query)
|
||||
if extra_segment_query:
|
||||
frag_url = update_url_query(frag_url, extra_segment_query)
|
||||
|
||||
if map_info.get('BYTERANGE'):
|
||||
splitted_byte_range = map_info.get('BYTERANGE').split('@')
|
||||
@ -244,8 +246,10 @@ def is_ad_fragment_end(s):
|
||||
decrypt_info['KEY'] = external_aes_key
|
||||
else:
|
||||
decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
|
||||
if extra_query:
|
||||
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
|
||||
if extra_key_query or extra_segment_query:
|
||||
# Fall back to extra_segment_query for the key URL, for backwards compat
|
||||
decrypt_info['URI'] = update_url_query(
|
||||
decrypt_info['URI'], extra_key_query or extra_segment_query)
|
||||
if decrypt_url != decrypt_info['URI']:
|
||||
decrypt_info['KEY'] = None
|
||||
|
||||
|
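For context on the `update_url_query` helper used throughout this hunk: it merges a query mapping into a URL's existing query string, which is what lets `extra_key_query` (or the `extra_segment_query` fallback) be applied to the decryption key URI. A rough, illustrative sketch with placeholder values:

```python
from yt_dlp.utils import update_url_query

# Existing parameters are preserved and the extra query is merged in, roughly:
print(update_url_query('https://cdn.example.com/key.bin?a=1', {'token': ['abc123']}))
# -> https://cdn.example.com/key.bin?a=1&token=abc123
```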
@ -1755,7 +1755,10 @@
|
||||
RTVETelevisionIE,
|
||||
)
|
||||
from .rtvs import RTVSIE
|
||||
from .rtvslo import RTVSLOIE
|
||||
from .rtvslo import (
|
||||
RTVSLOIE,
|
||||
RTVSLOShowIE,
|
||||
)
|
||||
from .rudovideo import RudoVideoIE
|
||||
from .rule34video import Rule34VideoIE
|
||||
from .rumble import (
|
||||
@ -1925,6 +1928,10 @@
|
||||
)
|
||||
from .springboardplatform import SpringboardPlatformIE
|
||||
from .sprout import SproutIE
|
||||
from .sproutvideo import (
|
||||
SproutVideoIE,
|
||||
VidsIoIE,
|
||||
)
|
||||
from .srgssr import (
|
||||
SRGSSRIE,
|
||||
SRGSSRPlayIE,
|
||||
|
@ -387,7 +387,7 @@ def _build_brightcove_url_from_js(cls, object_js):
|
||||
@classmethod
|
||||
def _make_brightcove_url(cls, params):
|
||||
return update_url_query(
|
||||
'http://c.brightcove.com/services/viewer/htmlFederated', params)
|
||||
'https://c.brightcove.com/services/viewer/htmlFederated', params)
|
||||
|
||||
@classmethod
|
||||
def _extract_brightcove_url(cls, webpage):
|
||||
@ -471,7 +471,7 @@ def _real_extract(self, url):
|
||||
if referer:
|
||||
headers['Referer'] = referer
|
||||
player_page = self._download_webpage(
|
||||
'http://link.brightcove.com/services/player/bcpid' + player_id[0],
|
||||
'https://link.brightcove.com/services/player/bcpid' + player_id[0],
|
||||
video_id, headers=headers, fatal=False)
|
||||
if player_page:
|
||||
player_key = self._search_regex(
|
||||
@ -481,7 +481,7 @@ def _real_extract(self, url):
|
||||
enc_pub_id = player_key.split(',')[1].replace('~', '=')
|
||||
publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
|
||||
if publisher_id:
|
||||
brightcove_new_url = f'http://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}'
|
||||
brightcove_new_url = f'https://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}'
|
||||
if referer:
|
||||
brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
|
||||
return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
|
||||
@ -797,7 +797,7 @@ def _extract_brightcove_urls(ie, webpage):
|
||||
# Look for iframe embeds [1]
|
||||
for _, url in re.findall(
|
||||
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
|
||||
entries.append(url if url.startswith('http') else 'http:' + url)
|
||||
entries.append(url if url.startswith(('http:', 'https:')) else 'https:' + url)
|
||||
|
||||
# Look for <video> tags [2] and embed_in_page embeds [3]
|
||||
# [2] looks like:
|
||||
@ -826,7 +826,7 @@ def _extract_brightcove_urls(ie, webpage):
|
||||
player_id = player_id or attrs.get('data-player') or 'default'
|
||||
embed = embed or attrs.get('data-embed') or 'default'
|
||||
|
||||
bc_url = f'http://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}'
|
||||
bc_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}'
|
||||
|
||||
# Some brightcove videos may be embedded with video tag only and
|
||||
# without script tag or any mentioning of brightcove at all. Such
|
||||
@ -863,7 +863,7 @@ def _real_extract(self, url):
|
||||
store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x)
|
||||
|
||||
def extract_policy_key():
|
||||
base_url = f'http://players.brightcove.net/{account_id}/{player_id}_{embed}/'
|
||||
base_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/'
|
||||
config = self._download_json(
|
||||
base_url + 'config.json', video_id, fatal=False) or {}
|
||||
policy_key = try_get(
|
||||
|
@ -234,7 +234,14 @@ class InfoExtractor:
'maybe' if the format may have DRM and has to be tested before download.
* extra_param_to_segment_url A query string to append to each
fragment's URL, or to update each existing query string
with. Only applied by the native HLS/DASH downloaders.
with. If it is an HLS stream with an AES-128 decryption key,
the query parameters will be passed to the key URI as well,
unless there is an `extra_param_to_key_url` given,
or unless an external key URI is provided via `hls_aes`.
Only applied by the native HLS/DASH downloaders.
* extra_param_to_key_url A query string to append to the URL
of the format's HLS AES-128 decryption key.
Only applied by the native HLS downloader.
* hls_aes A dictionary of HLS AES-128 decryption information
used by the native HLS downloader to override the
values in the media playlist when an '#EXT-X-KEY' tag
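To make the new fields concrete, here is a hypothetical format dict an extractor might emit; only the field names come from the documentation above, all values are placeholders:

```python
fmt = {
    'format_id': 'hls-1080p',
    'url': 'https://cdn.example.com/media.m3u8',
    'protocol': 'm3u8_native',
    # Appended to every fragment URL by the native HLS/DASH downloaders
    'extra_param_to_segment_url': 'token=abc123',
    # Appended to the AES-128 key URL instead of the segment query (native HLS only)
    'extra_param_to_key_url': 'keytoken=xyz789',
    # Overrides the '#EXT-X-KEY' values from the media playlist;
    # a key URI could be supplied via 'uri' instead of the raw key
    'hls_aes': {
        'key': 'ABCDEF1234567980',
        'iv': '0xFEDCBA0987654321',
    },
}
```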
@ -5,6 +5,7 @@
|
||||
from .dailymotion import DailymotionIE
|
||||
from ..networking import HEADRequest
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
determine_ext,
|
||||
filter_dict,
|
||||
format_field,
|
||||
@ -33,6 +34,7 @@ class FranceTVIE(InfoExtractor):
|
||||
_GEO_BYPASS = False
|
||||
|
||||
_TESTS = [{
|
||||
# tokenized url is in dinfo['video']['token']
|
||||
'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
|
||||
'info_dict': {
|
||||
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
|
||||
@ -44,6 +46,19 @@ class FranceTVIE(InfoExtractor):
|
||||
'upload_date': '20170813',
|
||||
},
|
||||
'params': {'skip_download': 'm3u8'},
|
||||
}, {
|
||||
# tokenized url is in dinfo['video']['token']['akamai']
|
||||
'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba',
|
||||
'info_dict': {
|
||||
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
|
||||
'ext': 'mp4',
|
||||
'title': '13h15, le dimanche... - Les mystères de Jésus',
|
||||
'timestamp': 1514118300,
|
||||
'duration': 2880,
|
||||
'thumbnail': r're:^https?://.*\.jpg$',
|
||||
'upload_date': '20171224',
|
||||
},
|
||||
'params': {'skip_download': 'm3u8'},
|
||||
}, {
|
||||
'url': 'francetv:162311093',
|
||||
'only_matching': True,
|
||||
@ -68,6 +83,7 @@ class FranceTVIE(InfoExtractor):
|
||||
def _extract_video(self, video_id, hostname=None):
|
||||
is_live = None
|
||||
videos = []
|
||||
drm_formats = False
|
||||
title = None
|
||||
subtitle = None
|
||||
episode_number = None
|
||||
@ -85,13 +101,12 @@ def _extract_video(self, video_id, hostname=None):
|
||||
'device_type': device_type,
|
||||
'browser': browser,
|
||||
'domain': hostname,
|
||||
}), fatal=False)
|
||||
}), fatal=False, expected_status=422) # 422 json gives detailed error code/message
|
||||
|
||||
if not dinfo:
|
||||
continue
|
||||
|
||||
video = traverse_obj(dinfo, ('video', {dict}))
|
||||
if video:
|
||||
if video := traverse_obj(dinfo, ('video', {dict})):
|
||||
videos.append(video)
|
||||
if duration is None:
|
||||
duration = video.get('duration')
|
||||
@ -99,9 +114,19 @@ def _extract_video(self, video_id, hostname=None):
|
||||
is_live = video.get('is_live')
|
||||
if spritesheets is None:
|
||||
spritesheets = video.get('spritesheets')
|
||||
elif code := traverse_obj(dinfo, ('code', {int})):
|
||||
if code == 2009:
|
||||
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
|
||||
elif code in (2015, 2017):
|
||||
# 2015: L'accès à cette vidéo est impossible. (DRM-only)
|
||||
# 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM)
|
||||
drm_formats = True
|
||||
continue
|
||||
self.report_warning(
|
||||
f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"')
|
||||
continue
|
||||
|
||||
meta = traverse_obj(dinfo, ('meta', {dict}))
|
||||
if meta:
|
||||
if meta := traverse_obj(dinfo, ('meta', {dict})):
|
||||
if title is None:
|
||||
title = meta.get('title')
|
||||
# meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
|
||||
@ -114,12 +139,15 @@ def _extract_video(self, video_id, hostname=None):
|
||||
if timestamp is None:
|
||||
timestamp = parse_iso8601(meta.get('broadcasted_at'))
|
||||
|
||||
if not videos and drm_formats:
|
||||
self.report_drm(video_id)
|
||||
|
||||
formats, subtitles, video_url = [], {}, None
|
||||
for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
|
||||
video_url = video['url']
|
||||
format_id = video.get('format')
|
||||
|
||||
if token_url := url_or_none(video.get('token')):
|
||||
if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)):
|
||||
tokenized_url = traverse_obj(self._download_json(
|
||||
token_url, video_id, f'Downloading signed {format_id} manifest URL',
|
||||
fatal=False, query={
|
||||
@ -225,13 +253,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
|
||||
_TESTS = [{
|
||||
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
|
||||
'info_dict': {
|
||||
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
|
||||
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
|
||||
'ext': 'mp4',
|
||||
'title': '13h15, le dimanche... - Les mystères de Jésus',
|
||||
'timestamp': 1502623500,
|
||||
'duration': 2580,
|
||||
'timestamp': 1514118300,
|
||||
'duration': 2880,
|
||||
'thumbnail': r're:^https?://.*\.jpg$',
|
||||
'upload_date': '20170813',
|
||||
'upload_date': '20171224',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
|
@ -2167,7 +2167,15 @@ def _extra_manifest_info(self, info, manifest_url):
|
||||
urllib.parse.urlparse(fragment_query).query or fragment_query
|
||||
or urllib.parse.urlparse(manifest_url).query or None)
|
||||
|
||||
hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
|
||||
key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
|
||||
if key_query is not None:
|
||||
info['extra_param_to_key_url'] = (
|
||||
urllib.parse.urlparse(key_query).query or key_query
|
||||
or urllib.parse.urlparse(manifest_url).query or None)
|
||||
|
||||
def hex_or_none(value):
|
||||
return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None
|
||||
|
||||
info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
|
||||
'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
|
||||
}) or None
|
||||
|
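As an illustration of the `hls_key` parsing above, using the README's own example value; the exact result shape depends on `traverse_obj`'s dict-template handling, so treat this as a sketch:

```python
# --extractor-args "generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321"
# _configuration_arg('hls_key', casesense=True) yields the comma-split values:
args = ['ABCDEF1234567980', '0xFEDCBA0987654321']
# url_or_none() rejects the first value as a URI, hex_or_none() accepts both,
# so info['hls_aes'] ends up roughly as:
hls_aes = {'key': 'ABCDEF1234567980', 'iv': '0xFEDCBA0987654321'}
```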
@ -3,43 +3,52 @@
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
make_archive_id,
|
||||
parse_iso8601,
|
||||
try_get,
|
||||
str_or_none,
|
||||
traverse_obj,
|
||||
url_or_none,
|
||||
urljoin,
|
||||
)
|
||||
|
||||
|
||||
class KhanAcademyBaseIE(InfoExtractor):
|
||||
_VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
|
||||
|
||||
_PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70'
|
||||
|
||||
def _parse_video(self, video):
|
||||
return {
|
||||
'_type': 'url_transparent',
|
||||
'url': video['youtubeId'],
|
||||
'id': video.get('slug'),
|
||||
'title': video.get('title'),
|
||||
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
|
||||
'duration': int_or_none(video.get('duration')),
|
||||
'description': video.get('description'),
|
||||
'id': video['youtubeId'],
|
||||
'ie_key': 'Youtube',
|
||||
**traverse_obj(video, {
|
||||
'display_id': ('id', {str_or_none}),
|
||||
'title': ('translatedTitle', {str}),
|
||||
'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}),
|
||||
'duration': ('duration', {int_or_none}),
|
||||
'description': ('description', {str}),
|
||||
}, get_all=False),
|
||||
}
|
||||
|
||||
def _real_extract(self, url):
|
||||
display_id = self._match_id(url)
|
||||
content = self._download_json(
|
||||
'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
|
||||
display_id, query={
|
||||
'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id,
|
||||
query={
|
||||
'fastly_cacheable': 'persist_until_publish',
|
||||
'hash': '4134764944',
|
||||
'lang': 'en',
|
||||
'pcv': self._PUBLISHED_CONTENT_VERSION,
|
||||
'hash': '1242644265',
|
||||
'variables': json.dumps({
|
||||
'path': display_id,
|
||||
'queryParams': 'lang=en',
|
||||
'isModal': False,
|
||||
'followRedirects': True,
|
||||
'countryCode': 'US',
|
||||
'kaLocale': 'en',
|
||||
'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION,
|
||||
}),
|
||||
})['data']['contentJson']
|
||||
return self._parse_component_props(self._parse_json(content, display_id)['componentProps'])
|
||||
'lang': 'en',
|
||||
})['data']['contentRoute']['listedPathData']
|
||||
return self._parse_component_props(content, display_id)
|
||||
|
||||
|
||||
class KhanAcademyIE(KhanAcademyBaseIE):
|
||||
@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE):
|
||||
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
|
||||
_TEST = {
|
||||
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
|
||||
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
|
||||
'md5': '1d5c2e70fa6aa29c38eca419f12515ce',
|
||||
'info_dict': {
|
||||
'id': 'FlIG3TvQCBQ',
|
||||
'ext': 'mp4',
|
||||
'title': 'The one-time pad',
|
||||
'description': 'The perfect cipher',
|
||||
'display_id': '716378217',
|
||||
'duration': 176,
|
||||
'uploader': 'Brit Cruise',
|
||||
'uploader_id': 'khanacademy',
|
||||
'uploader': 'Khan Academy',
|
||||
'uploader_id': '@khanacademy',
|
||||
'uploader_url': 'https://www.youtube.com/@khanacademy',
|
||||
'upload_date': '20120411',
|
||||
'timestamp': 1334170113,
|
||||
'license': 'cc-by-nc-sa',
|
||||
'live_status': 'not_live',
|
||||
'channel': 'Khan Academy',
|
||||
'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g',
|
||||
'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g',
|
||||
'channel_is_verified': True,
|
||||
'playable_in_embed': True,
|
||||
'categories': ['Education'],
|
||||
'creators': ['Brit Cruise'],
|
||||
'tags': [],
|
||||
'age_limit': 0,
|
||||
'availability': 'public',
|
||||
'comment_count': int,
|
||||
'channel_follower_count': int,
|
||||
'thumbnail': str,
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'heatmap': list,
|
||||
},
|
||||
'add_ie': ['Youtube'],
|
||||
}
|
||||
|
||||
def _parse_component_props(self, component_props):
|
||||
video = component_props['tutorialPageData']['contentModel']
|
||||
info = self._parse_video(video)
|
||||
author_names = video.get('authorNames')
|
||||
info.update({
|
||||
'uploader': ', '.join(author_names) if author_names else None,
|
||||
'timestamp': parse_iso8601(video.get('dateAdded')),
|
||||
'license': video.get('kaUserLicense'),
|
||||
})
|
||||
return info
|
||||
def _parse_component_props(self, component_props, display_id):
|
||||
video = component_props['content']
|
||||
return {
|
||||
**self._parse_video(video),
|
||||
**traverse_obj(video, {
|
||||
'creators': ('authorNames', ..., {str}),
|
||||
'timestamp': ('dateAdded', {parse_iso8601}),
|
||||
'license': ('kaUserLicense', {str}),
|
||||
}),
|
||||
}
|
||||
|
||||
|
||||
class KhanAcademyUnitIE(KhanAcademyBaseIE):
|
||||
IE_NAME = 'khanacademy:unit'
|
||||
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
|
||||
_TEST = {
|
||||
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)'
|
||||
_TESTS = [{
|
||||
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
|
||||
'info_dict': {
|
||||
'id': 'cryptography',
|
||||
'id': 'x48c910b6',
|
||||
'title': 'Cryptography',
|
||||
'description': 'How have humans protected their secret messages through history? What has changed today?',
|
||||
'display_id': 'computing/computer-science/cryptography',
|
||||
'_old_archive_ids': ['khanacademyunit cryptography'],
|
||||
},
|
||||
'playlist_mincount': 31,
|
||||
}
|
||||
}, {
|
||||
'url': 'https://www.khanacademy.org/computing/computer-science',
|
||||
'info_dict': {
|
||||
'id': 'x301707a0',
|
||||
'title': 'Computer science theory',
|
||||
'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba',
|
||||
'display_id': 'computing/computer-science',
|
||||
'_old_archive_ids': ['khanacademyunit computer-science'],
|
||||
},
|
||||
'playlist_mincount': 50,
|
||||
}]
|
||||
|
||||
def _parse_component_props(self, component_props):
|
||||
curation = component_props['curation']
|
||||
def _parse_component_props(self, component_props, display_id):
|
||||
course = component_props['course']
|
||||
selected_unit = traverse_obj(course, (
|
||||
'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
|
||||
|
||||
entries = []
|
||||
tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
|
||||
for tutorial_number, tutorial in enumerate(tutorials, 1):
|
||||
chapter_info = {
|
||||
'chapter': tutorial.get('title'),
|
||||
'chapter_number': tutorial_number,
|
||||
'chapter_id': tutorial.get('id'),
|
||||
}
|
||||
for content_item in (tutorial.get('contentItems') or []):
|
||||
if content_item.get('kind') == 'Video':
|
||||
info = self._parse_video(content_item)
|
||||
info.update(chapter_info)
|
||||
entries.append(info)
|
||||
def build_entry(entry):
|
||||
return self.url_result(urljoin(
|
||||
'https://www.khanacademy.org', entry['canonicalUrl']),
|
||||
KhanAcademyIE, title=entry.get('translatedTitle'))
|
||||
|
||||
entries = traverse_obj(selected_unit, (
|
||||
(('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren',
|
||||
lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry}))
|
||||
|
||||
return self.playlist_result(
|
||||
entries, curation.get('unit'), curation.get('title'),
|
||||
curation.get('description'))
|
||||
entries,
|
||||
display_id=display_id,
|
||||
**traverse_obj(selected_unit, {
|
||||
'id': ('id', {str}),
|
||||
'title': ('translatedTitle', {str}),
|
||||
'description': ('translatedDescription', {str}),
|
||||
'_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}),
|
||||
}))
|
||||
|
@ -4,6 +4,7 @@
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
clean_html,
|
||||
filter_dict,
|
||||
get_element_by_class,
|
||||
int_or_none,
|
||||
join_nonempty,
|
||||
@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor):
|
||||
IE_DESC = 'NHK らじる (Radiru/Rajiru)'
|
||||
_VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
|
||||
_TESTS = [{
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
|
||||
'skip': 'Episode expired on 2024-02-24',
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
|
||||
'skip': 'Episode expired on 2024-06-09',
|
||||
'info_dict': {
|
||||
'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
|
||||
'id': '0449_01_3926210',
|
||||
'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集',
|
||||
'id': '0449_01_4003239',
|
||||
'ext': 'm4a',
|
||||
'uploader': 'NHK FM 東京',
|
||||
'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
|
||||
'series': 'ジャズ・トゥナイト',
|
||||
'uploader': 'NHK-FM',
|
||||
'channel': 'NHK-FM',
|
||||
'channel': 'NHK FM 東京',
|
||||
'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
|
||||
'release_date': '20240217',
|
||||
'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
|
||||
'timestamp': 1708185600,
|
||||
'release_timestamp': 1708178400,
|
||||
'upload_date': '20240217',
|
||||
'upload_date': '20240601',
|
||||
'series_id': '0449_01',
|
||||
'release_date': '20240601',
|
||||
'timestamp': 1717257600,
|
||||
'release_timestamp': 1717250400,
|
||||
},
|
||||
}, {
|
||||
# playlist, airs every weekday so it should _hopefully_ be okay forever
|
||||
@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor):
|
||||
'id': '0458_01',
|
||||
'title': 'ベストオブクラシック',
|
||||
'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
|
||||
'channel': 'NHK-FM',
|
||||
'uploader': 'NHK-FM',
|
||||
'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
|
||||
'series_id': '0458_01',
|
||||
'uploader': 'NHK FM',
|
||||
'channel': 'NHK FM',
|
||||
'series': 'ベストオブクラシック',
|
||||
},
|
||||
'playlist_mincount': 3,
|
||||
}, {
|
||||
# one with letters in the id
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
|
||||
'note': 'Expires on 2024-03-31',
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
|
||||
'note': 'Expires on 2025-03-31',
|
||||
'info_dict': {
|
||||
'id': 'F300_06_3738470',
|
||||
'id': 'F683_01_3910688',
|
||||
'ext': 'm4a',
|
||||
'title': '有島武郎「一房のぶどう」',
|
||||
'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
|
||||
'channel': 'NHKラジオ第1、NHK-FM',
|
||||
'uploader': 'NHKラジオ第1、NHK-FM',
|
||||
'timestamp': 1635757200,
|
||||
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
|
||||
'release_date': '20161207',
|
||||
'series': 'らじる文庫 by ラジオ深夜便 ',
|
||||
'release_timestamp': 1481126700,
|
||||
'upload_date': '20211101',
|
||||
'title': '夏目漱石「文鳥」第1回',
|
||||
'series': '【らじる文庫】夏目漱石「文鳥」(全4回)',
|
||||
'series_id': 'F683_01',
|
||||
'description': '朗読:浅井理アナウンサー',
|
||||
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg',
|
||||
'upload_date': '20240106',
|
||||
'release_date': '20240106',
|
||||
'uploader': 'NHK R1',
|
||||
'release_timestamp': 1704511800,
|
||||
'channel': 'NHK R1',
|
||||
'timestamp': 1704512700,
|
||||
},
|
||||
'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
|
||||
'expected_warnings': ['Unable to download JSON metadata',
|
||||
'Failed to get extended metadata. API returned Error 1: Invalid parameters'],
|
||||
}, {
|
||||
# news
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
|
||||
'skip': 'Expires on 2023-04-17',
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173',
|
||||
'info_dict': {
|
||||
'id': 'F261_01_3855109',
|
||||
'id': 'F261_01_4012173',
|
||||
'ext': 'm4a',
|
||||
'channel': 'NHKラジオ第1',
|
||||
'uploader': 'NHKラジオ第1',
|
||||
'timestamp': 1681635900,
|
||||
'release_date': '20230416',
|
||||
'series': 'NHKラジオニュース',
|
||||
'title': '午後6時のNHKニュース',
|
||||
'title': '午前0時のNHKニュース',
|
||||
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
|
||||
'upload_date': '20230416',
|
||||
'release_timestamp': 1681635600,
|
||||
'release_timestamp': 1718290800,
|
||||
'release_date': '20240613',
|
||||
'timestamp': 1718291400,
|
||||
'upload_date': '20240613',
|
||||
},
|
||||
}, {
|
||||
# fallback when extended metadata fails
|
||||
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298',
|
||||
'skip': 'Expires on 2024-06-07',
|
||||
'info_dict': {
|
||||
'id': '2834_01_4009298',
|
||||
'title': 'まち☆キラ!開成町特集',
|
||||
'ext': 'm4a',
|
||||
'release_date': '20240531',
|
||||
'upload_date': '20240531',
|
||||
'series': 'はま☆キラ!',
|
||||
'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg',
|
||||
'channel': 'NHK R1,FM',
|
||||
'description': '',
|
||||
'timestamp': 1717123800,
|
||||
'uploader': 'NHK R1,FM',
|
||||
'release_timestamp': 1717120800,
|
||||
'series_id': '2834_01',
|
||||
},
|
||||
'expected_warnings': ['Failed to get extended metadata. API returned empty list.'],
|
||||
}]
|
||||
|
||||
_API_URL_TMPL = None
|
||||
|
||||
def _extract_extended_description(self, episode_id, episode):
|
||||
service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
|
||||
aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
|
||||
def _extract_extended_metadata(self, episode_id, aa_vinfo):
|
||||
service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
|
||||
detail_url = try_call(
|
||||
lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
|
||||
lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3]))
|
||||
if not detail_url:
|
||||
return
|
||||
return {}
|
||||
|
||||
full_meta = traverse_obj(
|
||||
self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
|
||||
('list', service, 0, {dict})) or {}
|
||||
return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
|
||||
response = self._download_json(
|
||||
detail_url, episode_id, 'Downloading extended metadata',
|
||||
'Failed to download extended metadata', fatal=False, expected_status=400)
|
||||
if not response:
|
||||
return {}
|
||||
|
||||
def _extract_episode_info(self, headline, programme_id, series_meta):
|
||||
if error := traverse_obj(response, ('error', {dict})):
|
||||
self.report_warning(
|
||||
'Failed to get extended metadata. API returned '
|
||||
f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}')
|
||||
return {}
|
||||
|
||||
full_meta = traverse_obj(response, ('list', service, 0, {dict}))
|
||||
if not full_meta:
|
||||
self.report_warning('Failed to get extended metadata. API returned empty list.')
|
||||
return {}
|
||||
|
||||
station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None
|
||||
thumbnails = [{
|
||||
'id': str(id_),
|
||||
'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1,
|
||||
**traverse_obj(thumb, {
|
||||
'url': 'url',
|
||||
'width': ('width', {int_or_none}),
|
||||
'height': ('height', {int_or_none}),
|
||||
}),
|
||||
} for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))]
|
||||
|
||||
return filter_dict({
|
||||
'channel': station,
|
||||
'uploader': station,
|
||||
'description': join_nonempty(
|
||||
'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta),
|
||||
'thumbnails': thumbnails,
|
||||
**traverse_obj(full_meta, {
|
||||
'title': ('title', {str}),
|
||||
'timestamp': ('end_time', {unified_timestamp}),
|
||||
'release_timestamp': ('start_time', {unified_timestamp}),
|
||||
}),
|
||||
})
|
||||
|
||||
def _extract_episode_info(self, episode, programme_id, series_meta):
|
||||
episode_id = f'{programme_id}_{episode["id"]}'
|
||||
aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')}))
|
||||
extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo)
|
||||
fallback_start_time, _, fallback_end_time = traverse_obj(
|
||||
aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')}))
|
||||
|
||||
return {
|
||||
**series_meta,
|
||||
'id': episode_id,
|
||||
'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False),
|
||||
'container': 'm4a_dash', # force fixup, AAC-only HLS
|
||||
'was_live': True,
|
||||
'title': episode.get('program_title'),
|
||||
'description': episode.get('program_sub_title'), # fallback
|
||||
'timestamp': unified_timestamp(fallback_end_time),
|
||||
'release_timestamp': unified_timestamp(fallback_start_time),
|
||||
**extended_metadata,
|
||||
}
|
||||
|
||||
def _extract_news_info(self, headline, programme_id, series_meta):
|
||||
episode_id = f'{programme_id}_{headline["headline_id"]}'
|
||||
episode = traverse_obj(headline, ('file_list', 0, {dict}))
|
||||
description = self._extract_extended_description(episode_id, episode)
|
||||
if not description:
|
||||
self.report_warning('Failed to get extended description, falling back to summary')
|
||||
description = traverse_obj(episode, ('file_title_sub', {str}))
|
||||
|
||||
return {
|
||||
**series_meta,
|
||||
@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
|
||||
'was_live': True,
|
||||
'series': series_meta.get('title'),
|
||||
'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
|
||||
'description': description,
|
||||
**traverse_obj(episode, {
|
||||
'title': 'file_title',
|
||||
'title': ('file_title', {str}),
|
||||
'description': ('file_title_sub', {str}),
|
||||
'timestamp': ('open_time', {unified_timestamp}),
|
||||
'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
|
||||
}),
|
||||
@ -706,32 +782,58 @@ def _real_extract(self, url):
|
||||
site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
|
||||
programme_id = f'{site_id}_{corner_id}'
|
||||
|
||||
if site_id == 'F261':
|
||||
json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
|
||||
else:
|
||||
json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
|
||||
|
||||
meta = self._download_json(json_url, programme_id)['main']
|
||||
|
||||
if site_id == 'F261': # XXX: News programmes use old API (for now?)
|
||||
meta = self._download_json(
|
||||
'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main']
|
||||
series_meta = traverse_obj(meta, {
|
||||
'title': 'program_name',
|
||||
'channel': 'media_name',
|
||||
'uploader': 'media_name',
|
||||
'title': ('program_name', {str}),
|
||||
'channel': ('media_name', {str}),
|
||||
'uploader': ('media_name', {str}),
|
||||
'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
|
||||
}, get_all=False)
|
||||
|
||||
if headline_id:
|
||||
return self._extract_episode_info(
|
||||
traverse_obj(meta, (
|
||||
'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
|
||||
programme_id, series_meta)
|
||||
headline = traverse_obj(
|
||||
meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any))
|
||||
if not headline:
|
||||
raise ExtractorError('Content not found; it has most likely expired', expected=True)
|
||||
return self._extract_news_info(headline, programme_id, series_meta)
|
||||
|
||||
def entries():
|
||||
def news_entries():
|
||||
for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
|
||||
yield self._extract_episode_info(headline, programme_id, series_meta)
|
||||
yield self._extract_news_info(headline, programme_id, series_meta)
|
||||
|
||||
return self.playlist_result(
|
||||
entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
|
||||
news_entries(), programme_id, description=meta.get('site_detail'), **series_meta)
|
||||
|
||||
meta = self._download_json(
|
||||
'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={
|
||||
'site_id': site_id,
|
||||
'corner_site_id': corner_id,
|
||||
})
|
||||
|
||||
fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ')
|
||||
series_meta = {
|
||||
'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta),
|
||||
'series_id': programme_id,
|
||||
'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})),
|
||||
'channel': fallback_station,
|
||||
'uploader': fallback_station,
|
||||
}
|
||||
|
||||
if headline_id:
|
||||
episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any))
|
||||
if not episode:
|
||||
raise ExtractorError('Content not found; it has most likely expired', expected=True)
|
||||
return self._extract_episode_info(episode, programme_id, series_meta)
|
||||
|
||||
def entries():
|
||||
for episode in traverse_obj(meta, ('episodes', ..., {dict})):
|
||||
yield self._extract_episode_info(episode, programme_id, series_meta)
|
||||
|
||||
return self.playlist_result(
|
||||
entries(), programme_id, title=series_meta.get('series'),
|
||||
description=meta.get('series_description'), **series_meta)
|
||||
|
||||
|
||||
class NhkRadioNewsPageIE(InfoExtractor):
|
||||
|
@ -2,6 +2,7 @@
|
||||
import urllib.parse
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .sproutvideo import VidsIoIE
|
||||
from .vimeo import VimeoIE
|
||||
from ..networking.exceptions import HTTPError
|
||||
from ..utils import (
|
||||
@ -12,6 +13,7 @@
|
||||
int_or_none,
|
||||
mimetype2ext,
|
||||
parse_iso8601,
|
||||
smuggle_url,
|
||||
str_or_none,
|
||||
traverse_obj,
|
||||
url_or_none,
|
||||
@ -305,22 +307,27 @@ def _real_extract(self, url):
|
||||
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
|
||||
}))
|
||||
|
||||
# all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo
|
||||
headers = {'referer': 'https://patreon.com/'}
|
||||
|
||||
# handle Vimeo embeds
|
||||
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
|
||||
v_url = urllib.parse.unquote(self._html_search_regex(
|
||||
r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
|
||||
traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
|
||||
if url_or_none(v_url) and self._request_webpage(
|
||||
v_url, video_id, 'Checking Vimeo embed URL',
|
||||
headers={'Referer': 'https://patreon.com/'},
|
||||
fatal=False, errnote=False):
|
||||
v_url, video_id, 'Checking Vimeo embed URL', headers=headers, fatal=False, errnote=False):
|
||||
entries.append(self.url_result(
|
||||
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
|
||||
VimeoIE, url_transparent=True))
|
||||
|
||||
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
|
||||
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
|
||||
entries.append(self.url_result(embed_url))
|
||||
if embed_url and (urlh := self._request_webpage(
|
||||
embed_url, video_id, 'Checking embed URL', headers=headers,
|
||||
fatal=False, errnote=False, expected_status=403)):
|
||||
# Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie
|
||||
if urlh.status != 403 or VidsIoIE.suitable(embed_url):
|
||||
entries.append(self.url_result(smuggle_url(embed_url, headers)))
|
||||
|
||||
post_file = traverse_obj(attributes, ('post_file', {dict}))
|
||||
if post_file:
|
||||
|
@ -1,28 +1,40 @@
|
||||
from .common import InfoExtractor
|
||||
from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
|
||||
from ..utils import (
|
||||
OnDemandPagedList,
|
||||
clean_html,
|
||||
int_or_none,
|
||||
jwt_decode_hs256,
|
||||
url_or_none,
|
||||
)
|
||||
from ..utils.traversal import traverse_obj
|
||||
|
||||
|
||||
def result_from_props(props, episode_id=None):
|
||||
def result_from_props(props):
|
||||
return {
|
||||
'id': props.get('podcast_id') or episode_id,
|
||||
'title': props.get('title'),
|
||||
'url': props['mediaURL'],
|
||||
**traverse_obj(props, {
|
||||
'id': ('_id', {str}),
|
||||
'title': ('title', {str}),
|
||||
'url': ('mediaURL', {url_or_none}),
|
||||
'description': ('description', {clean_html}),
|
||||
'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
|
||||
'timestamp': ('timestamp', {int_or_none}),
|
||||
'duration': ('duration', {int_or_none}),
|
||||
}),
|
||||
'ext': 'mp3',
|
||||
'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
|
||||
'timestamp': props.get('timestamp'),
|
||||
'duration': int_or_none(props.get('duration')),
|
||||
'vcodec': 'none',
|
||||
}
|
||||
|
||||
|
||||
class PodbayFMIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$'
|
||||
_VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
|
||||
_TESTS = [{
|
||||
'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
|
||||
'md5': '98b41285dcf7989d105a4ed0404054cf',
|
||||
'md5': '895ac8505de349515f5ee8a4a3195c93',
|
||||
'info_dict': {
|
||||
'id': '1647338400',
|
||||
'id': '62306451f4a48e58d0c4d6a8',
|
||||
'title': 'Part One: Kissinger',
|
||||
'ext': 'mp3',
|
||||
'description': r're:^We begin our epic six part series on Henry Kissinger.+',
|
||||
'thumbnail': r're:^https?://.*\.jpg',
|
||||
'timestamp': 1647338400,
|
||||
'duration': 5001,
|
||||
@ -34,24 +46,25 @@ def _real_extract(self, url):
|
||||
episode_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, episode_id)
|
||||
data = self._search_nextjs_data(webpage, episode_id)
|
||||
return result_from_props(data['props']['pageProps']['episode'], episode_id)
|
||||
return result_from_props(data['props']['pageProps']['episode'])
|
||||
|
||||
|
||||
class PodbayFMChannelIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$'
|
||||
_VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
|
||||
_TESTS = [{
|
||||
'url': 'https://podbay.fm/p/behind-the-bastards',
|
||||
'info_dict': {
|
||||
'id': 'behind-the-bastards',
|
||||
'title': 'Behind the Bastards',
|
||||
},
|
||||
'playlist_mincount': 21,
|
||||
}]
|
||||
_PAGE_SIZE = 10
|
||||
|
||||
def _fetch_page(self, channel_id, pagenum):
|
||||
return self._download_json(
|
||||
f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
|
||||
channel_id)['podcast']
|
||||
f'Downloading channel JSON page {pagenum + 1}', channel_id)['podcast']
|
||||
|
||||
@staticmethod
|
||||
def _results_from_page(channel_id, page):
|
||||
|
@ -1,3 +1,5 @@
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
@ -6,6 +8,7 @@
|
||||
traverse_obj,
|
||||
unified_timestamp,
|
||||
url_or_none,
|
||||
urljoin,
|
||||
)
|
||||
|
||||
|
||||
@ -21,8 +24,7 @@ class RTVSLOIE(InfoExtractor):
|
||||
_API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622'
|
||||
SUB_LANGS_MAP = {'Slovenski': 'sl'}
|
||||
|
||||
_TESTS = [
|
||||
{
|
||||
_TESTS = [{
|
||||
'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
|
||||
'info_dict': {
|
||||
'id': '174842550',
|
||||
@ -88,8 +90,7 @@ class RTVSLOIE(InfoExtractor):
|
||||
}, {
|
||||
'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
|
||||
'only_matching': True,
|
||||
},
|
||||
]
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
v_id = self._match_id(url)
|
||||
@ -164,3 +165,26 @@ def _real_extract(self, url):
|
||||
'series': meta.get('showName'),
|
||||
'series_id': meta.get('showId'),
|
||||
}
|
||||
|
||||
|
||||
class RTVSLOShowIE(InfoExtractor):
|
||||
IE_NAME = 'rtvslo.si:show'
|
||||
_VALID_URL = r'https?://(?:365|4d)\.rtvslo.si/oddaja/[^/?#&]+/(?P<id>\d+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'https://365.rtvslo.si/oddaja/ekipa-bled/173250997',
|
||||
'info_dict': {
|
||||
'id': '173250997',
|
||||
'title': 'Ekipa Bled',
|
||||
},
|
||||
'playlist_count': 18,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
playlist_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, playlist_id)
|
||||
|
||||
return self.playlist_from_matches(
|
||||
re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage),
|
||||
playlist_id, self._html_extract_title(webpage),
|
||||
getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE)
|
||||
|
@ -95,7 +95,7 @@ def _update_client_id(self):
|
||||
return
|
||||
raise ExtractorError('Unable to extract client id')
|
||||
|
||||
def _download_json(self, *args, **kwargs):
|
||||
def _call_api(self, *args, **kwargs):
|
||||
non_fatal = kwargs.get('fatal') is False
|
||||
if non_fatal:
|
||||
del kwargs['fatal']
|
||||
@ -104,7 +104,7 @@ def _download_json(self, *args, **kwargs):
|
||||
query['client_id'] = self._CLIENT_ID
|
||||
kwargs['query'] = query
|
||||
try:
|
||||
return super()._download_json(*args, **kwargs)
|
||||
return self._download_json(*args, **kwargs)
|
||||
except ExtractorError as e:
|
||||
if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
|
||||
self._store_client_id(None)
|
||||
@ -163,7 +163,7 @@ def genNumBlock():
|
||||
'user_agent': self._USER_AGENT
|
||||
}
|
||||
|
||||
response = self._download_json(
|
||||
response = self._call_api(
|
||||
self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
|
||||
None, note='Verifying login token...', fatal=False,
|
||||
data=json.dumps(payload).encode())
|
||||
@ -217,12 +217,26 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
|
||||
query['secret_token'] = secret_token
|
||||
|
||||
if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
|
||||
download_url = update_url_query(
|
||||
self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
|
||||
redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
|
||||
if redirect_url:
|
||||
try:
|
||||
# Do not use _call_api(); HTTP Error codes have different meanings for this request
|
||||
download_data = self._download_json(
|
||||
f'{self._API_V2_BASE}tracks/{track_id}/download', track_id,
|
||||
'Downloading original download format info JSON', query=query, headers=self._HEADERS)
|
||||
except ExtractorError as e:
|
||||
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
|
||||
self.report_warning(
|
||||
'Original download format is only available '
|
||||
f'for registered users. {self._login_hint()}')
|
||||
elif isinstance(e.cause, HTTPError) and e.cause.status == 403:
|
||||
self.write_debug('Original download format is not available for this client')
|
||||
else:
|
||||
self.report_warning(e.msg)
|
||||
download_data = None
|
||||
|
||||
if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})):
|
||||
urlh = self._request_webpage(
|
||||
HEADRequest(redirect_url), track_id, 'Checking for original download format', fatal=False)
|
||||
HEADRequest(redirect_url), track_id, 'Checking original download format availability',
|
||||
'Original download format is not available', fatal=False)
|
||||
if urlh:
|
||||
format_url = urlh.url
|
||||
format_urls.add(format_url)
|
||||
@ -303,7 +317,7 @@ def add_format(f, protocol, is_preview=False):
|
||||
stream = None
|
||||
for retry in self.RetryManager(fatal=False):
|
||||
try:
|
||||
stream = self._download_json(
|
||||
stream = self._call_api(
|
||||
format_url, track_id, f'Downloading {identifier} format info JSON',
|
||||
query=query, headers=self._HEADERS)
|
||||
except ExtractorError as e:
|
||||
@ -630,7 +644,7 @@ def _real_extract(self, url):
|
||||
resolve_title += f'/{token}'
|
||||
info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
|
||||
|
||||
info = self._download_json(
|
||||
info = self._call_api(
|
||||
info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
|
||||
|
||||
return self._extract_info_dict(info, full_title, token)
|
||||
@ -641,7 +655,7 @@ def _extract_set(self, playlist, token=None):
|
||||
playlist_id = str(playlist['id'])
|
||||
tracks = playlist.get('tracks') or []
|
||||
if not all(t.get('permalink_url') for t in tracks) and token:
|
||||
tracks = self._download_json(
|
||||
tracks = self._call_api(
|
||||
self._API_V2_BASE + 'tracks', playlist_id,
|
||||
'Downloading tracks', query={
|
||||
'ids': ','.join([str(t['id']) for t in tracks]),
|
||||
@ -699,7 +713,7 @@ def _real_extract(self, url):
|
||||
if token:
|
||||
full_title += '/' + token
|
||||
|
||||
info = self._download_json(self._resolv_url(
|
||||
info = self._call_api(self._resolv_url(
|
||||
self._BASE_URL + full_title), full_title, headers=self._HEADERS)
|
||||
|
||||
if 'errors' in info:
|
||||
@ -730,7 +744,7 @@ def _entries(self, url, playlist_id):
|
||||
for i in itertools.count():
|
||||
for retry in self.RetryManager():
|
||||
try:
|
||||
response = self._download_json(
|
||||
response = self._call_api(
|
||||
url, playlist_id, query=query, headers=self._HEADERS,
|
||||
note=f'Downloading track page {i + 1}')
|
||||
break
|
||||
@ -838,7 +852,7 @@ def _real_extract(self, url):
|
||||
mobj = self._match_valid_url(url)
|
||||
uploader = mobj.group('user')
|
||||
|
||||
user = self._download_json(
|
||||
user = self._call_api(
|
||||
self._resolv_url(self._BASE_URL + uploader),
|
||||
uploader, 'Downloading user info', headers=self._HEADERS)
|
||||
|
||||
@ -864,7 +878,7 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
|
||||
|
||||
def _real_extract(self, url):
|
||||
user_id = self._match_id(url)
|
||||
user = self._download_json(
|
||||
user = self._call_api(
|
||||
self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)
|
||||
|
||||
return self._extract_playlist(
|
||||
@ -886,7 +900,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
|
||||
def _real_extract(self, url):
|
||||
track_name = self._match_id(url)
|
||||
|
||||
track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
|
||||
track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS)
|
||||
track_id = self._search_regex(
|
||||
r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
|
||||
|
||||
@ -930,7 +944,7 @@ class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
|
||||
def _real_extract(self, url):
|
||||
slug, relation = self._match_valid_url(url).group('slug', 'relation')
|
||||
|
||||
track = self._download_json(
|
||||
track = self._call_api(
|
||||
self._resolv_url(self._BASE_URL + slug),
|
||||
slug, 'Downloading track info', headers=self._HEADERS)
|
||||
|
||||
@ -965,7 +979,7 @@ def _real_extract(self, url):
|
||||
if token:
|
||||
query['secret_token'] = token
|
||||
|
||||
data = self._download_json(
|
||||
data = self._call_api(
|
||||
self._API_V2_BASE + 'playlists/' + playlist_id,
|
||||
playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
|
||||
|
||||
@ -1000,7 +1014,7 @@ def _get_collection(self, endpoint, collection_id, **query):
|
||||
next_url = update_url_query(self._API_V2_BASE + endpoint, query)
|
||||
|
||||
for i in itertools.count(1):
|
||||
response = self._download_json(
|
||||
response = self._call_api(
|
||||
next_url, collection_id, f'Downloading page {i}',
|
||||
'Unable to download API page', headers=self._HEADERS)
|
||||
|
||||
|
198
yt_dlp/extractor/sproutvideo.py
Normal file
@ -0,0 +1,198 @@
import base64
import urllib.parse

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
qualities,
remove_start,
smuggle_url,
unsmuggle_url,
update_url_query,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj


class SproutVideoIE(InfoExtractor):
_NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+'
_VALID_URL = rf'https?:{_NO_SCHEME_RE}'
_EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']']
_TESTS = [{
'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
'info_dict': {
'id': '4c9dddb01910e3c9c4',
'ext': 'mp4',
'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
'duration': 576,
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
},
}, {
'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27',
'md5': 'cebae5cf558cca83271917cf4ec03f26',
'info_dict': {
'id': 'a79fdcb21f1be2c62e',
'ext': 'mp4',
'title': 'HS_01_Live Stream 2023-01-14 10:00',
'duration': 703,
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
},
}, {
# http formats 'sd' and 'hd' are available
'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
'md5': 'f368c78df07e78a749508b221528672c',
'info_dict': {
'id': '119cd6bc1a18e6cd98',
'ext': 'mp4',
'title': '3. Updating your Partner details',
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
'duration': 60,
},
'params': {'format': 'hd'},
}, {
# subtitles
'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd',
'md5': '7f6798f037d7a3e3e07e67959de68fc6',
'info_dict': {
'id': '119dd8ba121ee0cc98',
'ext': 'mp4',
'title': 'Recipients Setup - Domestic Wire Only',
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
'duration': 77,
'subtitles': {'en': 'count:1'},
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
'info_dict': {
'id': '4c9dddb01910e3c9c4',
'ext': 'mp4',
'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
'duration': 576,
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
},
}]
_M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8'
_QUALITIES = ('hd', 'uhd', 'source') # Exclude 'sd' to prioritize hls formats above it

@staticmethod
def _policy_to_qs(policy, signature_key, as_string=False):
query = {}
for key, value in policy['signatures'][signature_key].items():
query[remove_start(key, 'CloudFront-')] = value
query['sessionID'] = policy['sessionID']
return urllib.parse.urlencode(query, doseq=True) if as_string else query

@classmethod
def _extract_embed_urls(cls, url, webpage):
for embed_url in super()._extract_embed_urls(url, webpage):
if embed_url.startswith('//'):
embed_url = f'https:{embed_url}'
yield smuggle_url(embed_url, {'referer': url})

def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
webpage = self._download_webpage(
url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
data = self._search_json(
r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())

formats, subtitles = [], {}
headers = {
'Accept': '*/*',
'Origin': 'https://videos.sproutvideo.com',
'Referer': url,
}

# HLS extraction is fatal; only attempt it if the JSON data says it's available
if traverse_obj(data, 'hls'):
manifest_query = self._policy_to_qs(data, 'm')
fragment_query = self._policy_to_qs(data, 't', as_string=True)
key_query = self._policy_to_qs(data, 'k', as_string=True)

formats.extend(self._extract_m3u8_formats(
self._M3U8_URL_TMPL.format(**data), video_id, 'mp4',
m3u8_id='hls', headers=headers, query=manifest_query))
for fmt in formats:
fmt.update({
'url': update_url_query(fmt['url'], manifest_query),
'extra_param_to_segment_url': fragment_query,
'extra_param_to_key_url': key_query,
})

if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))):
quality = qualities(self._QUALITIES)
acodec = 'none' if data.get('has_audio') is False else None
formats.extend([{
'format_id': str(format_id),
'url': format_url,
'ext': 'mp4',
'quality': quality(format_id),
'acodec': acodec,
} for format_id, format_url in downloads])

for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))):
subtitles.setdefault(sub_data.get('srclang', 'en'), []).append({
'url': sub_data['src'],
})

return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'http_headers': headers,
**traverse_obj(data, {
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'thumbnail': ('posterframe_url', {url_or_none}),
}),
}


class VidsIoIE(InfoExtractor):
IE_NAME = 'vids.io'
_VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)'
_TESTS = [{
'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming',
'md5': '9bbbb2c0c0739eb163b80f87b8d77c9e',
'info_dict': {
'id': '799cd8b11c10efc1f0',
'ext': 'mp4',
'title': 'How to Video: Live Streaming',
'duration': 2787,
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
},
}]

def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403)

if urlh.status == 403:
password = self.get_param('videopassword')
if not password:
raise ExtractorError(
'This video is password-protected; use the --video-password option', expected=True)
try:
webpage = self._download_webpage(
url, display_id, 'Submitting video password',
data=urlencode_postdata({
'password': password,
**self._hidden_inputs(webpage),
}))
# Requests with user's session cookie `_sproutvideo_session` are now authorized
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 403:
raise ExtractorError('Incorrect password', expected=True)
raise

if embed_url := next(SproutVideoIE._extract_embed_urls(url, webpage), None):
return self.url_result(embed_url, SproutVideoIE, video_id)

raise ExtractorError('Unable to extract any SproutVideo embed url')
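Two small transforms drive the extractor above: the embed page carries a base64-encoded JSON blob in `var dat = "..."`, and each CloudFront signature block inside it is flattened into a query string (what `_policy_to_qs` does). A self-contained sketch of both, with a made-up sample payload:

```python
import base64
import json
import urllib.parse

# Made-up payload shaped like the decoded `dat` blob; real embeds carry many more fields
sample_dat = base64.b64encode(json.dumps({
    'title': 'Demo',
    'sessionID': 'abc123',
    'signatures': {
        'm': {'CloudFront-Policy': 'p', 'CloudFront-Signature': 's', 'CloudFront-Key-Pair-Id': 'k'},
    },
}).encode()).decode()

data = json.loads(base64.b64decode(sample_dat).decode())

def policy_to_qs(policy, signature_key):
    # Mirrors _policy_to_qs: strip the 'CloudFront-' prefix, append the session ID
    query = {k.removeprefix('CloudFront-'): v for k, v in policy['signatures'][signature_key].items()}
    query['sessionID'] = policy['sessionID']
    return urllib.parse.urlencode(query, doseq=True)

print(policy_to_qs(data, 'm'))
# Policy=p&Signature=s&Key-Pair-Id=k&sessionID=abc123
```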
@ -213,8 +213,19 @@ def _extract_aweme_app(self, aweme_id):
return self._parse_aweme_video_app(aweme_detail)

def _extract_web_data_and_status(self, url, video_id, fatal=True):
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or ''
video_data, status = {}, None
video_data, status = {}, -1

res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
if res is False:
return video_data, status

webpage, urlh = res
if urllib.parse.urlparse(urlh.url).path == '/login':
message = 'TikTok is requiring login for access to this content'
if fatal:
self.raise_login_required(message)
self.report_warning(f'{message}. {self._login_hint()}')
return video_data, status

if universal_data := self._get_universal_data(webpage, video_id):
self.write_debug('Found universal data for rehydration')
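The key change above: a login wall is detected from the path of the final URL after redirects rather than from the HTTP status, and the helper now returns -1 as its status sentinel in that case. The redirect check in isolation (URLs are illustrative):

```python
import urllib.parse

for final_url in ('https://www.tiktok.com/@someuser/video/7123456789012345678',
                  'https://www.tiktok.com/login?redirect_url=%2F%40someuser'):
    is_login_wall = urllib.parse.urlparse(final_url).path == '/login'
    print(final_url, '->', 'login wall' if is_login_wall else 'ok')
```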
@ -13,6 +13,7 @@


class TubiTvIE(InfoExtractor):
IE_NAME = 'tubitv'
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)'
_LOGIN_URL = 'http://tubitv.com/login'
_NETRC_MACHINE = 'tubitv'
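For context, the `type` group in `TubiTvIE._VALID_URL` above distinguishes /video/, /movies/ and /tv-shows/ URLs from /series/ URLs, which fall through to tubitv:series instead. A quick check with made-up URLs:

```python
import re

_VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)'

for url in (
    'https://tubitv.com/movies/383676/some-movie',          # illustrative URLs, not real test cases
    'https://tubitv.com/tv-shows/321886/s01-e01-pilot',
    'https://tubitv.com/video/283829/some-clip',
    'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross',  # handled by tubitv:series
):
    m = re.match(_VALID_URL, url)
    print(url, '->', m.group('type', 'id') if m else None)
```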
@ -148,30 +149,54 @@ def _real_extract(self, url):


class TubiTvShowIE(InfoExtractor):
_WORKING = False
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)'
IE_NAME = 'tubitv:series'
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?'
_TESTS = [{
'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true',
'playlist_mincount': 390,
'playlist_mincount': 389,
'info_dict': {
'id': 'the-joy-of-painting-with-bob-ross',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1',
'playlist_count': 26,
'info_dict': {
'id': 'the-saddle-club-season-1',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3',
'playlist_count': 19,
'info_dict': {
'id': 'the-saddle-club-season-3',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/',
'playlist_mincount': 71,
'info_dict': {
'id': 'the-saddle-club',
},
}]

def _entries(self, show_url, show_name):
show_webpage = self._download_webpage(show_url, show_name)
def _entries(self, show_url, playlist_id, selected_season):
webpage = self._download_webpage(show_url, playlist_id)

show_json = self._parse_json(self._search_regex(
r'window\.__data\s*=\s*({[^<]+});\s*</script>',
show_webpage, 'data'), show_name, transform_source=js_to_json)['video']
data = self._search_json(
r'window\.__data\s*=', webpage, 'data', playlist_id,
transform_source=js_to_json)['video']

for episode_id in show_json['fullContentById']:
if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's':
continue
# v['number'] is already a decimal string, but stringify to protect against API changes
path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., {dict}]

for season in traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)):
season_number = int_or_none(season.get('number'))
for episode in traverse_obj(season, ('episodes', lambda _, v: v['id'])):
episode_id = episode['id']
yield self.url_result(
f'https://tubitv.com/tv-shows/{episode_id}/',
ie=TubiTvIE.ie_key(), video_id=episode_id)
f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id,
season_number=season_number, episode_number=int_or_none(episode.get('num')))

def _real_extract(self, url):
show_name = self._match_valid_url(url).group('show_name')
return self.playlist_result(self._entries(url, show_name), playlist_id=show_name)
playlist_id, selected_season = self._match_valid_url(url).group('show_name', 'season')
if selected_season:
playlist_id = f'{playlist_id}-season-{selected_season}'
return self.playlist_result(self._entries(url, playlist_id, selected_season), playlist_id)
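A plain-Python paraphrase of the season selection above (the real code uses yt-dlp's `traverse_obj`): with /season-N in the URL only that season's episodes are yielded, otherwise every season is flattened. The `data` dict is a made-up stand-in for `window.__data['video']['byId']`:

```python
data = {'byId': {
    '2311': {'type': 's', 'seasons': [
        {'number': '1', 'episodes': [{'id': '100', 'num': '1'}, {'id': '101', 'num': '2'}]},
        {'number': '3', 'episodes': [{'id': '300', 'num': '1'}]},
    ]},
    '100': {'type': 'v'},  # non-series entries are skipped
}}

def episode_ids(data, selected_season=None):
    for item in data['byId'].values():
        if item.get('type') != 's':
            continue
        for season in item.get('seasons', []):
            if selected_season and str(season.get('number')) != selected_season:
                continue
            for episode in season.get('episodes', []):
                yield season.get('number'), episode['id']

print(list(episode_ids(data)))                       # all seasons
print(list(episode_ids(data, selected_season='3')))  # only /season-3
```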
@ -885,14 +885,14 @@ def _get_count(self, data, *path_list):
return count

@staticmethod
def _extract_thumbnails(data, *path_list):
def _extract_thumbnails(data, *path_list, final_key='thumbnails'):
"""
Extract thumbnails from thumbnails dict
@param path_list: path list to level that contains 'thumbnails' key
"""
thumbnails = []
for path in path_list or [()]:
for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)):
for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)):
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
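The new `final_key` parameter exists because classic header renderers nest their image lists under 'thumbnails' while pageHeaderViewModel uses 'sources'. A stripped-down stand-in for the helper, with made-up sample data:

```python
def extract_image_urls(data, *paths, final_key='thumbnails'):
    urls = []
    for path in paths or [()]:
        node = data
        for key in path:  # walk the dict path; the real helper uses traverse_obj
            node = node.get(key, {}) if isinstance(node, dict) else {}
        urls += [t['url'] for t in node.get(final_key, []) if t.get('url')]
    return urls

classic = {'banner': {'thumbnails': [{'url': 'https://example.invalid/banner=w1060'}]}}
view_model = {'image': {'sources': [{'url': 'https://example.invalid/banner=w2120'}]}}

print(extract_image_urls(classic, ('banner',)))
print(extract_image_urls(view_model, ('image',), final_key='sources'))
```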
@ -5124,6 +5124,10 @@ def _extract_metadata_from_tabs(self, item_id, data):
else:
metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)

# pageHeaderViewModel slow rollout began April 2024
page_header_view_model = traverse_obj(data, (
'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict}))

# We can get the uncropped banner/avatar by replacing the crop params with '=s0'
# See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
def _get_uncropped(url):
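The comment above only describes the trick; the body of `_get_uncropped()` lies outside this hunk. One plausible way to perform the substitution (a guess for illustration, not necessarily the real helper, and the URL is made up):

```python
import re

url = 'https://yt3.googleusercontent.com/abc123=w1060-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj'
# Replace everything from the first '=' (the crop parameters) with '=s0'
print(re.sub(r'=.+$', '=s0', url))
# https://yt3.googleusercontent.com/abc123=s0
```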
@ -5139,8 +5143,10 @@ def _get_uncropped(url):
'preference': 1,
})

channel_banners = self._extract_thumbnails(
data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
channel_banners = (
self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
or self._extract_thumbnails(
page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources'))
for banner in channel_banners:
banner['preference'] = -10

@ -5167,7 +5173,11 @@ def _get_uncropped(url):
or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag'))
or info['id']),
'availability': self._extract_availability(data),
'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
'channel_follower_count': (
self._get_count(data, ('header', ..., 'subscriberCountText'))
or traverse_obj(page_header_view_model, (
'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts',
lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))),
'description': try_get(metadata_renderer, lambda x: x.get('description', '')),
'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str}))
or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))),
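The fallback above finds a metadata part whose text contains 'subscribers' (e.g. '2.25M subscribers') and feeds it to yt-dlp's `parse_count`. A tiny standalone approximation of that conversion; the real utility covers more notations:

```python
import re

def parse_count_approx(text):
    m = re.search(r'([\d.,]+)\s*([KMB])?', text or '')
    if not m:
        return None
    value = float(m.group(1).replace(',', ''))
    return int(value * {'K': 1e3, 'M': 1e6, 'B': 1e9}.get(m.group(2), 1))

for text in ('2.25M subscribers', '1,234 subscribers', '987 subscribers'):
    print(text, '->', parse_count_approx(text))
```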