mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-07-03 10:39:12 +02:00

Merge branch 'yt-dlp:master' into pr/live-sections

Commit 172dfbeaed by bashonly, 2024-05-10 13:52:35 -05:00
No known key found for this signature in database; GPG Key ID: 783F096F253D15B0
37 changed files with 676 additions and 326 deletions


@@ -12,6 +12,9 @@ on:
       unix:
         default: true
         type: boolean
+      linux_static:
+        default: true
+        type: boolean
       linux_arm:
         default: true
         type: boolean
@@ -27,9 +30,6 @@ on:
       windows32:
         default: true
         type: boolean
-      meta_files:
-        default: true
-        type: boolean
       origin:
         required: false
         default: ''
@@ -52,7 +52,11 @@ on:
         default: stable
         type: string
       unix:
-        description: yt-dlp, yt-dlp.tar.gz, yt-dlp_linux, yt-dlp_linux.zip
+        description: yt-dlp, yt-dlp.tar.gz
+        default: true
+        type: boolean
+      linux_static:
+        description: yt-dlp_linux
         default: true
         type: boolean
       linux_arm:
@@ -75,10 +79,6 @@ on:
         description: yt-dlp_x86.exe
         default: true
         type: boolean
-      meta_files:
-        description: SHA2-256SUMS, SHA2-512SUMS, _update_spec
-        default: true
-        type: boolean
       origin:
         description: Origin
         required: false
@@ -112,27 +112,9 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.10"
-      - uses: conda-incubator/setup-miniconda@v3
-        with:
-          miniforge-variant: Mambaforge
-          use-mamba: true
-          channels: conda-forge
-          auto-update-conda: true
-          activate-environment: ""
-          auto-activate-base: false
       - name: Install Requirements
         run: |
           sudo apt -y install zip pandoc man sed
-          cat > ./requirements.txt << EOF
-          python=3.10.*
-          pyinstaller
-          brotli-python
-          EOF
-          python devscripts/install_deps.py --print \
-            --exclude brotli --exclude brotlicffi \
-            --include secretstorage >> ./requirements.txt
-          mamba create -n build --file ./requirements.txt
       - name: Prepare
         run: |
           python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}"
@@ -141,30 +123,15 @@ jobs:
       - name: Build Unix platform-independent binary
         run: |
           make all tar
-      - name: Build Unix standalone binary
-        shell: bash -l {0}
-        run: |
-          unset LD_LIBRARY_PATH  # Harmful; set by setup-python
-          conda activate build
-          python -m bundle.pyinstaller --onedir
-          (cd ./dist/yt-dlp_linux && zip -r ../yt-dlp_linux.zip .)
-          python -m bundle.pyinstaller
-          mv ./dist/yt-dlp_linux ./yt-dlp_linux
-          mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip
       - name: Verify --update-to
         if: vars.UPDATE_TO_VERIFICATION
         run: |
-          binaries=("yt-dlp" "yt-dlp_linux")
-          for binary in "${binaries[@]}"; do
-            chmod +x ./${binary}
-            cp ./${binary} ./${binary}_downgraded
-            version="$(./${binary} --version)"
-            ./${binary}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04
-            downgraded_version="$(./${binary}_downgraded --version)"
-            [[ "$version" != "$downgraded_version" ]]
-          done
+          chmod +x ./yt-dlp
+          cp ./yt-dlp ./yt-dlp_downgraded
+          version="$(./yt-dlp --version)"
+          ./yt-dlp_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04
+          downgraded_version="$(./yt-dlp_downgraded --version)"
+          [[ "$version" != "$downgraded_version" ]]
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
@@ -172,8 +139,39 @@ jobs:
         path: |
           yt-dlp
           yt-dlp.tar.gz
-          yt-dlp_linux
-          yt-dlp_linux.zip
+        compression-level: 0
+
+  linux_static:
+    needs: process
+    if: inputs.linux_static
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build static executable
+        env:
+          channel: ${{ inputs.channel }}
+          origin: ${{ needs.process.outputs.origin }}
+          version: ${{ inputs.version }}
+        run: |
+          mkdir ~/build
+          cd bundle/docker
+          docker compose up --build static
+          sudo chown "${USER}:docker" ~/build/yt-dlp_linux
+      - name: Verify --update-to
+        if: vars.UPDATE_TO_VERIFICATION
+        run: |
+          chmod +x ~/build/yt-dlp_linux
+          cp ~/build/yt-dlp_linux ~/build/yt-dlp_linux_downgraded
+          version="$(~/build/yt-dlp_linux --version)"
+          ~/build/yt-dlp_linux_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04
+          downgraded_version="$(~/build/yt-dlp_linux_downgraded --version)"
+          [[ "$version" != "$downgraded_version" ]]
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-bin-${{ github.job }}
+          path: |
+            ~/build/yt-dlp_linux
         compression-level: 0
 
   linux_arm:
@@ -254,7 +252,7 @@ jobs:
          # We need to fuse our own universal2 wheels for curl_cffi
          python3 -m pip install -U --user delocate
          mkdir curl_cffi_whls curl_cffi_universal2
-          python3 devscripts/install_deps.py --print -o --include curl_cffi > requirements.txt
+          python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt
          for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do
            python3 -m pip download \
              --only-binary=:all: \
@@ -300,7 +298,7 @@ jobs:
   macos_legacy:
     needs: process
     if: inputs.macos_legacy
-    runs-on: macos-latest
+    runs-on: macos-12
     steps:
       - uses: actions/checkout@v4
@@ -362,7 +360,7 @@ jobs:
      - name: Install Requirements
        run: |  # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds
          python devscripts/install_deps.py -o --include build
-          python devscripts/install_deps.py --include py2exe --include curl_cffi
+          python devscripts/install_deps.py --include py2exe --include curl-cffi
          python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl"

      - name: Prepare
@@ -447,10 +445,11 @@ jobs:
           compression-level: 0
 
   meta_files:
-    if: inputs.meta_files && always() && !cancelled()
+    if: always() && !cancelled()
     needs:
       - process
       - unix
+      - linux_static
       - linux_arm
       - macos
       - macos_legacy


@@ -53,7 +53,7 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install test requirements
-        run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi
+        run: python3 ./devscripts/install_deps.py --include dev --include curl-cffi
      - name: Run tests
        continue-on-error: False
        run: |


@@ -202,7 +202,7 @@ #### Impersonation
 The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting.
 
 * [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE)
-    * Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]`
+    * Can be installed with the `curl-cffi` group, e.g. `pip install yt-dlp[default,curl-cffi]`
     * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds
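As a usage sketch of what the renamed extra enables (hedged: the `--impersonate` and `--list-impersonate-targets` options come from yt-dlp's documentation of this feature, not from this diff, and the URL is a placeholder):

```sh
# Install yt-dlp together with the renamed curl-cffi extra
python3 -m pip install -U "yt-dlp[default,curl-cffi]"

# Inspect which impersonation targets the installed dependencies provide,
# then fetch a TLS-fingerprinting site while impersonating Chrome
yt-dlp --list-impersonate-targets
yt-dlp --impersonate chrome "https://example.com/watch/abc123"
```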

bundle/docker/compose.yml Normal file

@@ -0,0 +1,10 @@
+services:
+  static:
+    build: static
+    environment:
+      channel: ${channel}
+      origin: ${origin}
+      version: ${version}
+    volumes:
+      - ~/build:/build
+      - ../..:/yt-dlp
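For orientation, the `linux_static` workflow job above drives this service roughly as in the sketch below; the `channel`/`origin`/`version` values are illustrative, and they are consumed by the entrypoint script shown further down:

```sh
# Build the static yt-dlp_linux binary via the compose service.
# ~/build is bind-mounted to /build in the container, so the
# staticx output appears on the host.
mkdir -p ~/build
cd bundle/docker
channel=stable origin=yt-dlp/yt-dlp version=2024.05.10 \
    docker compose up --build static
ls -l ~/build/yt-dlp_linux
```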


@@ -0,0 +1,21 @@
+FROM alpine:3.19 as base
+
+RUN apk --update add --no-cache \
+    build-base \
+    python3 \
+    pipx \
+    ;
+RUN pipx install pyinstaller
+# Requires above step to prepare the shared venv
+RUN ~/.local/share/pipx/shared/bin/python -m pip install -U wheel
+
+RUN apk --update add --no-cache \
+    scons \
+    patchelf \
+    binutils \
+    ;
+RUN pipx install staticx
+
+WORKDIR /yt-dlp
+COPY entrypoint.sh /entrypoint.sh
+ENTRYPOINT /entrypoint.sh


@@ -0,0 +1,13 @@
+#!/bin/ash
+set -e
+
+source ~/.local/share/pipx/venvs/pyinstaller/bin/activate
+python -m devscripts.install_deps --include secretstorage
+python -m devscripts.make_lazy_extractors
+python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}"
+python -m bundle.pyinstaller
+deactivate
+
+source ~/.local/share/pipx/venvs/staticx/bin/activate
+staticx /yt-dlp/dist/yt-dlp_linux /build/yt-dlp_linux
+deactivate


@@ -53,7 +53,7 @@ dependencies = [
 
 [project.optional-dependencies]
 default = []
-curl_cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"]
+curl-cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"]
 secretstorage = [
     "cffi",
     "secretstorage",


@@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
             expected_status=TEAPOT_RESPONSE_STATUS)
         self.assertEqual(content, TEAPOT_RESPONSE_BODY)
 
+    def test_search_nextjs_data(self):
+        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
+        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
+        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
+        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
+        with self.assertRaises(DeprecationWarning):
+            self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
+
 
 if __name__ == '__main__':
     unittest.main()


@@ -785,6 +785,25 @@ def test_supported_impersonate_targets(self, handler):
                 assert res.status == 200
                 assert std_headers['user-agent'].lower() not in res.read().decode().lower()
 
+    def test_response_extensions(self, handler):
+        with handler() as rh:
+            for target in rh.supported_targets:
+                request = Request(
+                    f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target})
+                res = validate_and_send(rh, request)
+                assert res.extensions['impersonate'] == rh._get_request_target(request)
+
+    def test_http_error_response_extensions(self, handler):
+        with handler() as rh:
+            for target in rh.supported_targets:
+                request = Request(
+                    f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target})
+                try:
+                    validate_and_send(rh, request)
+                except HTTPError as e:
+                    res = e.response
+                assert res.extensions['impersonate'] == rh._get_request_target(request)
+
 
 class TestRequestHandlerMisc:
     """Misc generic tests for request handlers, not related to request or validation testing"""


@@ -2064,7 +2064,22 @@ def test_extract_basic_auth(self):
         assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz')
 
     @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows')
-    def test_Popen_windows_escaping(self):
+    def test_windows_escaping(self):
+        tests = [
+            'test"&',
+            '%CMDCMDLINE:~-1%&',
+            'a\nb',
+            '"',
+            '\\',
+            '!',
+            '^!',
+            'a \\ b',
+            'a \\" b',
+            'a \\ b\\',
+            # We replace \r with \n
+            ('a\r\ra', 'a\n\na'),
+        ]
+
         def run_shell(args):
             stdout, stderr, error = Popen.run(
                 args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -2072,15 +2087,15 @@ def run_shell(args):
             assert not error
             return stdout
 
-        # Test escaping
-        assert run_shell(['echo', 'test"&']) == '"test""&"\n'
-        assert run_shell(['echo', '%CMDCMDLINE:~-1%&']) == '"%CMDCMDLINE:~-1%&"\n'
-        assert run_shell(['echo', 'a\nb']) == '"a"\n"b"\n'
-        assert run_shell(['echo', '"']) == '""""\n'
-        assert run_shell(['echo', '\\']) == '\\\n'
-        # Test if delayed expansion is disabled
-        assert run_shell(['echo', '^!']) == '"^!"\n'
-        assert run_shell('echo "^!"') == '"^!"\n'
+        for argument in tests:
+            if isinstance(argument, str):
+                expected = argument
+            else:
+                argument, expected = argument
+
+            args = [sys.executable, '-c', 'import sys; print(end=sys.argv[1])', argument, 'end']
+            assert run_shell(args) == expected
+            assert run_shell(shell_quote(args, shell=True)) == expected
 
 
 if __name__ == '__main__':


@@ -2141,6 +2141,11 @@ def _filter(f):
 
     def _check_formats(self, formats):
         for f in formats:
+            working = f.get('__working')
+            if working is not None:
+                if working:
+                    yield f
+                continue
             self.to_screen('[info] Testing format %s' % f['format_id'])
             path = self.get_output_path('temp')
             if not self._ensure_dir_exists(f'{path}/'):
@@ -2157,33 +2162,44 @@ def _check_formats(self, formats):
                     os.remove(temp_file.name)
                 except OSError:
                     self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
+            f['__working'] = success
             if success:
                 yield f
             else:
                 self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
 
+    def _select_formats(self, formats, selector):
+        return list(selector({
+            'formats': formats,
+            'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
+            'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)  # No formats with video
+                                   or all(f.get('acodec') == 'none' for f in formats)),  # OR, No formats with audio
+        }))
+
     def _default_format_spec(self, info_dict, download=True):
+        download = download and not self.params.get('simulate')
+        prefer_best = download and (
+            self.params['outtmpl']['default'] == '-'
+            or info_dict.get('is_live') and not self.params.get('live_from_start'))
 
         def can_merge():
             merger = FFmpegMergerPP(self)
             return merger.available and merger.can_merge()
 
-        prefer_best = (
-            not self.params.get('simulate')
-            and download
-            and (
-                not can_merge()
-                or info_dict.get('is_live') and not self.params.get('live_from_start')
-                or self.params['outtmpl']['default'] == '-'))
-        compat = (
-            prefer_best
-            or self.params.get('allow_multiple_audio_streams', False)
-            or 'format-spec' in self.params['compat_opts'])
-
-        return (
-            'best/bestvideo+bestaudio' if prefer_best
-            else 'bestvideo*+bestaudio/best' if not compat
-            else 'bestvideo+bestaudio/best')
+        if not prefer_best and download and not can_merge():
+            prefer_best = True
+            formats = self._get_formats(info_dict)
+            evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
+            if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
+                self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
+                                    'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
+
+        compat = (self.params.get('allow_multiple_audio_streams')
+                  or 'format-spec' in self.params['compat_opts'])
+
+        return ('best/bestvideo+bestaudio' if prefer_best
+                else 'bestvideo+bestaudio/best' if compat
+                else 'bestvideo*+bestaudio/best')
 
     def build_format_selector(self, format_spec):
         def syntax_error(note, start):
@@ -2933,12 +2949,7 @@ def is_wellformed(f):
         self.write_debug(f'Default format spec: {req_format}')
         format_selector = self.build_format_selector(req_format)
 
-        formats_to_download = list(format_selector({
-            'formats': formats,
-            'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
-            'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)  # No formats with video
-                                   or all(f.get('acodec') == 'none' for f in formats)),  # OR, No formats with audio
-        }))
+        formats_to_download = self._select_formats(formats, format_selector)
 
         if interactive_format_selection and not formats_to_download:
             self.report_error('Requested format is not available', tb=False, is_error=False)
             continue


@@ -387,7 +387,11 @@
     ComedyCentralIE,
     ComedyCentralTVIE,
 )
-from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
+from .commonmistakes import (
+    BlobIE,
+    CommonMistakesIE,
+    UnicodeBOMIE,
+)
 from .commonprotocols import (
     MmsIE,
     RtmpIE,


@@ -105,7 +105,7 @@ def _real_extract(self, url):
         video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
         webpage = self._download_webpage(url, video_id)
         event_data = traverse_obj(
-            self._search_nextjs_data(webpage, video_id, default='{}'),
+            self._search_nextjs_data(webpage, video_id, default={}),
             ('props', 'pageProps', 'eventCMSData', {
                 'title': ('event_name', {str}),
                 'thumbnail': ('event_thumbnail_image', {url_or_none}),


@@ -93,11 +93,11 @@ def extract_formats(self, play_info):
 
         return formats
 
-    def _download_playinfo(self, video_id, cid):
+    def _download_playinfo(self, video_id, cid, headers=None):
         return self._download_json(
             'https://api.bilibili.com/x/player/playurl', video_id,
             query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
-            note=f'Downloading video formats for cid {cid}')['data']
+            note=f'Downloading video formats for cid {cid}', headers=headers)['data']
 
     def json2srt(self, json_data):
         srt_data = ''
@@ -493,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE):
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage, urlh = self._download_webpage_handle(url, video_id)
+        headers = self.geo_verification_headers()
+        webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers)
         if not self._match_valid_url(urlh.url):
             return self.url_result(urlh.url)
@@ -531,7 +532,7 @@ def _real_extract(self, url):
             self._download_json(
                 'https://api.bilibili.com/x/player/pagelist', video_id,
                 fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
-                note='Extracting videos in anthology'),
+                note='Extracting videos in anthology', headers=headers),
             'data', expected_type=list) or []
         is_anthology = len(page_list_json) > 1
@@ -552,7 +553,7 @@ def _real_extract(self, url):
 
         festival_info = {}
         if is_festival:
-            play_info = self._download_playinfo(video_id, cid)
+            play_info = self._download_playinfo(video_id, cid, headers=headers)
 
             festival_info = traverse_obj(initial_state, {
                 'uploader': ('videoInfo', 'upName'),
@@ -666,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
     def _real_extract(self, url):
         episode_id = self._match_id(url)
-        webpage = self._download_webpage(url, episode_id)
+        headers = self.geo_verification_headers()
+        webpage = self._download_webpage(url, episode_id, headers=headers)
 
         if '您所在的地区无法观看本片' in webpage:
             raise GeoRestrictedError('This video is restricted')
         elif '正在观看预览,大会员免费看全片' in webpage:
             self.raise_login_required('This video is for premium members only')
 
-        headers = {'Referer': url, **self.geo_verification_headers()}
+        headers['Referer'] = url
         play_info = self._download_json(
             'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
             'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
@@ -724,7 +726,7 @@ def _real_extract(self, url):
             'duration': float_or_none(play_info.get('timelength'), scale=1000),
             'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
             '__post_extractor': self.extract_comments(aid),
-            'http_headers': headers,
+            'http_headers': {'Referer': url},
         }
@@ -1049,9 +1051,10 @@ def fetch_page(page_idx):
                 raise ExtractorError(
                     'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
             raise
-        if response['code'] == -401:
+        if response['code'] in (-352, -401):
             raise ExtractorError(
-                'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
+                f'Request is blocked by server ({-response["code"]}), '
+                'please add cookies, wait and try later.', expected=True)
         return response['data']
 
     def get_metadata(page_data):


@@ -1,7 +1,11 @@
+import json
+import urllib.parse
+
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..utils import (
     ExtractorError,
+    bug_reports_message,
     int_or_none,
     qualities,
     str_or_none,
@@ -162,9 +166,19 @@ def _extract_formats(self, player_urls, video_id):
     def _real_extract(self, url):
         user, post_id = self._match_valid_url(url).group('user', 'post_id')
 
+        auth_headers = {}
+        auth_cookie = self._get_cookies('https://boosty.to/').get('auth')
+        if auth_cookie is not None:
+            try:
+                auth_data = json.loads(urllib.parse.unquote(auth_cookie.value))
+                auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}'
+            except (json.JSONDecodeError, KeyError):
+                self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}')
+
         post = self._download_json(
             f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id,
-            note='Downloading post data', errnote='Unable to download post data')
+            note='Downloading post data', errnote='Unable to download post data', headers=auth_headers)
 
         post_title = post.get('title')
         if not post_title:
@@ -202,7 +216,9 @@ def _real_extract(self, url):
                     'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
                 }, get_all=False)})
 
-        if not entries:
+        if not entries and not post.get('hasAccess'):
+            self.raise_login_required('This post requires a subscription', metadata_available=True)
+        elif not entries:
             raise ExtractorError('No videos found', expected=True)
 
         if len(entries) == 1:
             return entries[0]


@@ -40,7 +40,7 @@ class CanalAlphaIE(InfoExtractor):
             'id': '24484',
             'ext': 'mp4',
             'title': 'Ces innovations qui veulent rendre l’agriculture plus durable',
-            'description': 'md5:3de3f151180684621e85be7c10e4e613',
+            'description': 'md5:85d594a3b5dc6ccfc4a85aba6e73b129',
             'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg',
             'upload_date': '20211026',
             'duration': 360,
@@ -58,14 +58,25 @@ class CanalAlphaIE(InfoExtractor):
             'duration': 360,
         },
         'params': {'skip_download': True}
+    }, {
+        'url': 'https://www.canalalpha.ch/play/le-journal/topic/33500/encore-des-mesures-deconomie-dans-le-jura',
+        'info_dict': {
+            'id': '33500',
+            'ext': 'mp4',
+            'title': 'Encore des mesures d\'économie dans le Jura',
+            'description': 'md5:938b5b556592f2d1b9ab150268082a80',
+            'thumbnail': 'https://static.canalalpha.ch/poster/news/news_46665.jpg',
+            'upload_date': '20240411',
+            'duration': 105,
+        },
     }]
 
     def _real_extract(self, url):
-        id = self._match_id(url)
-        webpage = self._download_webpage(url, id)
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
         data_json = self._parse_json(self._search_regex(
             r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;',
-            webpage, 'data_json'), id)['1']['data']['data']
+            webpage, 'data_json'), video_id)['1']['data']['data']
         manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {}
         subtitles = {}
         formats = [{
@@ -75,15 +86,17 @@ def _real_extract(self, url):
             'height': try_get(video, lambda x: x['res']['height'], expected_type=int),
         } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')]
         if manifests.get('hls'):
-            m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id)
-            formats.extend(m3u8_frmts)
-            subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                manifests['hls'], video_id, m3u8_id='hls', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
         if manifests.get('dash'):
-            dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'])
-            formats.extend(dash_frmts)
-            subtitles = self._merge_subtitles(subtitles, dash_subs)
+            fmts, subs = self._extract_mpd_formats_and_subtitles(
+                manifests['dash'], video_id, mpd_id='dash', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
         return {
-            'id': id,
+            'id': video_id,
             'title': data_json.get('title').strip(),
             'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))),
             'thumbnail': data_json.get('poster'),


@@ -151,7 +151,7 @@ def _real_extract(self, url):
 
 class CBCPlayerIE(InfoExtractor):
     IE_NAME = 'cbc.ca:player'
-    _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
+    _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
     _TESTS = [{
         'url': 'http://www.cbc.ca/player/play/2683190193',
         'md5': '64d25f841ddf4ddb28a235338af32e2c',
@@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor):
             'location': 'Canada',
             'media_type': 'Full Program',
         },
+    }, {
+        'url': 'https://www.cbc.ca/player/play/video/1.7194274',
+        'md5': '188b96cf6bdcb2540e178a6caa957128',
+        'info_dict': {
+            'id': '2334524995812',
+            'ext': 'mp4',
+            'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
+            'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
+            'timestamp': 1714788791,
+            'duration': 77.678,
+            'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
+            'uploader': 'CBCC-NEW',
+            'chapters': 'count:0',
+            'upload_date': '20240504',
+            'categories': 'count:3',
+            'series': 'The National',
+            'tags': 'count:15',
+            'creators': ['encoder'],
+            'location': 'Canada',
+            'media_type': 'Excerpt',
+        },
     }, {
         'url': 'cbcplayer:1.7159484',
         'only_matching': True,


@@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True):
             traverse_json_ld(json_ld)
         return filter_dict(info)
 
-    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
-        return self._parse_json(
-            self._search_regex(
-                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
-                webpage, 'next.js data', fatal=fatal, **kw),
-            video_id, transform_source=transform_source, fatal=fatal)
+    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
+        if default == '{}':
+            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
+            default = {}
+        if default is not NO_DEFAULT:
+            fatal = False
+
+        return self._search_json(
+            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
+            video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
 
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""


@@ -40,3 +40,19 @@ def _real_extract(self, url):
             'Your URL starts with a Byte Order Mark (BOM). '
             'Removing the BOM and looking for "%s" ...' % real_url)
         return self.url_result(real_url)
+
+
+class BlobIE(InfoExtractor):
+    IE_DESC = False
+    _VALID_URL = r'blob:'
+
+    _TESTS = [{
+        'url': 'blob:https://www.youtube.com/4eb3d090-a761-46e6-8083-c32016a36e3b',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        raise ExtractorError(
+            'You\'ve asked yt-dlp to download a blob URL. '
+            'A blob URL exists only locally in your browser. '
+            'It is not possible for yt-dlp to access it.', expected=True)


@@ -24,11 +24,15 @@ class CrunchyrollBaseIE(InfoExtractor):
     _BASE_URL = 'https://www.crunchyroll.com'
     _API_BASE = 'https://api.crunchyroll.com'
     _NETRC_MACHINE = 'crunchyroll'
+    _REFRESH_TOKEN = None
     _AUTH_HEADERS = None
+    _AUTH_EXPIRY = None
     _API_ENDPOINT = None
-    _BASIC_AUTH = None
+    _BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join((
+        't-kdgp2h8c3jub8fn0fq',
+        'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan',
+    )).encode()).decode()
     _IS_PREMIUM = None
-    _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
     _LOCALE_LOOKUP = {
         'ar': 'ar-SA',
         'de': 'de-DE',
@@ -43,69 +47,78 @@ class CrunchyrollBaseIE(InfoExtractor):
         'hi': 'hi-IN',
     }
 
-    @property
-    def is_logged_in(self):
-        return bool(self._get_cookies(self._BASE_URL).get('etp_rt'))
+    def _set_auth_info(self, response):
+        CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
+        CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']}
+        CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
+
+    def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
+        try:
+            return self._download_json(
+                f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
+                headers=headers, data=urlencode_postdata(data), impersonate=True)
+        except ExtractorError as error:
+            if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
+                raise
+            if target := error.cause.response.extensions.get('impersonate'):
+                raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"')
+            raise ExtractorError(
+                'Request blocked by Cloudflare. '
+                'Install the required impersonation dependency if possible, '
+                'or else navigate to Crunchyroll in your browser, '
+                'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
+                'and your browser\'s User-Agent (with --user-agent)', expected=True)
 
     def _perform_login(self, username, password):
-        if self.is_logged_in:
+        if not CrunchyrollBaseIE._REFRESH_TOKEN:
+            CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username)
+        if CrunchyrollBaseIE._REFRESH_TOKEN:
             return
 
-        upsell_response = self._download_json(
-            f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
-            query={
-                'sess_id': 1,
-                'device_id': 'whatvalueshouldbeforweb',
-                'device_type': 'com.crunchyroll.static',
-                'access_token': 'giKq5eY27ny3cqz',
-                'referer': f'{self._BASE_URL}/welcome/login'
-            })
-        if upsell_response['code'] != 'ok':
-            raise ExtractorError('Could not get session id')
-        session_id = upsell_response['data']['session_id']
-
-        login_response = self._download_json(
-            f'{self._API_BASE}/login.1.json', None, 'Logging in',
-            data=urlencode_postdata({
-                'account': username,
-                'password': password,
-                'session_id': session_id
-            }))
-        if login_response['code'] != 'ok':
-            raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
-        if not self.is_logged_in:
-            raise ExtractorError('Login succeeded but did not set etp_rt cookie')
-
-    def _update_auth(self):
-        if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds():
-            return
-
-        if not CrunchyrollBaseIE._BASIC_AUTH:
-            cx_api_param = self._CLIENT_ID[self.is_logged_in]
-            self.write_debug(f'Using cxApiParam={cx_api_param}')
-            CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
-
-        auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
-        if self.is_logged_in:
-            grant_type = 'etp_rt_cookie'
-        else:
-            grant_type = 'client_id'
-            auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
         try:
-            auth_response = self._download_json(
-                f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
-                headers=auth_headers, data=f'grant_type={grant_type}'.encode())
+            login_response = self._request_token(
+                headers={'Authorization': self._BASIC_AUTH}, data={
+                    'username': username,
+                    'password': password,
+                    'grant_type': 'password',
+                    'scope': 'offline_access',
+                }, note='Logging in', errnote='Failed to log in')
         except ExtractorError as error:
-            if isinstance(error.cause, HTTPError) and error.cause.status == 403:
-                raise ExtractorError(
-                    'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
-                    'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
-                    'and your browser\'s User-Agent (with --user-agent)', expected=True)
+            if isinstance(error.cause, HTTPError) and error.cause.status == 401:
+                raise ExtractorError('Invalid username and/or password', expected=True)
             raise
 
-        CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
-        CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
-        CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
+        CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token']
+        self.cache.store(self._NETRC_MACHINE, username, CrunchyrollBaseIE._REFRESH_TOKEN)
+        self._set_auth_info(login_response)
+
+    def _update_auth(self):
+        if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds():
+            return
+
+        auth_headers = {'Authorization': self._BASIC_AUTH}
+        if CrunchyrollBaseIE._REFRESH_TOKEN:
+            data = {
+                'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN,
+                'grant_type': 'refresh_token',
+                'scope': 'offline_access',
+            }
+        else:
+            data = {'grant_type': 'client_id'}
+            auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
+        try:
+            auth_response = self._request_token(auth_headers, data)
+        except ExtractorError as error:
+            username, password = self._get_login_info()
+            if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400:
+                raise
+            self.to_screen('Refresh token has expired. Re-logging in')
+            CrunchyrollBaseIE._REFRESH_TOKEN = None
+            self.cache.store(self._NETRC_MACHINE, username, None)
+            self._perform_login(username, password)
+            return
+
+        self._set_auth_info(auth_response)
 
     def _locale_from_language(self, language):
         config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True)
@@ -168,7 +181,8 @@ def _extract_stream(self, identifier, display_id=None):
         self._update_auth()
         stream_response = self._download_json(
             f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
-            display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
+            display_id, note='Downloading stream info', errnote='Failed to download stream info',
+            headers=CrunchyrollBaseIE._AUTH_HEADERS)
 
         available_formats = {'': ('', '', stream_response['url'])}
         for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
@@ -383,11 +397,12 @@ def entries():
             if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
                 message = f'This {object_type} is for premium members only'
-                if self.is_logged_in:
-                    raise ExtractorError(message, expected=True)
-                self.raise_login_required(message)
-
-            result['formats'], result['subtitles'] = self._extract_stream(internal_id)
+                if CrunchyrollBaseIE._REFRESH_TOKEN:
+                    self.raise_no_formats(message, expected=True, video_id=internal_id)
+                else:
+                    self.raise_login_required(message, method='password', metadata_available=True)
+            else:
+                result['formats'], result['subtitles'] = self._extract_stream(internal_id)
 
             result['chapters'] = self._extract_chapters(internal_id)
@@ -573,14 +588,16 @@ def _real_extract(self, url):
         if not response:
             raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
 
+        result = self._transform_music_response(response)
+
         if not self._IS_PREMIUM and response.get('isPremiumOnly'):
             message = f'This {response.get("type") or "media"} is for premium members only'
-            if self.is_logged_in:
-                raise ExtractorError(message, expected=True)
-            self.raise_login_required(message)
-
-        result = self._transform_music_response(response)
-        result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
+            if CrunchyrollBaseIE._REFRESH_TOKEN:
+                self.raise_no_formats(message, expected=True, video_id=internal_id)
+            else:
+                self.raise_login_required(message, method='password', metadata_available=True)
+        else:
+            result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
 
         return result


@@ -560,7 +560,7 @@ def extract_from_jsmods_instances(js_data):
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
         def extract_dash_manifest(video, formats):
-            dash_manifest = video.get('dash_manifest')
+            dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str)
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
                     compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),


@@ -1,6 +1,12 @@
 from .common import InfoExtractor
 from ..networking.exceptions import HTTPError
-from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none
+from ..utils import (
+    ExtractorError,
+    UserNotLive,
+    int_or_none,
+    str_or_none,
+    url_or_none,
+)
 from ..utils.traversal import traverse_obj
@@ -9,17 +15,20 @@ class MixchIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
 
     _TESTS = [{
-        'url': 'https://mixch.tv/u/16236849/live',
+        'url': 'https://mixch.tv/u/16943797/live',
         'skip': 'don\'t know if this live persists',
         'info_dict': {
-            'id': '16236849',
-            'title': '24配信シェア⭕投票🙏💦',
-            'comment_count': 13145,
-            'view_count': 28348,
-            'timestamp': 1636189377,
-            'uploader': '🦥伊咲👶🏻#フレアワ',
-            'uploader_id': '16236849',
-        }
+            'id': '16943797',
+            'ext': 'mp4',
+            'title': '#EntView #カリナ #セブチ 2024-05-05 06:58',
+            'comment_count': int,
+            'view_count': int,
+            'timestamp': 1714726805,
+            'uploader': 'Ent.View K-news🎶💕',
+            'uploader_id': '16943797',
+            'live_status': 'is_live',
+            'upload_date': '20240503',
+        },
     }, {
         'url': 'https://mixch.tv/u/16137876/live',
         'only_matching': True,
@@ -48,8 +57,20 @@ def _real_extract(self, url):
                 'protocol': 'm3u8',
             }],
             'is_live': True,
+            '__post_extractor': self.extract_comments(video_id),
         }
 
+    def _get_comments(self, video_id):
+        yield from traverse_obj(self._download_json(
+            f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id,
+            note='Downloading comments', errnote='Failed to download comments'), (..., {
+                'author': ('name', {str}),
+                'author_id': ('user_id', {str_or_none}),
+                'id': ('message_id', {str}, {lambda x: x or None}),
+                'text': ('body', {str}),
+                'timestamp': ('created', {int}),
+            }))
+
 
 class MixchArchiveIE(InfoExtractor):
     IE_NAME = 'mixch:archive'


@@ -561,7 +561,8 @@ def _real_extract(self, url):
             'timestamp': ('createTime', {self.kilo_or_none}),
         })
 
-        if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']):
+        if not self._yes_playlist(
+                info['songs'] and program_id, info['mainSong']['id'], playlist_label='program', video_label='song'):
             formats = self.extract_formats(info['mainSong'])
 
             return {


@@ -1,8 +1,8 @@
 import itertools
+import urllib.parse
 
 from .common import InfoExtractor
 from .vimeo import VimeoIE
-from ..compat import compat_urllib_parse_unquote
 from ..networking.exceptions import HTTPError
 from ..utils import (
     KNOWN_EXTENSIONS,
@@ -14,7 +14,6 @@
     parse_iso8601,
     str_or_none,
     traverse_obj,
-    try_get,
     url_or_none,
     urljoin,
 )
@@ -199,7 +198,50 @@ class PatreonIE(PatreonBaseIE):
             'channel_id': '2147162',
             'uploader_url': 'https://www.patreon.com/yaboyroshi',
         },
+    }, {
+        # NSFW vimeo embed URL
+        'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599',
+        'info_dict': {
+            'id': '902250943',
+            'ext': 'mp4',
+            'title': '❤️(4K) Spiderman Girl Yeonhwa’s Gift ❤️(4K) 스파이더맨걸 연화의 선물',
+            'description': '❤️(4K) Spiderman Girl Yeonhwa’s Gift \n❤️(4K) 스파이더맨걸 연화의 선물',
+            'uploader': 'Npickyeonhwa',
+            'uploader_id': '90574422',
+            'uploader_url': 'https://www.patreon.com/Yeonhwa726',
+            'channel_id': '10237902',
+            'channel_url': 'https://www.patreon.com/Yeonhwa726',
+            'duration': 70,
+            'timestamp': 1705150153,
+            'upload_date': '20240113',
+            'comment_count': int,
+            'like_count': int,
+            'thumbnail': r're:^https?://.+',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        # multiple attachments/embeds
+        'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
+        'playlist_count': 3,
+        'info_dict': {
+            'id': '100601977',
+            'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
+            'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
+            'uploader': 'Bradley Hall',
+            'uploader_id': '24401883',
+            'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
+            'channel_id': '3193932',
+            'channel_url': 'https://www.patreon.com/bradleyhallguitar',
+            'channel_follower_count': int,
+            'timestamp': 1710777855,
+            'upload_date': '20240318',
+            'like_count': int,
+            'comment_count': int,
+            'thumbnail': r're:^https?://.+',
+        },
+        'skip': 'Patron-only content',
     }]
+    _RETURN_TYPE = 'video'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -214,95 +256,108 @@ def _real_extract(self, url):
             'include': 'audio,user,user_defined_tags,campaign,attachments_media',
         })
         attributes = post['data']['attributes']
-        title = attributes['title'].strip()
-        image = attributes.get('image') or {}
-        info = {
-            'id': video_id,
-            'title': title,
-            'description': clean_html(attributes.get('content')),
-            'thumbnail': image.get('large_url') or image.get('url'),
-            'timestamp': parse_iso8601(attributes.get('published_at')),
-            'like_count': int_or_none(attributes.get('like_count')),
-            'comment_count': int_or_none(attributes.get('comment_count')),
-        }
-        can_view_post = traverse_obj(attributes, 'current_user_can_view')
-        if can_view_post and info['comment_count']:
-            info['__post_extractor'] = self.extract_comments(video_id)
-
-        for i in post.get('included', []):
-            i_type = i.get('type')
-            if i_type == 'media':
-                media_attributes = i.get('attributes') or {}
-                download_url = media_attributes.get('download_url')
+        info = traverse_obj(attributes, {
+            'title': ('title', {str.strip}),
+            'description': ('content', {clean_html}),
+            'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
+            'timestamp': ('published_at', {parse_iso8601}),
+            'like_count': ('like_count', {int_or_none}),
+            'comment_count': ('comment_count', {int_or_none}),
+        })
+
+        entries = []
+        idx = 0
+        for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
+            include_type = include['type']
+            if include_type == 'media':
+                media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
+                download_url = url_or_none(media_attributes.get('download_url'))
                 ext = mimetype2ext(media_attributes.get('mimetype'))
 
                 # if size_bytes is None, this media file is likely unavailable
                 # See: https://github.com/yt-dlp/yt-dlp/issues/4608
                 size_bytes = int_or_none(media_attributes.get('size_bytes'))
                 if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
-                    # XXX: what happens if there are multiple attachments?
-                    return {
-                        **info,
+                    idx += 1
+                    entries.append({
+                        'id': f'{video_id}-{idx}',
                         'ext': ext,
                         'filesize': size_bytes,
                         'url': download_url,
-                    }
-            elif i_type == 'user':
-                user_attributes = i.get('attributes')
-                if user_attributes:
-                    info.update({
-                        'uploader': user_attributes.get('full_name'),
-                        'uploader_id': str_or_none(i.get('id')),
-                        'uploader_url': user_attributes.get('url'),
                     })
 
-            elif i_type == 'post_tag':
-                info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value')))
+            elif include_type == 'user':
+                info.update(traverse_obj(include, {
+                    'uploader': ('attributes', 'full_name', {str}),
+                    'uploader_id': ('id', {str_or_none}),
+                    'uploader_url': ('attributes', 'url', {url_or_none}),
+                }))
 
-            elif i_type == 'campaign':
-                info.update({
-                    'channel': traverse_obj(i, ('attributes', 'title')),
-                    'channel_id': str_or_none(i.get('id')),
-                    'channel_url': traverse_obj(i, ('attributes', 'url')),
-                    'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))),
-                })
+            elif include_type == 'post_tag':
+                if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
+                    info.setdefault('tags', []).append(post_tag)
+
+            elif include_type == 'campaign':
+                info.update(traverse_obj(include, {
+                    'channel': ('attributes', 'title', {str}),
+                    'channel_id': ('id', {str_or_none}),
+                    'channel_url': ('attributes', 'url', {url_or_none}),
+                    'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
+                }))
 
         # handle Vimeo embeds
-        if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo':
-            embed_html = try_get(attributes, lambda x: x['embed']['html'])
-            v_url = url_or_none(compat_urllib_parse_unquote(
-                self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
-            if v_url:
-                v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com')
-                if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False):
-                    return self.url_result(v_url, VimeoIE, url_transparent=True, **info)
+        if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
+            v_url = urllib.parse.unquote(self._html_search_regex(
+                r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
+                traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
+            if url_or_none(v_url) and self._request_webpage(
+                    v_url, video_id, 'Checking Vimeo embed URL',
+                    headers={'Referer': 'https://patreon.com/'},
+                    fatal=False, errnote=False):
+                entries.append(self.url_result(
+                    VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
+                    VimeoIE, url_transparent=True))
 
-        embed_url = try_get(attributes, lambda x: x['embed']['url'])
+        embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
         if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
-            return self.url_result(embed_url, **info)
+            entries.append(self.url_result(embed_url))
 
-        post_file = traverse_obj(attributes, 'post_file')
+        post_file = traverse_obj(attributes, ('post_file', {dict}))
         if post_file:
             name = post_file.get('name')
             ext = determine_ext(name)
             if ext in KNOWN_EXTENSIONS:
-                return {
-                    **info,
+                entries.append({
+                    'id': video_id,
                     'ext': ext,
                     'url': post_file['url'],
-                }
+                })
             elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
-                return {
-                    **info,
+                entries.append({
+                    'id': video_id,
                     'formats': formats,
                     'subtitles': subtitles,
-                }
+                })
 
-        if can_view_post is False:
+        can_view_post = traverse_obj(attributes, 'current_user_can_view')
+        comments = None
+        if can_view_post and info.get('comment_count'):
+            comments = self.extract_comments(video_id)
+
+        if not entries and can_view_post is False:
             self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
-        else:
+        elif not entries:
             self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
+        elif len(entries) == 1:
+            info.update(entries[0])
+        else:
+            for entry in entries:
+                entry.update(info)
+            return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
 
+        info['id'] = video_id
+        info['__post_extractor'] = comments
         return info
 
     def _get_comments(self, post_id):


@ -361,7 +361,7 @@ def extract_count(key):
'like_count': extract_count('favoritings') or extract_count('likes'), 'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'), 'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'), 'repost_count': extract_count('reposts'),
'genre': info.get('genre'), 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
'formats': formats if not extract_flat else None 'formats': formats if not extract_flat else None
} }
@@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE):
     _TESTS = [
         {
             'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
-            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
+            'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
             'info_dict': {
                 'id': '62986583',
-                'ext': 'mp3',
+                'ext': 'opus',
                 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
                 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
                 'uploader': 'E.T. ExTerrestrial Music',
@@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
+                'uploader_url': 'https://soundcloud.com/ethmusic',
+                'genres': [],
             }
         },
         # geo-restricted
@@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE):
             'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
             'info_dict': {
                 'id': '47127627',
-                'ext': 'mp3',
+                'ext': 'opus',
                 'title': 'Goldrushed',
                 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                 'uploader': 'The Royal Concept',
@@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'uploader_url': 'https://soundcloud.com/the-concept-band',
+                'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
+                'genres': ['Alternative'],
             },
         },
         # private link
@@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'uploader_url': 'https://soundcloud.com/jaimemf',
+                'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
+                'genres': ['youtubedl'],
             },
         },
         # private link (alt format)
@@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'uploader_url': 'https://soundcloud.com/jaimemf',
+                'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
+                'genres': ['youtubedl'],
             },
         },
         # downloadable song
@@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE):
             'info_dict': {
                 'id': '343609555',
                 'ext': 'wav',
+                'title': 'The Following',
+                'description': '',
+                'uploader': '80M',
+                'uploader_id': '312384765',
+                'uploader_url': 'https://soundcloud.com/the80m',
+                'upload_date': '20170922',
+                'timestamp': 1506120436,
+                'duration': 397.228,
+                'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
+                'license': 'all-rights-reserved',
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+                'view_count': int,
+                'genres': ['Dance & EDM'],
             },
         },
         # private link, downloadable format
@@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
+                'uploader_url': 'https://soundcloud.com/oriuplift',
+                'genres': ['Trance'],
             },
         },
         # no album art, use avatar pic for thumbnail
@@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'uploader_url': 'https://soundcloud.com/garyvee',
+                'genres': [],
             },
             'params': {
                 'skip_download': True,
@@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE):
         },
         {
             'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
-            'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
+            'md5': '8227c3473a4264df6b02ad7e5b7527ac',
             'info_dict': {
                 'id': '583011102',
-                'ext': 'mp3',
+                'ext': 'opus',
                 'title': 'Mezzo Valzer',
-                'description': 'md5:4138d582f81866a530317bae316e8b61',
-                'uploader': 'Micronie',
+                'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
+                'uploader': 'Giovanni Sarani',
                 'uploader_id': '3352531',
                 'timestamp': 1551394171,
                 'upload_date': '20190228',
@@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE):
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
+                'genres': ['Piano'],
+                'uploader_url': 'https://soundcloud.com/giovannisarani',
             },
         },
         {
@@ -174,7 +174,7 @@ class TheaterComplexTownBaseIE(StacommuBaseIE):
 class TheaterComplexTownVODIE(TheaterComplexTownBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?videos/episodes/(?P<id>\w+)'
+    _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?videos/episodes/(?P<id>\w+)'
     IE_NAME = 'theatercomplextown:vod'
     _TESTS = [{
         'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78',
@@ -195,6 +195,9 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE):
     }, {
         'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y',
         'only_matching': True,
+    }, {
+        'url': 'https://www.theater-complex.town/ja/videos/episodes/hoxqidYNoAn7bP92DN6p78',
+        'only_matching': True,
     }]

     _API_PATH = 'videoEpisodes'
@@ -204,7 +207,7 @@ def _real_extract(self, url):
 class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?ppv/(?P<id>\w+)'
+    _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?ppv/(?P<id>\w+)'
     IE_NAME = 'theatercomplextown:ppv'
     _TESTS = [{
         'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen',
@@ -223,6 +226,9 @@ class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE):
     }, {
         'url': 'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen',
         'only_matching': True,
+    }, {
+        'url': 'https://www.theater-complex.town/ja/ppv/qwUVmLmGEiZ3ZW6it9uGys',
+        'only_matching': True,
     }]

     _API_PATH = 'events'
@@ -41,7 +41,7 @@ def _real_extract(self, url):
         ptype, video_id = self._match_valid_url(url).groups()
         webpage = self._download_webpage(url, video_id, fatal=False) or ''
-        props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
+        props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
         player_api_cache = try_get(
             props, lambda x: x['initialReduxState']['playerApiCache']) or {}
@@ -776,7 +776,7 @@ def _real_extract(self, url):
             status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
             video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
-        elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
+        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
             self.write_debug('Found next.js data')
             status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
             video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
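This hunk and the previous one fix the same latent bug: `_search_nextjs_data` returns parsed JSON, and it now hands back its `default` as-is, so the fallback must be a parsed value too. The old string `'{}'` is truthy, which wrongly satisfies the `elif next_data := ...` walrus test, and a string has no `.get()`. A contrived sketch of the truthiness half, with a stand-in for the real helper:

def search_nextjs_data(webpage, default):  # stand-in: pretend nothing was found
    return default

print(bool(search_nextjs_data('', default={})))    # False: elif branch skipped
print(bool(search_nextjs_data('', default='{}')))  # True: wrong branch taken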
@@ -1,10 +1,9 @@
+import functools
+import re
+
 from .common import InfoExtractor
-from ..utils import (
-    float_or_none,
-    int_or_none,
-    smuggle_url,
-    strip_or_none,
-)
+from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none
+from ..utils.traversal import traverse_obj


 class TVAIE(InfoExtractor):
@@ -49,11 +48,20 @@ class QubIE(InfoExtractor):
         'info_dict': {
             'id': '6084352463001',
             'ext': 'mp4',
-            'title': 'Épisode 01',
+            'title': 'Ép 01. Mon dernier jour',
             'uploader_id': '5481942443001',
             'upload_date': '20190907',
             'timestamp': 1567899756,
             'description': 'md5:9c0d7fbb90939420c651fd977df90145',
+            'thumbnail': r're:https://.+\.jpg',
+            'episode': 'Ép 01. Mon dernier jour',
+            'episode_number': 1,
+            'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'],
+            'duration': 2625.963,
+            'season': 'Season 1',
+            'season_number': 1,
+            'series': 'Alerte Amber',
+            'channel': 'TVA',
         },
     }, {
         'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
@@ -64,22 +72,24 @@ class QubIE(InfoExtractor):
     def _real_extract(self, url):
         entity_id = self._match_id(url)
-        entity = self._download_json(
-            'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities',
-            entity_id, query={'id': entity_id})
+        webpage = self._download_webpage(url, entity_id)
+        entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData']
         video_id = entity['videoId']
         episode = strip_or_none(entity.get('name'))

         return {
             '_type': 'url_transparent',
+            'url': f'https://videos.tva.ca/details/_{video_id}',
+            'ie_key': TVAIE.ie_key(),
             'id': video_id,
             'title': episode,
-            # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'],
-            'url': 'https://videos.tva.ca/details/_' + video_id,
-            'description': entity.get('longDescription'),
-            'duration': float_or_none(entity.get('durationMillis'), 1000),
             'episode': episode,
-            'episode_number': int_or_none(entity.get('episodeNumber')),
-            # 'ie_key': 'BrightcoveNew',
-            'ie_key': TVAIE.ie_key(),
+            **traverse_obj(entity, {
+                'description': ('longDescription', {str}),
+                'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}),
+                'channel': ('knownEntities', 'channel', 'name', {str}),
+                'series': ('knownEntities', 'videoShow', 'name', {str}),
+                'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+            }),
         }
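The `season_number` path above works because `traverse_obj` can traverse `re.Match` objects: an integer key selects the corresponding capture group. A standalone sketch of that behaviour with an invented slug:

import re
from yt_dlp.utils import int_or_none
from yt_dlp.utils.traversal import traverse_obj

entity = {'slug': '/alerte-amber/saison-1/episode-01'}  # made-up slug
print(traverse_obj(entity, (
    'slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none})))
# -> 1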
@@ -451,6 +451,7 @@ def _real_extract(self, url):
             info_page, 'view count', default=None))

         formats = []
+        subtitles = {}
         for format_id, format_url in data.items():
             format_url = url_or_none(format_url)
             if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
@@ -462,12 +463,21 @@ def _real_extract(self, url):
                 formats.append({
                     'format_id': format_id,
                     'url': format_url,
+                    'ext': 'mp4',
+                    'source_preference': 1,
                     'height': height,
                 })
             elif format_id == 'hls':
-                formats.extend(self._extract_m3u8_formats(
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                     format_url, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id=format_id, fatal=False, live=is_live))
+                    m3u8_id=format_id, fatal=False, live=is_live)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            elif format_id.startswith('dash_'):
+                fmts, subs = self._extract_mpd_formats_and_subtitles(
+                    format_url, video_id, mpd_id=format_id, fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
             elif format_id == 'rtmp':
                 formats.append({
                     'format_id': format_id,
@@ -475,7 +485,6 @@ def _real_extract(self, url):
                     'ext': 'flv',
                 })

-        subtitles = {}
         for sub in data.get('subs') or {}:
             subtitles.setdefault(sub.get('lang', 'en'), []).append({
                 'ext': sub.get('title', '.srt').split('.')[-1],
@@ -496,6 +505,7 @@ def _real_extract(self, url):
             'comment_count': int_or_none(mv_data.get('commcount')),
             'is_live': is_live,
             'subtitles': subtitles,
+            '_format_sort_fields': ('res', 'source'),
         }
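With `_format_sort_fields = ('res', 'source')`, resolution is compared first and `source_preference` only breaks ties, so the progressive MP4s (now tagged `source_preference: 1`) win over HLS/DASH variants of the same height. A toy model of that ordering, not yt-dlp's actual sorter, where -1 stands in for the manifest formats' lower default preference:

formats = [  # invented formats for illustration
    {'format_id': 'hls-1080', 'height': 1080, 'source_preference': -1},
    {'format_id': 'url1080', 'height': 1080, 'source_preference': 1},
]
best = max(formats, key=lambda f: (f['height'], f['source_preference']))
print(best['format_id'])  # url1080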
@@ -12,6 +12,7 @@
     jwt_decode_hs256,
     traverse_obj,
     try_call,
+    url_basename,
     url_or_none,
     urlencode_postdata,
     variadic,
@@ -147,7 +148,7 @@ def _download_metadata(self, url, video_id, lang, props_keys):
         metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False)
         if not metadata:
             webpage = self._download_webpage(url, video_id)
-            nextjs_data = self._search_nextjs_data(webpage, video_id)
+            nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False)
             metadata = traverse_obj(nextjs_data, (
                 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {}
         return metadata
@@ -194,8 +195,7 @@ def _real_extract(self, url):
         return {
             'id': video_id,
-            'formats': self._get_formats(video_data, (
-                (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id),
+            'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id),
             **traverse_obj(metadata, {
                 'title': ('displayName', {str}),
                 'description': ('description', {str}),
@@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE):
         'params': {
             'skip_download': 'm3u8',
         },
+    }, {
+        'note': 'manifest provides live-a (partial) and live-b (full) streams',
+        'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g',
+        'only_matching': True,
     }]

     _API_PATH = 'events'
@@ -285,12 +289,16 @@ def _real_extract(self, url):
             video_data, decrypt = self._call_encrypted_api(
                 video_id, ':watchArchive', 'watch archive', data={'method': 1})
-            info['formats'] = self._get_formats(video_data, (
-                ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id)
+            # 'chromecastUrls' can be only partial videos, avoid
+            info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id)
             for f in info['formats']:
                 # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
                 if f.get('tbr'):
                     f['tbr'] = int(f['tbr'] / 2.5)
+                # prefer variants with the same basename as the master playlist to avoid partial streams
+                f['format_id'] = url_basename(f['url']).partition('.')[0]
+                if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]):
+                    f['preference'] = -10

         hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
         if hls_aes_key:
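The basename check relies on variant playlists sharing the master playlist's name as a prefix, per the 'live-a'/'live-b' note in the test above. A standalone sketch with invented URLs:

from yt_dlp.utils import url_basename

master = 'https://cdn.example.com/hls/live-b.m3u8'     # invented URLs
good = 'https://cdn.example.com/hls/live-b_1080p.m3u8'
bad = 'https://cdn.example.com/hls/live-a_1080p.m3u8'

prefix = url_basename(master).partition('.')[0]        # 'live-b'
print(url_basename(good).partition('.')[0].startswith(prefix))  # True
print(url_basename(bad).partition('.')[0].startswith(prefix))   # False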
@@ -259,15 +259,15 @@ def _real_extract(self, url):
             webpage = self._download_webpage(redirect, video_id, note='Redirecting')
         data_json = self._search_json(
             r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
-        serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)',
-                                         webpage, 'server state').replace('State', 'Settings')
+        serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state')
         uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
                                       webpage, 'uploader', default='<a>')
         uploader_name = extract_attributes(uploader).get('aria-label')
-        video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict)
-        stream_urls = try_get(video_json, lambda x: x['video']['streams'])
+        item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str}))
+        video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {}

         formats, subtitles = [], {}
-        for s_url in stream_urls:
+        for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})):
             ext = determine_ext(s_url)
             if ext == 'mpd':
                 fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')
@@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor):
             'id': '16290308',
             'age_limit': 18,
             'categories': [],
-            'description': 'md5:00ea70f642f431c379763c17c2f396bc',
+            'description': str,  # TODO: detect/remove SEO spam description in ytdl backport
             'display_id': 'tinderspecial-trailer1',
             'duration': 298.0,
             'ext': 'mp4',
             'upload_date': '20201123',
             'uploader': 'Ersties',
             'tags': [],
-            'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg',
-            'timestamp': 1606089600,
+            'thumbnail': r're:https://.+\.jpg',
+            'timestamp': 1606147564,
             'title': 'Tinder In Real Life',
             'view_count': int,
         }
@@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor):
     def _real_extract(self, url):
         video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
-        definitions = self._download_json(
-            f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
+        self._set_cookie('.youporn.com', 'age_verified', '1')
+        webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id)
+        definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions']

-        def get_format_data(data, f):
-            return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
+        def get_format_data(data, stream_type):
+            info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any))
+            if not info_url:
+                return []
+            return traverse_obj(
+                self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False),
+                lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl']))

         formats = []
         # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
@@ -123,10 +129,6 @@ def get_format_data(data, f):
                 f['height'] = height
             formats.append(f)

-        webpage = self._download_webpage(
-            'http://www.youporn.com/watch/%s' % video_id, display_id,
-            headers={'Cookie': 'age_verified=1'})
-
         title = self._html_search_regex(
             r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
             webpage, 'title', default=None) or self._og_search_title(
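The new `get_format_data` uses the `any` key, which collapses a branched `traverse_obj` match to its first surviving result instead of returning a list. A standalone sketch with made-up definitions in the shape of the playervars JSON:

from yt_dlp.utils import url_or_none
from yt_dlp.utils.traversal import traverse_obj

definitions = [  # made-up data
    {'format': 'mp4', 'videoUrl': 'https://example.com/mp4.json'},
    {'format': 'hls', 'videoUrl': 'https://example.com/hls.json'},
]
print(traverse_obj(definitions, (
    lambda _, v: v['format'] == 'hls', 'videoUrl', {url_or_none}, any)))
# -> https://example.com/hls.json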
@@ -132,6 +132,16 @@ def _check_extensions(self, extensions):
         extensions.pop('cookiejar', None)
         extensions.pop('timeout', None)

+    def send(self, request: Request) -> Response:
+        target = self._get_request_target(request)
+        try:
+            response = super().send(request)
+        except HTTPError as e:
+            e.response.extensions['impersonate'] = target
+            raise
+        response.extensions['impersonate'] = target
+        return response
+
     def _send(self, request: Request):
         max_redirects_exceeded = False
         session: curl_cffi.requests.Session = self._get_instance(
@@ -497,6 +497,7 @@ class Response(io.IOBase):
     @param headers: response headers.
     @param status: Response HTTP status code. Default is 200 OK.
     @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
+    @param extensions: Dictionary of handler-specific response extensions.
     """

     def __init__(
@@ -505,7 +506,9 @@ def __init__(
             url: str,
             headers: Mapping[str, str],
             status: int = 200,
-            reason: str = None):
+            reason: str = None,
+            extensions: dict = None
+    ):

         self.fp = fp
         self.headers = Message()
@@ -517,6 +520,7 @@ def __init__(
             self.reason = reason or HTTPStatus(status).phrase
         except ValueError:
             self.reason = None
+        self.extensions = extensions or {}

     def readable(self):
         return self.fp.readable()
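Together with the handler `send` wrapper above, this lets callers inspect which impersonation target actually served a response. A minimal construction-only sketch (no network involved; the target name is illustrative):

import io
from yt_dlp.networking.common import Response

resp = Response(
    io.BytesIO(b'ok'), url='https://example.com', headers={},
    extensions={'impersonate': 'chrome-110'})  # illustrative target name
print(resp.extensions.get('impersonate'))  # chrome-110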
@@ -69,6 +69,10 @@ def _get_variant_and_executable_path():
         # Ref: https://en.wikipedia.org/wiki/Uname#Examples
         if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'):
             machine = '_x86' if platform.architecture()[0][:2] == '32' else ''
+        # sys.executable returns a /tmp/ path for staticx builds (linux_static)
+        # Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information
+        if static_exe_path := os.getenv('STATICX_PROG_PATH'):
+            path = static_exe_path
         return f'{remove_end(sys.platform, "32")}{machine}_exe', path

     path = os.path.dirname(__file__)
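Background on the check: staticx bundles unpack and re-exec themselves from a temporary directory, so `sys.executable` points into `/tmp`; per the staticx docs referenced above, the original on-disk path is exposed via the `STATICX_PROG_PATH` environment variable. The same check in isolation:

import os
import sys

# prefer the real binary location when running under a staticx bundle
path = os.getenv('STATICX_PROG_PATH') or sys.executable
print(path)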
@@ -1638,16 +1638,14 @@ def get_filesystem_encoding():
     return encoding if encoding is not None else 'utf-8'

-_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'})
+_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})

 _CMD_QUOTE_TRANS = str.maketrans({
     # Keep quotes balanced by replacing them with `""` instead of `\\"`
     '"': '""',
-    # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`)
+    # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
     # `=` should be unique since variables containing `=` cannot be set using cmd
     '\n': '%=%',
-    # While we are only required to escape backslashes immediately before quotes,
-    # we instead escape all of 'em anyways to be consistent
-    '\\': '\\\\',
+    '\r': '%=%',
     # Use zero length variable replacement so `%` doesn't get expanded
     # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
     '%': '%%cd:~,%',
@@ -1656,19 +1654,14 @@ def get_filesystem_encoding():

 def shell_quote(args, *, shell=False):
     args = list(variadic(args))
-    if any(isinstance(item, bytes) for item in args):
-        deprecation_warning('Passing bytes to utils.shell_quote is deprecated')
-        encoding = get_filesystem_encoding()
-        for index, item in enumerate(args):
-            if isinstance(item, bytes):
-                args[index] = item.decode(encoding)

     if compat_os_name != 'nt':
         return shlex.join(args)

     trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
     return ' '.join(
-        s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""')
+        s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
+        else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
         for s in args)
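A quick sanity check of the resulting behaviour on POSIX, where `shell_quote` simply defers to `shlex.join` (on Windows the translation tables above are applied instead):

from yt_dlp.utils import shell_quote

print(shell_quote(['ffmpeg', '-i', 'in file.mp4', 'out.mkv']))
# ffmpeg -i 'in file.mp4' out.mkv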