diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 93668a7bf..aa11c6194 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -192,7 +192,7 @@ jobs: - name: Install Requirements run: | brew install coreutils - /usr/bin/python3 -m pip install -U --user pip Pyinstaller -r requirements.txt + /usr/bin/python3 -m pip install -U --user pip Pyinstaller==5.8 -r requirements.txt - name: Prepare run: | diff --git a/README.md b/README.md index de83e421f..c1f34235d 100644 --- a/README.md +++ b/README.md @@ -463,15 +463,11 @@ ## Geo-restriction: specified by --proxy (or none, if the option is not present) is used for the actual downloading - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (default) - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR notation + --xff VALUE How to fake X-Forwarded-For HTTP header to + try bypassing geographic restriction. One of + "default" (Only when known to be useful), + "never", a two-letter ISO 3166-2 country + code, or an IP block in CIDR notation ## Video Selection: -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items @@ -752,6 +748,7 @@ ## Internet Shortcut Options: ## Verbosity and Simulation Options: -q, --quiet Activate quiet mode. If used with --verbose, print the log to stderr + --no-quiet Deactivate quiet mode. (Default) --no-warnings Ignore warnings -s, --simulate Do not download the video and do not write anything to disk @@ -1246,7 +1243,7 @@ # OUTPUT TEMPLATE 1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s` -1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. +1. **Replacement**: A replacement value can be specified using a `&` separator according to the [`str.format` mini-language](https://docs.python.org/3/library/string.html#format-specification-mini-language). If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. E.g. `%(chapters&has chapters|no chapters)s`, `%(title&TITLE={:>20}|NO TITLE)s` 1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s` @@ -1797,7 +1794,10 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off #### generic -* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg +* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE` +* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist +* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` @@ -1833,7 +1833,7 @@ #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` #### twitter -* `force_graphql`: Force usage of the GraphQL API. By default it will only be used if login cookies are provided +* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed **Note**: These options may be changed/removed in the future without concern for backward compatibility @@ -2164,6 +2164,10 @@ #### Not recommended --youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest) --youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest) --youtube-include-hls-manifest Default (Alias: --no-youtube-skip-hls-manifest) + --geo-bypass --xff "default" + --no-geo-bypass --xff "never" + --geo-bypass-country CODE --xff CODE + --geo-bypass-ip-block IP_BLOCK --xff IP_BLOCK #### Developer options diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index e8d94a6ac..1f60abfd2 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1406,6 +1406,7 @@ def test_parse_ism_formats(self): 'vcodec': 'none', 'acodec': 'AACL', 'protocol': 'ism', + 'audio_channels': 2, '_download_params': { 'stream_type': 'audio', 'duration': 8880746666, @@ -1419,9 +1420,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 128, }, { 'format_id': 'video-100', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1445,9 +1443,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 100, }, { 'format_id': 'video-326', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1471,9 +1466,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 326, }, { 'format_id': 'video-698', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1497,9 +1489,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 698, }, { 'format_id': 'video-1493', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1523,9 +1512,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 1493, }, { 'format_id': 'video-4482', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1549,9 +1535,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 4482, }], { 'eng': [ @@ -1575,34 +1558,6 @@ def test_parse_ism_formats(self): 'ec-3_test', 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', [{ - 'format_id': 'audio_deu_1-224', - 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', - 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', - 'ext': 'isma', - 'tbr': 224, - 'asr': 48000, - 'vcodec': 'none', - 'acodec': 'EC-3', - 'protocol': 'ism', - '_download_params': - { - 'stream_type': 'audio', - 'duration': 370000000, - 'timescale': 10000000, - 'width': 0, - 'height': 0, - 'fourcc': 'EC-3', - 'language': 'deu', - 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00', - 'sampling_rate': 48000, - 'channels': 6, - 'bits_per_sample': 16, - 'nal_unit_length_field': 4 - }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 224, - }, { 'format_id': 'audio_deu-127', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1612,8 +1567,9 @@ def test_parse_ism_formats(self): 'vcodec': 'none', 'acodec': 'AACL', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + 'audio_channels': 2, + '_download_params': { 'stream_type': 'audio', 'duration': 370000000, 'timescale': 10000000, @@ -1627,9 +1583,32 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 127, + }, { + 'format_id': 'audio_deu_1-224', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'isma', + 'tbr': 224, + 'asr': 48000, + 'vcodec': 'none', + 'acodec': 'EC-3', + 'protocol': 'ism', + 'language': 'deu', + 'audio_channels': 6, + '_download_params': { + 'stream_type': 'audio', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 0, + 'height': 0, + 'fourcc': 'EC-3', + 'language': 'deu', + 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00', + 'sampling_rate': 48000, + 'channels': 6, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, }, { 'format_id': 'video_deu-23', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1641,8 +1620,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1655,9 +1634,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 23, }, { 'format_id': 'video_deu-403', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1669,8 +1645,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1683,9 +1659,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 403, }, { 'format_id': 'video_deu-680', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1697,8 +1670,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1711,9 +1684,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 680, }, { 'format_id': 'video_deu-1253', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1725,8 +1695,9 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'vbr': 1253, + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1739,9 +1710,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 1253, }, { 'format_id': 'video_deu-2121', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1753,8 +1721,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1767,9 +1735,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 2121, }, { 'format_id': 'video_deu-3275', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1781,8 +1746,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1795,9 +1760,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 3275, }, { 'format_id': 'video_deu-5300', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1809,8 +1771,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1823,9 +1785,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 5300, }, { 'format_id': 'video_deu-8079', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1837,8 +1796,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1851,9 +1810,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 8079, }], {}, ), diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8da1e5e4b..3c26bd7c6 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -822,6 +822,10 @@ def expect_same_infodict(out): test('%(title&foo|baz)s.bar', 'baz.bar') test('%(x,id&foo|baz)s.bar', 'foo.bar') test('%(x,title&foo|baz)s.bar', 'baz.bar') + test('%(id&a\nb|)s', ('a\nb', 'a b')) + test('%(id&hi {:>10} {}|)s', 'hi 1234 1234') + test(R'%(id&{0} {}|)s', 'NA') + test(R'%(id&{0.1}|)s', 'NA') # Laziness def gen(): diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index e090dc791..3283657d7 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -445,6 +445,22 @@ def test_bitwise_operators_overflow(self): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_negative(self): + jsi = JSInterpreter("function f(){return 2 * -2.0;}") + self.assertEqual(jsi.call_function('f'), -4) + + jsi = JSInterpreter('function f(){return 2 - - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + + jsi = JSInterpreter('function f(){return 2 - - - -2;}') + self.assertEqual(jsi.call_function('f'), 4) + + jsi = JSInterpreter('function f(){return 2 - + + - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + + jsi = JSInterpreter('function f(){return 2 + - + - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 87d6ef05b..b4c071f5b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1195,6 +1195,13 @@ def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') + def test_js_to_json_template_literal(self): + self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"') + self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"') + self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"') + self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') + self.assertEqual(js_to_json('`${name}`', {}), '"name"') + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) @@ -2014,6 +2021,8 @@ def test_traverse_obj(self): msg='nested `...` queries should work') self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4), msg='`...` query result should be flattened') + self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)), + msg='`...` should accept iterables') # Test function as key self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), @@ -2021,6 +2030,8 @@ def test_traverse_obj(self): msg='function as query key should perform a filter based on (key, value)') self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, msg='exceptions in the query function should be catched') + self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2], + msg='function key should accept iterables') if __debug__: with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): traverse_obj(_TEST_DATA, lambda a: ...) @@ -2045,6 +2056,17 @@ def test_traverse_obj(self): with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): traverse_obj(_TEST_DATA, {str.upper, str}) + # Test `slice` as a key + _SLICE_DATA = [0, 1, 2, 3, 4] + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None, + msg='slice on a dictionary should not throw') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2], + msg='slice key should apply slice to sequence') + # Test alternative paths self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', msg='multiple `paths` should be treated as alternative paths') @@ -2228,6 +2250,12 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), traverse_string=True), ['s', 'r'], msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') # Test is_user_input behavior _IS_USER_INPUT_DATA = {'range8': list(range(8))} diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 336e80291..e2b3f0870 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -142,6 +142,10 @@ 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', ), + ( + 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', + 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', + ), ] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 149193c77..eb1f06b59 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -21,7 +21,7 @@ import traceback import unicodedata import urllib.request -from string import ascii_letters +from string import Formatter, ascii_letters from .cache import Cache from .compat import compat_os_name, compat_shlex_quote @@ -1161,7 +1161,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): } MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) - INTERNAL_FORMAT_RE = re.compile(rf'''(?x) + INTERNAL_FORMAT_RE = re.compile(rf'''(?xs) (?P-)? (?P{FIELD_RE}) (?P(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*) @@ -1242,6 +1242,14 @@ def _dumpjson_default(obj): return list(obj) return repr(obj) + class _ReplacementFormatter(Formatter): + def get_field(self, field_name, args, kwargs): + if field_name.isdigit(): + return args[0], -1 + raise ValueError('Unsupported field') + + replacement_formatter = _ReplacementFormatter() + def create_key(outer_mobj): if not outer_mobj.group('has_key'): return outer_mobj.group(0) @@ -1263,7 +1271,13 @@ def create_key(outer_mobj): if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = f'0{field_size_compat_map[key]:d}d' - value = default if value is None else value if replacement is None else replacement + if value is None: + value = default + elif replacement is not None: + try: + value = replacement_formatter.format(replacement, value) + except ValueError: + value = na flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' @@ -1668,7 +1682,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) self._fill_common_fields(info_copy, False) - self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self.__forced_printings(info_copy) self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) @@ -1937,7 +1951,7 @@ def _build_format_filter(self, filter_spec): '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* - (?Pwidth|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s* + (?P[\w.-]+)\s* (?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s* ''' % '|'.join(map(re.escape, OPERATORS.keys()))) @@ -2710,7 +2724,7 @@ def is_wellformed(f): self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work - self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) + self.__forced_printings(info_dict) return info_dict format_selector = self.format_selector @@ -2870,6 +2884,12 @@ def _forceprint(self, key, info_dict): if info_dict is None: return info_copy = info_dict.copy() + info_copy.setdefault('filename', self.prepare_filename(info_dict)) + if info_dict.get('requested_formats') is not None: + # For RTMP URLs, also include the playpath + info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) + elif info_dict.get('url'): + info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '') info_copy['formats_table'] = self.render_formats_table(info_dict) info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict) info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles')) @@ -2895,46 +2915,36 @@ def format_tmpl(tmpl): tmpl = format_tmpl(tmpl) self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') if self._ensure_dir_exists(filename): - with open(filename, 'a', encoding='utf-8') as f: - f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n') + with open(filename, 'a', encoding='utf-8', newline='') as f: + f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep) - def __forced_printings(self, info_dict, filename, incomplete): - def print_mandatory(field, actual_field=None): - if actual_field is None: - actual_field = field - if (self.params.get('force%s' % field, False) - and (not incomplete or info_dict.get(actual_field) is not None)): - self.to_stdout(info_dict[actual_field]) - - def print_optional(field): - if (self.params.get('force%s' % field, False) - and info_dict.get(field) is not None): - self.to_stdout(info_dict[field]) - - info_dict = info_dict.copy() - if filename is not None: - info_dict['filename'] = filename - if info_dict.get('requested_formats') is not None: - # For RTMP URLs, also include the playpath - info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) - elif info_dict.get('url'): - info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '') + return info_copy + def __forced_printings(self, info_dict, filename=None, incomplete=True): if (self.params.get('forcejson') or self.params['forceprint'].get('video') or self.params['print_to_file'].get('video')): self.post_extract(info_dict) - self._forceprint('video', info_dict) + if filename: + info_dict['filename'] = filename + info_copy = self._forceprint('video', info_dict) - print_mandatory('title') - print_mandatory('id') - print_mandatory('url', 'urls') - print_optional('thumbnail') - print_optional('description') - print_optional('filename') - if self.params.get('forceduration') and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - print_mandatory('format') + def print_field(field, actual_field=None, optional=False): + if actual_field is None: + actual_field = field + if self.params.get(f'force{field}') and ( + info_copy.get(field) is not None or (not optional and not incomplete)): + self.to_stdout(info_copy[actual_field]) + + print_field('title') + print_field('id') + print_field('url', 'urls') + print_field('thumbnail', optional=True) + print_field('description', optional=True) + print_field('filename', optional=True) + if self.params.get('forceduration') and info_copy.get('duration') is not None: + self.to_stdout(formatSeconds(info_copy['duration'])) + print_field('format') if self.params.get('forcejson'): self.to_stdout(json.dumps(self.sanitize_info(info_dict))) @@ -3316,7 +3326,7 @@ def ffmpeg_fixup(cndn, msg, cls): or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) - ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', + ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments', 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) @@ -3482,7 +3492,7 @@ def run_pp(self, pp, infodict): *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict - def run_all_pps(self, key, info, *, additional_pps=None): + def run_all_pps(self, key, info, *, additional_pps=None, fatal=True): if key != 'video': self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 8e0281b72..0699650db 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -412,12 +412,17 @@ def metadataparser_actions(f): except Exception as err: raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') - geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country - if geo_bypass_code is not None: + opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None + if opts.geo_bypass.lower() not in ('default', 'never'): try: - GeoUtils.random_ipv4(geo_bypass_code) + GeoUtils.random_ipv4(opts.geo_bypass) except Exception: - raise ValueError('unsupported geo-bypass country or ip-block') + raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"') + if len(opts.geo_bypass) == 2: + opts.geo_bypass_country = opts.geo_bypass + else: + opts.geo_bypass_ip_block = opts.geo_bypass + opts.geo_bypass = opts.geo_bypass.lower() != 'never' opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter) @@ -720,7 +725,8 @@ def parse_options(argv=None): 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' )) - opts.quiet = opts.quiet or any_getting or opts.print_json or bool(opts.forceprint) + if opts.quiet is None: + opts.quiet = any_getting or opts.print_json or bool(opts.forceprint) playlist_pps = [pp for pp in postprocessors if pp.get('when') == 'playlist'] write_playlist_infojson = (opts.writeinfojson and not opts.clean_infojson diff --git a/yt_dlp/dependencies/Cryptodome.py b/yt_dlp/dependencies/Cryptodome.py index 74ab6575c..2cfa4c952 100644 --- a/yt_dlp/dependencies/Cryptodome.py +++ b/yt_dlp/dependencies/Cryptodome.py @@ -1,4 +1,4 @@ -import types +from ..compat.compat_utils import passthrough_module try: import Cryptodome as _parent @@ -6,9 +6,11 @@ try: import Crypto as _parent except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python - _parent = types.ModuleType('no_Cryptodome') + _parent = passthrough_module(__name__, 'no_Cryptodome') __bool__ = lambda: False +del passthrough_module + __version__ = '' AES = PKCS1_v1_5 = Blowfish = PKCS1_OAEP = SHA1 = CMAC = RSA = None try: diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 01281b5a1..974c8a254 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -254,6 +254,14 @@ BRMediathekIE, ) from .bravotv import BravoTVIE +from .brainpop import ( + BrainPOPIE, + BrainPOPJrIE, + BrainPOPELLIE, + BrainPOPEspIE, + BrainPOPFrIE, + BrainPOPIlIE, +) from .breakcom import BreakIE from .breitbart import BreitBartIE from .brightcove import ( @@ -298,7 +306,10 @@ CBCGemPlaylistIE, CBCGemLiveIE, ) -from .cbs import CBSIE +from .cbs import ( + CBSIE, + ParamountPressExpressIE, +) from .cbslocal import ( CBSLocalIE, CBSLocalArticleIE, @@ -345,6 +356,7 @@ ) from .ciscowebex import CiscoWebexIE from .cjsw import CJSWIE +from .clipchamp import ClipchampIE from .cliphunter import CliphunterIE from .clippit import ClippitIE from .cliprs import ClipRsIE @@ -441,6 +453,10 @@ ) from .democracynow import DemocracynowIE from .detik import DetikEmbedIE +from .dlf import ( + DLFIE, + DLFCorpusIE, +) from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE @@ -674,10 +690,18 @@ from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE +from .globalplayer import ( + GlobalPlayerLiveIE, + GlobalPlayerLivePlaylistIE, + GlobalPlayerAudioIE, + GlobalPlayerAudioEpisodeIE, + GlobalPlayerVideoIE +) from .globo import ( GloboIE, GloboArticleIE, ) +from .gmanetwork import GMANetworkVideoIE from .go import GoIE from .godtube import GodTubeIE from .gofile import GofileIE @@ -709,13 +733,16 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE +from .hollywoodreporter import ( + HollywoodReporterIE, + HollywoodReporterPlaylistIE, +) from .holodex import HolodexIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( @@ -727,6 +754,7 @@ ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrefli import HrefLiRedirectIE from .hrfensehen import HRFernsehenIE from .hrti import ( HRTiIE, @@ -936,10 +964,6 @@ LimelightChannelIE, LimelightChannelListIE, ) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) from .linkedin import ( LinkedInIE, LinkedInLearningIE, @@ -1219,6 +1243,8 @@ NhkForSchoolBangumiIE, NhkForSchoolSubjectIE, NhkForSchoolProgramListIE, + NhkRadioNewsPageIE, + NhkRadiruIE, ) from .nhl import NHLIE from .nick import ( @@ -1390,6 +1416,7 @@ PeriscopeIE, PeriscopeUserIE, ) +from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE @@ -1606,6 +1633,11 @@ from .rtp import RTPIE from .rtrfm import RTRFMIE from .rts import RTSIE +from .rtvcplay import ( + RTVCPlayIE, + RTVCPlayEmbedIE, + RTVCKalturaIE, +) from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, @@ -1675,6 +1707,7 @@ ) from .scrolller import ScrolllerIE from .seeker import SeekerIE +from .senalcolombia import SenalColombiaLiveIE from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE @@ -1772,6 +1805,7 @@ BellatorIE, ParamountNetworkIE, ) +from .stageplus import StagePlusVODConcertIE from .startrek import StarTrekIE from .stitcher import ( StitcherIE, @@ -1954,6 +1988,7 @@ from .triller import ( TrillerIE, TrillerUserIE, + TrillerShortIE, ) from .trilulilu import TriluliluIE from .trovo import ( @@ -2280,6 +2315,8 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .wevidi import WeVidiIE +from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE from .wimtv import WimTVIE @@ -2334,8 +2371,6 @@ from .yahoo import ( YahooIE, YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, YahooJapanNewsIE, ) from .yandexdisk import YandexDiskIE diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index f611c1f2c..c9166b6b8 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -436,6 +436,16 @@ def _real_extract(self, url): if 3 not in ondemand_types: # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') + info.update(traverse_obj(api_response, { + 'series': ('series', 'title'), + 'season': ('season', 'title'), + 'season_number': ('season', 'sequence'), + 'episode_number': ('episode', 'number'), + })) + if not title: + title = traverse_obj(api_response, ('episode', 'title')) + if not description: + description = traverse_obj(api_response, ('episode', 'content')) m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8' elif video_type == 'slots': diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index e5944f714..68a970f68 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1573,7 +1573,7 @@ def extract_redirect_url(html, url=None, fatal=False): }), headers={ 'Content-Type': 'application/x-www-form-urlencoded' }) - elif mso_id == 'Spectrum': + elif mso_id in ('Spectrum', 'Charter_Direct'): # Spectrum's login for is dynamically loaded via JS so we need to hardcode the flow # as a one-off implementation. provider_redirect_page, urlh = provider_redirect_page_res diff --git a/yt_dlp/extractor/aeonco.py b/yt_dlp/extractor/aeonco.py index 4655862e3..390eae32b 100644 --- a/yt_dlp/extractor/aeonco.py +++ b/yt_dlp/extractor/aeonco.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from .vimeo import VimeoIE +from ..utils import ExtractorError, traverse_obj, url_or_none class AeonCoIE(InfoExtractor): @@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor): } }, { 'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', - 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'md5': '03582d795382e49f2fd0b427b55de409', 'info_dict': { - 'id': '728595228', + 'id': '759576926', 'ext': 'mp4', 'title': 'Wrought', - 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', - 'uploader': 'Biofilm Productions', - 'uploader_id': 'user140352216', - 'uploader_url': 'https://vimeo.com/user140352216', + 'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280', + 'uploader': 'Aeon Video', + 'uploader_id': 'aeonvideo', + 'uploader_url': 'https://vimeo.com/aeonvideo', 'duration': 1344 } + }, { + 'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out', + 'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b', + 'info_dict': { + 'id': 'emyi4z-O0ls', + 'ext': 'mp4', + 'title': 'How to outsmart the Prisoner’s Dilemma - Lucas Husted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp', + 'uploader': 'TED-Ed', + 'uploader_id': '@TEDEd', + 'uploader_url': 'https://www.youtube.com/@TEDEd', + 'duration': 344, + 'upload_date': '20200827', + 'channel_id': 'UCsooa4yRKGN_zEE8iknghZA', + 'playable_in_embed': True, + 'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3', + 'categories': ['Education'], + 'like_count': int, + 'channel': 'TED-Ed', + 'chapters': 'count:7', + 'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA', + 'tags': 'count:26', + 'availability': 'public', + 'channel_follower_count': int, + 'view_count': int, + 'age_limit': 0, + 'live_status': 'not_live', + 'comment_count': int, + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - vimeo_id = self._search_regex(r'hosterId":\s*"(?P[0-9]+)', webpage, 'vimeo id') - vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') - return self.url_result(vimeo_url, VimeoIE) + embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), ( + lambda _, v: v['@type'] == 'VideoObject', 'embedUrl', {url_or_none}), get_all=False) + if not embed_url: + raise ExtractorError('No embed URL found in webpage') + if 'player.vimeo.com' in embed_url: + embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/') + return self.url_result(embed_url) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c34439779..faa2218ce 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -26,6 +26,7 @@ srt_subtitles_timecode, str_or_none, traverse_obj, + unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, @@ -133,7 +134,7 @@ def _get_all_children(self, reply): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -281,19 +282,60 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 1643947497, + 'upload_date': '20220204', + 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - video_data = initial_state['videoData'] + is_festival = 'videoData' not in initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, @@ -316,20 +358,39 @@ def _real_extract(self, url): cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + festival_info = {} + if is_festival: + play_info = self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note='Extracting festival video formats')['data'] + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + return { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), 'subtitles': self.extract_subtitles(video_id, aid, cid), @@ -996,6 +1057,53 @@ class BiliIntlIE(BiliIntlBaseIE): 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', 'upload_date': '20221212', 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', + }, + }, { + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', + 'info_dict': { + 'id': '340317', + 'ext': 'mp4', + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '' + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro' + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro' + }], + }, + 'params': { + 'getcomments': True + } + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'comment_count': int, + 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + }, + 'params': { + 'getcomments': True } }, { # episode id without intro and outro @@ -1055,11 +1163,69 @@ def _extract_video_metadata(self, url, video_id, season_id): # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { + self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { 'title': self._html_search_meta('og:title', webpage), 'description': self._html_search_meta('og:description', webpage) }) + def _get_comments_reply(self, root_id, next_id=0, display_id=None): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/detail', display_id, + note=f'Downloading reply comment of {root_id} - {next_id}', + query={ + 'platform': 'web', + 'ps': 20, # comment's reply per page (default: 3) + 'root': root_id, + 'next': next_id, + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'parent': replies.get('parent'), + 'timestamp': unified_timestamp(replies.get('ctime_text')) + } + + if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + yield from self._get_comments_reply( + root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + + def _get_comments(self, video_id, ep_id): + for i in itertools.count(0): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/root', video_id, + note=f'Downloading comment page {i + 1}', + query={ + 'platform': 'web', + 'pn': i, # page number + 'ps': 20, # comment per page (default: 20) + 'oid': video_id, + 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content + 'sort_type': 1, # 1: best, 2: recent + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), + } + if replies.get('count'): + yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + + if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + break + def _real_extract(self, url): season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') video_id = ep_id or aid @@ -1087,7 +1253,8 @@ def _real_extract(self, url): **self._extract_video_metadata(url, video_id, season_id), 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), - 'chapters': chapters + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id) } diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 10e7b0b2b..a6779505e 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor): def _check_format(self, video_url, video_id): urls = orderedSet( re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) - for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128', + 'seed132', 'seed150', 'seed151', 'seed152', 'seed153', + 'seed167', 'seed171', 'seed177', 'seed305', 'seed307', + 'seedp29xb', 'zb10-7gsop1v78')) for url in urls: try: response = self._request_webpage( diff --git a/yt_dlp/extractor/brainpop.py b/yt_dlp/extractor/brainpop.py new file mode 100644 index 000000000..1200437e6 --- /dev/null +++ b/yt_dlp/extractor/brainpop.py @@ -0,0 +1,318 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + classproperty, + int_or_none, + traverse_obj, + urljoin +) + + +class BrainPOPBaseIE(InfoExtractor): + _NETRC_MACHINE = 'brainpop' + _ORIGIN = '' # So that _VALID_URL doesn't crash + _LOGIN_ERRORS = { + 1502: 'The username and password you entered did not match.', # LOGIN_FAILED + 1503: 'Payment method is expired.', # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE + 1506: 'Your BrainPOP plan has expired.', # LOGIN_FAILED_ACCOUNT_EXPIRED + 1507: 'Terms not accepted.', # LOGIN_FAILED_TERMS_NOT_ACCEPTED + 1508: 'Account not activated.', # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE + 1512: 'The maximum number of devices permitted are logged in with your account right now.', # LOGIN_FAILED_LOGIN_LIMIT_REACHED + 1513: 'You are trying to access your account from outside of its allowed IP range.', # LOGIN_FAILED_INVALID_IP + 1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.', # LOGIN_FAILED_MBP_DISABLED + 1515: 'Account not activated.', # LOGIN_FAILED_TEACHER_NOT_ACTIVE + 1523: 'That username and password won\'t work on this BrainPOP site.', # LOGIN_FAILED_NO_ACCESS + 1524: 'You\'ll need to join a class before you can login.', # LOGIN_FAILED_STUDENT_NO_PERIOD + 1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.', # LOGIN_FAILED_ACCOUNT_LOCKED + } + + @classproperty + def _VALID_URL(cls): + root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?') + return rf'{root}/(?P[^/]+/[^/]+/(?P[^/?#&]+))' + + def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}): + formats = [] + formats = self._extract_m3u8_formats( + f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}', + display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False) + formats.append({ + 'format_id': format_id, + 'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}', + }) + for f in formats: + f.update(extra_fields) + return formats + + def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}): + formats = [] + additional_key_formats = { + '%s': {}, + 'ad_%s': { + 'format_note': 'Audio description', + 'source_preference': -2 + } + } + for additional_key_format, additional_key_fields in additional_key_formats.items(): + for key_quality, key_index in enumerate(('high', 'low')): + full_key_index = additional_key_format % (key_format % key_index) + if data.get(full_key_index): + formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, { + 'quality': -1 - key_quality, + **additional_key_fields, + **extra_fields + })) + return formats + + def _perform_login(self, username, password): + login_res = self._download_json( + 'https://api.brainpop.com/api/login', None, + data=json.dumps({'username': username, 'password': password}).encode(), + headers={ + 'Content-Type': 'application/json', + 'Referer': self._ORIGIN + }, note='Logging in', errnote='Unable to log in', expected_status=400) + status_code = int_or_none(login_res['status_code']) + if status_code != 1505: + self.report_warning( + f'Unable to login: {self._LOGIN_ERRORS.get(status_code) or login_res.get("message")}' + or f'Got status code {status_code}') + + +class BrainPOPIE(BrainPOPBaseIE): + _ORIGIN = 'https://www.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com' + _TESTS = [{ + 'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null', + 'md5': '3ead374233ae74c7f1b0029a01c972f0', + 'info_dict': { + 'id': '1f3259fa457292b4', + 'ext': 'mp4', + 'title': 'Martin Luther King, Jr.', + 'display_id': 'martinlutherkingjr', + 'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349', + }, + }, { + 'url': 'https://www.brainpop.com/science/space/bigbang/', + 'md5': '9a1ff0e77444dd9e437354eb669c87ec', + 'info_dict': { + 'id': 'acae52cd48c99acf', + 'ext': 'mp4', + 'title': 'Big Bang', + 'display_id': 'bigbang', + 'description': 'md5:3e53b766b0f116f631b13f4cae185d38', + }, + 'skip': 'Requires login', + }] + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + movie_data = self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id, + 'Downloading movie data JSON', 'Unable to download movie data')['data'] + topic_data = traverse_obj(self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id, + 'Downloading topic data JSON', 'Unable to download topic data', fatal=False), + ('data', 'topic'), expected_type=dict) or movie_data['topic'] + + if not traverse_obj(movie_data, ('access', 'allow')): + reason = traverse_obj(movie_data, ('access', 'reason')) + if 'logged' in reason: + self.raise_login_required(reason, metadata_available=True) + else: + self.raise_no_formats(reason, video_id=display_id) + movie_feature = movie_data['feature'] + movie_feature_data = movie_feature['data'] + + formats, subtitles = [], {} + formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', { + 'language': movie_feature.get('language') or 'en', + 'language_preference': 10 + })) + for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items(): + formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', { + 'language': lang, + 'language_preference': -10 + })) + + # TODO: Do localization fields also have subtitles? + for name, url in movie_feature_data.items(): + lang = self._search_regex( + r'^subtitles_(?P\w+)$', name, 'subtitle metadata', default=None) + if lang and url: + subtitles.setdefault(lang, []).append({ + 'url': urljoin(self._CDN_URL, url) + }) + + return { + 'id': topic_data['topic_id'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BrainPOPLegacyBaseIE(BrainPOPBaseIE): + def _parse_js_topic_data(self, topic_data, display_id, token): + movie_data = topic_data['movies'] + # TODO: Are there non-burned subtitles? + formats = self._extract_adaptive_formats(movie_data, token, display_id) + + return { + 'id': topic_data['EntryID'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'alt_title': topic_data.get('title'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + } + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + webpage = self._download_webpage(url, display_id) + topic_data = self._search_json( + r'var\s+content\s*=\s*', webpage, 'content data', + display_id, end_pattern=';')['category']['unit']['topic'] + token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token') + return self._parse_js_topic_data(topic_data, display_id, token) + + +class BrainPOPJrIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://jr.brainpop.com' + _VIDEO_URL = 'https://svideos-jr.brainpop.com' + _HLS_URL = 'https://hls-jr.brainpop.com' + _CDN_URL = 'https://cdn-jr.brainpop.com' + _TESTS = [{ + 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/', + 'md5': '04e0561bb21770f305a0ce6cf0d869ab', + 'info_dict': { + 'id': '347', + 'ext': 'mp4', + 'title': 'Emotions', + 'display_id': 'emotions', + }, + }, { + 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/', + 'md5': 'b0ed063bbd1910df00220ee29340f5d6', + 'info_dict': { + 'id': '29', + 'ext': 'mp4', + 'title': 'Arctic Habitats', + 'display_id': 'arctichabitats', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPELLIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://ell.brainpop.com' + _VIDEO_URL = 'https://svideos-esl.brainpop.com' + _HLS_URL = 'https://hls-esl.brainpop.com' + _CDN_URL = 'https://cdn-esl.brainpop.com' + _TESTS = [{ + 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/', + 'md5': 'a2012700cfb774acb7ad2e8834eed0d0', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Lesson 1', + 'display_id': 'lesson1', + 'alt_title': 'Personal Pronouns', + }, + }, { + 'url': 'https://ell.brainpop.com/level3/unit6/lesson5/', + 'md5': 'be19c8292c87b24aacfb5fda2f3f8363', + 'info_dict': { + 'id': '101', + 'ext': 'mp4', + 'title': 'Lesson 5', + 'display_id': 'lesson5', + 'alt_title': 'Review: Unit 6', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPEspIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Español' + _ORIGIN = 'https://esp.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/mx' + _TESTS = [{ + 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/', + 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9', + 'info_dict': { + 'id': '3893', + 'ext': 'mp4', + 'title': 'Ecosistemas', + 'display_id': 'ecosistemas', + 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3', + }, + }, { + 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/', + 'md5': '98c1b9559e0e33777209c425cda7dac4', + 'info_dict': { + 'id': '7146', + 'ext': 'mp4', + 'title': 'Emily Dickinson', + 'display_id': 'emily_dickinson', + 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPFrIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Français' + _ORIGIN = 'https://fr.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/fr' + _TESTS = [{ + 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/', + 'md5': '97e7f48af8af93f8a2be11709f239371', + 'info_dict': { + 'id': '1651', + 'ext': 'mp4', + 'title': 'Sources d\'énergie', + 'display_id': 'sourcesdenergie', + 'description': 'md5:7eece350f019a21ef9f64d4088b2d857', + }, + }, { + 'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/', + 'md5': '0cf2b4f89804d0dd4a360a51310d445a', + 'info_dict': { + 'id': '5803', + 'ext': 'mp4', + 'title': 'Plagiat', + 'display_id': 'plagiat', + 'description': 'md5:4496d87127ace28e8b1eda116e77cd2b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPIlIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Hebrew' + _ORIGIN = 'https://il.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/he' + _TESTS = [{ + 'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/', + 'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641', + 'info_dict': { + 'id': '3782', + 'ext': 'mp4', + 'title': 'md5:e993632fcda0545d9205602ec314ad67', + 'display_id': 'subjects_3782', + 'description': 'md5:4cc084a8012beb01f037724423a4d4ed', + }, + }] diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index d4895848e..d4bf9b53b 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -1,117 +1,185 @@ -import re - from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, - int_or_none, + extract_attributes, float_or_none, - try_get, - dict_get, + get_element_html_by_class, + int_or_none, + merge_dicts, + parse_age_limit, + remove_end, + str_or_none, + traverse_obj, + unescapeHTML, + unified_timestamp, + update_url_query, + url_or_none, ) class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', 'info_dict': { - 'id': 'epL0pmK1kQlT', + 'id': '3923059', 'ext': 'mp4', 'title': 'The Top Chef Season 16 Winner Is...', 'description': 'Find out who takes the title of Top Chef!', - 'uploader': 'NBCU-BRAV', 'upload_date': '20190314', 'timestamp': 1552591860, 'season_number': 16, 'episode_number': 15, 'series': 'Top Chef', 'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.0, - } + 'duration': 190.357, + 'season': 'Season 16', + 'thumbnail': r're:^https://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, + 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', + 'info_dict': { + 'id': '9000234570', + 'ext': 'mp4', + 'title': 'London Calling', + 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', + 'upload_date': '20230310', + 'timestamp': 1678410000, + 'season_number': 20, + 'episode_number': 1, + 'series': 'Top Chef', + 'episode': 'London Calling', + 'duration': 3266.03, + 'season': 'Season 20', + 'chapters': 'count:7', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', + 'info_dict': { + 'id': '3692045', + 'ext': 'mp4', + 'title': 'Closing Night', + 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', + 'upload_date': '20180401', + 'timestamp': 1522623600, + 'season_number': 1, + 'episode_number': 1, + 'series': 'In Ice Cold Blood', + 'episode': 'Closing Night', + 'duration': 2629.051, + 'season': 'Season 1', + 'chapters': 'count:6', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', }, { 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'info_dict': { + 'id': '3974019', + 'ext': 'mp4', + 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', + 'upload_date': '20190617', + 'timestamp': 1560790800, + 'season_number': 2, + 'episode_number': 16, + 'series': 'In Ice Cold Blood', + 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'duration': 68.235, + 'season': 'Season 2', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, }] def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() + site, display_id = self._match_valid_url(url).group('site', 'id') webpage = self._download_webpage(url, display_id) - settings = self._parse_json(self._search_regex( - r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), - display_id) - info = {} + settings = self._search_json( + r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') query = { - 'mbr': 'true', + 'manifest': 'm3u', + 'formats': 'm3u,mpeg4', } - account_pid, release_pid = [None] * 2 - tve = settings.get('ls_tve') + if tve: - query['manifest'] = 'm3u' - mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) - if mobj: - account_pid, tp_path = mobj.groups() - release_pid = tp_path.strip('/').split('/')[-1] - else: - account_pid = 'HNK2IC' - tp_path = release_pid = tve['release_pid'] - if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('tve_adobe_auth', {}) - if site == 'bravotv': - site = 'bravo' + account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} + site = remove_end(site, 'tv') + release_pid = tve['data-release-pid'] resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId') or site, - tve['title'], release_pid, tve.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, release_pid, - adobe_pass.get('adobePassRequestorId') or site, resource) + tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, + tve['data-title'], release_pid, tve.get('data-rating')) + query.update({ + 'switch': 'HLSServiceSecure', + 'auth': self._extract_mvpd_auth( + url, release_pid, auth.get('adobePassRequestorId') or site, resource), + }) + else: - shared_playlist = settings['ls_playlist'] - account_pid = shared_playlist['account_pid'] - metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] - tp_path = release_pid = metadata.get('release_pid') - if not release_pid: - release_pid = metadata['guid'] - tp_path = 'media/guid/2140479951/' + release_pid - info.update({ - 'title': metadata['title'], - 'description': metadata.get('description'), - 'season_number': int_or_none(metadata.get('season_num')), - 'episode_number': int_or_none(metadata.get('episode_num')), - }) - query['switch'] = 'progressive' - - tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path) + ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} + account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' + account_id = ls_playlist['mpxMediaAccountId'] + video_id = ls_playlist['defaultGuid'] + metadata = traverse_obj( + ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) + tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), - display_id, fatal=False) - if tp_metadata: - info.update({ - 'title': tp_metadata.get('title'), - 'description': tp_metadata.get('description'), - 'duration': float_or_none(tp_metadata.get('duration'), 1000), - 'season_number': int_or_none( - dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))), - 'episode_number': int_or_none( - dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))), - # For some reason the series is sometimes wrapped into a single element array. - 'series': try_get( - dict_get(tp_metadata, ('pl1$show', 'nbcu$show')), - lambda x: x[0] if isinstance(x, list) else x, - expected_type=str), - 'episode': dict_get( - tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')), - }) + update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'id': release_pid, - 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}), - 'ie_key': 'ThePlatform', - }) - return info + seconds_or_none = lambda x: float_or_none(x, 1000) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {seconds_or_none}), + 'end_time': ('endTime', {seconds_or_none}), + })) + # prune pointless single chapters that span the entire duration from short videos + if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): + chapters = None + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + update_url_query(f'{tp_url}/stream.m3u8', query), video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + **merge_dicts(traverse_obj(tp_metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {seconds_or_none}), + 'timestamp': ('pubDate', {seconds_or_none}), + 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), + 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), + 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), + 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), + }, get_all=False), traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {unified_timestamp}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': 'episodeTitle', + 'series': 'show', + })) + } diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2b7ddcae8..cd0e8ff27 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -575,6 +575,7 @@ def build_format_id(kind): self.raise_no_formats( error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + headers.pop('Authorization', None) # or else http formats will give error 400 for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -895,8 +896,9 @@ def extract_policy_key(): store_pk(policy_key) return policy_key - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} + token = smuggled_data.get('token') + api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}' + headers = {'Authorization': f'Bearer {token}'} if token else {} referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key if referrer: headers.update({ diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index eadb3f8c0..e42f06246 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -8,14 +8,16 @@ compat_str, ) from ..utils import ( + ExtractorError, int_or_none, join_nonempty, js_to_json, orderedSet, + parse_iso8601, smuggle_url, strip_or_none, + traverse_obj, try_get, - ExtractorError, ) @@ -404,7 +406,7 @@ def _real_extract(self, url): class CBCGemPlaylistIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:playlist' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' _TESTS = [{ # TV show playlist, all public videos 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', @@ -414,6 +416,9 @@ class CBCGemPlaylistIE(InfoExtractor): 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', }, + }, { + 'url': 'https://gem.cbc.ca/schitts-creek/s06', + 'only_matching': True, }] _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' @@ -473,49 +478,90 @@ def _real_extract(self, url): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P\d+)' - _TEST = { - 'url': 'https://gem.cbc.ca/live/920604739687', - 'info_dict': { - 'title': 'Ottawa', - 'description': 'The live TV channel and local programming from Ottawa', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, - 'id': 'AyqZwxRqh8EH', - 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', - 'uploader': 'CBCC-NEW', + _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P\d+)' + _TESTS = [ + { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', }, - 'skip': 'Live might have ended', - } - - # It's unclear where the chars at the end come from, but they appear to be - # constant. Might need updating in the future. - # There are two URLs, some livestreams are in one, and some - # in the other. The JSON schema is the same for both. - _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + { + 'url': 'https://gem.cbc.ca/live/44', + 'info_dict': { + 'id': '44', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^Ottawa [0-9\-: ]+', + 'description': 'The live TV channel and local programming from Ottawa', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*' + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { + 'url': 'https://gem.cbc.ca/live-event/10835', + 'info_dict': { + 'id': '10835', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+', + 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + 'timestamp': 1679706000, + 'upload_date': '20230325', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + } + ] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - for api_url in self._API_URLS: - video_info = next(( - stream for stream in self._download_json(api_url, video_id)['entries'] - if stream.get('guid') == video_id), None) - if video_info: - break - else: + # Two types of metadata JSON + if not video_info.get('formattedIdMedia'): + video_info = traverse_obj( + video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}), + get_all=False, default={}) + + video_stream_id = video_info.get('formattedIdMedia') + if not video_stream_id: raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + stream_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'mpx', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestType': 'desktop', + }) + return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': video_info['content'][0]['url'], 'id': video_id, - 'title': video_info.get('title'), - 'description': video_info.get('description'), - 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), - 'thumbnail': video_info.get('cbc$staticImage'), + 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True), 'is_live': True, + **traverse_obj(video_info, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('images', 'card', 'url'), + 'timestamp': ('airDate', {parse_iso8601}), + }) } diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 9aacd50c4..1c0dbdea9 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -1,8 +1,14 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor from .theplatform import ThePlatformFeedIE +from .youtube import YoutubeIE from ..utils import ( ExtractorError, + extract_attributes, + get_element_html_by_id, int_or_none, find_xpath_attr, + smuggle_url, xpath_element, xpath_text, update_url_query, @@ -162,3 +168,110 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')), }) + + +class ParamountPressExpressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?Pyt-)?video/?\?watch=(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx', + 'md5': '56631dbcadaab980d1fc47cb7b76cba4', + 'info_dict': { + 'id': '6322981580112', + 'ext': 'mp4', + 'title': 'I’m Felicia', + 'description': 'md5:88fad93f8eede1c9c8f390239e4c6290', + 'uploader_id': '6055873637001', + 'upload_date': '20230320', + 'timestamp': 1679334960, + 'duration': 49.557, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc', + 'md5': 'edcb03e3210b88a3e56c05aa863e0e5b', + 'info_dict': { + 'id': '6323036027112', + 'ext': 'mp4', + 'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More', + 'description': 'md5:b929867a357aac5544b783d834c78383', + 'uploader_id': '6055873637001', + 'upload_date': '20230321', + 'timestamp': 1679430180, + 'duration': 132.032, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck', + 'info_dict': { + 'id': 'OX9wJWOcqck', + 'ext': 'mp4', + 'title': 'Rugrats | Season 2 Official Trailer | Paramount+', + 'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de', + 'uploader': 'Paramount Plus', + 'uploader_id': '@paramountplus', + 'uploader_url': 'http://www.youtube.com/@paramountplus', + 'channel': 'Paramount Plus', + 'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg', + 'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg', + 'upload_date': '20230316', + 'duration': 88, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': ['Rugrats'], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw', + 'info_dict': { + 'id': '_ljssSoDLkw', + 'ext': 'mp4', + 'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME', + 'description': 'md5:39581bcc3fd810209b642609f448af70', + 'uploader': 'SHOWTIME', + 'uploader_id': '@Showtime', + 'uploader_url': 'http://www.youtube.com/@Showtime', + 'channel': 'SHOWTIME', + 'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ', + 'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ', + 'upload_date': '20230209', + 'duration': 49, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp', + 'categories': ['People & Blogs'], + 'tags': 'count:27', + }, + }] + + def _real_extract(self, url): + display_id, is_youtube = self._match_valid_url(url).group('id', 'yt') + if is_youtube: + return self.url_result(display_id, YoutubeIE) + + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID') + token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token') + + player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '') + account_id = player.get('data-account') or '6055873637001' + player_id = player.get('data-player') or 'OtLKgXlO9F' + embed = player.get('data-embed') or 'default' + + return self.url_result(smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}', + {'token': token}), BrightcoveNewIE) diff --git a/yt_dlp/extractor/clipchamp.py b/yt_dlp/extractor/clipchamp.py new file mode 100644 index 000000000..a8bdf7e50 --- /dev/null +++ b/yt_dlp/extractor/clipchamp.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError(f'Unsupported clip storage location "{storage_location}"') + + path = data['download_url'] + iframe = self._download_webpage( + f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe, + 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return { + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None, + **traverse_obj(data, { + 'title': ('project', 'project_name', {str}), + 'timestamp': ('created_at', {unified_timestamp}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 815538248..e36b17dbb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2998,6 +2998,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, @@ -3528,8 +3530,8 @@ def _RETURN_TYPE(cls): @classmethod def is_single_video(cls, url): """Returns whether the URL is of a single video, None if unknown""" - assert cls.suitable(url), 'The URL must be suitable for the extractor' - return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + if cls.suitable(url): + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) @classmethod def is_suitable(cls, age_limit): @@ -3671,18 +3673,22 @@ def _extract_chapters_helper(self, chapter_list, start_function, title_function, 'start_time': start_function(chapter), 'title': title_function(chapter), } for chapter in chapter_list or []] - if not strict: + if strict: + warn = self.report_warning + else: + warn = self.write_debug chapter_list.sort(key=lambda c: c['start_time'] or 0) chapters = [{'start_time': 0}] for idx, chapter in enumerate(chapter_list): if chapter['start_time'] is None: - self.report_warning(f'Incomplete chapter {idx}') + warn(f'Incomplete chapter {idx}') elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: chapters.append(chapter) elif chapter not in chapters: - self.report_warning( - f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') + issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration + else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}') + warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"') return chapters[1:] def _extract_chapters_from_description(self, description, duration): diff --git a/yt_dlp/extractor/dlf.py b/yt_dlp/extractor/dlf.py new file mode 100644 index 000000000..88a4149b5 --- /dev/null +++ b/yt_dlp/extractor/dlf.py @@ -0,0 +1,192 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + traverse_obj, + url_or_none, +) + + +class DLFBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/' + _BUTTON_REGEX = r'(]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)' + + def _parse_button_attrs(self, button, audio_id=None): + attrs = extract_attributes(button) + audio_id = audio_id or attrs['data-audio-diraid'] + + url = traverse_obj( + attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference', + 'data-audio-src', expected_type=url_or_none) + ext = determine_ext(url) + + return { + 'id': audio_id, + 'extractor_key': DLFIE.ie_key(), + 'extractor': DLFIE.IE_NAME, + **traverse_obj(attrs, { + 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}), + 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}), + 'thumbnail': ('data-audioimage', {url_or_none}), + 'uploader': 'data-audio-producer', + 'series': 'data-audio-series', + 'channel': 'data-audio-origin-site-name', + 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}), + }, get_all=False), + 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False) + if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}]) + } + + +class DLFIE(DLFBaseIE): + IE_NAME = 'dlf' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P[\da-f]{8})-100\.html' + _TESTS = [ + # Audio as an HLS stream + { + 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html', + 'info_dict': { + 'id': '03a3eb19', + 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien', + 'ext': 'm4a', + 'duration': 3298, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'On Stage', + 'channel': 'deutschlandfunk' + }, + 'params': { + 'skip_download': 'm3u8' + }, + 'skip': 'This webpage no longer exists' + }, { + 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html', + 'info_dict': { + 'id': 'd9cc1856', + 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner', + 'ext': 'mp3', + 'duration': 291, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'Kommentare und Themen der Woche', + 'channel': 'deutschlandfunk' + } + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + return self._parse_button_attrs( + self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id) + + +class DLFCorpusIE(DLFBaseIE): + IE_NAME = 'dlf:corpus' + IE_DESC = 'DLF Multi-feed Archives' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html' + _TESTS = [ + # Recorded news broadcast with referrals to related broadcasts + { + 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html', + 'info_dict': { + 'id': 'fechten-russland-belarus-ukraine-protest-100', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad' + }, + 'playlist_mincount': 5, + 'playlist': [{ + 'info_dict': { + 'id': '1fc5d64a', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'ext': 'mp3', + 'duration': 252, + 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '2ada145f', + 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten', + 'ext': 'mp3', + 'duration': 336, + 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005', + 'uploader': 'Deutschlandfunk', + 'series': 'Deutschlandfunk Nova', + 'channel': 'deutschlandfunk-nova' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '47e1a096', + 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"', + 'ext': 'mp3', + 'duration': 602, + 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }] + }, + # Podcast feed with tag buttons, playlist count fluctuates + { + 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html', + 'info_dict': { + 'id': 'kommentare-und-themen-der-woche-100', + 'title': 'Meinung - Kommentare und Themen der Woche', + 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5', + }, + 'playlist_mincount': 10, + }, + # Podcast feed with no description + { + 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html', + 'info_dict': { + 'id': 'podcast-tolle-idee-100', + 'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?', + }, + 'playlist_mincount': 11, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage, default=None), + 'title': self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=None), + 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)), + } diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py index 470546bbc..6c381aa14 100644 --- a/yt_dlp/extractor/drtv.py +++ b/yt_dlp/extractor/drtv.py @@ -12,7 +12,6 @@ mimetype2ext, str_or_none, traverse_obj, - try_get, unified_timestamp, update_url_query, url_or_none, @@ -25,7 +24,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?Pradio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P[\da-z_-]+) @@ -80,7 +79,7 @@ class DRTVIE(InfoExtractor): 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3504.618, + 'duration': 3504.619, 'formats': 'mincount:20', 'release_year': 2017, 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', @@ -101,14 +100,16 @@ class DRTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bonderøven 2019 (1:8)', 'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', - 'timestamp': 1603188600, - 'upload_date': '20201020', + 'timestamp': 1654856100, + 'upload_date': '20220610', 'duration': 2576.6, 'season': 'Bonderøven 2019', 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', 'release_year': 2019, 'season_number': 2019, - 'series': 'Frank & Kastaniegaarden' + 'series': 'Frank & Kastaniegaarden', + 'episode_number': 1, + 'episode': 'Episode 1', }, 'params': { 'skip_download': True, @@ -140,10 +141,26 @@ class DRTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'this video has been removed', + }, { + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9', + 'info_dict': { + 'ext': 'mp4', + 'id': '14802310112', + 'timestamp': 1678786200, + 'duration': 120.043, + 'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f', + 'series': 'P4 København regionale nyheder', + 'upload_date': '20230314', + 'release_year': 0, + 'description': 'Hør seneste regionale nyheder fra P4 København.', + 'season': 'Regionale nyheder', + 'title': 'Regionale nyheder', + }, }] def _real_extract(self, url): - raw_video_id = self._match_id(url) + raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio') webpage = self._download_webpage(url, raw_video_id) @@ -170,15 +187,17 @@ def _real_extract(self, url): programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) else: programcard_url = _PROGRAMCARD_BASE - page = self._parse_json( - self._search_regex( - r'data\s*=\s*({.+?})\s*(?:;|[^?/#]+)' + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?:videos|(?P
a))/(?P[^?/#]+)' _TESTS = [{ 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', @@ -41,19 +41,37 @@ class GeniusIE(InfoExtractor): 'timestamp': 1631209167, 'thumbnail': r're:^https?://.*\.jpg$', }, + }, { + 'url': 'https://genius.com/a/cordae-anderson-paak-break-down-the-meaning-of-two-tens', + 'md5': 'f98a4e03b16b0a2821bd6e52fb3cc9d7', + 'info_dict': { + 'id': '6321509903112', + 'ext': 'mp4', + 'title': 'Cordae & Anderson .Paak Breaks Down The Meaning Of “Two Tens”', + 'description': 'md5:1255f0e1161d07342ce56a8464ac339d', + 'tags': ['song id: 5457554'], + 'uploader_id': '4863540648001', + 'duration': 361.813, + 'upload_date': '20230301', + 'timestamp': 1677703908, + 'thumbnail': r're:^https?://.*\.jpg$', + }, }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, is_article = self._match_valid_url(url).group('id', 'article') webpage = self._download_webpage(url, display_id) metadata = self._search_json( - r'[^?/#]+)-lyrics[?/#]?' + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P[^?/#]+)-lyrics(?:[?/#]|$)' _TESTS = [{ 'url': 'https://genius.com/Lil-baby-heyy-lyrics', 'playlist_mincount': 2, diff --git a/yt_dlp/extractor/globalplayer.py b/yt_dlp/extractor/globalplayer.py new file mode 100755 index 000000000..e0c0d58fd --- /dev/null +++ b/yt_dlp/extractor/globalplayer.py @@ -0,0 +1,254 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + join_nonempty, + parse_duration, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + urlhandle_detect_ext, +) + + +class GlobalPlayerBaseIE(InfoExtractor): + def _get_page_props(self, url, video_id): + webpage = self._download_webpage(url, video_id) + return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + + def _request_ext(self, url, video_id): + return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests + url, video_id, note='Determining source extension')) + + def _extract_audio(self, episode, series): + return { + 'vcodec': 'none', + **traverse_obj(series, { + 'series': 'title', + 'series_id': 'id', + 'thumbnail': 'imageUrl', + 'uploader': 'itunesAuthor', # podcasts only + }), + **traverse_obj(episode, { + 'id': 'id', + 'description': ('description', {clean_html}), + 'duration': ('duration', {parse_duration}), + 'thumbnail': 'imageUrl', + 'url': 'streamUrl', + 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}), + 'title': 'title', + }, get_all=False) + } + + +class GlobalPlayerLiveIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P\w+)/\w+' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/live/smoothchill/uk/', + 'info_dict': { + 'id': '2mx1E', + 'ext': 'aac', + 'display_id': 'smoothchill-uk', + 'title': 're:^Smooth Chill.+$', + 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png', + 'description': 'Music To Chill To', + 'live_status': 'is_live', + }, + }, { + # national station + 'url': 'https://www.globalplayer.com/live/heart/uk/', + 'info_dict': { + 'id': '2mwx4', + 'ext': 'aac', + 'description': 'turn up the feel good!', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'live_status': 'is_live', + 'title': 're:^Heart UK.+$', + 'display_id': 'heart-uk', + }, + }, { + # regional variation + 'url': 'https://www.globalplayer.com/live/heart/london/', + 'info_dict': { + 'id': 'AMqg', + 'ext': 'aac', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'title': 're:^Heart London.+$', + 'live_status': 'is_live', + 'display_id': 'heart-london', + 'description': 'turn up the feel good!', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['station'] + stream_url = station['streamUrl'] + + return { + 'id': station['id'], + 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'), + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': (('name', 'brandName'), {str_or_none}), + 'description': 'tagline', + 'thumbnail': 'brandLogo', + }, get_all=False), + } + + +class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P\w+)' + _TESTS = [{ + # "live playlist" + 'url': 'https://www.globalplayer.com/playlists/8bLk/', + 'info_dict': { + 'id': '8bLk', + 'ext': 'aac', + 'live_status': 'is_live', + 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', + 'title': 're:^Classic FM Hall of Fame.+$' + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['playlistData'] + stream_url = station['streamUrl'] + + return { + 'id': video_id, + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': 'title', + 'description': 'description', + 'thumbnail': 'image', + }), + } + + +class GlobalPlayerAudioIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)/|catchup/\w+/\w+/)(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/42KuaM/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '42KuaM', + 'title': 'Filthy Ritual', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'categories': ['Society & Culture', 'True Crime'], + 'uploader': 'Global', + 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + series = props['podcastInfo'] if podcast else props['catchupInfo'] + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': [self._extract_audio(ep, series) for ep in traverse_obj( + series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], + 'categories': traverse_obj(series, ('categories', ..., 'name')) or None, + **traverse_obj(series, { + 'description': 'description', + 'thumbnail': 'imageUrl', + 'title': 'title', + 'uploader': 'itunesAuthor', # podcasts only + }), + } + + +class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)|catchup/\w+/\w+)/episodes/(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/', + 'info_dict': { + 'id': '7DrfNnE', + 'ext': 'mp3', + 'title': 'Filthy Ritual - Trailer', + 'description': 'md5:1f1562fd0f01b4773b590984f94223e0', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'duration': 225.0, + 'timestamp': 1681254900, + 'series': 'Filthy Ritual', + 'series_id': '42KuaM', + 'upload_date': '20230411', + 'uploader': 'Global', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/', + 'info_dict': { + 'id': '2zGq26Vcv1fCWhddC4JAwETXWe', + 'ext': 'm4a', + 'timestamp': 1682056800, + 'series': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + 'upload_date': '20230421', + 'series_id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'duration': 10800.0, + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + episode = props['podcastEpisode'] if podcast else props['catchupEpisode'] + + return self._extract_audio( + episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}) + + +class GlobalPlayerVideoIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P\w+)' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/', + 'info_dict': { + 'id': '2JsSZ7Gm2uP', + 'ext': 'mp4', + 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd', + 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550', + 'upload_date': '20230420', + 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._get_page_props(url, video_id)['videoData'] + + return { + 'id': video_id, + **traverse_obj(meta, { + 'url': 'url', + 'thumbnail': ('image', 'url'), + 'title': 'title', + 'upload_date': ('publish_date', {unified_strdate}), + 'description': 'description', + }), + } diff --git a/yt_dlp/extractor/gmanetwork.py b/yt_dlp/extractor/gmanetwork.py new file mode 100644 index 000000000..62fff4ead --- /dev/null +++ b/yt_dlp/extractor/gmanetwork.py @@ -0,0 +1,83 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from .youtube import YoutubeIE + + +class GMANetworkVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P\d+)/(?P[\w-]+)/video' + _TESTS = [{ + 'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home', + 'info_dict': { + 'id': '28BqW0AXPe0', + 'ext': 'mp4', + 'upload_date': '20220919', + 'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'like_count': int, + 'view_count': int, + 'uploader': 'YoüLOL', + 'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'duration': 5313, + 'comment_count': int, + 'tags': 'count:22', + 'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)', + 'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg', + 'release_timestamp': 1663594212, + 'age_limit': 0, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'description': 'md5:811bdcea74f9c48051824e494756e926', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'YoüLOL', + 'availability': 'public', + 'release_date': '20220919', + } + }, { + 'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home', + 'info_dict': { + 'id': 'yiDOExw2aSA', + 'ext': 'mp4', + 'live_status': 'not_live', + 'channel': 'GMANetwork', + 'like_count': int, + 'channel_follower_count': int, + 'description': 'md5:6d00cd658394fa1a5071200d3ed4be05', + 'duration': 1419, + 'age_limit': 0, + 'comment_count': int, + 'upload_date': '20181003', + 'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng', + 'title': 'More Than Words: Full Episode 80 (Finale)', + 'uploader_id': 'GMANETWORK', + 'categories': ['Entertainment'], + 'uploader': 'GMANetwork', + 'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng', + 'tags': 'count:29', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/user/GMANETWORK', + } + }] + + def _real_extract(self, url): + content_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + # webpage route + youtube_id = self._search_regex( + r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P[\w-]+)', webpage, 'youtube_id', fatal=False) + if youtube_id: + return self.url_result(youtube_id, YoutubeIE, youtube_id) + + # api call route + # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11 + network_url = self._search_regex( + r'NETWORK_URL\s*=\s*[\'"](?P[^\'"]+)', webpage, 'network_url') + json_data = self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id) + if json_data.get('video_file'): + return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file']) + else: + return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file']) diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py index b9370e36c..1ae0a6893 100644 --- a/yt_dlp/extractor/gronkh.py +++ b/yt_dlp/extractor/gronkh.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from ..utils import ( OnDemandPagedList, + float_or_none, traverse_obj, unified_strdate, ) @@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor): 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', - 'upload_date': '20221111' + 'upload_date': '20221111', + 'chapters': 'count:3', + 'duration': 31463, }, 'params': {'skip_download': True} }, { @@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor): 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', - 'upload_date': '20211001' + 'upload_date': '20211001', + 'duration': 32058, }, 'params': {'skip_download': True} }, { @@ -56,6 +60,12 @@ def _real_extract(self, url): 'upload_date': unified_strdate(data_json.get('created_at')), 'formats': formats, 'subtitles': subtitles, + 'duration': float_or_none(data_json.get('source_length')), + 'chapters': traverse_obj(data_json, ( + 'chapters', lambda _, v: float_or_none(v['offset']) is not None, { + 'title': 'title', + 'start_time': ('offset', {float_or_none}), + })) or None, } diff --git a/yt_dlp/extractor/hentaistigma.py b/yt_dlp/extractor/hentaistigma.py deleted file mode 100644 index ca5ffc2ae..000000000 --- a/yt_dlp/extractor/hentaistigma.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class HentaiStigmaIE(InfoExtractor): - _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P[^/]+)' - _TEST = { - 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/', - 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23', - 'info_dict': { - 'id': 'inyouchuu-etsu-bonus', - 'ext': 'mp4', - 'title': 'Inyouchuu Etsu Bonus', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r']+class="posttitle"[^>]*>]*>([^<]+)', - webpage, 'title') - wrap_url = self._html_search_regex( - r']+src="([^"]+mp4)"', webpage, 'wrapper url') - wrap_webpage = self._download_webpage(wrap_url, video_id) - - video_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/hollywoodreporter.py b/yt_dlp/extractor/hollywoodreporter.py new file mode 100644 index 000000000..1f7eb89bc --- /dev/null +++ b/yt_dlp/extractor/hollywoodreporter.py @@ -0,0 +1,72 @@ +import functools +import re + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from ..utils import ( + ExtractorError, + OnDemandPagedList, + extract_attributes, + get_element_by_class, + get_element_html_by_class, +) + + +class HollywoodReporterIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/', + 'info_dict': { + 'id': 'zH4jZaR5', + 'ext': 'mp4', + 'title': 'md5:a9a1c073770a32f178955997712c4bd9', + 'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720', + 'upload_date': '20230312', + 'timestamp': 1678586423, + 'duration': 242.0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + data = extract_attributes(get_element_html_by_class('vlanding-video-card__link', webpage) or '') + video_id = data['data-video-showcase-trigger'] + showcase_type = data['data-video-showcase-type'] + + if showcase_type == 'jwplayer': + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE) + elif showcase_type == 'youtube': + return self.url_result(video_id, 'Youtube') + else: + raise ExtractorError(f'Unsupported showcase type "{showcase_type}"') + + +class HollywoodReporterPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P[\w-]+)-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/', + 'playlist_mincount': 109, + 'info_dict': { + 'id': '57822', + 'title': 'heat-vision-breakdown', + } + }] + + def _fetch_page(self, slug, pl_id, page): + page += 1 + webpage = self._download_webpage( + f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page}/', + pl_id, note=f'Downloading playlist page {page}') + section = get_element_by_class('video-playlist-river', webpage) or '' + + for url in re.findall(r']+href="([^"]+)"[^>]+class="c-title__link', section): + yield self.url_result(url, HollywoodReporterIE) + + def _real_extract(self, url): + slug, pl_id = self._match_valid_url(url).group('slug', 'id') + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, slug, pl_id), 15), pl_id, slug) diff --git a/yt_dlp/extractor/hrefli.py b/yt_dlp/extractor/hrefli.py new file mode 100644 index 000000000..77db2ea68 --- /dev/null +++ b/yt_dlp/extractor/hrefli.py @@ -0,0 +1,15 @@ +from .common import InfoExtractor + + +class HrefLiRedirectIE(InfoExtractor): + IE_NAME = 'href.li' + IE_DESC = False # Do not list + _VALID_URL = r'https?://href\.li/\?(?P.+)' + + _TESTS = [{ + 'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result(self._match_valid_url(url).group('url')) diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index ec3e59c6d..a5aad26ee 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,239 +1,199 @@ -import itertools -import re +import functools import urllib.parse +import hashlib from .common import InfoExtractor from ..utils import ( + ExtractorError, + OnDemandPagedList, int_or_none, mimetype2ext, - remove_end, - strip_or_none, - unified_strdate, - url_or_none, - urljoin, + qualities, + traverse_obj, + unified_timestamp, ) -class IwaraBaseIE(InfoExtractor): - _BASE_REGEX = r'(?Phttps?://(?:www\.|ecchi\.)?iwara\.tv)' - - def _extract_playlist(self, base_url, webpage): - for path in re.findall(r'class="title">\s*[a-zA-Z0-9]+)' +class IwaraIE(InfoExtractor): + IE_NAME = 'iwara' + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', - # md5 is unstable + # this video cannot be played because of migration + 'only_matching': True, + 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq', 'info_dict': { - 'id': 'amVwUl1EHpAD9RD', + 'id': 'k2ayoueezfkx6gvq', 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, - 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png', - 'uploader': 'Reimu丨Action', - 'upload_date': '20150828', - 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f', + 'title': 'Defeat of Irybelda - アイリベルダの敗北', + 'description': 'md5:70278abebe706647a8b4cb04cf23e0d3', + 'uploader': 'Inwerwm', + 'uploader_id': 'inwerwm', + 'tags': 'count:1', + 'like_count': 6133, + 'view_count': 1050343, + 'comment_count': 1, + 'timestamp': 1677843869, + 'modified_timestamp': 1679056362, }, }, { - 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', - 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/', + 'md5': '20691ce1473ec2766c0788e14c60ce66', 'info_dict': { - 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', - 'ext': 'mp4', - 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', - 'age_limit': 18, - }, - 'add_ie': ['GoogleDrive'], - }, { - 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', - # md5 is unstable - 'info_dict': { - 'id': '6liAP9s2Ojc', + 'id': '1ywe1sbkqwumpdxz5', 'ext': 'mp4', 'age_limit': 18, - 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', - 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', - 'upload_date': '20160910', - 'uploader': 'aMMDsork', - 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + 'title': 'Aponia 阿波尼亚SEX Party Tonight 手动脱衣 大奶 裸腿', + 'description': 'md5:0c4c310f2e0592d68b9f771d348329ca', + 'uploader': '龙也zZZ', + 'uploader_id': 'user792540', + 'tags': [ + 'uncategorized' + ], + 'like_count': 1809, + 'view_count': 25156, + 'comment_count': 1, + 'timestamp': 1678732213, + 'modified_timestamp': 1679110271, }, - 'add_ie': ['Youtube'], }] + def _extract_formats(self, video_id, fileurl): + up = urllib.parse.urlparse(fileurl) + q = urllib.parse.parse_qs(up.query) + paths = up.path.rstrip('/').split('/') + # https://github.com/yt-dlp/yt-dlp/issues/6549#issuecomment-1473771047 + x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest() + + preference = qualities(['preview', '360', '540', 'Source']) + + files = self._download_json(fileurl, video_id, headers={'X-Version': x_version}) + for fmt in files: + yield traverse_obj(fmt, { + 'format_id': 'name', + 'url': ('src', ('view', 'download'), {self._proto_relative_url}), + 'ext': ('type', {mimetype2ext}), + 'quality': ('name', {preference}), + 'height': ('name', {int_or_none}), + }, get_all=False) + def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True) + errmsg = video_data.get('message') + # at this point we can actually get uploaded user info, but do we need it? + if errmsg == 'errors.privateVideo': + self.raise_login_required('Private video. Login if you have permissions to watch') + elif errmsg: + raise ExtractorError(f'Iwara says: {errmsg}') - webpage, urlh = self._download_webpage_handle(url, video_id) - - hostname = urllib.parse.urlparse(urlh.geturl()).hostname - # ecchi is 'sexy' in Japanese - age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 - - video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) - - if not video_data: - iframe_url = self._html_search_regex( - r']+src=([\'"])(?P[^\'"]+)\1', - webpage, 'iframe URL', group='url') - return { - '_type': 'url_transparent', - 'url': iframe_url, - 'age_limit': age_limit, - } - - title = remove_end(self._html_extract_title(webpage), ' | Iwara') - - thumbnail = self._html_search_regex( - r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) - - uploader = self._html_search_regex( - r'class="username">([^<]+)', webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._html_search_regex( - r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False)) - - description = strip_or_none(self._search_regex( - r'

(.+?(?=[^/?#&]+)' - IE_NAME = 'iwara:playlist' - - _TESTS = [{ - 'url': 'https://ecchi.iwara.tv/playlist/best-enf', - 'info_dict': { - 'title': 'Best enf', - 'uploader': 'Jared98112', - 'id': 'best-enf', - }, - 'playlist_mincount': 1097, - }, { - # urlencoded - 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', - 'info_dict': { - 'id': 'プレイリスト-2', - 'title': 'プレイリスト', - 'uploader': 'mainyu', - }, - 'playlist_mincount': 91, - }] - - def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) - webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), - 'uploader': self._html_search_regex(r'

([^<]+)', webpage, 'uploader', fatal=False), - 'entries': self._extract_playlist(base_url, webpage), - } - - -class IwaraUserIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P[^/?#&]+)' +class IwaraUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P[^/?#&]+)' IE_NAME = 'iwara:user' + _PER_PAGE = 32 _TESTS = [{ - 'note': 'number of all videos page is just 1 page. less than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'url': 'https://iwara.tv/profile/user792540/videos', 'info_dict': { - 'title': 'Uploaded videos from Infinity_YukariP', - 'id': 'infinityyukarip', - 'uploader': 'Infinity_YukariP', - 'uploader_id': 'infinityyukarip', + 'id': 'user792540', }, - 'playlist_mincount': 39, + 'playlist_mincount': 80, }, { - 'note': 'no even all videos page. probably less than 10 videos', - 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', + 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos', 'info_dict': { - 'title': 'Uploaded videos from mmd quintet', - 'id': 'mmd-quintet', - 'uploader': 'mmd quintet', - 'uploader_id': 'mmd-quintet', - }, - 'playlist_mincount': 6, - }, { - 'note': 'has paging. more than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', - 'info_dict': { - 'title': 'Uploaded videos from TheBlackbirdCalls', 'id': 'theblackbirdcalls', - 'uploader': 'TheBlackbirdCalls', - 'uploader_id': 'theblackbirdcalls', }, - 'playlist_mincount': 420, + 'playlist_mincount': 723, }, { - 'note': 'foreign chars in URL. there must be foreign characters in URL', - 'url': 'https://ecchi.iwara.tv/users/ぶた丼', - 'info_dict': { - 'title': 'Uploaded videos from ぶた丼', - 'id': 'ぶた丼', - 'uploader': 'ぶた丼', - 'uploader_id': 'ぶた丼', - }, - 'playlist_mincount': 170, + 'url': 'https://iwara.tv/profile/user792540', + 'only_matching': True, + }, { + 'url': 'https://iwara.tv/profile/theblackbirdcalls', + 'only_matching': True, }] - def _entries(self, playlist_id, base_url): - webpage = self._download_webpage( - f'{base_url}/users/{playlist_id}', playlist_id) - videos_url = self._search_regex(r'', webpage, 'all videos url', default=None) - if not videos_url: - yield from self._extract_playlist(base_url, webpage) - return - - videos_url = urljoin(base_url, videos_url) - - for n in itertools.count(1): - page = self._download_webpage( - videos_url, playlist_id, note=f'Downloading playlist page {n}', - query={'page': str(n - 1)} if n > 1 else {}) - yield from self._extract_playlist( - base_url, page) - - if f'page={n}' not in page: - break + def _entries(self, playlist_id, user_id, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, + note=f'Downloading page {page}', + query={ + 'page': page, + 'sort': 'date', + 'user': user_id, + 'limit': self._PER_PAGE, + }) + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) + playlist_id = self._match_id(url) + user_info = self._download_json( + f'https://api.iwara.tv/profile/{playlist_id}', playlist_id, + note='Requesting user info') + user_id = traverse_obj(user_info, ('user', 'id')) return self.playlist_result( - self._entries(playlist_id, base_url), playlist_id) + OnDemandPagedList( + functools.partial(self._entries, playlist_id, user_id), + self._PER_PAGE), + playlist_id, traverse_obj(user_info, ('user', 'name'))) + + +class IwaraPlaylistIE(InfoExtractor): + # the ID is an UUID but I don't think it's necessary to write concrete regex + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P[0-9a-f-]+)' + IE_NAME = 'iwara:playlist' + _PER_PAGE = 32 + + _TESTS = [{ + 'url': 'https://iwara.tv/playlist/458e5486-36a4-4ac0-b233-7e9eef01025f', + 'info_dict': { + 'id': '458e5486-36a4-4ac0-b233-7e9eef01025f', + }, + 'playlist_mincount': 3, + }] + + def _entries(self, playlist_id, first_page, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}', + query={'page': page, 'limit': self._PER_PAGE}) if page else first_page + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page_0 = self._download_json( + f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id, + note='Requesting playlist info') + + return self.playlist_result( + OnDemandPagedList( + functools.partial(self._entries, playlist_id, page_0), + self._PER_PAGE), + playlist_id, traverse_obj(page_0, ('title', 'name'))) diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index c94968943..bc47aa6d3 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -8,14 +8,16 @@ class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'md5': '3aa16e4f6860e6e78b7df5829519aed3', 'info_dict': { 'id': 'nPripu9l', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Big Buck Bunny Trailer', 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', 'upload_date': '20081127', 'timestamp': 1227796140, + 'duration': 32.0, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720', } }, { 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', @@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor): }, }, { # Player url not surrounded by quotes - 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip', 'info_dict': { - 'id': 'R10NQdhY', - 'title': 'Playgirl', + 'id': 'jUxh5uin', + 'title': 'Klassenfahrt', 'ext': 'mp4', - 'upload_date': '20220624', - 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', - 'timestamp': 1656064800, - 'description': 'BRD 1966, Will Tremper', - 'duration': 5146.0, + 'upload_date': '20230109', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720', + 'timestamp': 1673270298, + 'description': '', + 'duration': 5193.0, }, 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }, { + # iframe src attribute includes backslash before URL string + 'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion', + 'info_dict': { + 'id': 'QD3gsexj', + 'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios', + 'ext': 'mp4', + 'upload_date': '20230127', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720', + 'timestamp': 1674862986, + 'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4', + 'duration': 263.0, + }, }] @classmethod @@ -57,7 +72,7 @@ def _extract_embed_urls(cls, url, webpage): # is used by hyland.com # if we find