diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index a086166948..f3578efe10 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -684,17 +684,186 @@ def test_parse_m3u8_formats(self):
'width': 1920,
'height': 1080,
'vcodec': 'avc1.64002a',
- }]
+ }],
+ {}
+ ),
+ (
+ 'bipbop_16x9',
+ 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8',
+ [{
+ "format_id": "bipbop_audio-BipBop Audio 2",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/alternate_audio_aac/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "language": "eng",
+ "ext": "mp4",
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "vcodec": "none",
+ "audio_ext": "mp4",
+ "video_ext": "none",
+ }, {
+ "format_id": "41",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear0/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 41.457,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "vcodec": "none",
+ "acodec": "mp4a.40.2",
+ "audio_ext": "mp4",
+ "video_ext": "none",
+ "abr": 41.457,
+ }, {
+ "format_id": "263",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear1/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 263.851,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "width": 416,
+ "height": 234,
+ "vcodec": "avc1.4d400d",
+ "acodec": "mp4a.40.2",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 263.851,
+ "abr": 0,
+ }, {
+ "format_id": "577",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear2/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 577.61,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "width": 640,
+ "height": 360,
+ "vcodec": "avc1.4d401e",
+ "acodec": "mp4a.40.2",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 577.61,
+ "abr": 0,
+ }, {
+ "format_id": "915",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear3/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 915.905,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "width": 960,
+ "height": 540,
+ "vcodec": "avc1.4d401f",
+ "acodec": "mp4a.40.2",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 915.905,
+ "abr": 0,
+ }, {
+ "format_id": "1030",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear4/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 1030.138,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "width": 1280,
+ "height": 720,
+ "vcodec": "avc1.4d401f",
+ "acodec": "mp4a.40.2",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 1030.138,
+ "abr": 0,
+ }, {
+ "format_id": "1924",
+ "format_index": None,
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear5/prog_index.m3u8",
+ "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
+ "tbr": 1924.009,
+ "ext": "mp4",
+ "fps": None,
+ "protocol": "m3u8",
+ "preference": None,
+ "quality": None,
+ "width": 1920,
+ "height": 1080,
+ "vcodec": "avc1.4d401f",
+ "acodec": "mp4a.40.2",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 1924.009,
+ "abr": 0,
+ }],
+ {
+ "en": [{
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }, {
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng_forced/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }],
+ "fr": [{
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }, {
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra_forced/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }],
+ "es": [{
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }, {
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa_forced/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }],
+ "ja": [{
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }, {
+ "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn_forced/prog_index.m3u8",
+ "ext": "vtt",
+ "protocol": "m3u8_native"
+ }],
+ }
),
]
- for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
+ for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
mode='r', encoding='utf-8') as f:
- formats = self.ie._parse_m3u8_formats(
+ formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
f.read(), m3u8_url, ext='mp4')
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
+ expect_value(self, subs, expected_subs, None)
def test_parse_mpd_formats(self):
_TEST_CASES = [
@@ -780,7 +949,8 @@ def test_parse_mpd_formats(self):
'tbr': 5997.485,
'width': 1920,
'height': 1080,
- }]
+ }],
+ {},
), (
# https://github.com/ytdl-org/youtube-dl/pull/14844
'urls_only',
@@ -863,7 +1033,8 @@ def test_parse_mpd_formats(self):
'tbr': 4400,
'width': 1920,
'height': 1080,
- }]
+ }],
+ {},
), (
# https://github.com/ytdl-org/youtube-dl/issues/20346
# Media considered unfragmented even though it contains
@@ -909,18 +1080,328 @@ def test_parse_mpd_formats(self):
'width': 360,
'height': 360,
'fps': 30,
- }]
+ }],
+ {},
+ ), (
+ 'subtitles',
+ 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
+ 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/',
+ [{
+ "format_id": "audio=128001",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "m4a",
+ "tbr": 128.001,
+ "asr": 48000,
+ "format_note": "DASH audio",
+ "container": "m4a_dash",
+ "vcodec": "none",
+ "acodec": "mp4a.40.2",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "audio_ext": "m4a",
+ "video_ext": "none",
+ "abr": 128.001,
+ }, {
+ "format_id": "video=100000",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "mp4",
+ "width": 336,
+ "height": 144,
+ "tbr": 100,
+ "format_note": "DASH video",
+ "container": "mp4_dash",
+ "vcodec": "avc1.4D401F",
+ "acodec": "none",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 100,
+ }, {
+ "format_id": "video=326000",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "mp4",
+ "width": 562,
+ "height": 240,
+ "tbr": 326,
+ "format_note": "DASH video",
+ "container": "mp4_dash",
+ "vcodec": "avc1.4D401F",
+ "acodec": "none",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 326,
+ }, {
+ "format_id": "video=698000",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "mp4",
+ "width": 844,
+ "height": 360,
+ "tbr": 698,
+ "format_note": "DASH video",
+ "container": "mp4_dash",
+ "vcodec": "avc1.4D401F",
+ "acodec": "none",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 698,
+ }, {
+ "format_id": "video=1493000",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "mp4",
+ "width": 1126,
+ "height": 480,
+ "tbr": 1493,
+ "format_note": "DASH video",
+ "container": "mp4_dash",
+ "vcodec": "avc1.4D401F",
+ "acodec": "none",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 1493,
+ }, {
+ "format_id": "video=4482000",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "ext": "mp4",
+ "width": 1688,
+ "height": 720,
+ "tbr": 4482,
+ "format_note": "DASH video",
+ "container": "mp4_dash",
+ "vcodec": "avc1.4D401F",
+ "acodec": "none",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ "video_ext": "mp4",
+ "audio_ext": "none",
+ "vbr": 4482,
+ }],
+ {
+ "en": [
+ {
+ "ext": "mp4",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
+ "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
+ "protocol": "http_dash_segments",
+ }
+ ]
+ },
)
]
- for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES:
+ for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f:
- formats = self.ie._parse_mpd_formats(
+ formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')),
mpd_base_url=mpd_base_url, mpd_url=mpd_url)
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
+ expect_value(self, subtitles, expected_subtitles, None)
+
+ def test_parse_ism_formats(self):
+ _TEST_CASES = [
+ (
+ 'sintel',
+ 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
+ [{
+ "format_id": "audio-128",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "isma",
+ "tbr": 128,
+ "asr": 48000,
+ "vcodec": "none",
+ "acodec": "AACL",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "audio",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 0,
+ "height": 0,
+ "fourcc": "AACL",
+ "codec_private_data": "1190",
+ "sampling_rate": 48000,
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "audio_ext": "isma",
+ "video_ext": "none",
+ "abr": 128,
+ }, {
+ "format_id": "video-100",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "ismv",
+ "width": 336,
+ "height": 144,
+ "tbr": 100,
+ "vcodec": "AVC1",
+ "acodec": "none",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "video",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 336,
+ "height": 144,
+ "fourcc": "AVC1",
+ "codec_private_data": "00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8",
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "video_ext": "ismv",
+ "audio_ext": "none",
+ "vbr": 100,
+ }, {
+ "format_id": "video-326",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "ismv",
+ "width": 562,
+ "height": 240,
+ "tbr": 326,
+ "vcodec": "AVC1",
+ "acodec": "none",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "video",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 562,
+ "height": 240,
+ "fourcc": "AVC1",
+ "codec_private_data": "00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8",
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "video_ext": "ismv",
+ "audio_ext": "none",
+ "vbr": 326,
+ }, {
+ "format_id": "video-698",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "ismv",
+ "width": 844,
+ "height": 360,
+ "tbr": 698,
+ "vcodec": "AVC1",
+ "acodec": "none",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "video",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 844,
+ "height": 360,
+ "fourcc": "AVC1",
+ "codec_private_data": "00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8",
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "video_ext": "ismv",
+ "audio_ext": "none",
+ "vbr": 698,
+ }, {
+ "format_id": "video-1493",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "ismv",
+ "width": 1126,
+ "height": 480,
+ "tbr": 1493,
+ "vcodec": "AVC1",
+ "acodec": "none",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "video",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 1126,
+ "height": 480,
+ "fourcc": "AVC1",
+ "codec_private_data": "00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8",
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "video_ext": "ismv",
+ "audio_ext": "none",
+ "vbr": 1493,
+ }, {
+ "format_id": "video-4482",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "ext": "ismv",
+ "width": 1688,
+ "height": 720,
+ "tbr": 4482,
+ "vcodec": "AVC1",
+ "acodec": "none",
+ "protocol": "ism",
+ "_download_params": {
+ "stream_type": "video",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "width": 1688,
+ "height": 720,
+ "fourcc": "AVC1",
+ "codec_private_data": "00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8",
+ "channels": 2,
+ "bits_per_sample": 16,
+ "nal_unit_length_field": 4
+ },
+ "video_ext": "ismv",
+ "audio_ext": "none",
+ "vbr": 4482,
+ }],
+ {
+ "eng": [
+ {
+ "ext": "ismt",
+ "protocol": "ism",
+ "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
+ "_download_params": {
+ "stream_type": "text",
+ "duration": 8880746666,
+ "timescale": 10000000,
+ "fourcc": "TTML",
+ "codec_private_data": ""
+ }
+ }
+ ]
+ },
+ ),
+ ]
+
+ for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES:
+ with io.open('./test/testdata/ism/%s.Manifest' % ism_file,
+ mode='r', encoding='utf-8') as f:
+ formats, subtitles = self.ie._parse_ism_formats_and_subtitles(
+ compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url)
+ self.ie._sort_formats(formats)
+ expect_value(self, formats, expected_formats, None)
+ expect_value(self, subtitles, expected_subtitles, None)
def test_parse_f4m_formats(self):
_TEST_CASES = [
diff --git a/test/testdata/ism/sintel.Manifest b/test/testdata/ism/sintel.Manifest
new file mode 100644
index 0000000000..2ff8c24478
--- /dev/null
+++ b/test/testdata/ism/sintel.Manifest
@@ -0,0 +1,988 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/testdata/m3u8/bipbop_16x9.m3u8 b/test/testdata/m3u8/bipbop_16x9.m3u8
new file mode 100644
index 0000000000..1ce87dd041
--- /dev/null
+++ b/test/testdata/m3u8/bipbop_16x9.m3u8
@@ -0,0 +1,38 @@
+#EXTM3U
+
+#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 1",AUTOSELECT=YES,DEFAULT=YES
+#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 2",AUTOSELECT=NO,DEFAULT=NO,URI="alternate_audio_aac/prog_index.m3u8"
+
+
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,LANGUAGE="en",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/eng/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="en",URI="subtitles/eng_forced/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="fr",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/fra/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="fr",URI="subtitles/fra_forced/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="es",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/spa/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="es",URI="subtitles/spa_forced/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="ja",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/jpn/prog_index.m3u8"
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語 (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="ja",URI="subtitles/jpn_forced/prog_index.m3u8"
+
+
+#EXT-X-STREAM-INF:BANDWIDTH=263851,CODECS="mp4a.40.2, avc1.4d400d",RESOLUTION=416x234,AUDIO="bipbop_audio",SUBTITLES="subs"
+gear1/prog_index.m3u8
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=28451,CODECS="avc1.4d400d",URI="gear1/iframe_index.m3u8"
+
+#EXT-X-STREAM-INF:BANDWIDTH=577610,CODECS="mp4a.40.2, avc1.4d401e",RESOLUTION=640x360,AUDIO="bipbop_audio",SUBTITLES="subs"
+gear2/prog_index.m3u8
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=181534,CODECS="avc1.4d401e",URI="gear2/iframe_index.m3u8"
+
+#EXT-X-STREAM-INF:BANDWIDTH=915905,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=960x540,AUDIO="bipbop_audio",SUBTITLES="subs"
+gear3/prog_index.m3u8
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=297056,CODECS="avc1.4d401f",URI="gear3/iframe_index.m3u8"
+
+#EXT-X-STREAM-INF:BANDWIDTH=1030138,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1280x720,AUDIO="bipbop_audio",SUBTITLES="subs"
+gear4/prog_index.m3u8
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=339492,CODECS="avc1.4d401f",URI="gear4/iframe_index.m3u8"
+
+#EXT-X-STREAM-INF:BANDWIDTH=1924009,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1920x1080,AUDIO="bipbop_audio",SUBTITLES="subs"
+gear5/prog_index.m3u8
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=669554,CODECS="avc1.4d401f",URI="gear5/iframe_index.m3u8"
+
+#EXT-X-STREAM-INF:BANDWIDTH=41457,CODECS="mp4a.40.2",AUDIO="bipbop_audio",SUBTITLES="subs"
+gear0/prog_index.m3u8
diff --git a/test/testdata/mpd/subtitles.mpd b/test/testdata/mpd/subtitles.mpd
new file mode 100644
index 0000000000..6f948adba9
--- /dev/null
+++ b/test/testdata/mpd/subtitles.mpd
@@ -0,0 +1,351 @@
+
+
+
+
+ dash/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py
index 3ebf1ee7a6..863bd2287c 100644
--- a/yt_dlp/compat.py
+++ b/yt_dlp/compat.py
@@ -3018,10 +3018,24 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
return ctypes.WINFUNCTYPE(*args, **kwargs)
+try:
+ compat_Pattern = re.Pattern
+except AttributeError:
+ compat_Pattern = type(re.compile(''))
+
+
+try:
+ compat_Match = re.Match
+except AttributeError:
+ compat_Match = type(re.compile('').match(''))
+
+
__all__ = [
'compat_HTMLParseError',
'compat_HTMLParser',
'compat_HTTPError',
+ 'compat_Match',
+ 'compat_Pattern',
'compat_Struct',
'compat_b64decode',
'compat_basestring',
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index a0c1d13ac2..fadd0dfc5f 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -77,7 +77,10 @@ def _read_ytdl_file(self, ctx):
assert 'ytdl_corrupt' not in ctx
stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
try:
- ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+ ytdl_data = json.loads(stream.read())
+ ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
+ if 'extra_state' in ytdl_data['downloader']:
+ ctx['extra_state'] = ytdl_data['downloader']['extra_state']
except Exception:
ctx['ytdl_corrupt'] = True
finally:
@@ -90,6 +93,8 @@ def _write_ytdl_file(self, ctx):
'index': ctx['fragment_index'],
},
}
+ if 'extra_state' in ctx:
+ downloader['extra_state'] = ctx['extra_state']
if ctx.get('fragment_count') is not None:
downloader['fragment_count'] = ctx['fragment_count']
frag_index_stream.write(json.dumps({'downloader': downloader}))
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py
index f4e41a6c7b..270b33b22e 100644
--- a/yt_dlp/downloader/hls.py
+++ b/yt_dlp/downloader/hls.py
@@ -2,6 +2,7 @@
import errno
import re
+import io
import binascii
try:
from Crypto.Cipher import AES
@@ -27,7 +28,9 @@
parse_m3u8_attributes,
sanitize_open,
update_url_query,
+ bug_reports_message,
)
+from .. import webvtt
class HlsFD(FragmentFD):
@@ -78,6 +81,8 @@ def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+ is_webvtt = info_dict['ext'] == 'vtt'
+
urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
man_url = urlh.geturl()
s = urlh.read().decode('utf-8', 'ignore')
@@ -142,6 +147,8 @@ def is_ad_fragment_end(s):
else:
self._prepare_and_start_frag_download(ctx)
+ extra_state = ctx.setdefault('extra_state', {})
+
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
test = self.params.get('test', False)
@@ -308,6 +315,76 @@ def download_fragment(fragment):
return frag_content, frag_index
+ pack_fragment = lambda frag_content, _: frag_content
+
+ if is_webvtt:
+ def pack_fragment(frag_content, frag_index):
+ output = io.StringIO()
+ adjust = 0
+ for block in webvtt.parse_fragment(frag_content):
+ if isinstance(block, webvtt.CueBlock):
+ block.start += adjust
+ block.end += adjust
+
+ dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
+ cue = block.as_json
+
+ # skip the cue if an identical one appears
+ # in the window of potential duplicates
+ # and prune the window of unviable candidates
+ i = 0
+ skip = True
+ while i < len(dedup_window):
+ window_cue = dedup_window[i]
+ if window_cue == cue:
+ break
+ if window_cue['end'] >= cue['start']:
+ i += 1
+ continue
+ del dedup_window[i]
+ else:
+ skip = False
+
+ if skip:
+ continue
+
+ # add the cue to the window
+ dedup_window.append(cue)
+ elif isinstance(block, webvtt.Magic):
+ # take care of MPEG PES timestamp overflow
+ if block.mpegts is None:
+ block.mpegts = 0
+ extra_state.setdefault('webvtt_mpegts_adjust', 0)
+ block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
+ if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
+ extra_state['webvtt_mpegts_adjust'] += 1
+ block.mpegts += 1 << 33
+ extra_state['webvtt_mpegts_last'] = block.mpegts
+
+ if frag_index == 1:
+ extra_state['webvtt_mpegts'] = block.mpegts or 0
+ extra_state['webvtt_local'] = block.local or 0
+ # XXX: block.local = block.mpegts = None ?
+ else:
+ if block.mpegts is not None and block.local is not None:
+ adjust = (
+ (block.mpegts - extra_state.get('webvtt_mpegts', 0))
+ - (block.local - extra_state.get('webvtt_local', 0))
+ )
+ continue
+ elif isinstance(block, webvtt.HeaderBlock):
+ if frag_index != 1:
+ # XXX: this should probably be silent as well
+ # or verify that all segments contain the same data
+ self.report_warning(bug_reports_message(
+ 'Discarding a %s block found in the middle of the stream; '
+ 'if the subtitles display incorrectly,'
+ % (type(block).__name__)))
+ continue
+ block.write_into(output)
+
+ return output.getvalue().encode('utf-8')
+
def append_fragment(frag_content, frag_index):
if frag_content:
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
@@ -315,6 +392,7 @@ def append_fragment(frag_content, frag_index):
file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
file.close()
+ frag_content = pack_fragment(frag_content, frag_index)
self._append_fragment(ctx, frag_content)
return True
except EnvironmentError as ose:
diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py
index 1ca666b4a1..07d74aef0b 100644
--- a/yt_dlp/downloader/ism.py
+++ b/yt_dlp/downloader/ism.py
@@ -48,7 +48,7 @@ def write_piff_header(stream, params):
language = params.get('language', 'und')
height = params.get('height', 0)
width = params.get('width', 0)
- is_audio = width == 0 and height == 0
+ stream_type = params['stream_type']
creation_time = modification_time = int(time.time())
ftyp_payload = b'isml' # major brand
@@ -77,7 +77,7 @@ def write_piff_header(stream, params):
tkhd_payload += u32.pack(0) * 2 # reserved
tkhd_payload += s16.pack(0) # layer
tkhd_payload += s16.pack(0) # alternate group
- tkhd_payload += s88.pack(1 if is_audio else 0) # volume
+ tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume
tkhd_payload += u16.pack(0) # reserved
tkhd_payload += unity_matrix
tkhd_payload += u1616.pack(width)
@@ -93,19 +93,34 @@ def write_piff_header(stream, params):
mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box
hdlr_payload = u32.pack(0) # pre defined
- hdlr_payload += b'soun' if is_audio else b'vide' # handler type
- hdlr_payload += u32.pack(0) * 3 # reserved
- hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name
+ if stream_type == 'audio': # handler type
+ hdlr_payload += b'soun'
+ hdlr_payload += u32.pack(0) * 3 # reserved
+ hdlr_payload += b'SoundHandler\0' # name
+ elif stream_type == 'video':
+ hdlr_payload += b'vide'
+ hdlr_payload += u32.pack(0) * 3 # reserved
+ hdlr_payload += b'VideoHandler\0' # name
+ elif stream_type == 'text':
+ hdlr_payload += b'subt'
+ hdlr_payload += u32.pack(0) * 3 # reserved
+ hdlr_payload += b'SubtitleHandler\0' # name
+ else:
+ assert False
mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box
- if is_audio:
+ if stream_type == 'audio':
smhd_payload = s88.pack(0) # balance
smhd_payload += u16.pack(0) # reserved
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header
- else:
+ elif stream_type == 'video':
vmhd_payload = u16.pack(0) # graphics mode
vmhd_payload += u16.pack(0) * 3 # opcolor
media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header
+ elif stream_type == 'text':
+ media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header
+ else:
+ assert False
minf_payload = media_header_box
dref_payload = u32.pack(1) # entry count
@@ -117,7 +132,7 @@ def write_piff_header(stream, params):
sample_entry_payload = u8.pack(0) * 6 # reserved
sample_entry_payload += u16.pack(1) # data reference index
- if is_audio:
+ if stream_type == 'audio':
sample_entry_payload += u32.pack(0) * 2 # reserved
sample_entry_payload += u16.pack(params.get('channels', 2))
sample_entry_payload += u16.pack(params.get('bits_per_sample', 16))
@@ -127,7 +142,7 @@ def write_piff_header(stream, params):
if fourcc == 'AACL':
sample_entry_box = box(b'mp4a', sample_entry_payload)
- else:
+ elif stream_type == 'video':
sample_entry_payload += u16.pack(0) # pre defined
sample_entry_payload += u16.pack(0) # reserved
sample_entry_payload += u32.pack(0) * 3 # pre defined
@@ -155,6 +170,18 @@ def write_piff_header(stream, params):
avcc_payload += pps
sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record
sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry
+ else:
+ assert False
+ elif stream_type == 'text':
+ if fourcc == 'TTML':
+ sample_entry_payload += b'http://www.w3.org/ns/ttml\0' # namespace
+ sample_entry_payload += b'\0' # schema location
+ sample_entry_payload += b'\0' # auxilary mime types(??)
+ sample_entry_box = box(b'stpp', sample_entry_payload)
+ else:
+ assert False
+ else:
+ assert False
stsd_payload += sample_entry_box
stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box
@@ -221,10 +248,13 @@ def real_download(self, filename, info_dict):
self._prepare_and_start_frag_download(ctx)
+ extra_state = ctx.setdefault('extra_state', {
+ 'ism_track_written': False,
+ })
+
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
- track_written = False
frag_index = 0
for i, segment in enumerate(segments):
frag_index += 1
@@ -236,11 +266,11 @@ def real_download(self, filename, info_dict):
success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
- if not track_written:
+ if not extra_state['ism_track_written']:
tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
- track_written = True
+ extra_state['ism_track_written'] = True
self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py
index c2cec98452..4afde8f90e 100644
--- a/yt_dlp/extractor/atresplayer.py
+++ b/yt_dlp/extractor/atresplayer.py
@@ -86,18 +86,19 @@ def _real_extract(self, url):
title = episode['titulo']
formats = []
+ subtitles = {}
for source in episode.get('sources', []):
src = source.get('src')
if not src:
continue
src_type = source.get('type')
if src_type == 'application/vnd.apple.mpegurl':
- formats.extend(self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats(
src, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
elif src_type == 'application/dash+xml':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id='dash', fatal=False))
+ formats, subtitles = self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False)
self._sort_formats(formats)
heartbeat = episode.get('heartbeat') or {}
@@ -115,4 +116,5 @@ def _real_extract(self, url):
'channel': get_meta('channel'),
'season': get_meta('season'),
'episode_number': int_or_none(get_meta('episodeNumber')),
+ 'subtitles': subtitles,
}
diff --git a/yt_dlp/extractor/byutv.py b/yt_dlp/extractor/byutv.py
index 0b11bf11fc..7c6c826d7c 100644
--- a/yt_dlp/extractor/byutv.py
+++ b/yt_dlp/extractor/byutv.py
@@ -82,6 +82,7 @@ def _real_extract(self, url):
info = {}
formats = []
+ subtitles = {}
for format_id, ep in video.items():
if not isinstance(ep, dict):
continue
@@ -90,12 +91,16 @@ def _real_extract(self, url):
continue
ext = determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- video_url, video_id, mpd_id='dash', fatal=False))
+ mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ video_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_fmts)
+ subtitles = self._merge_subtitles(subtitles, mpd_subs)
else:
formats.append({
'url': video_url,
@@ -114,4 +119,5 @@ def _real_extract(self, url):
'display_id': display_id,
'title': display_id,
'formats': formats,
+ 'subtitles': subtitles,
})
diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py
index eefbab241b..1b7c1d2ff7 100644
--- a/yt_dlp/extractor/canvas.py
+++ b/yt_dlp/extractor/canvas.py
@@ -83,24 +83,31 @@ def _real_extract(self, url):
description = data.get('description')
formats = []
+ subtitles = {}
for target in data['targetUrls']:
format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
if not format_url or not format_type:
continue
format_type = format_type.upper()
if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
- m3u8_id=format_type, fatal=False))
+ m3u8_id=format_type, fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_type, fatal=False))
elif format_type == 'MPEG_DASH':
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id=format_type, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id=format_type, fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HSS':
- formats.extend(self._extract_ism_formats(
- format_url, video_id, ism_id='mss', fatal=False))
+ fmts, subs = self._extract_ism_formats_and_subtitles(
+ format_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
else:
formats.append({
'format_id': format_type,
@@ -108,7 +115,6 @@ def _real_extract(self, url):
})
self._sort_formats(formats)
- subtitles = {}
subtitle_urls = data.get('subtitleUrls')
if isinstance(subtitle_urls, list):
for subtitle in subtitle_urls:
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 4487c53756..2ca25951b2 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1879,11 +1879,21 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m
'format_note': 'Quality selection URL',
}
- def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
- entry_protocol='m3u8', preference=None, quality=None,
- m3u8_id=None, note=None, errnote=None,
- fatal=True, live=False, data=None, headers={},
- query={}):
+ def _extract_m3u8_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self.report_warning(bug_reports_message(
+ "Ignoring subtitle tracks found in the HLS manifest; "
+ "if any subtitle tracks are missing,"
+ ))
+ return fmts
+
+ def _extract_m3u8_formats_and_subtitles(
+ self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
+ preference=None, quality=None, m3u8_id=None, note=None,
+ errnote=None, fatal=True, live=False, data=None, headers={},
+ query={}):
+
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
@@ -1891,30 +1901,34 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
- return self._parse_m3u8_formats(
+ return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, quality=quality, m3u8_id=m3u8_id,
note=note, errnote=errnote, fatal=fatal, live=live, data=data,
headers=headers, query=query, video_id=video_id)
- def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
- entry_protocol='m3u8', preference=None, quality=None,
- m3u8_id=None, live=False, note=None, errnote=None,
- fatal=True, data=None, headers={}, query={}, video_id=None):
+ def _parse_m3u8_formats_and_subtitles(
+ self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
+ preference=None, quality=None, m3u8_id=None, live=False, note=None,
+ errnote=None, fatal=True, data=None, headers={}, query={},
+ video_id=None):
+
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
- return []
+ return [], {}
if (not self._downloader.params.get('allow_unplayable_formats')
and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
- return []
+ return [], {}
formats = []
+ subtitles = {}
+
format_url = lambda u: (
u
if re.match(r'^https?://', u)
@@ -2001,7 +2015,7 @@ def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None
}
formats.append(f)
- return formats
+ return formats, subtitles
groups = {}
last_stream_inf = {}
@@ -2013,6 +2027,21 @@ def extract_media(x_media_line):
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
+ #
+ if media_type == 'SUBTITLES':
+ lang = media['LANGUAGE'] # XXX: normalise?
+ url = format_url(media['URI'])
+ sub_info = {
+ 'url': url,
+ 'ext': determine_ext(url),
+ }
+ if sub_info['ext'] == 'm3u8':
+ # Per RFC 8216 §3.1, the only possible subtitle format m3u8
+ # files may contain is WebVTT:
+ #
+ sub_info['ext'] = 'vtt'
+ sub_info['protocol'] = 'm3u8_native'
+ subtitles.setdefault(lang, []).append(sub_info)
if media_type not in ('VIDEO', 'AUDIO'):
return
media_url = media.get('URI')
@@ -2160,7 +2189,7 @@ def build_stream_name():
formats.append(http_f)
last_stream_inf = {}
- return formats
+ return formats, subtitles
@staticmethod
def _xpath_ns(path, namespace=None):
@@ -2403,23 +2432,44 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ def _extract_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self.report_warning(bug_reports_message(
+ "Ignoring subtitle tracks found in the DASH manifest; "
+ "if any subtitle tracks are missing,"
+ ))
+ return fmts
+
+ def _extract_mpd_formats_and_subtitles(
+ self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
+ fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
mpd_doc, urlh = res
if mpd_doc is None:
- return []
+ return [], {}
mpd_base_url = base_url(urlh.geturl())
- return self._parse_mpd_formats(
+ return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
- def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+ def _parse_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self.report_warning(bug_reports_message(
+ "Ignoring subtitle tracks found in the DASH manifest; "
+ "if any subtitle tracks are missing,"
+ ))
+ return fmts
+
+ def _parse_mpd_formats_and_subtitles(
+ self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -2429,7 +2479,7 @@ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None
"""
if not self._downloader.params.get('dynamic_mpd', True):
if mpd_doc.get('type') == 'dynamic':
- return []
+ return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@@ -2501,6 +2551,7 @@ def extract_Initialization(source):
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = []
+ subtitles = {}
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
@@ -2518,11 +2569,9 @@ def extract_Initialization(source):
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
- content_type = mime_type.split('/')[0]
- if content_type == 'text':
- # TODO implement WebVTT downloading
- pass
- elif content_type in ('video', 'audio'):
+ content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
+
+ if content_type in ('video', 'audio', 'text'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
@@ -2539,21 +2588,28 @@ def extract_Initialization(source):
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
- f = {
- 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
- 'manifest_url': mpd_url,
- 'ext': mimetype2ext(mime_type),
- 'width': int_or_none(representation_attrib.get('width')),
- 'height': int_or_none(representation_attrib.get('height')),
- 'tbr': float_or_none(bandwidth, 1000),
- 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
- 'fps': int_or_none(representation_attrib.get('frameRate')),
- 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
- 'format_note': 'DASH %s' % content_type,
- 'filesize': filesize,
- 'container': mimetype2ext(mime_type) + '_dash',
- }
- f.update(parse_codecs(representation_attrib.get('codecs')))
+ if content_type in ('video', 'audio'):
+ f = {
+ 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+ 'manifest_url': mpd_url,
+ 'ext': mimetype2ext(mime_type),
+ 'width': int_or_none(representation_attrib.get('width')),
+ 'height': int_or_none(representation_attrib.get('height')),
+ 'tbr': float_or_none(bandwidth, 1000),
+ 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
+ 'fps': int_or_none(representation_attrib.get('frameRate')),
+ 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
+ 'format_note': 'DASH %s' % content_type,
+ 'filesize': filesize,
+ 'container': mimetype2ext(mime_type) + '_dash',
+ }
+ f.update(parse_codecs(representation_attrib.get('codecs')))
+ elif content_type == 'text':
+ f = {
+ 'ext': mimetype2ext(mime_type),
+ 'manifest_url': mpd_url,
+ 'filesize': filesize,
+ }
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
@@ -2700,26 +2756,38 @@ def add_segment_url():
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
- formats.append(f)
+ if content_type in ('video', 'audio'):
+ formats.append(f)
+ elif content_type == 'text':
+ subtitles.setdefault(lang or 'und', []).append(f)
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
- return formats
+ return formats, subtitles
- def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ def _extract_ism_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self.report_warning(bug_reports_message(
+ "Ignoring subtitle tracks found in the ISM manifest; "
+ "if any subtitle tracks are missing,"
+ ))
+ return fmts
+
+ def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
ism_url, video_id,
note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest',
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return []
+ return [], {}
ism_doc, urlh = res
if ism_doc is None:
- return []
+ return [], {}
- return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
- def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
Parse formats from ISM manifest.
References:
@@ -2727,26 +2795,28 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
https://msdn.microsoft.com/en-us/library/ff469518.aspx
"""
if ism_doc.get('IsLive') == 'TRUE':
- return []
+ return [], {}
if (not self._downloader.params.get('allow_unplayable_formats')
and ism_doc.find('Protection') is not None):
- return []
+ return [], {}
duration = int(ism_doc.attrib['Duration'])
timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
formats = []
+ subtitles = {}
for stream in ism_doc.findall('StreamIndex'):
stream_type = stream.get('Type')
- if stream_type not in ('video', 'audio'):
+ if stream_type not in ('video', 'audio', 'text'):
continue
url_pattern = stream.attrib['Url']
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
+ stream_language = stream.get('Language', 'und')
for track in stream.findall('QualityLevel'):
fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
# TODO: add support for WVC1 and WMAP
- if fourcc not in ('H264', 'AVC1', 'AACL'):
+ if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
@@ -2789,33 +2859,52 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
format_id.append(stream_name)
format_id.append(compat_str(tbr))
- formats.append({
- 'format_id': '-'.join(format_id),
- 'url': ism_url,
- 'manifest_url': ism_url,
- 'ext': 'ismv' if stream_type == 'video' else 'isma',
- 'width': width,
- 'height': height,
- 'tbr': tbr,
- 'asr': sampling_rate,
- 'vcodec': 'none' if stream_type == 'audio' else fourcc,
- 'acodec': 'none' if stream_type == 'video' else fourcc,
- 'protocol': 'ism',
- 'fragments': fragments,
- '_download_params': {
- 'duration': duration,
- 'timescale': stream_timescale,
- 'width': width or 0,
- 'height': height or 0,
- 'fourcc': fourcc,
- 'codec_private_data': track.get('CodecPrivateData'),
- 'sampling_rate': sampling_rate,
- 'channels': int_or_none(track.get('Channels', 2)),
- 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
- 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
- },
- })
- return formats
+ if stream_type == 'text':
+ subtitles.setdefault(stream_language, []).append({
+ 'ext': 'ismt',
+ 'protocol': 'ism',
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'fragments': fragments,
+ '_download_params': {
+ 'stream_type': stream_type,
+ 'duration': duration,
+ 'timescale': stream_timescale,
+ 'fourcc': fourcc,
+ 'language': stream_language,
+ 'codec_private_data': track.get('CodecPrivateData'),
+ }
+ })
+ elif stream_type in ('video', 'audio'):
+ formats.append({
+ 'format_id': '-'.join(format_id),
+ 'url': ism_url,
+ 'manifest_url': ism_url,
+ 'ext': 'ismv' if stream_type == 'video' else 'isma',
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'asr': sampling_rate,
+ 'vcodec': 'none' if stream_type == 'audio' else fourcc,
+ 'acodec': 'none' if stream_type == 'video' else fourcc,
+ 'protocol': 'ism',
+ 'fragments': fragments,
+ '_download_params': {
+ 'stream_type': stream_type,
+ 'duration': duration,
+ 'timescale': stream_timescale,
+ 'width': width or 0,
+ 'height': height or 0,
+ 'fourcc': fourcc,
+ 'language': stream_language,
+ 'codec_private_data': track.get('CodecPrivateData'),
+ 'sampling_rate': sampling_rate,
+ 'channels': int_or_none(track.get('Channels', 2)),
+ 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+ 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+ },
+ })
+ return formats, subtitles
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
def absolute_url(item_url):
@@ -2940,7 +3029,16 @@ def _media_formats(src, cur_media_type, type_info={}):
entries.append(media_info)
return entries
- def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+ def _extract_akamai_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self.report_warning(bug_reports_message(
+ "Ignoring subtitle tracks found in the manifests; "
+ "if any subtitle tracks are missing,"
+ ))
+ return fmts
+
+ def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
signed = 'hdnea=' in manifest_url
if not signed:
# https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
@@ -2949,6 +3047,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
'', manifest_url).strip('?')
formats = []
+ subtitles = {}
hdcore_sign = 'hdcore=3.7.0'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
@@ -2967,10 +3066,11 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
hls_host = hosts.get('hls')
if hls_host:
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
formats.extend(m3u8_formats)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
http_host = hosts.get('http')
if http_host and m3u8_formats and not signed:
@@ -2994,7 +3094,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats.append(http_f)
i += 1
- return formats
+ return formats, subtitles
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
query = compat_urlparse.urlparse(url).query
@@ -3319,12 +3419,22 @@ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
return ret
@classmethod
- def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
- """ Merge two subtitle dictionaries, language by language. """
- ret = dict(subtitle_dict1)
- for lang in subtitle_dict2:
- ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
- return ret
+ def _merge_subtitles(cls, *dicts, **kwargs):
+ """ Merge subtitle dictionaries, language by language. """
+
+ target = (lambda target=None: target)(**kwargs)
+ # The above lambda extracts the keyword argument 'target' from kwargs
+ # while ensuring there are no stray ones. When Python 2 support
+ # is dropped, remove it and change the function signature to:
+ #
+ # def _merge_subtitles(cls, *dicts, target=None):
+
+ if target is None:
+ target = {}
+ for d in dicts:
+ for lang, subs in d.items():
+ target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
+ return target
def extract_automatic_captions(self, *args, **kwargs):
if (self._downloader.params.get('writeautomaticsub', False)
diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py
index 3647c0a9c3..eefba4e242 100644
--- a/yt_dlp/extractor/elonet.py
+++ b/yt_dlp/extractor/elonet.py
@@ -1,9 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
-import os
import re
-import tempfile
from .common import InfoExtractor
from ..utils import (
@@ -12,12 +10,12 @@
try_get,
)
from ..compat import compat_str
-from ..downloader.hls import HlsFD
class ElonetIE(InfoExtractor):
_VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P[0-9]+)'
- _TEST = {
+ _TESTS = [{
+ # m3u8 with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
'md5': '8efc954b96c543711707f87de757caea',
'info_dict': {
@@ -27,62 +25,17 @@ class ElonetIE(InfoExtractor):
'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
},
- }
-
- def _download_m3u8_chunked_subtitle(self, chunklist_url):
- """
- Download VTT subtitles from pieces in manifest URL.
- Return a string containing joined chunks with extra headers removed.
- """
- with tempfile.NamedTemporaryFile(delete=True) as outfile:
- fname = outfile.name
- hlsdl = HlsFD(self._downloader, {})
- hlsdl.download(compat_str(fname), {"url": chunklist_url})
- with open(fname, 'r') as fin:
- # Remove (some) headers
- fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read())
- os.remove(fname)
- return "WEBVTT\n\n" + fdata
-
- def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
- """
- Parse subtitles from HLS / m3u8 manifest.
- """
- subtitles = {}
- baseurl = m3u8_url[:m3u8_url.rindex('/') + 1]
- for line in m3u8_doc.split('\n'):
- if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
- lang = self._search_regex(
- r'LANGUAGE="(.+?)"', line, 'lang', default=False)
- uri = self._search_regex(
- r'URI="(.+?)"', line, 'uri', default=False)
- if lang and uri:
- data = self._download_m3u8_chunked_subtitle(baseurl + uri)
- subtitles[lang] = [{'ext': 'vtt', 'data': data}]
- return subtitles
-
- def _parse_mpd_subtitles(self, mpd_doc):
- """
- Parse subtitles from MPD manifest.
- """
- ns = '{urn:mpeg:dash:schema:mpd:2011}'
- subtitles = {}
- for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)):
- lang = aset.attrib.get('lang', 'unk')
- url = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns)).text
- subtitles[lang] = [{'ext': 'vtt', 'url': url}]
- return subtitles
-
- def _get_subtitles(self, fmt, doc, url):
- if fmt == 'm3u8':
- subs = self._parse_m3u8_subtitles(doc, url)
- elif fmt == 'mpd':
- subs = self._parse_mpd_subtitles(doc)
- else:
- self.report_warning(
- "Cannot download subtitles from '%s' streams." % (fmt))
- subs = {}
- return subs
+ }, {
+ # DASH with subtitles
+ 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
+ 'info_dict': {
+ 'id': '116539',
+ 'ext': 'mp4',
+ 'title': 'Minulla on tiikeri',
+ 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
+ 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -101,8 +54,8 @@ def _real_extract(self, url):
self._parse_json(json_s, video_id),
lambda x: x[0]["src"], compat_str)
formats = []
+ subtitles = {}
if re.search(r'\.m3u8\??', src):
- fmt = 'm3u8'
res = self._download_webpage_handle(
# elonet servers have certificate problems
src.replace('https:', 'http:'), video_id,
@@ -111,11 +64,10 @@ def _real_extract(self, url):
if res:
doc, urlh = res
url = urlh.geturl()
- formats = self._parse_m3u8_formats(doc, url)
+ formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
for f in formats:
f['ext'] = 'mp4'
elif re.search(r'\.mpd\??', src):
- fmt = 'mpd'
res = self._download_xml_handle(
src, video_id,
note='Downloading MPD manifest',
@@ -123,7 +75,7 @@ def _real_extract(self, url):
if res:
doc, urlh = res
url = base_url(urlh.geturl())
- formats = self._parse_mpd_formats(doc, mpd_base_url=url)
+ formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
else:
raise ExtractorError("Unknown streaming format")
@@ -133,5 +85,5 @@ def _real_extract(self, url):
'description': description,
'thumbnail': thumbnail,
'formats': formats,
- 'subtitles': self.extract_subtitles(fmt, doc, url),
+ 'subtitles': subtitles,
}
diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py
index 313de343ef..e57e165fc9 100644
--- a/yt_dlp/extractor/francetv.py
+++ b/yt_dlp/extractor/francetv.py
@@ -151,6 +151,7 @@ def sign(manifest_url, manifest_id):
videos.append(fallback_info['video'])
formats = []
+ subtitles = {}
for video in videos:
video_url = video.get('url')
if not video_url:
@@ -171,10 +172,12 @@ def sign(manifest_url, manifest_id):
sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
video_id, f4m_id=format_id, fatal=False))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
- fatal=False))
+ fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
@@ -199,13 +202,12 @@ def sign(manifest_url, manifest_id):
title += ' - %s' % subtitle
title = title.strip()
- subtitles = {}
- subtitles_list = [{
- 'url': subformat['url'],
- 'ext': subformat.get('format'),
- } for subformat in info.get('subtitles', []) if subformat.get('url')]
- if subtitles_list:
- subtitles['fr'] = subtitles_list
+ subtitles.setdefault('fr', []).extend(
+ [{
+ 'url': subformat['url'],
+ 'ext': subformat.get('format'),
+ } for subformat in info.get('subtitles', []) if subformat.get('url')]
+ )
return {
'id': video_id,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 4250d10932..32815476fa 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2444,8 +2444,9 @@ def _real_extract(self, url):
m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type)
if m:
format_id = compat_str(m.group('format_id'))
+ subtitles = {}
if format_id.endswith('mpegurl'):
- formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id)
else:
@@ -2457,6 +2458,7 @@ def _real_extract(self, url):
info_dict['direct'] = True
self._sort_formats(formats)
info_dict['formats'] = formats
+ info_dict['subtitles'] = subtitles
return info_dict
if not self._downloader.params.get('test', False) and not is_intentional:
@@ -2510,7 +2512,7 @@ def _real_extract(self, url):
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif doc.tag == 'SmoothStreamingMedia':
- info_dict['formats'] = self._parse_ism_formats(doc, url)
+ info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -2524,7 +2526,7 @@ def _real_extract(self, url):
xspf_base_url=full_response.geturl()),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
- info_dict['formats'] = self._parse_mpd_formats(
+ info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py
index 1f03a9462d..99964737d8 100644
--- a/yt_dlp/extractor/nytimes.py
+++ b/yt_dlp/extractor/nytimes.py
@@ -46,6 +46,7 @@ def get_file_size(file_size):
urls = []
formats = []
+ subtitles = {}
for video in video_data.get('renditions', []):
video_url = video.get('url')
format_id = video.get('type')
@@ -54,9 +55,11 @@ def get_file_size(file_size):
urls.append(video_url)
ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id or 'hls', fatal=False))
+ m3u8_id=format_id or 'hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
continue
# formats.extend(self._extract_mpd_formats(
@@ -96,6 +99,7 @@ def get_file_size(file_size):
'uploader': video_data.get('byline'),
'duration': float_or_none(video_data.get('duration'), 1000),
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
}
diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py
index 0724cef268..2c815bda63 100644
--- a/yt_dlp/extractor/roosterteeth.py
+++ b/yt_dlp/extractor/roosterteeth.py
@@ -103,7 +103,7 @@ def _real_extract(self, url):
api_episode_url + '/videos', display_id,
'Downloading video JSON metadata')['data'][0]
m3u8_url = video_data['attributes']['url']
- subtitle_m3u8_url = video_data['links']['download']
+ # XXX: additional URL at video_data['links']['download']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
@@ -111,7 +111,7 @@ def _real_extract(self, url):
'%s is only available for FIRST members' % display_id)
raise
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
self._sort_formats(formats)
@@ -134,33 +134,6 @@ def _real_extract(self, url):
'url': img_url,
})
- subtitles = {}
- res = self._download_webpage_handle(
- subtitle_m3u8_url, display_id,
- 'Downloading m3u8 information',
- 'Failed to download m3u8 information',
- fatal=True, data=None, headers={}, query={})
- if res is not False:
- subtitle_m3u8_doc, _ = res
- for line in subtitle_m3u8_doc.split('\n'):
- if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
- parts = line.split(',')
- for part in parts:
- if 'LANGUAGE' in part:
- lang = part[part.index('=') + 2:-1]
- elif 'URI' in part:
- uri = part[part.index('=') + 2:-1]
- res = self._download_webpage_handle(
- uri, display_id,
- 'Downloading m3u8 information',
- 'Failed to download m3u8 information',
- fatal=True, data=None, headers={}, query={})
- doc, _ = res
- for l in doc.split('\n'):
- if not l.startswith('#'):
- subtitles[lang] = [{'url': uri[:-uri[::-1].index('/')] + l}]
- break
-
return {
'id': video_id,
'display_id': display_id,
diff --git a/yt_dlp/extractor/srgssr.py b/yt_dlp/extractor/srgssr.py
index ac018e7405..2977b5e670 100644
--- a/yt_dlp/extractor/srgssr.py
+++ b/yt_dlp/extractor/srgssr.py
@@ -87,6 +87,7 @@ def _real_extract(self, url):
title = media_data['title']
formats = []
+ subtitles = {}
q = qualities(['SD', 'HD'])
for source in (media_data.get('resourceList') or []):
format_url = source.get('url')
@@ -104,12 +105,16 @@ def _real_extract(self, url):
if source.get('tokenType') == 'AKAMAI':
format_url = self._get_tokenized_src(
format_url, media_id, format_id)
- formats.extend(self._extract_akamai_formats(
- format_url, media_id))
+ fmts, subs = self._extract_akamai_formats_and_subtitles(
+ format_url, media_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif protocol == 'HLS':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
format_url, media_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id, fatal=False))
+ m3u8_id=format_id, fatal=False)
+ formats.extend(m3u8_fmts)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif protocol in ('HTTP', 'HTTPS'):
formats.append({
'format_id': format_id,
@@ -133,7 +138,6 @@ def _real_extract(self, url):
})
self._sort_formats(formats)
- subtitles = {}
if media_type == 'video':
for sub in (media_data.get('subtitleList') or []):
sub_url = sub.get('url')
diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py
index 5eaa991eb5..bb7610352d 100644
--- a/yt_dlp/extractor/threeqsdn.py
+++ b/yt_dlp/extractor/threeqsdn.py
@@ -99,16 +99,21 @@ def _real_extract(self, url):
aspect = float_or_none(config.get('aspect'))
formats = []
+ subtitles = {}
for source_type, source in (config.get('sources') or {}).items():
if not source:
continue
if source_type == 'dash':
- formats.extend(self._extract_mpd_formats(
- source, video_id, mpd_id='mpd', fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ source, video_id, mpd_id='mpd', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'progressive':
for s in source:
src = s.get('src')
@@ -138,7 +143,6 @@ def _real_extract(self, url):
# behaviour is being kept as-is
self._sort_formats(formats, ('res', 'source_preference'))
- subtitles = {}
for subtitle in (config.get('subtitles') or []):
src = subtitle.get('src')
if not src:
diff --git a/yt_dlp/extractor/tv4.py b/yt_dlp/extractor/tv4.py
index b8ad4fafc4..4043e63662 100644
--- a/yt_dlp/extractor/tv4.py
+++ b/yt_dlp/extractor/tv4.py
@@ -93,18 +93,31 @@ def _real_extract(self, url):
'device': 'browser',
'protocol': 'hls',
})['playbackItem']['manifestUrl']
- formats = self._extract_m3u8_formats(
+ formats = []
+ subtitles = {}
+
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
- formats.extend(self._extract_mpd_formats(
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
manifest_url.replace('.m3u8', '.mpd'),
- video_id, mpd_id='dash', fatal=False))
- formats.extend(self._extract_f4m_formats(
+ video_id, mpd_id='dash', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ fmts = self._extract_f4m_formats(
manifest_url.replace('.m3u8', '.f4m'),
- video_id, f4m_id='hds', fatal=False))
- formats.extend(self._extract_ism_formats(
+ video_id, f4m_id='hds', fatal=False)
+ formats.extend(fmts)
+
+ fmts, subs = self._extract_ism_formats_and_subtitles(
re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
- video_id, ism_id='mss', fatal=False))
+ video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
if not formats and info.get('is_geo_restricted'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
@@ -115,7 +128,7 @@ def _real_extract(self, url):
'id': video_id,
'title': title,
'formats': formats,
- # 'subtitles': subtitles,
+ 'subtitles': subtitles,
'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')),
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index 8a2a77b710..63c11bd479 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -36,9 +36,9 @@ class TwitterBaseIE(InfoExtractor):
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
if not variant_url:
- return []
+ return [], {}
elif '.m3u8' in variant_url:
- return self._extract_m3u8_formats(
+ return self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
else:
@@ -49,22 +49,27 @@ def _extract_variant_formats(self, variant, video_id):
'tbr': tbr,
}
self._search_dimensions_in_video_url(f, variant_url)
- return [f]
+ return [f], {}
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id)
formats = []
+ subtitles = {}
urls = []
for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
video_variant.attrib['url'] = compat_urllib_parse_unquote(
video_variant.attrib['url'])
urls.append(video_variant.attrib['url'])
- formats.extend(self._extract_variant_formats(
- video_variant.attrib, video_id))
+ fmts, subs = self._extract_variant_formats(
+ video_variant.attrib, video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
if video_url not in urls:
- formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
- return formats
+ fmts, subs = self._extract_variant_formats({'url': video_url}, video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ return formats, subtitles
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
@@ -471,8 +476,11 @@ def extract_from_video_info(media):
video_info = media.get('video_info') or {}
formats = []
+ subtitles = {}
for variant in video_info.get('variants', []):
- formats.extend(self._extract_variant_formats(variant, twid))
+ fmts, subs = self._extract_variant_formats(variant, twid)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ formats.extend(fmts)
self._sort_formats(formats)
thumbnails = []
@@ -491,6 +499,7 @@ def add_thumbnail(name, size):
info.update({
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
@@ -540,7 +549,7 @@ def get_binding_value(k):
is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
- formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
+ formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
self._sort_formats(formats)
thumbnails = []
@@ -558,6 +567,7 @@ def get_binding_value(k):
info.update({
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': int_or_none(get_binding_value(
'content_duration_seconds')),
diff --git a/yt_dlp/extractor/uplynk.py b/yt_dlp/extractor/uplynk.py
index f06bf5b127..c0dba0a6ad 100644
--- a/yt_dlp/extractor/uplynk.py
+++ b/yt_dlp/extractor/uplynk.py
@@ -30,7 +30,7 @@ class UplynkIE(InfoExtractor):
def _extract_uplynk_info(self, uplynk_content_url):
path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
display_id = video_id or external_id
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
'http://content.uplynk.com/%s.m3u8' % path,
display_id, 'mp4', 'm3u8_native')
if session_id:
@@ -48,6 +48,7 @@ def _extract_uplynk_info(self, uplynk_content_url):
'duration': float_or_none(asset.get('duration')),
'uploader_id': asset.get('owner'),
'formats': formats,
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py
index 05dcc1f17e..0f1d08da35 100644
--- a/yt_dlp/extractor/wat.py
+++ b/yt_dlp/extractor/wat.py
@@ -69,19 +69,24 @@ def _real_extract(self, url):
title = video_info['title']
formats = []
+ subtitles = {}
def extract_formats(manifest_urls):
for f, f_url in manifest_urls.items():
if not f_url:
continue
if f in ('dash', 'mpd'):
- formats.extend(self._extract_mpd_formats(
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
- video_id, mpd_id='dash', fatal=False))
+ video_id, mpd_id='dash', fatal=False)
elif f == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
f_url, video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False))
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+ else:
+ continue
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
delivery = video_data.get('delivery') or {}
extract_formats({delivery.get('format'): delivery.get('url')})
@@ -103,4 +108,5 @@ def extract_formats(manifest_urls):
video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),
'duration': int_or_none(video_info.get('duration')),
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 40d9568088..9ddd6453f5 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -2340,15 +2340,20 @@ def make_HTTPS_handler(params, **kwargs):
return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
-def bug_reports_message():
+def bug_reports_message(before=';'):
if ytdl_is_updateable():
update_cmd = 'type yt-dlp -U to update'
else:
update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
- msg = '; please report this issue on https://github.com/yt-dlp/yt-dlp .'
+ msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
msg += ' Make sure you are using the latest version; %s.' % update_cmd
msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
- return msg
+
+ before = before.rstrip()
+ if not before or before.endswith(('.', '!', '?')):
+ msg = msg[0].title() + msg[1:]
+
+ return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
new file mode 100644
index 0000000000..a184ee3699
--- /dev/null
+++ b/yt_dlp/webvtt.py
@@ -0,0 +1,378 @@
+# coding: utf-8
+from __future__ import unicode_literals, print_function, division
+
+"""
+A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
+to be able to assemble a single stand-alone subtitle file, suitably adjusting
+timestamps on the way, while everything else is passed through unmodified.
+
+Regular expressions based on the W3C WebVTT specification
+<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
+in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
+"""
+
+import re
+import io
+from .utils import int_or_none
+from .compat import (
+ compat_str as str,
+ compat_Pattern,
+ compat_Match,
+)
+
+
+class _MatchParser(object):
+ """
+ An object that maintains the current parsing position and allows
+ conveniently advancing it as syntax elements are successfully parsed.
+ """
+
+ def __init__(self, string):
+ self._data = string
+ self._pos = 0
+
+ def match(self, r):
+ if isinstance(r, compat_Pattern):
+ return r.match(self._data, self._pos)
+ if isinstance(r, str):
+ if self._data.startswith(r, self._pos):
+ return len(r)
+ return None
+ raise ValueError(r)
+
+ def advance(self, by):
+ if by is None:
+ amt = 0
+ elif isinstance(by, compat_Match):
+ amt = len(by.group(0))
+ elif isinstance(by, str):
+ amt = len(by)
+ elif isinstance(by, int):
+ amt = by
+ else:
+ raise ValueError(by)
+ self._pos += amt
+ return by
+
+ def consume(self, r):
+ return self.advance(self.match(r))
+
+ def child(self):
+ return _MatchChildParser(self)
+
+
+class _MatchChildParser(_MatchParser):
+ """
+ A child parser state, which advances through the same data as
+ its parent, but has an independent position. This is useful when
+ advancing through syntax elements we might later want to backtrack
+ from.
+ """
+
+ def __init__(self, parent):
+ super(_MatchChildParser, self).__init__(parent._data)
+ self.__parent = parent
+ self._pos = parent._pos
+
+ def commit(self):
+ """
+ Advance the parent state to the current position of this child state.
+ """
+ self.__parent._pos = self._pos
+ return self.__parent
+
+
+class ParseError(Exception):
+ def __init__(self, parser):
+ super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
+ parser._pos, parser._data[parser._pos:parser._pos + 20]
+ ))
+
+
+_REGEX_TS = re.compile(r'''(?x)
+ (?:([0-9]{2,}):)?
+ ([0-9]{2}):
+ ([0-9]{2})\.
+ ([0-9]{3})?
+''')
+_REGEX_EOF = re.compile(r'\Z')
+_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
+_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
+
+
+def _parse_ts(ts):
+ """
+ Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
+ into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
+ """
+
+ h, min, s, ms = ts.groups()
+ return 90 * (
+ int(h or 0) * 3600000 + # noqa: W504,E221,E222
+ int(min) * 60000 + # noqa: W504,E221,E222
+ int(s) * 1000 + # noqa: W504,E221,E222
+ int(ms) # noqa: W504,E221,E222
+ )
+
+
+def _format_ts(ts):
+ """
+ Convert an MPEG PES timestamp into a WebVTT timestamp.
+ This will lose sub-millisecond precision.
+ """
+
+ ts = int((ts + 45) // 90)
+ ms , ts = divmod(ts, 1000) # noqa: W504,E221,E222,E203
+ s , ts = divmod(ts, 60) # noqa: W504,E221,E222,E203
+ min, h = divmod(ts, 60) # noqa: W504,E221,E222
+ return '%02u:%02u:%02u.%03u' % (h, min, s, ms)
+
+
+class Block(object):
+ """
+ An abstract WebVTT block.
+ """
+
+ def __init__(self, **kwargs):
+ for key, val in kwargs.items():
+ setattr(self, key, val)
+
+ @classmethod
+ def parse(cls, parser):
+ m = parser.match(cls._REGEX)
+ if not m:
+ return None
+ parser.advance(m)
+ return cls(raw=m.group(0))
+
+ def write_into(self, stream):
+ stream.write(self.raw)
+
+
+class HeaderBlock(Block):
+ """
+ A WebVTT block that may only appear in the header part of the file,
+ i.e. before any cue blocks.
+ """
+
+ pass
+
+
+class Magic(HeaderBlock):
+ _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
+
+ # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
+    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
+ # doesn’t specify the exact grammar nor where in the WebVTT
+ # syntax it should be placed; the below has been devised based
+ # on usage in the wild
+ #
+ # And strictly speaking, the presence of this extension violates
+ # the W3C WebVTT spec. Oh well.
+
+ _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
+ _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
+ _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
+
+ @classmethod
+ def __parse_tsmap(cls, parser):
+ parser = parser.child()
+
+ while True:
+ m = parser.consume(cls._REGEX_TSMAP_LOCAL)
+ if m:
+ m = parser.consume(_REGEX_TS)
+ if m is None:
+ raise ParseError(parser)
+ local = _parse_ts(m)
+ if local is None:
+ raise ParseError(parser)
+ else:
+ m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
+ if m:
+ mpegts = int_or_none(m.group(1))
+ if mpegts is None:
+ raise ParseError(parser)
+ else:
+ raise ParseError(parser)
+ if parser.consume(','):
+ continue
+ if parser.consume(_REGEX_NL):
+ break
+ raise ParseError(parser)
+
+ parser.commit()
+ return local, mpegts
+
+ @classmethod
+ def parse(cls, parser):
+ parser = parser.child()
+
+ m = parser.consume(cls._REGEX)
+ if not m:
+ raise ParseError(parser)
+
+ extra = m.group(1)
+ local, mpegts = None, None
+ if parser.consume(cls._REGEX_TSMAP):
+ local, mpegts = cls.__parse_tsmap(parser)
+ if not parser.consume(_REGEX_NL):
+ raise ParseError(parser)
+ parser.commit()
+ return cls(extra=extra, mpegts=mpegts, local=local)
+
+ def write_into(self, stream):
+ stream.write('WEBVTT')
+ if self.extra is not None:
+ stream.write(self.extra)
+ stream.write('\n')
+ if self.local or self.mpegts:
+ stream.write('X-TIMESTAMP-MAP=LOCAL:')
+ stream.write(_format_ts(self.local if self.local is not None else 0))
+ stream.write(',MPEGTS:')
+ stream.write(str(self.mpegts if self.mpegts is not None else 0))
+ stream.write('\n')
+ stream.write('\n')
+
+
+class StyleBlock(HeaderBlock):
+ _REGEX = re.compile(r'''(?x)
+ STYLE[\ \t]*(?:\r\n|[\r\n])
+ ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+ (?:\r\n|[\r\n])
+ ''')
+
+
+class RegionBlock(HeaderBlock):
+ _REGEX = re.compile(r'''(?x)
+ REGION[\ \t]*
+ ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+ (?:\r\n|[\r\n])
+ ''')
+
+
+class CommentBlock(Block):
+ _REGEX = re.compile(r'''(?x)
+ NOTE(?:\r\n|[\ \t\r\n])
+ ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+ (?:\r\n|[\r\n])
+ ''')
+
+
+class CueBlock(Block):
+ """
+ A cue block. The payload is not interpreted.
+ """
+
+ _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
+ _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
+ _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
+ _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
+
+ @classmethod
+ def parse(cls, parser):
+ parser = parser.child()
+
+ id = None
+ m = parser.consume(cls._REGEX_ID)
+ if m:
+ id = m.group(1)
+
+ m0 = parser.consume(_REGEX_TS)
+ if not m0:
+ return None
+ if not parser.consume(cls._REGEX_ARROW):
+ return None
+ m1 = parser.consume(_REGEX_TS)
+ if not m1:
+ return None
+ m2 = parser.consume(cls._REGEX_SETTINGS)
+ if not parser.consume(_REGEX_NL):
+ return None
+
+ start = _parse_ts(m0)
+ end = _parse_ts(m1)
+ settings = m2.group(1) if m2 is not None else None
+
+ text = io.StringIO()
+ while True:
+ m = parser.consume(cls._REGEX_PAYLOAD)
+ if not m:
+ break
+ text.write(m.group(0))
+
+ parser.commit()
+ return cls(
+ id=id,
+ start=start, end=end, settings=settings,
+ text=text.getvalue()
+ )
+
+ def write_into(self, stream):
+ if self.id is not None:
+ stream.write(self.id)
+ stream.write('\n')
+ stream.write(_format_ts(self.start))
+ stream.write(' --> ')
+ stream.write(_format_ts(self.end))
+ if self.settings is not None:
+ stream.write(' ')
+ stream.write(self.settings)
+ stream.write('\n')
+ stream.write(self.text)
+ stream.write('\n')
+
+ @property
+ def as_json(self):
+ return {
+ 'id': self.id,
+ 'start': self.start,
+ 'end': self.end,
+ 'text': self.text,
+ 'settings': self.settings,
+ }
+
+
+def parse_fragment(frag_content):
+ """
+ A generator that yields (partially) parsed WebVTT blocks when given
+ a bytes object containing the raw contents of a WebVTT file.
+ """
+
+ parser = _MatchParser(frag_content.decode('utf-8'))
+
+ yield Magic.parse(parser)
+
+ while not parser.match(_REGEX_EOF):
+ if parser.consume(_REGEX_BLANK):
+ continue
+
+ block = RegionBlock.parse(parser)
+ if block:
+ yield block
+ continue
+ block = StyleBlock.parse(parser)
+ if block:
+ yield block
+ continue
+ block = CommentBlock.parse(parser)
+ if block:
+ yield block # XXX: or skip
+ continue
+
+ break
+
+ while not parser.match(_REGEX_EOF):
+ if parser.consume(_REGEX_BLANK):
+ continue
+
+ block = CommentBlock.parse(parser)
+ if block:
+ yield block # XXX: or skip
+ continue
+ block = CueBlock.parse(parser)
+ if block:
+ yield block
+ continue
+
+ raise ParseError(parser)