diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a086166948..f3578efe10 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -684,17 +684,186 @@ def test_parse_m3u8_formats(self): 'width': 1920, 'height': 1080, 'vcodec': 'avc1.64002a', - }] + }], + {} + ), + ( + 'bipbop_16x9', + 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + [{ + "format_id": "bipbop_audio-BipBop Audio 2", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/alternate_audio_aac/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "language": "eng", + "ext": "mp4", + "protocol": "m3u8", + "preference": None, + "quality": None, + "vcodec": "none", + "audio_ext": "mp4", + "video_ext": "none", + }, { + "format_id": "41", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear0/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 41.457, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + "quality": None, + "vcodec": "none", + "acodec": "mp4a.40.2", + "audio_ext": "mp4", + "video_ext": "none", + "abr": 41.457, + }, { + "format_id": "263", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear1/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 263.851, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + "quality": None, + "width": 416, + "height": 234, + "vcodec": "avc1.4d400d", + "acodec": "mp4a.40.2", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 263.851, + "abr": 0, + }, { + "format_id": "577", + 
"format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear2/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 577.61, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + "quality": None, + "width": 640, + "height": 360, + "vcodec": "avc1.4d401e", + "acodec": "mp4a.40.2", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 577.61, + "abr": 0, + }, { + "format_id": "915", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear3/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 915.905, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + "quality": None, + "width": 960, + "height": 540, + "vcodec": "avc1.4d401f", + "acodec": "mp4a.40.2", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 915.905, + "abr": 0, + }, { + "format_id": "1030", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear4/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 1030.138, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + "quality": None, + "width": 1280, + "height": 720, + "vcodec": "avc1.4d401f", + "acodec": "mp4a.40.2", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 1030.138, + "abr": 0, + }, { + "format_id": "1924", + "format_index": None, + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear5/prog_index.m3u8", + "manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8", + "tbr": 1924.009, + "ext": "mp4", + "fps": None, + "protocol": "m3u8", + "preference": None, + 
"quality": None, + "width": 1920, + "height": 1080, + "vcodec": "avc1.4d401f", + "acodec": "mp4a.40.2", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 1924.009, + "abr": 0, + }], + { + "en": [{ + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }, { + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng_forced/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }], + "fr": [{ + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }, { + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra_forced/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }], + "es": [{ + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }, { + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa_forced/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }], + "ja": [{ + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }, { + "url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn_forced/prog_index.m3u8", + "ext": "vtt", + "protocol": "m3u8_native" + }], + } ), ] - for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: + for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES: with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_m3u8_formats( + formats, subs = self.ie._parse_m3u8_formats_and_subtitles( f.read(), m3u8_url, ext='mp4') self.ie._sort_formats(formats) 
expect_value(self, formats, expected_formats, None) + expect_value(self, subs, expected_subs, None) def test_parse_mpd_formats(self): _TEST_CASES = [ @@ -780,7 +949,8 @@ def test_parse_mpd_formats(self): 'tbr': 5997.485, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/pull/14844 'urls_only', @@ -863,7 +1033,8 @@ def test_parse_mpd_formats(self): 'tbr': 4400, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/20346 # Media considered unfragmented even though it contains @@ -909,18 +1080,328 @@ def test_parse_mpd_formats(self): 'width': 360, 'height': 360, 'fps': 30, - }] + }], + {}, + ), ( + 'subtitles', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/', + [{ + "format_id": "audio=128001", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "m4a", + "tbr": 128.001, + "asr": 48000, + "format_note": "DASH audio", + "container": "m4a_dash", + "vcodec": "none", + "acodec": "mp4a.40.2", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "audio_ext": "m4a", + "video_ext": "none", + "abr": 128.001, + }, { + "format_id": "video=100000", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "mp4", + "width": 336, + "height": 144, + "tbr": 100, + "format_note": "DASH video", + "container": "mp4_dash", + "vcodec": 
"avc1.4D401F", + "acodec": "none", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 100, + }, { + "format_id": "video=326000", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "mp4", + "width": 562, + "height": 240, + "tbr": 326, + "format_note": "DASH video", + "container": "mp4_dash", + "vcodec": "avc1.4D401F", + "acodec": "none", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 326, + }, { + "format_id": "video=698000", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "mp4", + "width": 844, + "height": 360, + "tbr": 698, + "format_note": "DASH video", + "container": "mp4_dash", + "vcodec": "avc1.4D401F", + "acodec": "none", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 698, + }, { + "format_id": "video=1493000", + "manifest_url": 
"https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "mp4", + "width": 1126, + "height": 480, + "tbr": 1493, + "format_note": "DASH video", + "container": "mp4_dash", + "vcodec": "avc1.4D401F", + "acodec": "none", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 1493, + }, { + "format_id": "video=4482000", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "ext": "mp4", + "width": 1688, + "height": 720, + "tbr": 4482, + "format_note": "DASH video", + "container": "mp4_dash", + "vcodec": "avc1.4D401F", + "acodec": "none", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + "video_ext": "mp4", + "audio_ext": "none", + "vbr": 4482, + }], + { + "en": [ + { + "ext": "mp4", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd", + "fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/", + "protocol": "http_dash_segments", + } + ] + }, ) ] - for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: 
+ for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES: with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_mpd_formats( + formats, subtitles = self.ie._parse_mpd_formats_and_subtitles( compat_etree_fromstring(f.read().encode('utf-8')), mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) + + def test_parse_ism_formats(self): + _TEST_CASES = [ + ( + 'sintel', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + [{ + "format_id": "audio-128", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "isma", + "tbr": 128, + "asr": 48000, + "vcodec": "none", + "acodec": "AACL", + "protocol": "ism", + "_download_params": { + "stream_type": "audio", + "duration": 8880746666, + "timescale": 10000000, + "width": 0, + "height": 0, + "fourcc": "AACL", + "codec_private_data": "1190", + "sampling_rate": 48000, + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "audio_ext": "isma", + "video_ext": "none", + "abr": 128, + }, { + "format_id": "video-100", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "ismv", + "width": 336, + "height": 144, + "tbr": 100, + "vcodec": "AVC1", + "acodec": "none", + "protocol": "ism", + "_download_params": { + "stream_type": "video", + "duration": 8880746666, + 
"timescale": 10000000, + "width": 336, + "height": 144, + "fourcc": "AVC1", + "codec_private_data": "00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8", + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "video_ext": "ismv", + "audio_ext": "none", + "vbr": 100, + }, { + "format_id": "video-326", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "ismv", + "width": 562, + "height": 240, + "tbr": 326, + "vcodec": "AVC1", + "acodec": "none", + "protocol": "ism", + "_download_params": { + "stream_type": "video", + "duration": 8880746666, + "timescale": 10000000, + "width": 562, + "height": 240, + "fourcc": "AVC1", + "codec_private_data": "00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8", + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "video_ext": "ismv", + "audio_ext": "none", + "vbr": 326, + }, { + "format_id": "video-698", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "ismv", + "width": 844, + "height": 360, + "tbr": 698, + "vcodec": "AVC1", + "acodec": "none", + "protocol": "ism", + "_download_params": { + "stream_type": "video", + "duration": 8880746666, + "timescale": 10000000, + "width": 844, + "height": 360, + "fourcc": "AVC1", + "codec_private_data": "00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8", + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "video_ext": "ismv", + "audio_ext": "none", + "vbr": 698, + }, { + 
"format_id": "video-1493", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "ismv", + "width": 1126, + "height": 480, + "tbr": 1493, + "vcodec": "AVC1", + "acodec": "none", + "protocol": "ism", + "_download_params": { + "stream_type": "video", + "duration": 8880746666, + "timescale": 10000000, + "width": 1126, + "height": 480, + "fourcc": "AVC1", + "codec_private_data": "00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8", + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "video_ext": "ismv", + "audio_ext": "none", + "vbr": 1493, + }, { + "format_id": "video-4482", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "ext": "ismv", + "width": 1688, + "height": 720, + "tbr": 4482, + "vcodec": "AVC1", + "acodec": "none", + "protocol": "ism", + "_download_params": { + "stream_type": "video", + "duration": 8880746666, + "timescale": 10000000, + "width": 1688, + "height": 720, + "fourcc": "AVC1", + "codec_private_data": "00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8", + "channels": 2, + "bits_per_sample": 16, + "nal_unit_length_field": 4 + }, + "video_ext": "ismv", + "audio_ext": "none", + "vbr": 4482, + }], + { + "eng": [ + { + "ext": "ismt", + "protocol": "ism", + "url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "manifest_url": 
"https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest", + "_download_params": { + "stream_type": "text", + "duration": 8880746666, + "timescale": 10000000, + "fourcc": "TTML", + "codec_private_data": "" + } + } + ] + }, + ), + ] + + for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES: + with io.open('./test/testdata/ism/%s.Manifest' % ism_file, + mode='r', encoding='utf-8') as f: + formats, subtitles = self.ie._parse_ism_formats_and_subtitles( + compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) def test_parse_f4m_formats(self): _TEST_CASES = [ diff --git a/test/testdata/ism/sintel.Manifest b/test/testdata/ism/sintel.Manifest new file mode 100644 index 0000000000..2ff8c24478 --- /dev/null +++ b/test/testdata/ism/sintel.Manifest @@ -0,0 +1,988 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/testdata/m3u8/bipbop_16x9.m3u8 b/test/testdata/m3u8/bipbop_16x9.m3u8 new file mode 100644 index 0000000000..1ce87dd041 --- /dev/null +++ b/test/testdata/m3u8/bipbop_16x9.m3u8 @@ -0,0 +1,38 @@ +#EXTM3U + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 1",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 2",AUTOSELECT=NO,DEFAULT=NO,URI="alternate_audio_aac/prog_index.m3u8" + + +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,LANGUAGE="en",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/eng/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="en",URI="subtitles/eng_forced/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="fr",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, 
public.accessibility.describes-music-and-sound",URI="subtitles/fra/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="fr",URI="subtitles/fra_forced/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="es",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/spa/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="es",URI="subtitles/spa_forced/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="ja",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/jpn/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語 (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="ja",URI="subtitles/jpn_forced/prog_index.m3u8" + + +#EXT-X-STREAM-INF:BANDWIDTH=263851,CODECS="mp4a.40.2, avc1.4d400d",RESOLUTION=416x234,AUDIO="bipbop_audio",SUBTITLES="subs" +gear1/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=28451,CODECS="avc1.4d400d",URI="gear1/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=577610,CODECS="mp4a.40.2, avc1.4d401e",RESOLUTION=640x360,AUDIO="bipbop_audio",SUBTITLES="subs" +gear2/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=181534,CODECS="avc1.4d401e",URI="gear2/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=915905,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=960x540,AUDIO="bipbop_audio",SUBTITLES="subs" +gear3/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=297056,CODECS="avc1.4d401f",URI="gear3/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=1030138,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1280x720,AUDIO="bipbop_audio",SUBTITLES="subs" +gear4/prog_index.m3u8 
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=339492,CODECS="avc1.4d401f",URI="gear4/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=1924009,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1920x1080,AUDIO="bipbop_audio",SUBTITLES="subs" +gear5/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=669554,CODECS="avc1.4d401f",URI="gear5/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=41457,CODECS="mp4a.40.2",AUDIO="bipbop_audio",SUBTITLES="subs" +gear0/prog_index.m3u8 diff --git a/test/testdata/mpd/subtitles.mpd b/test/testdata/mpd/subtitles.mpd new file mode 100644 index 0000000000..6f948adba9 --- /dev/null +++ b/test/testdata/mpd/subtitles.mpd @@ -0,0 +1,351 @@ + + + + + dash/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py index 3ebf1ee7a6..863bd2287c 100644 --- a/yt_dlp/compat.py +++ b/yt_dlp/compat.py @@ -3018,10 +3018,24 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): return ctypes.WINFUNCTYPE(*args, **kwargs) +try: + compat_Pattern = re.Pattern +except AttributeError: + compat_Pattern = type(re.compile('')) + + +try: + compat_Match = re.Match +except AttributeError: + compat_Match = type(re.compile('').match('')) + + __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_Match', + 'compat_Pattern', 'compat_Struct', 'compat_b64decode', 'compat_basestring', diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index a0c1d13ac2..fadd0dfc5f 100644 --- a/yt_dlp/downloader/fragment.py +++ 
b/yt_dlp/downloader/fragment.py @@ -77,7 +77,10 @@ def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') try: - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] + ytdl_data = json.loads(stream.read()) + ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index'] + if 'extra_state' in ytdl_data['downloader']: + ctx['extra_state'] = ytdl_data['downloader']['extra_state'] except Exception: ctx['ytdl_corrupt'] = True finally: @@ -90,6 +93,8 @@ def _write_ytdl_file(self, ctx): 'index': ctx['fragment_index'], }, } + if 'extra_state' in ctx: + downloader['extra_state'] = ctx['extra_state'] if ctx.get('fragment_count') is not None: downloader['fragment_count'] = ctx['fragment_count'] frag_index_stream.write(json.dumps({'downloader': downloader})) diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index f4e41a6c7b..270b33b22e 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -2,6 +2,7 @@ import errno import re +import io import binascii try: from Crypto.Cipher import AES @@ -27,7 +28,9 @@ parse_m3u8_attributes, sanitize_open, update_url_query, + bug_reports_message, ) +from .. 
import webvtt class HlsFD(FragmentFD): @@ -78,6 +81,8 @@ def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + is_webvtt = info_dict['ext'] == 'vtt' + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) man_url = urlh.geturl() s = urlh.read().decode('utf-8', 'ignore') @@ -142,6 +147,8 @@ def is_ad_fragment_end(s): else: self._prepare_and_start_frag_download(ctx) + extra_state = ctx.setdefault('extra_state', {}) + fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) test = self.params.get('test', False) @@ -308,6 +315,76 @@ def download_fragment(fragment): return frag_content, frag_index + pack_fragment = lambda frag_content, _: frag_content + + if is_webvtt: + def pack_fragment(frag_content, frag_index): + output = io.StringIO() + adjust = 0 + for block in webvtt.parse_fragment(frag_content): + if isinstance(block, webvtt.CueBlock): + block.start += adjust + block.end += adjust + + dedup_window = extra_state.setdefault('webvtt_dedup_window', []) + cue = block.as_json + + # skip the cue if an identical one appears + # in the window of potential duplicates + # and prune the window of unviable candidates + i = 0 + skip = True + while i < len(dedup_window): + window_cue = dedup_window[i] + if window_cue == cue: + break + if window_cue['end'] >= cue['start']: + i += 1 + continue + del dedup_window[i] + else: + skip = False + + if skip: + continue + + # add the cue to the window + dedup_window.append(cue) + elif isinstance(block, webvtt.Magic): + # take care of MPEG PES timestamp overflow + if block.mpegts is None: + block.mpegts = 0 + extra_state.setdefault('webvtt_mpegts_adjust', 0) + block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33 + if block.mpegts < extra_state.get('webvtt_mpegts_last', 0): + extra_state['webvtt_mpegts_adjust'] += 1 + block.mpegts += 1 << 33 + 
extra_state['webvtt_mpegts_last'] = block.mpegts + + if frag_index == 1: + extra_state['webvtt_mpegts'] = block.mpegts or 0 + extra_state['webvtt_local'] = block.local or 0 + # XXX: block.local = block.mpegts = None ? + else: + if block.mpegts is not None and block.local is not None: + adjust = ( + (block.mpegts - extra_state.get('webvtt_mpegts', 0)) + - (block.local - extra_state.get('webvtt_local', 0)) + ) + continue + elif isinstance(block, webvtt.HeaderBlock): + if frag_index != 1: + # XXX: this should probably be silent as well + # or verify that all segments contain the same data + self.report_warning(bug_reports_message( + 'Discarding a %s block found in the middle of the stream; ' + 'if the subtitles display incorrectly,' + % (type(block).__name__))) + continue + block.write_into(output) + + return output.getvalue().encode('utf-8') + def append_fragment(frag_content, frag_index): if frag_content: fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index) @@ -315,6 +392,7 @@ def append_fragment(frag_content, frag_index): file, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized file.close() + frag_content = pack_fragment(frag_content, frag_index) self._append_fragment(ctx, frag_content) return True except EnvironmentError as ose: diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index 1ca666b4a1..07d74aef0b 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -48,7 +48,7 @@ def write_piff_header(stream, params): language = params.get('language', 'und') height = params.get('height', 0) width = params.get('width', 0) - is_audio = width == 0 and height == 0 + stream_type = params['stream_type'] creation_time = modification_time = int(time.time()) ftyp_payload = b'isml' # major brand @@ -77,7 +77,7 @@ def write_piff_header(stream, params): tkhd_payload += u32.pack(0) * 2 # reserved tkhd_payload += s16.pack(0) # layer tkhd_payload += s16.pack(0) # alternate group - 
tkhd_payload += s88.pack(1 if is_audio else 0) # volume + tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume tkhd_payload += u16.pack(0) # reserved tkhd_payload += unity_matrix tkhd_payload += u1616.pack(width) @@ -93,19 +93,34 @@ def write_piff_header(stream, params): mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box hdlr_payload = u32.pack(0) # pre defined - hdlr_payload += b'soun' if is_audio else b'vide' # handler type - hdlr_payload += u32.pack(0) * 3 # reserved - hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name + if stream_type == 'audio': # handler type + hdlr_payload += b'soun' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SoundHandler\0' # name + elif stream_type == 'video': + hdlr_payload += b'vide' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'VideoHandler\0' # name + elif stream_type == 'text': + hdlr_payload += b'subt' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SubtitleHandler\0' # name + else: + assert False mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box - if is_audio: + if stream_type == 'audio': smhd_payload = s88.pack(0) # balance smhd_payload += u16.pack(0) # reserved media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header - else: + elif stream_type == 'video': vmhd_payload = u16.pack(0) # graphics mode vmhd_payload += u16.pack(0) * 3 # opcolor media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header + elif stream_type == 'text': + media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header + else: + assert False minf_payload = media_header_box dref_payload = u32.pack(1) # entry count @@ -117,7 +132,7 @@ def write_piff_header(stream, params): sample_entry_payload = u8.pack(0) * 6 # reserved sample_entry_payload += u16.pack(1) # data reference index - if is_audio: + if stream_type == 'audio': sample_entry_payload += u32.pack(0) * 2 # 
reserved sample_entry_payload += u16.pack(params.get('channels', 2)) sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) @@ -127,7 +142,7 @@ def write_piff_header(stream, params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) - else: + elif stream_type == 'video': sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved sample_entry_payload += u32.pack(0) * 3 # pre defined @@ -155,6 +170,18 @@ def write_piff_header(stream, params): avcc_payload += pps sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry + else: + assert False + elif stream_type == 'text': + if fourcc == 'TTML': + sample_entry_payload += b'http://www.w3.org/ns/ttml\0' # namespace + sample_entry_payload += b'\0' # schema location + sample_entry_payload += b'\0' # auxilary mime types(??) + sample_entry_box = box(b'stpp', sample_entry_payload) + else: + assert False + else: + assert False stsd_payload += sample_entry_box stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box @@ -221,10 +248,13 @@ def real_download(self, filename, info_dict): self._prepare_and_start_frag_download(ctx) + extra_state = ctx.setdefault('extra_state', { + 'ism_track_written': False, + }) + fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - track_written = False frag_index = 0 for i, segment in enumerate(segments): frag_index += 1 @@ -236,11 +266,11 @@ def real_download(self, filename, info_dict): success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) if not success: return False - if not track_written: + if not extra_state['ism_track_written']: tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] 
write_piff_header(ctx['dest_stream'], info_dict['_download_params']) - track_written = True + extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) break except compat_urllib_error.HTTPError as err: diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index c2cec98452..4afde8f90e 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -86,18 +86,19 @@ def _real_extract(self, url): title = episode['titulo'] formats = [] + subtitles = {} for source in episode.get('sources', []): src = source.get('src') if not src: continue src_type = source.get('type') if src_type == 'application/vnd.apple.mpegurl': - formats.extend(self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats( src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) elif src_type == 'application/dash+xml': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) + formats, subtitles = self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False) self._sort_formats(formats) heartbeat = episode.get('heartbeat') or {} @@ -115,4 +116,5 @@ def _real_extract(self, url): 'channel': get_meta('channel'), 'season': get_meta('season'), 'episode_number': int_or_none(get_meta('episodeNumber')), + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/byutv.py b/yt_dlp/extractor/byutv.py index 0b11bf11fc..7c6c826d7c 100644 --- a/yt_dlp/extractor/byutv.py +++ b/yt_dlp/extractor/byutv.py @@ -82,6 +82,7 @@ def _real_extract(self, url): info = {} formats = [] + subtitles = {} for format_id, ep in video.items(): if not isinstance(ep, dict): continue @@ -90,12 +91,16 @@ def _real_extract(self, url): continue ext = determine_ext(video_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - 
m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_fmts) + subtitles = self._merge_subtitles(subtitles, mpd_subs) else: formats.append({ 'url': video_url, @@ -114,4 +119,5 @@ def _real_extract(self, url): 'display_id': display_id, 'title': display_id, 'formats': formats, + 'subtitles': subtitles, }) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index eefbab241b..1b7c1d2ff7 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -83,24 +83,31 @@ def _real_extract(self, url): description = data.get('description') formats = [] + subtitles = {} for target in data['targetUrls']: format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) if not format_url or not format_type: continue format_type = format_type.upper() if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False)) + m3u8_id=format_type, fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( format_url, video_id, f4m_id=format_type, fatal=False)) elif format_type == 'MPEG_DASH': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id=format_type, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) elif format_type == 'HSS': - 
formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) else: formats.append({ 'format_id': format_type, @@ -108,7 +115,6 @@ def _real_extract(self, url): }) self._sort_formats(formats) - subtitles = {} subtitle_urls = data.get('subtitleUrls') if isinstance(subtitle_urls, list): for subtitle in subtitle_urls: diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4487c53756..2ca25951b2 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1879,11 +1879,21 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m 'format_note': 'Quality selection URL', } - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, data=None, headers={}, - query={}): + def _extract_m3u8_formats(self, *args, **kwargs): + fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the HLS manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_m3u8_formats_and_subtitles( + self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', + preference=None, quality=None, m3u8_id=None, note=None, + errnote=None, fatal=True, live=False, data=None, headers={}, + query={}): + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1891,30 +1901,34 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} m3u8_doc, urlh = res m3u8_url = urlh.geturl() - return self._parse_m3u8_formats( + return 
self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, preference=preference, quality=quality, m3u8_id=m3u8_id, note=note, errnote=errnote, fatal=fatal, live=live, data=data, headers=headers, query=query, video_id=video_id) - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, - entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, live=False, note=None, errnote=None, - fatal=True, data=None, headers={}, query={}, video_id=None): + def _parse_m3u8_formats_and_subtitles( + self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', + preference=None, quality=None, m3u8_id=None, live=False, note=None, + errnote=None, fatal=True, data=None, headers={}, query={}, + video_id=None): + if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] + return [], {} if (not self._downloader.params.get('allow_unplayable_formats') and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay - return [] + return [], {} formats = [] + subtitles = {} + format_url = lambda u: ( u if re.match(r'^https?://', u) @@ -2001,7 +2015,7 @@ def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None } formats.append(f) - return formats + return formats, subtitles groups = {} last_stream_inf = {} @@ -2013,6 +2027,21 @@ def extract_media(x_media_line): if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + # + if media_type == 'SUBTITLES': + lang = media['LANGUAGE'] # XXX: normalise? 
+ url = format_url(media['URI']) + sub_info = { + 'url': url, + 'ext': determine_ext(url), + } + if sub_info['ext'] == 'm3u8': + # Per RFC 8216 §3.1, the only possible subtitle format m3u8 + # files may contain is WebVTT: + # + sub_info['ext'] = 'vtt' + sub_info['protocol'] = 'm3u8_native' + subtitles.setdefault(lang, []).append(sub_info) if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -2160,7 +2189,7 @@ def build_stream_name(): formats.append(http_f) last_stream_inf = {} - return formats + return formats, subtitles @staticmethod def _xpath_ns(path, namespace=None): @@ -2403,23 +2432,44 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + def _extract_mpd_formats(self, *args, **kwargs): + fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the DASH manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_mpd_formats_and_subtitles( + self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, + fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} mpd_doc, urlh = res if mpd_doc is None: - return [] + return [], {} mpd_base_url = base_url(urlh.geturl()) - return self._parse_mpd_formats( + return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + def _parse_mpd_formats(self, *args, **kwargs): + fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) + if 
subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the DASH manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _parse_mpd_formats_and_subtitles( + self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2429,7 +2479,7 @@ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None """ if not self._downloader.params.get('dynamic_mpd', True): if mpd_doc.get('type') == 'dynamic': - return [] + return [], {} namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) @@ -2501,6 +2551,7 @@ def extract_Initialization(source): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] + subtitles = {} for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2518,11 +2569,9 @@ def extract_Initialization(source): representation_attrib.update(representation.attrib) # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] - content_type = mime_type.split('/')[0] - if content_type == 'text': - # TODO implement WebVTT downloading - pass - elif content_type in ('video', 'audio'): + content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) + + if content_type in ('video', 'audio', 'text'): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) @@ -2539,21 +2588,28 @@ def extract_Initialization(source): url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) bandwidth = int_or_none(representation_attrib.get('bandwidth')) - f = { - 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else 
representation_id, - 'manifest_url': mpd_url, - 'ext': mimetype2ext(mime_type), - 'width': int_or_none(representation_attrib.get('width')), - 'height': int_or_none(representation_attrib.get('height')), - 'tbr': float_or_none(bandwidth, 1000), - 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), - 'fps': int_or_none(representation_attrib.get('frameRate')), - 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, - 'format_note': 'DASH %s' % content_type, - 'filesize': filesize, - 'container': mimetype2ext(mime_type) + '_dash', - } - f.update(parse_codecs(representation_attrib.get('codecs'))) + if content_type in ('video', 'audio'): + f = { + 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, + 'manifest_url': mpd_url, + 'ext': mimetype2ext(mime_type), + 'width': int_or_none(representation_attrib.get('width')), + 'height': int_or_none(representation_attrib.get('height')), + 'tbr': float_or_none(bandwidth, 1000), + 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), + 'fps': int_or_none(representation_attrib.get('frameRate')), + 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, + 'format_note': 'DASH %s' % content_type, + 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', + } + f.update(parse_codecs(representation_attrib.get('codecs'))) + elif content_type == 'text': + f = { + 'ext': mimetype2ext(mime_type), + 'manifest_url': mpd_url, + 'filesize': filesize, + } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): @@ -2700,26 +2756,38 @@ def add_segment_url(): else: # Assuming direct URL to unfragmented media. 
f['url'] = base_url - formats.append(f) + if content_type in ('video', 'audio'): + formats.append(f) + elif content_type == 'text': + subtitles.setdefault(lang or 'und', []).append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats + return formats, subtitles - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + def _extract_ism_formats(self, *args, **kwargs): + fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the ISM manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} ism_doc, urlh = res if ism_doc is None: - return [] + return [], {} - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) - def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ Parse formats from ISM manifest. 
References: @@ -2727,26 +2795,28 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): https://msdn.microsoft.com/en-us/library/ff469518.aspx """ if ism_doc.get('IsLive') == 'TRUE': - return [] + return [], {} if (not self._downloader.params.get('allow_unplayable_formats') and ism_doc.find('Protection') is not None): - return [] + return [], {} duration = int(ism_doc.attrib['Duration']) timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 formats = [] + subtitles = {} for stream in ism_doc.findall('StreamIndex'): stream_type = stream.get('Type') - if stream_type not in ('video', 'audio'): + if stream_type not in ('video', 'audio', 'text'): continue url_pattern = stream.attrib['Url'] stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') + stream_language = stream.get('Language', 'und') for track in stream.findall('QualityLevel'): fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL'): + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 @@ -2789,33 +2859,52 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): format_id.append(stream_name) format_id.append(compat_str(tbr)) - formats.append({ - 'format_id': '-'.join(format_id), - 'url': ism_url, - 'manifest_url': ism_url, - 'ext': 'ismv' if stream_type == 'video' else 'isma', - 'width': width, - 'height': height, - 'tbr': tbr, - 'asr': sampling_rate, - 'vcodec': 'none' if stream_type == 'audio' else fourcc, - 'acodec': 'none' if stream_type == 'video' else fourcc, - 'protocol': 'ism', - 'fragments': fragments, - '_download_params': { - 'duration': duration, - 'timescale': stream_timescale, - 'width': width or 0, - 'height': height or 0, - 'fourcc': fourcc, - 'codec_private_data': track.get('CodecPrivateData'), - 
'sampling_rate': sampling_rate, - 'channels': int_or_none(track.get('Channels', 2)), - 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), - 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), - }, - }) - return formats + if stream_type == 'text': + subtitles.setdefault(stream_language, []).append({ + 'ext': 'ismt', + 'protocol': 'ism', + 'url': ism_url, + 'manifest_url': ism_url, + 'fragments': fragments, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, + 'timescale': stream_timescale, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + } + }) + elif stream_type in ('video', 'audio'): + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, + 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats, subtitles def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): @@ -2940,7 +3029,16 @@ def _media_formats(src, cur_media_type, type_info={}): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id, 
hosts={}): + def _extract_akamai_formats(self, *args, **kwargs): + fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the manifests; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}): signed = 'hdnea=' in manifest_url if not signed: # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html @@ -2949,6 +3047,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): '', manifest_url).strip('?') formats = [] + subtitles = {} hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') @@ -2967,10 +3066,11 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): hls_host = hosts.get('hls') if hls_host: m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, m3u8_subtitles) http_host = hosts.get('http') if http_host and m3u8_formats and not signed: @@ -2994,7 +3094,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats.append(http_f) i += 1 - return formats + return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): query = compat_urlparse.urlparse(url).query @@ -3319,12 +3419,22 @@ def _merge_subtitle_items(subtitle_list1, subtitle_list2): return ret @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language. 
""" - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret + def _merge_subtitles(cls, *dicts, **kwargs): + """ Merge subtitle dictionaries, language by language. """ + + target = (lambda target=None: target)(**kwargs) + # The above lambda extracts the keyword argument 'target' from kwargs + # while ensuring there are no stray ones. When Python 2 support + # is dropped, remove it and change the function signature to: + # + # def _merge_subtitles(cls, *dicts, target=None): + + if target is None: + target = {} + for d in dicts: + for lang, subs in d.items(): + target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs) + return target def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py index 3647c0a9c3..eefba4e242 100644 --- a/yt_dlp/extractor/elonet.py +++ b/yt_dlp/extractor/elonet.py @@ -1,9 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re -import tempfile from .common import InfoExtractor from ..utils import ( @@ -12,12 +10,12 @@ try_get, ) from ..compat import compat_str -from ..downloader.hls import HlsFD class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P[0-9]+)' - _TEST = { + _TESTS = [{ + # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { @@ -27,62 +25,17 @@ class ElonetIE(InfoExtractor): 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', }, - } - - def _download_m3u8_chunked_subtitle(self, chunklist_url): - """ 
- Download VTT subtitles from pieces in manifest URL. - Return a string containing joined chunks with extra headers removed. - """ - with tempfile.NamedTemporaryFile(delete=True) as outfile: - fname = outfile.name - hlsdl = HlsFD(self._downloader, {}) - hlsdl.download(compat_str(fname), {"url": chunklist_url}) - with open(fname, 'r') as fin: - # Remove (some) headers - fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read()) - os.remove(fname) - return "WEBVTT\n\n" + fdata - - def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url): - """ - Parse subtitles from HLS / m3u8 manifest. - """ - subtitles = {} - baseurl = m3u8_url[:m3u8_url.rindex('/') + 1] - for line in m3u8_doc.split('\n'): - if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line: - lang = self._search_regex( - r'LANGUAGE="(.+?)"', line, 'lang', default=False) - uri = self._search_regex( - r'URI="(.+?)"', line, 'uri', default=False) - if lang and uri: - data = self._download_m3u8_chunked_subtitle(baseurl + uri) - subtitles[lang] = [{'ext': 'vtt', 'data': data}] - return subtitles - - def _parse_mpd_subtitles(self, mpd_doc): - """ - Parse subtitles from MPD manifest. - """ - ns = '{urn:mpeg:dash:schema:mpd:2011}' - subtitles = {} - for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)): - lang = aset.attrib.get('lang', 'unk') - url = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns)).text - subtitles[lang] = [{'ext': 'vtt', 'url': url}] - return subtitles - - def _get_subtitles(self, fmt, doc, url): - if fmt == 'm3u8': - subs = self._parse_m3u8_subtitles(doc, url) - elif fmt == 'mpd': - subs = self._parse_mpd_subtitles(doc) - else: - self.report_warning( - "Cannot download subtitles from '%s' streams." 
% (fmt)) - subs = {} - return subs + }, { + # DASH with subtitles + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', + 'info_dict': { + 'id': '116539', + 'ext': 'mp4', + 'title': 'Minulla on tiikeri', + 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', + 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', + } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -101,8 +54,8 @@ def _real_extract(self, url): self._parse_json(json_s, video_id), lambda x: x[0]["src"], compat_str) formats = [] + subtitles = {} if re.search(r'\.m3u8\??', src): - fmt = 'm3u8' res = self._download_webpage_handle( # elonet servers have certificate problems src.replace('https:', 'http:'), video_id, @@ -111,11 +64,10 @@ def _real_extract(self, url): if res: doc, urlh = res url = urlh.geturl() - formats = self._parse_m3u8_formats(doc, url) + formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) for f in formats: f['ext'] = 'mp4' elif re.search(r'\.mpd\??', src): - fmt = 'mpd' res = self._download_xml_handle( src, video_id, note='Downloading MPD manifest', @@ -123,7 +75,7 @@ def _real_extract(self, url): if res: doc, urlh = res url = base_url(urlh.geturl()) - formats = self._parse_mpd_formats(doc, mpd_base_url=url) + formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) else: raise ExtractorError("Unknown streaming format") @@ -133,5 +85,5 @@ def _real_extract(self, url): 'description': description, 'thumbnail': thumbnail, 'formats': formats, - 'subtitles': self.extract_subtitles(fmt, doc, url), + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 313de343ef..e57e165fc9 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -151,6 +151,7 @@ def 
sign(manifest_url, manifest_id): videos.append(fallback_info['video']) formats = [] + subtitles = {} for video in videos: video_url = video.get('url') if not video_url: @@ -171,10 +172,12 @@ def sign(manifest_url, manifest_id): sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) + fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) @@ -199,13 +202,12 @@ def sign(manifest_url, manifest_id): title += ' - %s' % subtitle title = title.strip() - subtitles = {} - subtitles_list = [{ - 'url': subformat['url'], - 'ext': subformat.get('format'), - } for subformat in info.get('subtitles', []) if subformat.get('url')] - if subtitles_list: - subtitles['fr'] = subtitles_list + subtitles.setdefault('fr', []).extend( + [{ + 'url': subformat['url'], + 'ext': subformat.get('format'), + } for subformat in info.get('subtitles', []) if subformat.get('url')] + ) return { 'id': video_id, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 4250d10932..32815476fa 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2444,8 +2444,9 @@ def _real_extract(self, url): m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: format_id = compat_str(m.group('format_id')) + subtitles = {} if format_id.endswith('mpegurl'): - formats = self._extract_m3u8_formats(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') elif format_id == 'f4m': formats = 
self._extract_f4m_formats(url, video_id) else: @@ -2457,6 +2458,7 @@ def _real_extract(self, url): info_dict['direct'] = True self._sort_formats(formats) info_dict['formats'] = formats + info_dict['subtitles'] = subtitles return info_dict if not self._downloader.params.get('test', False) and not is_intentional: @@ -2510,7 +2512,7 @@ def _real_extract(self, url): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': - info_dict['formats'] = self._parse_ism_formats(doc, url) + info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -2524,7 +2526,7 @@ def _real_extract(self, url): xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( + info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 1f03a9462d..99964737d8 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -46,6 +46,7 @@ def get_file_size(file_size): urls = [] formats = [] + subtitles = {} for video in video_data.get('renditions', []): video_url = video.get('url') format_id = video.get('type') @@ -54,9 +55,11 @@ def get_file_size(file_size): urls.append(video_url) ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id or 'hls', fatal=False)) + m3u8_id=format_id or 'hls', fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) elif ext == 'mpd': continue # 
formats.extend(self._extract_mpd_formats( @@ -96,6 +99,7 @@ def get_file_size(file_size): 'uploader': video_data.get('byline'), 'duration': float_or_none(video_data.get('duration'), 1000), 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, } diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py index 0724cef268..2c815bda63 100644 --- a/yt_dlp/extractor/roosterteeth.py +++ b/yt_dlp/extractor/roosterteeth.py @@ -103,7 +103,7 @@ def _real_extract(self, url): api_episode_url + '/videos', display_id, 'Downloading video JSON metadata')['data'][0] m3u8_url = video_data['attributes']['url'] - subtitle_m3u8_url = video_data['links']['download'] + # XXX: additional URL at video_data['links']['download'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: @@ -111,7 +111,7 @@ def _real_extract(self, url): '%s is only available for FIRST members' % display_id) raise - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) @@ -134,33 +134,6 @@ def _real_extract(self, url): 'url': img_url, }) - subtitles = {} - res = self._download_webpage_handle( - subtitle_m3u8_url, display_id, - 'Downloading m3u8 information', - 'Failed to download m3u8 information', - fatal=True, data=None, headers={}, query={}) - if res is not False: - subtitle_m3u8_doc, _ = res - for line in subtitle_m3u8_doc.split('\n'): - if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line: - parts = line.split(',') - for part in parts: - if 'LANGUAGE' in part: - lang = part[part.index('=') + 2:-1] - elif 'URI' in part: - uri = part[part.index('=') + 2:-1] - res = self._download_webpage_handle( - uri, display_id, - 'Downloading m3u8 information', - 'Failed to download m3u8 information', - fatal=True, data=None, headers={}, query={}) 
- doc, _ = res - for l in doc.split('\n'): - if not l.startswith('#'): - subtitles[lang] = [{'url': uri[:-uri[::-1].index('/')] + l}] - break - return { 'id': video_id, 'display_id': display_id, diff --git a/yt_dlp/extractor/srgssr.py b/yt_dlp/extractor/srgssr.py index ac018e7405..2977b5e670 100644 --- a/yt_dlp/extractor/srgssr.py +++ b/yt_dlp/extractor/srgssr.py @@ -87,6 +87,7 @@ def _real_extract(self, url): title = media_data['title'] formats = [] + subtitles = {} q = qualities(['SD', 'HD']) for source in (media_data.get('resourceList') or []): format_url = source.get('url') @@ -104,12 +105,16 @@ def _real_extract(self, url): if source.get('tokenType') == 'AKAMAI': format_url = self._get_tokenized_src( format_url, media_id, format_id) - formats.extend(self._extract_akamai_formats( - format_url, media_id)) + fmts, subs = self._extract_akamai_formats_and_subtitles( + format_url, media_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) elif protocol == 'HLS': - formats.extend(self._extract_m3u8_formats( + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( format_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + m3u8_id=format_id, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) elif protocol in ('HTTP', 'HTTPS'): formats.append({ 'format_id': format_id, @@ -133,7 +138,6 @@ def _real_extract(self, url): }) self._sort_formats(formats) - subtitles = {} if media_type == 'video': for sub in (media_data.get('subtitleList') or []): sub_url = sub.get('url') diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py index 5eaa991eb5..bb7610352d 100644 --- a/yt_dlp/extractor/threeqsdn.py +++ b/yt_dlp/extractor/threeqsdn.py @@ -99,16 +99,21 @@ def _real_extract(self, url): aspect = float_or_none(config.get('aspect')) formats = [] + subtitles = {} for source_type, source in (config.get('sources') or {}).items(): if not source: continue if 
source_type == 'dash': - formats.extend(self._extract_mpd_formats( - source, video_id, mpd_id='mpd', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + source, video_id, mpd_id='mpd', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) elif source_type == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) elif source_type == 'progressive': for s in source: src = s.get('src') @@ -138,7 +143,6 @@ def _real_extract(self, url): # behaviour is being kept as-is self._sort_formats(formats, ('res', 'source_preference')) - subtitles = {} for subtitle in (config.get('subtitles') or []): src = subtitle.get('src') if not src: diff --git a/yt_dlp/extractor/tv4.py b/yt_dlp/extractor/tv4.py index b8ad4fafc4..4043e63662 100644 --- a/yt_dlp/extractor/tv4.py +++ b/yt_dlp/extractor/tv4.py @@ -93,18 +93,31 @@ def _real_extract(self, url): 'device': 'browser', 'protocol': 'hls', })['playbackItem']['manifestUrl'] - formats = self._extract_m3u8_formats( + formats = [] + subtitles = {} + + fmts, subs = self._extract_m3u8_formats_and_subtitles( manifest_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + fmts, subs = self._extract_mpd_formats_and_subtitles( manifest_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_f4m_formats( + video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + fmts = self._extract_f4m_formats( manifest_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - 
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
    """
    Download the VMAP XML at vmap_url and gather formats and subtitles.

    Every videoVariant element has its 'url' attribute URL-unquoted in
    place and is passed to _extract_variant_formats. The MediaFile URL is
    processed the same way unless a variant already carried it.

    Returns a (formats, subtitles) tuple.
    """
    vmap_data = self._download_xml(vmap_url, video_id)

    formats = []
    subtitles = {}
    seen_urls = []

    variant_path = './/{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'
    for variant_node in vmap_data.findall(variant_path):
        attrs = variant_node.attrib
        # The attribute itself is rewritten so that the unquoted URL is
        # what _extract_variant_formats receives.
        attrs['url'] = compat_urllib_parse_unquote(attrs['url'])
        seen_urls.append(attrs['url'])
        variant_formats, variant_subs = self._extract_variant_formats(
            attrs, video_id)
        formats.extend(variant_formats)
        subtitles = self._merge_subtitles(subtitles, variant_subs)

    media_file_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
    if media_file_url in seen_urls:
        return formats, subtitles

    extra_formats, extra_subs = self._extract_variant_formats(
        {'url': media_file_url}, video_id)
    formats.extend(extra_formats)
    subtitles = self._merge_subtitles(subtitles, extra_subs)
    return formats, subtitles
b/yt_dlp/extractor/uplynk.py index f06bf5b127..c0dba0a6ad 100644 --- a/yt_dlp/extractor/uplynk.py +++ b/yt_dlp/extractor/uplynk.py @@ -30,7 +30,7 @@ class UplynkIE(InfoExtractor): def _extract_uplynk_info(self, uplynk_content_url): path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() display_id = video_id or external_id - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( 'http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4', 'm3u8_native') if session_id: @@ -48,6 +48,7 @@ def _extract_uplynk_info(self, uplynk_content_url): 'duration': float_or_none(asset.get('duration')), 'uploader_id': asset.get('owner'), 'formats': formats, + 'subtitles': subtitles, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py index 05dcc1f17e..0f1d08da35 100644 --- a/yt_dlp/extractor/wat.py +++ b/yt_dlp/extractor/wat.py @@ -69,19 +69,24 @@ def _real_extract(self, url): title = video_info['title'] formats = [] + subtitles = {} def extract_formats(manifest_urls): for f, f_url in manifest_urls.items(): if not f_url: continue if f in ('dash', 'mpd'): - formats.extend(self._extract_mpd_formats( + fmts, subs = self._extract_mpd_formats_and_subtitles( f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) + video_id, mpd_id='dash', fatal=False) elif f == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( f_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) + 'm3u8_native', m3u8_id='hls', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) delivery = video_data.get('delivery') or {} extract_formats({delivery.get('format'): delivery.get('url')}) @@ -103,4 +108,5 @@ def extract_formats(manifest_urls): video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), 'duration': 
def bug_reports_message(before=';'):
    """
    Build the "please report this issue" hint appended to error messages.

    'before' is the text that precedes the hint. Trailing whitespace is
    stripped from it and a single space joins it to the hint. When
    'before' is empty or ends a sentence ('.', '!' or '?'), the hint
    starts with a capital letter.
    """
    update_cmd = (
        'type yt-dlp -U to update' if ytdl_is_updateable()
        else 'see https://github.com/yt-dlp/yt-dlp on how to update')

    msg = ''.join((
        'please report this issue on https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    prefix = before.rstrip()
    starts_new_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_new_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
+ """ + + def __init__(self, string): + self._data = string + self._pos = 0 + + def match(self, r): + if isinstance(r, compat_Pattern): + return r.match(self._data, self._pos) + if isinstance(r, str): + if self._data.startswith(r, self._pos): + return len(r) + return None + raise ValueError(r) + + def advance(self, by): + if by is None: + amt = 0 + elif isinstance(by, compat_Match): + amt = len(by.group(0)) + elif isinstance(by, str): + amt = len(by) + elif isinstance(by, int): + amt = by + else: + raise ValueError(by) + self._pos += amt + return by + + def consume(self, r): + return self.advance(self.match(r)) + + def child(self): + return _MatchChildParser(self) + + +class _MatchChildParser(_MatchParser): + """ + A child parser state, which advances through the same data as + its parent, but has an independent position. This is useful when + advancing through syntax elements we might later want to backtrack + from. + """ + + def __init__(self, parent): + super(_MatchChildParser, self).__init__(parent._data) + self.__parent = parent + self._pos = parent._pos + + def commit(self): + """ + Advance the parent state to the current position of this child state. + """ + self.__parent._pos = self._pos + return self.__parent + + +class ParseError(Exception): + def __init__(self, parser): + super(ParseError, self).__init__("Parse error at position %u (near %r)" % ( + parser._pos, parser._data[parser._pos:parser._pos + 20] + )) + + +_REGEX_TS = re.compile(r'''(?x) + (?:([0-9]{2,}):)? + ([0-9]{2}): + ([0-9]{2})\. + ([0-9]{3})? +''') +_REGEX_EOF = re.compile(r'\Z') +_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])') +_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+') + + +def _parse_ts(ts): + """ + Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS) + into an MPEG PES timestamp: a tick counter at 90 kHz resolution. 
+ """ + + h, min, s, ms = ts.groups() + return 90 * ( + int(h or 0) * 3600000 + # noqa: W504,E221,E222 + int(min) * 60000 + # noqa: W504,E221,E222 + int(s) * 1000 + # noqa: W504,E221,E222 + int(ms) # noqa: W504,E221,E222 + ) + + +def _format_ts(ts): + """ + Convert an MPEG PES timestamp into a WebVTT timestamp. + This will lose sub-millisecond precision. + """ + + ts = int((ts + 45) // 90) + ms , ts = divmod(ts, 1000) # noqa: W504,E221,E222,E203 + s , ts = divmod(ts, 60) # noqa: W504,E221,E222,E203 + min, h = divmod(ts, 60) # noqa: W504,E221,E222 + return '%02u:%02u:%02u.%03u' % (h, min, s, ms) + + +class Block(object): + """ + An abstract WebVTT block. + """ + + def __init__(self, **kwargs): + for key, val in kwargs.items(): + setattr(self, key, val) + + @classmethod + def parse(cls, parser): + m = parser.match(cls._REGEX) + if not m: + return None + parser.advance(m) + return cls(raw=m.group(0)) + + def write_into(self, stream): + stream.write(self.raw) + + +class HeaderBlock(Block): + """ + A WebVTT block that may only appear in the header part of the file, + i.e. before any cue blocks. + """ + + pass + + +class Magic(HeaderBlock): + _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])') + + # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5 + # , but the RFC + # doesn’t specify the exact grammar nor where in the WebVTT + # syntax it should be placed; the below has been devised based + # on usage in the wild + # + # And strictly speaking, the presence of this extension violates + # the W3C WebVTT spec. Oh well. 
+ + _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=') + _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:') + _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') + + @classmethod + def __parse_tsmap(cls, parser): + parser = parser.child() + + while True: + m = parser.consume(cls._REGEX_TSMAP_LOCAL) + if m: + m = parser.consume(_REGEX_TS) + if m is None: + raise ParseError(parser) + local = _parse_ts(m) + if local is None: + raise ParseError(parser) + else: + m = parser.consume(cls._REGEX_TSMAP_MPEGTS) + if m: + mpegts = int_or_none(m.group(1)) + if mpegts is None: + raise ParseError(parser) + else: + raise ParseError(parser) + if parser.consume(','): + continue + if parser.consume(_REGEX_NL): + break + raise ParseError(parser) + + parser.commit() + return local, mpegts + + @classmethod + def parse(cls, parser): + parser = parser.child() + + m = parser.consume(cls._REGEX) + if not m: + raise ParseError(parser) + + extra = m.group(1) + local, mpegts = None, None + if parser.consume(cls._REGEX_TSMAP): + local, mpegts = cls.__parse_tsmap(parser) + if not parser.consume(_REGEX_NL): + raise ParseError(parser) + parser.commit() + return cls(extra=extra, mpegts=mpegts, local=local) + + def write_into(self, stream): + stream.write('WEBVTT') + if self.extra is not None: + stream.write(self.extra) + stream.write('\n') + if self.local or self.mpegts: + stream.write('X-TIMESTAMP-MAP=LOCAL:') + stream.write(_format_ts(self.local if self.local is not None else 0)) + stream.write(',MPEGTS:') + stream.write(str(self.mpegts if self.mpegts is not None else 0)) + stream.write('\n') + stream.write('\n') + + +class StyleBlock(HeaderBlock): + _REGEX = re.compile(r'''(?x) + STYLE[\ \t]*(?:\r\n|[\r\n]) + ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))* + (?:\r\n|[\r\n]) + ''') + + +class RegionBlock(HeaderBlock): + _REGEX = re.compile(r'''(?x) + REGION[\ \t]* + ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))* + (?:\r\n|[\r\n]) + ''') + + +class CommentBlock(Block): + _REGEX = re.compile(r'''(?x) + NOTE(?:\r\n|[\ \t\r\n]) 
+ ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))* + (?:\r\n|[\r\n]) + ''') + + +class CueBlock(Block): + """ + A cue block. The payload is not interpreted. + """ + + _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])') + _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+') + _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)') + _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?') + + @classmethod + def parse(cls, parser): + parser = parser.child() + + id = None + m = parser.consume(cls._REGEX_ID) + if m: + id = m.group(1) + + m0 = parser.consume(_REGEX_TS) + if not m0: + return None + if not parser.consume(cls._REGEX_ARROW): + return None + m1 = parser.consume(_REGEX_TS) + if not m1: + return None + m2 = parser.consume(cls._REGEX_SETTINGS) + if not parser.consume(_REGEX_NL): + return None + + start = _parse_ts(m0) + end = _parse_ts(m1) + settings = m2.group(1) if m2 is not None else None + + text = io.StringIO() + while True: + m = parser.consume(cls._REGEX_PAYLOAD) + if not m: + break + text.write(m.group(0)) + + parser.commit() + return cls( + id=id, + start=start, end=end, settings=settings, + text=text.getvalue() + ) + + def write_into(self, stream): + if self.id is not None: + stream.write(self.id) + stream.write('\n') + stream.write(_format_ts(self.start)) + stream.write(' --> ') + stream.write(_format_ts(self.end)) + if self.settings is not None: + stream.write(' ') + stream.write(self.settings) + stream.write('\n') + stream.write(self.text) + stream.write('\n') + + @property + def as_json(self): + return { + 'id': self.id, + 'start': self.start, + 'end': self.end, + 'text': self.text, + 'settings': self.settings, + } + + +def parse_fragment(frag_content): + """ + A generator that yields (partially) parsed WebVTT blocks when given + a bytes object containing the raw contents of a WebVTT file. 
+ """ + + parser = _MatchParser(frag_content.decode('utf-8')) + + yield Magic.parse(parser) + + while not parser.match(_REGEX_EOF): + if parser.consume(_REGEX_BLANK): + continue + + block = RegionBlock.parse(parser) + if block: + yield block + continue + block = StyleBlock.parse(parser) + if block: + yield block + continue + block = CommentBlock.parse(parser) + if block: + yield block # XXX: or skip + continue + + break + + while not parser.match(_REGEX_EOF): + if parser.consume(_REGEX_BLANK): + continue + + block = CommentBlock.parse(parser) + if block: + yield block # XXX: or skip + continue + block = CueBlock.parse(parser) + if block: + yield block + continue + + raise ParseError(parser)