(r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)',
- r'var\s+cloudId\s*=\s*["\'](\d+)',
+ r'cloudId\s*=\s*["\'](\d+)',
r'class="video-preview current_playing" id="(\d+)"'),
webpage, 'video id')
@@ -90,21 +90,40 @@ class TvigleIE(InfoExtractor):
age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
- for vcodec, fmts in item['videos'].items():
+ for vcodec, url_or_fmts in item['videos'].items():
if vcodec == 'hls':
- continue
- for format_id, video_url in fmts.items():
- if format_id == 'm3u8':
+ m3u8_url = url_or_none(url_or_fmts)
+ if not m3u8_url:
continue
- height = self._search_regex(
- r'^(\d+)[pP]$', format_id, 'height', default=None)
- formats.append({
- 'url': video_url,
- 'format_id': '%s-%s' % (vcodec, format_id),
- 'vcodec': vcodec,
- 'height': int_or_none(height),
- 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
- })
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif vcodec == 'dash':
+ mpd_url = url_or_none(url_or_fmts)
+ if not mpd_url:
+ continue
+ formats.extend(self._extract_mpd_formats(
+ mpd_url, video_id, mpd_id='dash', fatal=False))
+ else:
+ if not isinstance(url_or_fmts, dict):
+ continue
+ for format_id, video_url in url_or_fmts.items():
+ if format_id == 'm3u8':
+ continue
+ video_url = url_or_none(video_url)
+ if not video_url:
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
+ filesize = int_or_none(try_get(
+ item, lambda x: x['video_files_size'][vcodec][format_id]))
+ formats.append({
+ 'url': video_url,
+ 'format_id': '%s-%s' % (vcodec, format_id),
+ 'vcodec': vcodec,
+ 'height': int_or_none(height),
+ 'filesize': filesize,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py
index 6590e1fd0..de0fb5063 100644
--- a/youtube_dl/extractor/tvn24.py
+++ b/youtube_dl/extractor/tvn24.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ NO_DEFAULT,
unescapeHTML,
)
@@ -17,9 +18,21 @@ class TVN24IE(InfoExtractor):
'id': '1584444',
'ext': 'mp4',
'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"',
- 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości "Szkła kontaktowego".',
+ 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości Szkła kontaktowego.',
'thumbnail': 're:https?://.*[.]jpeg',
}
+ }, {
+ # different layout
+ 'url': 'https://tvnmeteo.tvn24.pl/magazyny/maja-w-ogrodzie,13/odcinki-online,1,4,1,0/pnacza-ptaki-i-iglaki-odc-691-hgtv-odc-29,1771763.html',
+ 'info_dict': {
+ 'id': '1771763',
+ 'ext': 'mp4',
+ 'title': 'Pnącza, ptaki i iglaki (odc. 691 /HGTV odc. 29)',
+ 'thumbnail': 're:https?://.*',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html',
'only_matching': True,
@@ -35,18 +48,21 @@ class TVN24IE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ display_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id)
- title = self._og_search_title(webpage)
+ title = self._og_search_title(
+ webpage, default=None) or self._search_regex(
+ r'<h1[^>]+class=["\']magazineItemHeader[^>]+>(.+?)</h',
+ webpage, 'title')
- def extract_json(attr, name, fatal=True):
+ def extract_json(attr, name, default=NO_DEFAULT, fatal=True):
return self._parse_json(
self._search_regex(
r'\b%s=(["\'])(?P<json>(?!\1).+?)\1' % attr, webpage,
- name, group='json', fatal=fatal) or '{}',
- video_id, transform_source=unescapeHTML, fatal=fatal)
+ name, group='json', default=default, fatal=fatal) or '{}',
+ display_id, transform_source=unescapeHTML, fatal=fatal)
quality_data = extract_json('data-quality', 'formats')
@@ -59,16 +75,24 @@ class TVN24IE(InfoExtractor):
})
self._sort_formats(formats)
- description = self._og_search_description(webpage)
+ description = self._og_search_description(webpage, default=None)
thumbnail = self._og_search_thumbnail(
webpage, default=None) or self._html_search_regex(
r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage,
'thumbnail', group='url')
+ video_id = None
+
share_params = extract_json(
- 'data-share-params', 'share params', fatal=False)
+ 'data-share-params', 'share params', default=None)
if isinstance(share_params, dict):
- video_id = share_params.get('id') or video_id
+ video_id = share_params.get('id')
+
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-vid-id=["\'](\d+)', webpage, 'video id',
+ default=None) or self._search_regex(
+ r',(\d+)\.html', url, 'video id', default=display_id)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 0500e33a6..8c0d70010 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -248,7 +248,7 @@ class TwitchVodIE(TwitchItemBaseIE):
https?://
(?:
(?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/|
- player\.twitch\.tv/\?.*?\bvideo=v
+ player\.twitch\.tv/\?.*?\bvideo=v?
)
(?P<id>\d+)
'''
@@ -306,6 +306,9 @@ class TwitchVodIE(TwitchItemBaseIE):
}, {
'url': 'https://www.twitch.tv/northernlion/video/291940395',
'only_matching': True,
+ }, {
+ 'url': 'https://player.twitch.tv/?video=480452374',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -341,9 +344,8 @@ class TwitchVodIE(TwitchItemBaseIE):
info['subtitles'] = {
'rechat': [{
'url': update_url_query(
- 'https://rechat.twitch.tv/rechat-messages', {
- 'video_id': 'v%s' % item_id,
- 'start': info['timestamp'],
+ 'https://api.twitch.tv/v5/videos/%s/comments' % item_id, {
+ 'client_id': self._CLIENT_ID,
}),
'ext': 'json',
}],
@@ -641,7 +643,7 @@ class TwitchStreamIE(TwitchBaseIE):
class TwitchClipsIE(TwitchBaseIE):
IE_NAME = 'twitch:clips'
- _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat',
@@ -664,6 +666,9 @@ class TwitchClipsIE(TwitchBaseIE):
}, {
'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan',
'only_matching': True,
+ }, {
+ 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index cebb6238c..5f8d90fb4 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -4,32 +4,67 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_HTTPError,
+ compat_parse_qs,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
- determine_ext,
dict_get,
ExtractorError,
float_or_none,
int_or_none,
- remove_end,
try_get,
+ strip_or_none,
+ unified_timestamp,
+ update_url_query,
xpath_text,
)
-from .periscope import PeriscopeIE
+from .periscope import (
+ PeriscopeBaseIE,
+ PeriscopeIE,
+)
class TwitterBaseIE(InfoExtractor):
+ _API_BASE = 'https://api.twitter.com/1.1/'
+ _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/'
+ _GUEST_TOKEN = None
+
+ def _extract_variant_formats(self, variant, video_id):
+ variant_url = variant.get('url')
+ if not variant_url:
+ return []
+ elif '.m3u8' in variant_url:
+ return self._extract_m3u8_formats(
+ variant_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ else:
+ tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None
+ f = {
+ 'url': variant_url,
+ 'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+ 'tbr': tbr,
+ }
+ self._search_dimensions_in_video_url(f, variant_url)
+ return [f]
+
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id)
- video_url = xpath_text(vmap_data, './/MediaFile').strip()
- if determine_ext(video_url) == 'm3u8':
- return self._extract_m3u8_formats(
- video_url, video_id, ext='mp4', m3u8_id='hls',
- entry_protocol='m3u8_native')
- return [{
- 'url': video_url,
- }]
+ formats = []
+ urls = []
+ for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
+ video_variant.attrib['url'] = compat_urllib_parse_unquote(
+ video_variant.attrib['url'])
+ urls.append(video_variant.attrib['url'])
+ formats.extend(self._extract_variant_formats(
+ video_variant.attrib, video_id))
+ video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
+ if video_url not in urls:
+ formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
+ return formats
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
@@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor):
'height': int(m.group('height')),
})
+ def _call_api(self, path, video_id, query={}):
+ headers = {
+ 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
+ }
+ if not self._GUEST_TOKEN:
+ self._GUEST_TOKEN = self._download_json(
+ self._API_BASE + 'guest/activate.json', video_id,
+ 'Downloading guest token', data=b'',
+ headers=headers)['guest_token']
+ headers['x-guest-token'] = self._GUEST_TOKEN
+ try:
+ return self._download_json(
+ self._API_BASE + path, video_id, headers=headers, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ raise ExtractorError(self._parse_json(
+ e.cause.read().decode(),
+ video_id)['errors'][0]['message'], expected=True)
+ raise
-class TwitterCardIE(TwitterBaseIE):
+
+class TwitterCardIE(InfoExtractor):
IE_NAME = 'twitter:card'
- _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
@@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '560070183650213889',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.",
+ 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96',
+ 'uploader': 'Twitter',
+ 'uploader_id': 'Twitter',
+ 'thumbnail': r're:^https?://.*\.jpg',
'duration': 30.033,
+ 'timestamp': 1422366112,
+ 'upload_date': '20150127',
},
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
- 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+ 'md5': '7137eca597f72b9abbe61e5ae0161399',
'info_dict': {
'id': '623160978427936768',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*$',
+ 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.",
+ 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA",
+ 'uploader': 'NASA',
+ 'uploader_id': 'NASA',
+ 'timestamp': 1437408129,
+ 'upload_date': '20150720',
},
},
{
@@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE):
'title': 'Ubuntu 11.10 Overview',
'description': 'md5:a831e97fa384863d6e26ce48d1c43376',
'upload_date': '20111013',
- 'uploader': 'OMG! Ubuntu!',
+ 'uploader': 'OMG! UBUNTU!',
'uploader_id': 'omgubuntu',
},
'add_ie': ['Youtube'],
@@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '705235433198714880',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*',
+ 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.",
+ 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns",
+ 'uploader': 'Brent Yarina',
+ 'uploader_id': 'BTNBrentYarina',
+ 'timestamp': 1456976204,
+ 'upload_date': '20160303',
},
+ 'skip': 'This content is no longer available.',
}, {
'url': 'https://twitter.com/i/videos/752274308186120192',
'only_matching': True,
},
]
- _API_BASE = 'https://api.twitter.com/1.1'
-
- def _parse_media_info(self, media_info, video_id):
- formats = []
- for media_variant in media_info.get('variants', []):
- media_url = media_variant['url']
- if media_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
- elif media_url.endswith('.mpd'):
- formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
- else:
- tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)
- a_format = {
- 'url': media_url,
- 'format_id': 'http-%d' % tbr if tbr else 'http',
- 'tbr': tbr,
- }
- # Reported bitRate may be zero
- if not a_format['tbr']:
- del a_format['tbr']
-
- self._search_dimensions_in_video_url(a_format, media_url)
-
- formats.append(a_format)
- return formats
-
- def _extract_mobile_formats(self, username, video_id):
- webpage = self._download_webpage(
- 'https://mobile.twitter.com/%s/status/%s' % (username, video_id),
- video_id, 'Downloading mobile webpage',
- headers={
- # A recent mobile UA is necessary for `gt` cookie
- 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0',
- })
- main_script_url = self._html_search_regex(
- r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)',
- webpage, 'videoplayer applet', default=None)
- if config_json:
- config = self._parse_json(config_json, display_id, fatal=False)
- if config:
- sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
- if sapi and 'query' in sapi:
- info = self._extract_info(display_id, sapi, webpage)
- self._sort_formats(info['formats'])
- return info
-
- items_json = self._search_regex(
- r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
- default=None)
- if items_json is None:
- alias = self._search_regex(
- r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None)
- if alias is not None:
- alias_info = self._download_json(
- 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias,
- display_id, 'Downloading alias info')
- video_id = alias_info[0]['id']
- else:
- CONTENT_ID_REGEXES = [
- r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
- r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
- r'"first_videoid"\s*:\s*"([^"]+)"',
- r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
- r'<article[^>]data-uuid=["\']([^"\']+)',
- r'<meta[^>]+yahoo://article/view\?.*\buuid=([^&"\']+)',
- r'<meta[^>]+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']',
- ]
- video_id = self._search_regex(
- CONTENT_ID_REGEXES, webpage, 'content ID')
+ url, country, display_id = re.match(self._VALID_URL, url).groups()
+ if not country:
+ country = 'us'
else:
- items = json.loads(items_json)
- info = items['mediaItems']['query']['results']['mediaObj'][0]
- # The 'meta' field is not always in the video webpage, we request it
- # from another page
- video_id = info['id']
- return self._get_info(video_id, display_id, webpage)
+ country = country.split('-')[0]
+ api_base = 'https://%s.yahoo.com/_td/api/resource/' % country
- def _extract_info(self, display_id, query, webpage):
- info = query['query']['results']['mediaObj'][0]
- meta = info.get('meta')
- video_id = info.get('id')
+ for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]):
+ content = self._download_json(
+ api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid,
+ display_id, 'Downloading content JSON metadata', fatal=i == 1)
+ if content:
+ item = content['items'][0]
+ break
- if not meta:
- msg = info['status'].get('msg')
- if msg:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
- raise ExtractorError('Unable to extract media object meta')
+ if item.get('type') != 'video':
+ entries = []
+ cover = item.get('cover') or {}
+ if cover.get('type') == 'yvideo':
+ cover_url = cover.get('url')
+ if cover_url:
+ entries.append(self.url_result(
+ cover_url, 'Yahoo', cover.get('uuid')))
+
+ for e in item.get('body', []):
+ if e.get('type') == 'videoIframe':
+ iframe_url = e.get('url')
+ if not iframe_url:
+ continue
+ entries.append(self.url_result(iframe_url))
+
+ return self.playlist_result(
+ entries, item.get('uuid'),
+ item.get('title'), item.get('summary'))
+
+ video_id = item['uuid']
+ video = self._download_json(
+ api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id,
+ video_id, 'Downloading video JSON metadata')[0]
+ title = video['title']
+
+ if country == 'malaysia':
+ country = 'my'
+
+ is_live = video.get('live_state') == 'live'
+ fmts = ('m3u8',) if is_live else ('webm', 'mp4')
+
+ urls = []
formats = []
- for s in info['streams']:
- tbr = int_or_none(s.get('bitrate'))
- format_info = {
- 'width': int_or_none(s.get('width')),
- 'height': int_or_none(s.get('height')),
- 'tbr': tbr,
- }
-
- host = s['host']
- path = s['path']
- if host.startswith('rtmp'):
- fmt = 'rtmp'
- format_info.update({
- 'url': host,
- 'play_path': path,
- 'ext': 'flv',
- })
- else:
- if s.get('format') == 'm3u8_playlist':
- fmt = 'hls'
- format_info.update({
- 'protocol': 'm3u8_native',
- 'ext': 'mp4',
- })
- else:
- fmt = format_info['ext'] = determine_ext(path)
- format_url = compat_urlparse.urljoin(host, path)
- format_info['url'] = format_url
- format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '')
- formats.append(format_info)
-
- closed_captions = self._html_search_regex(
- r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
- default='[]')
-
- cc_json = self._parse_json(closed_captions, video_id, fatal=False)
subtitles = {}
- if cc_json:
- for closed_caption in cc_json:
- lang = closed_caption['lang']
- if lang not in subtitles:
- subtitles[lang] = []
- subtitles[lang].append({
- 'url': closed_caption['url'],
- 'ext': mimetype2ext(closed_caption['content_type']),
+ for fmt in fmts:
+ media_obj = self._download_json(
+ 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
+ video_id, 'Downloading %s JSON metadata' % fmt,
+ headers=self.geo_verification_headers(), query={
+ 'format': fmt,
+ 'region': country.upper(),
+ })['query']['results']['mediaObj'][0]
+ msg = media_obj.get('status', {}).get('msg')
+
+ for s in media_obj.get('streams', []):
+ host = s.get('host')
+ path = s.get('path')
+ if not host or not path:
+ continue
+ s_url = host + path
+ if s.get('format') == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ s_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ continue
+ tbr = int_or_none(s.get('bitrate'))
+ formats.append({
+ 'url': s_url,
+ 'format_id': fmt + ('-%d' % tbr if tbr else ''),
+ 'width': int_or_none(s.get('width')),
+ 'height': int_or_none(s.get('height')),
+ 'tbr': tbr,
+ 'fps': int_or_none(s.get('framerate')),
})
+ for cc in media_obj.get('closedcaptions', []):
+ cc_url = cc.get('url')
+ if not cc_url or cc_url in urls:
+ continue
+ urls.append(cc_url)
+ subtitles.setdefault(cc.get('lang') or 'en-US', []).append({
+ 'url': cc_url,
+ 'ext': mimetype2ext(cc.get('content_type')),
+ })
+
+ streaming_url = video.get('streaming_url')
+ if streaming_url and not is_live:
+ formats.extend(self._extract_m3u8_formats(
+ streaming_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+
+ if not formats and msg == 'geo restricted':
+ self.raise_geo_restricted()
+
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for thumb in video.get('thumbnails', []):
+ thumb_url = thumb.get('url')
+ if not thumb_url:
+ continue
+ thumbnails.append({
+ 'id': thumb.get('tag'),
+ 'url': thumb.get('url'),
+ 'width': int_or_none(thumb.get('width')),
+ 'height': int_or_none(thumb.get('height')),
+ })
+
+ series_info = video.get('series_info') or {}
+
return {
'id': video_id,
- 'display_id': display_id,
- 'title': unescapeHTML(meta['title']),
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
- 'description': clean_html(meta['description']),
- 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
- 'duration': int_or_none(meta.get('duration')),
+ 'display_id': display_id,
+ 'thumbnails': thumbnails,
+ 'description': clean_html(video.get('description')),
+ 'timestamp': parse_iso8601(video.get('publish_time')),
'subtitles': subtitles,
+ 'duration': int_or_none(video.get('duration')),
+ 'view_count': int_or_none(video.get('view_count')),
+ 'is_live': is_live,
+ 'series': video.get('show_name'),
+ 'season_number': int_or_none(series_info.get('season_number')),
+ 'episode_number': int_or_none(series_info.get('episode_number')),
}
- def _get_info(self, video_id, display_id, webpage):
- region = self._search_regex(
- r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
- webpage, 'region', fatal=False, default='US').upper()
- formats = []
- info = {}
- for fmt in ('webm', 'mp4'):
- query_result = self._download_json(
- 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
- display_id, 'Downloading %s video info' % fmt, query={
- 'protocol': 'http',
- 'region': region,
- 'format': fmt,
- })
- info = self._extract_info(display_id, query_result, webpage)
- formats.extend(info['formats'])
- formats.extend(self._extract_m3u8_formats(
- 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region),
- video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats)
- info['formats'] = formats
- return info
-
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
@@ -523,7 +383,7 @@ class YahooGyaOPlayerIE(InfoExtractor):
'id': video_id,
'title': video['title'],
'url': smuggle_url(
- 'http://players.brightcove.net/4235717419001/default_default/index.html?videoId=' + video['videoId'],
+ 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['videoId'],
{'geo_countries': ['JP']}),
'description': video.get('longDescription'),
'ie_key': BrightcoveNewIE.ie_key(),
diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py
index 1aea95383..46529be05 100644
--- a/youtube_dl/extractor/yandexvideo.py
+++ b/youtube_dl/extractor/yandexvideo.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
int_or_none,
url_or_none,
)
@@ -47,6 +48,10 @@ class YandexVideoIE(InfoExtractor):
# episode, sports
'url': 'https://yandex.ru/?stream_channel=1538487871&stream_id=4132a07f71fb0396be93d74b3477131d',
'only_matching': True,
+ }, {
+ # DASH with DRM
+ 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -59,13 +64,22 @@ class YandexVideoIE(InfoExtractor):
'disable_trackings': 1,
})['content']
- m3u8_url = url_or_none(content.get('content_url')) or url_or_none(
+ content_url = url_or_none(content.get('content_url')) or url_or_none(
content['streams'][0]['url'])
title = content.get('title') or content.get('computed_title')
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
+ ext = determine_ext(content_url)
+
+ if ext == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ content_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ elif ext == 'mpd':
+ formats = self._extract_mpd_formats(
+ content_url, video_id, mpd_id='dash')
+ else:
+ formats = [{'url': content_url}]
+
self._sort_formats(formats)
description = content.get('description')
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b2c714505..b913d07a6 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -31,6 +31,7 @@ from ..utils import (
clean_html,
dict_get,
error_to_compat_str,
+ extract_attributes,
ExtractorError,
float_or_none,
get_element_by_attribute,
@@ -40,7 +41,6 @@ from ..utils import (
orderedSet,
parse_codecs,
parse_duration,
- qualities,
remove_quotes,
remove_start,
smuggle_url,
@@ -69,7 +69,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+ _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
def _set_language(self):
self._set_cookie(
@@ -324,17 +324,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title)
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
- for mobj in re.finditer(self._VIDEO_RE, page):
+ def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
+ for mobj in re.finditer(video_re, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue
video_id = mobj.group('id')
- video_title = unescapeHTML(mobj.group('title'))
+ video_title = unescapeHTML(
+ mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title:
video_title = video_title.strip()
+ if video_title == '► Play all':
+ video_title = None
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
@@ -342,6 +343,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
except ValueError:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
+
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+ self.extract_videos_from_page_impl(
+ self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
@@ -365,7 +372,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
- (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
(?:www\.)?hooktube\.com/|
@@ -376,11 +383,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:(?:www|no)\.)?invidiou\.sh/|
(?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
- (?:www\.)?invidious\.enkirton\.net/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.mastodon\.host/|
+ (?:www\.)?invidious\.nixnet\.xyz/|
+ (?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/|
+ (?:www\.)?yt\.elukerio\.org/|
+ (?:www\.)?yt\.lelux\.fi/|
+ (?:www\.)?kgg2m7yk5aybusll\.onion/|
+ (?:www\.)?qklhadlycap4cnod\.onion/|
+ (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
+ (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
+ (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
+ (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
+ (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
@@ -1207,6 +1224,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
+ 'only_matching': True,
+ },
]
def __init__(self, *args, **kwargs):
@@ -1595,17 +1616,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
- def _extract_annotations(self, video_id):
- return self._download_webpage(
- 'https://www.youtube.com/annotations_invideo', video_id,
- note='Downloading annotations',
- errnote='Unable to download video annotations', fatal=False,
- query={
- 'features': 1,
- 'legacy': 1,
- 'video_id': video_id,
- })
-
@staticmethod
def _extract_chapters(description, duration):
if not description:
@@ -1700,6 +1710,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_token(v_info):
return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
+ def extract_player_response(player_response, video_id):
+ pl_response = str_or_none(player_response)
+ if not pl_response:
+ return
+ pl_response = self._parse_json(pl_response, video_id, fatal=False)
+ if isinstance(pl_response, dict):
+ add_dash_mpd_pr(pl_response)
+ return pl_response
+
player_response = {}
# Get video info
@@ -1722,7 +1741,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ pl_response = video_info.get('player_response', [None])[0]
+ player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(video_info)
+ view_count = extract_view_count(video_info)
else:
age_gate = False
video_info = None
@@ -1745,11 +1767,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = True
sts = ytplayer_config.get('sts')
if not player_response:
- pl_response = str_or_none(args.get('player_response'))
- if pl_response:
- pl_response = self._parse_json(pl_response, video_id, fatal=False)
- if isinstance(pl_response, dict):
- player_response = pl_response
+ player_response = extract_player_response(args.get('player_response'), video_id)
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
@@ -1781,9 +1799,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
get_video_info = compat_parse_qs(video_info_webpage)
if not player_response:
pl_response = get_video_info.get('player_response', [None])[0]
- if isinstance(pl_response, dict):
- player_response = pl_response
- add_dash_mpd_pr(player_response)
+ player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
@@ -1806,9 +1822,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
def extract_unavailable_message():
- return self._html_search_regex(
- r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
- video_webpage, 'unavailable message', default=None)
+ messages = []
+ for tag, kind in (('h1', 'message'), ('div', 'submessage')):
+ msg = self._html_search_regex(
+ r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
+ video_webpage, 'unavailable %s' % kind, default=None)
+ if msg:
+ messages.append(msg)
+ if messages:
+ return '\n'.join(messages)
if not video_info:
unavailable_message = extract_unavailable_message()
@@ -1820,16 +1842,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_details = try_get(
player_response, lambda x: x['videoDetails'], dict) or {}
- # title
- if 'title' in video_info:
- video_title = video_info['title'][0]
- elif 'title' in player_response:
- video_title = video_details['title']
- else:
+ video_title = video_info.get('title', [None])[0] or video_details.get('title')
+ if not video_title:
self._downloader.report_warning('Unable to extract video title')
video_title = '_'
- # description
description_original = video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
@@ -1854,11 +1871,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
''', replace_url, video_description)
video_description = clean_html(video_description)
else:
- fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
- if fd_mobj:
- video_description = unescapeHTML(fd_mobj.group(1))
- else:
- video_description = ''
- elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
+ elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
+ formats = []
formats_spec = {}
fmt_list = video_info.get('fmt_list', [''])[0]
if fmt_list:
@@ -1932,91 +1949,104 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
}
- q = qualities(['small', 'medium', 'hd720'])
- streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
- if streaming_formats:
- for fmt in streaming_formats:
- itag = str_or_none(fmt.get('itag'))
- if not itag:
- continue
- quality = fmt.get('quality')
- quality_label = fmt.get('qualityLabel') or quality
- formats_spec[itag] = {
- 'asr': int_or_none(fmt.get('audioSampleRate')),
- 'filesize': int_or_none(fmt.get('contentLength')),
- 'format_note': quality_label,
- 'fps': int_or_none(fmt.get('fps')),
- 'height': int_or_none(fmt.get('height')),
- 'quality': q(quality),
- # bitrate for itag 43 is always 2147483647
- 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
- 'width': int_or_none(fmt.get('width')),
- }
- formats = []
- for url_data_str in encoded_url_map.split(','):
- url_data = compat_parse_qs(url_data_str)
- if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
+ for fmt in streaming_formats:
+ itag = str_or_none(fmt.get('itag'))
+ if not itag:
continue
+ quality = fmt.get('quality')
+ quality_label = fmt.get('qualityLabel') or quality
+ formats_spec[itag] = {
+ 'asr': int_or_none(fmt.get('audioSampleRate')),
+ 'filesize': int_or_none(fmt.get('contentLength')),
+ 'format_note': quality_label,
+ 'fps': int_or_none(fmt.get('fps')),
+ 'height': int_or_none(fmt.get('height')),
+ # bitrate for itag 43 is always 2147483647
+ 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
+ 'width': int_or_none(fmt.get('width')),
+ }
+
+ for fmt in streaming_formats:
+ if fmt.get('drm_families'):
+ continue
+ url = url_or_none(fmt.get('url'))
+
+ if not url:
+ cipher = fmt.get('cipher')
+ if not cipher:
+ continue
+ url_data = compat_parse_qs(cipher)
+ url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
+ if not url:
+ continue
+ else:
+ cipher = None
+ url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+
stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
# Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3:
continue
- format_id = url_data['itag'][0]
- url = url_data['url'][0]
- if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
- ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
- jsplayer_url_json = self._search_regex(
- ASSETS_RE,
- embed_webpage if age_gate else video_webpage,
- 'JS player URL (1)', default=None)
- if not jsplayer_url_json and not age_gate:
- # We need the embed website after all
- if embed_webpage is None:
- embed_url = proto + '://www.youtube.com/embed/%s' % video_id
- embed_webpage = self._download_webpage(
- embed_url, video_id, 'Downloading embed webpage')
+ format_id = fmt.get('itag') or url_data['itag'][0]
+ if not format_id:
+ continue
+ format_id = compat_str(format_id)
+
+ if cipher:
+ if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
+ ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
- ASSETS_RE, embed_webpage, 'JS player URL')
+ ASSETS_RE,
+ embed_webpage if age_gate else video_webpage,
+ 'JS player URL (1)', default=None)
+ if not jsplayer_url_json and not age_gate:
+ # We need the embed website after all
+ if embed_webpage is None:
+ embed_url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(
+ embed_url, video_id, 'Downloading embed webpage')
+ jsplayer_url_json = self._search_regex(
+ ASSETS_RE, embed_webpage, 'JS player URL')
- player_url = json.loads(jsplayer_url_json)
- if player_url is None:
- player_url_json = self._search_regex(
- r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
- video_webpage, 'age gate player URL')
- player_url = json.loads(player_url_json)
-
- if 'sig' in url_data:
- url += '&signature=' + url_data['sig'][0]
- elif 's' in url_data:
- encrypted_sig = url_data['s'][0]
-
- if self._downloader.params.get('verbose'):
+ player_url = json.loads(jsplayer_url_json)
if player_url is None:
- player_version = 'unknown'
- player_desc = 'unknown'
- else:
- if player_url.endswith('swf'):
- player_version = self._search_regex(
- r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
- 'flash player', fatal=False)
- player_desc = 'flash player %s' % player_version
+ player_url_json = self._search_regex(
+ r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
+ video_webpage, 'age gate player URL')
+ player_url = json.loads(player_url_json)
+
+ if 'sig' in url_data:
+ url += '&signature=' + url_data['sig'][0]
+ elif 's' in url_data:
+ encrypted_sig = url_data['s'][0]
+
+ if self._downloader.params.get('verbose'):
+ if player_url is None:
+ player_version = 'unknown'
+ player_desc = 'unknown'
else:
- player_version = self._search_regex(
- [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
- r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
- player_url,
- 'html5 player', fatal=False)
- player_desc = 'html5 player %s' % player_version
+ if player_url.endswith('swf'):
+ player_version = self._search_regex(
+ r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
+ 'flash player', fatal=False)
+ player_desc = 'flash player %s' % player_version
+ else:
+ player_version = self._search_regex(
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
+ player_url,
+ 'html5 player', fatal=False)
+ player_desc = 'html5 player %s' % player_version
- parts_sizes = self._signature_cache_id(encrypted_sig)
- self.to_screen('{%s} signature length %s, %s' %
- (format_id, parts_sizes, player_desc))
+ parts_sizes = self._signature_cache_id(encrypted_sig)
+ self.to_screen('{%s} signature length %s, %s' %
+ (format_id, parts_sizes, player_desc))
- signature = self._decrypt_signature(
- encrypted_sig, video_id, player_url, age_gate)
- sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
- url += '&%s=%s' % (sp, signature)
+ signature = self._decrypt_signature(
+ encrypted_sig, video_id, player_url, age_gate)
+ sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
+ url += '&%s=%s' % (sp, signature)
if 'ratebypass' not in url:
url += '&ratebypass=yes'
@@ -2036,24 +2066,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+ if width is None:
+ width = int_or_none(fmt.get('width'))
+ if height is None:
+ height = int_or_none(fmt.get('height'))
+
filesize = int_or_none(url_data.get(
'clen', [None])[0]) or _extract_filesize(url)
- quality = url_data.get('quality', [None])[0]
+ quality = url_data.get('quality', [None])[0] or fmt.get('quality')
+ quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
+
+ tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
+ or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
+ fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
more_fields = {
'filesize': filesize,
- 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
+ 'tbr': tbr,
'width': width,
'height': height,
- 'fps': int_or_none(url_data.get('fps', [None])[0]),
- 'format_note': url_data.get('quality_label', [None])[0] or quality,
- 'quality': q(quality),
+ 'fps': fps,
+ 'format_note': quality_label or quality,
}
for key, value in more_fields.items():
if value:
dct[key] = value
- type_ = url_data.get('type', [None])[0]
+ type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
if type_:
type_split = type_.split(';')
kind_ext = type_split[0].split('/')
@@ -2101,9 +2140,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format)
else:
- error_message = clean_html(video_info.get('reason', [None])[0])
+ error_message = extract_unavailable_message()
if not error_message:
- error_message = extract_unavailable_message()
+ error_message = clean_html(try_get(
+ player_response, lambda x: x['playabilityStatus']['reason'],
+ compat_str))
+ if not error_message:
+ error_message = clean_html(
+ try_get(video_info, lambda x: x['reason'][0], compat_str))
if error_message:
raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
@@ -2274,7 +2318,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
- video_annotations = self._extract_annotations(video_id)
+ xsrf_token = self._search_regex(
+                r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
+ video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
+ invideo_url = try_get(
+ player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
+ if xsrf_token and invideo_url:
+ xsrf_field_name = self._search_regex(
+                    r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
+ video_webpage, 'xsrf field name',
+ group='xsrf_field_name', default='session_token')
+ video_annotations = self._download_webpage(
+ self._proto_relative_url(invideo_url),
+ video_id, note='Downloading annotations',
+ errnote='Unable to download video annotations', fatal=False,
+ data=urlencode_postdata({xsrf_field_name: xsrf_token}))
chapters = self._extract_chapters(description_original, video_duration)
@@ -2411,7 +2469,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(?:\w+\.)?
(?:
(?:
- youtube\.com|
+ youtube(?:kids)?\.com|
invidio\.us
)
/
@@ -2423,7 +2481,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
)
(
- (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
+ (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
@@ -2432,7 +2490,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
+    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
+    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -2592,11 +2651,42 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
}, {
'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ 'only_matching': True,
}]
def _real_initialize(self):
self._login()
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+
+ for item in re.findall(
+ r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
+ attrs = extract_attributes(item)
+ video_id = attrs['data-video-id']
+ video_title = unescapeHTML(attrs.get('data-title'))
+ if video_title:
+ video_title = video_title.strip()
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+
+ # Fallback with old _VIDEO_RE
+ self.extract_videos_from_page_impl(
+ self._VIDEO_RE, page, ids_in_page, titles_in_page)
+
+ # Relaxed fallbacks
+ self.extract_videos_from_page_impl(
+            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
+ ids_in_page, titles_in_page)
+ self.extract_videos_from_page_impl(
+            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
+ ids_in_page, titles_in_page)
+
+ return zip(ids_in_page, titles_in_page)
+
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@@ -2659,7 +2749,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
page, 'title', default=None)
     _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
- uploader = self._search_regex(
+ uploader = self._html_search_regex(
r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
page, 'uploader', default=None)
mobj = re.search(
@@ -2734,7 +2824,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels'
-    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
     _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel'
@@ -2762,6 +2852,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
}, {
'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index afa3f6c47..145c123a4 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -41,6 +41,7 @@ class ZDFBaseIE(InfoExtractor):
class ZDFIE(ZDFBaseIE):
     _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html'
_QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh')
+ _GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 70416c25e..fd3f921a8 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -393,7 +393,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
sub_ext = sub_info['ext']
if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
sub_langs.append(lang)
- sub_filenames.append(subtitles_filename(filename, lang, sub_ext))
+ sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext))
else:
if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt':
webm_vtt_warn = True
@@ -606,9 +606,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
self._downloader.to_screen(
'[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext)
continue
- old_file = subtitles_filename(filename, lang, ext)
+ old_file = subtitles_filename(filename, lang, ext, info.get('ext'))
sub_filenames.append(old_file)
- new_file = subtitles_filename(filename, lang, new_ext)
+ new_file = subtitles_filename(filename, lang, new_ext, info.get('ext'))
if ext in ('dfxp', 'ttml', 'tt'):
self._downloader.report_warning(
@@ -616,7 +616,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
'which results in style information loss')
dfxp_file = old_file
- srt_file = subtitles_filename(filename, lang, 'srt')
+ srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext'))
with open(dfxp_file, 'rb') as f:
srt_data = dfxp2srt(f.read())
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 798757241..328f037a8 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1718,13 +1718,16 @@ DATE_FORMATS = (
'%B %d %Y',
'%B %dst %Y',
'%B %dnd %Y',
+ '%B %drd %Y',
'%B %dth %Y',
'%b %d %Y',
'%b %dst %Y',
'%b %dnd %Y',
+ '%b %drd %Y',
'%b %dth %Y',
'%b %dst %Y %I:%M',
'%b %dnd %Y %I:%M',
+ '%b %drd %Y %I:%M',
'%b %dth %Y %I:%M',
'%Y %m %d',
'%Y-%m-%d',
@@ -2906,8 +2909,8 @@ def determine_ext(url, default_ext='unknown_video'):
return default_ext
-def subtitles_filename(filename, sub_lang, sub_format):
- return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
+def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
+ return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
def date_from_str(date_str):
@@ -3516,8 +3519,8 @@ def str_or_none(v, default=None):
def str_to_int(int_str):
""" A more relaxed version of int_or_none """
- if int_str is None:
- return None
+ if not isinstance(int_str, compat_str):
+ return int_str
int_str = re.sub(r'[,\.\+]', '', int_str)
return int(int_str)
@@ -4979,7 +4982,7 @@ class ISO3166Utils(object):
class GeoUtils(object):
# Major IPv4 address blocks per country
_country_ip_map = {
- 'AD': '85.94.160.0/19',
+ 'AD': '46.172.224.0/19',
'AE': '94.200.0.0/13',
'AF': '149.54.0.0/17',
'AG': '209.59.64.0/18',
@@ -4987,28 +4990,30 @@ class GeoUtils(object):
'AL': '46.99.0.0/16',
'AM': '46.70.0.0/15',
'AO': '105.168.0.0/13',
- 'AP': '159.117.192.0/21',
+ 'AP': '182.50.184.0/21',
+ 'AQ': '23.154.160.0/24',
'AR': '181.0.0.0/12',
'AS': '202.70.112.0/20',
- 'AT': '84.112.0.0/13',
+ 'AT': '77.116.0.0/14',
'AU': '1.128.0.0/11',
'AW': '181.41.0.0/18',
- 'AZ': '5.191.0.0/16',
+ 'AX': '185.217.4.0/22',
+ 'AZ': '5.197.0.0/16',
'BA': '31.176.128.0/17',
'BB': '65.48.128.0/17',
'BD': '114.130.0.0/16',
'BE': '57.0.0.0/8',
- 'BF': '129.45.128.0/17',
+ 'BF': '102.178.0.0/15',
'BG': '95.42.0.0/15',
'BH': '37.131.0.0/17',
'BI': '154.117.192.0/18',
'BJ': '137.255.0.0/16',
- 'BL': '192.131.134.0/24',
+ 'BL': '185.212.72.0/23',
'BM': '196.12.64.0/18',
'BN': '156.31.0.0/16',
'BO': '161.56.0.0/16',
'BQ': '161.0.80.0/20',
- 'BR': '152.240.0.0/12',
+ 'BR': '191.128.0.0/12',
'BS': '24.51.64.0/18',
'BT': '119.2.96.0/19',
'BW': '168.167.0.0/16',
@@ -5016,20 +5021,20 @@ class GeoUtils(object):
'BZ': '179.42.192.0/18',
'CA': '99.224.0.0/11',
'CD': '41.243.0.0/16',
- 'CF': '196.32.200.0/21',
- 'CG': '197.214.128.0/17',
+ 'CF': '197.242.176.0/21',
+ 'CG': '160.113.0.0/16',
'CH': '85.0.0.0/13',
- 'CI': '154.232.0.0/14',
+ 'CI': '102.136.0.0/14',
'CK': '202.65.32.0/19',
'CL': '152.172.0.0/14',
- 'CM': '165.210.0.0/15',
+ 'CM': '102.244.0.0/14',
'CN': '36.128.0.0/10',
'CO': '181.240.0.0/12',
'CR': '201.192.0.0/12',
'CU': '152.206.0.0/15',
'CV': '165.90.96.0/19',
'CW': '190.88.128.0/17',
- 'CY': '46.198.0.0/15',
+ 'CY': '31.153.0.0/16',
'CZ': '88.100.0.0/14',
'DE': '53.0.0.0/8',
'DJ': '197.241.0.0/17',
@@ -5046,6 +5051,7 @@ class GeoUtils(object):
'EU': '2.16.0.0/13',
'FI': '91.152.0.0/13',
'FJ': '144.120.0.0/16',
+ 'FK': '80.73.208.0/21',
'FM': '119.252.112.0/20',
'FO': '88.85.32.0/19',
'FR': '90.0.0.0/9',
@@ -5055,8 +5061,8 @@ class GeoUtils(object):
'GE': '31.146.0.0/16',
'GF': '161.22.64.0/18',
'GG': '62.68.160.0/19',
- 'GH': '45.208.0.0/14',
- 'GI': '85.115.128.0/19',
+ 'GH': '154.160.0.0/12',
+ 'GI': '95.164.0.0/16',
'GL': '88.83.0.0/19',
'GM': '160.182.0.0/15',
'GN': '197.149.192.0/18',
@@ -5085,13 +5091,13 @@ class GeoUtils(object):
'JE': '87.244.64.0/18',
'JM': '72.27.0.0/17',
'JO': '176.29.0.0/16',
- 'JP': '126.0.0.0/8',
+ 'JP': '133.0.0.0/8',
'KE': '105.48.0.0/12',
'KG': '158.181.128.0/17',
'KH': '36.37.128.0/17',
'KI': '103.25.140.0/22',
'KM': '197.255.224.0/20',
- 'KN': '198.32.32.0/19',
+ 'KN': '198.167.192.0/19',
'KP': '175.45.176.0/22',
'KR': '175.192.0.0/10',
'KW': '37.36.0.0/14',
@@ -5099,10 +5105,10 @@ class GeoUtils(object):
'KZ': '2.72.0.0/13',
'LA': '115.84.64.0/18',
'LB': '178.135.0.0/16',
- 'LC': '192.147.231.0/24',
+ 'LC': '24.92.144.0/20',
'LI': '82.117.0.0/19',
'LK': '112.134.0.0/15',
- 'LR': '41.86.0.0/19',
+ 'LR': '102.183.0.0/16',
'LS': '129.232.0.0/17',
'LT': '78.56.0.0/13',
'LU': '188.42.0.0/16',
@@ -5127,7 +5133,7 @@ class GeoUtils(object):
'MT': '46.11.0.0/16',
'MU': '105.16.0.0/12',
'MV': '27.114.128.0/18',
- 'MW': '105.234.0.0/16',
+ 'MW': '102.70.0.0/15',
'MX': '187.192.0.0/11',
'MY': '175.136.0.0/13',
'MZ': '197.218.0.0/15',
@@ -5158,23 +5164,23 @@ class GeoUtils(object):
'PW': '202.124.224.0/20',
'PY': '181.120.0.0/14',
'QA': '37.210.0.0/15',
- 'RE': '139.26.0.0/16',
+ 'RE': '102.35.0.0/16',
'RO': '79.112.0.0/13',
- 'RS': '178.220.0.0/14',
+ 'RS': '93.86.0.0/15',
'RU': '5.136.0.0/13',
- 'RW': '105.178.0.0/15',
+ 'RW': '41.186.0.0/16',
'SA': '188.48.0.0/13',
'SB': '202.1.160.0/19',
'SC': '154.192.0.0/11',
- 'SD': '154.96.0.0/13',
+ 'SD': '102.120.0.0/13',
'SE': '78.64.0.0/12',
- 'SG': '152.56.0.0/14',
+ 'SG': '8.128.0.0/10',
'SI': '188.196.0.0/14',
'SK': '78.98.0.0/15',
- 'SL': '197.215.0.0/17',
+ 'SL': '102.143.0.0/17',
'SM': '89.186.32.0/19',
'SN': '41.82.0.0/15',
- 'SO': '197.220.64.0/19',
+ 'SO': '154.115.192.0/18',
'SR': '186.179.128.0/17',
'SS': '105.235.208.0/21',
'ST': '197.159.160.0/19',
@@ -5197,15 +5203,15 @@ class GeoUtils(object):
'TV': '202.2.96.0/19',
'TW': '120.96.0.0/11',
'TZ': '156.156.0.0/14',
- 'UA': '93.72.0.0/13',
- 'UG': '154.224.0.0/13',
- 'US': '3.0.0.0/8',
+ 'UA': '37.52.0.0/14',
+ 'UG': '102.80.0.0/13',
+ 'US': '6.0.0.0/8',
'UY': '167.56.0.0/13',
- 'UZ': '82.215.64.0/18',
+ 'UZ': '84.54.64.0/18',
'VA': '212.77.0.0/19',
- 'VC': '24.92.144.0/20',
+ 'VC': '207.191.240.0/21',
'VE': '186.88.0.0/13',
- 'VG': '172.103.64.0/18',
+ 'VG': '66.81.192.0/20',
'VI': '146.226.0.0/16',
'VN': '14.160.0.0/11',
'VU': '202.80.32.0/20',
@@ -5214,8 +5220,8 @@ class GeoUtils(object):
'YE': '134.35.0.0/16',
'YT': '41.242.116.0/22',
'ZA': '41.0.0.0/11',
- 'ZM': '165.56.0.0/13',
- 'ZW': '41.85.192.0/19',
+ 'ZM': '102.144.0.0/13',
+ 'ZW': '102.177.192.0/18',
}
@classmethod
@@ -5377,6 +5383,19 @@ def decode_packed_codes(code):
obfucasted_code)
+def caesar(s, alphabet, shift):
+ if shift == 0:
+ return s
+ l = len(alphabet)
+ return ''.join(
+ alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
+ for c in s)
+
+
+def rot47(s):
+ return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
+
+
def parse_m3u8_attributes(attrib):
info = {}
     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b0f5a6b47..1227abc0a 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2019.07.16'
+__version__ = '2019.11.28'