mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-28 01:21:01 +01:00
[youtube] Fix history, trending and mix playlists (#136)
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> Co-authored-by: Matthew <colethedj@protonmail.com>
This commit is contained in:
parent
2c736b4f61
commit
cd7c66cf01
@ -37,7 +37,6 @@ def test_youtube_playlist_matching(self):
|
|||||||
assertPlaylist('PL63F0C78739B09958')
|
assertPlaylist('PL63F0C78739B09958')
|
||||||
assertTab('https://www.youtube.com/AsapSCIENCE')
|
assertTab('https://www.youtube.com/AsapSCIENCE')
|
||||||
assertTab('https://www.youtube.com/embedded')
|
assertTab('https://www.youtube.com/embedded')
|
||||||
assertTab('https://www.youtube.com/feed') # Own channel's home page
|
|
||||||
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
|
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
|
||||||
assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
|
assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
|
||||||
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
|
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
|
||||||
|
@ -59,9 +59,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
|
|||||||
_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
|
_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
|
||||||
|
|
||||||
_RESERVED_NAMES = (
|
_RESERVED_NAMES = (
|
||||||
r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|hashtag|'
|
r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|'
|
||||||
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
|
r'movies|results|shared|hashtag|trending|feed|feeds|'
|
||||||
r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')
|
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
|
||||||
|
|
||||||
_NETRC_MACHINE = 'youtube'
|
_NETRC_MACHINE = 'youtube'
|
||||||
# If True it will raise an error if no login info is provided
|
# If True it will raise an error if no login info is provided
|
||||||
@ -2520,17 +2520,22 @@ def _extract_channel_id(self, webpage):
|
|||||||
channel_url, 'channel id')
|
channel_url, 'channel id')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_grid_item_renderer(item):
|
def _extract_basic_item_renderer(item):
|
||||||
for item_kind in ('Playlist', 'Video', 'Channel'):
|
# Modified from _extract_grid_item_renderer
|
||||||
renderer = item.get('grid%sRenderer' % item_kind)
|
known_renderers = (
|
||||||
if renderer:
|
'playlistRenderer', 'videoRenderer', 'channelRenderer'
|
||||||
|
'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
|
||||||
|
)
|
||||||
|
for key, renderer in item.items():
|
||||||
|
if key not in known_renderers:
|
||||||
|
continue
|
||||||
return renderer
|
return renderer
|
||||||
|
|
||||||
def _grid_entries(self, grid_renderer):
|
def _grid_entries(self, grid_renderer):
|
||||||
for item in grid_renderer['items']:
|
for item in grid_renderer['items']:
|
||||||
if not isinstance(item, dict):
|
if not isinstance(item, dict):
|
||||||
continue
|
continue
|
||||||
renderer = self._extract_grid_item_renderer(item)
|
renderer = self._extract_basic_item_renderer(item)
|
||||||
if not isinstance(renderer, dict):
|
if not isinstance(renderer, dict):
|
||||||
continue
|
continue
|
||||||
title = try_get(
|
title = try_get(
|
||||||
@ -2559,7 +2564,7 @@ def _shelf_entries_from_content(self, shelf_renderer):
|
|||||||
content = shelf_renderer.get('content')
|
content = shelf_renderer.get('content')
|
||||||
if not isinstance(content, dict):
|
if not isinstance(content, dict):
|
||||||
return
|
return
|
||||||
renderer = content.get('gridRenderer')
|
renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
|
||||||
if renderer:
|
if renderer:
|
||||||
# TODO: add support for nested playlists so each shelf is processed
|
# TODO: add support for nested playlists so each shelf is processed
|
||||||
# as separate playlist
|
# as separate playlist
|
||||||
@ -2601,20 +2606,6 @@ def _playlist_entries(self, video_list_renderer):
|
|||||||
continue
|
continue
|
||||||
yield self._extract_video(renderer)
|
yield self._extract_video(renderer)
|
||||||
|
|
||||||
r""" # Not needed in the new implementation
|
|
||||||
def _itemSection_entries(self, item_sect_renderer):
|
|
||||||
for content in item_sect_renderer['contents']:
|
|
||||||
if not isinstance(content, dict):
|
|
||||||
continue
|
|
||||||
renderer = content.get('videoRenderer', {})
|
|
||||||
if not isinstance(renderer, dict):
|
|
||||||
continue
|
|
||||||
video_id = renderer.get('videoId')
|
|
||||||
if not video_id:
|
|
||||||
continue
|
|
||||||
yield self._extract_video(renderer)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _rich_entries(self, rich_grid_renderer):
|
def _rich_entries(self, rich_grid_renderer):
|
||||||
renderer = try_get(
|
renderer = try_get(
|
||||||
rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
|
rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
|
||||||
@ -2837,7 +2828,7 @@ def extract_entries(parent_renderer): # this needs to called again for continua
|
|||||||
'gridPlaylistRenderer': (self._grid_entries, 'items'),
|
'gridPlaylistRenderer': (self._grid_entries, 'items'),
|
||||||
'gridVideoRenderer': (self._grid_entries, 'items'),
|
'gridVideoRenderer': (self._grid_entries, 'items'),
|
||||||
'playlistVideoRenderer': (self._playlist_entries, 'contents'),
|
'playlistVideoRenderer': (self._playlist_entries, 'contents'),
|
||||||
'itemSectionRenderer': (self._playlist_entries, 'contents'),
|
'itemSectionRenderer': (extract_entries, 'contents'), # for feeds
|
||||||
'richItemRenderer': (extract_entries, 'contents'), # for hashtag
|
'richItemRenderer': (extract_entries, 'contents'), # for hashtag
|
||||||
}
|
}
|
||||||
continuation_items = try_get(
|
continuation_items = try_get(
|
||||||
@ -2955,13 +2946,30 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
|
|||||||
self._entries(selected_tab, identity_token, playlist_id),
|
self._entries(selected_tab, identity_token, playlist_id),
|
||||||
**metadata)
|
**metadata)
|
||||||
|
|
||||||
|
def _extract_mix_playlist(self, playlist, playlist_id):
|
||||||
|
page_num = 0
|
||||||
|
while True:
|
||||||
|
videos = list(self._playlist_entries(playlist))
|
||||||
|
if not videos:
|
||||||
|
return
|
||||||
|
video_count = len(videos)
|
||||||
|
start = min(video_count - 24, 26) if video_count > 25 else 0
|
||||||
|
for item in videos[start:]:
|
||||||
|
yield item
|
||||||
|
|
||||||
|
page_num += 1
|
||||||
|
_, data = self._extract_webpage(
|
||||||
|
'https://www.youtube.com/watch?list=%s&v=%s' % (playlist_id, videos[-1]['id']),
|
||||||
|
'%s page %d' % (playlist_id, page_num))
|
||||||
|
playlist = try_get(
|
||||||
|
data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
|
||||||
|
|
||||||
def _extract_from_playlist(self, item_id, url, data, playlist):
|
def _extract_from_playlist(self, item_id, url, data, playlist):
|
||||||
title = playlist.get('title') or try_get(
|
title = playlist.get('title') or try_get(
|
||||||
data, lambda x: x['titleText']['simpleText'], compat_str)
|
data, lambda x: x['titleText']['simpleText'], compat_str)
|
||||||
playlist_id = playlist.get('playlistId') or item_id
|
playlist_id = playlist.get('playlistId') or item_id
|
||||||
# Inline playlist rendition continuation does not always work
|
|
||||||
# at Youtube side, so delegating regular tab-based playlist URL
|
# Delegating everything except mix playlists to regular tab-based playlist URL
|
||||||
# processing whenever possible.
|
|
||||||
playlist_url = urljoin(url, try_get(
|
playlist_url = urljoin(url, try_get(
|
||||||
playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
|
playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
|
||||||
compat_str))
|
compat_str))
|
||||||
@ -2969,9 +2977,10 @@ def _extract_from_playlist(self, item_id, url, data, playlist):
|
|||||||
return self.url_result(
|
return self.url_result(
|
||||||
playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
|
playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
|
||||||
video_title=title)
|
video_title=title)
|
||||||
|
|
||||||
return self.playlist_result(
|
return self.playlist_result(
|
||||||
self._playlist_entries(playlist), playlist_id=playlist_id,
|
self._extract_mix_playlist(playlist, playlist_id),
|
||||||
playlist_title=title)
|
playlist_id=playlist_id, playlist_title=title)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_alerts(data):
|
def _extract_alerts(data):
|
||||||
@ -3001,35 +3010,7 @@ def _extract_identity_token(self, webpage, item_id):
|
|||||||
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
|
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
|
||||||
'identity token', default=None)
|
'identity token', default=None)
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _extract_webpage(self, url, item_id):
|
||||||
item_id = self._match_id(url)
|
|
||||||
url = compat_urlparse.urlunparse(
|
|
||||||
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
|
|
||||||
is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
|
|
||||||
if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
|
|
||||||
self._downloader.report_warning(
|
|
||||||
'A channel/user page was given. All the channel\'s videos will be downloaded. '
|
|
||||||
'To download only the videos in the home page, add a "/featured" to the URL')
|
|
||||||
url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')
|
|
||||||
|
|
||||||
# Handle both video/playlist URLs
|
|
||||||
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
|
|
||||||
video_id = qs.get('v', [None])[0]
|
|
||||||
playlist_id = qs.get('list', [None])[0]
|
|
||||||
|
|
||||||
if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
|
|
||||||
if playlist_id:
|
|
||||||
self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
|
|
||||||
url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
|
|
||||||
# return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
|
|
||||||
else:
|
|
||||||
raise ExtractorError('Unable to recognize tab page')
|
|
||||||
if video_id and playlist_id:
|
|
||||||
if self._downloader.params.get('noplaylist'):
|
|
||||||
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
|
|
||||||
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
|
|
||||||
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
|
|
||||||
|
|
||||||
retries = self._downloader.params.get('extractor_retries', 3)
|
retries = self._downloader.params.get('extractor_retries', 3)
|
||||||
count = -1
|
count = -1
|
||||||
last_error = 'Incomplete yt initial data recieved'
|
last_error = 'Incomplete yt initial data recieved'
|
||||||
@ -3041,8 +3022,7 @@ def _real_extract(self, url):
|
|||||||
self.report_warning('%s. Retrying ...' % last_error)
|
self.report_warning('%s. Retrying ...' % last_error)
|
||||||
webpage = self._download_webpage(
|
webpage = self._download_webpage(
|
||||||
url, item_id,
|
url, item_id,
|
||||||
'Downloading webpage%s' % ' (retry #%d)' % count if count else '')
|
'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
|
||||||
identity_token = self._extract_identity_token(webpage, item_id)
|
|
||||||
data = self._extract_yt_initial_data(item_id, webpage)
|
data = self._extract_yt_initial_data(item_id, webpage)
|
||||||
err_msg = None
|
err_msg = None
|
||||||
for alert_type, alert_message in self._extract_alerts(data):
|
for alert_type, alert_message in self._extract_alerts(data):
|
||||||
@ -3058,23 +3038,61 @@ def _real_extract(self, url):
|
|||||||
break
|
break
|
||||||
if count >= retries:
|
if count >= retries:
|
||||||
self._downloader.report_error(last_error)
|
self._downloader.report_error(last_error)
|
||||||
|
return webpage, data
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
item_id = self._match_id(url)
|
||||||
|
url = compat_urlparse.urlunparse(
|
||||||
|
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
|
||||||
|
|
||||||
|
# This is not matched in a channel page with a tab selected
|
||||||
|
mobj = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
|
||||||
|
mobj = mobj.groupdict() if mobj else {}
|
||||||
|
if mobj and not mobj.get('not_channel'):
|
||||||
|
self._downloader.report_warning(
|
||||||
|
'A channel/user page was given. All the channel\'s videos will be downloaded. '
|
||||||
|
'To download only the videos in the home page, add a "/featured" to the URL')
|
||||||
|
url = '%s/videos%s' % (mobj.get('pre'), mobj.get('post') or '')
|
||||||
|
|
||||||
|
# Handle both video/playlist URLs
|
||||||
|
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
|
||||||
|
video_id = qs.get('v', [None])[0]
|
||||||
|
playlist_id = qs.get('list', [None])[0]
|
||||||
|
|
||||||
|
if not video_id and (mobj.get('not_channel') or '').startswith('watch'):
|
||||||
|
if not playlist_id:
|
||||||
|
# If there is neither video or playlist ids,
|
||||||
|
# youtube redirects to home page, which is undesirable
|
||||||
|
raise ExtractorError('Unable to recognize tab page')
|
||||||
|
self._downloader.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
|
||||||
|
url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
|
||||||
|
|
||||||
|
if video_id and playlist_id:
|
||||||
|
if self._downloader.params.get('noplaylist'):
|
||||||
|
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
|
||||||
|
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
|
||||||
|
self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
|
||||||
|
|
||||||
|
webpage, data = self._extract_webpage(url, item_id)
|
||||||
|
|
||||||
tabs = try_get(
|
tabs = try_get(
|
||||||
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
|
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
|
||||||
if tabs:
|
if tabs:
|
||||||
|
identity_token = self._extract_identity_token(webpage, item_id)
|
||||||
return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
|
return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
|
||||||
|
|
||||||
playlist = try_get(
|
playlist = try_get(
|
||||||
data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
|
data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
|
||||||
if playlist:
|
if playlist:
|
||||||
return self._extract_from_playlist(item_id, url, data, playlist)
|
return self._extract_from_playlist(item_id, url, data, playlist)
|
||||||
# Fallback to video extraction if no playlist alike page is recognized.
|
|
||||||
# First check for the current video then try the v attribute of URL query.
|
|
||||||
video_id = try_get(
|
video_id = try_get(
|
||||||
data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
|
data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
|
||||||
compat_str) or video_id
|
compat_str) or video_id
|
||||||
if video_id:
|
if video_id:
|
||||||
|
self._downloader.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
|
||||||
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
|
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
|
||||||
# Failed to recognize
|
|
||||||
raise ExtractorError('Unable to recognize tab page')
|
raise ExtractorError('Unable to recognize tab page')
|
||||||
|
|
||||||
|
|
||||||
@ -3338,7 +3356,6 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
|
|||||||
Subclasses must define the _FEED_NAME property.
|
Subclasses must define the _FEED_NAME property.
|
||||||
"""
|
"""
|
||||||
_LOGIN_REQUIRED = True
|
_LOGIN_REQUIRED = True
|
||||||
# _MAX_PAGES = 5
|
|
||||||
_TESTS = []
|
_TESTS = []
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
Loading…
Reference in New Issue
Block a user