From 78caa52aea356a60b6efbe92484d6bdea1fe7432 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 13 Sep 2014 07:51:06 +0200 Subject: [PATCH] [youtube] Modernize --- youtube_dl/extractor/youtube.py | 247 ++++++++++++++++---------------- 1 file changed, 125 insertions(+), 122 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 150778592..b54c69122 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import unicode_literals + + import itertools import json import os.path @@ -69,29 +72,29 @@ def _login(self): return galx = self._search_regex(r'(?s)= 0 else u':' - steps = u'' if step == 1 else (u':%d' % step) - return u's[%s%s%s]' % (starts, ends, steps) + starts = '' if start == 0 else str(start) + ends = (u':%d' % (end+step)) if end + step >= 0 else ':' + steps = '' if step == 1 else (u':%d' % step) + return 's[%s%s%s]' % (starts, ends, steps) step = None start = '(Never used)' # Quelch pyflakes warnings - start will be @@ -477,26 +480,26 @@ def _genslice(start, end, step): start = prev continue else: - yield u's[%d]' % prev + yield 's[%d]' % prev if step is None: - yield u's[%d]' % i + yield 's[%d]' % i else: yield _genslice(start, i, step) - test_string = u''.join(map(compat_chr, range(len(example_sig)))) + test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] - expr_code = u' + '.join(gen_sig_code(cache_spec)) + expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - u' return %s\n') % (signature_id_tuple, expr_code) + ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen(u'Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([$a-zA-Z]+)', jscode, - u'Initial JS player signature function name') + 'Initial JS player signature function name') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -504,9 +507,9 @@ def _parse_sig_js(self, jscode): def _parse_sig_swf(self, file_contents): swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = u'SignatureDecipher' + TARGET_CLASSNAME = 'SignatureDecipher' searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, u'decipher') + initial_function = swfi.extract_function(searched_class, 'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): @@ -516,7 +519,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): raise ExtractorError(u'Cannot decrypt signature without player_url') if player_url.startswith(u'//'): - player_url = u'https:' + player_url + player_url = 'https:' + player_url try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -531,7 +534,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): except Exception as e: tb = traceback.format_exc() raise ExtractorError( - u'Signature extraction failed: ' + tb, cause=e) + 'Signature extraction failed: ' + tb, cause=e) def _get_available_subtitles(self, video_id, webpage): try: @@ -554,7 +557,7 @@ def _get_available_subtitles(self, video_id, webpage): 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'name': unescapeHTML(l[0]).encode('utf-8'), }) - url = u'https://www.youtube.com/api/timedtext?' + params + url = 'https://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url if not sub_lang_list: self._downloader.report_warning(u'video doesn\'t have subtitles') @@ -567,7 +570,7 @@ def _get_available_automatic_caption(self, video_id, webpage): sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for %s' % video_id + err_msg = 'Couldn\'t find automatic captions for %s' % video_id if mobj is None: self._downloader.report_warning(err_msg) return {} @@ -623,7 +626,7 @@ def _get_urls(_manifest): urls = filter(lambda l: l and not l.startswith('#'), lines) return urls - manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest') + manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') formats_urls = _get_urls(manifest) for format_url in formats_urls: itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') @@ -636,8 +639,8 @@ def _extract_annotations(self, video_id): def _real_extract(self, url): proto = ( - u'http' if self._downloader.params.get('prefer_insecure', False) - else u'https') + 'http' if self._downloader.params.get('prefer_insecure', False) + else 'https') # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) @@ -688,11 +691,11 @@ def _real_extract(self, url): if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError( - u'YouTube said: %s' % video_info['reason'][0], + 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) else: raise ExtractorError( - u'"token" parameter not in video info for unknown reason', + '"token" parameter not in video info for unknown reason', video_id=video_id) if 'view_count' in video_info: @@ -725,7 +728,7 @@ def _real_extract(self, url): video_title = video_info['title'][0] else: self._downloader.report_warning(u'Unable to extract video title') - video_title = u'_' + video_title = '_' # thumbnail image # We try first to get a high quality image: @@ -779,7 +782,7 @@ def _real_extract(self, url): if fd_mobj: video_description = unescapeHTML(fd_mobj.group(1)) else: - video_description = u'' + video_description = '' def _extract_count(count_name): count = self._search_regex( @@ -826,7 +829,7 @@ def _extract_count(count_name): if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] - m_s = re_signature.search(args.get('adaptive_fmts', u'')) + m_s = re_signature.search(args.get('adaptive_fmts', '')) if m_s is not None: if 'adaptive_fmts' in video_info: video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts'] @@ -876,12 +879,12 @@ def _map_to_format_list(urlmap): if not age_gate: jsplayer_url_json = self._search_regex( r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, u'JS player URL') + video_webpage, 'JS player URL') player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, u'age gate player URL') + video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) if self._downloader.params.get('verbose'): @@ -892,14 +895,14 @@ def _map_to_format_list(urlmap): if player_url.endswith('swf'): player_version = self._search_regex( r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - u'flash player', fatal=False) + 'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( r'html5player-([^/]+?)(?:/html5player)?\.js', player_url, 'html5 player', fatal=False) - player_desc = u'html5 player %s' % player_version + player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen(u'{%s} signature length %s, %s' % @@ -991,7 +994,7 @@ def decrypt_sig(mobj): } class YoutubePlaylistIE(YoutubeBaseInfoExtractor): - IE_DESC = u'YouTube.com playlists' + IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? @@ -1013,7 +1016,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' - IE_NAME = u'youtube:playlist' + IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'info_dict': { @@ -1068,7 +1071,7 @@ def _extract_mix(self, playlist_id): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( - url, playlist_id, u'Downloading Youtube mix') + url, playlist_id, 'Downloading Youtube mix') search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( search_title('playlist-title') or @@ -1105,7 +1108,7 @@ def _real_extract(self, url): return self._extract_mix(playlist_id) if playlist_id.startswith('TL'): raise ExtractorError(u'For downloading YouTube.com top lists, use ' - u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) + 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) @@ -1114,7 +1117,7 @@ def _real_extract(self, url): # Check if the playlist exists or is private if re.search(r'
[^<]*?(The|This) playlist (does not exist|is private)[^<]*?
', page) is not None: raise ExtractorError( - u'The playlist doesn\'t exist or is private, use --username or ' + 'The playlist doesn\'t exist or is private, use --username or ' '--netrc to access it.', expected=True) @@ -1141,16 +1144,16 @@ def _real_extract(self, url): playlist_title = self._html_search_regex( r'(?s)

\s*(.*?)\s*

', - page, u'title') + page, 'title') url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) class YoutubeTopListIE(YoutubePlaylistIE): - IE_NAME = u'youtube:toplist' + IE_NAME = 'youtube:toplist' IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"' - u' (Example: "yttoplist:music:Top Tracks")') + ' (Example: "yttoplist:music:Top Tracks")') _VALID_URL = r'yttoplist:(?P.*?):(?P.*?)$' _TESTS = [] @@ -1161,7 +1164,7 @@ def _real_extract(self, url): query = compat_urllib_parse.urlencode({'title': title}) playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query) channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex(playlist_re, channel_page, u'list') + link = self._html_search_regex(playlist_re, channel_page, 'list') url = compat_urlparse.urljoin('https://www.youtube.com/', link) video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' @@ -1169,7 +1172,7 @@ def _real_extract(self, url): # sometimes the webpage doesn't contain the videos # retry until we get them for i in itertools.count(0): - msg = u'Downloading Youtube mix' + msg = 'Downloading Youtube mix' if i > 0: msg += ', retry #%d' % i @@ -1182,11 +1185,11 @@ def _real_extract(self, url): class YoutubeChannelIE(InfoExtractor): - IE_DESC = u'YouTube.com channels' + IE_DESC = 'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' - IE_NAME = u'youtube:channel' + IE_NAME = 'youtube:channel' def extract_videos_from_page(self, page): ids_in_page = [] @@ -1238,12 +1241,12 @@ def _real_extract(self, url): class YoutubeUserIE(InfoExtractor): - IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' + IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' - IE_NAME = u'youtube:user' + IE_NAME = 'youtube:user' @classmethod def suitable(cls, url): @@ -1272,7 +1275,7 @@ def download_page(pagenum): gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) page = self._download_webpage( gdata_url, username, - u'Downloading video ids from %d to %d' % ( + 'Downloading video ids from %d to %d' % ( start_index, start_index + self._GDATA_PAGE_SIZE)) try: @@ -1300,10 +1303,10 @@ def download_page(pagenum): class YoutubeSearchIE(SearchInfoExtractor): - IE_DESC = u'YouTube.com searches' - _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' + IE_DESC = 'YouTube.com searches' + _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _MAX_RESULTS = 1000 - IE_NAME = u'youtube:search' + IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' def _get_n_results(self, query, n): @@ -1327,7 +1330,7 @@ def _get_n_results(self, query, n): if 'items' not in api_response: raise ExtractorError( - u'[youtube] No video results', expected=True) + '[youtube] No video results', expected=True) new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1346,12 +1349,12 @@ class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' - IE_DESC = u'YouTube.com searches, newest videos first' + IE_DESC = 'YouTube.com searches, newest videos first' class YoutubeSearchURLIE(InfoExtractor): - IE_DESC = u'YouTube.com search URLs' - IE_NAME = u'youtube:search_url' + IE_DESC = 'YouTube.com search URLs' + IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' def _real_extract(self, url): @@ -1360,7 +1363,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML') + r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') part_codes = re.findall( r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) @@ -1386,14 +1389,14 @@ def _real_extract(self, url): class YoutubeShowIE(InfoExtractor): - IE_DESC = u'YouTube.com (multi-season) shows' + IE_DESC = 'YouTube.com (multi-season) shows' _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' - IE_NAME = u'youtube:show' + IE_NAME = 'youtube:show' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) show_name = mobj.group(1) - webpage = self._download_webpage(url, show_name, u'Downloading show webpage') + webpage = self._download_webpage(url, show_name, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons))) @@ -1419,7 +1422,7 @@ def _FEED_TEMPLATE(self): @property def IE_NAME(self): - return u'youtube:%s' % self._FEED_NAME + return 'youtube:%s' % self._FEED_NAME def _real_initialize(self): self._login() @@ -1429,8 +1432,8 @@ def _real_extract(self, url): paging = 0 for i in itertools.count(1): info = self._download_json(self._FEED_TEMPLATE % paging, - u'%s feed' % self._FEED_NAME, - u'Downloading page %s' % i) + '%s feed' % self._FEED_NAME, + 'Downloading page %s' % i) feed_html = info.get('feed_html') or info.get('content_html') load_more_widget_html = info.get('load_more_widget_html') or feed_html m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) @@ -1447,45 +1450,45 @@ def _real_extract(self, url): return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' + IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = u'Youtube Recommended videos' + _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' + IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = u'Youtube Watch Later' + _PLAYLIST_TITLE = 'Youtube Watch Later' _PERSONAL_FEED = True class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' - _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PERSONAL_FEED = True - _PLAYLIST_TITLE = u'Youtube Watch History' + _PLAYLIST_TITLE = 'Youtube Watch History' class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = u'youtube:favorites' - IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') + playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') return self.url_result(playlist_id, 'YoutubePlaylist') class YoutubeSubscriptionsIE(YoutubePlaylistIE): - IE_NAME = u'youtube:subscriptions' - IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' + IE_NAME = 'youtube:subscriptions' + IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _TESTS = [] def _real_extract(self, url): - title = u'Youtube Subscriptions' + title = 'Youtube Subscriptions' page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) # The extraction process is the same as for playlists, but the regex @@ -1537,9 +1540,9 @@ class YoutubeTruncatedURLIE(InfoExtractor): def _real_extract(self, url): raise ExtractorError( - u'Did you forget to quote the URL? Remember that & is a meta ' - u'character in most shells, so you want to put the URL in quotes, ' - u'like youtube-dl ' - u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - u' or simply youtube-dl BaW_jenozKc .', + 'Did you forget to quote the URL? Remember that & is a meta ' + 'character in most shells, so you want to put the URL in quotes, ' + 'like youtube-dl ' + '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + ' or simply youtube-dl BaW_jenozKc .', expected=True)