From c94427dd60f9836e3752077977cae3b4d0da47d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Dec 2017 22:34:56 +0700 Subject: [PATCH 01/78] [pluralsight] Detect agreement request (#14913) --- youtube_dl/extractor/pluralsight.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 4bf0aa786a..597b112183 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -131,6 +131,13 @@ def _login(self): if BLOCKED in response: raise ExtractorError( 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' + % MUST_AGREE, expected=True) + raise ExtractorError('Unable to log in') def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): From 61d18c8a4bc91602738bfd2e506c3cbbc3a3788b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Dec 2017 22:42:02 +0700 Subject: [PATCH 02/78] [porncom] Fix metadata extraction (closes #14911) --- youtube_dl/extractor/porncom.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 8218c7d3bf..60ade06da3 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -77,12 +77,14 @@ def _real_extract(self, url): self._sort_formats(formats) view_count = str_to_int(self._search_regex( - r'class=["\']views["\'][^>]*>

([\d,.]+)', webpage, + (r'Views:\s*\s*\s*([\d,.]+)', + r'class=["\']views["\'][^>]*>

([\d,.]+)'), webpage, 'view count', fatal=False)) def extract_list(kind): s = self._search_regex( - r'(?s)]*>%s:(.+?)

' % kind.capitalize(), + (r'(?s)%s:\s*\s*(.+?)' % kind.capitalize(), + r'(?s)]*>%s:(.+?)

' % kind.capitalize()), webpage, kind, fatal=False) return re.findall(r']+>([^<]+)', s or '') From 91328f26b05084a1ddd890866670a1133564ecd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Dec 2017 23:01:57 +0700 Subject: [PATCH 03/78] [ard] Skip invalid stream URLs (closes #14906) --- youtube_dl/extractor/ard.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 915f8862e3..ef73d5a933 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from .generic import GenericIE +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, @@ -126,6 +127,8 @@ def _extract_formats(self, media_info, video_id): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: + if not isinstance(stream_url, compat_str) or '//' not in stream_url: + continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): continue @@ -146,13 +149,11 @@ def _extract_formats(self, media_info, video_id): 'play_path': stream_url, 'format_id': 'a%s-rtmp-%s' % (num, quality), } - elif stream_url.startswith('http'): + else: f = { 'url': stream_url, 'format_id': 'a%s-%s-%s' % (num, ext, quality) } - else: - continue m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) if m: f.update({ From d3f8b76b69b9bc986354a83a8d4ee4a8617ce52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Dec 2017 23:11:15 +0700 Subject: [PATCH 04/78] [extractor/generic] Fix typo (closes #14902) Don't pass video_id as mpd_id --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 56df2ab479..c8b7c2e63f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2195,7 +2195,7 @@ def 
_real_extract(self, url): return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, + doc, mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) From b271e3352603c523bf2c6762973937fd36925aae Mon Sep 17 00:00:00 2001 From: Windom Date: Tue, 5 Dec 2017 19:08:31 +0200 Subject: [PATCH 05/78] [xhamster] Add support for mobile URLs and fix thumbnail extraction --- youtube_dl/extractor/xhamster.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 52f8ded2f8..68652a22fc 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -75,6 +75,10 @@ class XHamsterIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # mobile site + 'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111', + 'only_matching': True, }, { 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', 'only_matching': True, @@ -93,7 +97,8 @@ def _real_extract(self, url): video_id = mobj.group('id') or mobj.group('id_2') display_id = mobj.group('display_id') or mobj.group('display_id_2') - webpage = self._download_webpage(url, video_id) + desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) + webpage = self._download_webpage(desktop_url, video_id) error = self._html_search_regex( r']+id=["\']videoClosed["\'][^>]*>(.+?)', @@ -229,8 +234,8 @@ def get_height(s): webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( - [r'''thumb\s*:\s*(?P["'])(?P.+?)(?P=q)''', - r''']+poster=(?P["'])(?P.+?)(?P=q)[^>]*>'''], + [r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''', + r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''], webpage, 'thumbnail', fatal=False, group='thumbnail') duration = 
parse_duration(self._search_regex( @@ -274,15 +279,16 @@ def get_height(s): class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' + _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { 'id': '3328539', 'ext': 'mp4', 'title': 'Pen Masturbation', + 'timestamp': 1406581861, 'upload_date': '20140728', - 'uploader_id': 'anonymous', + 'uploader': 'ManyakisArt', 'duration': 5, 'age_limit': 18, } From 3c4fbfeca2431b120537b9eaedf5977dd0ab13b1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 6 Dec 2017 10:54:20 +0100 Subject: [PATCH 06/78] [dailymotion] remove dailymotion cloud extractor(closes #6794) https://web.archive.org/web/20160312110217/https://www.dmcloud.net/ --- youtube_dl/extractor/dailymotion.py | 49 ----------------------------- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/francetv.py | 9 +----- youtube_dl/extractor/generic.py | 27 +--------------- 4 files changed, 2 insertions(+), 84 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 21a2d02392..0e7d587dd4 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -413,52 +413,3 @@ def _real_extract(self, url): 'title': full_user, 'entries': self._extract_entries(user), } - - -class DailymotionCloudIE(DailymotionBaseInfoExtractor): - _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/' - _VALID_URL = r'%s[^/]+/(?P[^/?]+)' % _VALID_URL_PREFIX - _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX - - _TESTS = [{ - # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html - # Tested at FranceTvInfo_2 - 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', 
- 'only_matching': True, - }, { - # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html - 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', - 'only_matching': True, - }] - - @classmethod - def _extract_dmcloud_url(cls, webpage): - mobj = re.search(r']+src=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage) - if mobj: - return mobj.group(1) - - mobj = re.search( - r']+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, - webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage_no_ff(url, video_id) - - title = self._html_search_regex(r'([^>]+)', webpage, 'title') - - video_info = self._parse_json(self._search_regex( - r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) - - # TODO: parse ios_url, which is in fact a manifest - video_url = video_info['mp4_url'] - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': video_info.get('thumbnail_url'), - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2cc3bc4632..9c9739ad22 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -246,7 +246,6 @@ DailymotionIE, DailymotionPlaylistIE, DailymotionUserIE, - DailymotionCloudIE, ) from .daisuki import ( DaisukiMottoIE, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 5a3abeaff6..80c5970156 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -13,10 +13,7 @@ parse_duration, determine_ext, ) -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, -) +from .dailymotion import DailymotionIE class FranceTVBaseInfoExtractor(InfoExtractor): @@ -290,10 +287,6 @@ def _real_extract(self, url): page_title = 
mobj.group('title') webpage = self._download_webpage(url, page_title) - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) - dailymotion_urls = DailymotionIE._extract_urls(webpage) if dailymotion_urls: return self.playlist_result([ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c8b7c2e63f..c7b6092153 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,10 +59,7 @@ from .drtuber import DrTuberIE from .redtube import RedTubeIE from .vimeo import VimeoIE -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, -) +from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE @@ -1472,23 +1469,6 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # Dailymotion Cloud video - { - 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', - 'info_dict': { - 'id': 'x2uy8t3', - 'ext': 'mp4', - 'title': 'Sauvons les abeilles ! 
- Le débat', - 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1434970506, - 'upload_date': '20150622', - 'uploader': 'Public Sénat', - 'uploader_id': 'xa9gza', - }, - 'skip': 'File not found.', - }, # OnionStudios embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -2704,11 +2684,6 @@ def _real_extract(self, url): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') - # Look for Dailymotion Cloud videos - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') - # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: From 684ae102360dbef6d1e5fcd75a0f86266030e02d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 6 Dec 2017 22:56:14 +0100 Subject: [PATCH 07/78] [fox] add support for adobe pass auth and extract subtitles(close #14489)(closes #14205) --- youtube_dl/extractor/fox.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 5f98d017b8..11d6c9c325 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -11,6 +11,7 @@ parse_duration, try_get, unified_timestamp, + update_url_query, ) @@ -62,7 +63,8 @@ def _real_extract(self, url): duration = int_or_none(video.get('durationInSeconds')) or int_or_none( video.get('duration')) or parse_duration(video.get('duration')) timestamp = unified_timestamp(video.get('datePublished')) - age_limit = parse_age_limit(video.get('contentRating')) + rating = video.get('contentRating') + age_limit = parse_age_limit(rating) data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} @@ -77,8 +79,24 @@ def _real_extract(self, url): release_year = int_or_none(video.get('releaseYear')) if data.get('authRequired'): - # 
TODO: AP - pass + resource = self._get_mvpd_resource( + 'fbc-fox', title, video.get('guid'), rating) + release_url = update_url_query( + release_url, { + 'auth': self._extract_mvpd_auth( + url, video_id, 'fbc-fox', resource) + }) + + subtitles = {} + for doc_rel in video.get('documentReleases', []): + rel_url = doc_rel.get('url') + if not url or doc_rel.get('format') != 'SCC': + continue + subtitles['en'] = [{ + 'url': rel_url, + 'ext': 'scc', + }] + break info = { 'id': video_id, @@ -93,6 +111,7 @@ def _real_extract(self, url): 'episode': episode, 'episode_number': episode_number, 'release_year': release_year, + 'subtitles': subtitles, } urlh = self._request_webpage(HEADRequest(release_url), video_id) From 1bd4fc96e6f8978f83e90ca913df18a979445028 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 7 Dec 2017 08:46:30 +0100 Subject: [PATCH 08/78] [sonyliv] extract higher quality formats and bypass geo restriction(closes #14922) --- youtube_dl/extractor/sonyliv.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py index accd112aa2..c3078e2857 100644 --- a/youtube_dl/extractor/sonyliv.py +++ b/youtube_dl/extractor/sonyliv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import smuggle_url class SonyLIVIE(InfoExtractor): @@ -10,12 +11,12 @@ class SonyLIVIE(InfoExtractor): 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", 'info_dict': { 'title': "Ep. 
1 - Achaari Cheese Toast - Bachelor's Delight", - 'id': '5024612095001', + 'id': 'ref:5024612095001', 'ext': 'mp4', - 'upload_date': '20160707', + 'upload_date': '20170923', 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', - 'uploader_id': '4338955589001', - 'timestamp': 1467870968, + 'uploader_id': '5182475815001', + 'timestamp': 1506200547, }, 'params': { 'skip_download': True, @@ -26,9 +27,11 @@ class SonyLIVIE(InfoExtractor): 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): brightcove_id = self._match_id(url) return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['IN']}), + 'BrightcoveNew', brightcove_id) From a670b1ba266533c520a436469585904321808a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 Dec 2017 22:16:41 +0700 Subject: [PATCH 09/78] [README.md] Add is_live, start_time and end_time to output template section (closes #14926) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ea321d5362..cd30d147a9 100644 --- a/README.md +++ b/README.md @@ -511,6 +511,9 @@ # OUTPUT TEMPLATE - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage - `comment_count` (numeric): Number of comments on the video - `age_limit` (numeric): Age restriction for the video (years) + - `is_live` (boolean): Whether this video is a live stream or a fixed-length video + - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL + - `end_time` (numeric): Time 
in seconds where the reproduction should end, as specified in the URL - `format` (string): A human-readable description of the format - `format_id` (string): Format code specified by `--format` - `format_note` (string): Additional info about the format From d21d0ba6c14e8a6696130090641da4e2028e1bb3 Mon Sep 17 00:00:00 2001 From: Timendum Date: Mon, 23 Oct 2017 15:32:45 +0200 Subject: [PATCH 10/78] [raiplay:playlist] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rai.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9c9739ad22..d8f9f94ccb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -857,6 +857,7 @@ RaiPlayIE, RaiPlayLiveIE, RaiIE, + RaiPlaylistIE, ) from .rbmaradio import RBMARadioIE from .rds import RDSIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 5bf64a56b7..6254583808 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -455,3 +455,29 @@ def _real_extract(self, url): info.update(relinker_info) return info + + +class RaiPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_meta('programma', webpage, default=None) + video_urls = re.findall(' href="(/raiplay/video.+)"', webpage) + video_urls = [urljoin(url, video_url) for video_url in video_urls] + entries = [ + self.url_result( + video_url, + RaiPlayIE.ie_key()) + for video_url in video_urls if RaiPlayIE.suitable(video_url) + ] + return self.playlist_result(entries, playlist_id, title) 
From 1115271ac61b89cc4ac1ca922eff8a4bed0fbf57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Dec 2017 00:46:28 +0700 Subject: [PATCH 11/78] [raiplay:playlist] Fix issues and improve (closes #14563) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/rai.py | 62 +++++++++++++++++------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8f9f94ccb..b9c97fac45 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -856,8 +856,8 @@ from .rai import ( RaiPlayIE, RaiPlayLiveIE, + RaiPlayPlaylistIE, RaiIE, - RaiPlaylistIE, ) from .rbmaradio import RBMARadioIE from .rds import RDSIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 6254583808..d22311031f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,6 +17,7 @@ parse_duration, strip_or_none, try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -249,6 +250,41 @@ def _real_extract(self, url): } +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) + print(description) + + entries = [] + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + 
entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ @@ -455,29 +491,3 @@ def _real_extract(self, url): info.update(relinker_info) return info - - -class RaiPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/]+)' - _TESTS = [{ - 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', - 'info_dict': { - 'id': 'nondirloalmiocapo', - 'title': 'Non dirlo al mio capo', - }, - 'playlist_mincount': 12, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_meta('programma', webpage, default=None) - video_urls = re.findall(' href="(/raiplay/video.+)"', webpage) - video_urls = [urljoin(url, video_url) for video_url in video_urls] - entries = [ - self.url_result( - video_url, - RaiPlayIE.ie_key()) - for video_url in video_urls if RaiPlayIE.suitable(video_url) - ] - return self.playlist_result(entries, playlist_id, title) From e2707a832cd53e2cfa68b99db997890a6a5bd685 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Mon, 23 Oct 2017 21:15:48 +0200 Subject: [PATCH 12/78] [ellentube] Fix extraction (closes #14407) --- youtube_dl/extractor/ellentube.py | 140 +++++++++++++++++++++++++++++ youtube_dl/extractor/ellentv.py | 101 --------------------- youtube_dl/extractor/extractors.py | 8 +- 3 files changed, 145 insertions(+), 104 deletions(-) create mode 100644 youtube_dl/extractor/ellentube.py delete mode 100644 youtube_dl/extractor/ellentv.py diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py new file mode 100644 index 0000000000..68fe172736 --- /dev/null +++ b/youtube_dl/extractor/ellentube.py @@ -0,0 +1,140 @@ +# coding: 
utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, +) + + +class EllenTubeIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https://api-prod\.ellentube\.com/ellenapi/api/item/ + |ellentube: + ) + (?P + [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12} + )''' + + _TESTS = [{ + 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/75c64c16-aefd-4558-b4f5-3de09b22e6fc', + 'match_only': True, + }, { + 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', + 'match_only': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, video_id) + title = data['title'] + description = data.get('description') + publish_time = int_or_none(data.get('publishTime')) + thumbnail = data.get('thumbnail') + + formats = [] + duration = None + for entry in data.get('media'): + if entry.get('id') == 'm3u8': + formats = self._extract_m3u8_formats( + entry.get('url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + duration = int_or_none(entry.get('duration')) + break + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'timestamp': publish_time, + 'formats': formats, + } + + +class EllenTubeVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P.+)\.html' + + _TEST = { + 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', + 'md5': '2fabc277131bddafdd120e0fc0f974c9', + 'info_dict': { + 'id': '0822171c-3829-43bf-b99f-d77358ae75e3', + 'ext': 'mp4', + 'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', + 'description': 'md5:76e3355e2242a78ad9e3858e5616923f', + 'duration': 514, + 'timestamp': 1508505120000, + 'thumbnail': 
'https://warnerbros-h.assetsadobe.com/is/image/content/dam/ellen/videos/episodes/season15/32/video--2728751654987218111', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'(?s).*data-config.+([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + webpage, 'video id') + return self.url_result('ellentube:%s' % video_id, 'EllenTube') + + +class EllenTubePlaylistIE(InfoExtractor): + def _extract_videos_from_json(self, data, display_id): + return [self.url_result('ellentube:%s' % elem['id'], 'EllenTube') + for elem in data if elem.get('type') == 'VIDEO'] + + def _extract_playlist(self, url, display_id, extract_description=True): + webpage = self._download_webpage(url, display_id) + playlist_data = self._html_search_regex( + r'', webpage, 'playlist data') + playlist_title = self._search_regex( + r'"title"\s*:\s*"(.+?)"', playlist_data, 'playlist title') + playlist_description = clean_html(self._search_regex( + r'"description"\s*:\s*"(.+?)"', playlist_data, 'playlist description', + fatal=False)) if extract_description else None + api_search = self._search_regex( + r'"filter"\s*:\s*"(.+?)"', playlist_data, 'playlist api request') + api_data = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' % api_search, + display_id) + return self.playlist_result( + self._extract_videos_from_json(api_data, display_id), + display_id, playlist_title, playlist_description) + + +class EllenTubeEpisodeIE(EllenTubePlaylistIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P.+)\.html' + + _TEST = { + 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', + 'info_dict': { + 'id': 'dax-shepard-jordan-fisher-haim', + 'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM', + 'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d' + }, + 'playlist_count': 6, + } + + def _real_extract(self, 
url): + display_id = self._match_id(url) + return self._extract_playlist(url, display_id) + + +class EllenTubeStudioIE(EllenTubePlaylistIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P.+)\.html' + + _TEST = { + 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', + 'info_dict': { + 'id': 'macey-goes-rving0', + 'title': 'Macey Goes RVing', + }, + 'playlist_mincount': 3, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._extract_playlist(url, display_id, False) diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py deleted file mode 100644 index e0a13dd76c..0000000000 --- a/youtube_dl/extractor/ellentv.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import NO_DEFAULT - - -class EllenTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P[a-z0-9_-]+)' - _TESTS = [{ - 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', - 'md5': '4294cf98bc165f218aaa0b89e0fd8042', - 'info_dict': { - 'id': '0_ipq1gsai', - 'ext': 'mov', - 'title': 'Fast Fingers of Fate', - 'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', - 'timestamp': 1428035648, - 'upload_date': '20150403', - 'uploader_id': 'batchUser', - }, - }, { - # not available via http://widgets.ellentube.com/ - 'url': 'http://www.ellentv.com/videos/1-szkgu2m2/', - 'info_dict': { - 'id': '1_szkgu2m2', - 'ext': 'flv', - 'title': "Ellen's Amazingly Talented Audience", - 'description': 'md5:86ff1e376ff0d717d7171590e273f0a5', - 'timestamp': 1255140900, - 'upload_date': '20091010', - 'uploader_id': 'ellenkaltura@gmail.com', - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url) - - for num, url_ in enumerate(URLS, 1): - webpage = 
self._download_webpage( - url_, video_id, fatal=num == len(URLS)) - - default = NO_DEFAULT if num == len(URLS) else None - - partner_id = self._search_regex( - r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id', - default=default) - - kaltura_id = self._search_regex( - [r'id="kaltura_player_([^"]+)"', - r"_wb_entry_id\s*:\s*'([^']+)", - r'data-kaltura-entry-id="([^"]+)'], - webpage, 'kaltura id', default=default) - - if partner_id and kaltura_id: - break - - return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) - - -class EllenTVClipsIE(InfoExtractor): - IE_NAME = 'EllenTV:clips' - _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P[a-z0-9_-]+)' - _TEST = { - 'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', - 'info_dict': { - 'id': 'meryl-streep-vanessa-hudgens', - 'title': 'Meryl Streep, Vanessa Hudgens', - }, - 'playlist_mincount': 5, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - playlist = self._extract_playlist(webpage, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist) - } - - def _extract_playlist(self, webpage, playlist_id): - json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') - return self._parse_json('[{' + json_string + '}]', playlist_id) - - def _extract_entries(self, playlist): - return [ - self.url_result( - 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), - KalturaIE.ie_key(), video_id=item['kaltura_entry_id']) - for item in playlist] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b9c97fac45..55d2dd1fe5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -311,9 +311,11 @@ from .eighttracks import EightTracksIE from .einthusan import EinthusanIE 
from .eitb import EitbIE -from .ellentv import ( - EllenTVIE, - EllenTVClipsIE, +from .ellentube import ( + EllenTubeIE, + EllenTubeEpisodeIE, + EllenTubeStudioIE, + EllenTubeVideoIE, ) from .elpais import ElPaisIE from .embedly import EmbedlyIE From 2a57b62b8007973b5b4974a1d9f5ab06ae78c86e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Dec 2017 02:16:23 +0700 Subject: [PATCH 13/78] [ellentube] Fix issues, improve and simplify (closes #14570) --- youtube_dl/extractor/ellentube.py | 167 ++++++++++++++--------------- youtube_dl/extractor/extractors.py | 3 +- 2 files changed, 81 insertions(+), 89 deletions(-) diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py index 68fe172736..5444732748 100644 --- a/youtube_dl/extractor/ellentube.py +++ b/youtube_dl/extractor/ellentube.py @@ -4,137 +4,130 @@ from .common import InfoExtractor from ..utils import ( clean_html, + extract_attributes, + float_or_none, int_or_none, + try_get, ) -class EllenTubeIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https://api-prod\.ellentube\.com/ellenapi/api/item/ - |ellentube: - ) - (?P - [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12} - )''' +class EllenTubeBaseIE(InfoExtractor): + def _extract_data_config(self, webpage, video_id): + details = self._search_regex( + r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?>)', webpage, + 'details') + return self._parse_json( + extract_attributes(details)['data-config'], video_id) - _TESTS = [{ - 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/75c64c16-aefd-4558-b4f5-3de09b22e6fc', - 'match_only': True, - }, { - 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', - 'match_only': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, video_id) + def _extract_video(self, data, video_id): title = data['title'] - description = data.get('description') - 
publish_time = int_or_none(data.get('publishTime')) - thumbnail = data.get('thumbnail') formats = [] duration = None for entry in data.get('media'): if entry.get('id') == 'm3u8': formats = self._extract_m3u8_formats( - entry.get('url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + entry['url'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') duration = int_or_none(entry.get('duration')) break self._sort_formats(formats) + + def get_insight(kind): + return int_or_none(try_get( + data, lambda x: x['insight']['%ss' % kind])) + return { + 'extractor_key': EllenTubeIE.ie_key(), 'id': video_id, 'title': title, - 'description': description, + 'description': data.get('description'), 'duration': duration, - 'thumbnail': thumbnail, - 'timestamp': publish_time, + 'thumbnail': data.get('thumbnail'), + 'timestamp': float_or_none(data.get('publishTime'), scale=1000), + 'view_count': get_insight('view'), + 'like_count': get_insight('like'), 'formats': formats, } -class EllenTubeVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P.+)\.html' - - _TEST = { - 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', +class EllenTubeIE(EllenTubeBaseIE): + _VALID_URL = r'''(?x) + (?: + ellentube:| + https://api-prod\.ellentube\.com/ellenapi/api/item/ + ) + (?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + ''' + _TESTS = [{ + 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3', 'md5': '2fabc277131bddafdd120e0fc0f974c9', 'info_dict': { 'id': '0822171c-3829-43bf-b99f-d77358ae75e3', 'ext': 'mp4', 'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', 'description': 'md5:76e3355e2242a78ad9e3858e5616923f', + 'thumbnail': r're:^https?://.+?', 'duration': 514, - 'timestamp': 1508505120000, - 'thumbnail': 
'https://warnerbros-h.assetsadobe.com/is/image/content/dam/ellen/videos/episodes/season15/32/video--2728751654987218111', + 'timestamp': 1508505120, + 'upload_date': '20171020', + 'view_count': int, + 'like_count': int, } + }, { + 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, + video_id) + return self._extract_video(data, video_id) + + +class EllenTubeVideoIE(EllenTubeBaseIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P.+?)\.html' + _TEST = { + 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', + 'only_matching': True, } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - r'(?s).*data-config.+([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage, 'video id') - return self.url_result('ellentube:%s' % video_id, 'EllenTube') + video_id = self._extract_data_config(webpage, display_id)['id'] + return self.url_result( + 'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), + video_id=video_id) -class EllenTubePlaylistIE(InfoExtractor): - def _extract_videos_from_json(self, data, display_id): - return [self.url_result('ellentube:%s' % elem['id'], 'EllenTube') - for elem in data if elem.get('type') == 'VIDEO'] - - def _extract_playlist(self, url, display_id, extract_description=True): - webpage = self._download_webpage(url, display_id) - playlist_data = self._html_search_regex( - r'', webpage, 'playlist data') - playlist_title = self._search_regex( - r'"title"\s*:\s*"(.+?)"', playlist_data, 'playlist title') - playlist_description = clean_html(self._search_regex( - r'"description"\s*:\s*"(.+?)"', playlist_data, 'playlist description', - fatal=False)) if extract_description else None 
- api_search = self._search_regex( - r'"filter"\s*:\s*"(.+?)"', playlist_data, 'playlist api request') - api_data = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' % api_search, - display_id) - return self.playlist_result( - self._extract_videos_from_json(api_data, display_id), - display_id, playlist_title, playlist_description) - - -class EllenTubeEpisodeIE(EllenTubePlaylistIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P.+)\.html' - - _TEST = { +class EllenTubePlaylistIE(EllenTubeBaseIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P.+?)\.html' + _TESTS = [{ 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', 'info_dict': { 'id': 'dax-shepard-jordan-fisher-haim', - 'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM', - 'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d' + 'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", + 'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c', }, 'playlist_count': 6, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._extract_playlist(url, display_id) - - -class EllenTubeStudioIE(EllenTubePlaylistIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P.+)\.html' - - _TEST = { + }, { 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', - 'info_dict': { - 'id': 'macey-goes-rving0', - 'title': 'Macey Goes RVing', - }, - 'playlist_mincount': 3, - } + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - return self._extract_playlist(url, display_id, False) + webpage = self._download_webpage(url, display_id) + data = self._extract_data_config(webpage, display_id)['data'] + feed = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' + % data['filter'], display_id) + entries = [ + self._extract_video(elem, elem['id']) + for elem in feed if elem.get('type') == 
'VIDEO' and elem.get('id')] + return self.playlist_result( + entries, display_id, data.get('title'), + clean_html(data.get('description'))) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 55d2dd1fe5..0177a2cff0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -313,9 +313,8 @@ from .eitb import EitbIE from .ellentube import ( EllenTubeIE, - EllenTubeEpisodeIE, - EllenTubeStudioIE, EllenTubeVideoIE, + EllenTubePlaylistIE, ) from .elpais import ElPaisIE from .embedly import EmbedlyIE From f4cc03d60b5dd713fb8964cd9ecf8ca2b1a8a556 Mon Sep 17 00:00:00 2001 From: Andrew Bottom Date: Tue, 24 Oct 2017 11:50:02 -0500 Subject: [PATCH 14/78] [stretchinternet] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/stretchinternet.py | 28 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 youtube_dl/extractor/stretchinternet.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0177a2cff0..612fd35d4d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1001,6 +1001,7 @@ from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py new file mode 100644 index 0000000000..9a0ec0e650 --- /dev/null +++ b/youtube_dl/extractor/stretchinternet.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class StretchInternetIE(InfoExtractor): + IE_DESC = 'StretchInternet' + _VALID_URL = r'https?://.*?stretchinternet\.com/[^/_?].*(?<=eventId=)(?P.*)(?=&).*' + _TEST = { + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 
'info_dict': { + 'id': '313900', + 'ext': 'mp4', + 'title': 'StretchInternet' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + stream = self._download_json('https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' % video_id, video_id) + stream_url = stream.get('source') + return { + 'ie_key': 'Generic', + 'id': video_id, + 'url': 'http://%s' % stream_url, + 'title': 'StretchInternet' + } From a3de5e6c0e0efef4e8ff0cd37961c594b13c7fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Dec 2017 17:58:08 +0700 Subject: [PATCH 15/78] [stretchinternet] Fix issues and improve (closes #14576) --- youtube_dl/extractor/stretchinternet.py | 38 +++++++++++++++++++------ 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 9a0ec0e650..ae2ac1b42f 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -1,28 +1,48 @@ -# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import int_or_none class StretchInternetIE(InfoExtractor): - IE_DESC = 'StretchInternet' - _VALID_URL = r'https?://.*?stretchinternet\.com/[^/_?].*(?<=eventId=)(?P.*)(?=&).*' + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P\d+)' _TEST = { 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', 'info_dict': { 'id': '313900', 'ext': 'mp4', - 'title': 'StretchInternet' + 'title': 'Augustana (S.D.) 
Baseball vs University of Mary', + 'description': 'md5:7578478614aae3bdd4a90f578f787438', + 'timestamp': 1490468400, + 'upload_date': '20170325', } } def _real_extract(self, url): video_id = self._match_id(url) - stream = self._download_json('https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' % video_id, video_id) - stream_url = stream.get('source') + + stream = self._download_json( + 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' + % video_id, video_id) + + video_url = 'https://%s' % stream['source'] + + event = self._download_json( + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={ + 'clientID': 99997, + 'eventID': video_id, + 'token': 'asdf', + })['event'] + + title = event.get('title') or event['mobileTitle'] + description = event.get('customText') + timestamp = int_or_none(event.get('longtime')) + return { - 'ie_key': 'Generic', 'id': video_id, - 'url': 'http://%s' % stream_url, - 'title': 'StretchInternet' + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'url': video_url, } From 6f1ec339a0332041d6469887a91cf4e3ff557477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Dec 2017 19:52:31 +0700 Subject: [PATCH 16/78] [udemy] Improve course id extraction (closes #14938) --- youtube_dl/extractor/udemy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index c248ea7278..b66033923c 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -62,11 +62,11 @@ class UdemyIE(InfoExtractor): def _extract_course_info(self, webpage, video_id): course = self._parse_json( unescapeHTML(self._search_regex( - r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + r'ng-init=["\'].*\bcourse=({.+?})[;"\']', + webpage, 'course', default='{}')), video_id, fatal=False) or {} course_id = course.get('id') or 
self._search_regex( - (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), - webpage, 'course id') + r'data-course-id=["\'](\d+)', webpage, 'course id') return course_id, course.get('title') def _enroll_course(self, base_url, webpage, course_id): From 913b61eeee9436a3ddf5675ec935d00c3a05b7a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Dec 2017 20:02:19 +0700 Subject: [PATCH 17/78] [udemy] Extract more HLS formats --- youtube_dl/extractor/udemy.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index b66033923c..195f5ce78d 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -257,6 +257,11 @@ def extract_formats(source_list): video_url = source.get('file') or source.get('src') if not video_url or not isinstance(video_url, compat_str): continue + if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue format_id = source.get('label') f = { 'url': video_url, From 51f2863357af55dfe026bfe47808bf104e7eebf9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 10 Dec 2017 14:10:52 +0100 Subject: [PATCH 18/78] [twitter] improve extraction(closes #14197) --- youtube_dl/extractor/twitter.py | 141 +++++++++++++++----------------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1b0b963716..d7e425041f 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -43,7 +43,7 @@ def _search_dimensions_in_video_url(a_format, video_url): class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' + _VALID_URL = 
r'https?://(?:www\.)?twitter\.com/i/(?Pcards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,11 +51,10 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'Twitter Card', + 'title': 'Twitter web player', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 30.033, }, - 'skip': 'Video gone', }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -63,11 +62,9 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'Twitter Card', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 80.155, + 'title': 'Twitter web player', + 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', }, - 'skip': 'Video gone', }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', @@ -120,15 +117,15 @@ def _parse_media_info(self, media_info, video_id): elif media_url.endswith('.mpd'): formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) else: - vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) + tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) a_format = { 'url': media_url, - 'format_id': 'http-%d' % vbr if vbr else 'http', - 'vbr': vbr, + 'format_id': 'http-%d' % tbr if tbr else 'http', + 'tbr': tbr, } # Reported bitRate may be zero - if not a_format['vbr']: - del a_format['vbr'] + if not a_format['tbr']: + del a_format['tbr'] self._search_dimensions_in_video_url(a_format, media_url) @@ -150,79 +147,83 @@ def _extract_mobile_formats(self, username, video_id): bearer_token = self._search_regex( r'BEARER_TOKEN\s*:\s*"([^"]+)"', main_script, 'bearer token') - guest_token = self._search_regex( - r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)', - webpage, 'guest token') + # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 
'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id, - video_id, 'Downloading mobile API data', + 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + video_id, 'Downloading API data', headers={ 'Authorization': 'Bearer ' + bearer_token, - 'x-guest-token': guest_token, }) - media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id] - ['extended_entities']['media'][0]['video_info']) or {} + media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} return self._parse_media_info(media_info, video_id) def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.search(self._VALID_URL, url).groups() config = None formats = [] duration = None - webpage = self._download_webpage(url, video_id) + urls = [url] + if path.startswith('cards/'): + urls.append('https://twitter.com/i/videos/' + video_id) - iframe_url = self._html_search_regex( - r']+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) + for u in urls: + webpage = self._download_webpage(u, video_id) - config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, - 'data player config', default='{}'), - video_id) + iframe_url = self._html_search_regex( + r']+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + webpage, 'video iframe', default=None) + if iframe_url: + return self.url_result(iframe_url) - if config.get('source_type') == 'vine': - return self.url_result(config['player_url'], 'Vine') + config = self._parse_json(self._html_search_regex( + r'data-(?:player-)?config="([^"]+)"', webpage, + 'data player config', default='{}'), + video_id) - periscope_url = PeriscopeIE._extract_url(webpage) - if periscope_url: - return self.url_result(periscope_url, PeriscopeIE.ie_key()) + if 
config.get('source_type') == 'vine': + return self.url_result(config['player_url'], 'Vine') - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + periscope_url = PeriscopeIE._extract_url(webpage) + if periscope_url: + return self.url_result(periscope_url, PeriscopeIE.ie_key()) - if video_url: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) - else: - f = { - 'url': video_url, - } + video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') - self._search_dimensions_in_video_url(f, video_url) + if video_url: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) + else: + f = { + 'url': video_url, + } - formats.append(f) + self._search_dimensions_in_video_url(f, video_url) - vmap_url = config.get('vmapUrl') or config.get('vmap_url') - if vmap_url: - formats.extend( - self._extract_formats_from_vmap_url(vmap_url, video_id)) + formats.append(f) - media_info = None + vmap_url = config.get('vmapUrl') or config.get('vmap_url') + if vmap_url: + formats.extend( + self._extract_formats_from_vmap_url(vmap_url, video_id)) - for entity in config.get('status', {}).get('entities', []): - if 'mediaInfo' in entity: - media_info = entity['mediaInfo'] + media_info = None - if media_info: - formats.extend(self._parse_media_info(media_info, video_id)) - duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + for entity in config.get('status', {}).get('entities', []): + if 'mediaInfo' in entity: + media_info = entity['mediaInfo'] - username = config.get('user', {}).get('screen_name') - if username: - formats.extend(self._extract_mobile_formats(username, video_id)) + if media_info: + formats.extend(self._parse_media_info(media_info, video_id)) + duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + + username = 
config.get('user', {}).get('screen_name') + if username: + formats.extend(self._extract_mobile_formats(username, video_id)) + + if formats: + break self._remove_duplicate_formats(formats) self._sort_formats(formats) @@ -258,9 +259,6 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'freethenipple', 'duration': 12.922, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -277,7 +275,6 @@ class TwitterIE(InfoExtractor): 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/starwars/status/665052190608723968', - 'md5': '39b7199856dee6cd4432e72c74bc69d4', 'info_dict': { 'id': '665052190608723968', 'ext': 'mp4', @@ -303,20 +300,16 @@ class TwitterIE(InfoExtractor): }, }, { 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', - 'md5': '', 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'あかさ - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'あかさ on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'あかさ', + 'uploader': 'JG', 'uploader_id': 'jaydingeer', 'duration': 30.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', 'md5': '89a15ed345d13b86e9a5a5e051fa308a', @@ -342,9 +335,6 @@ class TwitterIE(InfoExtractor): 'uploader': 'Captain America', 'duration': 3.17, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'info_dict': { @@ -370,9 +360,6 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'news_al3alm', 'duration': 277.4, }, - 'params': { - 'format': 
'best[format_id^=http-]', - }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { From c38970ca10fc8c8e3ba05019d5f73ff235de9fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Dec 2017 22:46:21 +0700 Subject: [PATCH 19/78] [culturebox] Improve video id extraction (closes #14947) --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 80c5970156..095bb3954c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -356,6 +356,7 @@ def _real_extract(self, url): raise ExtractorError('Video %s is not available' % name, expected=True) video_id, catalogue = self._search_regex( - r'"https?://videos\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video id').split('@') + r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) From fa1dd6d2cdaa41b67ee2fc47dc5184a040004d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Dec 2017 23:15:24 +0700 Subject: [PATCH 20/78] [ChangeLog] Actualize --- ChangeLog | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/ChangeLog b/ChangeLog index 63837d62b7..fd3f3ebf81 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,30 @@ +version + +Core ++ [utils] Add sami mimetype to mimetype2ext + +Extractors +* [culturebox] Improve video id extraction (#14947) +* [twitter] Improve extraction (#14197) ++ [udemy] Extract more HLS formats +* [udemy] Improve course id extraction (#14938) ++ [stretchinternet] Add support for portal.stretchinternet.com (#14576) +* [ellentube] Fix extraction (#14407, #14570) ++ [raiplay:playlist] Add support for playlists (#14563) +* [sonyliv] Bypass geo restriction +* [sonyliv] Extract higher quality formats (#14922) +* [fox] Extract subtitles ++ [fox] Add support for Adobe 
Pass authentication (#14205, #14489) +- [dailymotion:cloud] Remove extractor (#6794) +* [xhamster] Fix thumbnail extraction (#14780) ++ [xhamster] Add support for mobile URLs (#14780) +* [generic] Don't pass video id as mpd id while extracting DASH (#14902) +* [ard] Skip invalid stream URLs (#14906) +* [porncom] Fix metadata extraction (#14911) +* [pluralsight] Detect agreement request (#14913) +* [toutv] Fix login (#14614) + + version 2017.12.02 Core From 1fa0dce2c044826eec87b712f90964b90198aaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Dec 2017 23:18:53 +0700 Subject: [PATCH 21/78] release 2017.12.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 8 +++++--- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 513823b9bd..736869bf0f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ ## Please follow the guide below --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ ### If the purpose of this *issue* is a *bug report*, *site support request* or [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.02 +[debug] youtube-dl version 2017.12.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index fd3f3ebf81..a1fdcab999 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.12.10 Core + [utils] Add sami mimetype to mimetype2ext diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0287a40114..d5e3a8a67d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -198,7 +198,6 @@ # Supported sites - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** - - **DailymotionCloud** - **DaisukiMotto** - **DaisukiMottoPlaylist** - **daum.net** @@ -243,8 +242,9 @@ # Supported sites - **eHow** - **Einthusan** - **eitb.tv** - - **EllenTV** - - **EllenTV:clips** + - **EllenTube** + - **EllenTubePlaylist** + - **EllenTubeVideo** - **ElPais**: El País - **Embedly** - **EMPFlix** @@ -662,6 +662,7 @@ # Supported sites - **Rai** - **RaiPlay** - **RaiPlayLive** + - **RaiPlayPlaylist** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -781,6 +782,7 @@ # Supported sites - **streamcloud.eu** - **StreamCZ** - **StreetVoice** + - **StretchInternet** - **SunPorno** - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv 
diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 88bf1d652f..d0c4383174 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.02' +__version__ = '2017.12.10' From b6f78d76c14a1787606aad23c2df0d5158d1fde1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 13:38:55 +0100 Subject: [PATCH 22/78] [tbs] fix extraction(fixes #13658) --- youtube_dl/extractor/tbs.py | 134 +++++++++++++++++++++++---------- youtube_dl/extractor/turner.py | 48 ++++++------ 2 files changed, 120 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index e9474533f4..460bc5d742 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,58 +4,110 @@ import re from .turner import TurnerBaseIE -from ..utils import extract_attributes +from ..utils import ( + float_or_none, + int_or_none, + strip_or_none, +) class TBSIE(TurnerBaseIE): - # https://github.com/rg3/youtube-dl/issues/13658 - _WORKING = False - - _VALID_URL = r'https?://(?:www\.)?(?Ptbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P[^/?#]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?Ptbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P[^/?#]+)' _TESTS = [{ - 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', - 'md5': '9e61d680e2285066ade7199e6408b2ee', + 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', 'info_dict': { - 'id': '2007318', + 'id': '8d384cde33b89f3a43ce5329de42903ed5099887', 'ext': 'mp4', - 'title': 'Theatrical Trailer', - 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', + 'title': 'Monster', + 'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', + 'timestamp': 
1508175329, + 'upload_date': '20171016', }, - 'skip': 'TBS videos are deleted after a while', + 'params': { + # m3u8 download + 'skip_download': True, + } }, { - 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', - 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', - 'info_dict': { - 'id': '1538823', - 'ext': 'mp4', - 'title': 'You Better Run', - 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', - }, - 'skip': 'TBS videos are deleted after a while', + 'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', + 'only_matching': True, + }, { + 'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', + 'only_matching': True, }] def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() site = domain[:3] webpage = self._download_webpage(url, display_id) - video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) - query = None - clip_id = video_params.get('clipid') - if clip_id: - query = 'id=' + clip_id - else: - query = 'titleId=' + video_params['titleid'] - return self._extract_cvp_info( - 'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { - 'default': { - 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, - }, - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site, - 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, - }, - }, { - 'url': url, - 'site_name': site.upper(), - 'auth_required': video_params.get('isAuthRequired') != 'false', - }) + video_data = self._parse_json(self._search_regex( + r']+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})', + webpage, 'drupal setting'), display_id)['turner_playlist'][0] + + media_id = video_data['mediaID'] + title = 
video_data['title'] + + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://www.%s.com/service/token_spe' % site, + m3u8_url, media_id, { + 'url': url, + 'site_name': site.upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + duration = float_or_none(chapter.get('duration')) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + }) + self._sort_formats(formats) + + thumbnails = [] + for image_id, image in video_data.get('images', {}).items(): + image_url = image.get('url') + if not image_url or image.get('type') != 'video': + continue + i = { + 'id': image_id, + 'url': image_url, + } + mobj = re.search(r'(\d+)x(\d+)', image_url) + if mobj: + i.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + thumbnails.append(i) + + return { + 'id': media_id, + 'title': title, + 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), + 'duration': duration, + 'timestamp': int_or_none(video_data.get('created')), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'cahpters': chapters, + 'thumbnails': 
thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index efeb677ee9..e73b64aebd 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -18,9 +18,32 @@ class TurnerBaseIE(AdobePassIE): + _AKAMAI_SPE_TOKEN_CACHE = {} + def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) + if not token: + query = { + 'path': secure_path, + 'videoId': content_id, + } + if ap_data.get('auth_required'): + query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) + auth = self._download_xml( + tokenizer_src, content_id, query=query) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) + token = xpath_text(auth, 'token') + if not token: + return video_url + self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token + return video_url + '?hdnea=' + token + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): video_data = self._download_xml(data_src, video_id) video_id = video_data.attrib['id'] @@ -33,7 +56,6 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): # rtmp_src = splited_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') - tokens = {} urls = [] formats = [] rex = re.compile( @@ -67,26 +89,10 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): secure_path_data = path_data.get('secure') if not secure_path_data: continue - video_url = secure_path_data['media_src'] + video_url - secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' - token = tokens.get(secure_path) - if not token: - 
query = { - 'path': secure_path, - 'videoId': content_id, - } - if ap_data.get('auth_required'): - query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], video_id, ap_data['site_name'], ap_data['site_name']) - auth = self._download_xml( - secure_path_data['tokenizer_src'], video_id, query=query) - error_msg = xpath_text(auth, 'error/msg') - if error_msg: - raise ExtractorError(error_msg, expected=True) - token = xpath_text(auth, 'token') - if not token: - continue - tokens[secure_path] = token - video_url = video_url + '?hdnea=' + token + video_url = self._add_akamai_spe_token( + secure_path_data['tokenizer_src'], + secure_path_data['media_src'] + video_url, + content_id, ap_data) elif not re.match('https?://', video_url): base_path_data = path_data.get(ext, path_data.get('default', {})) media_src = base_path_data.get('media_src') From 5868079e9921bfb03c64439d2188b765fc30dcfc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 14:37:03 +0100 Subject: [PATCH 23/78] [nick.com] improve extraction(fixes #14876) --- youtube_dl/extractor/nick.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 310eea2cf0..cae8faf3e0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -10,7 +10,7 @@ class NickIE(MTVServicesInfoExtractor): # None of videos on the website are still alive? 
IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?P(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _GEO_COUNTRIES = ['US'] _TESTS = [{ @@ -69,8 +69,14 @@ def _get_feed_query(self, uri): 'mgid': uri, } - def _extract_mgid(self, webpage): - return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://%s/data/video.endLevel.json' % domain, + display_id, query={ + 'urlKey': display_id, + }) + return self._get_videos_info(video_data['player'] + video_data['id']) class NickDeIE(MTVServicesInfoExtractor): From 08d77a95c98a1d1dcaa5df99b709f119e7a64f37 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 17:18:05 +0100 Subject: [PATCH 24/78] [nickelodeon:br] add support for Nickelodeon(Brazil) websites(closes #14893) --- youtube_dl/extractor/nick.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index cae8faf3e0..f38c2c725e 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -79,6 +79,51 @@ def _real_extract(self, url): return self._get_videos_info(video_data['player'] + video_data['id']) +class NickBeIE(MTVServicesInfoExtractor): + IE_NAME = 'nickelodeon:br' + _VALID_URL = r'https?://(?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br/(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?#.]+)' + _TESTS = [{ + 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', + 'only_matching': True, + }, { + 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', + 'only_matching': True, + }] + + def _real_extract(self, url): 
+ domain, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + uri = self._search_regex( + r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid') + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html', + video_id, query={ + 'uri': uri, + 'configtype': 'edge', + }, headers={ + 'Referer': url, + }) + info_url = self._remove_template_parameter(config['feedWithQueryParams']) + if info_url == 'None': + if domain.startswith('www.'): + domain = domain[4:] + content_domain = { + 'mundonick.uol': 'mundonick.com.br', + 'nickjr': 'br.nickelodeonjunior.tv', + }[domain] + query = { + 'mgid': uri, + 'imageEp': content_domain, + 'arcEp': content_domain, + } + if domain == 'nickjr.com.br': + query['ep'] = 'c4b16088' + info_url = update_url_query( + 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query) + return self._get_videos_info_from_url(info_url, video_id) + + class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' _VALID_URL = r'https?://(?:www\.)?(?Pnick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P[^/?#&]+)' From e4f201bc1b4ffbf6423278e8d744727aa7d6f415 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 17:19:16 +0100 Subject: [PATCH 25/78] [extractors] add import for NickBrIE --- youtube_dl/extractor/extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 612fd35d4d..25c4ce0de3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -689,6 +689,7 @@ ) from .nick import ( NickIE, + NickBeIE, NickDeIE, NickNightIE, NickRuIE, From 127e98d31dd96b135f0f511c4ca171a300e089cc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 17:21:31 +0100 Subject: [PATCH 26/78] [nickelodeon:br] correct extractor name --- youtube_dl/extractor/extractors.py | 2 +- 
youtube_dl/extractor/nick.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 25c4ce0de3..99887ee89c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -689,7 +689,7 @@ ) from .nick import ( NickIE, - NickBeIE, + NickBrIE, NickDeIE, NickNightIE, NickRuIE, diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index f38c2c725e..7edd68472b 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -79,7 +79,7 @@ def _real_extract(self, url): return self._get_videos_info(video_data['player'] + video_data['id']) -class NickBeIE(MTVServicesInfoExtractor): +class NickBrIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeon:br' _VALID_URL = r'https?://(?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br/(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?#.]+)' _TESTS = [{ From 23b6e23002d8bd3af3b2ccb2e1b833c1dd1b3cbf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Dec 2017 21:36:12 +0100 Subject: [PATCH 27/78] [tvnow] fix extraction(closes #7831) --- youtube_dl/extractor/extractors.py | 8 +- youtube_dl/extractor/nowtv.py | 261 ----------------------------- youtube_dl/extractor/tvnow.py | 175 +++++++++++++++++++ 3 files changed, 179 insertions(+), 265 deletions(-) delete mode 100644 youtube_dl/extractor/nowtv.py create mode 100644 youtube_dl/extractor/tvnow.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 99887ee89c..fb9f365f2b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -722,10 +722,6 @@ NownessPlaylistIE, NownessSeriesIE, ) -from .nowtv import ( - NowTVIE, - NowTVListIE, -) from .noz import NozIE from .npo import ( AndereTijdenIE, @@ -1105,6 +1101,10 @@ from .tvland import TVLandIE from .tvn24 import TVN24IE from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowListIE, +) from .tvp import ( 
TVPEmbedIE, TVPIE, diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py deleted file mode 100644 index e43b37136e..0000000000 --- a/youtube_dl/extractor/nowtv.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - determine_ext, - int_or_none, - parse_iso8601, - parse_duration, - remove_start, -) - - -class NowTVBaseIE(InfoExtractor): - _VIDEO_FIELDS = ( - 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', - 'broadcastStartDate', 'seoUrl', 'duration', 'files', - 'format.defaultImage169Format', 'format.defaultImage169Logo') - - def _extract_video(self, info, display_id=None): - video_id = compat_str(info['id']) - - files = info['files'] - if not files: - if info.get('geoblocked', False): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - - formats = [] - for item in files['items']: - if determine_ext(item['path']) != 'f4v': - continue - app, play_path = remove_start(item['path'], '/').split('/', 1) - formats.append({ - 'url': 'rtmpe://fms.rtl.de', - 'app': app, - 'play_path': 'mp4:%s' % play_path, - 'ext': 'flv', - 'page_url': 'http://rtlnow.rtl.de', - 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', - 'tbr': int_or_none(item.get('bitrate')), - }) - self._sort_formats(formats) - - title = info['title'] - description = info.get('articleLong') or info.get('articleShort') - timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') - duration = parse_duration(info.get('duration')) - - f = info.get('format', {}) - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') - - return { - 'id': video_id, - 'display_id': display_id or 
info.get('seoUrl'), - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } - - -class NowTVIE(NowTVBaseIE): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P[^/]+)/(?:player|preview)' - - _TESTS = [{ - # rtl - 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', - 'info_dict': { - 'id': '203519', - 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'ext': 'flv', - 'title': 'Inka Bause stellt die neuen Bauern vor', - 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432580700, - 'upload_date': '20150525', - 'duration': 2786, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # rtl2 - 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', - 'info_dict': { - 'id': '203481', - 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', - 'ext': 'flv', - 'title': 'Berlin - Tag & Nacht (Folge 934)', - 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432666800, - 'upload_date': '20150526', - 'duration': 2641, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # rtlnitro - 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', - 'info_dict': { - 'id': '165780', - 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', - 'ext': 'flv', - 'title': 'Hals- und Beinbruch', - 'description': 'md5:b50d248efffe244e6f56737f0911ca57', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432415400, - 'upload_date': '20150523', - 'duration': 2742, - }, - 'params': { - # rtmp download - 'skip_download': 
True, - }, - }, { - # superrtl - 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', - 'info_dict': { - 'id': '99205', - 'display_id': 'medicopter-117/angst', - 'ext': 'flv', - 'title': 'Angst!', - 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1222632900, - 'upload_date': '20080928', - 'duration': 3025, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # ntv - 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', - 'info_dict': { - 'id': '203521', - 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', - 'ext': 'flv', - 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', - 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432751700, - 'upload_date': '20150527', - 'duration': 1083, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # vox - 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', - 'info_dict': { - 'id': '128953', - 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', - 'ext': 'flv', - 'title': "Büro-Fall / Chihuahua 'Joel'", - 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432408200, - 'upload_date': '20150523', - 'duration': 3092, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', - 'only_matching': True, - }, { - 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'only_matching': True, - }, { - 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', - 'only_matching': True, - }, { - 'url': 
'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id')) - - info = self._download_json( - 'https://api.nowtv.de/v3/movies/%s?fields=%s' - % (display_id, ','.join(self._VIDEO_FIELDS)), display_id) - - return self._extract_video(info, display_id) - - -class NowTVListIE(NowTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P[^/]+)/list/(?P[^?/#&]+)$' - - _SHOW_FIELDS = ('title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - - _TESTS = [{ - 'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell', - 'info_dict': { - 'id': '17006', - 'title': 'stern TV - Aktuell', - }, - 'playlist_count': 1, - }, { - 'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8', - 'info_dict': { - 'id': '20716', - 'title': 'Das Supertalent - FREE Staffel 8', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_id = mobj.group('show_id') - season_id = mobj.group('id') - - fields = [] - fields.extend(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - - list_info = self._download_json( - 'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php' - % (','.join(fields), show_id), - season_id) - - season = next( - season for season in list_info['formatTabs']['items'] - if season.get('seoheadline') == season_id) - - title = '%s - %s' % (list_info['title'], season['headline']) - - entries = [] - for container in season['formatTabPages']['items']: - for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: - entries.append(self._extract_video(info)) - - return 
self.playlist_result( - entries, compat_str(season.get('id') or season_id), title) diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py new file mode 100644 index 0000000000..e2169f2bce --- /dev/null +++ b/youtube_dl/extractor/tvnow.py @@ -0,0 +1,175 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + parse_iso8601, + parse_duration, + update_url_query, +) + + +class TVNowBaseIE(InfoExtractor): + _VIDEO_FIELDS = ( + 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', + 'broadcastStartDate', 'isDrm', 'duration', 'manifest.dashclear', + 'format.defaultImage169Format', 'format.defaultImage169Logo') + + def _call_api(self, path, video_id, query): + return self._download_json( + 'https://api.tvnow.de/v3/' + path, + video_id, query=query) + + def _extract_video(self, info, display_id): + video_id = compat_str(info['id']) + title = info['title'] + + mpd_url = info['manifest']['dashclear'] + if not mpd_url: + if info.get('isDrm'): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if info.get('geoblocked'): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + + mpd_url = update_url_query(mpd_url, {'filter': ''}) + formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + description = 
info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } + + +class TVNowIE(TVNowBaseIE): + _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P[^/]+)/(?:player|preview)' + + _TESTS = [{ + # rtl + 'url': 'https://www.tvnow.de/rtl/alarm-fuer-cobra-11/freier-fall/player?return=/rtl', + 'info_dict': { + 'id': '385314', + 'display_id': 'alarm-fuer-cobra-11/freier-fall', + 'ext': 'mp4', + 'title': 'Freier Fall', + 'description': 'md5:8c2d8f727261adf7e0dc18366124ca02', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1512677700, + 'upload_date': '20171207', + 'duration': 2862.0, + }, + }, { + # rtl2 + 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', + 'only_matching': 'True', + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', + 'only_matching': 'True', + }, { + # superrtl + 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', + 'only_matching': 'True', + }, { + # ntv + 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', + 'only_matching': 'True', + }, { + # vox + 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', + 'only_matching': 'True', + }, { + # rtlplus + 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', + 'only_matching': 'True', + }] + + def _real_extract(self, url): + display_id = '%s/%s' % 
re.match(self._VALID_URL, url).groups() + + info = self._call_api( + 'movies/' + display_id, display_id, query={ + 'fields': ','.join(self._VIDEO_FIELDS), + }) + + return self._extract_video(info, display_id) + + +class TVNowListIE(TVNowBaseIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P[^/]+)/)list/(?P[^?/#&]+)$' + + _SHOW_FIELDS = ('title', ) + _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', ) + + _TESTS = [{ + 'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell', + 'info_dict': { + 'id': '28296', + 'title': '30 Minuten Deutschland - Aktuell', + }, + 'playlist_mincount': 1, + }] + + def _real_extract(self, url): + base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() + + fields = [] + fields.extend(self._SHOW_FIELDS) + fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in self._VIDEO_FIELDS) + + list_info = self._call_api( + 'formats/seo', season_id, query={ + 'fields': ','.join(fields), + 'name': show_id + '.php' + }) + + season = next( + season for season in list_info['formatTabs']['items'] + if season.get('seoheadline') == season_id) + + title = '%s - %s' % (list_info['title'], season['headline']) + + entries = [] + for container in season['formatTabPages']['items']: + for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: + seo_url = info.get('seoUrl') + if not seo_url: + continue + entries.append(self.url_result( + base_url + seo_url + '/player', 'TVNow', info.get('id'))) + + return self.playlist_result( + entries, compat_str(season.get('id') or season_id), title) From cb0c2310fbf232e09ae41013be3400034171d6d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Dec 2017 10:33:03 +0100 Subject: [PATCH 28/78] [discovery] fix free videos extraction(#14157)(#14954) --- 
youtube_dl/extractor/discovery.py | 122 +++++++++++++--------------- youtube_dl/extractor/discoverygo.py | 72 ++++++++-------- 2 files changed, 94 insertions(+), 100 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 55853f76f9..f9cec1d23d 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,14 +1,18 @@ from __future__ import unicode_literals -from .common import InfoExtractor +import random +import re +import string + +from .discoverygo import DiscoveryGoBaseIE from ..utils import ( - parse_duration, - parse_iso8601, + ExtractorError, + update_url_query, ) -from ..compat import compat_str +from ..compat import compat_HTTPError -class DiscoveryIE(InfoExtractor): +class DiscoveryIE(DiscoveryGoBaseIE): _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| @@ -19,79 +23,65 @@ class DiscoveryIE(InfoExtractor): sciencechannel| tlc| velocity - )\.com/(?:[^/]+/)*(?P[^./?#]+)''' + )\.com(?P/tv-shows/[^/]+/(?:video|full-episode)s/(?P[^./?#]+))''' _TESTS = [{ - 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', + 'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley', 'info_dict': { - 'id': '20769', + 'id': '5a2d9b4d6b66d17a5026e1fd', 'ext': 'mp4', - 'title': 'Mission Impossible Outtakes', - 'description': ('Watch Jamie Hyneman and Adam Savage practice being' - ' each other -- to the point of confusing Jamie\'s dog -- and ' - 'don\'t miss Adam moon-walking as Jamie ... 
behind Jamie\'s' - ' back.'), - 'duration': 156, - 'timestamp': 1302032462, - 'upload_date': '20110405', - 'uploader_id': '103207', + 'title': 'Dave Foley', + 'description': 'md5:4b39bcafccf9167ca42810eb5f28b01f', + 'duration': 608, }, 'params': { 'skip_download': True, # requires ffmpeg } }, { - 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', - 'info_dict': { - 'id': 'mythbusters-the-simpsons', - 'title': 'MythBusters: The Simpsons', - }, - 'playlist_mincount': 10, - }, { - 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/', - 'info_dict': { - 'id': '78326', - 'ext': 'mp4', - 'title': 'Longfin Eels: Maneaters?', - 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.', - 'upload_date': '20140725', - 'timestamp': 1406246400, - 'duration': 116, - 'uploader_id': '103207', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - } + 'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', + 'only_matching': True, }] + _GEO_COUNTRIES = ['US'] + _GEO_BYPASS = False def _real_extract(self, url): - display_id = self._match_id(url) - info = self._download_json(url + '?flat=1', display_id) + path, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) - video_title = info.get('playlist_title') or info.get('video_title') + react_data = self._parse_json(self._search_regex( + r'window\.__reactTransmitPacket\s*=\s*({.+?});', + webpage, 'react data'), display_id) + content_blocks = react_data['layout'][path]['contentBlocks'] + video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] + video_id = video['id'] - entries = [] + access_token = self._download_json( + 'https://www.discovery.com/anonymous', display_id, query={ + 'authLink': update_url_query( + 
'https://login.discovery.com/v1/oauth2/authorize', { + 'client_id': react_data['application']['apiClientId'], + 'redirect_uri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html', + 'response_type': 'anonymous', + 'state': 'nonce,' + ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + }) + })['access_token'] - for idx, video_info in enumerate(info['playlist']): - subtitles = {} - caption_url = video_info.get('captionsUrl') - if caption_url: - subtitles = { - 'en': [{ - 'url': caption_url, - }] - } + try: + stream = self._download_json( + 'https://api.discovery.com/v1/streaming/video/' + video_id, + display_id, headers={ + 'Authorization': 'Bearer ' + access_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + e_description = self._parse_json( + e.cause.read().decode(), display_id)['description'] + if 'resource not available for country' in e_description: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + if 'Authorized Networks' in e_description: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. 
You may want to use --cookies.', expected=True) + raise ExtractorError(e_description) + raise - entries.append({ - '_type': 'url_transparent', - 'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'], - 'id': compat_str(video_info['id']), - 'title': video_info['title'], - 'description': video_info.get('description'), - 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href') or video_info.get('url'), - 'thumbnail': video_info.get('thumbnailURL'), - 'alt_title': video_info.get('secondary_title'), - 'timestamp': parse_iso8601(video_info.get('publishedDate')), - 'subtitles': subtitles, - }) - - return self.playlist_result(entries, display_id, video_title) + return self._extract_video_info(video, stream, display_id) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 7cd5d42916..99376454b2 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -27,42 +27,9 @@ class DiscoveryGoBaseIE(InfoExtractor): velocitychannel )go\.com/%s(?P[^/?#&]+)''' - -class DiscoveryGoIE(DiscoveryGoBaseIE): - _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' - _GEO_COUNTRIES = ['US'] - _TEST = { - 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', - 'info_dict': { - 'id': '58c167d86b66d12f2addeb01', - 'ext': 'mp4', - 'title': 'Reaper Madness', - 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', - 'duration': 2519, - 'series': 'Bering Sea Gold', - 'season_number': 8, - 'episode_number': 6, - 'age_limit': 14, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - container = extract_attributes( - self._search_regex( - r'(]+class=["\']video-player-container[^>]+>)', - webpage, 'video container')) - - video = self._parse_json( - container.get('data-video') or container.get('data-json'), - 
display_id) - + def _extract_video_info(self, video, stream, display_id): title = video['name'] - stream = video.get('stream') if not stream: if video.get('authenticated') is True: raise ExtractorError( @@ -124,6 +91,43 @@ def _real_extract(self, url): } +class DiscoveryGoIE(DiscoveryGoBaseIE): + _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' + _GEO_COUNTRIES = ['US'] + _TEST = { + 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', + 'info_dict': { + 'id': '58c167d86b66d12f2addeb01', + 'ext': 'mp4', + 'title': 'Reaper Madness', + 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', + 'duration': 2519, + 'series': 'Bering Sea Gold', + 'season_number': 8, + 'episode_number': 6, + 'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + container.get('data-video') or container.get('data-json'), + display_id) + + stream = video.get('stream') + + return self._extract_video_info(video, stream, display_id) + + class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE): _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % '' _TEST = { From e6b8803d599df4de3c115e4de4ef192a1e10c749 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Dec 2017 11:11:44 +0100 Subject: [PATCH 29/78] [toutv] add support special video urls(closes #14179) --- youtube_dl/extractor/toutv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 17c0adc15a..2e7876cc5b 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -16,7 +16,7 @@ class TouTvIE(InfoExtractor): _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' - _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+E[0-9]+)?)' + _VALID_URL 
= r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' _access_token = None _claims = None @@ -37,6 +37,9 @@ class TouTvIE(InfoExtractor): }, { 'url': 'http://ici.tou.tv/hackers', 'only_matching': True, + }, { + 'url': 'https://ici.tou.tv/l-age-adulte/S01C501', + 'only_matching': True, }] def _real_initialize(self): From 6b2d8c91823c8f4c5dcda42cf174ab5468c44661 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Dec 2017 18:04:05 +0100 Subject: [PATCH 30/78] [bbc.co.uk] fix extraction for 320k m3u8 streams broken since 197224b7a4e37a6581bf1a0da18d0f67ea61a476 --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 5525f7c9b9..8b20c03d6e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -386,7 +386,7 @@ def _process_media_selector(self, media_selection, programme_id): m3u8_id=format_id, fatal=False)) if re.search(self._USP_RE, href): usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1\.ism/\1\.m3u8', href), + re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) for f in usp_formats: From 15960255fe3caaae58132116d772d1c1bcf415fa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Dec 2017 18:16:45 +0100 Subject: [PATCH 31/78] [tbs] fix typo --- youtube_dl/extractor/tbs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index 460bc5d742..eab22c38f1 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -36,8 +36,7 @@ class TBSIE(TurnerBaseIE): }] def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - site = domain[:3] + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) video_data = 
self._parse_json(self._search_regex( r']+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})', @@ -62,7 +61,7 @@ def _real_extract(self, url): 'http://www.%s.com/service/token_spe' % site, m3u8_url, media_id, { 'url': url, - 'site_name': site.upper(), + 'site_name': site[:3].upper(), 'auth_required': video_data.get('authRequired') == '1', }) formats.extend(self._extract_m3u8_formats( From c8be7d5f7417f5779cda2c02653f38401dd7a6b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 13 Dec 2017 23:14:30 +0700 Subject: [PATCH 32/78] [byutv] Fix extraction (closes #14966, closes #14967) --- youtube_dl/extractor/byutv.py | 71 ++++++++---------------------- youtube_dl/extractor/extractors.py | 5 +-- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 8ef089653d..dd7d2f858a 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -3,20 +3,19 @@ import re from .common import InfoExtractor -from ..utils import ExtractorError class BYUtvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/(?!event/)(?P[0-9a-f-]+)(?:/(?P[^/?#&]+))?' + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P[0-9a-f-]+)(?:/(?P[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { - 'id': '6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', 'display_id': 'studio-c-season-5-episode-5', 'ext': 'mp4', 'title': 'Season 5 Episode 5', - 'description': 'md5:e07269172baff037f8e8bf9956bc9747', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', 'duration': 1486.486, }, 'params': { @@ -26,6 +25,9 @@ class BYUtvIE(InfoExtractor): }, { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,16 +35,16 @@ def _real_extract(self, url): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - episode_code = self._search_regex( - r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') - - ep = self._parse_json( - episode_code, display_id, transform_source=lambda s: - re.sub(r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', s)) - - if ep['providerType'] != 'Ooyala': - raise ExtractorError('Unsupported provider %s' % ep['provider']) + ep = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, + query={ + 'contentid': video_id, + 'channel': 'byutv_global', + 'x-byutv-context': 'web$Global', + }, headers={ + 'x-byutv-context': 'web$Global', + 'x-byutv-platformkey': 'xsaaw9c7y5', + })['ooyalaVOD'] return { '_type': 'url_transparent', @@ -50,44 +52,7 @@ def _real_extract(self, url): 'url': 'ooyala:%s' % ep['providerId'], 'id': video_id, 'display_id': display_id, - 'title': ep['title'], + 'title': ep.get('title'), 
'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), } - - -class BYUtvEventIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/event/(?P[0-9a-f-]+)' - _TEST = { - 'url': 'http://www.byutv.org/watch/event/29941b9b-8bf6-48d2-aebf-7a87add9e34b', - 'info_dict': { - 'id': '29941b9b-8bf6-48d2-aebf-7a87add9e34b', - 'ext': 'mp4', - 'title': 'Toledo vs. BYU (9/30/16)', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - ooyala_id = self._search_regex( - r'providerId\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'ooyala id', group='id') - - title = self._search_regex( - r'class=["\']description["\'][^>]*>\s*

([^<]+)

', webpage, - 'title').strip() - - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ooyala_id, - 'id': video_id, - 'title': title, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fb9f365f2b..4072455137 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -138,10 +138,7 @@ BrightcoveNewIE, ) from .buzzfeed import BuzzFeedIE -from .byutv import ( - BYUtvIE, - BYUtvEventIE, -) +from .byutv import BYUtvIE from .c56 import C56IE from .camdemy import ( CamdemyIE, From bec49996c68fe73b4e17e7418290072a4c4b7cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 13 Dec 2017 23:49:05 +0700 Subject: [PATCH 33/78] [downloader/http] Return actual download result (closes #14971) --- youtube_dl/downloader/http.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 8a6638cc2d..3ff26ff708 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -284,8 +284,7 @@ def retry(e): while count <= retries: try: establish_connection() - download() - return True + return download() except RetryDownload as e: count += 1 if count <= retries: From 6bf9c28b0ac5d44381008cebe4741f8f1d78c53d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Dec 2017 17:51:24 +0100 Subject: [PATCH 34/78] [byutv] add support for geo restricted videos --- youtube_dl/extractor/byutv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index dd7d2f858a..4bf4efe1f3 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -39,10 +39,10 @@ def _real_extract(self, url): 'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, query={ 'contentid': video_id, - 'channel': 'byutv_global', - 'x-byutv-context': 'web$Global', + 'channel': 'byutv', + 
'x-byutv-context': 'web$US', }, headers={ - 'x-byutv-context': 'web$Global', + 'x-byutv-context': 'web$US', 'x-byutv-platformkey': 'xsaaw9c7y5', })['ooyalaVOD'] From 7974e289a10f87ae52fdc32b5f5cc7e14a0fafda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Dec 2017 01:00:14 +0700 Subject: [PATCH 35/78] [postprocessor/xattr] Clarify NO_SPACE message (#14970) --- youtube_dl/postprocessor/xattrpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index fbdfa02acc..b0aed9ca7b 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -42,6 +42,7 @@ def run(self, info): 'user.dublincore.format': 'format', } + num_written = 0 for xattrname, infoname in xattr_mapping.items(): value = info.get(infoname) @@ -52,6 +53,7 @@ def run(self, info): byte_value = value.encode('utf-8') write_xattr(filename, xattrname, byte_value) + num_written += 1 return [], info @@ -62,8 +64,8 @@ def run(self, info): except XAttrMetadataError as e: if e.reason == 'NO_SPACE': self._downloader.report_warning( - 'There\'s no disk space left or disk quota exceeded. ' + - 'Extended attributes are not written.') + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. 
' + + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) elif e.reason == 'VALUE_TOO_LONG': self._downloader.report_warning( 'Unable to write extended attributes due to too long values.') From 3fae11ac006567bfbfe9368bc11f56da090e267a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Dec 2017 04:49:07 +0700 Subject: [PATCH 36/78] [itv] Improve extraction, extract more subtitles and duration (closes #14944) --- youtube_dl/extractor/itv.py | 125 ++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 413a219dc2..18a7d7f8cd 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -26,7 +26,7 @@ class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] - _TEST = { + _TESTS = [{ 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'info_dict': { 'id': '2a2936a0053', @@ -37,7 +37,11 @@ class ITVIE(InfoExtractor): # rtmp download 'skip_download': True, }, - } + }, { + # unavailable via data-playlist-url + 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -101,6 +105,18 @@ def _add_sub_element(element, name): 'Content-Type': 'text/xml; charset=utf-8', 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', }) + + info = self._search_json_ld(webpage, video_id, default={}) + formats = [] + subtitles = {} + + def extract_subtitle(sub_url): + ext = determine_ext(sub_url, 'ttml') + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': 'ttml' if ext == 'xml' else ext, + }) + resp_env = self._download_xml( params['data-playlist-url'], video_id, headers=headers, data=etree.tostring(req_env)) @@ -111,37 +127,55 @@ def _add_sub_element(element, name): if fault_code == 
'InvalidGeoRegion': self.raise_geo_restricted( msg=fault_string, countries=self._GEO_COUNTRIES) - raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) - title = xpath_text(playlist, 'EpisodeTitle', fatal=True) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + elif fault_code != 'InvalidEntity': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - formats = [] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not 
play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) + + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -198,27 +232,22 @@ def _add_sub_element(element, name): formats.append({ 'url': href, }) + subs = video_data.get('Subtitles') + if isinstance(subs, list): + for sub in subs: + if not isinstance(sub, dict): + continue + href = sub.get('Href') + if isinstance(href, compat_str): + extract_subtitle(href) + if not info.get('duration'): + info['duration'] = parse_duration(video_data.get('Duration')) + self._sort_formats(formats) - subtitles = {} - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if not caption_url.text: - continue - ext = determine_ext(caption_url.text, 'ttml') - subtitles.setdefault('en', []).append({ - 'url': caption_url.text, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - info = self._search_json_ld(webpage, video_id, default={}) info.update({ 'id': video_id, - 'title': title, 'formats': formats, 'subtitles': subtitles, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duartion': parse_duration(xpath_text(playlist, 'Duration')), }) return info From c6a5a811a197e1c6520764df1f5f93473692dd18 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Dec 2017 05:15:51 +0700 Subject: [PATCH 37/78] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index a1fdcab999..254264c4ab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version + +Core +* [postprocessor/xattr] Clarify NO_SPACE message (#14970) +* [downloader/http] Return actual download result from real_download (#14971) + +Extractors ++ [itv] Extract more subtitles and duration +* [itv] Improve extraction (#14944) ++ [byutv] Add support for geo restricted videos +* [byutv] Fix extraction (#14966, #14967) ++ [bbccouk] Fix extraction for 320k HLS streams ++ [toutv] Add support for special video URLs (#14179) +* [discovery] Fix free videos extraction (#14157, #14954) +* [tvnow] Fix extraction (#7831) ++ [nickelodeon:br] Add support for nickelodeon brazil websites (#14893) +* [nick] Improve extraction (#14876) +* [tbs] Fix extraction (#13658) + + version 2017.12.10 Core From 8ff2b16435cdf53c3e684532afe699dfdf34403c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Dec 2017 05:19:21 +0700 Subject: [PATCH 38/78] release 2017.12.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 8 ++++---- youtube_dl/version.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 736869bf0f..f37d8aa42e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ ## Please follow the guide below --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ ### If the purpose of this *issue* is a *bug report*, *site support request* or [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.10 +[debug] youtube-dl version 2017.12.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 254264c4ab..03d2defb73 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.12.14 Core * [postprocessor/xattr] Clarify NO_SPACE message (#14970) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d5e3a8a67d..ebddd5b9d5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,7 +122,6 @@ # Supported sites - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** - - **BYUtvEvent** - **Camdemy** - **CamdemyFolder** - **CamWithHer** @@ -538,6 +537,7 @@ # Supported sites - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** + - **nickelodeon:br** - **nickelodeonru** - 
**nicknight** - **niconico**: ニコニコ動画 @@ -556,8 +556,6 @@ # Supported sites - **nowness** - **nowness:playlist** - **nowness:series** - - **NowTV** (Currently broken) - - **NowTVList** - **nowvideo**: NowVideo - **Noz** - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -794,7 +792,7 @@ # Supported sites - **tagesschau:player** - **Tass** - **TastyTrade** - - **TBS** (Currently broken) + - **TBS** - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos @@ -865,6 +863,8 @@ # Supported sites - **tvland.com** - **TVN24** - **TVNoe** + - **TVNow** + - **TVNowList** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d0c4383174..2b5a634641 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.10' +__version__ = '2017.12.14' From 1c4804ef9b03e92ef6b1d6720f9a93bb54d98324 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 14 Dec 2017 23:05:07 +0100 Subject: [PATCH 39/78] [voot] fix format extraction(closes #14758) --- youtube_dl/extractor/voot.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index 5de3deb8c0..5fd1c3d805 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .kaltura import KalturaIE from ..utils import ( ExtractorError, int_or_none, @@ -17,11 +16,10 @@ class VootIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '0_8ledb18o', + 'id': '441353', 'ext': 'mp4', 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 
'uploader_id': 'batchUser', 'timestamp': 1472162937, 'upload_date': '20160825', 'duration': 1146, @@ -61,8 +59,10 @@ def _real_extract(self, url): media = media_info['assets'] - entry_id = media['EntryId'] title = media['MediaName'] + formats = self._extract_m3u8_formats( + 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + media['EntryId'], + video_id, 'mp4', m3u8_id='hls', fatal=False) description, series, season_number, episode, episode_number = [None] * 5 @@ -82,9 +82,7 @@ def _real_extract(self, url): episode_number = int_or_none(value) return { - '_type': 'url_transparent', - 'url': 'kaltura:1982551:%s' % entry_id, - 'ie_key': KalturaIE.ie_key(), + 'id': video_id, 'title': title, 'description': description, 'series': series, @@ -95,4 +93,5 @@ def _real_extract(self, url): 'duration': int_or_none(media.get('Duration')), 'view_count': int_or_none(media.get('ViewCounter')), 'like_count': int_or_none(media.get('like_counter')), + 'formats': formats, } From 23f511f5c74d85222e69259996666b7ef97b9421 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Dec 2017 09:05:59 +0100 Subject: [PATCH 40/78] [voot] sort formats --- youtube_dl/extractor/voot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index 5fd1c3d805..4267544893 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -62,7 +62,8 @@ def _real_extract(self, url): title = media['MediaName'] formats = self._extract_m3u8_formats( 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + media['EntryId'], - video_id, 'mp4', m3u8_id='hls', fatal=False) + video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) description, series, season_number, episode, episode_number = [None] * 5 From d05ba4b89e195ed18e8d51561649712f945769b9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Dec 2017 09:27:56 +0100 Subject: [PATCH 41/78] 
[disney] skip Apple FairPlay formats(#14982) --- youtube_dl/extractor/disney.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 968c4c7fd5..0eee82fd6b 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -10,6 +10,7 @@ compat_str, determine_ext, ExtractorError, + update_url_query, ) @@ -108,9 +109,16 @@ def _real_extract(self, url): continue tbr = int_or_none(flavor.get('bitrate')) if tbr == 99999: - formats.extend(self._extract_m3u8_formats( + # wrong ks(Kaltura Signature) causes 404 Error + flavor_url = update_url_query(flavor_url, {'ks': ''}) + m3u8_formats = self._extract_m3u8_formats( flavor_url, video_id, 'mp4', - m3u8_id=flavor_format, fatal=False)) + m3u8_id=flavor_format, fatal=False) + for f in m3u8_formats: + # Apple FairPlay + if '/fpshls/' in f['url']: + continue + formats.append(f) continue format_id = [] if flavor_format: From 498a8a4ca56e66adb84f8f1488c0239d048c7adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Dec 2017 22:53:56 +0700 Subject: [PATCH 42/78] [vk] Make view count optional (closes #14979) --- youtube_dl/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 0d8376522d..d4838b3e5f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -414,7 +414,7 @@ def _real_extract(self, url): view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', - info_page, 'view count', fatal=False)) + info_page, 'view count', default=None)) formats = [] for format_id, format_url in data.items(): From c402e7f3a03aceeb80dd032831ceafb6d0aaa935 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 16 Dec 2017 12:55:20 +0100 Subject: [PATCH 43/78] [discoverygo] correct ttml subtitle extension --- youtube_dl/extractor/discoverygo.py | 7 ++++++- 1 file changed, 
6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 99376454b2..3368c4c075 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + determine_ext, extract_attributes, ExtractorError, int_or_none, @@ -73,7 +74,11 @@ def _extract_video_info(self, video, stream, display_id): not subtitle_url.startswith('http')): continue lang = caption.get('fileLang', 'en') - subtitles.setdefault(lang, []).append({'url': subtitle_url}) + ext = determine_ext(subtitle_url) + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + 'ext': 'ttml' if ext == 'xml' else ext, + }) return { 'id': video_id, From b555ae9bf146ce9e2bf81327746847f0bc4d63e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Dec 2017 21:56:16 +0700 Subject: [PATCH 44/78] [utils] Add another date format pattern (#14999) --- test/test_utils.py | 1 + youtube_dl/utils.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index cc13f795c3..0857c0fc0c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -343,6 +343,7 @@ def test_unified_timestamps(self): self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) + self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index eccbc0b1f3..2843a3dc06 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -159,6 +159,8 @@ def register_socks_protocols(): '%Y-%m-%dT%H:%M', '%b %d %Y at %H:%M', '%b %d %Y at %H:%M:%S', + '%B %d %Y at 
%H:%M', + '%B %d %Y at %H:%M:%S', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) From 06dbcd7be41f71346e75f78f3ad77d9eca0219ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Dec 2017 21:57:30 +0700 Subject: [PATCH 45/78] [cbslocal] Fix timestamp extraction (closes #14999, closes #15000) --- youtube_dl/extractor/cbslocal.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 7d78e3aaee..90852a9ef9 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -91,12 +91,10 @@ def _real_extract(self, url): info_dict = self._extract_anvato_videos(webpage, display_id) - time_str = self._html_search_regex( - r'class="entry-date">([^<]+)<', webpage, 'released date', default=None) - if time_str: - timestamp = unified_timestamp(time_str) - else: - timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage)) + timestamp = unified_timestamp(self._html_search_regex( + r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, + 'released date', default=None)) or parse_iso8601( + self._html_search_meta('uploadDate', webpage)) info_dict.update({ 'display_id': display_id, From 3dfa9ec21362f8781d0a2d2eef1c9474b525af79 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Dec 2017 09:14:52 +0100 Subject: [PATCH 46/78] [crunchyroll] Future-proof XML element checks(closes #15013) --- youtube_dl/extractor/crunchyroll.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index b53f2d7050..b92f254479 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -392,7 +392,7 @@ def _get_subtitles(self, video_id, webpage): 'Downloading subtitles for ' + sub_name, data={ 'subtitle_script_id': sub_id, }) - if not sub_doc: + if sub_doc is None: continue sid = sub_doc.get('id') iv = xpath_text(sub_doc, 
'iv', 'subtitle iv') @@ -479,9 +479,9 @@ def _real_extract(self, url): 'video_quality': stream_quality, 'current_page': url, }) - if streamdata: + if streamdata is not None: stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info: + if stream_info is not None: stream_infos.append(stream_info) stream_info = self._call_rpc_api( 'VideoEncode_GetStreamInfo', video_id, @@ -490,7 +490,7 @@ def _real_extract(self, url): 'video_format': stream_format, 'video_encode_quality': stream_quality, }) - if stream_info: + if stream_info is not None: stream_infos.append(stream_info) for stream_info in stream_infos: video_encode_id = xpath_text(stream_info, './video_encode_id') From 25475dfab356e625ee480e1631601b86403afb5a Mon Sep 17 00:00:00 2001 From: Hongjie Dong Date: Mon, 4 Dec 2017 00:26:09 -0800 Subject: [PATCH 47/78] [mailru] Add support for embed URLs --- youtube_dl/extractor/mailru.py | 36 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f7cc3c8328..f1865fd115 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,8 +13,7 @@ class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P(?:[^/]+/){3}\d+)|(?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html)' - + _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P(?:[^/]+/){3}\d+)|(?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html|video/embed/(?P\d+))' _TESTS = [ { 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', @@ -65,25 +64,34 @@ class MailRuIE(InfoExtractor): { 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/video/embed/7949340477499637815', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - 
video_id = mobj.group('idv1') - - if not video_id: - video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - - webpage = self._download_webpage(url, video_id) - + meta_id = mobj.group('meta_id') + page_config = None video_data = None + video_id = None + if not meta_id: + video_id = mobj.group('idv1') + if not video_id: + video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - page_config = self._parse_json(self._search_regex( - r'(?s)]+class="sp-video__page-config"[^>]*>(.+?)', - webpage, 'page config', default='{}'), video_id, fatal=False) - if page_config: - meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') + webpage = self._download_webpage(url, video_id) + + page_config = self._parse_json(self._search_regex( + r'(?s)]+class="sp-video__page-config"[^>]*>(.+?)', + webpage, 'page config', default='{}'), video_id, fatal=False) + if page_config or meta_id: + if page_config: + meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') + elif meta_id: + meta_url = 'https://my.mail.ru/+/video/meta/' + meta_id if meta_url: video_data = self._download_json( meta_url, video_id, 'Downloading video meta JSON', fatal=False) From 549bb416f5a9d15c03749a98abd582d0e40418ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Dec 2017 18:36:19 +0700 Subject: [PATCH 48/78] [mailru] Fix issues and improve (closes #14904) --- youtube_dl/extractor/mailru.py | 43 ++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f1865fd115..6b7c5e3e03 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,7 +13,15 @@ class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = 
r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P(?:[^/]+/){3}\d+)|(?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html|video/embed/(?P\d+))' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?my\.mail\.ru/ + (?: + video/.*\#video=/?(?P(?:[^/]+/){3}\d+)| + (?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html| + (?:video/embed|\+/video/meta)/(?P\d+) + ) + ''' _TESTS = [ { 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', @@ -22,7 +30,7 @@ class MailRuIE(InfoExtractor): 'id': '46301138_76', 'ext': 'mp4', 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', - 'timestamp': 1393232740, + 'timestamp': 1393235077, 'upload_date': '20140224', 'uploader': 'sonypicturesrus', 'uploader_id': 'sonypicturesrus@mail.ru', @@ -39,7 +47,7 @@ class MailRuIE(InfoExtractor): 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', 'timestamp': 1397039888, 'upload_date': '20140409', - 'uploader': 'hitech@corp.mail.ru', + 'uploader': 'hitech', 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, @@ -68,33 +76,38 @@ class MailRuIE(InfoExtractor): { 'url': 'https://my.mail.ru/video/embed/7949340477499637815', 'only_matching': True, + }, + { + 'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - meta_id = mobj.group('meta_id') - page_config = None - video_data = None + meta_id = mobj.group('metaid') + video_id = None - if not meta_id: + if meta_id: + meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id + else: video_id = mobj.group('idv1') if not video_id: video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - webpage = self._download_webpage(url, video_id) - page_config = self._parse_json(self._search_regex( r'(?s)]+class="sp-video__page-config"[^>]*>(.+?)', webpage, 'page config', default='{}'), video_id, fatal=False) - if page_config or meta_id: if page_config: meta_url = page_config.get('metaUrl') or 
page_config.get('video', {}).get('metaUrl') - elif meta_id: - meta_url = 'https://my.mail.ru/+/video/meta/' + meta_id - if meta_url: - video_data = self._download_json( - meta_url, video_id, 'Downloading video meta JSON', fatal=False) + else: + meta_url = None + + video_data = None + if meta_url: + video_data = self._download_json( + meta_url, video_id or meta_id, 'Downloading video meta JSON', + fatal=not video_id) # Fallback old approach if not video_data: From 7e810109877bb59d213574b115e46d9621db9a18 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Dec 2017 19:13:25 +0100 Subject: [PATCH 49/78] [cspan] add support for audio only pages and catch page errors(closes #14995) --- youtube_dl/extractor/cspan.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 171820e272..67d6df4b0e 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -4,13 +4,14 @@ from .common import InfoExtractor from ..utils import ( - int_or_none, - unescapeHTML, - find_xpath_attr, - smuggle_url, determine_ext, ExtractorError, extract_attributes, + find_xpath_attr, + get_element_by_class, + int_or_none, + smuggle_url, + unescapeHTML, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,10 @@ class CSpanIE(InfoExtractor): 'uploader': 'HouseCommittee', 'uploader_id': '12987475', }, + }, { + # Audio Only + 'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights', + 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -111,7 +116,15 @@ def _real_extract(self, url): title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) + video_id = self._search_regex( + r'jwsetup\.clipprog\s*=\s*(\d+);', + webpage, 'jwsetup program 
id', default=None) + if video_id: + video_type = 'program' if video_type is None or video_id is None: + error_message = get_element_by_class('VLplayer-error-message', webpage) + if error_message: + raise ExtractorError(error_message) raise ExtractorError('unable to find video id and type') def get_text_attr(d, attr): @@ -138,7 +151,7 @@ def get_text_attr(d, attr): entries = [] for partnum, f in enumerate(files): formats = [] - for quality in f['qualities']: + for quality in f.get('qualities', []): formats.append({ 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), 'url': unescapeHTML(get_text_attr(quality, 'file')), From 99081da90c9b8ce4ee7fe9452787507fed4251a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 Dec 2017 03:31:53 +0700 Subject: [PATCH 50/78] [downloader/fragment] Encode filename of fragment being removed (closes #15020) --- youtube_dl/downloader/fragment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 7bb61a5414..ea5e3a4b5d 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -112,7 +112,7 @@ def _append_fragment(self, ctx, frag_content): if self.__do_ytdl_file(ctx): self._write_ytdl_file(ctx) if not self.params.get('keep_fragments', False): - os.remove(ctx['fragment_filename_sanitized']) + os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): From 4a109f81bc39ed09931fd6f9d21d20f7abdc9742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 00:38:39 +0700 Subject: [PATCH 51/78] [afreecatv] Improve format extraction (closes #15019) --- youtube_dl/extractor/afreecatv.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 
e6513c7a4d..513dd81df5 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -228,10 +228,19 @@ def _real_extract(self, url): r'^(\d{8})_', key, 'upload date', default=None) file_duration = int_or_none(file_element.get('duration')) format_id = key if key else '%s_%s' % (video_id, file_num) - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', - note='Downloading part %d m3u8 information' % file_num) + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', + note='Downloading part %d m3u8 information' % file_num) + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + if not formats: + continue + self._sort_formats(formats) file_info = common_entry.copy() file_info.update({ 'id': format_id, From c10c93238e6db0df8746fc185ca316b9d8ccece5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 03:51:03 +0700 Subject: [PATCH 52/78] [extractor/common] Introduce uploader, uploader_id and uploader_url meta fields for playlists (#11427, #15018) --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 80a9c982f3..e5ef5e4906 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -301,8 +301,9 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title", "description" and "id" attributes - with the same semantics as videos (see above). + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). 
_type "multi_video" indicates that there are multiple videos that From 07aeced68e5aa24f3e2562aa7cb9ddd2f11b59ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 03:51:28 +0700 Subject: [PATCH 53/78] [youtube] Extract uploader, uploader_id and uploader_url for playlists (#11427, #15018) --- youtube_dl/extractor/youtube.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9943dddc13..0919bef0e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2270,6 +2270,19 @@ def _extract_playlist(self, playlist_id): r'(?s)

]*>\s*(.*?)\s*

', page, 'title', default=None) + _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*
  • \s*]+\bhref=' + uploader = self._search_regex( + r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, + page, 'uploader', default=None) + mobj = re.search( + r'%s(["\'])(?P/(?:user|channel)/(?P.+?))\1' % _UPLOADER_BASE, + page) + if mobj: + uploader_id = mobj.group('uploader_id') + uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) + else: + uploader_id = uploader_url = None + has_videos = True if not playlist_title: @@ -2280,8 +2293,15 @@ def _extract_playlist(self, playlist_id): except StopIteration: has_videos = False - return has_videos, self.playlist_result( + playlist = self.playlist_result( self._entries(page, playlist_id), playlist_id, playlist_title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + }) + + return has_videos, playlist def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL From 3961c6cb9d3a1c30fe31db774b0809095952f1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 03:53:44 +0700 Subject: [PATCH 54/78] [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in output template (closes #11427, #15018) --- README.md | 2 ++ youtube_dl/YoutubeDL.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index cd30d147a9..47b0640abf 100644 --- a/README.md +++ b/README.md @@ -539,6 +539,8 @@ # OUTPUT TEMPLATE - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier - `playlist_title` (string): Playlist title + - `playlist_uploader` (string): Full name of the playlist uploader + - `playlist_uploader_id` (string): Nickname or id of the playlist uploader Available for the video that belongs to some logical chapter or section: diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 68721e9ab8..ace80f14b8 100755 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -975,6 +975,8 @@ def report_download(num_entries): 'playlist': playlist, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], From 78466fcab519d1b92fd9846bc8073885308a7e22 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Dec 2017 02:00:13 +0100 Subject: [PATCH 55/78] [shahid] add support for show pages(closes #7401) --- youtube_dl/extractor/aws.py | 78 +++++++++++ youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/scrippsnetworks.py | 103 +++------------ youtube_dl/extractor/shahid.py | 164 +++++++++++++++++------- 4 files changed, 219 insertions(+), 131 deletions(-) create mode 100644 youtube_dl/extractor/aws.py diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py new file mode 100644 index 0000000000..670abce0cc --- /dev/null +++ b/youtube_dl/extractor/aws.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import hashlib +import hmac + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlencode + + +class AWSIE(InfoExtractor): + _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' + _AWS_REGION = 'us-east-1' + + def _aws_execute_api(self, aws_dict, video_id, query=None): + query = query or {} + amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + date = amz_date[:8] + headers = { + 'Accept': 'application/json', + 'Host': self._AWS_PROXY_HOST, + 'X-Amz-Date': amz_date, + } + session_token = aws_dict.get('session_token') + if session_token: + headers['X-Amz-Security-Token'] = session_token + headers['X-Api-Key'] = self._AWS_API_KEY + + def aws_hash(s): + return hashlib.sha256(s.encode('utf-8')).hexdigest() + + # Task 1: 
http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + canonical_querystring = compat_urllib_parse_urlencode(query) + canonical_headers = '' + for header_name, header_value in headers.items(): + canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) + signed_headers = ';'.join([header.lower() for header in headers.keys()]) + canonical_request = '\n'.join([ + 'GET', + aws_dict['uri'], + canonical_querystring, + canonical_headers, + signed_headers, + aws_hash('') + ]) + + # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request'] + credential_scope = '/'.join(credential_scope_list) + string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)]) + + # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + def aws_hmac(key, msg): + return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) + + def aws_hmac_digest(key, msg): + return aws_hmac(key, msg).digest() + + def aws_hmac_hexdigest(key, msg): + return aws_hmac(key, msg).hexdigest() + + k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8') + for value in credential_scope_list: + k_signing = aws_hmac_digest(k_signing, value) + + signature = aws_hmac_hexdigest(k_signing, string_to_sign) + + # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html + headers['Authorization'] = ', '.join([ + '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope), + 'SignedHeaders=%s' % signed_headers, + 'Signature=%s' % signature, + ]) + + return self._download_json( + 'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' 
+ canonical_querystring if canonical_querystring else ''), + video_id, headers=headers) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4072455137..513074801c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -927,7 +927,10 @@ from .servingsys import ServingSysIE from .servus import ServusIE from .sexu import SexuIE -from .shahid import ShahidIE +from .shahid import ( + ShahidIE, + ShahidShowIE, +) from .shared import ( SharedIE, VivoIE, diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index b446a02bac..4023aeef81 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -1,13 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json import hashlib -import hmac import re -from .common import InfoExtractor +from .aws import AWSIE from .anvato import AnvatoIE from ..utils import ( smuggle_url, @@ -16,7 +14,7 @@ ) -class ScrippsNetworksWatchIE(InfoExtractor): +class ScrippsNetworksWatchIE(AWSIE): IE_NAME = 'scrippsnetworks:watch' _VALID_URL = r'''(?x) https?:// @@ -64,44 +62,27 @@ class ScrippsNetworksWatchIE(InfoExtractor): 'travelchannel': 'trav', 'geniuskitchen': 'genius', } - _SNI_HOST = 'web.api.video.snidigital.com' - _AWS_REGION = 'us-east-1' - _AWS_IDENTITY_ID_JSON = json.dumps({ - 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % _AWS_REGION - }) - _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' - _AWS_SERVICE = 'execute-api' - _AWS_REQUEST = 'aws4_request' - _AWS_SIGNED_HEADERS = ';'.join([ - 'host', 'x-amz-date', 'x-amz-security-token', 'x-api-key']) - _AWS_CANONICAL_REQUEST_TEMPLATE = '''GET -%(uri)s + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' -host:%(host)s -x-amz-date:%(date)s -x-amz-security-token:%(token)s -x-api-key:%(key)s - -%(signed_headers)s -%(payload_hash)s''' + 
_AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site', 'id') - def aws_hash(s): - return hashlib.sha256(s.encode('utf-8')).hexdigest() - + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') token = self._download_json( - 'https://cognito-identity.us-east-1.amazonaws.com/', video_id, - data=self._AWS_IDENTITY_ID_JSON.encode('utf-8'), + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, headers={ 'Accept': '*/*', 'Content-Type': 'application/x-amz-json-1.1', 'Referer': url, - 'X-Amz-Content-Sha256': aws_hash(self._AWS_IDENTITY_ID_JSON), + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', 'X-Amz-User-Agent': self._AWS_USER_AGENT, })['Token'] @@ -124,64 +105,12 @@ def get(key): sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, fatal=True) - access_key_id = get('AccessKeyId') - secret_access_key = get('SecretAccessKey') - session_token = get('SessionToken') - - # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html - uri = '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id) - datetime_now = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') - date = datetime_now[:8] - canonical_string = self._AWS_CANONICAL_REQUEST_TEMPLATE % { - 'uri': uri, - 'host': self._SNI_HOST, - 'date': datetime_now, - 'token': session_token, - 'key': self._AWS_API_KEY, - 'signed_headers': self._AWS_SIGNED_HEADERS, - 'payload_hash': aws_hash(''), - } - - # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html - credential_string = '/'.join([date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]) - string_to_sign = '\n'.join([ - 'AWS4-HMAC-SHA256', datetime_now, 
credential_string, - aws_hash(canonical_string)]) - - # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html - def aws_hmac(key, msg): - return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) - - def aws_hmac_digest(key, msg): - return aws_hmac(key, msg).digest() - - def aws_hmac_hexdigest(key, msg): - return aws_hmac(key, msg).hexdigest() - - k_secret = 'AWS4' + secret_access_key - k_date = aws_hmac_digest(k_secret.encode('utf-8'), date) - k_region = aws_hmac_digest(k_date, self._AWS_REGION) - k_service = aws_hmac_digest(k_region, self._AWS_SERVICE) - k_signing = aws_hmac_digest(k_service, self._AWS_REQUEST) - - signature = aws_hmac_hexdigest(k_signing, string_to_sign) - - auth_header = ', '.join([ - 'AWS4-HMAC-SHA256 Credential=%s' % '/'.join( - [access_key_id, date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]), - 'SignedHeaders=%s' % self._AWS_SIGNED_HEADERS, - 'Signature=%s' % signature, - ]) - - mcp_id = self._download_json( - 'https://%s%s' % (self._SNI_HOST, uri), video_id, headers={ - 'Accept': '*/*', - 'Referer': url, - 'Authorization': auth_header, - 'X-Amz-Date': datetime_now, - 'X-Amz-Security-Token': session_token, - 'X-Api-Key': self._AWS_API_KEY, - })['results'][0]['mcpId'] + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] return self.url_result( smuggle_url( diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 374f7faf9d..5c2a6206be 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,22 +1,53 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json +import math +import re -from .common import InfoExtractor +from .aws import AWSIE from ..compat import compat_HTTPError from ..utils import ( + 
clean_html, ExtractorError, + InAdvancePagedList, int_or_none, parse_iso8601, str_or_none, urlencode_postdata, - clean_html, ) -class ShahidIE(InfoExtractor): +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ @@ -41,34 +72,25 @@ class ShahidIE(InfoExtractor): 'only_matching': True }] - def _api2_request(self, *args, **kwargs): - try: - return self._download_json(*args, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) - if fail_data: - faults = fail_data.get('faults', []) - faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) - if faults_message: - raise ExtractorError(faults_message, expected=True) - raise - def _real_initialize(self): email, password = self._get_login_info() if email is None: return - user_data = self._api2_request( - 
'https://shahid.mbc.net/wd/service/users/login', - None, 'Logging in', data=json.dumps({ - 'email': email, - 'password': password, - 'basic': 'false', - }).encode('utf-8'), headers={ - 'Content-Type': 'application/json; charset=UTF-8', - })['user'] + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise self._download_webpage( 'https://shahid.mbc.net/populateContext', @@ -81,25 +103,13 @@ def _real_initialize(self): 'sessionId': user_data['sessionId'], })) - def _get_api_data(self, response): - data = response.get('data', {}) - - error = data.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), - expected=True) - - return data - def _real_extract(self, url): page_type, video_id = re.match(self._VALID_URL, url).groups() if page_type == 'clip': page_type = 'episode' - playout = self._api2_request( - 'https://api2.shahid.net/proxy/v2/playout/url/' + video_id, - video_id, 'Downloading player JSON')['playout'] + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) @@ -107,13 +117,27 @@ def _real_extract(self, url): formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') self._sort_formats(formats) - video = self._get_api_data(self._download_json( + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), video_id, 'Downloading video 
JSON', query={ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }))[page_type] + }) + data = response.get('data', {}) + error = data.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + video = data[page_type] title = video['title'] categories = [ category['name'] @@ -135,3 +159,57 @@ def _real_extract(self, url): 'episode_id': video_id, 'formats': formats, } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, 
show.get('title'), show.get('description')) From 17c3aced5d0d2cf7df41e9978500260756ee8ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Dec 2017 22:53:04 +0700 Subject: [PATCH 56/78] [animeondemand] Relax login error regex --- youtube_dl/extractor/animeondemand.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 34c2b363ea..be032d5b42 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -85,8 +85,8 @@ def _login(self): if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( - r'

    (.+?)

    ', - response, 'error', default=None) + r']+\bclass=(["\'])(?:(?!\1).)*\balert\s(?:(?!\1).)*\1[^>]*>(?P.+?)

    ', + response, 'error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') From d2d766bc6d6f976c28fad8b69a1de060b55f5b17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 20 Dec 2017 23:17:36 +0700 Subject: [PATCH 57/78] [animeondemand] Fix typo --- youtube_dl/extractor/animeondemand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index be032d5b42..e4fa72f466 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -85,7 +85,7 @@ def _login(self): if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( - r']+\bclass=(["\'])(?:(?!\1).)*\balert\s(?:(?!\1).)*\1[^>]*>(?P.+?)

    ', + r']+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P.+?)

    ', response, 'error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) From 963d237d26c7e6da7b6f514c1d240a7046501b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:38:16 +0700 Subject: [PATCH 58/78] Add LICENSE, AUTHORS and ChangeLog to PyPI package (closes #15054) --- MANIFEST.in | 3 +++ setup.py | 1 + 2 files changed, 4 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 5743f605a2..af7518e0da 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,7 @@ include README.md +include LICENSE +include AUTHORS +include ChangeLog include test/*.py include test/*.json include youtube-dl.bash-completion diff --git a/setup.py b/setup.py index 67d6633ed6..7dbb5805f8 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,7 @@ def run(self): author_email='ytdl@yt-dl.org', maintainer='Sergey M.', maintainer_email='dstftw@gmail.com', + license='Unlicense', packages=[ 'youtube_dl', 'youtube_dl.extractor', 'youtube_dl.downloader', From 3e191da6d9d1cbe62d8f638ed68a93a46348b38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:46:08 +0700 Subject: [PATCH 59/78] [Makefile] Add AUTHORS to youtube-dl.tar.gz --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1c760bef8b..fe247810fe 100644 --- a/Makefile +++ b/Makefile @@ -110,7 +110,7 @@ _EXTRACTOR_FILES = $(shell find youtube_dl/extractor -iname '*.py' -and -not -in youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog +youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog AUTHORS @tar -czf youtube-dl.tar.gz 
--transform "s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ @@ -122,7 +122,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude 'docs/_build' \ -- \ bin devscripts test youtube_dl docs \ - ChangeLog LICENSE README.md README.txt \ + ChangeLog AUTHORS LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ youtube-dl.zsh youtube-dl.fish setup.py setup.cfg \ youtube-dl From 9e3682d555d431514d9583170ae8be1b6fc12839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Dec 2017 23:53:27 +0700 Subject: [PATCH 60/78] [MANIFEST.in] Include all test data in PyPI package --- MANIFEST.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index af7518e0da..4e43e99f39 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,9 +2,8 @@ include README.md include LICENSE include AUTHORS include ChangeLog -include test/*.py -include test/*.json include youtube-dl.bash-completion include youtube-dl.fish include youtube-dl.1 recursive-include docs Makefile conf.py *.rst +recursive-include test * From 4b7dd1705a7c16c1426ed7ed39e51e275124b4f3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 23 Dec 2017 13:21:33 +0100 Subject: [PATCH 61/78] [7plus] Add new extractor(closes #15043) --- youtube_dl/extractor/brightcove.py | 116 +++++++++++++++-------------- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sevenplus.py | 67 +++++++++++++++++ 3 files changed, 128 insertions(+), 56 deletions(-) create mode 100644 youtube_dl/extractor/sevenplus.py diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0ed59bcbc4..f04505011c 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -464,7 +464,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1441391203, 'upload_date': '20150904', 'uploader_id': '929656772001', - 'formats': 
'mincount:22', + 'formats': 'mincount:20', }, }, { # with rtmp streams @@ -478,7 +478,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1433556729, 'upload_date': '20150606', 'uploader_id': '4036320279001', - 'formats': 'mincount:41', + 'formats': 'mincount:39', }, 'params': { # m3u8 download @@ -564,59 +564,7 @@ def _extract_urls(ie, webpage): return entries - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) - - account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') - - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) - try: - json_data = self._download_json(api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - raise ExtractorError(message, expected=True) - raise - - errors = json_data.get('errors') - if errors and errors[0].get('error_subcode') == 'TVE_AUTH': - custom_fields = json_data['custom_fields'] - tve_token = self._extract_mvpd_auth( - smuggled_data['source_url'], video_id, - 
custom_fields['bcadobepassrequestorid'], - custom_fields['bcadobepassresourceid']) - json_data = self._download_json( - api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }, query={ - 'tveToken': tve_token, - }) - + def _parse_brightcove_metadata(self, json_data, video_id): title = json_data['name'].strip() formats = [] @@ -682,6 +630,7 @@ def build_format_id(kind): }) formats.append(f) + errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -708,9 +657,64 @@ def build_format_id(kind): 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': account_id, + 'uploader_id': json_data.get('account_id'), 'formats': formats, 'subtitles': subtitles, 'tags': json_data.get('tags', []), 'is_live': is_live, } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + try: + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + 
json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + return self._parse_brightcove_metadata(json_data, video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 513074801c..9ba1be2cd5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -926,6 +926,7 @@ from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .servus import ServusIE +from .sevenplus import SevenPlusIE from .sexu import SexuIE from .shahid import ( ShahidIE, diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py new file mode 100644 index 0000000000..9792f820a5 --- /dev/null +++ b/youtube_dl/extractor/sevenplus.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..utils import update_url_query + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P[^?]+\?.*?\bepisode-id=(?P[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/BEAT?episode-id=BEAT-001', + 'info_dict': { + 'id': 'BEAT-001', + 'ext': 'mp4', + 'title': 'S1 E1 - Help / Lucy In The Sky With Diamonds', + 'description': 'md5:37718bea20a8eedaca7f7361af566131', + 'uploader_id': 
'5303576322001', + 'upload_date': '20171031', + 'timestamp': 1509440068, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + + return info From 2132edaa03857085821b6a1214ce1410e0c2e463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 20:57:35 +0700 Subject: [PATCH 62/78] [extractor/common] Move X-Forwarded-For setup code into _request_webpage --- youtube_dl/extractor/common.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e5ef5e4906..3b79b8cb41 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -495,6 +495,16 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + 
+ # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -524,15 +534,6 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. 
- if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal From 5c5e60cff894e5372f89e6ba45d7ab6575c0a0b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 20:59:14 +0700 Subject: [PATCH 63/78] [voot] Fix video identification --- youtube_dl/extractor/voot.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index 4267544893..751b21ee51 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -16,7 +16,7 @@ class VootIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '441353', + 'id': '0_8ledb18o', 'ext': 'mp4', 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', @@ -59,9 +59,10 @@ def _real_extract(self, url): media = media_info['assets'] + entry_id = media['EntryId'] title = media['MediaName'] formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + media['EntryId'], + 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, video_id, 'mp4', m3u8_id='hls') self._sort_formats(formats) @@ -83,7 +84,8 @@ def _real_extract(self, url): episode_number = int_or_none(value) return { - 'id': video_id, + 'extractor_key': 'Kaltura', + 'id': entry_id, 'title': title, 'description': description, 'series': series, From 69d69da98aa093c05776371beac1f2ffb4f5eea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:16:32 +0700 Subject: [PATCH 64/78] [kaltura] Add another embed pattern for entry_id For cases when player 
configuration map is setup via indexing operator, e.g. kalturaPlayerConfiguration_1_lre6rg3i_10[entry_id] = 1_lre6rg3i (see https://www.heise.de/video/artikel/odcast-c-t-uplink-20-1-Apple-CarPlay-vs-Android-Auto-Galileo-3D-Sound-erklaert-3919694.html) --- youtube_dl/extractor/kaltura.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index bdac2df3e5..e369959e3c 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -125,9 +125,12 @@ def _extract_url(webpage): (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* (?P=q1).*? (?: - entry_?[Ii]d| - (?P["'])entry_?[Ii]d(?P=q2) - )\s*:\s* + (?: + entry_?[Ii]d| + (?P["'])entry_?[Ii]d(?P=q2) + )\s*:\s*| + \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]?\s*=\s* + ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) or re.search( From f5a6321107db17ec8efaccaa2a4febc64b5aa5ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:17:28 +0700 Subject: [PATCH 65/78] [ChangeLog] Actualize --- ChangeLog | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 03d2defb73..2d62f2fde0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version + +Core +* [extractor/common] Move X-Forwarded-For setup code into _request_webpage ++ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in + output template (#11427, #15018) ++ [extractor/common] Introduce uploader, uploader_id and uploader_url + meta fields for playlists (#11427, #15018) +* [downloader/fragment] Encode filename of fragment being removed (#15020) ++ [utils] Add another date format pattern (#14999) + +Extractors ++ [kaltura] Add another embed pattern for entry_id ++ [7plus] Add support for 7plus.com.au (#15043) +* [animeondemand] Relax login error regular expression ++ 
[shahid] Add support for show pages (#7401) ++ [youtube] Extract uploader, uploader_id and uploader_url for playlists + (#11427, #15018) +* [afreecatv] Improve format extraction (#15019) ++ [cspan] Add support for audio only pages and catch page errors (#14995) ++ [mailru] Add support for embed URLs (#14904) +* [crunchyroll] Future-proof XML element checks (#15013) +* [cbslocal] Fix timestamp extraction (#14999, #15000) +* [discoverygo] Correct TTML subtitle extension +* [vk] Make view count optional (#14979) +* [disney] Skip Apple FairPlay formats (#14982) +* [voot] Fix format extraction (#14758) + + version 2017.12.14 Core @@ -148,8 +177,8 @@ Extractors + [fxnetworks] Extract series metadata (#14603) + [younow] Add support for younow.com (#9255, #9432, #12436) * [dctptv] Fix extraction (#14599) -* [youtube] Restrict embed regex (#14600) -* [vimeo] Restrict iframe embed regex (#14600) +* [youtube] Restrict embed regular expression (#14600) +* [vimeo] Restrict iframe embed regular expression (#14600) * [soundgasm] Improve extraction (#14588) - [myvideo] Remove extractor (#8557) + [nbc] Add support for classic-tv videos (#14575) From c2f2f8b120628f7e0e4b0a6f7184884fa976d9c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 2017 21:22:41 +0700 Subject: [PATCH 66/78] [kaltura] Fix typo --- youtube_dl/extractor/kaltura.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index e369959e3c..562e25f6d3 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -129,7 +129,7 @@ def _extract_url(webpage): entry_?[Ii]d| (?P["'])entry_?[Ii]d(?P=q2) )\s*:\s*| - \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]?\s*=\s* + \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) or From 307a7588b0a9205688e8ebc2539c1b0e19f68a6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Dec 
2017 21:24:18 +0700 Subject: [PATCH 67/78] release 2017.12.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f37d8aa42e..d7a91239f2 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ ## Please follow the guide below --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ ### If the purpose of this *issue* is a *bug report*, *site support request* or [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.12.14 +[debug] youtube-dl version 2017.12.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 2d62f2fde0..ba64f3e026 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.12.23 Core * [extractor/common] Move X-Forwarded-For setup code into _request_webpage diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ebddd5b9d5..eac35e3909 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@ # Supported sites - **56.com** - **5min** - **6play** + - **7plus** - **8tracks** - **91porn** - **9c9media** @@ -728,6 +729,7 @@ # Supported sites - **Servus** - **Sexu** - **Shahid** + - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** - **Sina** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b5a634641..f999584d72 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.12.14' +__version__ = '2017.12.23' From 0e25a1a27875369a4fdf11b6a7fcfc969b1f482d Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Mon, 4 Dec 2017 10:33:56 +0800 Subject: [PATCH 68/78] [youku] Update ccode Change-Id: 
Id397e814e81ff560506d68563b7409eebbe5943d --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index f0ba011970..9d0caee934 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ def _real_extract(self, url): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0501', + 'ccode': '0507', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From 116561697d605f22f749e3d092e8e4795ca0573d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Dec 2017 23:41:24 +0800 Subject: [PATCH 69/78] [ChangeLog] Update after #14903 --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index ba64f3e026..658c00c5b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [youku] Update ccode + + version 2017.12.23 Core From b954e72c8731e65b9b0548a537cd0e3275b54e4d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Dec 2017 23:42:02 +0800 Subject: [PATCH 70/78] [ChangeLog] typo --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 658c00c5b3..cb750e2706 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version Extractors * [youku] Update ccode From 273c23d960cbd2da18fadaef002473db41b5f56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 24 Dec 2017 13:53:27 +0700 Subject: [PATCH 71/78] [openload] Add support for oload.stream (closes #15070) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a99af12a40..aed579f362 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -242,7 +242,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w class 
OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.tv)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -289,6 +289,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'http://www.openload.link/f/KnG-kKZdcfY', 'only_matching': True, + }, { + 'url': 'https://oload.stream/f/KnG-kKZdcfY', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From a75419586bb900df711de49adf5047afa9f083ef Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 24 Dec 2017 20:47:42 +0800 Subject: [PATCH 72/78] [openload] Remove a confusing exception If phantomjs is not installed, there's an error besides the missing phantomjs exception: Exception ignored in: > Traceback (most recent call last): File "/home/yen/Projects/youtube-dl/youtube_dl/extractor/openload.py", line 142, in __del__ os.remove(self._TMP_FILES[name].name) AttributeError: 'PhantomJSwrapper' object has no attribute '_TMP_FILES' --- youtube_dl/extractor/openload.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index aed579f362..d1eb3be259 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -112,6 +112,8 @@ def _version(): return get_exe_version('phantomjs', version_re=r'([0-9.]+)') def __init__(self, extractor, required_version=None, timeout=10000): + self._TMP_FILES = {} + self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError('PhantomJS executable not found in PATH, ' @@ -130,7 +132,6 @@ def __init__(self, extractor, required_version=None, timeout=10000): self.options = { 'timeout': timeout, } - self._TMP_FILES = {} for name in self._TMP_FILE_NAMES: tmp = 
tempfile.NamedTemporaryFile(delete=False) tmp.close() @@ -140,7 +141,7 @@ def __del__(self): for name in self._TMP_FILE_NAMES: try: os.remove(self._TMP_FILES[name].name) - except (IOError, OSError): + except (IOError, OSError, KeyError): pass def _save_cookies(self, url): From d99a1000c7522cb37910afe772d7317687521eb0 Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Sun, 24 Dec 2017 00:30:27 +0800 Subject: [PATCH 73/78] [youku] Fix list extraction.(close #15065) Change-Id: I578fdc5b69509bdcd8d3191e3917afe47c234ff6 --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 9d0caee934..3e64cce388 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -276,9 +276,9 @@ def _real_extract(self, url): r']+id="(reload_\d+)', first_page, 'first page reload id') # The first reload_id has the same items as first_page reload_ids = re.findall(']+data-id="([^"]+)">', first_page) + entries.extend(initial_entries) for idx, reload_id in enumerate(reload_ids): if reload_id == first_page_reload_id: - entries.extend(initial_entries) continue _, new_entries = self._extract_entries( 'http://list.youku.com/show/episode', show_id, From d3ca28323545a36819d9c32797907bc190095b5c Mon Sep 17 00:00:00 2001 From: JianxinLi Date: Mon, 25 Dec 2017 21:39:10 +0800 Subject: [PATCH 74/78] [youku] Add test case. Some playlist has no data-id value. Change-Id: I97455f2907f08bda03b538cdc13ec827e2f8ce26 --- youtube_dl/extractor/youku.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 3e64cce388..c7947d4a11 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -241,6 +241,10 @@ class YoukuShowIE(InfoExtractor): # Ongoing playlist. 
The initial page is the last one 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', 'only_matching': True, + }, { + # No data-id value. + 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', + 'only_matching': True, }] def _extract_entries(self, playlist_data_url, show_id, note, query): From 173558ce9620bf1b22ba2d4c67288e2a45c715fc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 25 Dec 2017 22:06:18 +0800 Subject: [PATCH 75/78] [ChangeLog] Update after #15065 --- ChangeLog | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index cb750e2706..420a1bd111 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,8 @@ version Extractors -* [youku] Update ccode +* [youku] Update ccode (#14880) +* [youku] Fix list extraction (#15065) version 2017.12.23 From 0f897e0929b2a3ebcae616f8b1bbdac8cd9c6f75 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 25 Dec 2017 23:28:51 +0100 Subject: [PATCH 76/78] [espn] add support for espnfc and extract more formats(closes #8053) --- youtube_dl/extractor/espn.py | 70 +++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 7a74360683..0e135b8bc7 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, @@ -9,22 +12,27 @@ ) -class ESPNIE(InfoExtractor): +class ESPNIE(OnceIE): _VALID_URL = r'''(?x) https?:// - (?: - (?:(?:\w+\.)+)?espn\.go| - (?:www\.)?espn - )\.com/ (?: (?: - video/clip| - watch/player - ) - (?: - \?.*?\bid=| - /_/id/ - ) + (?: + (?:(?:\w+\.)+)?espn\.go| + (?:www\.)?espn + )\.com/ + (?: + (?: + video/(?:clip|iframe/twitter)| + watch/player + ) + (?: + .*?\?.*?\bid=| + /_/id/ + ) + ) + )| + 
(?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ ) (?P\d+) ''' @@ -77,6 +85,15 @@ class ESPNIE(InfoExtractor): }, { 'url': 'http://www.espn.com/video/clip/_/id/17989860', 'only_matching': True, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', + 'only_matching': True, }] def _real_extract(self, url): @@ -93,7 +110,9 @@ def _real_extract(self, url): def traverse_source(source, base_source_id=None): for source_id, source in source.items(): - if isinstance(source, compat_str): + if source_id == 'alert': + continue + elif isinstance(source, compat_str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -106,7 +125,9 @@ def extract_source(source_url, source_id=None): return format_urls.add(source_url) ext = determine_ext(source_url) - if ext == 'smil': + if OnceIE.suitable(source_url): + formats.extend(self._extract_once_formats(source_url)) + elif ext == 'smil': formats.extend(self._extract_smil_formats( source_url, video_id, fatal=False)) elif ext == 'f4m': @@ -117,12 +138,24 @@ def extract_source(source_url, source_id=None): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=source_id, fatal=False)) else: - formats.append({ + f = { 'url': source_url, 'format_id': source_id, - }) + } + mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'fps': int(mobj.group(2)), + 'tbr': int(mobj.group(3)), + }) + if source_id == 'mezzanine': + f['preference'] = 1 + formats.append(f) - traverse_source(clip['links']['source']) + links = clip.get('links', {}) + traverse_source(links.get('source', {})) + traverse_source(links.get('mobile', {})) 
self._sort_formats(formats) description = clip.get('caption') or clip.get('description') @@ -144,9 +177,6 @@ def extract_source(source_url, source_id=None): class ESPNArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ - 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', - 'only_matching': True, - }, { 'url': 'http://espn.go.com/nba/recap?gameId=400793786', 'only_matching': True, }, { From 45d20488f188b680daa39c5b9fa88d0bba102ab5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Dec 2017 12:32:04 +0100 Subject: [PATCH 77/78] [umg:de] Add new extractor(closes #11582)(closes #11584) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/umg.py | 103 +++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 youtube_dl/extractor/umg.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9ba1be2cd5..3269ed743e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1143,6 +1143,7 @@ from .udn import UDNEmbedIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE +from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE from .uol import UOLIE diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py new file mode 100644 index 0000000000..d815cd9a6b --- /dev/null +++ b/youtube_dl/extractor/umg.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_filesize, + parse_iso8601, +) + + +class UMGDeIE(InfoExtractor): + IE_NAME = 'umg:de' + IE_DESC = 'Universal Music Deutschland' + _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P\d+)' + _TEST = { + 'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803', + 'md5': 'ebd90f48c80dcc82f77251eb1902634f', + 
'info_dict': { + 'id': '457803', + 'ext': 'mp4', + 'title': 'Jedes Wort ist Gold wert', + 'timestamp': 1513591800, + 'upload_date': '20171218', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://api.universal-music.de/graphql', + video_id, query={ + 'query': '''{ + universalMusic(channel:16) { + video(id:%s) { + headline + formats { + formatId + url + type + width + height + mimeType + fileSize + } + duration + createdDate + } + } +}''' % video_id})['data']['universalMusic']['video'] + + title = video_data['headline'] + hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8' + + thumbnails = [] + formats = [] + + def add_m3u8_format(format_id): + m3u8_formats = self._extract_m3u8_formats( + hls_url_template % format_id, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal='False') + if m3u8_formats and m3u8_formats[0].get('height'): + formats.extend(m3u8_formats) + + for f in video_data.get('formats', []): + f_url = f.get('url') + mime_type = f.get('mimeType') + if not f_url or mime_type == 'application/mxf': + continue + fmt = { + 'url': f_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize': parse_filesize(f.get('fileSize')), + } + f_type = f.get('type') + if f_type == 'Image': + thumbnails.append(fmt) + elif f_type == 'Video': + format_id = f.get('formatId') + if format_id: + fmt['format_id'] = format_id + if mime_type == 'video/mp4': + add_m3u8_format(format_id) + urlh = self._request_webpage(f_url, video_id, fatal=False) + if urlh: + first_byte = urlh.read(1) + if first_byte not in (b'F', b'\x00'): + continue + formats.append(fmt) + if not formats: + for format_id in (867, 836, 940): + add_m3u8_format(format_id) + self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) + + return { + 'id': video_id, + 'title': title, + 'duration': 
int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('createdDate'), ' '), + 'thumbnails': thumbnails, + 'formats': formats, + } From db145ee54a57f5ccc89639de8c589eb111a91b19 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Dec 2017 14:20:21 +0100 Subject: [PATCH 78/78] [espn] Add new extractor for http://fivethirtyeight.com(closes #6864) --- youtube_dl/extractor/espn.py | 31 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 32 insertions(+) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 0e135b8bc7..127c69b2eb 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -205,3 +205,34 @@ def _real_extract(self, url): return self.url_result( 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + + +class FiveThirtyEightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', + 'info_dict': { + 'id': '21846851', + 'ext': 'mp4', + 'title': 'FiveThirtyEight: The Raiders can still make the playoffs', + 'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', + 'timestamp': 1513960621, + 'upload_date': '20171222', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-video-id=["\'](?P<id>\d+)', + webpage, 'video id', group='id') + + return self.url_result( + 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3269ed743e..91bd3287ce 100644 --- a/youtube_dl/extractor/extractors.py +++
b/youtube_dl/extractor/extractors.py @@ -322,6 +322,7 @@ from .espn import ( ESPNIE, ESPNArticleIE, + FiveThirtyEightIE, ) from .esri import EsriVideoIE from .etonline import ETOnlineIE