From e0a8686f48d10ed86f7be92132dd37481981adf3 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 24 Nov 2017 18:42:41 +0100
Subject: [PATCH] [faz] fix extraction and add support for Perform Group embeds(fixes #14714)

---
 youtube_dl/extractor/extractors.py   |  1 +
 youtube_dl/extractor/faz.py          | 34 +++++++++---
 youtube_dl/extractor/performgroup.py | 83 ++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 7 deletions(-)
 create mode 100644 youtube_dl/extractor/performgroup.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d084707ee8..aecb84b188 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -789,6 +789,7 @@
 from .pbs import PBSIE
 from .pearvideo import PearVideoIE
 from .people import PeopleIE
+from .performgroup import PerformGroupIE
 from .periscope import (
     PeriscopeIE,
     PeriscopeUserIE,
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 4bc8fc5127..312ee2aeed 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -1,7 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_etree_fromstring
 from ..utils import (
     xpath_element,
     xpath_text,
@@ -43,10 +46,15 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(url, video_id)
         description = self._og_search_description(webpage)
-        config_xml_url = self._search_regex(
-            r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url')
-        config = self._download_xml(
-            config_xml_url, video_id, 'Downloading config xml')
+        media = self._html_search_regex(
+            r"data-videojs-media='([^']+)",
+            webpage, 'media')
+        if media == 'extern':
+            perform_url = self._search_regex(
+                r"<iframe[^>]+?src='((?:http:)?//player\.performgroup\.com/eplayer/eplayer\.html#/?[0-9a-f]{26}\.[0-9a-z]{26})",
+                webpage, 'perform url')
+            return self.url_result(perform_url)
+        config = compat_etree_fromstring(media)
 
         encodings = xpath_element(config, 'ENCODINGS', 'encodings', True)
         formats = []
@@ -55,12 +63,24 @@ def _real_extract(self, url):
             if encoding is not None:
                 encoding_url = xpath_text(encoding, 'FILENAME')
                 if encoding_url:
-                    formats.append({
+                    tbr = xpath_text(encoding, 'AVERAGEBITRATE', 1000)
+                    if tbr:
+                        tbr = int_or_none(tbr.replace(',', '.'))
+                    f = {
                         'url': encoding_url,
                         'format_id': code.lower(),
                         'quality': pref,
-                        'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')),
-                    })
+                        'tbr': tbr,
+                        'vcodec': xpath_text(encoding, 'CODEC'),
+                    }
+                    mobj = re.search(r'(\d+)x(\d+)_(\d+)\.mp4', encoding_url)
+                    if mobj:
+                        f.update({
+                            'width': int(mobj.group(1)),
+                            'height': int(mobj.group(2)),
+                            'tbr': tbr or int(mobj.group(3)),
+                        })
+                    formats.append(f)
         self._sort_formats(formats)
 
         return {
diff --git a/youtube_dl/extractor/performgroup.py b/youtube_dl/extractor/performgroup.py
new file mode 100644
index 0000000000..26942bfb3f
--- /dev/null
+++ b/youtube_dl/extractor/performgroup.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PerformGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})'
+    _TESTS = [{
+        # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html
+        'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab',
+        'md5': '259cb03d142e2e52471e8837ecacb29f',
+        'info_dict': {
+            'id': 'xgrwobuzumes1lwjxtcdpwgxd',
+            'ext': 'mp4',
+            'title': 'Liga MX: Keine Einsicht nach Horrorfoul',
+            'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b',
+            'timestamp': 1511533477,
+            'upload_date': '20171124',
+        }
+    }]
+
+    def _call_api(self, service, auth_token, content_id, referer_url):
+        return self._download_json(
+            'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id),
+            content_id, headers={
+                'Referer': referer_url,
+                'Origin': 'http://player.performgroup.com',
+            }, query={
+                '_fmt': 'json',
+            })
+
+    def _real_extract(self, url):
+        player_id, auth_token = re.search(self._VALID_URL, url).groups()
+        bootstrap = self._call_api('bootstrap', auth_token, player_id, url)
+        video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0]
+        video_id = video['uuid']
+        vod = self._call_api('vod', auth_token, video_id, url)
+        media = vod['videos']['video'][0]['media']
+
+        formats = []
+        hls_url = media.get('hls', {}).get('url')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+
+        hds_url = media.get('hds', {}).get('url')
+        if hds_url:
+            formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False))
+
+        for c in media.get('content', []):
+            c_url = c.get('url')
+            if not c_url:
+                continue
+            tbr = int_or_none(c.get('bitrate'), 1000)
+            format_id = 'http'
+            if tbr:
+                format_id += '-%d' % tbr
+            formats.append({
+                'format_id': format_id,
+                'url': c_url,
+                'tbr': tbr,
+                'width': int_or_none(c.get('width')),
+                'height': int_or_none(c.get('height')),
+                'filesize': int_or_none(c.get('fileSize')),
+                'vcodec': c.get('type'),
+                'fps': int_or_none(c.get('videoFrameRate')),
+                'vbr': int_or_none(c.get('videoRate'), 1000),
+                'abr': int_or_none(c.get('audioRate'), 1000),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'description': video.get('description'),
+            'thumbnail': video.get('poster'),
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': int_or_none(video.get('publishedTime'), 1000),
+            'formats': formats,
+        }