From 9bb8e0a3f9276f65de38cda431bf72f7bd266693 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 3 Feb 2015 10:58:28 +0100 Subject: [PATCH] [wsj] Add new extractor (Fixes #4854) --- test/test_utils.py | 3 ++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/common.py | 1 + youtube_dl/extractor/wsj.py | 89 ++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 2 +- 5 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/wsj.py diff --git a/test/test_utils.py b/test/test_utils.py index 0ffccd35f0..80c765bc49 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -156,6 +156,9 @@ def test_unified_dates(self): self.assertEqual( unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False), '20141126') + self.assertEqual( + unified_strdate('2/2/2015 6:47:40 PM', day_first=False), + '20150202') def test_find_xpath_attr(self): testxml = ''' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dcb14febb..5866a76179 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -554,6 +554,7 @@ from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .wrzuta import WrzutaIE +from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xhamster import XHamsterIE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 653d793fc0..602601b243 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -145,6 +145,7 @@ class InfoExtractor(object): thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. + creator: The main artist who created the video. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. 
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
new file mode 100644
index 0000000000..cbe3dc7bec
--- /dev/null
+++ b/youtube_dl/extractor/wsj.py
@@ -0,0 +1,89 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
+
+
+class WSJIE(InfoExtractor):
+    """Extract videos embedded through the WSJ video-api iframe player."""
+
+    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
+    IE_DESC = 'Wall Street Journal'
+    _TEST = {
+        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
+        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
+        'info_dict': {
+            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
+            'ext': 'mp4',
+            'upload_date': '20150202',
+            'uploader_id': 'bbright',
+            'creator': 'bbright',
+            'categories': list,  # a long list
+            'duration': 90,
+            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video whose GUID is given in url."""
+        video_id = self._match_id(url)
+
+        # A video<N>kMP4Url field is requested for each of these bitrates
+        bitrates = [128, 174, 264, 320, 464, 664, 1264]
+        api_url = (
+            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
+            'type=guid&count=1&query=%s&'
+            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
+            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
+            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
+            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
+            'allthingsd-subsection,sm-section,sm-subsection,provider,'
+            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
+            'emailURL,emailPartnerID,showName,omnitureProgramName,'
+            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
+            'omniturePublishDate,%s') % (
+                video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
+        # count=1 is requested, so only the first result item is used
+        info = self._download_json(api_url, video_id)['items'][0]
+
+        # videoURL is indexed directly, i.e. treated as always present
+        formats = [{
+            'format_id': 'f4m',
+            'format_note': 'f4m (meta URL)',
+            'url': info['videoURL'],
+        }]
+        # HLS and per-bitrate MP4 variants are added only when returned
+        if info.get('hls'):
+            formats.extend(self._extract_m3u8_formats(
+                info['hls'], video_id, ext='mp4',
+                preference=0, entry_protocol='m3u8_native'))
+        for br in bitrates:
+            field = 'video%dkMP4Url' % br
+            if info.get(field):
+                formats.append({
+                    'format_id': 'mp4-%d' % br,
+                    'container': 'mp4',
+                    'tbr': br,
+                    'url': info[field],
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            # Fall back to titletag when name is missing or empty
+            'title': info.get('name') or info.get('titletag'),
+            'formats': formats,
+            # Thumbnails are conveniently in the correct format already
+            'thumbnails': info.get('thumbnailList'),
+            'creator': info.get('author'),
+            'uploader_id': info.get('editor'),
+            'categories': info.get('keywords'),
+            'duration': int_or_none(info.get('duration')),
+            # The creation date is parsed month-first
+            'upload_date': unified_strdate(
+                info.get('formattedCreationDate'), day_first=False),
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 251074bf5a..8f5463f1c9 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -701,7 +701,7 @@ def 
unified_strdate(date_str, day_first=True): # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) format_expressions = [ '%d %B %Y',