[mlb] Extract more metadata and all formats, provide more tests

This commit is contained in:
Sergey M․ 2014-07-16 20:40:28 +07:00
parent 1aa42fedee
commit 7bb49d1057
2 changed files with 81 additions and 53 deletions

View File

@@ -170,7 +170,7 @@
from .metacritic import MetacriticIE from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE, OCWMITIE from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mixcloud import MixcloudIE from .mixcloud import MixcloudIE
from .mlb import MlbIE from .mlb import MLBIE
from .mpora import MporaIE from .mpora import MporaIE
from .mofosex import MofosexIE from .mofosex import MofosexIE
from .mooshare import MooshareIE from .mooshare import MooshareIE

View File

@@ -3,72 +3,100 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
find_xpath_attr,
)
class MlbIE(InfoExtractor): class MLBIE(InfoExtractor):
_VALID_URL = r'http?://m\.mlb\.com/video/topic/[0-9]+/v(?P<id>n?\d+)/.*$' _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
_TEST = { _TESTS = [
{
'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
'md5': u'd9c022c10d21f849f49c05ae12a8a7e9', 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
'info_dict': { 'info_dict': {
'id': '34496663', 'id': '34496663',
'ext': 'mp4', 'ext': 'mp4',
'format': 'mp4', 'title': 'Stanton prepares for Derby',
'description': "7/11/14: Giancarlo Stanton practices for the Home Run Derby prior to the game against the Mets", 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
'title': "Stanton prepares for Derby", 'duration': 46,
'timestamp': 1405105800,
'upload_date': '20140711',
'thumbnail': 're:^https?://.*\.jpg$',
}, },
} },
{
'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
'md5': '0e6e73d509321e142409b695eadd541f',
'info_dict': {
'id': '34578115',
'ext': 'mp4',
'title': 'Cespedes repeats as Derby champ',
'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
'duration': 488,
'timestamp': 1405399936,
'upload_date': '20140715',
'thumbnail': 're:^https?://.*\.jpg$',
},
},
{
'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
'md5': 'b8fd237347b844365d74ea61d4245967',
'info_dict': {
'id': '34577915',
'ext': 'mp4',
'title': 'Bautista on Home Run Derby',
'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
'duration': 52,
'timestamp': 1405390722,
'upload_date': '20140715',
'thumbnail': 're:^https?://.*\.jpg$',
},
},
]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) detail = self._download_xml(
'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
% (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
title = self._og_search_title(webpage, default=video_id) title = detail.find('./headline').text
description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)"/>', webpage, 'description', fatal=False) description = detail.find('./big-blurb').text
thumbnail = self._html_search_regex(r'<meta itemprop="image" (?:content|value)="(.*?)" />', webpage, 'image', fatal=False) duration = parse_duration(detail.find('./duration').text)
timestamp = parse_iso8601(detail.attrib['date'][:-5])
# use the video_id to find the Media detail XML thumbnail = find_xpath_attr(
id_len = len(video_id) detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
_mediadetail_url = 'http://m.mlb.com/gen/multimedia/detail/'+video_id[id_len-3]+'/'+video_id[id_len-2]+'/'+video_id[id_len-1]+'/'+video_id+'.xml'
mediadetails = self._download_xml(_mediadetail_url, video_id, "Downloading media detail...") formats = []
has1500K = 0 for media_url in detail.findall('./url'):
has1200K = 0 playback_scenario = media_url.attrib['playback_scenario']
has600K = 0 fmt = {
# loop through the list of url's and only get the highest quality MP4 content 'url': media_url.text,
for element in mediadetails.findall('url'): 'format_id': playback_scenario,
scenario = element.attrib['playback_scenario'] }
if scenario.startswith(u'FLASH'): m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
if scenario.startswith(u'FLASH_1800K'): if m:
video_url = element.text fmt.update({
# 1800K is the current highest quality video on MLB.com 'vbr': int(m.group('vbr')) * 1000,
break 'width': int(m.group('width')),
else: 'height': int(m.group('height')),
if scenario.startswith(u'FLASH_1500K'): })
video_url = element.text formats.append(fmt)
has1500K = 1
else: self._sort_formats(formats)
if (scenario.startswith(u'FLASH_1200K') and not has1500K):
video_url = element.text
has1200K = 1
else:
if (scenario.startswith(u'FLASH_600K') and not has1200K):
video_url = element.text
has600K = 1
else:
if (scenario.startswith(u'FLASH_300K') and not has600K):
video_url = element.text
return { return {
'id': video_id, 'id': video_id,
'url': video_url,
'extractor': 'mlb',
'webpage_url': url,
'title': title, 'title': title,
'ext': 'mp4',
'format': 'mp4',
'description': description, 'description': description,
'duration': duration,
'timestamp': timestamp,
'formats': formats,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }