Add new extractor

This commit is contained in:
Lucas 2015-09-24 22:19:09 +02:00 committed by Jaime Marquínez Ferrándiz
parent 6722ebd437
commit 47f2d01a5a
2 changed files with 116 additions and 0 deletions

View File

@ -274,6 +274,7 @@
from .keezmovies import KeezMoviesIE from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kika import KikaIE
from .keek import KeekIE from .keek import KeekIE
from .kontrtube import KontrTubeIE from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE from .krasview import KrasViewIE

View File

@ -0,0 +1,115 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import ExtractorError
class KikaIE(InfoExtractor):
    """Extractor for kika.de video and broadcast pages.

    The page matched by ``_VALID_URL`` is scanned for a reference to a
    ``video<ID>-avCustom.xml`` document; that XML document supplies the
    title, the optional description, the duration and the downloadable
    assets.
    """

    _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*'
    _TESTS = [
        {
            'url': 'http://www.kika.de/baumhaus/videos/video9572.html',
            'md5': '94fc748cf5d64916571d275a07ffe2d5',
            'info_dict': {
                'id': '9572',
                'ext': 'mp4',
                'title': 'Baumhaus vom 29. Oktober 2014',
                'description': None
            }
        },
        {
            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
            'info_dict': {
                'id': '8182',
                'ext': 'mp4',
                'title': 'Beutolomäus und der geheime Weihnachtswunsch',
                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd'
            }
        },
        {
            'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html',
            'md5': '94fc748cf5d64916571d275a07ffe2d5',
            'info_dict': {
                'id': '9572',
                'ext': 'mp4',
                'title': 'Baumhaus vom 29. Oktober 2014',
                'description': None
            }
        },
        {
            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html',
            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
            'info_dict': {
                'id': '8182',
                'ext': 'mp4',
                'title': 'Beutolomäus und der geheime Weihnachtswunsch',
                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd'
            }
        }
    ]

    @staticmethod
    def _text_of(parent, tag):
        """Return the text of child element *tag* of *parent*, or None.

        None is returned when *parent* itself is None or has no such child.
        """
        if parent is None:
            return None
        child = parent.find(tag)
        if child is None:
            return None
        return child.text

    @staticmethod
    def _as_int(text):
        """Convert *text* to int; return None for missing or non-numeric text."""
        if text is None:
            return None
        try:
            return int(text)
        except ValueError:
            return None

    @staticmethod
    def _mmss_to_seconds(text):
        """Convert a colon-separated duration string to seconds.

        The feed uses ``mm:ss`` even for durations of an hour or more
        (e.g. ``78:42``). Every component is folded in base 60, so ``mm:ss``
        yields ``mm * 60 + ss`` and a bare number or ``hh:mm:ss`` is accepted
        as well. Returns None when *text* is empty or not numeric.
        """
        if not text:
            return None
        seconds = 0
        try:
            for part in text.strip().split(':'):
                seconds = seconds * 60 + int(part)
        except ValueError:
            return None
        return seconds

    def _collect_formats(self, xml_tree):
        """Build the list of format dicts from the ``assets`` element.

        Assets without a ``progressiveDownloadUrl`` are skipped. Fields that
        are absent or not numeric are left out of the format dict instead of
        aborting the extraction. The result is ordered by ascending pixel
        count (width * height).
        """
        formats = []
        assets = xml_tree.find('assets')
        for asset in (assets if assets is not None else []):
            media_url = self._text_of(asset, 'progressiveDownloadUrl')
            if not media_url:
                continue

            media_type = self._text_of(asset, 'mediaType')
            width = self._as_int(self._text_of(asset, 'frameWidth'))
            height = self._as_int(self._text_of(asset, 'frameHeight'))
            abr = self._as_int(self._text_of(asset, 'bitrateAudio'))
            vbr = self._as_int(self._text_of(asset, 'bitrateVideo'))

            have_size = width is not None and height is not None
            have_rates = abr is not None and vbr is not None
            candidate = {
                'url': media_url,
                'ext': media_type.lower() if media_type else None,
                'format': self._text_of(asset, 'profileName'),
                'width': width,
                'height': height,
                'resolution': '%dx%d' % (width, height) if have_size else None,
                'abr': abr,
                'vbr': vbr,
                'tbr': abr + vbr if have_rates else None,
                'filesize': self._as_int(self._text_of(asset, 'fileSize')),
            }
            # Drop unknown fields so that only real values are reported.
            formats.append(
                dict((k, v) for k, v in candidate.items() if v is not None))

        # Sort on an explicit key: sorting (resolution, dict) tuples falls
        # back to comparing the dicts when two resolutions are equal, which
        # raises TypeError on Python 3. list.sort is stable, so assets with
        # equal resolution keep their order from the XML document.
        formats.sort(key=lambda f: f.get('width', 0) * f.get('height', 0))
        return formats

    def _real_extract(self, url):
        """Extract the info dict for the kika.de page at *url*.

        Raises ExtractorError (expected) when the page does not reference an
        avCustom XML document, and ExtractorError when that document carries
        no title.
        """
        # broadcast_id may be the same as the video_id
        broadcast_id = self._match_id(url)
        webpage = self._download_webpage(url, broadcast_id)

        xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml'
        video_id = self._search_regex(xml_re, webpage, "xml_url", default=None)
        if not video_id:
            # Video is not available online
            err_msg = 'Video %s is not available online' % broadcast_id
            raise ExtractorError(err_msg, expected=True)

        xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id)
        xml_tree = self._download_xml(xml_url, video_id)

        title = self._text_of(xml_tree, 'title')
        if not title:
            raise ExtractorError('Unable to extract title for video %s' % video_id)

        # The description is not available for all videos.
        description = self._text_of(
            xml_tree.find('broadcast'), 'broadcastDescription')

        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': self._collect_formats(xml_tree),
            'duration': self._mmss_to_seconds(self._text_of(xml_tree, 'duration')),
        }
        # Only report the canonical page URL when the XML provides one.
        webpage_url = self._text_of(xml_tree, 'htmlUrl')
        if webpage_url:
            info['webpage_url'] = webpage_url
        return info