Update to ytdl-commit-dfbbe29

[redbulltv] fix embed data extraction dfbbe2902f
2025-02-18 23:24:11 +01:00 · 2021-05-20 21:08:49 +05:30 · 2021-05-20 21:08:49 +05:30 · b73612a254
commit b73612a254
parent 5014558ab9
10 changed files with 145 additions and 44 deletions
--- a/yt_dlp/extractor/eroprofile.py
+++ b/yt_dlp/extractor/eroprofile.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
 from ..compat import compat_urllib_parse_urlencode
 from ..utils import (
    ExtractorError,
-    unescapeHTML
+    merge_dicts,
 )
@@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor):
            'title': 'sexy babe softcore',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
-        }
+        },
        'skip': 'Video not found',
    }, {
        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
        'md5': '1baa9602ede46ce904c431f5418d8916',
@@ -77,19 +78,15 @@ class EroProfileIE(InfoExtractor):
            [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
            webpage, 'video id', default=None)
        video_url = unescapeHTML(self._search_regex(
            r'<source src="([^"]+)', webpage, 'video url'))
        title = self._html_search_regex(
-            r'Title:</th><td>([^<]+)</td>', webpage, 'title')
+            (r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'),
-        thumbnail = self._search_regex(
+            webpage, 'title')
            r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
            webpage, 'thumbnail', fatal=False)
-        return {
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
        return merge_dicts(info, {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'age_limit': 18,
-        }
+        })
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -985,6 +985,7 @@ from .platzi import (
 from .playfm import PlayFMIE
 from .playplustv import PlayPlusTVIE
 from .plays import PlaysTVIE
 from .playstuff import PlayStuffIE
 from .playtvak import PlaytvakIE
 from .playvid import PlayvidIE
 from .playwire import PlaywireIE
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -126,6 +126,7 @@ from .viqeo import ViqeoIE
 from .expressen import ExpressenIE
 from .zype import ZypeIE
 from .odnoklassniki import OdnoklassnikiIE
 from .vk import VKIE
 from .kinja import KinjaEmbedIE
 from .gedidigital import GediDigitalIE
 from .rcs import RCSEmbedsIE
@@ -2252,6 +2253,10 @@ class GenericIE(InfoExtractor):
            'playlist_mincount': 52,
        },
        {
            # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
            'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
            'only_matching': True,
        }, {
            # WimTv embed player
            'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/',
            'info_dict': {
@@ -2803,6 +2808,11 @@ class GenericIE(InfoExtractor):
        if odnoklassniki_url:
            return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
        # Look for sibnet embedded player
        sibnet_urls = VKIE._extract_sibnet_urls(webpage)
        if sibnet_urls:
            return self.playlist_from_matches(sibnet_urls, video_id, video_title)
        # Look for embedded ivi player
        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
        if mobj is not None:
@@ -3454,6 +3464,9 @@ class GenericIE(InfoExtractor):
                        'url': src,
                        'ext': (mimetype2ext(src_type)
                                or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                        'http_headers': {
                            'Referer': full_response.geturl(),
                        },
                    })
            if formats:
                self._sort_formats(formats)
@@ -3522,7 +3535,7 @@ class GenericIE(InfoExtractor):
            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
-                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
+                found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
        if not found:
            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
            found = re.search(
--- a/yt_dlp/extractor/orf.py
+++ b/yt_dlp/extractor/orf.py
@@ -182,7 +182,7 @@ class ORFRadioIE(InfoExtractor):
            duration = end - start if end and start else None
            entries.append({
                'id': loop_stream_id.replace('.mp3', ''),
-                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
+                'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
                'title': title,
                'description': clean_html(data.get('subtitle')),
                'duration': duration,
--- a/yt_dlp/extractor/phoenix.py
+++ b/yt_dlp/extractor/phoenix.py
@@ -9,8 +9,9 @@ from ..compat import compat_str
 from ..utils import (
    int_or_none,
    merge_dicts,
    try_get,
    unified_timestamp,
-    xpath_text,
+    urljoin,
 )
@@ -27,10 +28,11 @@ class PhoenixIE(ZDFBaseIE):
            'title': 'Wohin führt der Protest in der Pandemie?',
            'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
            'duration': 1691,
-            'timestamp': 1613906100,
+            'timestamp': 1613902500,
            'upload_date': '20210221',
            'uploader': 'Phoenix',
-            'channel': 'corona nachgehakt',
+            'series': 'corona nachgehakt',
            'episode': 'Wohin führt der Protest in der Pandemie?',
        },
    }, {
        # Youtube embed
@@ -79,50 +81,53 @@ class PhoenixIE(ZDFBaseIE):
        video_id = compat_str(video.get('basename') or video.get('content'))
-        details = self._download_xml(
+        details = self._download_json(
            'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
-            video_id, 'Downloading details XML', query={
+            video_id, 'Downloading details JSON', query={
                'ak': 'web',
                'ptmd': 'true',
                'id': video_id,
                'profile': 'player2',
            })
-        title = title or xpath_text(
+        title = title or details['title']
-            details, './/information/title', 'title', fatal=True)
+        content_id = details['tracking']['nielsen']['content']['assetid']
        content_id = xpath_text(
            details, './/video/details/basename', 'content id', fatal=True)
        info = self._extract_ptmd(
            'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id,
            content_id, None, url)
-        timestamp = unified_timestamp(xpath_text(details, './/details/airtime'))
+        duration = int_or_none(try_get(
            details, lambda x: x['tracking']['nielsen']['content']['length']))
        timestamp = unified_timestamp(details.get('editorialDate'))
        series = try_get(
            details, lambda x: x['tracking']['nielsen']['content']['program'],
            compat_str)
        episode = title if details.get('contentType') == 'episode' else None
        thumbnails = []
-        for node in details.findall('.//teaserimages/teaserimage'):
+        teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {}
-            thumbnail_url = node.text
+        for thumbnail_key, thumbnail_url in teaser_images.items():
            thumbnail_url = urljoin(url, thumbnail_url)
            if not thumbnail_url:
                continue
            thumbnail = {
                'url': thumbnail_url,
            }
-            thumbnail_key = node.get('key')
+            m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
-            if thumbnail_key:
+            if m:
-                m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
+                thumbnail['width'] = int(m.group(1))
-                if m:
+                thumbnail['height'] = int(m.group(2))
                    thumbnail['width'] = int(m.group(1))
                    thumbnail['height'] = int(m.group(2))
            thumbnails.append(thumbnail)
        return merge_dicts(info, {
            'id': content_id,
            'title': title,
-            'description': xpath_text(details, './/information/detail'),
+            'description': details.get('leadParagraph'),
-            'duration': int_or_none(xpath_text(details, './/details/lengthSec')),
+            'duration': duration,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
-            'uploader': xpath_text(details, './/details/channel'),
+            'uploader': details.get('tvService'),
-            'uploader_id': xpath_text(details, './/details/originChannelId'),
+            'series': series,
-            'channel': xpath_text(details, './/details/originChannelTitle'),
+            'episode': episode,
        })
--- a/yt_dlp/extractor/playstuff.py
+++ b/yt_dlp/extractor/playstuff.py
@@ -0,0 +1,65 @@
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    smuggle_url,
    try_get,
 )
 class PlayStuffIE(InfoExtractor):
    """Extractor for play.stuff.co.nz ``/details/<id>`` pages.

    The page embeds its data as a JSON ``__INITIAL_STATE__`` object. Every
    entry of ``state['items']`` that carries a
    ``content.attributes.assetId`` is handed over to the ``BrightcoveNew``
    extractor, and the results are returned as a playlist.
    """
    _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a',
        'md5': 'c82d3669e5247c64bc382577843e5bd0',
        'info_dict': {
            'id': '6250584958001',
            'ext': 'mp4',
            'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga',
            'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913',
            'uploader_id': '6005208634001',
            'timestamp': 1619491027,
            'upload_date': '20210427',
        },
        'add_ie': ['BrightcoveNew'],
    }, {
        # geo restricted, bypassable
        'url': 'https://play.stuff.co.nz/details/_6155660351001',
        'only_matching': True,
    }]
    # Filled with (account id, player id, asset id), in that order
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
    def _real_extract(self, url):
        """Return a playlist of Brightcove URL results found on the page.

        Raises whatever the download/regex/JSON helpers raise when the page
        or its ``__INITIAL_STATE__`` blob cannot be obtained, and ``KeyError``
        when the state has no ``items`` key.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        state = self._parse_json(
            self._search_regex(
                r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'),
            video_id)
        # Fall back to fixed ids when the page state does not provide
        # string values for them
        account_id = try_get(
            state, lambda x: x['configurations']['accountId'],
            compat_str) or '6005208634001'
        player_id = try_get(
            state, lambda x: x['configurations']['playerId'],
            compat_str) or 'default'
        entries = []
        # Only the item payloads are needed; their keys are never used
        for video in state['items'].values():
            if not isinstance(video, dict):
                continue
            asset_id = try_get(
                video, lambda x: x['content']['attributes']['assetId'],
                compat_str)
            if not asset_id:
                # Items without a string asset id are skipped
                continue
            entries.append(self.url_result(
                smuggle_url(
                    self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id),
                    # Smuggled along for the Brightcove extractor (see the
                    # "geo restricted, bypassable" test above)
                    {'geo_countries': ['NZ']}),
                'BrightcoveNew', video_id))
        return self.playlist_result(entries, video_id)
--- a/yt_dlp/extractor/redbulltv.py
+++ b/yt_dlp/extractor/redbulltv.py
@@ -133,8 +133,10 @@ class RedBullEmbedIE(RedBullTVIE):
        rrn_id = self._match_id(url)
        asset_id = self._download_json(
            'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql',
-            rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'},
+            rrn_id, headers={
-            query={
+                'Accept': 'application/json',
                'API-KEY': 'e90a1ff11335423998b100c929ecc866',
            }, query={
                'query': '''{
  resource(id: "%s", enforceGeoBlocking: false) {
    %s
--- a/yt_dlp/extractor/shahid.py
+++ b/yt_dlp/extractor/shahid.py
@@ -21,6 +21,7 @@ from ..utils import (
 class ShahidBaseIE(AWSIE):
    _AWS_PROXY_HOST = 'api2.shahid.net'
    _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh'
    _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/'
    def _handle_error(self, e):
        fail_data = self._parse_json(
@@ -49,7 +50,7 @@ class ShahidBaseIE(AWSIE):
 class ShahidIE(ShahidBaseIE):
    _NETRC_MACHINE = 'shahid'
-    _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
+    _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
        'info_dict': {
@@ -73,6 +74,9 @@ class ShahidIE(ShahidBaseIE):
        # shahid plus subscriber only
        'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511',
        'only_matching': True
    }, {
        'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319',
        'only_matching': True
    }]
    def _real_initialize(self):
@@ -168,7 +172,7 @@ class ShahidIE(ShahidBaseIE):
 class ShahidShowIE(ShahidBaseIE):
-    _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
+    _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
        'info_dict': {
--- a/yt_dlp/extractor/shared.py
+++ b/yt_dlp/extractor/shared.py
@@ -86,10 +86,10 @@ class SharedIE(SharedBaseIE):
 class VivoIE(SharedBaseIE):
    IE_DESC = 'vivo.sx'
-    _VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})'
+    _VALID_URL = r'https?://vivo\.s[xt]/(?P<id>[\da-z]{10})'
    _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed'
-    _TEST = {
+    _TESTS = [{
        'url': 'http://vivo.sx/d7ddda0e78',
        'md5': '15b3af41be0b4fe01f4df075c2678b2c',
        'info_dict': {
@@ -98,7 +98,10 @@ class VivoIE(SharedBaseIE):
            'title': 'Chicken',
            'filesize': 515659,
        },
-    }
+    }, {
        'url': 'http://vivo.st/d7ddda0e78',
        'only_matching': True,
    }]
    def _extract_title(self, webpage):
        title = self._html_search_regex(
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -300,6 +300,13 @@ class VKIE(VKBaseIE):
            'only_matching': True,
        }]
    @staticmethod
    def _extract_sibnet_urls(webpage):
        """Collect the HTML-unescaped ``src`` URLs of Sibnet iframe embeds.

        Returns a list (possibly empty) with one URL per matching
        ``<iframe>`` found in ``webpage``, in document order.
        """
        # Embed markup reference: https://help.sibnet.ru/?sibnet_video_embed
        embed_re = r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'
        sibnet_urls = []
        for embed in re.finditer(embed_re, webpage):
            sibnet_urls.append(unescapeHTML(embed.group('url')))
        return sibnet_urls
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoid')
@@ -408,6 +415,10 @@ class VKIE(VKBaseIE):
        if odnoklassniki_url:
            return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
        sibnet_urls = self._extract_sibnet_urls(info_page)
        if sibnet_urls:
            return self.url_result(sibnet_urls[0])
        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
        if m_opts:
            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))