From 3a1583ca75fb523cbad0e5e174387ea7b477d175 Mon Sep 17 00:00:00 2001
From: sepro
Date: Fri, 21 Feb 2025 22:39:41 +0100
Subject: [PATCH] [ie/BunnyCdn] Add extractor (#11586)

Also adds BunnyCdnFD

Authored by: seproDev, Grub4K

Co-authored-by: Simon Sawicki
---

Notes:
  * BunnyCdnFD hands the actual download to HlsFD. While that download
    runs, a background thread requests the ping URL every 2 seconds,
    using the '_bunnycdn_ping_data' entry that BunnyCdnIE stores in the
    info dict; the thread is told to stop once the download returns.
  * BunnyCdnIE collects HTML5 media entries from the embed page, adds the
    original file as a 'source' format when a HEAD request returns 200,
    and adds HLS formats with protocol 'bunnycdn' when the page exposes
    src/activate/ping URLs together with 'secret' and 'contextId'.
  * In BunnyCdnIE._real_extract the assignment expression is
    parenthesised; without the parentheses html_title is bound to the
    result of the '==' comparison and the '404' branch is unreachable.
  * SovietsClosetIE now returns a url_transparent result pointing at
    BunnyCdnIE and records the former numeric id in '_old_archive_ids'.

 yt_dlp/downloader/__init__.py     |   2 +
 yt_dlp/downloader/bunnycdn.py     |  50 +++++++++
 yt_dlp/extractor/_extractors.py   |   1 +
 yt_dlp/extractor/bunnycdn.py      | 178 ++++++++++++++++++++++++++++++
 yt_dlp/extractor/sovietscloset.py |  45 +++-----
 5 files changed, 246 insertions(+), 30 deletions(-)
 create mode 100644 yt_dlp/downloader/bunnycdn.py
 create mode 100644 yt_dlp/extractor/bunnycdn.py

diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py
index 51a9f28f06..1b12bd4bed 100644
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@@ -35,6 +35,7 @@ from .rtmp import RtmpFD
 from .rtsp import RtspFD
 from .websocket import WebSocketFragmentFD
 from .youtube_live_chat import YoutubeLiveChatFD
+from .bunnycdn import BunnyCdnFD
 
 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
@@ -55,6 +56,7 @@ PROTOCOL_MAP = {
     'websocket_frag': WebSocketFragmentFD,
     'youtube_live_chat': YoutubeLiveChatFD,
     'youtube_live_chat_replay': YoutubeLiveChatFD,
+    'bunnycdn': BunnyCdnFD,
 }
 
 
diff --git a/yt_dlp/downloader/bunnycdn.py b/yt_dlp/downloader/bunnycdn.py
new file mode 100644
index 0000000000..e787f698a1
--- /dev/null
+++ b/yt_dlp/downloader/bunnycdn.py
@@ -0,0 +1,50 @@
+import hashlib
+import random
+import threading
+
+from .common import FileDownloader
+from . import HlsFD
+from ..networking import Request
+from ..networking.exceptions import network_exceptions
+
+
+class BunnyCdnFD(FileDownloader):
+    """
+    Downloads from BunnyCDN with required pings
+    Note, this is not a part of public API, and will be removed without notice.
+    DO NOT USE
+    """
+
+    def real_download(self, filename, info_dict):
+        self.to_screen(f'[{self.FD_NAME}] Downloading from BunnyCDN')
+
+        fd = HlsFD(self.ydl, self.params)
+
+        stop_event = threading.Event()
+        ping_thread = threading.Thread(target=self.ping_thread, args=(stop_event,), kwargs=info_dict['_bunnycdn_ping_data'])
+        ping_thread.start()
+
+        try:
+            return fd.real_download(filename, info_dict)
+        finally:
+            stop_event.set()
+
+    def ping_thread(self, stop_event, url, headers, secret, context_id):
+        # Site sends ping every 4 seconds, but this throttles the download. Pinging every 2 seconds seems to work.
+        ping_interval = 2
+        # Hard coded resolution as it doesn't seem to matter
+        res = 1080
+        paused = 'false'
+        current_time = 0
+
+        while not stop_event.wait(ping_interval):
+            current_time += ping_interval
+
+            time = current_time + round(random.random(), 6)
+            md5_hash = hashlib.md5(f'{secret}_{context_id}_{time}_{paused}_{res}'.encode()).hexdigest()
+            ping_url = f'{url}?hash={md5_hash}&time={time}&paused={paused}&resolution={res}'
+
+            try:
+                self.ydl.urlopen(Request(ping_url, headers=headers)).read()
+            except network_exceptions as e:
+                self.to_screen(f'[{self.FD_NAME}] Ping failed: {e}')
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index c937dfe13c..9a49bcb309 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -312,6 +312,7 @@ from .brilliantpala import (
 )
 from .bundesliga import BundesligaIE
 from .bundestag import BundestagIE
+from .bunnycdn import BunnyCdnIE
 from .businessinsider import BusinessInsiderIE
 from .buzzfeed import BuzzFeedIE
 from .byutv import BYUtvIE
diff --git a/yt_dlp/extractor/bunnycdn.py b/yt_dlp/extractor/bunnycdn.py
new file mode 100644
index 0000000000..d787533841
--- /dev/null
+++ b/yt_dlp/extractor/bunnycdn.py
@@ -0,0 +1,178 @@
+import json
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    int_or_none,
+    parse_qs,
+    smuggle_url,
+    unsmuggle_url,
+    url_or_none,
+    urlhandle_detect_ext,
+)
+from ..utils.traversal import find_element, traverse_obj
+
+
+class BunnyCdnIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:iframe\.mediadelivery\.net|video\.bunnycdn\.com)/(?:embed|play)/(?P<library_id>\d+)/(?P<id>[\da-f-]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL}[^\'"]*)[\'"]']
+    _TESTS = [{
+        'url': 'https://iframe.mediadelivery.net/embed/113933/e73edec1-e381-4c8b-ae73-717a140e0924',
+        'info_dict': {
+            'id': 'e73edec1-e381-4c8b-ae73-717a140e0924',
+            'ext': 'mp4',
+            'title': 'mistress morgana (3).mp4',
+            'description': '',
+            'timestamp': 1693251673,
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/e73edec1-e381-4c8b-ae73-717a140e0924/thumbnail\.jpg',
+            'duration': 7.0,
+            'upload_date': '20230828',
+        },
+        'params': {'skip_download': True},
+    }, {
+        'url': 'https://iframe.mediadelivery.net/play/136145/32e34c4b-0d72-437c-9abb-05e67657da34',
+        'info_dict': {
+            'id': '32e34c4b-0d72-437c-9abb-05e67657da34',
+            'ext': 'mp4',
+            'timestamp': 1691145748,
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/32e34c4b-0d72-437c-9abb-05e67657da34/thumbnail_9172dc16\.jpg',
+            'duration': 106.0,
+            'description': 'md5:981a3e899a5c78352b21ed8b2f1efd81',
+            'upload_date': '20230804',
+            'title': 'Sanela ist Teil der #arbeitsmarktkraft',
+        },
+        'params': {'skip_download': True},
+    }, {
+        # Stream requires activation and pings
+        'url': 'https://iframe.mediadelivery.net/embed/200867/2e8545ec-509d-4571-b855-4cf0235ccd75',
+        'info_dict': {
+            'id': '2e8545ec-509d-4571-b855-4cf0235ccd75',
+            'ext': 'mp4',
+            'timestamp': 1708497752,
+            'title': 'netflix part 1',
+            'duration': 3959.0,
+            'description': '',
+            'upload_date': '20240221',
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/2e8545ec-509d-4571-b855-4cf0235ccd75/thumbnail\.jpg',
+        },
+        'params': {'skip_download': True},
+    }]
+    _WEBPAGE_TESTS = [{
+        # Stream requires Referer
+        'url': 'https://conword.io/',
+        'info_dict': {
+            'id': '3a5d863e-9cd6-447e-b6ef-e289af50b349',
+            'ext': 'mp4',
+            'title': 'Conword bei der Stadt Köln und Stadt Dortmund',
+            'description': '',
+            'upload_date': '20231031',
+            'duration': 31.0,
+            'thumbnail': 'https://video.watchuh.com/3a5d863e-9cd6-447e-b6ef-e289af50b349/thumbnail.jpg',
+            'timestamp': 1698783879,
+        },
+        'params': {'skip_download': True},
+    }, {
+        # URL requires token and expires
+        'url': 'https://www.stockphotos.com/video/moscow-subway-the-train-is-arriving-at-the-park-kultury-station-10017830',
+        'info_dict': {
+            'id': '0b02fa20-4e8c-4140-8f87-f64d820a3386',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/0b02fa20-4e8c-4140-8f87-f64d820a3386/thumbnail\.jpg',
+            'title': 'Moscow subway. The train is arriving at the Park Kultury station.',
+            'upload_date': '20240531',
+            'duration': 18.0,
+            'timestamp': 1717152269,
+            'description': '',
+        },
+        'params': {'skip_download': True},
+    }]
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield smuggle_url(embed_url, {'Referer': url})
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        video_id, library_id = self._match_valid_url(url).group('id', 'library_id')
+        webpage = self._download_webpage(
+            f'https://iframe.mediadelivery.net/embed/{library_id}/{video_id}', video_id,
+            headers=traverse_obj(smuggled_data, {'Referer': 'Referer'}),
+            query=traverse_obj(parse_qs(url), {'token': 'token', 'expires': 'expires'}))
+
+        if (html_title := self._html_extract_title(webpage, default=None)) == '403':  # parentheses make := bind the title, not the comparison result
+            raise ExtractorError(
+                'This video is inaccessible. Setting a Referer header '
+                'might be required to access the video', expected=True)
+        elif html_title == '404':
+            raise ExtractorError('This video does not exist', expected=True)
+
+        headers = {'Referer': url}
+
+        info = traverse_obj(self._parse_html5_media_entries(url, webpage, video_id, _headers=headers), 0) or {}
+        formats = info.get('formats') or []
+        subtitles = info.get('subtitles') or {}
+
+        original_url = self._search_regex(
+            r'(?:var|const|let)\s+originalUrl\s*=\s*["\']([^"\']+)["\']', webpage, 'original url', default=None)
+        if url_or_none(original_url):
+            urlh = self._request_webpage(
+                HEADRequest(original_url), video_id=video_id, note='Checking original',
+                headers=headers, fatal=False, expected_status=(403, 404))
+            if urlh and urlh.status == 200:
+                formats.append({
+                    'url': original_url,
+                    'format_id': 'source',
+                    'quality': 1,
+                    'http_headers': headers,
+                    'ext': urlhandle_detect_ext(urlh, default='mp4'),
+                    'filesize': int_or_none(urlh.get_header('Content-Length')),
+                })
+
+        # MediaCage Streams require activation and pings
+        src_url = self._search_regex(
+            r'\.setAttribute\([\'"]src[\'"],\s*[\'"]([^\'"]+)[\'"]\)', webpage, 'src url', default=None)
+        activation_url = self._search_regex(
+            r'loadUrl\([\'"]([^\'"]+/activate)[\'"]', webpage, 'activation url', default=None)
+        ping_url = self._search_regex(
+            r'loadUrl\([\'"]([^\'"]+/ping)[\'"]', webpage, 'ping url', default=None)
+        secret = traverse_obj(parse_qs(src_url), ('secret', 0))
+        context_id = traverse_obj(parse_qs(src_url), ('contextId', 0))
+        ping_data = {}
+        if src_url and activation_url and ping_url and secret and context_id:
+            self._download_webpage(
+                activation_url, video_id, headers=headers, note='Downloading activation data')
+
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                src_url, video_id, 'mp4', headers=headers, m3u8_id='hls', fatal=False)
+            for fmt in fmts:
+                fmt.update({
+                    'protocol': 'bunnycdn',
+                    'http_headers': headers,
+                })
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+
+            ping_data = {
+                '_bunnycdn_ping_data': {
+                    'url': ping_url,
+                    'headers': headers,
+                    'secret': secret,
+                    'context_id': context_id,
+                },
+            }
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(webpage, ({find_element(id='main-video', html=True)}, {extract_attributes}, {
+                'title': ('data-plyr-config', {json.loads}, 'title', {str}),
+                'thumbnail': ('data-poster', {url_or_none}),
+            })),
+            **ping_data,
+            **self._search_json_ld(webpage, video_id, fatal=False),
+        }
diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py
index 773ddd3445..d35214aa84 100644
--- a/yt_dlp/extractor/sovietscloset.py
+++ b/yt_dlp/extractor/sovietscloset.py
@@ -1,5 +1,6 @@
+from .bunnycdn import BunnyCdnIE
 from .common import InfoExtractor
-from ..utils import try_get, unified_timestamp
+from ..utils import make_archive_id, try_get, unified_timestamp
 
 
 class SovietsClosetBaseIE(InfoExtractor):
@@ -43,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE):
             'url': 'https://sovietscloset.com/video/1337',
             'md5': 'bd012b04b261725510ca5383074cdd55',
             'info_dict': {
-                'id': '1337',
+                'id': '2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67',
                 'ext': 'mp4',
                 'title': 'The Witcher #13',
                 'thumbnail': r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$',
@@ -55,20 +56,23 @@ class SovietsClosetIE(SovietsClosetBaseIE):
                 'upload_date': '20170413',
                 'uploader_id': 'SovietWomble',
                 'uploader_url': 'https://www.twitch.tv/SovietWomble',
-                'duration': 7007,
+                'duration': 7008,
                 'was_live': True,
                 'availability': 'public',
                 'series': 'The Witcher',
                 'season': 'Misc',
                 'episode_number': 13,
                 'episode': 'Episode 13',
+                'creators': ['SovietWomble'],
+                'description': '',
+                '_old_archive_ids': ['sovietscloset 1337'],
             },
         },
         {
             'url': 'https://sovietscloset.com/video/1105',
             'md5': '89fa928f183893cb65a0b7be846d8a90',
             'info_dict': {
-                'id': '1105',
+                'id': 'c0e5e76f-3a93-40b4-bf01-12343c2eec5d',
                 'ext': 'mp4',
                 'title': 'Arma 3 - Zeus Games #5',
                 'uploader': 'SovietWomble',
@@ -80,39 +84,20 @@ class SovietsClosetIE(SovietsClosetBaseIE):
                 'upload_date': '20160420',
                 'uploader_id': 'SovietWomble',
                 'uploader_url': 'https://www.twitch.tv/SovietWomble',
-                'duration': 8804,
+                'duration': 8805,
                 'was_live': True,
                 'availability': 'public',
                 'series': 'Arma 3',
                 'season': 'Zeus Games',
                 'episode_number': 5,
                 'episode': 'Episode 5',
+                'creators': ['SovietWomble'],
+                'description': '',
+                '_old_archive_ids': ['sovietscloset 1105'],
             },
         },
     ]
 
-    def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id):
-        iframe = self._download_webpage(
-            f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}',
-            video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER)
-
-        m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url')
-        thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url')
-
-        m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER)
-
-        if not m3u8_formats:
-            duration = None
-        else:
-            duration = self._extract_m3u8_vod_duration(
-                m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER)
-
-        return {
-            'formats': m3u8_formats,
-            'thumbnail': thumbnail_url,
-            'duration': duration,
-        }
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -122,13 +107,13 @@ class SovietsClosetIE(SovietsClosetBaseIE):
 
         stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream']
 
-        return {
+        return self.url_result(
+            f'https://iframe.mediadelivery.net/embed/5105/{stream["bunnyId"]}', ie=BunnyCdnIE, url_transparent=True,
             **self.video_meta(
                 video_id=video_id, game_name=stream['game']['name'],
                 category_name=try_get(stream, lambda x: x['subcategory']['name'], str),
                 episode_number=stream.get('number'), stream_date=stream.get('date')),
-            **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']),
-        }
+            _old_archive_ids=[make_archive_id(self, video_id)])
 
 
 class SovietsClosetPlaylistIE(SovietsClosetBaseIE):