diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 895390d6cf..5751383712 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -342,7 +342,6 @@ def can_merge_formats(cls, info_dict, params): and cls.can_download(info_dict)) def _call_downloader(self, tmpfilename, info_dict): - urls = [f['url'] for f in info_dict.get('requested_formats', [])] or [info_dict['url']] ffpp = FFmpegPostProcessor(downloader=self) if not ffpp.available: self.report_error('m3u8 download detected but ffmpeg could not be found. Please install') @@ -372,16 +371,6 @@ def _call_downloader(self, tmpfilename, info_dict): # http://trac.ffmpeg.org/ticket/6125#comment:10 args += ['-seekable', '1' if seekable else '0'] - http_headers = None - if info_dict.get('http_headers'): - youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers']) - http_headers = [ - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - '-headers', - ''.join(f'{key}: {val}\r\n' for key, val in youtubedl_headers.items()) - ] - env = None proxy = self.params.get('proxy') if proxy: @@ -434,21 +423,26 @@ def _call_downloader(self, tmpfilename, info_dict): start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end') - for i, url in enumerate(urls): - if http_headers is not None and re.match(r'^https?://', url): - args += http_headers + selected_formats = info_dict.get('requested_formats') or [info_dict] + for i, fmt in enumerate(selected_formats): + if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): + headers_dict = handle_youtubedl_headers(fmt['http_headers']) + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in headers_dict.items())]) + if start_time: args += ['-ss', str(start_time)] if end_time: args += ['-t', str(end_time - start_time)] - args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url] + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', fmt['url']] if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'): args += ['-c', 'copy'] if info_dict.get('requested_formats') or protocol == 'http_dash_segments': - for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): + for i, fmt in enumerate(selected_formats): stream_number = fmt.get('manifest_stream_number', 0) args.extend(['-map', f'{i}:{stream_number}']) @@ -488,8 +482,9 @@ def _call_downloader(self, tmpfilename, info_dict): args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) + piped = any(fmt['url'] in ('-', 'pipe:') for fmt in selected_formats) with Popen(args, stdin=subprocess.PIPE, env=env) as proc: - if url in ('-', 'pipe:'): + if piped: self.on_process_started(proc, proc.stdin) try: retval = proc.wait() @@ -499,7 +494,7 @@ def _call_downloader(self, tmpfilename, info_dict): # produces a file that is playable (this is mostly useful for live # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and not piped: proc.communicate_or_kill(b'q') else: proc.kill(timeout=None) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c90d7b7f64..a12328f04a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -78,6 +78,7 @@ WyborczaVideoIE, ) from .airmozilla import AirMozillaIE +from .airtv import AirTVIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE @@ -536,7 +537,7 @@ ESPNCricInfoIE, ) from .esri import EsriVideoIE -from .europa import EuropaIE +from .europa import EuropaIE, EuroParlWebstreamIE from .europeantour import EuropeanTourIE from .eurosport import EurosportIE from .euscreen import EUScreenIE @@ -1281,6 +1282,7 @@ from .ondemandkorea import OnDemandKoreaIE from .onefootball import OneFootballIE from .onenewsnz import OneNewsNZIE +from .oneplace import OnePlacePodcastIE from .onet import ( OnetIE, OnetChannelIE, diff --git a/yt_dlp/extractor/airtv.py b/yt_dlp/extractor/airtv.py new file mode 100644 index 0000000000..0b73a966ed --- /dev/null +++ b/yt_dlp/extractor/airtv.py @@ -0,0 +1,96 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + determine_ext, + int_or_none, + mimetype2ext, + parse_iso8601, + traverse_obj +) + + +class AirTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.air\.tv/watch\?v=(?P\w+)' + _TESTS = [{ + # without youtube_id + 'url': 'https://www.air.tv/watch?v=W87jcWleSn2hXZN47zJZsQ', + 'info_dict': { + 'id': 'W87jcWleSn2hXZN47zJZsQ', + 'ext': 'mp4', + 'release_date': '20221003', + 'release_timestamp': 1664792603, + 'channel_id': 'vgfManQlRQKgoFQ8i8peFQ', + 'title': 'md5:c12d49ed367c3dadaa67659aff43494c', + 'upload_date': '20221003', + 'duration': 151, + 'view_count': int, + 'thumbnail': 'https://cdn-sp-gcs.air.tv/videos/W/8/W87jcWleSn2hXZN47zJZsQ/b13fc56464f47d9d62a36d110b9b5a72-4096x2160_9.jpg', + 'timestamp': 1664792603, + } + }, { + # with youtube_id + 'url': 'https://www.air.tv/watch?v=sv57EC8tRXG6h8dNXFUU1Q', + 'info_dict': { + 'id': '2ZTqmpee-bQ', + 'ext': 'mp4', + 'comment_count': int, + 'tags': 'count:11', + 'channel_follower_count': int, + 'like_count': int, + 'uploader': 'Newsflare', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2ZTqmpee-bQ/maxresdefault.webp', + 'availability': 'public', + 'title': 'Geese Chase Alligator Across Golf Course', + 'uploader_id': 'NewsflareBreaking', + 'channel_url': 'https://www.youtube.com/channel/UCzSSoloGEz10HALUAbYhngQ', + 'description': 'md5:99b21d9cea59330149efbd9706e208f5', + 'age_limit': 0, + 'channel_id': 'UCzSSoloGEz10HALUAbYhngQ', + 'uploader_url': 'http://www.youtube.com/user/NewsflareBreaking', + 'view_count': int, + 'categories': ['News & Politics'], + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'Newsflare', + 'duration': 37, + 'upload_date': '20180511', + } + }] + + def _get_formats_and_subtitle(self, json_data, video_id): + formats, subtitles = [], {} + for source in traverse_obj(json_data, 'sources', 'sources_desktop', ...): + ext = determine_ext(source.get('src'), mimetype2ext(source.get('type'))) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('src'), video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({'url': source.get('src'), 'ext': ext}) + return formats, subtitles + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['initialState']['videos'][display_id] + if nextjs_json.get('youtube_id'): + return self.url_result( + f'https://www.youtube.com/watch?v={nextjs_json.get("youtube_id")}', YoutubeIE) + + formats, subtitles = self._get_formats_and_subtitle(nextjs_json, display_id) + return { + 'id': display_id, + 'title': nextjs_json.get('title') or self._html_search_meta('og:title', webpage), + 'formats': formats, + 'subtitles': subtitles, + 'description': nextjs_json.get('description') or None, + 'duration': int_or_none(nextjs_json.get('duration')), + 'thumbnails': [ + {'url': thumbnail} + for thumbnail in traverse_obj(nextjs_json, ('default_thumbnails', ...))], + 'channel_id': traverse_obj(nextjs_json, 'channel', 'channel_slug'), + 'timestamp': parse_iso8601(nextjs_json.get('created')), + 'release_timestamp': parse_iso8601(nextjs_json.get('published')), + 'view_count': int_or_none(nextjs_json.get('views')), + } diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index c2b4937658..29daabe4a3 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -3,6 +3,7 @@ int_or_none, orderedSet, parse_duration, + parse_iso8601, parse_qs, qualities, unified_strdate, @@ -87,3 +88,86 @@ def get_item(type_, preference): 'view_count': view_count, 'formats': formats } + + +class EuroParlWebstreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:multimedia|webstreaming)\.europarl\.europa\.eu/[^/#?]+/ + (?:embed/embed\.html\?event=|(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) + ''' + _TESTS = [{ + 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', + 'info_dict': { + 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'ext': 'mp4', + 'release_timestamp': 1663137900, + 'title': 'Plenary session', + 'release_date': '20220914', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/eu-cop27-un-climate-change-conference-in-sharm-el-sheikh-egypt-ep-delegation-meets-with-ngo-represen_20221114-1600-SPECIAL-OTHER', + 'info_dict': { + 'id': 'a8428de8-b9cd-6a2e-11e4-3805d9c9ff5c', + 'ext': 'mp4', + 'release_timestamp': 1668434400, + 'release_date': '20221114', + 'title': 'md5:d3550280c33cc70e0678652e3d52c028', + }, + 'params': { + 'skip_download': True, + } + }, { + # embed webpage + 'url': 'https://webstreaming.europarl.europa.eu/ep/embed/embed.html?event=20220914-0900-PLENARY&language=en&autoplay=true&logo=true', + 'info_dict': { + 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'ext': 'mp4', + 'title': 'Plenary session', + 'release_date': '20220914', + 'release_timestamp': 1663137900, + }, + 'params': { + 'skip_download': True, + } + }, { + # live webstream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + 'info_dict': { + 'ext': 'mp4', + 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', + 'release_timestamp': 1668502800, + 'title': 'Euroscola 2022-11-15 19:21', + 'release_date': '20221115', + 'live_status': 'is_live', + }, + 'skip': 'not live anymore' + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + json_info = self._download_json( + 'https://vis-api.vuplay.co.uk/event/external', display_id, + query={ + 'player_key': 'europarl|718f822c-a48c-4841-9947-c9cb9bb1743c', + 'external_id': display_id, + }) + + formats, subtitles = self._extract_mpd_formats_and_subtitles(json_info['streaming_url'], display_id) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + json_info['streaming_url'].replace('.mpd', '.m3u8'), display_id) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': json_info['id'], + 'title': json_info.get('title'), + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': parse_iso8601(json_info.get('published_start')), + 'is_live': 'LIVE' in json_info.get('state', '') + } diff --git a/yt_dlp/extractor/foxsports.py b/yt_dlp/extractor/foxsports.py index f9d7fe52ae..f906a1718d 100644 --- a/yt_dlp/extractor/foxsports.py +++ b/yt_dlp/extractor/foxsports.py @@ -1,31 +1,51 @@ from .common import InfoExtractor +from .uplynk import UplynkPreplayIE +from ..utils import HEADRequest, float_or_none, make_archive_id, smuggle_url class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P\d+)' - - _TEST = { - 'url': 'http://www.foxsports.com/tennessee/video/432609859715', - 'md5': 'b49050e955bebe32c301972e4012ac17', + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/watch/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.foxsports.com/watch/play-612168c6700004b', 'info_dict': { - 'id': '432609859715', + 'id': 'b72f5bd8658140baa5791bb676433733', 'ext': 'mp4', - 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', - 'description': 'Courtney Lee talks about Memphis being focused.', - # TODO: fix timestamp - 'upload_date': '19700101', # '20150423', - # 'timestamp': 1429761109, - 'uploader': 'NEWA-FNG-FOXSPORTS', + 'display_id': 'play-612168c6700004b', + 'title': 'md5:e0c4ecac3a1f25295b4fae22fb5c126a', + 'description': 'md5:371bc43609708ae2b9e1a939229762af', + 'uploader_id': '06b4a36349624051a9ba52ac3a91d268', + 'upload_date': '20221205', + 'timestamp': 1670262586, + 'duration': 31.7317, + 'thumbnail': r're:^https?://.*\.jpg$', + 'extra_param_to_segment_url': str, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, - 'add_ie': ['ThePlatform'], - } + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_ld = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + data = self._download_json( + f'https://api3.fox.com/v2.0/vodplayer/sportsclip/{video_id}', + video_id, note='Downloading API JSON', headers={ + 'x-api-key': 'cf289e299efdfa39fb6316f259d1de93', + }) + preplay_url = self._request_webpage( + HEADRequest(data['url']), video_id, 'Fetching preplay URL').geturl() - return self.url_result( - 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed') + return { + '_type': 'url_transparent', + 'ie_key': UplynkPreplayIE.ie_key(), + 'url': smuggle_url(preplay_url, {'Origin': 'https://www.foxsports.com'}), + 'display_id': video_id, + 'title': data.get('name') or json_ld.get('title'), + 'description': data.get('description') or json_ld.get('description'), + 'duration': float_or_none(data.get('durationInSeconds')), + 'timestamp': json_ld.get('timestamp'), + 'thumbnails': json_ld.get('thumbnails'), + '_old_archive_ids': [make_archive_id(self, video_id)], + } diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index bf3c9c1e8c..2281c71f3d 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2356,7 +2356,7 @@ def _real_extract(self, url): info_dict.update({ 'formats': formats, 'subtitles': subtitles, - 'http_headers': headers, + 'http_headers': headers or None, }) return info_dict diff --git a/yt_dlp/extractor/netverse.py b/yt_dlp/extractor/netverse.py index 719a9dabe2..3c4fd92eb0 100644 --- a/yt_dlp/extractor/netverse.py +++ b/yt_dlp/extractor/netverse.py @@ -1,3 +1,5 @@ +import itertools + from .common import InfoExtractor from .dailymotion import DailymotionIE from ..utils import smuggle_url, traverse_obj @@ -16,6 +18,26 @@ def _call_api(self, slug, endpoint, query={}, season_id='', display_id=None): f'https://api.netverse.id/medias/api/v2/{self._ENDPOINTS[endpoint]}/{slug}/{season_id}', display_id or slug, query=query) + def _get_comments(self, video_id): + last_page_number = None + for i in itertools.count(1): + comment_data = self._download_json( + f'https://api.netverse.id/mediadetails/api/v3/videos/comments/{video_id}', + video_id, data=b'', fatal=False, query={'page': i}, + note=f'Downloading JSON comment metadata page {i}') or {} + yield from traverse_obj(comment_data, ('response', 'comments', 'data', ..., { + 'id': '_id', + 'text': 'comment', + 'author_id': 'customer_id', + 'author': ('customer', 'name'), + 'author_thumbnail': ('customer', 'profile_picture'), + })) + + if not last_page_number: + last_page_number = traverse_obj(comment_data, ('response', 'comments', 'last_page')) + if i >= (last_page_number or 0): + break + class NetverseIE(NetverseBaseIE): _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?Pwatch|video)/(?P[^/?#&]+)' @@ -28,7 +50,7 @@ class NetverseIE(NetverseBaseIE): 'ext': 'mp4', 'season': 'Season 2016', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T7aV31Y0eGRWBbwkK/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 22, 'episode': 'Episode 22', 'uploader_id': 'x2ir3vq', @@ -51,7 +73,7 @@ class NetverseIE(NetverseBaseIE): 'ext': 'mp4', 'season': 'Season 2', 'description': 'md5:8a74f70812cca267e19ee0635f0af835', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/Thwuy1YURicFmGu0v/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 2, 'episode': 'Episode 2', 'view_count': int, @@ -75,7 +97,7 @@ class NetverseIE(NetverseBaseIE): 'title': 'Tetangga Baru', 'season': 'Season 1', 'description': 'md5:23fcf70e97d461d3029d25d59b2ccfb9', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T3Ogm1YEnnyjVKAFF/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 1, 'episode': 'Episode 1', 'timestamp': 1624538169, @@ -96,7 +118,7 @@ class NetverseIE(NetverseBaseIE): 'info_dict': { 'id': 'x887jzz', 'ext': 'mp4', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TfuZ_1Y6PboJ5An_s/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'season': 'Season 1', 'episode_number': 1, 'description': 'md5:d4f627b3e7a3f9acdc55f6cdd5ea41d5', @@ -114,6 +136,60 @@ class NetverseIE(NetverseBaseIE): 'upload_date': '20220225', }, 'skip': 'This video get Geo-blocked for some country' + }, { + # video with comments + 'url': 'https://netverse.id/video/episode-1-season-2016-ok-food', + 'info_dict': { + 'id': 'k6hetBPiQMljSxxvAy7', + 'ext': 'mp4', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', + 'display_id': 'episode-1-season-2016-ok-food', + 'like_count': int, + 'description': '', + 'duration': 1471, + 'age_limit': 0, + 'timestamp': 1642405848, + 'episode_number': 1, + 'season': 'Season 2016', + 'uploader_id': 'x2ir3vq', + 'title': 'Episode 1 - Season 2016 - Ok Food', + 'upload_date': '20220117', + 'tags': [], + 'view_count': int, + 'episode': 'Episode 1', + 'uploader': 'Net Prime', + 'comment_count': int, + }, + 'params':{ + 'getcomments': True + } + }, { + # video with multiple page comment + 'url': 'https://netverse.id/video/match-island-eps-1-fix', + 'info_dict': { + 'id': 'x8aznjc', + 'ext': 'mp4', + 'like_count': int, + 'tags': ['Match-Island', 'Pd00111'], + 'display_id': 'match-island-eps-1-fix', + 'view_count': int, + 'episode': 'Episode 1', + 'uploader': 'Net Prime', + 'duration': 4070, + 'timestamp': 1653068165, + 'description': 'md5:e9cf3b480ad18e9c33b999e3494f223f', + 'age_limit': 0, + 'title': 'Welcome To Match Island', + 'upload_date': '20220520', + 'episode_number': 1, + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', + 'uploader_id': 'x2ir3vq', + 'season': 'Season 1', + 'comment_count': int, + }, + 'params':{ + 'getcomments': True + } }] def _real_extract(self, url): @@ -131,6 +207,7 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(videos, ('program_detail', 'thumbnail_image')), 'description': traverse_obj(videos, ('program_detail', 'description')), 'episode_number': videos.get('episode_order'), + '__post_extractor': self.extract_comments(display_id), } diff --git a/yt_dlp/extractor/nosnl.py b/yt_dlp/extractor/nosnl.py index eba94c416d..cea54c98e2 100644 --- a/yt_dlp/extractor/nosnl.py +++ b/yt_dlp/extractor/nosnl.py @@ -3,7 +3,7 @@ class NOSNLArticleIE(InfoExtractor): - _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P[\w-]+)' + _VALID_URL = r'https?://nos\.nl/(?Pvideo|(\w+/)?\w+)/?\d+-(?P[\w-]+)' _TESTS = [ { # only 1 video @@ -22,13 +22,14 @@ class NOSNLArticleIE(InfoExtractor): 'info_dict': { 'id': '2440409', 'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten', - 'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.', + 'description': 'md5:72b1e1674d798460e79d78fa37e9f56d', 'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'], 'modified_timestamp': 1660452773, 'modified_date': '20220814', 'upload_date': '20220813', 'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg', 'timestamp': 1660401384, + 'categories': ['Regionaal nieuws', 'Binnenland'], }, 'playlist_count': 2, }, { @@ -37,20 +38,37 @@ class NOSNLArticleIE(InfoExtractor): 'info_dict': { 'id': '2440789', 'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ', - 'description': 'Nieuws, weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.', + 'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641', 'tags': ['wekdienst'], 'modified_date': '20220816', 'modified_timestamp': 1660625449, 'timestamp': 1660625449, 'upload_date': '20220816', 'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg', + 'categories': ['Binnenland', 'Buitenland'], }, 'playlist_count': 2, + }, { + # video url + 'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt', + 'info_dict': { + 'id': '2452718', + 'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'', + 'modified_date': '20221117', + 'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f', + 'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'], + 'upload_date': '20221117', + 'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg', + 'modified_timestamp': 1668663388, + 'timestamp': 1668663388, + 'categories': ['Buitenland'], + }, + 'playlist_mincount': 1, } ] def _entries(self, nextjs_json, display_id): - for item in nextjs_json['items']: + for item in nextjs_json: if item.get('type') == 'video': formats, subtitle = self._extract_m3u8_formats_and_subtitles( traverse_obj(item, ('source', 'url')), display_id, ext='mp4') @@ -77,13 +95,14 @@ def _entries(self, nextjs_json, display_id): } def _real_extract(self, url): - display_id = self._match_valid_url(url).group('display_id') + site_type, display_id = self._match_valid_url(url).group('type', 'display_id') webpage = self._download_webpage(url, display_id) nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data'] return { '_type': 'playlist', - 'entries': self._entries(nextjs_json, display_id), + 'entries': self._entries( + [nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id), 'id': str(nextjs_json['id']), 'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage), 'description': (nextjs_json.get('description') @@ -91,5 +110,6 @@ def _real_extract(self, url): 'tags': nextjs_json.get('keywords'), 'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')), 'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage), - 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')) + 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')), + 'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')), } diff --git a/yt_dlp/extractor/oneplace.py b/yt_dlp/extractor/oneplace.py new file mode 100644 index 0000000000..86337ad0ad --- /dev/null +++ b/yt_dlp/extractor/oneplace.py @@ -0,0 +1,43 @@ +from .common import InfoExtractor + + +class OnePlacePodcastIE(InfoExtractor): + _VALID_URL = r'https?://www\.oneplace\.com/[\w]+/[^/]+/listen/[\w-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.oneplace.com/ministries/a-daily-walk/listen/living-in-the-last-days-part-2-958461.html', + 'info_dict': { + 'id': '958461', + 'ext': 'mp3', + 'title': 'Living in the Last Days Part 2 | A Daily Walk with John Randall', + 'description': 'md5:fbb8f1cf21447ac54ecaa2887fc20c6e', + } + }, { + 'url': 'https://www.oneplace.com/ministries/ankerberg-show/listen/ep-3-relying-on-the-constant-companionship-of-the-holy-spirit-part-2-922513.html', + 'info_dict': { + 'id': '922513', + 'ext': 'mp3', + 'description': 'md5:8b810b4349aa40a5d033b4536fe428e1', + 'title': 'md5:ce10f7d8d5ddcf485ed8905ef109659d', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'url': self._search_regex(( + r'mp3-url\s*=\s*"([^"]+)', + r']+id\s*=\s*"player"[^>]+data-media-url\s*=\s*"(?P[^"]+)', + ), webpage, 'media url'), + 'ext': 'mp3', + 'vcodec': 'none', + 'title': self._html_search_regex(( + r']class\s*=\s*"details"[^>]+>[^<]]+>(?P[^>]+)>', + self._meta_regex('og:title'), self._meta_regex('title'), + ), webpage, 'title', group='content', default=None), + 'description': self._html_search_regex( + r']+class="[^"]+epDesc"[^>]*>\s*(?P.+?)\s*', + webpage, 'description', default=None), + } diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 2c6cd6d4bb..8361fbbc5f 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -1,19 +1,24 @@ import json from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, - try_get, + str_or_none, + strip_or_none, + traverse_obj, unified_timestamp, url_or_none, ) class PinterestBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + _VALID_URL_BASE = r'''(?x) + https?://(?:[^/]+\.)?pinterest\.(?: + com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx| + dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu| + co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)''' def _call_api(self, resource, video_id, options): return self._download_json( @@ -24,14 +29,53 @@ def _call_api(self, resource, video_id, options): def _extract_video(self, data, extract_formats=True): video_id = data['id'] + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) - title = (data.get('title') or data.get('grid_title') or video_id).strip() + info = { + 'title': strip_or_none(traverse_obj(data, 'title', 'grid_title', default='')), + 'description': traverse_obj(data, 'seo_description', 'description'), + 'timestamp': unified_timestamp(data.get('created_at')), + 'thumbnails': thumbnails, + 'uploader': traverse_obj(data, ('closeup_attribution', 'full_name')), + 'uploader_id': str_or_none(traverse_obj(data, ('closeup_attribution', 'id'))), + 'repost_count': int_or_none(data.get('repin_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': traverse_obj(data, ('pin_join', 'visual_annotation'), expected_type=list), + 'tags': traverse_obj(data, 'hashtags', expected_type=list), + } urls = [] formats = [] duration = None - if extract_formats: - for format_id, format_dict in data['videos']['video_list'].items(): + domain = data.get('domain', '') + if domain.lower() != 'uploaded by user' and traverse_obj(data, ('embed', 'src')): + if not info['title']: + info['title'] = None + return { + '_type': 'url_transparent', + 'url': data['embed']['src'], + **info, + } + + elif extract_formats: + video_list = traverse_obj( + data, ('videos', 'video_list'), + ('story_pin_data', 'pages', ..., 'blocks', ..., 'video', 'video_list'), + expected_type=dict, get_all=False, default={}) + for format_id, format_dict in video_list.items(): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) @@ -53,72 +97,79 @@ def _extract_video(self, data, extract_formats=True): 'duration': duration, }) - description = data.get('description') or data.get('description_html') or data.get('seo_description') - timestamp = unified_timestamp(data.get('created_at')) - - def _u(field): - return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) - - uploader = _u('full_name') - uploader_id = _u('id') - - repost_count = int_or_none(data.get('repin_count')) - comment_count = int_or_none(data.get('comment_count')) - categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) - tags = data.get('hashtags') - - thumbnails = [] - images = data.get('images') - if isinstance(images, dict): - for thumbnail_id, thumbnail in images.items(): - if not isinstance(thumbnail, dict): - continue - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - return { 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'repost_count': repost_count, - 'comment_count': comment_count, - 'categories': categories, - 'tags': tags, 'formats': formats, + 'duration': duration, + 'webpage_url': f'https://www.pinterest.com/pin/{video_id}/', 'extractor_key': PinterestIE.ie_key(), + 'extractor': PinterestIE.IE_NAME, + **info, } class PinterestIE(PinterestBaseIE): _VALID_URL = r'%s/pin/(?P\d+)' % PinterestBaseIE._VALID_URL_BASE _TESTS = [{ + # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', 'info_dict': { 'id': '664281013778109217', 'ext': 'mp4', 'title': 'Origami', - 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'description': 'md5:e29801cab7d741ea8c741bc50c8d00ab', 'duration': 57.7, 'timestamp': 1593073622, 'upload_date': '20200625', - 'uploader': 'Love origami -I am Dafei', - 'uploader_id': '586523688879454212', - 'repost_count': 50, - 'comment_count': 0, + 'repost_count': int, + 'comment_count': int, 'categories': list, 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # formats found in data['story_pin_data'] + 'url': 'https://www.pinterest.com/pin/1084663891475263837/', + 'md5': '069ac19919ab9e1e13fa60de46290b03', + 'info_dict': { + 'id': '1084663891475263837', + 'ext': 'mp4', + 'title': 'Gadget, Cool products, Amazon product, technology, Kitchen gadgets', + 'description': 'md5:d0a4b6ae996ff0c6eed83bc869598d13', + 'uploader': 'CoolCrazyGadgets', + 'uploader_id': '1084664028912989237', + 'upload_date': '20211003', + 'timestamp': 1633246654.0, + 'duration': 14.9, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # vimeo.com embed + 'url': 'https://www.pinterest.ca/pin/441282463481903715/', + 'info_dict': { + 'id': '111691128', + 'ext': 'mp4', + 'title': 'Tonite Let\'s All Make Love In London (1967)', + 'description': 'md5:8190f37b3926807809ec57ec21aa77b2', + 'uploader': 'Vimeo', + 'uploader_id': '473792960706651251', + 'upload_date': '20180120', + 'timestamp': 1516409040, + 'duration': 3404, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'uploader_url': 'https://vimeo.com/willardandrade', + }, + 'params': { + 'skip_download': 'm3u8', }, }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index 5a4fd975e0..97e6354b42 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -91,12 +91,12 @@ def _download_and_extract_formats(self, video_id, query=None): class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P[\da-z]{32})' _EMBED_REGEX = [r']+?src=(["\'])(?P(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': '1d24f180fac7a02f3900712e5a5764d6', + 'md5': 'e33ac625efca66aba86cbec9851f2692', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', 'ext': 'mp4', @@ -108,6 +108,10 @@ class RutubeIE(RutubeBaseIE): 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', + 'category': ['Новости и СМИ'], + }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', @@ -121,6 +125,24 @@ class RutubeIE(RutubeBaseIE): }, { 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg', + 'md5': 'd106225f15d625538fe22971158e896f', + 'info_dict': { + 'id': '884fb55f07a97ab673c7d654553e0f48', + 'ext': 'mp4', + 'title': 'Яцуноками, Nioh2', + 'description': 'Nioh2: финал сражения с боссом Яцуноками', + 'duration': 15, + 'uploader': 'mexus', + 'uploader_id': '24222106', + 'timestamp': 1670646232, + 'upload_date': '20221210', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', + 'category': ['Видеоигры'], + }, }] @classmethod @@ -129,8 +151,9 @@ def suitable(cls, url): def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_and_extract_info(video_id) - info['formats'] = self._download_and_extract_formats(video_id) + query = parse_qs(url) + info = self._download_and_extract_info(video_id, query) + info['formats'] = self._download_and_extract_formats(video_id, query) return info diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 9a60a79e73..86c26a8a2b 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -1,92 +1,176 @@ from .common import InfoExtractor from ..utils import ( - bool_or_none, smuggle_url, - try_get, + traverse_obj, + unified_timestamp, url_or_none, ) class SlidesLiveIE(InfoExtractor): _VALID_URL = r'https?://slideslive\.com/(?P[0-9]+)' - _WORKING = False _TESTS = [{ - # video_service_name = YOUTUBE + # service_name = yoda 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', - 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', 'info_dict': { - 'id': 'LMtgR8ba0b0', + 'id': '38902413', 'ext': 'mp4', 'title': 'GCC IA16 backend', - 'description': 'Watch full version of this video at https://slideslive.com/38902413.', - 'uploader': 'SlidesLive Videos - A', - 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', - 'timestamp': 1597615266, - 'upload_date': '20170925', - } - }, { - # video_service_name = yoda - 'url': 'https://slideslive.com/38935785', - 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', - 'info_dict': { - 'id': 'RMraDYN5ozA_', - 'ext': 'mp4', - 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + 'timestamp': 1648189972, + 'upload_date': '20220325', + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', }, }, { - # video_service_name = youtube + # service_name = yoda + 'url': 'https://slideslive.com/38935785', + 'info_dict': { + 'id': '38935785', + 'ext': 'mp4', + 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + 'upload_date': '20211115', + 'timestamp': 1636996003, + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = yoda + 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics', + 'info_dict': { + 'id': '38973182', + 'ext': 'mp4', + 'title': 'How Should a Machine Learning Researcher Think About AI Ethics?', + 'upload_date': '20220201', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1643728135, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = youtube + 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost', + 'md5': '8a79b5e3d700837f40bd2afca3c8fa01', + 'info_dict': { + 'id': 'jmg02wCJD5M', + 'display_id': '38897546', + 'ext': 'mp4', + 'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost', + 'description': 'Watch full version of this video at https://slideslive.com/38897546.', + 'channel_url': 'https://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw', + 'channel': 'SlidesLive Videos - G1', + 'channel_id': 'UCZWdAkNYFncuX0khyvhqnxw', + 'uploader_id': 'UCZWdAkNYFncuX0khyvhqnxw', + 'uploader': 'SlidesLive Videos - G1', + 'uploader_url': 'http://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw', + 'live_status': 'not_live', + 'upload_date': '20160710', + 'timestamp': 1618786715, + 'duration': 6827, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https?://.*\.jpg', + 'playable_in_embed': True, + 'availability': 'unlisted', + 'tags': [], + 'categories': ['People & Blogs'], + }, + }, { + # service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', 'only_matching': True, }, { - # video_service_name = url + # service_name = url 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', 'only_matching': True, }, { - # video_service_name = vimeo + # service_name = vimeo 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', 'only_matching': True, }] + def _extract_custom_m3u8_info(self, m3u8_data): + m3u8_dict = {} + + lookup = { + 'PRESENTATION-TITLE': 'title', + 'PRESENTATION-UPDATED-AT': 'timestamp', + 'PRESENTATION-THUMBNAIL': 'thumbnail', + 'PLAYLIST-TYPE': 'playlist_type', + 'VOD-VIDEO-SERVICE-NAME': 'service_name', + 'VOD-VIDEO-ID': 'service_id', + 'VOD-VIDEO-SERVERS': 'video_servers', + 'VOD-SUBTITLES': 'subtitles', + } + + for line in m3u8_data.splitlines(): + if not line.startswith('#EXT-SL-'): + continue + tag, _, value = line.partition(':') + key = lookup.get(tag.lstrip('#EXT-SL-')) + if not key: + continue + m3u8_dict[key] = value + + # Some values are stringified JSON arrays + for key in ('video_servers', 'subtitles'): + if key in m3u8_dict: + m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or [] + + return m3u8_dict + def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://ben.slideslive.com/player/' + video_id, video_id) - service_name = video_data['video_service_name'].lower() + webpage = self._download_webpage(url, video_id) + player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token') + player_data = self._download_webpage( + f'https://ben.slideslive.com/player/{video_id}', video_id, + note='Downloading player info', query={'player_token': player_token}) + player_info = self._extract_custom_m3u8_info(player_data) + + service_name = player_info['service_name'].lower() assert service_name in ('url', 'yoda', 'vimeo', 'youtube') - service_id = video_data['video_service_id'] + service_id = player_info['service_id'] + subtitles = {} - for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: - if not isinstance(sub, dict): - continue + for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict): webvtt_url = url_or_none(sub.get('webvtt_url')) if not webvtt_url: continue - lang = sub.get('language') or 'en' - subtitles.setdefault(lang, []).append({ + subtitles.setdefault(sub.get('language') or 'en', []).append({ 'url': webvtt_url, + 'ext': 'vtt', }) + info = { 'id': video_id, - 'thumbnail': video_data.get('thumbnail'), - 'is_live': bool_or_none(video_data.get('is_live')), + 'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''), + 'timestamp': unified_timestamp(player_info.get('timestamp')), + 'is_live': player_info.get('playlist_type') != 'vod', + 'thumbnail': url_or_none(player_info.get('thumbnail')), 'subtitles': subtitles, } + if service_name in ('url', 'yoda'): - info['title'] = video_data['title'] if service_name == 'url': info['url'] = service_id else: + cdn_hostname = player_info['video_servers'][0] formats = [] - _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' - # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats.extend(self._extract_m3u8_formats( - _MANIFEST_PATTERN % (service_id, 'm3u8'), - service_id, 'mp4', m3u8_id='hls', fatal=False)) + f'https://{cdn_hostname}/{service_id}/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)) formats.extend(self._extract_mpd_formats( - _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, - mpd_id='dash', fatal=False)) + f'https://{cdn_hostname}/{service_id}/master.mpd', + video_id, mpd_id='dash', fatal=False)) info.update({ - 'id': service_id, 'formats': formats, }) else: @@ -94,10 +178,11 @@ def _real_extract(self, url): '_type': 'url_transparent', 'url': service_id, 'ie_key': service_name.capitalize(), - 'title': video_data.get('title'), + 'display_id': video_id, }) if service_name == 'vimeo': info['url'] = smuggle_url( - 'https://player.vimeo.com/video/' + service_id, + f'https://player.vimeo.com/video/{service_id}', {'http_headers': {'Referer': url}}) + return info diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 18ebb3617f..a4e280c82b 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -293,7 +293,7 @@ def _real_extract(self, url): class TwitterIE(TwitterBaseIE): IE_NAME = 'twitter' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P\d+)' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P\d+)(?:/video/(?P\d+))?' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -336,7 +336,7 @@ class TwitterIE(TwitterBaseIE): 'id': '665052190608723968', 'display_id': '665052190608723968', 'ext': 'mp4', - 'title': 'md5:55fef1d5b811944f1550e91b44abb82e', + 'title': 'md5:e99588f17b3dd0503814ffb560e64731', 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', 'uploader': r're:Star Wars.*', @@ -648,7 +648,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/Rizdraws', 'upload_date': '20220928', 'timestamp': 1664391723, - 'thumbnail': 're:^https?://.*\\.jpg', + 'thumbnail': r're:^https?://.+\.jpg', 'like_count': int, 'repost_count': int, 'comment_count': int, @@ -727,6 +727,48 @@ class TwitterIE(TwitterBaseIE): }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, + }, { + # URL specifies video number but --yes-playlist + 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '1600649710662213632', + 'title': 'md5:be05989b0722e114103ed3851a0ffae2', + 'timestamp': 1670459604.0, + 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', + 'comment_count': int, + 'uploader_id': 'CTVJLaidlaw', + 'repost_count': int, + 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], + 'upload_date': '20221208', + 'age_limit': 0, + 'uploader': 'Jocelyn Laidlaw', + 'uploader_url': 'https://twitter.com/CTVJLaidlaw', + 'like_count': int, + }, + }, { + # URL specifies video number and --no-playlist + 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/2', + 'info_dict': { + 'id': '1600649511827013632', + 'ext': 'mp4', + 'title': 'md5:be05989b0722e114103ed3851a0ffae2', + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1670459604.0, + 'uploader_id': 'CTVJLaidlaw', + 'uploader': 'Jocelyn Laidlaw', + 'repost_count': int, + 'comment_count': int, + 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], + 'duration': 102.226, + 'uploader_url': 'https://twitter.com/CTVJLaidlaw', + 'display_id': '1600649710662213632', + 'like_count': int, + 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', + 'upload_date': '20221208', + 'age_limit': 0, + }, + 'params': {'noplaylist': True}, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -828,7 +870,7 @@ def _build_graphql_query(self, media_id): } def _real_extract(self, url): - twid = self._match_id(url) + twid, selected_index = self._match_valid_url(url).group('id', 'index') if self.is_logged_in or self._configuration_arg('force_graphql'): self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})') result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) @@ -998,6 +1040,13 @@ def get_binding_value(k): entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)] + if not self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): + index = int(selected_index) - 1 + if index >= len(entries): + raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) + + return entries[index] + if len(entries) == 1: return entries[0] diff --git a/yt_dlp/extractor/uplynk.py b/yt_dlp/extractor/uplynk.py index 87c427f63f..e7d816ef4f 100644 --- a/yt_dlp/extractor/uplynk.py +++ b/yt_dlp/extractor/uplynk.py @@ -2,40 +2,42 @@ from .common import InfoExtractor from ..utils import ( - float_or_none, ExtractorError, + float_or_none, + smuggle_url, + traverse_obj, + unsmuggle_url, + update_url_query, ) -class UplynkIE(InfoExtractor): - IE_NAME = 'uplynk' - _VALID_URL = r'https?://.*?\.uplynk\.com/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P[^&]+))?' - _TEST = { - 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', - 'info_dict': { - 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', - 'ext': 'mp4', - 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', - 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } +class UplynkBaseIE(InfoExtractor): + _UPLYNK_URL_RE = r'''(?x) + https?://[\w-]+\.uplynk\.com/(?P + ext/[0-9a-f]{32}/(?P[^/?&]+)| + (?P[0-9a-f]{32}) + )\.(?:m3u8|json) + (?:.*?\bpbs=(?P[^&]+))?''' - def _extract_uplynk_info(self, uplynk_content_url): - path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + def _extract_uplynk_info(self, url): + uplynk_content_url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._UPLYNK_URL_RE, uplynk_content_url) + if not mobj: + raise ExtractorError('Necessary parameters not found in Uplynk URL') + path, external_id, video_id, session_id = mobj.group('path', 'external_id', 'id', 'session_id') display_id = video_id or external_id + headers = traverse_obj( + smuggled_data, {'Referer': 'Referer', 'Origin': 'Origin'}, casesense=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles( - 'http://content.uplynk.com/%s.m3u8' % path, - display_id, 'mp4', 'm3u8_native') + f'http://content.uplynk.com/{path}.m3u8', display_id, 'mp4', headers=headers) if session_id: for f in formats: - f['extra_param_to_segment_url'] = 'pbs=' + session_id - asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + f['extra_param_to_segment_url'] = f'pbs={session_id}' + asset = self._download_json( + f'http://content.uplynk.com/player/assetinfo/{path}.json', display_id) if asset.get('error') == 1: - raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + msg = asset.get('msg') or 'unknown error' + raise ExtractorError(f'{self.IE_NAME} said: {msg}', expected=True) return { 'id': asset['asset'], @@ -47,20 +49,40 @@ def _extract_uplynk_info(self, uplynk_content_url): 'subtitles': subtitles, } + +class UplynkIE(UplynkBaseIE): + IE_NAME = 'uplynk' + _VALID_URL = UplynkBaseIE._UPLYNK_URL_RE + _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + 'duration': 530.2739166666679, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + } + def _real_extract(self, url): return self._extract_uplynk_info(url) -class UplynkPreplayIE(UplynkIE): # XXX: Do not subclass from concrete IE +class UplynkPreplayIE(UplynkBaseIE): IE_NAME = 'uplynk:preplay' - _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' + _VALID_URL = r'https?://[\w-]+\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) path, external_id, video_id = self._match_valid_url(url).groups() display_id = video_id or external_id preplay = self._download_json(url, display_id) - content_url = 'http://content.uplynk.com/%s.m3u8' % path + content_url = f'http://content.uplynk.com/{path}.m3u8' session_id = preplay.get('sid') if session_id: - content_url += '?pbs=' + session_id - return self._extract_uplynk_info(content_url) + content_url = update_url_query(content_url, {'pbs': session_id}) + return self._extract_uplynk_info(smuggle_url(content_url, smuggled_data)) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c6c89915b4..9dde34fb01 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4382,6 +4382,25 @@ def _extract_basic_item_renderer(item): elif key.startswith('grid') and key.endswith('Renderer'): return renderer + def _extract_channel_renderer(self, renderer): + channel_id = renderer['channelId'] + title = self._get_text(renderer, 'title') + channel_url = f'https://www.youtube.com/channel/{channel_id}' + return { + '_type': 'url', + 'url': channel_url, + 'id': channel_id, + 'ie_key': YoutubeTabIE.ie_key(), + 'channel': title, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'title': title, + 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'description': self._get_text(renderer, 'descriptionSnippet'), + } + def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): @@ -4407,9 +4426,7 @@ def _grid_entries(self, grid_renderer): # channel channel_id = renderer.get('channelId') if channel_id: - yield self.url_result( - 'https://www.youtube.com/channel/%s' % channel_id, - ie=YoutubeTabIE.ie_key(), video_title=title) + yield self._extract_channel_renderer(renderer) continue # generic endpoint URL support ep_url = urljoin('https://www.youtube.com/', try_get( @@ -5762,7 +5779,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'cole-dlp-test-acc', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel': 'cole-dlp-test-acc', - 'channel_follower_count': int, }, 'playlist_mincount': 1, 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, @@ -5930,7 +5946,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'cole-dlp-test-acc - Shorts', 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel': 'cole-dlp-test-acc', - 'channel_follower_count': int, 'description': 'test description', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', @@ -5976,8 +5991,40 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': str, } }], - 'params': {'extract_flat': True}, + 'params': {'extract_flat': True, 'playlist_items': '1'}, 'playlist_mincount': 1 + }, { + # Channel renderer metadata. Contains number of videos on the channel + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'cole-dlp-test-acc - Channels', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'description': 'test description', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'YoutubeTab', + 'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'title': 'PewDiePie', + 'channel': 'PewDiePie', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'thumbnails': list, + 'channel_follower_count': int, + 'playlist_count': int + } + }], + 'params': {'extract_flat': True}, }] @classmethod @@ -6531,6 +6578,30 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): # 'title': '#cats', # }], }, + }, { + # Channel results + 'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D', + 'info_dict': { + 'id': 'kurzgesagt', + 'title': 'kurzgesagt', + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'id': 'UCsXVk37bltHxD1rDPwtNM8Q', + 'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'ie_key': 'YoutubeTab', + 'channel': 'Kurzgesagt – In a Nutshell', + 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', + 'title': 'Kurzgesagt – In a Nutshell', + 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', + 'playlist_count': int, # XXX: should have a way of saying > 1 + 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'thumbnails': list + } + }], + 'params': {'extract_flat': True, 'playlist_items': '1'}, + 'playlist_mincount': 1, }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True,