From 5424dbaf91728aaf77458e68d993ba6c34e8e222 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Mon, 19 Dec 2022 11:36:14 +0900 Subject: [PATCH 01/84] Deprioritize HEVC-over-FLV formats (#5823) Authored by: Lesmiscore --- yt_dlp/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 9697ba1c13..65408bf19b 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -6307,6 +6307,12 @@ def calculate_preference(self, format): # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? # format['preference'] = -1000 + if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''): + # HEVC-over-FLV is out-of-spec by FLV's original spec + # ref. https://trac.ffmpeg.org/ticket/6389 + # ref. https://github.com/yt-dlp/yt-dlp/pull/5821 + format['preference'] = -100 + # Determine missing bitrates if format.get('tbr') is None: if format.get('vbr') is not None and format.get('abr') is not None: From 1fc089143c79b02b8373ae1d785d5e3a68635d4d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 21 Dec 2022 00:55:47 +0000 Subject: [PATCH 02/84] [extractor/reddit] Extract crossposted media (#5801) Closes #5798 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index f1a5c852af..fcfee51e8a 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -64,6 +64,25 @@ class RedditIE(InfoExtractor): 'id': 'wzqkxp', 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', }, + }, { + # crossposted reddit-hosted media + 'url': 'https://www.reddit.com/r/dumbfuckers_club/comments/zjjw82/cringe/', + 'md5': '746180895c7b75a9d6b05341f507699a', + 'info_dict': { + 'id': 'a1oneun6pa5a1', + 'ext': 'mp4', + 'display_id': 'zjjw82', + 'title': 'Cringe', + 'uploader': 'Otaku-senpai69420', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20221212', + 'timestamp': 1670812309, + 'duration': 16, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, @@ -179,7 +198,8 @@ def add_thumbnail(src): raise ExtractorError('No media found', expected=True) # Check if media is hosted on reddit: - reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) + reddit_video = traverse_obj(data, ( + (None, ('crosspost_parent_list', ...)), ('secure_media', 'media'), 'reddit_video'), get_all=False) if reddit_video: playlist_urls = [ try_get(reddit_video, lambda x: unescapeHTML(x[y])) From 0b5546c723b9fb212e7e0199dbdaae8b8e0bf206 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 15 Dec 2022 19:58:57 +0530 Subject: [PATCH 03/84] [extractor] Let `_extract_format` functions obey `--ignore-no-formats` --- yt_dlp/extractor/common.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 3910c55adb..9031f3c116 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1759,6 +1759,9 @@ def _sleep(self, timeout, video_id, msg_template=None): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', @@ -1908,6 +1911,9 @@ def _extract_m3u8_formats_and_subtitles( errnote=None, fatal=True, live=False, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + if not m3u8_url: if errnote is not False: errnote = errnote or 'Failed to obtain m3u8 URL' @@ -2187,6 +2193,9 @@ def _xpath_ns(path, namespace=None): return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if res is False: assert not fatal @@ -2462,6 +2471,10 @@ def _extract_mpd_formats(self, *args, **kwargs): def _extract_mpd_formats_and_subtitles( self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( mpd_url, video_id, note='Downloading MPD manifest' if note is None else note, @@ -2831,6 +2844,9 @@ def _extract_ism_formats(self, *args, **kwargs): return fmts def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( ism_url, video_id, note='Downloading ISM manifest' if note is None else note, From 69f5fe45b98ef3ecb8e5ac69ebebdce7733a3ae4 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 20 Dec 2022 00:41:45 +0530 Subject: [PATCH 04/84] [FFmpegVideoConvertor] Add `gif` to `--recode-video` --- README.md | 10 +++++----- yt_dlp/postprocessor/ffmpeg.py | 5 ++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c0a2a420bc..440ed19348 100644 --- a/README.md +++ b/README.md @@ -893,11 +893,11 @@ ## Post-Processing Options: specific bitrate like 128K (default 5) --remux-video FORMAT Remux the video into another container if necessary (currently supported: avi, flv, - mkv, mov, mp4, webm, aac, aiff, alac, flac, - m4a, mka, mp3, ogg, opus, vorbis, wav). If - target container does not support the - video/audio codec, remuxing will fail. You - can specify multiple rules; e.g. + gif, mkv, mov, mp4, webm, aac, aiff, alac, + flac, m4a, mka, mp3, ogg, opus, vorbis, + wav). If target container does not support + the video/audio codec, remuxing will fail. + You can specify multiple rules; e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv --recode-video FORMAT Re-encode the video into another format if diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 67890fc310..069066e0c6 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -538,7 +538,10 @@ def run(self, information): class FFmpegVideoConvertorPP(FFmpegPostProcessor): - SUPPORTED_EXTS = (*MEDIA_EXTENSIONS.common_video, *sorted(MEDIA_EXTENSIONS.common_audio + ('aac', 'vorbis'))) + SUPPORTED_EXTS = ( + *sorted((*MEDIA_EXTENSIONS.common_video, 'gif')), + *sorted((*MEDIA_EXTENSIONS.common_audio, 'aac', 'vorbis')), + ) FORMAT_RE = create_mapping_re(SUPPORTED_EXTS) _ACTION = 'converting' From 8791e78cccd68db8161f06dc8567280e0d99a5e1 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 21 Dec 2022 20:30:26 +0530 Subject: [PATCH 05/84] Fix `original_url` in playlists --- yt_dlp/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8d28783d86..abb0ddfe52 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1626,8 +1626,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): if result_type in ('url', 'url_transparent'): ie_result['url'] = sanitize_url( ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https') - if ie_result.get('original_url'): - extra_info.setdefault('original_url', ie_result['original_url']) + if ie_result.get('original_url') and not extra_info.get('original_url'): + extra_info = {'original_url': ie_result['original_url'], **extra_info} extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) From 1c226ccdd464c09218a33824aedbcf3aa305a678 Mon Sep 17 00:00:00 2001 From: skbeh <60107333+skbeh@users.noreply.github.com> Date: Sat, 24 Dec 2022 18:47:37 +0800 Subject: [PATCH 06/84] [extractor/bilibili] Improve `_VALID_URL` (#5820) Authored by: skbeh --- yt_dlp/extractor/bilibili.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index bc0424194f..616a549607 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1034,7 +1034,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?P\d+)' + _VALID_URL = r'https?://live.bilibili.com/(blanc/)?(?P\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', @@ -1050,6 +1050,9 @@ class BiliLiveIE(InfoExtractor): }, { 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click', 'only_matching': True + }, { + 'url': 'https://live.bilibili.com/blanc/196', + 'only_matching': True }] _FORMATS = { From d61ef7f34395eae33810ec16397f86c54bf06af6 Mon Sep 17 00:00:00 2001 From: Giulio Muscarello Date: Sat, 24 Dec 2022 11:49:10 +0100 Subject: [PATCH 07/84] [extractor/ARD] Add vtt subtitles (#5835) Authored by: CapacitorSet --- yt_dlp/extractor/ard.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 0a8a8746ab..8660741ce4 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -46,6 +46,9 @@ def _parse_media_info(self, media_info, video_id, fsk): subtitles['de'] = [{ 'ext': 'ttml', 'url': subtitle_url, + }, { + 'ext': 'vtt', + 'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt', }] return { @@ -286,16 +289,16 @@ def _real_extract(self, url): class ARDIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ - # available till 7.01.2022 - 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', - 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', + # available till 7.12.2023 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', + 'md5': 'a438f671e87a7eba04000336a119ccc4', 'info_dict': { - 'id': 'maischberger-die-woche-video100', - 'display_id': 'maischberger-die-woche-video100', + 'id': 'maischberger-video-424', + 'display_id': 'maischberger-video-424', 'ext': 'mp4', - 'duration': 3687.0, - 'title': 'maischberger. die woche vom 7. Januar 2021', - 'upload_date': '20210107', + 'duration': 4452.0, + 'title': 'maischberger am 07.12.2022', + 'upload_date': '20221207', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { From 9012d20b23b01827c8d75b460da22485c5cc80ef Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 27 Dec 2022 03:01:08 +0530 Subject: [PATCH 08/84] [extractor/mixch] Support `--wait-for-video` --- yt_dlp/extractor/mixch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 3f430a7176..7eedbc7520 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -32,8 +32,10 @@ def _real_extract(self, url): initial_js_state = self._parse_json(self._search_regex( r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) - if not initial_js_state.get('liveInfo'): - raise ExtractorError('Livestream has ended.', expected=True) + + is_live = initial_js_state.get('liveInfo') + if not is_live: + self.raise_no_formats('Livestream has ended or has not started', expected=True) return { 'id': video_id, @@ -48,8 +50,8 @@ def _real_extract(self, url): 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id, 'ext': 'mp4', 'protocol': 'm3u8', - }], - 'is_live': True, + }] if is_live else [], + 'live_status': 'is_live' if is_live else 'is_upcoming', } From 4af47a00038dfbe6a243119e499f2e876e0f2766 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 27 Dec 2022 10:13:22 +0530 Subject: [PATCH 09/84] Fix 9012d20b23b01827c8d75b460da22485c5cc80ef --- yt_dlp/extractor/mixch.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 7eedbc7520..4be6947289 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,8 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - traverse_obj, -) +from ..utils import UserNotLive, traverse_obj class MixchIE(InfoExtractor): @@ -32,10 +29,8 @@ def _real_extract(self, url): initial_js_state = self._parse_json(self._search_regex( r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) - - is_live = initial_js_state.get('liveInfo') - if not is_live: - self.raise_no_formats('Livestream has ended or has not started', expected=True) + if not initial_js_state.get('liveInfo'): + raise UserNotLive(video_id=video_id) return { 'id': video_id, @@ -47,11 +42,12 @@ def _real_extract(self, url): 'uploader_id': video_id, 'formats': [{ 'format_id': 'hls', - 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id, + 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls')) + or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'), 'ext': 'mp4', 'protocol': 'm3u8', - }] if is_live else [], - 'live_status': 'is_live' if is_live else 'is_upcoming', + }], + 'is_live': True, } From 032f22020c3aaf0c1be1bb500498d13782d01c73 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Tue, 27 Dec 2022 15:25:09 +0900 Subject: [PATCH 10/84] [extractor/trtcocuk] Add extractor (#5009) Closes #2635 Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/trtcocuk.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 yt_dlp/extractor/trtcocuk.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a12328f04a..63c7abb10c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1905,6 +1905,7 @@ TrovoChannelVodIE, TrovoChannelClipIE, ) +from .trtcocuk import TrtCocukVideoIE from .trueid import TrueIDIE from .trunews import TruNewsIE from .truth import TruthIE diff --git a/yt_dlp/extractor/trtcocuk.py b/yt_dlp/extractor/trtcocuk.py new file mode 100644 index 0000000000..f27f5a1e36 --- /dev/null +++ b/yt_dlp/extractor/trtcocuk.py @@ -0,0 +1,48 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, parse_iso8601, traverse_obj + + +class TrtCocukVideoIE(InfoExtractor): + _VALID_URL = r'https?://www\.trtcocuk\.net\.tr/video/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.trtcocuk.net.tr/video/kaptan-pengu-ve-arkadaslari-1', + 'info_dict': { + 'id': '3789738', + 'ext': 'mp4', + 'season_number': 1, + 'series': '"Kaptan Pengu ve Arkadaşları"', + 'season': 'Season 1', + 'title': 'Kaptan Pengu ve Arkadaşları 1 Bölüm İzle TRT Çocuk', + 'release_date': '20201209', + 'release_timestamp': 1607513774, + } + }, { + 'url': 'https://www.trtcocuk.net.tr/video/sef-rokanin-lezzet-dunyasi-17', + 'info_dict': { + 'id': '10260842', + 'ext': 'mp4', + 'series': '"Şef Roka\'nın Lezzet Dünyası"', + 'title': 'Şef Roka\'nın Lezzet Dünyası 17 Bölüm İzle TRT Çocuk', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nuxtjs_data = self._search_nuxt_data(webpage, display_id)['data'] + + try: + video_url = self._parse_json(nuxtjs_data['video'], display_id) + except ExtractorError: + video_url = nuxtjs_data['video'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + + return { + 'id': str(nuxtjs_data['id']), + 'formats': formats, + 'subtitles': subtitles, + 'season_number': int_or_none(nuxtjs_data.get('season')), + 'release_timestamp': parse_iso8601(nuxtjs_data.get('publishedDate')), + 'series': traverse_obj(nuxtjs_data, ('show', 0, 'title')), + 'title': self._html_extract_title(webpage) # TODO: get better title + } From 247c8dd4f548436e2cf0f2e55a80aa37ec62555a Mon Sep 17 00:00:00 2001 From: barsnick Date: Tue, 27 Dec 2022 07:34:01 +0100 Subject: [PATCH 11/84] [extractor/urplay] Support for audio-only formats (#4606) Closes #4605 Authored by: barsnick --- yt_dlp/extractor/urplay.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/urplay.py b/yt_dlp/extractor/urplay.py index 0f0d6592d8..5d69dadd67 100644 --- a/yt_dlp/extractor/urplay.py +++ b/yt_dlp/extractor/urplay.py @@ -14,12 +14,13 @@ class URPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P[0-9]+)' _TESTS = [{ 'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand', - 'md5': 'ff5b0c89928f8083c74bbd5099c9292d', + 'md5': '5ba36643c77cc3d34ffeadad89937d1e', 'info_dict': { 'id': '203704', 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', + 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1513292400, 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', @@ -29,6 +30,24 @@ class URPlayIE(InfoExtractor): 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', 'age_limit': 15, }, + }, { + 'url': 'https://urplay.se/program/222967-en-foralders-dagbok-mitt-barn-skadar-sig-sjalv', + 'info_dict': { + 'id': '222967', + 'ext': 'mp4', + 'title': 'En förälders dagbok : Mitt barn skadar sig själv', + 'description': 'md5:9f771eef03a732a213b367b52fe826ca', + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1629676800, + 'upload_date': '20210823', + 'series': 'En förälders dagbok', + 'duration': 1740, + 'age_limit': 15, + 'episode_number': 3, + 'categories': 'count:2', + 'tags': 'count:7', + 'episode': 'Mitt barn skadar sig själv', + }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', 'info_dict': { @@ -36,12 +55,17 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1440086400, 'upload_date': '20150820', 'series': 'Tripp, Trapp, Träd', 'duration': 865, + 'age_limit': 1, + 'episode_number': 1, + 'categories': [], 'tags': ['Sova'], 'episode': 'Sovkudde', + 'season': 'Säsong 1', }, }, { 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', @@ -69,7 +93,7 @@ def _real_extract(self, url): urplayer_streams = urplayer_data.get('streamingInfo', {}) for k, v in urplayer_streams.get('raw', {}).items(): - if not (k in ('sd', 'hd') and isinstance(v, dict)): + if not (k in ('sd', 'hd', 'mp3', 'm4a') and isinstance(v, dict)): continue file_http = v.get('location') if file_http: From 0ef3d470272694533301294e733e96343dab57af Mon Sep 17 00:00:00 2001 From: Bobscorn Date: Tue, 27 Dec 2022 20:04:56 +1300 Subject: [PATCH 12/84] [extractor/beatbump] Add extractors (#5304) Authored by: Bobscorn, pukkandan Closes #4653 --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/beatbump.py | 101 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 yt_dlp/extractor/beatbump.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 63c7abb10c..71cd54bf46 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -184,6 +184,10 @@ from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE +from .beatbump import ( + BeatBumpVideoIE, + BeatBumpPlaylistIE, +) from .beatport import BeatportIE from .berufetv import BerufeTVIE from .bet import BetIE diff --git a/yt_dlp/extractor/beatbump.py b/yt_dlp/extractor/beatbump.py new file mode 100644 index 0000000000..0f40ebe7ac --- /dev/null +++ b/yt_dlp/extractor/beatbump.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE, YoutubeTabIE + + +class BeatBumpVideoIE(InfoExtractor): + _VALID_URL = r'https://beatbump\.ml/listen\?id=(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://beatbump.ml/listen?id=MgNrAu2pzNs', + 'md5': '5ff3fff41d3935b9810a9731e485fe66', + 'info_dict': { + 'id': 'MgNrAu2pzNs', + 'ext': 'mp4', + 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', + 'artist': 'Stephen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp', + 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', + 'upload_date': '20190312', + 'categories': ['Music'], + 'playable_in_embed': True, + 'duration': 169, + 'like_count': int, + 'alt_title': 'Voyeur Girl', + 'view_count': int, + 'track': 'Voyeur Girl', + 'uploader': 'Stephen - Topic', + 'title': 'Voyeur Girl', + 'channel_follower_count': int, + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'album': 'it\'s too much love to know my dear', + 'channel': 'Stephen', + 'comment_count': int, + 'description': 'md5:7ae382a65843d6df2685993e90a8628f', + 'tags': 'count:11', + 'creator': 'Stephen', + 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA', + } + }] + + def _real_extract(self, url): + id_ = self._match_id(url) + return self.url_result(f'https://music.youtube.com/watch?v={id_}', YoutubeIE, id_) + + +class BeatBumpPlaylistIE(InfoExtractor): + _VALID_URL = r'https://beatbump\.ml/(?:release\?id=|artist/|playlist/)(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://beatbump.ml/release?id=MPREb_gTAcphH99wE', + 'playlist_count': 50, + 'info_dict': { + 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', + 'availability': 'unlisted', + 'view_count': int, + 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', + 'description': '', + 'tags': [], + 'modified_date': '20221223', + } + }, { + 'url': 'https://beatbump.ml/artist/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'playlist_mincount': 1, + 'params': {'flatplaylist': True}, + 'info_dict': { + 'id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_follower_count': int, + 'title': 'NoCopyrightSounds - Videos', + 'uploader': 'NoCopyrightSounds', + 'description': 'md5:cd4fd53d81d363d05eee6c1b478b491a', + 'channel': 'NoCopyrightSounds', + 'tags': 'count:12', + 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + }, + }, { + 'url': 'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'playlist_mincount': 1, + 'params': {'flatplaylist': True}, + 'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/@NoCopyrightSounds', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS : All Releases 💿', + 'uploader': 'NoCopyrightSounds', + 'availability': 'public', + 'channel': 'NoCopyrightSounds', + 'tags': [], + 'modified_date': '20221225', + 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + } + }] + + def _real_extract(self, url): + id_ = self._match_id(url) + return self.url_result(f'https://music.youtube.com/browse/{id_}', YoutubeTabIE, id_) From 15e9e578c04f1fa3f408dc3ec99491cc3f0ba839 Mon Sep 17 00:00:00 2001 From: chris <6024426+iw0nderhow@users.noreply.github.com> Date: Tue, 27 Dec 2022 20:52:58 +0100 Subject: [PATCH 13/84] [extractor/ArteTV] Extract chapters (#5879) Authored by: iw0nderhow, bashonly --- yt_dlp/extractor/arte.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 54e4d2d0ce..dfbfe03c3c 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -65,6 +65,21 @@ class ArteTVIE(ArteTVBaseIE): }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE', 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/', + 'info_dict': { + 'id': '110203-006-A', + 'chapters': 'count:16', + 'description': 'md5:cf592f1df52fe52007e3f8eac813c084', + 'alt_title': 'Zaz', + 'title': 'Baloise Session 2022', + 'timestamp': 1668445200, + 'duration': 4054, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530', + 'upload_date': '20221114', + 'ext': 'mp4', + }, + 'expected_warnings': ['geo restricted'] }] _GEO_BYPASS = True @@ -180,9 +195,6 @@ def _real_extract(self, url): else: self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}') - # TODO: chapters from stream['segments']? - # The JS also looks for chapters in config['data']['attributes']['chapters'], - # but I am yet to find a video having those formats.extend(secondary_formats) self._remove_duplicate_formats(formats) @@ -205,6 +217,11 @@ def _real_extract(self, url): {'url': image['url'], 'id': image.get('caption')} for image in metadata.get('images') or [] if url_or_none(image.get('url')) ], + # TODO: chapters may also be in stream['segments']? + 'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., { + 'start_time': 'startTime', + 'title': 'title', + })) or None, } From da8d2de2082ab55f11d76d0aef7e6c3614672b45 Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" Date: Tue, 27 Dec 2022 20:57:26 +0100 Subject: [PATCH 14/84] [extractor/cda] Support premium and misc improvements (#5529) * Fix cache for non-ASCII key * Improve error messages * Better UA for fingerprint bypass Authored by: selfisekai --- yt_dlp/cache.py | 9 ++++---- yt_dlp/extractor/cda.py | 47 +++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index 4f9fb78d37..7be91eae5d 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -5,6 +5,7 @@ import re import shutil import traceback +import urllib.parse from .utils import expand_path, traverse_obj, version_tuple, write_json_file from .version import __version__ @@ -22,11 +23,9 @@ def _get_root_dir(self): return expand_path(res) def _get_cache_fn(self, section, key, dtype): - assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ - 'invalid section %r' % section - assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key - return os.path.join( - self._get_root_dir(), section, f'{key}.{dtype}') + assert re.match(r'^[\w.-]+$', section), f'invalid section {section!r}' + key = urllib.parse.quote(key, safe='').replace('%', ',') # encode non-ascii characters + return os.path.join(self._get_root_dir(), section, f'{key}.{dtype}') @property def enabled(self): diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index d1212e686a..1157114b2a 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -4,6 +4,7 @@ import hashlib import hmac import json +import random import re from .common import InfoExtractor @@ -27,11 +28,10 @@ class CDAIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' _NETRC_MACHINE = 'cdapl' - _BASE_URL = 'http://www.cda.pl/' + _BASE_URL = 'https://www.cda.pl' _BASE_API_URL = 'https://api.cda.pl' _API_HEADERS = { 'Accept': 'application/vnd.cda.public+json', - 'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)', } # hardcoded in the app _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q' @@ -101,6 +101,38 @@ def _download_age_confirm_page(self, url, video_id, *args, **kwargs): }, **kwargs) def _perform_login(self, username, password): + app_version = random.choice(( + '1.2.88 build 15306', + '1.2.174 build 18469', + )) + android_version = random.randrange(8, 14) + phone_model = random.choice(( + # x-kom.pl top selling Android smartphones, as of 2022-12-26 + # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android + 'ASUS ZenFone 8', + 'Motorola edge 20 5G', + 'Motorola edge 30 neo 5G', + 'Motorola moto g22', + 'OnePlus Nord 2T 5G', + 'Samsung Galaxy A32 SM‑A325F', + 'Samsung Galaxy M13', + 'Samsung Galaxy S20 FE 5G', + 'Xiaomi 11T', + 'Xiaomi POCO M4 Pro', + 'Xiaomi Redmi 10', + 'Xiaomi Redmi 10C', + 'Xiaomi Redmi 9C NFC', + 'Xiaomi Redmi Note 10 Pro', + 'Xiaomi Redmi Note 11 Pro', + 'Xiaomi Redmi Note 11', + 'Xiaomi Redmi Note 11S 5G', + 'Xiaomi Redmi Note 11S', + 'realme 10', + 'realme 9 Pro+', + 'vivo Y33s', + )) + self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})' + cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' @@ -138,9 +170,6 @@ def _api_extract(self, video_id): meta = self._download_json( f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video'] - if meta.get('premium') and not meta.get('premium_free'): - self.report_drm(video_id) - uploader = traverse_obj(meta, 'author', 'login') formats = [{ @@ -151,6 +180,10 @@ def _api_extract(self, video_id): 'filesize': quality.get('length'), } for quality in meta['qualities'] if quality.get('file')] + if meta.get('premium') and not meta.get('premium_free') and not formats: + raise ExtractorError( + 'Video requires CDA Premium - subscription needed', expected=True) + return { 'id': video_id, 'title': meta.get('title'), @@ -167,10 +200,10 @@ def _api_extract(self, video_id): def _web_extract(self, video_id, url): self._set_cookie('cda.pl', 'cda.player', 'html5') webpage = self._download_webpage( - self._BASE_URL + '/video/' + video_id, video_id) + f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: - raise ExtractorError('This video is only available for premium users.', expected=True) + self.raise_login_required('This video is only available for premium users') if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): self.raise_geo_restricted() From d1b5f3d79cb33f393f17aa12df24fca33c7ef3aa Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" Date: Tue, 27 Dec 2022 21:47:25 +0100 Subject: [PATCH 15/84] [extractor/polskieradio] Adapt to next.js redesigns (#5416) Authored by: selfisekai --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/arte.py | 1 - yt_dlp/extractor/polskieradio.py | 213 ++++++++++++++++++++++++------- 3 files changed, 167 insertions(+), 49 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 71cd54bf46..ea1d0a2dfb 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1409,6 +1409,8 @@ from .polsatgo import PolsatGoIE from .polskieradio import ( PolskieRadioIE, + PolskieRadioLegacyIE, + PolskieRadioAuditionIE, PolskieRadioCategoryIE, PolskieRadioPlayerIE, PolskieRadioPodcastIE, diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index dfbfe03c3c..e3cc5afb05 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -195,7 +195,6 @@ def _real_extract(self, url): else: self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}') - formats.extend(secondary_formats) self._remove_duplicate_formats(formats) diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 99244f6b4a..68c4a2afd0 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -10,6 +10,7 @@ compat_urlparse ) from ..utils import ( + determine_ext, extract_attributes, ExtractorError, InAdvancePagedList, @@ -17,6 +18,7 @@ js_to_json, parse_iso8601, strip_or_none, + traverse_obj, unified_timestamp, unescapeHTML, url_or_none, @@ -48,28 +50,11 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): yield entry -class PolskieRadioIE(PolskieRadioBaseExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P[0-9]+)' - _TESTS = [{ # Old-style single broadcast. - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', - 'info_dict': { - 'id': '1587943', - 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', - 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', - }, - 'playlist': [{ - 'md5': '2984ee6ce9046d91fc233bc1a864a09a', - 'info_dict': { - 'id': '1540576', - 'ext': 'mp3', - 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', - 'timestamp': 1456594200, - 'upload_date': '20160227', - 'duration': 2364, - 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' - }, - }], - }, { # New-style single broadcast. +class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): + # legacy sites + IE_NAME = 'polskieradio:legacy' + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P\d+)' + _TESTS = [{ 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', 'info_dict': { 'id': '2534482', @@ -96,16 +81,6 @@ class PolskieRadioIE(PolskieRadioBaseExtractor): 'ext': 'mp3', 'title': 'Pogłos 29 października godz. 23:01', }, - }, { - 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', - 'only_matching': True, - }, { - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', - 'only_matching': True, - }, { - # with mp4 video - 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', - 'only_matching': True, }, { 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', 'only_matching': True, @@ -114,7 +89,9 @@ class PolskieRadioIE(PolskieRadioBaseExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + if PolskieRadioIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioIE, playlist_id) content = self._search_regex( r'(?s)]+class="\s*this-article\s*"[^>]*>(.+?)]+class="tags"[^>]*>', @@ -153,23 +130,160 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioCategoryIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P\d+)' +class PolskieRadioIE(InfoExtractor): + # new next.js sites, excluding radiokierowcow.pl + _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio(?:24)?\.pl/artykul/(?P\d+)' _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '7a85d429-5356-4def-a347-925e4ae7406b', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + }, + }], + }, { + 'url': 'https://trojka.polskieradio.pl/artykul/1632955', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'https://trojka.polskieradio.pl/artykul/1634903', + 'only_matching': True, + }, { + 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + article_data = traverse_obj( + self._search_nextjs_data(webpage, playlist_id), ('props', 'pageProps', 'data', 'articleData')) + + title = strip_or_none(article_data['title']) + + description = strip_or_none(article_data.get('lead')) + + entries = [{ + 'url': entry['file'], + 'ext': determine_ext(entry.get('fileName')), + 'id': self._search_regex( + r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'), + 'title': strip_or_none(entry.get('description')) or title, + } for entry in article_data.get('attachments') or () if entry['fileType'] in ('Audio', )] + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioAuditionIE(InfoExtractor): + # new next.js sites + IE_NAME = 'polskieradio:audition' + _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P\d+)' + _TESTS = [{ + # articles, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5102', 'info_dict': { 'id': '5102', - 'title': 'HISTORIA ŻYWA', + 'title': 'Historia żywa', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, 'playlist_mincount': 38, }, { - 'url': 'http://www.polskieradio.pl/7/4807', + # episodes, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5769', 'info_dict': { - 'id': '4807', - 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + 'id': '5769', + 'title': 'AgroFakty', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, - 'playlist_mincount': 5 + 'playlist_mincount': 269, }, { + # both episodes and articles, PR3 + 'url': 'https://trojka.polskieradio.pl/audycja/8906', + 'info_dict': { + 'id': '8906', + 'title': 'Trójka budzi', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_mincount': 722, + }] + + def _call_lp3(self, path, query, video_id, note): + return self._download_json( + f'https://lp3test.polskieradio.pl/{path}', video_id, note, + query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}) + + def _entries(self, playlist_id, has_episodes, has_articles): + for i in itertools.count(1) if has_episodes else []: + page = self._call_lp3( + 'AudioArticle/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 10, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading episode list page {i}') + if not traverse_obj(page, 'data'): + break + for episode in page['data']: + yield { + 'id': str(episode['id']), + 'url': episode['file'], + 'title': episode.get('title'), + 'duration': int_or_none(episode.get('duration')), + 'timestamp': parse_iso8601(episode.get('datePublic')), + } + + for i in itertools.count(1) if has_articles else []: + page = self._call_lp3( + 'Article/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 9, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading article list page {i}') + if not traverse_obj(page, 'data'): + break + for article in page['data']: + yield { + '_type': 'url_transparent', + 'ie_key': PolskieRadioIE.ie_key(), + 'id': str(article['id']), + 'url': article['url'], + 'title': article.get('shortTitle'), + 'description': traverse_obj(article, ('description', 'lead')), + 'timestamp': parse_iso8601(article.get('datePublic')), + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + page_props = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id), + ('props', 'pageProps', ('data', None)), get_all=False) + + has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios')) + has_articles = bool(traverse_obj(page_props, 'articles')) + + return self.playlist_result( + self._entries(playlist_id, has_episodes, has_articles), playlist_id, + title=traverse_obj(page_props, ('details', 'name')), + description=traverse_obj(page_props, ('details', 'description', 'lead')), + thumbnail=traverse_obj(page_props, ('details', 'photo'))) + + +class PolskieRadioCategoryIE(InfoExtractor): + # legacy sites + IE_NAME = 'polskieradio:category' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P\d+)' + _TESTS = [{ 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', 'only_matching': True }, { @@ -186,9 +300,6 @@ class PolskieRadioCategoryIE(InfoExtractor): 'title': 'Muzyka', }, 'playlist_mincount': 61 - }, { - 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', - 'only_matching': True, }, { 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', 'only_matching': True, @@ -196,7 +307,7 @@ class PolskieRadioCategoryIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url) def _entries(self, url, page, category_id): content = page @@ -209,7 +320,7 @@ def _entries(self, url, page, category_id): if not href: continue yield self.url_result( - compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + compat_urlparse.urljoin(url, href), PolskieRadioLegacyIE, entry_id, entry.get('title')) mobj = re.search( r']+class=["\']next["\'][^>]*>\s*]+href=(["\'])(?P(?:(?!\1).)+)\1', @@ -222,7 +333,9 @@ def _entries(self, url, page, category_id): def _real_extract(self, url): category_id = self._match_id(url) - webpage = self._download_webpage(url, category_id) + webpage, urlh = self._download_webpage_handle(url, category_id) + if PolskieRadioAuditionIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id) title = self._html_search_regex( r'([^<]+) - [^<]+ - [^<]+', webpage, 'title', fatal=False) @@ -358,7 +471,7 @@ def get_page(page_num): 'entries': InAdvancePagedList( get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE), 'id': str(data['id']), - 'title': data['title'], + 'title': data.get('title'), 'description': data.get('description'), 'uploader': data.get('announcer'), } @@ -374,6 +487,10 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): 'ext': 'mp3', 'title': 'Theresa May rezygnuje. Co dalej z brexitem?', 'description': 'md5:e41c409a29d022b70ef0faa61dbded60', + 'episode': 'Theresa May rezygnuje. Co dalej z brexitem?', + 'duration': 2893, + 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg', + 'series': 'Raport o stanie świata', }, }] From a4d6ead30fde0e85eb34859e86c707621e38f8a1 Mon Sep 17 00:00:00 2001 From: Damiano Amatruda Date: Thu, 29 Dec 2022 07:54:19 +0100 Subject: [PATCH 16/84] [extractor/ciscowebex] Support password-protected videos (#5601) Authored by: damianoamatruda --- yt_dlp/extractor/ciscowebex.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py index 44595d854c..0fcf022820 100644 --- a/yt_dlp/extractor/ciscowebex.py +++ b/yt_dlp/extractor/ciscowebex.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, try_get, unified_timestamp, @@ -38,11 +39,30 @@ def _real_extract(self, url): siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') video_id = mobj.group('id') - stream = self._download_json( + password = self.get_param('videopassword') + + headers = {'Accept': 'application/json'} + if password: + headers['accessPwd'] = password + + stream, urlh = self._download_json_handle( 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), - video_id, fatal=False, query={'siteurl': siteurl}) - if not stream: - self.raise_login_required(method='cookies') + video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) + + if urlh.status == 403: + if stream['code'] == 53004: + self.raise_login_required() + if stream['code'] == 53005: + if password: + raise ExtractorError('Wrong password', expected=True) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True) + + if urlh.status == 429: + self.raise_login_required( + f'{self.IE_NAME} asks you to solve a CAPTCHA. Solve CAPTCHA in browser and', + method='cookies') video_id = stream.get('recordUUID') or video_id @@ -78,7 +98,7 @@ def _real_extract(self, url): 'title': stream['recordName'], 'description': stream.get('description'), 'uploader': stream.get('ownerDisplayName'), - 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id + 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), 'timestamp': unified_timestamp(stream.get('createTime')), 'duration': int_or_none(stream.get('duration'), 1000), 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id), From 06a9d68eb8413120f7e03d6c288cf855cd782f77 Mon Sep 17 00:00:00 2001 From: Kurt Bestor Date: Thu, 29 Dec 2022 16:18:55 +0900 Subject: [PATCH 17/84] [extractor/youku] Fix extractor (#5622) Closes #4456 Authored by: KurtBestor --- yt_dlp/extractor/youku.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index 624975b982..ab59200d79 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -96,25 +96,35 @@ class YoukuIE(InfoExtractor): 'thumbnail': r're:^https?://.*', 'uploader': '明月庄主moon', 'uploader_id': '38465621', - 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UMTUzODYyNDg0', 'tags': list, }, }, { - 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'url': 'https://v.youku.com/v_show/id_XNTA2NTA0MjA1Mg==.html', 'info_dict': { - 'id': 'XMjIyNzAzMTQ4NA', + 'id': 'XNTA2NTA0MjA1Mg', 'ext': 'mp4', - 'title': '卡马乔国足开大脚长传冲吊集锦', - 'duration': 289, + 'title': 'Minecraft我的世界:建造超大巨型航空飞机,菜鸟vs高手vs黑客', + 'duration': 542.13, 'thumbnail': r're:^https?://.*', - 'uploader': '阿卜杜拉之星', - 'uploader_id': '2382249', - 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'uploader': '波哥游戏解说', + 'uploader_id': '156688084', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjI2NzUyMzM2', 'tags': list, }, }, { - 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', - 'only_matching': True, + 'url': 'https://v.youku.com/v_show/id_XNTE1MzczOTg4MA==.html', + 'info_dict': { + 'id': 'XNTE1MzczOTg4MA', + 'ext': 'mp4', + 'title': '国产超A特工片', + 'duration': 362.97, + 'thumbnail': r're:^https?://.*', + 'uploader': '陈晓娟说历史', + 'uploader_id': '1640913339', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==', + 'tags': list, + }, }] @staticmethod @@ -151,7 +161,7 @@ def _real_extract(self, url): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0532', + 'ccode': '0524', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From 074b2fae9076221faaa8697381428131ad968dc9 Mon Sep 17 00:00:00 2001 From: lkw123 <2020393267@qq.com> Date: Thu, 29 Dec 2022 15:38:49 +0800 Subject: [PATCH 18/84] [extractor/kankanews] Add extractor (#5729) Authored by: synthpop123 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kankanews.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 yt_dlp/extractor/kankanews.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ea1d0a2dfb..672eb95962 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -831,6 +831,7 @@ from .kakao import KakaoIE from .kaltura import KalturaIE from .kanal2 import Kanal2IE +from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE diff --git a/yt_dlp/extractor/kankanews.py b/yt_dlp/extractor/kankanews.py new file mode 100644 index 0000000000..46e239bd6c --- /dev/null +++ b/yt_dlp/extractor/kankanews.py @@ -0,0 +1,48 @@ +import time +import random +import string +import hashlib +import urllib.parse + +from .common import InfoExtractor + + +class KankaNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P\d+)\.shtml' + _TESTS = [{ + 'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227', + 'md5': '05e126513c74b1258d657452a6f4eef9', + 'info_dict': { + 'id': '4485057', + 'url': 'http://mediaplay.kksmg.com/2022/11/08/h264_450k_mp4_1a388ad771e0e4cc28b0da44d245054e_ncm.mp4', + 'ext': 'mp4', + 'title': '视频|第23个中国记者节,我们在进博切蛋糕', + 'thumbnail': r're:^https?://.*\.jpg*', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'omsid\s*=\s*"(\d+)"', webpage, 'video id') + + params = { + 'nonce': ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)), + 'omsid': video_id, + 'platform': 'pc', + 'timestamp': int(time.time()), + 'version': '1.0', + } + params['sign'] = hashlib.md5((hashlib.md5(( + urllib.parse.urlencode(params) + '&28c8edde3d61a0411511d3b1866f0636' + ).encode()).hexdigest()).encode()).hexdigest() + + meta = self._download_json('https://api-app.kankanews.com/kankan/pc/getvideo', + video_id, query=params)['result']['video'] + + return { + 'id': video_id, + 'url': meta['videourl'], + 'title': self._search_regex(r'g\.title\s*=\s*"([^"]+)"', webpage, 'title'), + 'thumbnail': meta.get('titlepic'), + } From 6b71d186dda5c71b8ff2ec665cbda6f9d4ffb06e Mon Sep 17 00:00:00 2001 From: monnef <1975567+mnn@users.noreply.github.com> Date: Thu, 29 Dec 2022 08:47:23 +0100 Subject: [PATCH 19/84] [extractor/curiositystream] Fix auth (#5730) Authored by: mnn --- yt_dlp/extractor/curiositystream.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 26cf24fbbd..941cf4e79c 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -1,4 +1,5 @@ import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_str @@ -23,7 +24,7 @@ def _call_api(self, path, video_id, query=None): auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') if auth_cookie: self.write_debug('Obtained auth_token cookie') - self._auth_token = auth_cookie.value + self._auth_token = urllib.parse.unquote(auth_cookie.value) if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -54,8 +55,11 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', 'channel': 'Curiosity Stream', 'categories': ['Technology', 'Interview'], - 'average_rating': 96.79, + 'average_rating': float, 'series_id': '2', + 'thumbnail': r're:https://img.curiositystream.com/.+\.jpg', + 'tags': [], + 'duration': 158 }, 'params': { # m3u8 download From 9fcd8ad1f21377f8cf784c35ebc758743227666e Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Thu, 29 Dec 2022 04:08:22 -0400 Subject: [PATCH 20/84] [extractor/spankbang] Fix extractor (#5791) Authored by: JChris246 Closes #5731 --- yt_dlp/extractor/spankbang.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/spankbang.py b/yt_dlp/extractor/spankbang.py index f242d334c9..43da34a325 100644 --- a/yt_dlp/extractor/spankbang.py +++ b/yt_dlp/extractor/spankbang.py @@ -177,7 +177,6 @@ class SpankBangPlaylistIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) playlist_id = mobj.group('id') - display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) @@ -186,11 +185,11 @@ def _real_extract(self, url): urljoin(url, mobj.group('path')), ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) for mobj in re.finditer( - r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' - % re.escape(display_id), webpage)] + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1', + webpage)] title = self._html_search_regex( - r'

([^<]+)\s+playlist\s*<', webpage, 'playlist title', + r'([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) From 153e88a75151a51cc2a2fbf02d62f66fc09b29d9 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Thu, 29 Dec 2022 17:12:07 +0900 Subject: [PATCH 21/84] [extractor/netverse] Add `NetverseSearch` extractor (#5838) Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/netverse.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 672eb95962..1b76d82643 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1160,6 +1160,7 @@ from .netverse import ( NetverseIE, NetversePlaylistIE, + NetverseSearchIE, ) from .newgrounds import ( NewgroundsIE, diff --git a/yt_dlp/extractor/netverse.py b/yt_dlp/extractor/netverse.py index 3c4fd92eb0..398198a1b0 100644 --- a/yt_dlp/extractor/netverse.py +++ b/yt_dlp/extractor/netverse.py @@ -1,6 +1,6 @@ import itertools -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from .dailymotion import DailymotionIE from ..utils import smuggle_url, traverse_obj @@ -251,3 +251,31 @@ def _real_extract(self, url): self.parse_playlist(playlist_data['response'], playlist_id), traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')), traverse_obj(playlist_data, ('response', 'webseries_info', 'title'))) + + +class NetverseSearchIE(SearchInfoExtractor): + _SEARCH_KEY = 'netsearch' + + _TESTS = [{ + 'url': 'netsearch10:tetangga', + 'info_dict': { + 'id': 'tetangga', + 'title': 'tetangga', + }, + 'playlist_count': 10, + }] + + def _search_results(self, query): + last_page = None + for i in itertools.count(1): + search_data = self._download_json( + 'https://api.netverse.id/search/elastic/search', query, + query={'q': query, 'page': i}, note=f'Downloading page {i}') + + videos = traverse_obj(search_data, ('response', 'data', ...)) + for video in videos: + yield self.url_result(f'https://netverse.id/video/{video["slug"]}', NetverseIE) + + last_page = last_page or traverse_obj(search_data, ('response', 'lastpage')) + if not videos or i >= (last_page or 0): + break From 9a9006ba20f1f9f34183e1bde098c75502a018f8 Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 29 Dec 2022 06:15:38 -0500 Subject: [PATCH 22/84] [extractor/twitcasting] Fix videos with password (#5894) Closes #5888 Authored by: bashonly, Spicadox --- yt_dlp/extractor/twitcasting.py | 34 +++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 735cb0bb08..2548dae047 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -38,7 +38,7 @@ class TwitCastingIE(InfoExtractor): 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20110822', - 'timestamp': 1314010824, + 'timestamp': 1313978424, 'duration': 32, 'view_count': int, }, @@ -52,10 +52,10 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live playing something #3689740', 'uploader_id': 'mttbernardini', - 'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', + 'description': 'md5:1dc7efa2f1ab932fcd119265cebeec69', 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20120212', - 'timestamp': 1329028024, + 'upload_date': '20120211', + 'timestamp': 1328995624, 'duration': 681, 'view_count': int, }, @@ -64,15 +64,22 @@ class TwitCastingIE(InfoExtractor): 'videopassword': 'abc', }, }, { - 'note': 'archive is split in 2 parts', 'url': 'https://twitcasting.tv/loft_heaven/movie/685979292', 'info_dict': { 'id': '685979292', 'ext': 'mp4', - 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”', - 'duration': 6964.599334, + 'title': '【無料配信】南波一海のhear/here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”', + 'uploader_id': 'loft_heaven', + 'description': 'md5:3a0c7b53019df987ce545c935538bacf', + 'upload_date': '20210604', + 'timestamp': 1622802114, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6964, + 'view_count': int, + }, + 'params': { + 'skip_download': True, }, - 'playlist_mincount': 2, }] def _parse_data_movie_playlist(self, dmp, video_id): @@ -88,15 +95,18 @@ def _parse_data_movie_playlist(self, dmp, video_id): def _real_extract(self, url): uploader_id, video_id = self._match_valid_url(url).groups() + webpage, urlh = self._download_webpage_handle(url, video_id) video_password = self.get_param('videopassword') request_data = None if video_password: request_data = urlencode_postdata({ 'password': video_password, + **self._hidden_inputs(webpage), }, encoding='utf-8') - webpage, urlh = self._download_webpage_handle( - url, video_id, data=request_data, - headers={'Origin': 'https://twitcasting.tv'}) + webpage, urlh = self._download_webpage_handle( + url, video_id, data=request_data, + headers={'Origin': 'https://twitcasting.tv'}, + note='Trying video password') if urlh.geturl() != url and request_data: webpage = self._download_webpage( urlh.geturl(), video_id, data=request_data, @@ -122,7 +132,7 @@ def _real_extract(self, url): duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000) or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage)))) view_count = str_to_int(self._search_regex( - (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*]+datetime="([^"]+)"', webpage, 'datetime', None)) From 3d667e0047915c32f5df9fdd86a4223dc0e9ce8f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 12:03:03 +0000 Subject: [PATCH 23/84] [extractor/slideslive] Support embeds and slides (#5784) Authored by: bashonly, Grub4K, pukkandan --- yt_dlp/extractor/slideslive.py | 390 ++++++++++++++++++++++++++++++--- 1 file changed, 362 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 86c26a8a2b..4268bfeaf1 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -1,16 +1,24 @@ +import re +import urllib.parse + from .common import InfoExtractor from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, smuggle_url, traverse_obj, unified_timestamp, + update_url_query, url_or_none, + xpath_text, ) class SlidesLiveIE(InfoExtractor): - _VALID_URL = r'https?://slideslive\.com/(?P[0-9]+)' + _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P[0-9]+)' _TESTS = [{ - # service_name = yoda + # service_name = yoda, only XML slides info 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', 'info_dict': { 'id': '38902413', @@ -19,12 +27,14 @@ class SlidesLiveIE(InfoExtractor): 'timestamp': 1648189972, 'upload_date': '20220325', 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:42', + 'chapters': 'count:41', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = yoda + # service_name = yoda, /v7/ slides 'url': 'https://slideslive.com/38935785', 'info_dict': { 'id': '38935785', @@ -32,13 +42,15 @@ class SlidesLiveIE(InfoExtractor): 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', 'upload_date': '20211115', 'timestamp': 1636996003, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:640', + 'chapters': 'count:639', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = yoda + # service_name = yoda, /v1/ slides 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics', 'info_dict': { 'id': '38973182', @@ -47,12 +59,14 @@ class SlidesLiveIE(InfoExtractor): 'upload_date': '20220201', 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1643728135, + 'thumbnails': 'count:3', + 'chapters': 'count:2', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = youtube + # service_name = youtube, only XML slides info 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost', 'md5': '8a79b5e3d700837f40bd2afca3c8fa01', 'info_dict': { @@ -76,26 +90,253 @@ class SlidesLiveIE(InfoExtractor): 'comment_count': int, 'channel_follower_count': int, 'age_limit': 0, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:169', 'playable_in_embed': True, 'availability': 'unlisted', 'tags': [], 'categories': ['People & Blogs'], + 'chapters': 'count:168', }, }, { - # service_name = youtube + # embed-only presentation, only XML slides info + 'url': 'https://slideslive.com/embed/presentation/38925850', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # embed-only presentation, only JSON slides info, /v5/ slides (.png) + 'url': 'https://slideslive.com/38979920/', + 'info_dict': { + 'id': '38979920', + 'ext': 'mp4', + 'title': 'MoReL: Multi-omics Relational Learning', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1654714970, + 'upload_date': '20220608', + 'chapters': 'count:6', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v2/ slides (.jpg) + 'url': 'https://slideslive.com/38954074', + 'info_dict': { + 'id': '38954074', + 'ext': 'mp4', + 'title': 'Decentralized Attribution of Generative Models', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:16', + 'timestamp': 1622806321, + 'upload_date': '20210604', + 'chapters': 'count:15', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v4/ slides (.png) + 'url': 'https://slideslive.com/38979570/', + 'info_dict': { + 'id': '38979570', + 'ext': 'mp4', + 'title': 'Efficient Active Search for Combinatorial Optimization Problems', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:9', + 'timestamp': 1654714896, + 'upload_date': '20220608', + 'chapters': 'count:8', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v10/ slides + 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F', + 'info_dict': { + 'id': '38979880', + 'ext': 'mp4', + 'title': 'The Representation Power of Neural Networks', + 'timestamp': 1654714962, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:22', + 'upload_date': '20220608', + 'chapters': 'count:21', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v7/ slides, 2 video slides + 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com', + 'playlist_count': 3, + 'info_dict': { + 'id': '38979682-playlist', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979682', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + 'timestamp': 1654714920, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'upload_date': '20220608', + 'chapters': 'count:31', + }, + }, { + 'info_dict': { + 'id': '38979682-021', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021', + 'duration': 3, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }, { + 'info_dict': { + 'id': '38979682-024', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024', + 'duration': 4, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v6/ slides, 1 video slide, edit.videoken.com embed + 'url': 'https://slideslive.com/38979481/', + 'playlist_count': 2, + 'info_dict': { + 'id': '38979481-playlist', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979481', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + 'timestamp': 1654714877, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:43', + 'upload_date': '20220608', + 'chapters': 'count:43', + }, + }, { + 'info_dict': { + 'id': '38979481-013', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013', + 'duration': 3, + 'timestamp': 1654714877, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v3/ slides, .jpg and .png, service_name = youtube + 'url': 'https://slideslive.com/embed/38932460/', + 'info_dict': { + 'id': 'RTPdrgkyTiE', + 'display_id': '38932460', + 'ext': 'mp4', + 'title': 'Active Learning for Hierarchical Multi-Label Classification', + 'description': 'Watch full version of this video at https://slideslive.com/38932460.', + 'channel': 'SlidesLive Videos - A', + 'channel_id': 'UC62SdArr41t_-_fX40QCLRw', + 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 'uploader': 'SlidesLive Videos - A', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20200903', + 'timestamp': 1602599092, + 'duration': 942, + 'age_limit': 0, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'unlisted', + 'categories': ['People & Blogs'], + 'tags': [], + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)', + 'thumbnails': 'count:21', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = yoda 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', 'only_matching': True, }, { - # service_name = url + # dead link, service_name = url 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', 'only_matching': True, }, { - # service_name = vimeo + # dead link, service_name = vimeo 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + # only XML slides info + 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Reference: https://slideslive.com/embed_presentation.js + for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage): + url_parsed = urllib.parse.urlparse(url) + origin = f'{url_parsed.scheme}://{url_parsed.netloc}' + yield update_url_query( + f'https://slideslive.com/embed/presentation/{embed_id}', { + 'embed_parent_url': url, + 'embed_container_origin': origin, + }) + + def _download_embed_webpage_handle(self, video_id, headers): + return self._download_webpage_handle( + f'https://slideslive.com/embed/presentation/{video_id}', video_id, + headers=headers, query=traverse_obj(headers, { + 'embed_parent_url': 'Referer', + 'embed_container_origin': 'Origin', + })) + def _extract_custom_m3u8_info(self, m3u8_data): m3u8_dict = {} @@ -108,6 +349,8 @@ def _extract_custom_m3u8_info(self, m3u8_data): 'VOD-VIDEO-ID': 'service_id', 'VOD-VIDEO-SERVERS': 'video_servers', 'VOD-SUBTITLES': 'subtitles', + 'VOD-SLIDES-JSON-URL': 'slides_json_url', + 'VOD-SLIDES-XML-URL': 'slides_xml_url', } for line in m3u8_data.splitlines(): @@ -126,9 +369,33 @@ def _extract_custom_m3u8_info(self, m3u8_data): return m3u8_dict + def _extract_formats(self, cdn_hostname, path, video_id): + formats = [] + formats.extend(self._extract_m3u8_formats( + f'https://{cdn_hostname}/{path}/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)) + formats.extend(self._extract_mpd_formats( + f'https://{cdn_hostname}/{path}/master.mpd', + video_id, mpd_id='dash', fatal=False)) + return formats + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage, urlh = self._download_embed_webpage_handle( + video_id, headers=traverse_obj(parse_qs(url), { + 'Referer': ('embed_parent_url', -1), + 'Origin': ('embed_container_origin', -1)})) + redirect_url = urlh.geturl() + if 'domain_not_allowed' in redirect_url: + domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False) + if not domain: + raise ExtractorError( + 'This is an embed-only presentation. Try passing --referer', expected=True) + webpage, _ = self._download_embed_webpage_handle(video_id, headers={ + 'Referer': f'https://{domain}/', + 'Origin': f'https://{domain}', + }) + player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token') player_data = self._download_webpage( f'https://ben.slideslive.com/player/{video_id}', video_id, @@ -139,6 +406,50 @@ def _real_extract(self, url): assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = player_info['service_id'] + slides_info_url = None + slides, slides_info = [], [] + if player_info.get('slides_json_url'): + slides_info_url = player_info['slides_json_url'] + slides = traverse_obj(self._download_json( + slides_info_url, video_id, fatal=False, + note='Downloading slides JSON', errnote=False), 'slides', expected_type=list) or [] + for slide_id, slide in enumerate(slides, start=1): + slides_info.append(( + slide_id, traverse_obj(slide, ('image', 'name')), + int_or_none(slide.get('time'), scale=1000))) + + if not slides and player_info.get('slides_xml_url'): + slides_info_url = player_info['slides_xml_url'] + slides = self._download_xml( + slides_info_url, video_id, fatal=False, + note='Downloading slides XML', errnote='Failed to download slides info') + for slide_id, slide in enumerate(slides.findall('./slide'), start=1): + slides_info.append(( + slide_id, xpath_text(slide, './slideName', 'name'), + int_or_none(xpath_text(slide, './timeSec', 'time')))) + + slides_version = int(self._search_regex( + r'https?://slides\.slideslive\.com/\d+/v(\d+)/\w+\.(?:json|xml)', + slides_info_url, 'slides version', default=0)) + if slides_version < 4: + slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s.jpg' + else: + slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s.png' + + chapters, thumbnails = [], [] + if url_or_none(player_info.get('thumbnail')): + thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']}) + for slide_id, slide_path, start_time in slides_info: + if slide_path: + thumbnails.append({ + 'id': f'{slide_id:03d}', + 'url': slide_url_template % (video_id, slide_path), + }) + chapters.append({ + 'title': f'Slide {slide_id:03d}', + 'start_time': start_time, + }) + subtitles = {} for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict): webvtt_url = url_or_none(sub.get('webvtt_url')) @@ -154,25 +465,15 @@ def _real_extract(self, url): 'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''), 'timestamp': unified_timestamp(player_info.get('timestamp')), 'is_live': player_info.get('playlist_type') != 'vod', - 'thumbnail': url_or_none(player_info.get('thumbnail')), + 'thumbnails': thumbnails, + 'chapters': chapters, 'subtitles': subtitles, } - if service_name in ('url', 'yoda'): - if service_name == 'url': - info['url'] = service_id - else: - cdn_hostname = player_info['video_servers'][0] - formats = [] - formats.extend(self._extract_m3u8_formats( - f'https://{cdn_hostname}/{service_id}/master.m3u8', - video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)) - formats.extend(self._extract_mpd_formats( - f'https://{cdn_hostname}/{service_id}/master.mpd', - video_id, mpd_id='dash', fatal=False)) - info.update({ - 'formats': formats, - }) + if service_name == 'url': + info['url'] = service_id + elif service_name == 'yoda': + info['formats'] = self._extract_formats(player_info['video_servers'][0], service_id, video_id) else: info.update({ '_type': 'url_transparent', @@ -185,4 +486,37 @@ def _real_extract(self, url): f'https://player.vimeo.com/video/{service_id}', {'http_headers': {'Referer': url}}) - return info + video_slides = traverse_obj(slides, (..., 'video', 'id')) + if not video_slides: + return info + + def entries(): + yield info + + service_data = self._download_json( + f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data', + video_id, fatal=False, query={ + 'player_token': player_token, + 'videos': ','.join(video_slides), + }, note='Downloading video slides info', errnote='Failed to download video slides info') or {} + + for slide_id, slide in enumerate(slides, 1): + if not traverse_obj(slide, ('video', 'service')) == 'yoda': + continue + video_path = traverse_obj(slide, ('video', 'id')) + cdn_hostname = traverse_obj(service_data, ( + video_path, 'video_servers', ...), get_all=False) + if not cdn_hostname or not video_path: + continue + formats = self._extract_formats(cdn_hostname, video_path, video_id) + if not formats: + continue + yield { + 'id': f'{video_id}-{slide_id:03d}', + 'title': f'{info["title"]} - Slide {slide_id:03d}', + 'timestamp': info['timestamp'], + 'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000), + 'formats': formats, + } + + return self.playlist_result(entries(), f'{video_id}-playlist', info['title']) From 4b183d49620e564219c01714ca8639199f6b1cc0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 14:29:08 +0000 Subject: [PATCH 24/84] [extractor/videoken] Add extractors (#5824) Closes #5818 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/videoken.py | 336 ++++++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 yt_dlp/extractor/videoken.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1b76d82643..e51228afff 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2097,6 +2097,13 @@ ) from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videoken import ( + VideoKenIE, + VideoKenPlayerIE, + VideoKenPlaylistIE, + VideoKenCategoryIE, + VideoKenTopicIE, +) from .videomore import ( VideomoreIE, VideomoreVideoIE, diff --git a/yt_dlp/extractor/videoken.py b/yt_dlp/extractor/videoken.py new file mode 100644 index 0000000000..560b41a6d7 --- /dev/null +++ b/yt_dlp/extractor/videoken.py @@ -0,0 +1,336 @@ +import base64 +import functools +import math +import re +import time +import urllib.parse + +from .common import InfoExtractor +from .slideslive import SlidesLiveIE +from ..utils import ( + ExtractorError, + InAdvancePagedList, + int_or_none, + traverse_obj, + update_url_query, + url_or_none, +) + + +class VideoKenBaseIE(InfoExtractor): + _ORGANIZATIONS = { + 'videos.icts.res.in': 'icts', + 'videos.cncf.io': 'cncf', + 'videos.neurips.cc': 'neurips', + } + _BASE_URL_RE = rf'https?://(?P{"|".join(map(re.escape, _ORGANIZATIONS))})/' + + _PAGE_SIZE = 12 + + def _get_org_id_and_api_key(self, org, video_id): + details = self._download_json( + f'https://analytics.videoken.com/api/videolake/{org}/details', video_id, + note='Downloading organization ID and API key', headers={ + 'Accept': 'application/json', + }) + return details['id'], details['apikey'] + + def _create_slideslive_url(self, video_url, video_id, referer): + if not video_url and not video_id: + return + elif not video_url or 'embed/sign-in' in video_url: + video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}' + if url_or_none(referer): + return update_url_query(video_url, { + 'embed_parent_url': referer, + 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}', + }) + return video_url + + def _extract_videos(self, videos, url): + for video in traverse_obj(videos, (('videos', 'results'), ...)): + video_id = traverse_obj(video, 'youtube_id', 'videoid') + if not video_id: + continue + ie_key = None + if traverse_obj(video, 'type', 'source') == 'youtube': + video_url = video_id + ie_key = 'Youtube' + else: + video_url = traverse_obj(video, 'embed_url', 'embeddableurl') + if urllib.parse.urlparse(video_url).netloc == 'slideslive.com': + ie_key = SlidesLiveIE + video_url = self._create_slideslive_url(video_url, video_id, url) + if not video_url: + continue + yield self.url_result(video_url, ie_key, video_id) + + +class VideoKenIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P[\w-]+)' + _TESTS = [{ + # neurips -> videoken -> slideslive + 'url': 'https://videos.neurips.cc/video/slideslive-38922815', + 'info_dict': { + 'id': '38922815', + 'ext': 'mp4', + 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures', + 'timestamp': 1630939331, + 'upload_date': '20210906', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:330', + 'chapters': 'count:329', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # neurips -> videoken -> slideslive -> youtube + 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348', + 'info_dict': { + 'id': '2Xa_dt78rJE', + 'ext': 'mp4', + 'display_id': '38923348', + 'title': 'Machine Education', + 'description': 'Watch full version of this video at https://slideslive.com/38923348.', + 'channel': 'SlidesLive Videos - G2', + 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'uploader': 'SlidesLive Videos - G2', + 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'duration': 2504, + 'timestamp': 1618922125, + 'upload_date': '20200131', + 'age_limit': 0, + 'channel_follower_count': int, + 'view_count': int, + 'availability': 'unlisted', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'categories': ['People & Blogs'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:78', + 'chapters': 'count:77', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # icts -> videoken -> youtube + 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc', + 'info_dict': { + 'id': 'zysIsojYdvc', + 'ext': 'mp4', + 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad', + 'description': 'md5:87433069d79719eeadc1962cc2ace00b', + 'channel': 'International Centre for Theoretical Sciences', + 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ', + 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ', + 'uploader': 'International Centre for Theoretical Sciences', + 'uploader_id': 'ICTStalks', + 'uploader_url': 'http://www.youtube.com/user/ICTStalks', + 'duration': 3372, + 'upload_date': '20191004', + 'age_limit': 0, + 'live_status': 'not_live', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'categories': ['Science & Technology'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:42', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8', + 'only_matching': True, + }, { + 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI', + 'only_matching': True, + }, { + 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU', + 'only_matching': True, + }] + + def _real_extract(self, url): + hostname, video_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id) + details = self._download_json( + 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={ + 'videoid': video_id, + 'org_id': org_id, + }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON', + errnote='Failed to download VideoKen API JSON', fatal=False) + if details: + return next(self._extract_videos({'videos': [details]}, url)) + # fallback for API error 400 response + elif video_id.startswith('slideslive-'): + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + elif re.match(r'^[\w-]{11}$', video_id): + self.url_result(video_id, 'Youtube', video_id) + else: + raise ExtractorError('Unable to extract without VideoKen API response') + + +class VideoKenPlayerIE(VideoKenBaseIE): + _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P\d+)' + _TESTS = [{ + 'url': 'https://player.videoken.com/embed/slideslive-38968434', + 'info_dict': { + 'id': '38968434', + 'ext': 'mp4', + 'title': 'Deep Learning with Label Differential Privacy', + 'timestamp': 1643377020, + 'upload_date': '20220128', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'chapters': 'count:29', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + + +class VideoKenPlaylistIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P\d+)' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/playlist/381', + 'playlist_mincount': 117, + 'info_dict': { + 'id': '381', + 'title': 'Cosmology - The Next Decade', + }, + }] + + def _real_extract(self, url): + hostname, playlist_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id) + videos = self._download_json( + f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/', + playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON') + return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title')) + + +class VideoKenCategoryIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P\d+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': '1822', + 'title': 'Programs', + }, + }, { + 'url': 'https://videos.neurips.cc/category/350/', + 'playlist_mincount': 34, + 'info_dict': { + 'id': '350', + 'title': 'NeurIPS 2018', + }, + }, { + 'url': 'https://videos.cncf.io/category/479/', + 'playlist_mincount': 328, + 'info_dict': { + 'id': '479', + 'title': 'KubeCon + CloudNativeCon Europe\'19', + }, + }] + + def _get_category_page(self, category_id, org_id, page=1, note=None): + return self._download_json( + f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id, + fatal=False, note=note if note else f'Downloading category page {page}', + query={ + 'category_id': category_id, + 'page_number': page, + 'length': self._PAGE_SIZE, + }, headers={'Accept': 'application/json'}) or {} + + def _entries(self, category_id, org_id, url, page): + videos = self._get_category_page(category_id, org_id, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, category_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id) + category_info = self._get_category_page(category_id, org_id, note='Downloading category info') + category = category_info['category_name'] + total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, category_id, org_id, url), + total_pages, self._PAGE_SIZE), category_id, category) + + +class VideoKenTopicIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P[^/#?]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.neurips.cc/topic/machine%20learning/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': 'machine_learning', + 'title': 'machine learning', + }, + }, { + 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/', + 'playlist_mincount': 77, + 'info_dict': { + 'id': 'gravitational_waves', + 'title': 'gravitational waves' + }, + }, { + 'url': 'https://videos.cncf.io/topic/prometheus/', + 'playlist_mincount': 134, + 'info_dict': { + 'id': 'prometheus', + 'title': 'prometheus', + }, + }] + + def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None): + return self._download_json( + 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={ + 'orgid': org_id, + 'size': self._PAGE_SIZE, + 'query': topic, + 'page': page, + 'sort': 'upload_desc', + 'filter': 'all', + 'token': api_key, + 'is_topic': 'true', + 'category': '', + 'searchid': search_id, + }, headers={'Accept': 'application/json'}, + note=note if note else f'Downloading topic page {page}') or {} + + def _entries(self, topic, org_id, search_id, api_key, url, page): + videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, topic_id = self._match_valid_url(url).group('host', 'id') + topic = urllib.parse.unquote(topic_id) + topic_id = topic.replace(' ', '_') + org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic) + search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode() + total_pages = int_or_none(self._get_topic_page( + topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages']) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, topic, org_id, search_id, api_key, url), + total_pages, self._PAGE_SIZE), topic_id, topic) From 53006b35ea8b26ff31a96a423ddaa3304d0a124e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 15:04:09 +0000 Subject: [PATCH 25/84] [extractor/amazon] Add `AmazonReviews` extractor (#5857) Closes #5766 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/amazon.py | 116 ++++++++++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e51228afff..4fed24c35b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -87,7 +87,10 @@ AluraCourseIE ) from .amcnetworks import AMCNetworksIE -from .amazon import AmazonStoreIE +from .amazon import ( + AmazonStoreIE, + AmazonReviewsIE, +) from .amazonminitv import ( AmazonMiniTVIE, AmazonMiniTVSeasonIE, diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py index 4d3170683a..a03f983e0e 100644 --- a/yt_dlp/extractor/amazon.py +++ b/yt_dlp/extractor/amazon.py @@ -1,5 +1,17 @@ +import re + from .common import InfoExtractor -from ..utils import ExtractorError, int_or_none +from ..utils import ( + ExtractorError, + clean_html, + float_or_none, + get_element_by_attribute, + get_element_by_class, + int_or_none, + js_to_json, + traverse_obj, + url_or_none, +) class AmazonStoreIE(InfoExtractor): @@ -9,7 +21,7 @@ class AmazonStoreIE(InfoExtractor): 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', + 'title': str, }, 'playlist_mincount': 1, 'playlist': [{ @@ -20,28 +32,32 @@ class AmazonStoreIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 34, }, - }] + }], + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:d1d3352428f8f015706c84b31e132169', + 'title': str, }, 'playlist_mincount': 4, + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + 'title': str, }, 'playlist-mincount': 1, + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', 'info_dict': { 'id': 'B08WX337PQ', - 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + 'title': str, }, 'playlist_mincount': 1, + 'expected_warnings': ['Unable to extract data'], }] def _real_extract(self, url): @@ -52,7 +68,7 @@ def _real_extract(self, url): try: data_json = self._search_json( r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, - transform_source=lambda x: x.replace(R'\\u', R'\u')) + transform_source=js_to_json) except ExtractorError as e: retry.error = e @@ -66,3 +82,89 @@ def _real_extract(self, url): 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) + + +class AmazonReviewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P[^/&#$?]+)' + _TESTS = [{ + 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl', + 'info_dict': { + 'id': 'R10VE9VUSY19L3', + 'ext': 'mp4', + 'title': 'Get squad #Suspicious', + 'description': 'md5:7012695052f440a1e064e402d87e0afb', + 'uploader': 'Kimberly Cronkright', + 'average_rating': 1.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }, { + 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US', + 'info_dict': { + 'id': 'R10VE9VUSY19L3', + 'ext': 'mp4', + 'title': 'Get squad #Suspicious', + 'description': 'md5:7012695052f440a1e064e402d87e0afb', + 'uploader': 'Kimberly Cronkright', + 'average_rating': 1.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }, { + 'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/', + 'info_dict': { + 'id': 'RV1CO8JN5VGXV', + 'ext': 'mp4', + 'title': 'Not sure about its durability', + 'description': 'md5:1a252c106357f0a3109ebf37d2e87494', + 'uploader': 'Shoaib Gulzar', + 'average_rating': 2.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + for retry in self.RetryManager(): + webpage = self._download_webpage(url, video_id) + review_body = get_element_by_attribute('data-hook', 'review-body', webpage) + if not review_body: + retry.error = ExtractorError('Review body was not found in webpage', expected=True) + + formats, subtitles = [], {} + + manifest_url = self._search_regex( + r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None) + if url_or_none(manifest_url): + fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + manifest_url, video_id, 'mp4', fatal=False) + formats.extend(fmts) + + video_url = self._search_regex( + r']+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None) + if url_or_none(video_url): + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'http-mp4', + }) + + if not formats: + self.raise_no_formats('No video found for this customer review', expected=True) + + return { + 'id': video_id, + 'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage)) + or self._html_extract_title(webpage)), + 'description': clean_html(traverse_obj(re.findall( + r'(.+?)', review_body), -1)), + 'uploader': clean_html(get_element_by_class('a-profile-name', webpage)), + 'average_rating': float_or_none(clean_html(get_element_by_attribute( + 'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]), + 'thumbnail': self._search_regex( + r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None), + 'formats': formats, + 'subtitles': subtitles, + } From 2647c933b8ed22f95dd8e9866c4db031867a1bc8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 16:32:54 +0000 Subject: [PATCH 26/84] [extractor/wistia] Improve extension detection (#5415) Closes #5053 Authored by: bashonly, Grub4k, pukkandan --- yt_dlp/extractor/wistia.py | 41 ++++++++----- yt_dlp/utils.py | 122 +++++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 59 deletions(-) diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 38dcc2f5b5..884fa4b5fd 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -6,12 +6,15 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + HEADRequest, + determine_ext, float_or_none, int_or_none, parse_qs, traverse_obj, try_get, update_url_query, + urlhandle_detect_ext, ) @@ -34,6 +37,16 @@ def _download_embed_config(self, config_type, config_id, referer): return embed_config + def _get_real_ext(self, url): + ext = determine_ext(url, default_ext='bin') + if ext == 'bin': + urlh = self._request_webpage( + HEADRequest(url), None, note='Checking media extension', + errnote='HEAD request returned error', fatal=False) + if urlh: + ext = urlhandle_detect_ext(urlh, default='bin') + return 'mp4' if ext == 'mov' else ext + def _extract_media(self, embed_config): data = embed_config['media'] video_id = data['hashedId'] @@ -51,13 +64,13 @@ def _extract_media(self, embed_config): continue elif atype in ('still', 'still_image'): thumbnails.append({ - 'url': aurl, + 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'), 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), 'filesize': int_or_none(a.get('size')), }) else: - aext = a.get('ext') + aext = a.get('ext') or self._get_real_ext(aurl) display_name = a.get('display_name') format_id = atype if atype and atype.endswith('_video') and display_name: @@ -169,26 +182,26 @@ class WistiaIE(WistiaBaseIE): 'md5': '10c1ce9c4dde638202513ed17a3767bd', 'info_dict': { 'id': 'a6ndpko1wg', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'Episode 2: Boxed Water\'s retention is thirsty', 'upload_date': '20210324', 'description': 'md5:da5994c2c2d254833b412469d9666b7a', 'duration': 966.0, 'timestamp': 1616614369, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png', } }, { 'url': 'wistia:5vd7p4bct5', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$', }, }, { 'url': 'wistia:sh7fpupwlt', @@ -208,25 +221,25 @@ class WistiaIE(WistiaBaseIE): 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool', 'info_dict': { 'id': 'cqwukac3z1', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content', 'duration': 158.125, 'timestamp': 1618974400, 'description': 'md5:27abc99a758573560be72600ef95cece', 'upload_date': '20210421', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg', } }, { 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg', 'description': 'a Paywall Videos video', }, }] @@ -302,9 +315,9 @@ class WistiaChannelIE(WistiaBaseIE): 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', 'info_dict': { 'id': 'sp5dqjzw3n', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'The Roof S2: The Modern CRO', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png', 'duration': 86.487, 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n', 'timestamp': 1619790290, @@ -334,12 +347,12 @@ class WistiaChannelIE(WistiaBaseIE): 'info_dict': { 'id': 'pz0m0l0if3', 'title': 'A Framework for Improving Product Team Performance', - 'ext': 'bin', + 'ext': 'mp4', 'timestamp': 1653935275, 'upload_date': '20220530', 'description': 'Learn how to help your company improve and achieve your product related goals.', 'duration': 1854.39, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png', }, 'params': {'noplaylist': True, 'skip_download': True}, }] diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 65408bf19b..3947dcf2e5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3480,67 +3480,93 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' -def mimetype2ext(mt): - if mt is None: +def mimetype2ext(mt, default=NO_DEFAULT): + if not isinstance(mt, str): + if default is not NO_DEFAULT: + return default return None - mt, _, params = mt.partition(';') - mt = mt.strip() - - FULL_MAP = { - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as - # it's the most popular one - 'audio/mpeg': 'mp3', - 'audio/x-wav': 'wav', - 'audio/wav': 'wav', - 'audio/wave': 'wav', - } - - ext = FULL_MAP.get(mt) - if ext is not None: - return ext - - SUBTYPE_MAP = { + MAP = { + # video '3gpp': '3gp', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-flv': 'flv', - 'x-mp4-fragmented': 'mp4', - 'x-ms-sami': 'sami', - 'x-ms-wmv': 'wmv', + 'mp2t': 'ts', + 'mp4': 'mp4', + 'mpeg': 'mpeg', 'mpegurl': 'm3u8', - 'x-mpegurl': 'm3u8', - 'vnd.apple.mpegurl': 'm3u8', + 'quicktime': 'mov', + 'webm': 'webm', + 'vp9': 'vp9', + 'x-flv': 'flv', + 'x-m4v': 'm4v', + 'x-matroska': 'mkv', + 'x-mng': 'mng', + 'x-mp4-fragmented': 'mp4', + 'x-ms-asf': 'asf', + 'x-ms-wmv': 'wmv', + 'x-msvideo': 'avi', + + # application (streaming playlists) 'dash+xml': 'mpd', 'f4m+xml': 'f4m', 'hds+xml': 'f4m', + 'vnd.apple.mpegurl': 'm3u8', 'vnd.ms-sstr+xml': 'ism', - 'quicktime': 'mov', - 'mp2t': 'ts', + 'x-mpegurl': 'm3u8', + + # audio + 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. + # Using .mp3 as it's the most popular one + 'audio/mpeg': 'mp3', + 'audio/webm': 'weba', + 'audio/x-matroska': 'mka', + 'audio/x-mpegurl': 'm3u', + 'midi': 'mid', + 'ogg': 'ogg', + 'wav': 'wav', + 'wave': 'wav', + 'x-aac': 'aac', + 'x-flac': 'flac', + 'x-m4a': 'm4a', + 'x-realaudio': 'ra', 'x-wav': 'wav', - 'filmstrip+json': 'fs', + + # image + 'avif': 'avif', + 'bmp': 'bmp', + 'gif': 'gif', + 'jpeg': 'jpg', + 'png': 'png', 'svg+xml': 'svg', - } + 'tiff': 'tif', + 'vnd.wap.wbmp': 'wbmp', + 'webp': 'webp', + 'x-icon': 'ico', + 'x-jng': 'jng', + 'x-ms-bmp': 'bmp', - _, _, subtype = mt.rpartition('/') - ext = SUBTYPE_MAP.get(subtype.lower()) - if ext is not None: - return ext + # caption + 'filmstrip+json': 'fs', + 'smptett+xml': 'tt', + 'ttaf+xml': 'dfxp', + 'ttml+xml': 'ttml', + 'x-ms-sami': 'sami', - SUFFIX_MAP = { + # misc + 'gzip': 'gz', 'json': 'json', 'xml': 'xml', 'zip': 'zip', - 'gzip': 'gz', } - _, _, suffix = subtype.partition('+') - ext = SUFFIX_MAP.get(suffix) - if ext is not None: - return ext + mimetype = mt.partition(';')[0].strip().lower() + _, _, subtype = mimetype.rpartition('/') + ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + if ext: + return ext + elif default is not NO_DEFAULT: + return default return subtype.replace('+', '.') @@ -3634,7 +3660,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): return 'mkv' if allow_mkv else preferences[-1] -def urlhandle_detect_ext(url_handle): +def urlhandle_detect_ext(url_handle, default=NO_DEFAULT): getheader = url_handle.headers.get cd = getheader('Content-Disposition') @@ -3645,7 +3671,13 @@ def urlhandle_detect_ext(url_handle): if e: return e - return mimetype2ext(getheader('Content-Type')) + meta_ext = getheader('x-amz-meta-name') + if meta_ext: + e = meta_ext.rpartition('.')[2] + if e: + return e + + return mimetype2ext(getheader('Content-Type'), default=default) def encode_data_uri(data, mime_type): From c1edb853b0a0cc69ea08337c0c5aee669b26d3d2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 17:31:01 +0000 Subject: [PATCH 27/84] [extractor/kick] Add extractor (#5736) Closes #5722 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/kick.py | 127 ++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 yt_dlp/extractor/kick.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4fed24c35b..a2b92b85ae 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -844,6 +844,10 @@ KhanAcademyIE, KhanAcademyUnitIE, ) +from .kick import ( + KickIE, + KickVODIE, +) from .kicker import KickerIE from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py new file mode 100644 index 0000000000..a79ffb7a98 --- /dev/null +++ b/yt_dlp/extractor/kick.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor + +from ..utils import ( + HEADRequest, + UserNotLive, + float_or_none, + merge_dicts, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class KickBaseIE(InfoExtractor): + def _real_initialize(self): + self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session') + xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') + if not xsrf_token: + self.write_debug('kick.com did not set XSRF-TOKEN cookie') + KickBaseIE._API_HEADERS = { + 'Authorization': f'Bearer {xsrf_token.value}', + 'X-XSRF-TOKEN': xsrf_token.value, + } if xsrf_token else {} + + def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): + return self._download_json( + f'https://kick.com/api/v1/{path}', display_id, note=note, + headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + + +class KickIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w_]+)' + _TESTS = [{ + 'url': 'https://kick.com/yuppy', + 'info_dict': { + 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'ext': 'mp4', + 'title': str, + 'description': str, + 'channel': 'yuppy', + 'channel_id': '33538', + 'uploader': 'Yuppy', + 'uploader_id': '33793', + 'upload_date': str, + 'live_status': 'is_live', + 'timestamp': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': list, + }, + 'skip': 'livestream', + }, { + 'url': 'https://kick.com/kmack710', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + response = self._call_api(f'channels/{channel}', channel) + if not traverse_obj(response, 'livestream', expected_type=dict): + raise UserNotLive(video_id=channel) + + return { + 'id': str(traverse_obj( + response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), + 'formats': self._extract_m3u8_formats( + response['playback_url'], channel, 'mp4', live=True), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('user', 'bio')), + 'channel': channel, + 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), + 'uploader': traverse_obj(response, 'name', ('user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), + 'is_live': True, + 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + } + + +class KickVODIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', + 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'info_dict': { + 'id': '54244b5e-050a-4df4-a013-b2433dafbe35', + 'ext': 'mp4', + 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', + 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', + 'channel': 'kmack710', + 'channel_id': '16278', + 'uploader': 'Kmack710', + 'uploader_id': '16412', + 'upload_date': '20221206', + 'timestamp': 1670318289, + 'duration': 40104.0, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': ['Grand Theft Auto V'], + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + response = self._call_api(f'video/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), + 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), + 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), + 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), + 'timestamp': unified_timestamp(response.get('created_at')), + 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + } From ca2f6e14e65f0faf92cabff8b7e5b4760363c52e Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Fri, 30 Dec 2022 03:01:22 +0900 Subject: [PATCH 28/84] [extractor/BiliLive] Fix extractor - Remove unnecessary group in `_VALID_URL` - This extractor always returns livestreams --- yt_dlp/extractor/bilibili.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 616a549607..37711c138a 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1034,7 +1034,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(blanc/)?(?P\d+)' + _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', @@ -1114,6 +1114,7 @@ def _real_extract(self, url): 'thumbnail': room_data.get('user_cover'), 'timestamp': stream_data.get('live_time'), 'formats': formats, + 'is_live': True, 'http_headers': { 'Referer': url, }, From e107c2b8cf8d6f3506d07bc64fc243682ee49b1e Mon Sep 17 00:00:00 2001 From: nosoop Date: Thu, 29 Dec 2022 10:46:43 -0800 Subject: [PATCH 29/84] [extractor/soundcloud] Support user permalink (#5842) Closes #5841 Authored by: nosoop --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/soundcloud.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a2b92b85ae..352de83cac 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1710,6 +1710,7 @@ SoundcloudSetIE, SoundcloudRelatedIE, SoundcloudUserIE, + SoundcloudUserPermalinkIE, SoundcloudTrackStationIE, SoundcloudPlaylistIE, SoundcloudSearchIE, diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 4879d48c80..979f23f44f 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -782,6 +782,27 @@ def _real_extract(self, url): '%s (%s)' % (user['username'], resource.capitalize())) +class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P\d+)' + IE_NAME = 'soundcloud:user:permalink' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/users/30909869', + 'info_dict': { + 'id': '30909869', + 'title': 'neilcic', + }, + 'playlist_mincount': 23, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + user = self._download_json( + self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS) + + return self._extract_playlist( + f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username')) + + class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P[^/?#&]+)' IE_NAME = 'soundcloud:trackstation' From efa944f4bc892321a0d01dcddb210405761ecada Mon Sep 17 00:00:00 2001 From: Anant Murmu Date: Fri, 30 Dec 2022 08:13:49 +0530 Subject: [PATCH 30/84] [cleanup] Use `random.choices` (#5800) Authored by: freezboltz --- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/adn.py | 2 +- yt_dlp/extractor/discovery.py | 2 +- yt_dlp/extractor/funimation.py | 2 +- yt_dlp/extractor/linuxacademy.py | 5 ++--- yt_dlp/extractor/tencent.py | 4 ++-- yt_dlp/extractor/tiktok.py | 10 +++++----- yt_dlp/extractor/videa.py | 2 +- yt_dlp/extractor/viu.py | 2 +- yt_dlp/extractor/vrv.py | 2 +- yt_dlp/extractor/youku.py | 4 ++-- 11 files changed, 18 insertions(+), 19 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index abb0ddfe52..17f37a6432 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1068,7 +1068,7 @@ def _outtmpl_expandpath(outtmpl): # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. - sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) + sep = ''.join(random.choices(ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index e0c18c8773..f1f55e87fc 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -168,7 +168,7 @@ def _real_extract(self, url): }, data=b'')['token'] links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + self._K = ''.join(random.choices('0123456789abcdef', k=16)) message = bytes_to_intlist(json.dumps({ 'k': self._K, 't': token, diff --git a/yt_dlp/extractor/discovery.py b/yt_dlp/extractor/discovery.py index fd3fc8fb0f..e6e109d5c5 100644 --- a/yt_dlp/extractor/discovery.py +++ b/yt_dlp/extractor/discovery.py @@ -78,7 +78,7 @@ def _real_extract(self, url): 'Downloading token JSON metadata', query={ 'authRel': 'authorization', 'client_id': '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'nonce': ''.join(random.choices(string.ascii_letters, k=32)), 'redirectUri': 'https://www.discovery.com/', })['access_token'] diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 18363c1b91..47c316664a 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -210,7 +210,7 @@ def _real_extract(self, url): page = self._download_json( 'https://www.funimation.com/api/showexperience/%s/' % experience_id, display_id, headers=headers, expected_status=403, query={ - 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + 'pinst_id': ''.join(random.choices(string.digits + string.ascii_letters, k=8)), }, note=f'Downloading {format_name} JSON') sources = page.get('items') or [] if not sources: diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py index a570248b7a..7bb64e17c4 100644 --- a/yt_dlp/extractor/linuxacademy.py +++ b/yt_dlp/extractor/linuxacademy.py @@ -75,9 +75,8 @@ class LinuxAcademyIE(InfoExtractor): def _perform_login(self, username, password): def random_string(): - return ''.join([ - random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') - for _ in range(32)]) + return ''.join(random.choices( + '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32)) webpage, urlh = self._download_webpage_handle( self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py index ff8bf991ef..44cae04720 100644 --- a/yt_dlp/extractor/tencent.py +++ b/yt_dlp/extractor/tencent.py @@ -32,7 +32,7 @@ def _get_ckey(self, video_id, url, guid): padding_mode='whitespace').hex().upper() def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): - guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)]) + guid = ''.join(random.choices(string.digits + string.ascii_lowercase, k=16)) ckey = self._get_ckey(video_id, video_url, guid) query = { 'vid': video_id, @@ -55,7 +55,7 @@ def _get_video_api_response(self, video_url, video_id, series_id, subtitle_forma 'platform': self._PLATFORM, # For VQQ 'guid': guid, - 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)), + 'flowid': ''.join(random.choices(string.digits + string.ascii_lowercase, k=32)), } return self._search_json(r'QZOutputJson=', self._download_webpage( diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 2dd4510cc3..709d944dc6 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -49,7 +49,7 @@ def _get_sigi_state(self, webpage, display_id): def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) if webpage_cookies.get('sid_tt'): self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value) @@ -68,8 +68,8 @@ def _build_api_query(self, query, app_version, manifest_app_version): 'build_number': app_version, 'manifest_version_code': manifest_app_version, 'update_version_code': manifest_app_version, - 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)), - 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]), + 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), + 'uuid': ''.join(random.choices(string.digits, k=16)), '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', @@ -638,7 +638,7 @@ def _video_entries_api(self, webpage, user_id, username): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. } for page in itertools.count(1): @@ -686,7 +686,7 @@ def _entries(self, list_id, display_id): 'cursor': 0, 'count': 20, 'type': 5, - 'device_id': ''.join(random.choice(string.digits) for i in range(19)) + 'device_id': ''.join(random.choices(string.digits, k=19)) } for page in itertools.count(1): diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 52fa8fcec2..59ae933b08 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -119,7 +119,7 @@ def _real_extract(self, url): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] query = parse_qs(player_url) - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + random_seed = ''.join(random.choices(string.ascii_letters + string.digits, k=8)) query['_s'] = random_seed query['_t'] = result[:16] diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 19d48234e4..dd4cad7ba8 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -251,7 +251,7 @@ def _login(self, country_code, video_id): return self._user_token def _get_token(self, country_code, video_id): - rand = ''.join(random.choice('0123456789') for _ in range(10)) + rand = ''.join(random.choices('0123456789', k=10)) return self._download_json( f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id, headers={'Content-Type': 'application/json'}, note='Getting bearer token', diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 89fa7affc2..ad9dc568a6 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -30,7 +30,7 @@ def _call_api(self, path, video_id, note, data=None): base_url = self._API_DOMAIN + '/core/' + path query = [ ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), - ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))), ('oauth_signature_method', 'HMAC-SHA1'), ('oauth_timestamp', int(time.time())), ] diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index ab59200d79..404f196f46 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -129,8 +129,8 @@ class YoukuIE(InfoExtractor): @staticmethod def get_ysuid(): - return '%d%s' % (int(time.time()), ''.join([ - random.choice(string.ascii_letters) for i in range(3)])) + return '%d%s' % (int(time.time()), ''.join( + random.choices(string.ascii_letters, k=3))) def get_format_name(self, fm): _dict = { From 4455918e7f090ace0b0c2537bbfd364956eb66cb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 10:12:13 +0530 Subject: [PATCH 31/84] [extractor/stv] Detect DRM Closes #5320 --- yt_dlp/extractor/stv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index c879fb52eb..8b3e63538c 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -73,6 +73,8 @@ def _real_extract(self, url): }) programme = result.get('programme') or {} + if programme.get('drmEnabled'): + self.report_drm(video_id) return { '_type': 'url_transparent', From 119e40ef64b25f66a39246e87ce6c143cd34276d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:15:41 +0530 Subject: [PATCH 32/84] Add pre-processor stage `video` Related: #456, #5808 --- README.md | 44 +++++++++++++++++++------------------ yt_dlp/YoutubeDL.py | 17 +++++++++------ yt_dlp/options.py | 53 +++++++++++++++++++++------------------------ yt_dlp/utils.py | 2 +- 4 files changed, 59 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 440ed19348..d31fedb00e 100644 --- a/README.md +++ b/README.md @@ -725,7 +725,7 @@ ## Verbosity and Simulation Options: screen, optionally prefixed with when to print it, separated by a ":". Supported values of "WHEN" are the same as that of - --use-postprocessor, and "video" (default). + --use-postprocessor (default: video). Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. This option can be used multiple times @@ -979,18 +979,18 @@ ## Post-Processing Options: --ffmpeg-location PATH Location of the ffmpeg binary; either the path to the binary or its containing directory --exec [WHEN:]CMD Execute a command, optionally prefixed with - when to execute it (after_move if - unspecified), separated by a ":". Supported - values of "WHEN" are the same as that of - --use-postprocessor. Same syntax as the - output template can be used to pass any - field as arguments to the command. After - download, an additional field "filepath" - that contains the final path of the - downloaded file is also available, and if no - fields are passed, %(filepath)q is appended - to the end of the command. This option can - be used multiple times + when to execute it, separated by a ":". + Supported values of "WHEN" are the same as + that of --use-postprocessor (default: + after_move). Same syntax as the output + template can be used to pass any field as + arguments to the command. After download, an + additional field "filepath" that contains + the final path of the downloaded file is + also available, and if no fields are passed, + %(filepath)q is appended to the end of the + command. This option can be used multiple + times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format (currently supported: ass, lrc, srt, vtt) @@ -1028,14 +1028,16 @@ ## Post-Processing Options: postprocessor is invoked. It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), - "before_dl" (before each video download), - "post_process" (after each video download; - default), "after_move" (after moving video - file to it's final locations), "after_video" - (after downloading and processing all - formats of a video), or "playlist" (at end - of playlist). This option can be used - multiple times to add different postprocessors + "video" (after --format; before + --print/--output), "before_dl" (before each + video download), "post_process" (after each + video download; default), "after_move" + (after moving video file to it's final + locations), "after_video" (after downloading + and processing all formats of a video), or + "playlist" (at end of playlist). This option + can be used multiple times to add different + postprocessors ## SponsorBlock Options: Make chapter entries for, or remove various segments (sponsor, diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 17f37a6432..5057323274 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2977,6 +2977,16 @@ def process_info(self, info_dict): # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) + + def replace_info_dict(new_info): + nonlocal info_dict + if new_info == info_dict: + return + info_dict.clear() + info_dict.update(new_info) + + new_info, _ = self.pre_process(info_dict, 'video') + replace_info_dict(new_info) self._num_downloads += 1 # info_dict['_filename'] needs to be set for backward compatibility @@ -3090,13 +3100,6 @@ def _write_link_file(link_type): for link_type, should_write in write_links.items()): return - def replace_info_dict(new_info): - nonlocal info_dict - if new_info == info_dict: - return - info_dict.clear() - info_dict.update(new_info) - new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) replace_info_dict(new_info) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index bc574b8857..096a502491 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -277,6 +277,20 @@ def _dict_from_options_callback( out_dict[key] = out_dict.get(key, []) + [val] if append else val setattr(parser.values, option.dest, out_dict) + def when_prefix(default): + return { + 'default': {}, + 'type': 'str', + 'action': 'callback', + 'callback': _dict_from_options_callback, + 'callback_kwargs': { + 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)), + 'default_key': default, + 'multiple_keys': False, + 'append': True, + }, + } + parser = _YoutubeDLOptionParser() alias_group = optparse.OptionGroup(parser, 'Aliases') Formatter = string.Formatter() @@ -1086,28 +1100,16 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not download the video but write all related files (Alias: --no-download)') verbosity.add_option( '-O', '--print', - metavar='[WHEN:]TEMPLATE', dest='forceprint', default={}, type='str', - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'video', - 'multiple_keys': False, - 'append': True, - }, help=( + metavar='[WHEN:]TEMPLATE', dest='forceprint', **when_prefix('video'), + help=( 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". ' - 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: video). ' 'Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. ' 'This option can be used multiple times')) verbosity.add_option( '--print-to-file', - metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2, - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'video', - 'multiple_keys': False, - 'append': True, - }, help=( + metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', nargs=2, **when_prefix('video'), + help=( 'Append given template to the file. The values of WHEN and TEMPLATE are same as that of --print. ' 'FILE uses the same syntax as the output template. This option can be used multiple times')) verbosity.add_option( @@ -1629,16 +1631,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Location of the ffmpeg binary; either the path to the binary or its containing directory') postproc.add_option( '--exec', - metavar='[WHEN:]CMD', dest='exec_cmd', default={}, type='str', - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'after_move', - 'multiple_keys': False, - 'append': True, - }, help=( - 'Execute a command, optionally prefixed with when to execute it (after_move if unspecified), separated by a ":". ' - 'Supported values of "WHEN" are the same as that of --use-postprocessor. ' + metavar='[WHEN:]CMD', dest='exec_cmd', **when_prefix('after_move'), + help=( + 'Execute a command, optionally prefixed with when to execute it, separated by a ":". ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). ' 'Same syntax as the output template can be used to pass any field as arguments to the command. ' 'After download, an additional field "filepath" that contains the final path of the downloaded file ' 'is also available, and if no fields are passed, %(filepath)q is appended to the end of the command. ' @@ -1714,7 +1710,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'ARGS are a semicolon ";" delimited list of NAME=VALUE. ' 'The "when" argument determines when the postprocessor is invoked. ' 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), ' - '"before_dl" (before each video download), "post_process" (after each video download; default), ' + '"video" (after --format; before --print/--output), "before_dl" (before each video download), ' + '"post_process" (after each video download; default), ' '"after_move" (after moving video file to it\'s final locations), ' '"after_video" (after downloading and processing all formats of a video), ' 'or "playlist" (at end of playlist). ' diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 3947dcf2e5..43b5fda1d2 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3395,7 +3395,7 @@ def q(qid): return q -POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') +POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') DEFAULT_OUTTMPL = { From fe74d5b592438c669f5717b34504f27c34ca9904 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:01:14 +0530 Subject: [PATCH 33/84] Let `--parse/replace-in-metadata` run at any post-processing stage Closes #5808, #456 --- README.md | 13 +++++++++---- yt_dlp/__init__.py | 14 ++++++++------ yt_dlp/options.py | 12 +++++++----- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d31fedb00e..500f92387b 100644 --- a/README.md +++ b/README.md @@ -952,13 +952,18 @@ ## Post-Processing Options: mkv/mka video files --no-embed-info-json Do not embed the infojson as an attachment to the video file - --parse-metadata FROM:TO Parse additional metadata like title/artist + --parse-metadata [WHEN:]FROM:TO + Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" - for details - --replace-in-metadata FIELDS REGEX REPLACE + for details. Supported values of "WHEN" are + the same as that of --use-postprocessor + (default: pre_process) + --replace-in-metadata [WHEN:]FIELDS REGEX REPLACE Replace text in a metadata field using the given regex. This option can be used - multiple times + multiple times. Supported values of "WHEN" + are the same as that of --use-postprocessor + (default: pre_process) --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --concat-playlist POLICY Concatenate videos in a playlist. One of diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 202f102ba9..3490816c4c 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -386,10 +386,12 @@ def metadataparser_actions(f): raise ValueError(f'{cmd} is invalid; {err}') yield action - parse_metadata = opts.parse_metadata or [] if opts.metafromtitle is not None: - parse_metadata.append('title:%s' % opts.metafromtitle) - opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata))) + opts.parse_metadata.setdefault('pre_process', []).append('title:%s' % opts.metafromtitle) + opts.parse_metadata = { + k: list(itertools.chain(*map(metadataparser_actions, v))) + for k, v in opts.parse_metadata.items() + } # Other options if opts.playlist_items is not None: @@ -561,11 +563,11 @@ def report_deprecation(val, old, new=None): def get_postprocessors(opts): yield from opts.add_postprocessors - if opts.parse_metadata: + for when, actions in opts.parse_metadata.items(): yield { 'key': 'MetadataParser', - 'actions': opts.parse_metadata, - 'when': 'pre_process' + 'actions': actions, + 'when': when } sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove if sponsorblock_query: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 096a502491..ed83cb763e 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1586,14 +1586,16 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=optparse.SUPPRESS_HELP) postproc.add_option( '--parse-metadata', - metavar='FROM:TO', dest='parse_metadata', action='append', + metavar='[WHEN:]FROM:TO', dest='parse_metadata', **when_prefix('pre_process'), help=( - 'Parse additional metadata like title/artist from other fields; ' - 'see "MODIFYING METADATA" for details')) + 'Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" for details. ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)')) postproc.add_option( '--replace-in-metadata', - dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, - help='Replace text in a metadata field using the given regex. This option can be used multiple times') + dest='parse_metadata', metavar='[WHEN:]FIELDS REGEX REPLACE', nargs=3, **when_prefix('pre_process'), + help=( + 'Replace text in a metadata field using the given regex. This option can be used multiple times. ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)')) postproc.add_option( '--xattrs', '--xattr', action='store_true', dest='xattrs', default=False, From d5f043d127cac1e8ec8a6eacde04ad1133600a16 Mon Sep 17 00:00:00 2001 From: ChillingPepper <90042155+ChillingPepper@users.noreply.github.com> Date: Fri, 30 Dec 2022 07:38:38 +0100 Subject: [PATCH 34/84] [utils] js_to_json: Fix bug in f55523c (#5771) Authored by: ChillingPepper, pukkandan --- test/test_utils.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++ yt_dlp/utils.py | 8 ++++- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 49ab3796b9..82ae77ea25 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -954,6 +954,85 @@ def test_escape_url(self): ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + def test_js_to_json_vars_strings(self): + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'null': a, + 'nullStr': b, + 'true': c, + 'trueStr': d, + 'false': e, + 'falseStr': f, + 'unresolvedVar': g, + }''', + { + 'a': 'null', + 'b': '"null"', + 'c': 'true', + 'd': '"true"', + 'e': 'false', + 'f': '"false"', + 'g': 'var', + } + )), + { + 'null': None, + 'nullStr': 'null', + 'true': True, + 'trueStr': 'true', + 'false': False, + 'falseStr': 'false', + 'unresolvedVar': 'var' + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'int': a, + 'intStr': b, + 'float': c, + 'floatStr': d, + }''', + { + 'a': '123', + 'b': '"123"', + 'c': '1.23', + 'd': '"1.23"', + } + )), + { + 'int': 123, + 'intStr': '123', + 'float': 1.23, + 'floatStr': '1.23', + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'object': a, + 'objectStr': b, + 'array': c, + 'arrayStr': d, + }''', + { + 'a': '{}', + 'b': '"{}"', + 'c': '[]', + 'd': '"[]"', + } + )), + { + 'object': {}, + 'objectStr': '{}', + 'array': [], + 'arrayStr': '[]', + } + ) + def test_js_to_json_realworld(self): inp = '''{ 'clip':{'provider':'pseudo'} diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 43b5fda1d2..64c83a77a2 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3360,7 +3360,13 @@ def fix_kv(m): return f'"{i}":' if v.endswith(':') else str(i) if v in vars: - return json.dumps(vars[v]) + try: + if not strict: + json.loads(vars[v]) + except json.decoder.JSONDecodeError: + return json.dumps(vars[v]) + else: + return vars[v] if not strict: return f'"{v}"' From f74371a97d67237e055612006602934b910b1275 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:57:33 +0530 Subject: [PATCH 35/84] [extractor/bilibili] Fix `--no-playlist` for anthology Closes #5797 --- yt_dlp/extractor/bilibili.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 37711c138a..92620f697b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -303,7 +303,8 @@ def _real_extract(self, url): getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: - title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' + part_id = part_id or 1 + title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') From ec54bd43f374cee429d67078ac61b75e66afb3fa Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 14:07:11 +0530 Subject: [PATCH 36/84] Fix bug in writing playlist info-json Closes #4889 --- yt_dlp/YoutubeDL.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 5057323274..db6bfded83 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1862,11 +1862,10 @@ def __process_playlist(self, ie_result, download): self.to_screen('[download] Downloading item %s of %s' % ( self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) - extra.update({ + entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({ 'playlist_index': playlist_index, 'playlist_autonumber': i + 1, - }) - entry_result = self.__process_iterable_entry(entry, download, extra) + }, extra)) if not entry_result: failures += 1 if failures >= max_failures: From fbb73833067ba742459729809679a62f34b3e41e Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 15:30:56 +0530 Subject: [PATCH 37/84] Add `weba` to known extensions --- test/test_utils.py | 2 ++ yt_dlp/utils.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 82ae77ea25..3d5a6ea6ba 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1953,6 +1953,8 @@ def test_get_compatible_ext(self): vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['m4a']), 'mkv') self.assertEqual(get_compatible_ext( vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['webm']), 'webm') + self.assertEqual(get_compatible_ext( + vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['weba']), 'webm') self.assertEqual(get_compatible_ext( vcodecs=['h264'], acodecs=['mp4a'], vexts=['mov'], aexts=['m4a']), 'mp4') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 64c83a77a2..ee5340cd26 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3656,7 +3656,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): COMPATIBLE_EXTS = ( {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'}, - {'webm'}, + {'webm', 'weba'}, ) for ext in preferences or vexts: current_exts = {ext, *vexts, *aexts} @@ -5962,7 +5962,7 @@ def items_(self): common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), - audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'), + audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), thumbnails=('jpg', 'png', 'webp'), storyboards=('mhtml', ), subtitles=('srt', 'vtt', 'ass', 'lrc'), @@ -6094,9 +6094,9 @@ class FormatSorter: 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')}, + 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext', + 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'), + 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')}, 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', 'field': ('vcodec', 'acodec'), From 9bb856998b0d5a0ad58268f0ba8d784fb9d934e3 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 15:32:33 +0530 Subject: [PATCH 38/84] [extractor/youtube] Extract DRC formats --- yt_dlp/extractor/youtube.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 9dde34fb01..506bd1e19a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2544,6 +2544,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': [], }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, + }, { + 'note': 'Audio formats with Dynamic Range Compression', + 'url': 'https://www.youtube.com/watch?v=Tq92D6wQ1mg', + 'info_dict': { + 'id': 'Tq92D6wQ1mg', + 'ext': 'weba', + 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', + 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_follower_count': int, + 'description': 'md5:17eccca93a786d51bc67646756894066', + 'upload_date': '20191228', + 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'], + 'playable_in_embed': True, + 'like_count': int, + 'categories': ['Entertainment'], + 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', + 'age_limit': 18, + 'channel': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'view_count': int, + 'availability': 'needs_auth', + 'comment_count': int, + 'live_status': 'not_live', + 'uploader': 'Projekt Melody', + 'duration': 106, + }, + 'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'}, } ] @@ -3553,7 +3582,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l itag = str_or_none(fmt.get('itag')) audio_track = fmt.get('audioTrack') or {} - stream_id = '%s.%s' % (itag or '', audio_track.get('id', '')) + stream_id = (itag, audio_track.get('id'), fmt.get('isDrc')) if stream_id in stream_ids: continue @@ -3634,11 +3663,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), - 'format_id': itag, + 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + 'DRC' if fmt.get('isDrc') else None, try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), @@ -3647,7 +3677,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l 'fps': int_or_none(fmt.get('fps')) or None, 'audio_channels': fmt.get('audioChannels'), 'height': height, - 'quality': q(quality), + 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, 'url': fmt_url, From 8d1ddb0805c7c56bd03a5c0837c55602473d213f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 31 Dec 2022 09:45:12 +0530 Subject: [PATCH 39/84] [extractor/udemy] Fix lectures that have no URL and detect DRM Closes #5662 --- yt_dlp/extractor/udemy.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py index 8b99c59cf5..329e5da2d9 100644 --- a/yt_dlp/extractor/udemy.py +++ b/yt_dlp/extractor/udemy.py @@ -11,8 +11,10 @@ int_or_none, js_to_json, sanitized_Request, + smuggle_url, try_get, unescapeHTML, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -106,7 +108,7 @@ def _download_lecture(self, course_id, lecture_id): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data,course_is_drmed', }) def _handle_error(self, response): @@ -199,16 +201,19 @@ def is_logged(webpage): def _real_extract(self, url): lecture_id = self._match_id(url) + course_id = unsmuggle_url(url, {})[1].get('course_id') - webpage = self._download_webpage(url, lecture_id) - - course_id, _ = self._extract_course_info(webpage, lecture_id) + webpage = None + if not course_id: + webpage = self._download_webpage(url, lecture_id) + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + webpage = webpage or self._download_webpage(url, lecture_id) self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: @@ -391,6 +396,9 @@ def extract_subtitles(track_list): if f.get('url'): formats.append(f) + if not formats and asset.get('course_is_drmed'): + self.report_drm(video_id) + return { 'id': video_id, 'title': title, @@ -449,7 +457,9 @@ def _real_extract(self, url): if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'url': smuggle_url( + f'https://www.udemy.com/{course_path}/learn/v4/t/lecture/{entry["id"]}', + {'course_id': course_id}), 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } From a0e526ed4d042c88771cd5669ceb4413d2b8c47f Mon Sep 17 00:00:00 2001 From: Stel Abrego Date: Fri, 30 Dec 2022 20:58:33 -0800 Subject: [PATCH 40/84] [extractor/bandcamp] Add `album_artist` (#5537) Closes #5536 Authored by: stelcodes --- yt_dlp/extractor/bandcamp.py | 48 +++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index de81e0de7b..e89b3a69b3 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -29,11 +29,18 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", + 'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭', 'duration': 9.8485, - 'uploader': 'youtube-dl "\'/\\ä↭', + 'uploader': 'youtube-dl "\'/\\ä↭', 'upload_date': '20121129', 'timestamp': 1354224127, + 'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭', + 'album_artist': 'youtube-dl "\'/\\ä↭', + 'track_id': '1812978515', + 'artist': 'youtube-dl "\'/\\ä↭', + 'uploader_url': 'https://youtube-dl.bandcamp.com', + 'uploader_id': 'youtube-dl', + 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { @@ -41,7 +48,8 @@ class BandcampIE(InfoExtractor): 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'info_dict': { 'id': '2650410135', - 'ext': 'aiff', + 'ext': 'm4a', + 'acodec': r're:[fa]lac', 'title': 'Ben Prunty - Lanius (Battle)', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ben Prunty', @@ -54,7 +62,10 @@ class BandcampIE(InfoExtractor): 'track_number': 1, 'track_id': '2650410135', 'artist': 'Ben Prunty', + 'album_artist': 'Ben Prunty', 'album': 'FTL: Advanced Edition Soundtrack', + 'uploader_url': 'https://benprunty.bandcamp.com', + 'uploader_id': 'benprunty', }, }, { # no free download, mp3 128 @@ -75,7 +86,34 @@ class BandcampIE(InfoExtractor): 'track_number': 5, 'track_id': '2584466013', 'artist': 'Mastodon', + 'album_artist': 'Mastodon', 'album': 'Call of the Mastodon', + 'uploader_url': 'https://relapsealumni.bandcamp.com', + 'uploader_id': 'relapsealumni', + }, + }, { + # track from compilation album (artist/album_artist difference) + 'url': 'https://diskotopia.bandcamp.com/track/safehouse', + 'md5': '19c5337bca1428afa54129f86a2f6a69', + 'info_dict': { + 'id': '1978174799', + 'ext': 'mp3', + 'title': 'submerse - submerse - Safehouse', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'submerse', + 'timestamp': 1480779297, + 'upload_date': '20161203', + 'release_timestamp': 1481068800, + 'release_date': '20161207', + 'duration': 154.066, + 'track': 'submerse - Safehouse', + 'track_number': 3, + 'track_id': '1978174799', + 'artist': 'submerse', + 'album_artist': 'Diskotopia', + 'album': 'DSK F/W 2016-2017 Free Compilation', + 'uploader_url': 'https://diskotopia.bandcamp.com', + 'uploader_id': 'diskotopia', }, }] @@ -121,6 +159,9 @@ def _real_extract(self, url): embed = self._extract_data_attr(webpage, title, 'embed', False) current = tralbum.get('current') or {} artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') + album_artist = self._html_search_regex( + r'

[\S\s]*?by\s*\s*\s*([^>]+?)\s*', + webpage, 'album artist', fatal=False) timestamp = unified_timestamp( current.get('publish_date') or tralbum.get('album_publish_date')) @@ -205,6 +246,7 @@ def _real_extract(self, url): 'track_id': track_id, 'artist': artist, 'album': embed.get('album_title'), + 'album_artist': album_artist, 'formats': formats, } From 2fb0f858686c46abc50a0e253245afe750746775 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 31 Dec 2022 11:02:24 +0530 Subject: [PATCH 41/84] [update] Workaround #5632 --- yt_dlp/update.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index ac3e28057d..a3a731aef5 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -15,7 +15,6 @@ Popen, cached_method, deprecation_warning, - remove_end, shell_quote, system_identifier, traverse_obj, @@ -43,7 +42,8 @@ def _get_variant_and_executable_path(): # Ref: https://en.wikipedia.org/wiki/Uname#Examples if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): machine = '_x86' if platform.architecture()[0][:2] == '32' else '' - return f'{remove_end(sys.platform, "32")}{machine}_exe', path + # NB: https://github.com/yt-dlp/yt-dlp/issues/5632 + return f'{sys.platform}{machine}_exe', path path = os.path.dirname(__file__) if isinstance(__loader__, zipimporter): @@ -74,8 +74,8 @@ def current_git_head(): _FILE_SUFFIXES = { 'zip': '', 'py2exe': '_min.exe', - 'win_exe': '.exe', - 'win_x86_exe': '_x86.exe', + 'win32_exe': '.exe', + 'win32_x86_exe': '_x86.exe', 'darwin_exe': '_macos', 'darwin_legacy_exe': '_macos_legacy', 'linux_exe': '_linux', From 8e40b9d1ec132ae1bcac50b3ee520ece46ac9c55 Mon Sep 17 00:00:00 2001 From: Matthew Date: Sun, 1 Jan 2023 04:29:22 +0000 Subject: [PATCH 42/84] Improve plugin architecture (#5553) to make plugins easier to develop and use: * Plugins are now loaded as namespace packages. * Plugins can be loaded in any distribution of yt-dlp (binary, pip, source, etc.). * Plugin packages can be installed and managed via pip, or dropped into any of the documented locations. * Users do not need to edit any code files to install plugins. * Backwards-compatible with previous plugin architecture. As a side-effect, yt-dlp will now search in a few more locations for config files. Closes https://github.com/yt-dlp/yt-dlp/issues/1389 Authored by: flashdagger, coletdjnz, pukkandan, Grub4K Co-authored-by: Marcel Co-authored-by: pukkandan Co-authored-by: Simon Sawicki --- .gitignore | 8 +- README.md | 66 ++++++- devscripts/make_lazy_extractors.py | 4 + test/test_plugins.py | 73 ++++++++ .../yt_dlp_plugins/extractor/_ignore.py | 5 + .../yt_dlp_plugins/extractor/ignore.py | 12 ++ .../yt_dlp_plugins/extractor/normal.py | 9 + .../yt_dlp_plugins/postprocessor/normal.py | 5 + .../yt_dlp_plugins/extractor/zipped.py | 5 + .../yt_dlp_plugins/postprocessor/zipped.py | 5 + yt_dlp/YoutubeDL.py | 15 +- yt_dlp/extractor/extractors.py | 4 +- yt_dlp/options.py | 91 +++++----- yt_dlp/plugins.py | 171 ++++++++++++++++++ yt_dlp/postprocessor/__init__.py | 5 +- yt_dlp/utils.py | 55 ++++-- ytdlp_plugins/extractor/__init__.py | 4 - ytdlp_plugins/extractor/sample.py | 14 -- ytdlp_plugins/postprocessor/__init__.py | 4 - ytdlp_plugins/postprocessor/sample.py | 26 --- 20 files changed, 455 insertions(+), 126 deletions(-) create mode 100644 test/test_plugins.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/_ignore.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/ignore.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/normal.py create mode 100644 test/testdata/yt_dlp_plugins/postprocessor/normal.py create mode 100644 test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py create mode 100644 test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py create mode 100644 yt_dlp/plugins.py delete mode 100644 ytdlp_plugins/extractor/__init__.py delete mode 100644 ytdlp_plugins/extractor/sample.py delete mode 100644 ytdlp_plugins/postprocessor/__init__.py delete mode 100644 ytdlp_plugins/postprocessor/sample.py diff --git a/.gitignore b/.gitignore index 00d74057fa..ef4d116167 100644 --- a/.gitignore +++ b/.gitignore @@ -120,9 +120,5 @@ yt-dlp.zip */extractor/lazy_extractors.py # Plugins -ytdlp_plugins/extractor/* -!ytdlp_plugins/extractor/__init__.py -!ytdlp_plugins/extractor/sample.py -ytdlp_plugins/postprocessor/* -!ytdlp_plugins/postprocessor/__init__.py -!ytdlp_plugins/postprocessor/sample.py +ytdlp_plugins/* +yt-dlp-plugins/* diff --git a/README.md b/README.md index 500f92387b..4294090dc5 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ * [Modifying metadata examples](#modifying-metadata-examples) * [EXTRACTOR ARGUMENTS](#extractor-arguments) * [PLUGINS](#plugins) + * [Installing Plugins](#installing-plugins) + * [Developing Plugins](#developing-plugins) * [EMBEDDING YT-DLP](#embedding-yt-dlp) * [Embedding examples](#embedding-examples) * [DEPRECATED OPTIONS](#deprecated-options) @@ -1110,15 +1112,20 @@ # CONFIGURATION * If `-P` is not given, the current directory is searched 1. **User Configuration**: * `${XDG_CONFIG_HOME}/yt-dlp/config` (recommended on Linux/macOS) + * `${XDG_CONFIG_HOME}/yt-dlp/config.txt` * `${XDG_CONFIG_HOME}/yt-dlp.conf` * `${APPDATA}/yt-dlp/config` (recommended on Windows) * `${APPDATA}/yt-dlp/config.txt` * `~/yt-dlp.conf` * `~/yt-dlp.conf.txt` + * `~/.yt-dlp/config` + * `~/.yt-dlp/config.txt` See also: [Notes about environment variables](#notes-about-environment-variables) 1. **System Configuration**: * `/etc/yt-dlp.conf` + * `/etc/yt-dlp/config` + * `/etc/yt-dlp/config.txt` E.g. with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: ``` @@ -1789,19 +1796,68 @@ #### twitter # PLUGINS -Plugins are loaded from `/ytdlp_plugins//__init__.py`; where `` is the directory of the binary (`/yt-dlp`), or the root directory of the module if you are running directly from source-code (`/yt_dlp/__main__.py`). Plugins are currently not supported for the `pip` version +Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. **Use plugins at your own risk and only if you trust the code!** -Plugins can be of ``s `extractor` or `postprocessor`. Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. Postprocessor plugins can be invoked using `--use-postprocessor NAME`. +Plugins can be of ``s `extractor` or `postprocessor`. +- Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. +- Extractor plugins take priority over builtin extractors. +- Postprocessor plugins can be invoked using `--use-postprocessor NAME`. -See [ytdlp_plugins](ytdlp_plugins) for example plugins. -Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. Use plugins at your own risk and only if you trust the code +Plugins are loaded from the namespace packages `yt_dlp_plugins.extractor` and `yt_dlp_plugins.postprocessor`. -If you are a plugin author, add [ytdlp-plugins](https://github.com/topics/ytdlp-plugins) as a topic to your repository for discoverability +In other words, the file structure on the disk looks something like: + + yt_dlp_plugins/ + extractor/ + myplugin.py + postprocessor/ + myplugin.py + +yt-dlp looks for these `yt_dlp_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them. See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) +## Installing Plugins +Plugins can be installed using various methods and locations. + +1. **Configuration directories**: + Plugin packages (containing a `yt_dlp_plugins` namespace folder) can be dropped into the following standard [configuration locations](#configuration): + * **User Plugins** + * `${XDG_CONFIG_HOME}/yt-dlp/plugins//yt_dlp_plugins/` (recommended on Linux/macOS) + * `${XDG_CONFIG_HOME}/yt-dlp-plugins//yt_dlp_plugins/` + * `${APPDATA}/yt-dlp/plugins//yt_dlp_plugins/` (recommended on Windows) + * `~/.yt-dlp/plugins//yt_dlp_plugins/` + * `~/yt-dlp-plugins//yt_dlp_plugins/` + * **System Plugins** + * `/etc/yt-dlp/plugins//yt_dlp_plugins/` + * `/etc/yt-dlp-plugins//yt_dlp_plugins/` +2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location: + * Binary: where `/yt-dlp.exe`, `/yt-dlp-plugins//yt_dlp_plugins/` + * Source: where `/yt_dlp/__main__.py`, `/yt-dlp-plugins//yt_dlp_plugins/` + +3. **pip and other locations in `PYTHONPATH`** + * Plugin packages can be installed and managed using `pip`. See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. + * Note: plugin files between plugin packages installed with pip must have unique filenames + * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. + * Note: This does not apply for Pyinstaller/py2exe builds. + + +.zip, .egg and .whl archives containing a `yt_dlp_plugins` namespace folder in their root are also supported. These can be placed in the same locations `yt_dlp_plugins` namespace folders can be found. +- e.g. `${XDG_CONFIG_HOME}/yt-dlp/plugins/mypluginpkg.zip` where `mypluginpkg.zip` contains `yt_dlp_plugins//myplugin.py` + +Run yt-dlp with `--verbose`/`-v` to check if the plugin has been loaded. + +## Developing Plugins + +See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. + +All public classes with a name ending in `IE` are imported from each file. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) + +If you are a plugin author, add [yt-dlp-plugins](https://github.com/topics/yt-dlp-plugins) as a topic to your repository for discoverability + +See the [Developer Instructions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) on how to write and test an extractor. # EMBEDDING YT-DLP diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index c502bdf896..d74ea202f0 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -40,8 +40,12 @@ def main(): _ALL_CLASSES = get_all_ies() # Must be before import + import yt_dlp.plugins from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + # Filter out plugins + _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{yt_dlp.plugins.PACKAGE_NAME}.')] + DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( MODULE_TEMPLATE, diff --git a/test/test_plugins.py b/test/test_plugins.py new file mode 100644 index 0000000000..6cde579e1e --- /dev/null +++ b/test/test_plugins.py @@ -0,0 +1,73 @@ +import importlib +import os +import shutil +import sys +import unittest +from pathlib import Path + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +TEST_DATA_DIR = Path(os.path.dirname(os.path.abspath(__file__)), 'testdata') +sys.path.append(str(TEST_DATA_DIR)) +importlib.invalidate_caches() + +from yt_dlp.plugins import PACKAGE_NAME, directories, load_plugins + + +class TestPlugins(unittest.TestCase): + + TEST_PLUGIN_DIR = TEST_DATA_DIR / PACKAGE_NAME + + def test_directories_containing_plugins(self): + self.assertIn(self.TEST_PLUGIN_DIR, map(Path, directories())) + + def test_extractor_classes(self): + for module_name in tuple(sys.modules): + if module_name.startswith(f'{PACKAGE_NAME}.extractor'): + del sys.modules[module_name] + plugins_ie = load_plugins('extractor', 'IE') + + self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertIn('NormalPluginIE', plugins_ie.keys()) + + # don't load modules with underscore prefix + self.assertFalse( + f'{PACKAGE_NAME}.extractor._ignore' in sys.modules.keys(), + 'loaded module beginning with underscore') + self.assertNotIn('IgnorePluginIE', plugins_ie.keys()) + + # Don't load extractors with underscore prefix + self.assertNotIn('_IgnoreUnderscorePluginIE', plugins_ie.keys()) + + # Don't load extractors not specified in __all__ (if supplied) + self.assertNotIn('IgnoreNotInAllPluginIE', plugins_ie.keys()) + self.assertIn('InAllPluginIE', plugins_ie.keys()) + + def test_postprocessor_classes(self): + plugins_pp = load_plugins('postprocessor', 'PP') + self.assertIn('NormalPluginPP', plugins_pp.keys()) + + def test_importing_zipped_module(self): + zip_path = TEST_DATA_DIR / 'zipped_plugins.zip' + shutil.make_archive(str(zip_path)[:-4], 'zip', str(zip_path)[:-4]) + sys.path.append(str(zip_path)) # add zip to search paths + importlib.invalidate_caches() # reset the import caches + + try: + for plugin_type in ('extractor', 'postprocessor'): + package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') + self.assertIn(zip_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) + + plugins_ie = load_plugins('extractor', 'IE') + self.assertIn('ZippedPluginIE', plugins_ie.keys()) + + plugins_pp = load_plugins('postprocessor', 'PP') + self.assertIn('ZippedPluginPP', plugins_pp.keys()) + + finally: + sys.path.remove(str(zip_path)) + os.remove(zip_path) + importlib.invalidate_caches() # reset the import caches + + +if __name__ == '__main__': + unittest.main() diff --git a/test/testdata/yt_dlp_plugins/extractor/_ignore.py b/test/testdata/yt_dlp_plugins/extractor/_ignore.py new file mode 100644 index 0000000000..57faf75bbc --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/_ignore.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class IgnorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/yt_dlp_plugins/extractor/ignore.py b/test/testdata/yt_dlp_plugins/extractor/ignore.py new file mode 100644 index 0000000000..816a16aa20 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/ignore.py @@ -0,0 +1,12 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class IgnoreNotInAllPluginIE(InfoExtractor): + pass + + +class InAllPluginIE(InfoExtractor): + pass + + +__all__ = ['InAllPluginIE'] diff --git a/test/testdata/yt_dlp_plugins/extractor/normal.py b/test/testdata/yt_dlp_plugins/extractor/normal.py new file mode 100644 index 0000000000..b09009bdc6 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/normal.py @@ -0,0 +1,9 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class NormalPluginIE(InfoExtractor): + pass + + +class _IgnoreUnderscorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/yt_dlp_plugins/postprocessor/normal.py new file mode 100644 index 0000000000..315b85a488 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/postprocessor/normal.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class NormalPluginPP(PostProcessor): + pass diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py new file mode 100644 index 0000000000..01542e0d8d --- /dev/null +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class ZippedPluginIE(InfoExtractor): + pass diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py new file mode 100644 index 0000000000..223822bd6f --- /dev/null +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class ZippedPluginPP(PostProcessor): + pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index db6bfded83..9ef56a46b6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -32,6 +32,7 @@ from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import format_text +from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors from .postprocessor import ( EmbedThumbnailPP, @@ -3773,10 +3774,6 @@ def get_encoding(stream): write_debug('Lazy loading extractors is forcibly disabled') else: write_debug('Lazy loading extractors is disabled') - if plugin_extractors or plugin_postprocessors: - write_debug('Plugins: %s' % [ - '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params['compat_opts']: write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) @@ -3810,6 +3807,16 @@ def get_encoding(stream): proxy_map.update(handler.proxies) write_debug(f'Proxy map: {proxy_map}') + for plugin_type, plugins in {'Extractor': plugin_extractors, 'Post-Processor': plugin_postprocessors}.items(): + if not plugins: + continue + write_debug(f'{plugin_type} Plugins: %s' % (', '.join(sorted(('%s%s' % ( + klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in plugins.items()))))) + plugin_dirs = plugin_directories() + if plugin_dirs: + write_debug(f'Plugin directories: {plugin_dirs}') + # Not implemented if False and self.params.get('call_home'): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 610e02f906..beda02917e 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1,10 +1,10 @@ import contextlib import os -from ..utils import load_plugins +from ..plugins import load_plugins # NB: Must be before other imports so that plugins can be correctly injected -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) +_PLUGIN_CLASSES = load_plugins('extractor', 'IE') _LAZY_LOADER = False if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): diff --git a/yt_dlp/options.py b/yt_dlp/options.py index ed83cb763e..be4695cbb5 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -29,6 +29,8 @@ expand_path, format_field, get_executable_path, + get_system_config_dirs, + get_user_config_dirs, join_nonempty, orderedSet_from_options, remove_end, @@ -42,62 +44,67 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): if ignore_config_files == 'if_override': ignore_config_files = overrideArguments is not None - def _readUserConf(package_name, default=[]): - # .config + def _load_from_config_dirs(config_dirs): + for config_dir in config_dirs: + conf_file_path = os.path.join(config_dir, 'config') + conf = Config.read_file(conf_file_path, default=None) + if conf is None: + conf_file_path += '.txt' + conf = Config.read_file(conf_file_path, default=None) + if conf is not None: + return conf, conf_file_path + return None, None + + def _read_user_conf(package_name, default=None): + # .config/package_name.conf xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') - userConfFile = os.path.join(xdg_config_home, package_name, 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name) - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + user_conf_file = os.path.join(xdg_config_home, '%s.conf' % package_name) + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is not None: + return user_conf, user_conf_file - # appdata - appdata_dir = os.getenv('appdata') - if appdata_dir: - userConfFile = os.path.join(appdata_dir, package_name, 'config') - userConf = Config.read_file(userConfFile, default=None) - if userConf is None: - userConfFile += '.txt' - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + # home (~/package_name.conf or ~/package_name.conf.txt) + user_conf_file = os.path.join(compat_expanduser('~'), '%s.conf' % package_name) + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is None: + user_conf_file += '.txt' + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is not None: + return user_conf, user_conf_file - # home - userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name) - userConf = Config.read_file(userConfFile, default=None) - if userConf is None: - userConfFile += '.txt' - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + # Package config directories (e.g. ~/.config/package_name/package_name.txt) + user_conf, user_conf_file = _load_from_config_dirs(get_user_config_dirs(package_name)) + if user_conf is not None: + return user_conf, user_conf_file + return default if default is not None else [], None - return default, None + def _read_system_conf(package_name, default=None): + system_conf, system_conf_file = _load_from_config_dirs(get_system_config_dirs(package_name)) + if system_conf is not None: + return system_conf, system_conf_file + return default if default is not None else [], None - def add_config(label, path, user=False): + def add_config(label, path=None, func=None): """ Adds config and returns whether to continue """ if root.parse_known_args()[0].ignoreconfig: return False - # Multiple package names can be given here - # E.g. ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for - # the configuration file of any of these three packages - for package in ('yt-dlp',): - if user: - args, current_path = _readUserConf(package, default=None) - else: - current_path = os.path.join(path, '%s.conf' % package) - args = Config.read_file(current_path, default=None) - if args is not None: - root.append_config(args, current_path, label=label) - return True + elif func: + assert path is None + args, current_path = func('yt-dlp') + else: + current_path = os.path.join(path, 'yt-dlp.conf') + args = Config.read_file(current_path, default=None) + if args is not None: + root.append_config(args, current_path, label=label) + return True return True def load_configs(): yield not ignore_config_files yield add_config('Portable', get_executable_path()) yield add_config('Home', expand_path(root.parse_known_args()[0].paths.get('home', '')).strip()) - yield add_config('User', None, user=True) - yield add_config('System', '/etc') + yield add_config('User', func=_read_user_conf) + yield add_config('System', func=_read_system_conf) opts = optparse.Values({'verbose': True, 'print_help': False}) try: diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py new file mode 100644 index 0000000000..7d2226d0f1 --- /dev/null +++ b/yt_dlp/plugins.py @@ -0,0 +1,171 @@ +import contextlib +import importlib +import importlib.abc +import importlib.machinery +import importlib.util +import inspect +import itertools +import os +import pkgutil +import sys +import traceback +import zipimport +from pathlib import Path +from zipfile import ZipFile + +from .compat import functools # isort: split +from .compat import compat_expanduser +from .utils import ( + get_executable_path, + get_system_config_dirs, + get_user_config_dirs, + write_string, +) + +PACKAGE_NAME = 'yt_dlp_plugins' +COMPAT_PACKAGE_NAME = 'ytdlp_plugins' + + +class PluginLoader(importlib.abc.Loader): + """Dummy loader for virtual namespace packages""" + + def exec_module(self, module): + return None + + +@functools.cache +def dirs_in_zip(archive): + with ZipFile(archive) as zip: + return set(itertools.chain.from_iterable( + Path(file).parents for file in zip.namelist())) + + +class PluginFinder(importlib.abc.MetaPathFinder): + """ + This class provides one or multiple namespace packages. + It searches in sys.path and yt-dlp config folders for + the existing subdirectories from which the modules can be imported + """ + + def __init__(self, *packages): + self._zip_content_cache = {} + self.packages = set(itertools.chain.from_iterable( + itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) + for name in packages)) + + def search_locations(self, fullname): + candidate_locations = [] + + def _get_package_paths(*root_paths, containing_folder='plugins'): + for config_dir in map(Path, root_paths): + plugin_dir = config_dir / containing_folder + if not plugin_dir.is_dir(): + continue + yield from plugin_dir.iterdir() + + # Load from yt-dlp config folders + candidate_locations.extend(_get_package_paths( + *get_user_config_dirs('yt-dlp'), *get_system_config_dirs('yt-dlp'), + containing_folder='plugins')) + + # Load from yt-dlp-plugins folders + candidate_locations.extend(_get_package_paths( + get_executable_path(), + compat_expanduser('~'), + '/etc', + os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), + containing_folder='yt-dlp-plugins')) + + candidate_locations.extend(map(Path, sys.path)) # PYTHONPATH + + parts = Path(*fullname.split('.')) + locations = set() + for path in dict.fromkeys(candidate_locations): + candidate = path / parts + if candidate.is_dir(): + locations.add(str(candidate)) + elif path.name and any(path.with_suffix(suffix).is_file() for suffix in {'.zip', '.egg', '.whl'}): + with contextlib.suppress(FileNotFoundError): + if parts in dirs_in_zip(path): + locations.add(str(candidate)) + return locations + + def find_spec(self, fullname, path=None, target=None): + if fullname not in self.packages: + return None + + search_locations = self.search_locations(fullname) + if not search_locations: + return None + + spec = importlib.machinery.ModuleSpec(fullname, PluginLoader(), is_package=True) + spec.submodule_search_locations = search_locations + return spec + + def invalidate_caches(self): + dirs_in_zip.cache_clear() + for package in self.packages: + if package in sys.modules: + del sys.modules[package] + + +def directories(): + spec = importlib.util.find_spec(PACKAGE_NAME) + return spec.submodule_search_locations if spec else [] + + +def iter_modules(subpackage): + fullname = f'{PACKAGE_NAME}.{subpackage}' + with contextlib.suppress(ModuleNotFoundError): + pkg = importlib.import_module(fullname) + yield from pkgutil.iter_modules(path=pkg.__path__, prefix=f'{fullname}.') + + +def load_module(module, module_name, suffix): + return inspect.getmembers(module, lambda obj: ( + inspect.isclass(obj) + and obj.__name__.endswith(suffix) + and obj.__module__.startswith(module_name) + and not obj.__name__.startswith('_') + and obj.__name__ in getattr(module, '__all__', [obj.__name__]))) + + +def load_plugins(name, suffix): + classes = {} + + for finder, module_name, _ in iter_modules(name): + if any(x.startswith('_') for x in module_name.split('.')): + continue + try: + if sys.version_info < (3, 10) and isinstance(finder, zipimport.zipimporter): + # zipimporter.load_module() is deprecated in 3.10 and removed in 3.12 + # The exec_module branch below is the replacement for >= 3.10 + # See: https://docs.python.org/3/library/zipimport.html#zipimport.zipimporter.exec_module + module = finder.load_module(module_name) + else: + spec = finder.find_spec(module_name) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + except Exception: + write_string(f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}') + continue + classes.update(load_module(module, module_name, suffix)) + + # Compat: old plugin system using __init__.py + # Note: plugins imported this way do not show up in directories() + # nor are considered part of the yt_dlp_plugins namespace package + with contextlib.suppress(FileNotFoundError): + spec = importlib.util.spec_from_file_location( + name, Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py')) + plugins = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = plugins + spec.loader.exec_module(plugins) + classes.update(load_module(plugins, spec.name, suffix)) + + return classes + + +sys.meta_path.insert(0, PluginFinder(f'{PACKAGE_NAME}.extractor', f'{PACKAGE_NAME}.postprocessor')) + +__all__ = ['directories', 'load_plugins', 'PACKAGE_NAME', 'COMPAT_PACKAGE_NAME'] diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index f168be46ad..bfe9df733b 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -33,14 +33,15 @@ from .sponskrub import SponSkrubPP from .sponsorblock import SponsorBlockPP from .xattrpp import XAttrMetadataPP -from ..utils import load_plugins +from ..plugins import load_plugins -_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP', globals()) +_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP') def get_postprocessor(key): return globals()[key + 'PP'] +globals().update(_PLUGIN_CLASSES) __all__ = [name for name in globals().keys() if name.endswith('PP')] __all__.extend(('PostProcessor', 'FFmpegPostProcessor')) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index ee5340cd26..32da598d0f 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -18,7 +18,6 @@ import html.parser import http.client import http.cookiejar -import importlib.util import inspect import io import itertools @@ -5372,22 +5371,37 @@ def get_executable_path(): return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) -def load_plugins(name, suffix, namespace): - classes = {} - with contextlib.suppress(FileNotFoundError): - plugins_spec = importlib.util.spec_from_file_location( - name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')) - plugins = importlib.util.module_from_spec(plugins_spec) - sys.modules[plugins_spec.name] = plugins - plugins_spec.loader.exec_module(plugins) - for name in dir(plugins): - if name in namespace: - continue - if not name.endswith(suffix): - continue - klass = getattr(plugins, name) - classes[name] = namespace[name] = klass - return classes +def get_user_config_dirs(package_name): + locations = set() + + # .config (e.g. ~/.config/package_name) + xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + config_dir = os.path.join(xdg_config_home, package_name) + if os.path.isdir(config_dir): + locations.add(config_dir) + + # appdata (%APPDATA%/package_name) + appdata_dir = os.getenv('appdata') + if appdata_dir: + config_dir = os.path.join(appdata_dir, package_name) + if os.path.isdir(config_dir): + locations.add(config_dir) + + # home (~/.package_name) + user_config_directory = os.path.join(compat_expanduser('~'), '.%s' % package_name) + if os.path.isdir(user_config_directory): + locations.add(user_config_directory) + + return locations + + +def get_system_config_dirs(package_name): + locations = set() + # /etc/package_name + system_config_directory = os.path.join('/etc', package_name) + if os.path.isdir(system_config_directory): + locations.add(system_config_directory) + return locations def traverse_obj( @@ -6367,3 +6381,10 @@ def calculate_preference(self, format): # Deprecated has_certifi = bool(certifi) has_websockets = bool(websockets) + + +def load_plugins(name, suffix, namespace): + from .plugins import load_plugins + ret = load_plugins(name, suffix) + namespace.update(ret) + return ret diff --git a/ytdlp_plugins/extractor/__init__.py b/ytdlp_plugins/extractor/__init__.py deleted file mode 100644 index 3045a590bd..0000000000 --- a/ytdlp_plugins/extractor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa: F401 - -# ℹ️ The imported name must end in "IE" -from .sample import SamplePluginIE diff --git a/ytdlp_plugins/extractor/sample.py b/ytdlp_plugins/extractor/sample.py deleted file mode 100644 index a8bc455eb3..0000000000 --- a/ytdlp_plugins/extractor/sample.py +++ /dev/null @@ -1,14 +0,0 @@ -# ⚠ Don't use relative imports -from yt_dlp.extractor.common import InfoExtractor - - -# ℹ️ Instructions on making extractors can be found at: -# 🔗 https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#adding-support-for-a-new-site - -class SamplePluginIE(InfoExtractor): - _WORKING = False - IE_DESC = False - _VALID_URL = r'^sampleplugin:' - - def _real_extract(self, url): - self.to_screen('URL "%s" successfully captured' % url) diff --git a/ytdlp_plugins/postprocessor/__init__.py b/ytdlp_plugins/postprocessor/__init__.py deleted file mode 100644 index 61099abbc6..0000000000 --- a/ytdlp_plugins/postprocessor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa: F401 - -# ℹ️ The imported name must end in "PP" and is the name to be used in --use-postprocessor -from .sample import SamplePluginPP diff --git a/ytdlp_plugins/postprocessor/sample.py b/ytdlp_plugins/postprocessor/sample.py deleted file mode 100644 index 4563e1c116..0000000000 --- a/ytdlp_plugins/postprocessor/sample.py +++ /dev/null @@ -1,26 +0,0 @@ -# ⚠ Don't use relative imports -from yt_dlp.postprocessor.common import PostProcessor - - -# ℹ️ See the docstring of yt_dlp.postprocessor.common.PostProcessor -class SamplePluginPP(PostProcessor): - def __init__(self, downloader=None, **kwargs): - # ⚠ Only kwargs can be passed from the CLI, and all argument values will be string - # Also, "downloader", "when" and "key" are reserved names - super().__init__(downloader) - self._kwargs = kwargs - - # ℹ️ See docstring of yt_dlp.postprocessor.common.PostProcessor.run - def run(self, info): - if info.get('_type', 'video') != 'video': # PP was called for playlist - self.to_screen(f'Post-processing playlist {info.get("id")!r} with {self._kwargs}') - elif info.get('filepath'): # PP was called after download (default) - filepath = info.get('filepath') - self.to_screen(f'Post-processed {filepath!r} with {self._kwargs}') - elif info.get('requested_downloads'): # PP was called after_video - filepaths = [f.get('filepath') for f in info.get('requested_downloads')] - self.to_screen(f'Post-processed {filepaths!r} with {self._kwargs}') - else: # PP was called before actual download - filepath = info.get('_filename') - self.to_screen(f'Pre-processed {filepath!r} with {self._kwargs}') - return [], info # return list_of_files_to_delete, info_dict From 3e01ce744a981d8f19ae77ec695005e7000f4703 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 1 Jan 2023 18:40:26 +1300 Subject: [PATCH 43/84] [extractor/generic] Use `Accept-Encoding: identity` for initial request The existing comment seems to imply this was the desired behavior from the beginning. Partial fix for https://github.com/yt-dlp/yt-dlp/issues/5855, https://github.com/yt-dlp/yt-dlp/issues/5851, https://github.com/yt-dlp/yt-dlp/issues/4748 --- yt_dlp/extractor/generic.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 2281c71f3d..ffc2790230 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2154,6 +2154,21 @@ class GenericIE(InfoExtractor): 'age_limit': 0, 'direct': True, } + }, { + 'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.', + 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'info_dict': { + 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'ext': 'mp4', + 'title': 'čauky lidi 70 finall', + 'description': 'čauky lidi 70 finall', + 'thumbnail': 'h', + 'upload_date': '20220606', + 'timestamp': 1654513791, + 'duration': 318.0, + 'direct': True, + 'age_limit': 0, + } } ] @@ -2312,7 +2327,7 @@ def _real_extract(self, url): # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. full_response = self._request_webpage(url, video_id, headers={ - 'Accept-Encoding': '*', + 'Accept-Encoding': 'identity', **smuggled_data.get('http_headers', {}) }) new_url = full_response.geturl() From 1cdda3299810b86206853a22e680758eadcc4e05 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:11:14 +0530 Subject: [PATCH 44/84] [utils] `get_exe_version`: Detect broken executables Authored by: dirkf, pukkandan Closes #5561 --- yt_dlp/utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 32da598d0f..5af176b364 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2720,8 +2720,10 @@ def _get_exe_version_output(exe, args): # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if yt-dlp is run in the background. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if ret: + return None except OSError: return False return stdout @@ -2739,11 +2741,15 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized='present'): + version_re=None, unrecognized=('present', 'broken')): """ Returns the version of the specified executable, or False if the executable is not present """ + unrecognized = variadic(unrecognized) + assert len(unrecognized) in (1, 2) out = _get_exe_version_output(exe, args) - return detect_exe_version(out, version_re, unrecognized) if out else False + if out is None: + return unrecognized[-1] + return out and detect_exe_version(out, version_re, unrecognized[0]) def frange(start=0, stop=None, step=1): From 88fb9425775da7f92d24e8b5f3009cafb56e94d6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 13:32:05 +0530 Subject: [PATCH 45/84] Add message when there are no subtitles/thumbnails Closes #5551 --- yt_dlp/YoutubeDL.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9ef56a46b6..866d069b76 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3930,7 +3930,7 @@ def _write_description(self, label, ie_result, descfn): elif not self.params.get('overwrites', True) and os.path.exists(descfn): self.to_screen(f'[info] {label.title()} description is already present') elif ie_result.get('description') is None: - self.report_warning(f'There\'s no {label} description to write') + self.to_screen(f'[info] There\'s no {label} description to write') return False else: try: @@ -3946,15 +3946,18 @@ def _write_subtitles(self, info_dict, filename): ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error''' ret = [] subtitles = info_dict.get('requested_subtitles') - if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): + if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE return ret - + elif not subtitles: + self.to_screen('[info] There\'s no subtitles for the requested languages') + return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: self.to_screen('[info] Skipping writing video subtitles') return ret + for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -4001,6 +4004,9 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] + if not thumbnails: + self.to_screen(f'[info] There\'s no {label} thumbnails to download') + return ret multiple = write_all and len(thumbnails) > 1 if thumb_filename_base is None: From 2a06bb4eb671eb306a2687ef0a4f853b936f05e0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 13:42:43 +0530 Subject: [PATCH 46/84] Add `--compat-options 2021,2022` Use these to guard against future compat changes. This allows devs to change defaults and make other potentially breaking changes more easily. If you need everything to work exactly as-is, put this in your config --- README.md | 2 ++ yt_dlp/options.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 4294090dc5..f6bf1175e2 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,8 @@ ### Differences in default behavior * `--compat-options all`: Use all compat options (Do NOT use) * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` +* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` +* `--compat-options 2022`: Currently does nothing. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/options.py b/yt_dlp/options.py index be4695cbb5..e9766c02d7 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -470,6 +470,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): }, 'aliases': { 'youtube-dl': ['all', '-multistreams'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], + '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], + '2022': [], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' From 78d25e0b7c2b45597e193c0decb33f4f248502a9 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:10:51 +0530 Subject: [PATCH 47/84] [extractor/embedly] Handle vimeo embeds Closes #3360 --- yt_dlp/extractor/embedly.py | 62 +++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index 483d018bb4..db5ef055ec 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -1,13 +1,63 @@ import re import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from .youtube import YoutubeTabIE +from ..utils import parse_qs, smuggle_url, traverse_obj class EmbedlyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P[^#&]+)' + _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)' _TESTS = [{ 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'info_dict': { + 'id': 'UUGLim4T2loE5rwCMdpCIPVg', + 'modified_date': '20221225', + 'view_count': int, + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'uploader': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel': 'TraciJHines', + 'availability': 'public', + 'uploader_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'description': '', + 'tags': [], + 'title': 'Uploads from TraciJHines', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'params': {'noplaylist': True}, + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'age_limit': 0, + 'categories': ['Entertainment'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/SU4fj_aEMVw/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'TraciJHines', + 'uploader_id': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/channel/UCGLim4T2loE5rwCMdpCIPVg', + 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'upload_date': '20150211', + 'duration': 282, + 'availability': 'public', + 'channel_follower_count': int, + 'tags': 'count:39', + 'view_count': int, + 'comment_count': int, + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'like_count': int, + 'uploader': 'TraciJHines', + 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', + 'chapters': list, + + }, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=https://player.vimeo.com/video/1234567?h=abcdefgh', 'only_matching': True, }] @@ -21,4 +71,10 @@ def _extract_embed_urls(cls, url, webpage): yield urllib.parse.unquote(mobj.group('url')) def _real_extract(self, url): - return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) + qs = parse_qs(url) + src = urllib.parse.unquote(traverse_obj(qs, ('url', 0)) or '') + if src and YoutubeTabIE.suitable(src): + return self.url_result(src, YoutubeTabIE) + return self.url_result(smuggle_url( + urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))), + {'http_headers': {'Referer': url}})) From 26fdfc3704a278acada27cc420d67c6d3f71423b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:39:58 +0530 Subject: [PATCH 48/84] [extractor/biliintl:series] Make partial download of series faster --- yt_dlp/extractor/bilibili.py | 51 +++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 92620f697b..3274a427da 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -20,9 +20,11 @@ parse_count, parse_qs, qualities, + smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -881,16 +883,12 @@ def _get_formats(self, *, ep_id=None, aid=None): return formats - def _extract_video_info(self, video_data, *, ep_id=None, aid=None): + def _parse_video_metadata(self, video_data): return { - 'id': ep_id or aid, 'title': video_data.get('title_display') or video_data.get('title'), 'thumbnail': video_data.get('cover'), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), - 'formats': self._get_formats(ep_id=ep_id, aid=aid), - 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), - 'extractor_key': BiliIntlIE.ie_key(), } def _perform_login(self, username, password): @@ -975,9 +973,16 @@ class BiliIntlIE(BiliIntlBaseIE): 'only_matching': True, }] - def _real_extract(self, url): - season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') - video_id = ep_id or aid + def _make_url(video_id, series_id=None): + if series_id: + return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}' + return f'https://www.bilibili.tv/en/video/{video_id}' + + def _extract_video_metadata(self, url, video_id, season_id): + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('title'): + return smuggled_data + webpage = self._download_webpage(url, video_id) # Bstation layout initial_data = ( @@ -989,13 +994,26 @@ def _real_extract(self, url): if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = traverse_obj(season_json, - ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), - expected_type=dict, get_all=False) - return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) + video_data = traverse_obj(season_json, ( + 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id + ), expected_type=dict, get_all=False) + + return self._parse_video_metadata(video_data) + + def _real_extract(self, url): + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = ep_id or aid + + return { + 'id': video_id, + **self._extract_video_metadata(url, video_id, season_id), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), + } class BiliIntlSeriesIE(BiliIntlBaseIE): + IE_NAME = 'biliintl:series' _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', @@ -1021,9 +1039,12 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): def _entries(self, series_id): series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) - for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): - episode_id = str(episode.get('episode_id')) - yield self._extract_video_info(episode, ep_id=episode_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict): + episode_id = str(episode['episode_id']) + yield self.url_result(smuggle_url( + BiliIntlIE._make_url(episode_id, series_id), + self._parse_video_metadata(episode) + ), BiliIntlIE, episode_id) def _real_extract(self, url): series_id = self._match_id(url) From 193fb150b76c4aaf41fb2c98b073e7e1f8a108f0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 17:01:48 +0530 Subject: [PATCH 49/84] Fix bug in 119e40ef64b25f66a39246e87ce6c143cd34276d --- yt_dlp/YoutubeDL.py | 3 ++- yt_dlp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 866d069b76..8ce71a2dc6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3460,7 +3460,8 @@ def run_pp(self, pp, infodict): return infodict def run_all_pps(self, key, info, *, additional_pps=None): - self._forceprint(key, info) + if key != 'video': + self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: info = self.run_pp(pp, info) return info diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3490816c4c..9cb1324105 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -703,7 +703,7 @@ def parse_options(argv=None): postprocessors = list(get_postprocessors(opts)) - print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[2:]) + print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[3:]) any_getting = any(getattr(opts, k) for k in ( 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' From 8c53322cda75394a8d551dde20b2529ee5ad6e89 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Mon, 2 Jan 2023 02:16:25 +0900 Subject: [PATCH 50/84] [downloader/aria2c] Native progress for aria2c via RPC (#3724) Authored by: Lesmiscore, pukkandan Closes #2038 --- README.md | 3 +- yt_dlp/downloader/external.py | 109 ++++++++++++++++++++++++++++++++-- yt_dlp/options.py | 6 +- yt_dlp/utils.py | 9 +++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f6bf1175e2..83e69a236b 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,7 @@ ### Differences in default behavior * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior +* yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: `aria2c`). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is For ease of use, a few more compat options are available: @@ -160,7 +161,7 @@ ### Differences in default behavior * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Currently does nothing. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options no-external-downloader-progress`. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 5751383712..569839f6f4 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -1,9 +1,11 @@ import enum +import json import os.path import re import subprocess import sys import time +import uuid from .fragment import FragmentFD from ..compat import functools @@ -20,8 +22,10 @@ determine_ext, encodeArgument, encodeFilename, + find_available_port, handle_youtubedl_headers, remove_end, + sanitized_Request, traverse_obj, ) @@ -60,7 +64,6 @@ def real_download(self, filename, info_dict): } if filename != '-': fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(f'\r[{self.get_basename()}] Downloaded {fsize} bytes') self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -129,8 +132,7 @@ def _call_downloader(self, tmpfilename, info_dict): self._debug_cmd(cmd) if 'fragments' not in info_dict: - _, stderr, returncode = Popen.run( - cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + _, stderr, returncode = self._call_process(cmd, info_dict) if returncode and stderr: self.to_stderr(stderr) return returncode @@ -140,7 +142,7 @@ def _call_downloader(self, tmpfilename, info_dict): retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, frag_index=None, fatal=not skip_unavailable_fragments) for retry in retry_manager: - _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE) + _, stderr, returncode = self._call_process(cmd, info_dict) if not returncode: break # TODO: Decide whether to retry based on error code @@ -172,6 +174,9 @@ def _call_downloader(self, tmpfilename, info_dict): self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename)) return 0 + def _call_process(self, cmd, info_dict): + return Popen.run(cmd, text=True, stderr=subprocess.PIPE) + class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -256,6 +261,14 @@ def supports_manifest(manifest): def _aria2c_filename(fn): return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}' + def _call_downloader(self, tmpfilename, info_dict): + if 'no-external-downloader-progress' not in self.params.get('compat_opts', []): + info_dict['__rpc'] = { + 'port': find_available_port() or 19190, + 'secret': str(uuid.uuid4()), + } + return super()._call_downloader(tmpfilename, info_dict) + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', @@ -276,6 +289,12 @@ def _make_cmd(self, tmpfilename, info_dict): cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') cmd += self._configuration_args() + if '__rpc' in info_dict: + cmd += [ + '--enable-rpc', + f'--rpc-listen-port={info_dict["__rpc"]["port"]}', + f'--rpc-secret={info_dict["__rpc"]["secret"]}'] + # aria2c strips out spaces from the beginning/end of filenames and paths. # We work around this issue by adding a "./" to the beginning of the # filename and relative path, and adding a "/" at the end of the path. @@ -304,6 +323,88 @@ def _make_cmd(self, tmpfilename, info_dict): cmd += ['--', info_dict['url']] return cmd + def aria2c_rpc(self, rpc_port, rpc_secret, method, params=()): + # Does not actually need to be UUID, just unique + sanitycheck = str(uuid.uuid4()) + d = json.dumps({ + 'jsonrpc': '2.0', + 'id': sanitycheck, + 'method': method, + 'params': [f'token:{rpc_secret}', *params], + }).encode('utf-8') + request = sanitized_Request( + f'http://localhost:{rpc_port}/jsonrpc', + data=d, headers={ + 'Content-Type': 'application/json', + 'Content-Length': f'{len(d)}', + 'Ytdl-request-proxy': '__noproxy__', + }) + with self.ydl.urlopen(request) as r: + resp = json.load(r) + assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server' + return resp['result'] + + def _call_process(self, cmd, info_dict): + if '__rpc' not in info_dict: + return super()._call_process(cmd, info_dict) + + send_rpc = functools.partial(self.aria2c_rpc, info_dict['__rpc']['port'], info_dict['__rpc']['secret']) + started = time.time() + + fragmented = 'fragments' in info_dict + frag_count = len(info_dict['fragments']) if fragmented else 1 + status = { + 'filename': info_dict.get('_filename'), + 'status': 'downloading', + 'elapsed': 0, + 'downloaded_bytes': 0, + 'fragment_count': frag_count if fragmented else None, + 'fragment_index': 0 if fragmented else None, + } + self._hook_progress(status, info_dict) + + def get_stat(key, *obj, average=False): + val = tuple(filter(None, map(float, traverse_obj(obj, (..., ..., key))))) or [0] + return sum(val) / (len(val) if average else 1) + + with Popen(cmd, text=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) as p: + # Add a small sleep so that RPC client can receive response, + # or the connection stalls infinitely + time.sleep(0.2) + retval = p.poll() + while retval is None: + # We don't use tellStatus as we won't know the GID without reading stdout + # Ref: https://aria2.github.io/manual/en/html/aria2c.html#aria2.tellActive + active = send_rpc('aria2.tellActive') + completed = send_rpc('aria2.tellStopped', [0, frag_count]) + + downloaded = get_stat('totalLength', completed) + get_stat('completedLength', active) + speed = get_stat('downloadSpeed', active) + total = frag_count * get_stat('totalLength', active, completed, average=True) + if total < downloaded: + total = None + + status.update({ + 'downloaded_bytes': int(downloaded), + 'speed': speed, + 'total_bytes': None if fragmented else total, + 'total_bytes_estimate': total, + 'eta': (total - downloaded) / (speed or 1), + 'fragment_index': min(frag_count, len(completed) + 1) if fragmented else None, + 'elapsed': time.time() - started + }) + self._hook_progress(status, info_dict) + + if not active and len(completed) >= frag_count: + send_rpc('aria2.shutdown') + retval = p.wait() + break + + time.sleep(0.1) + retval = p.poll() + + return '', p.stderr.read(), retval + class HttpieFD(ExternalFD): AVAILABLE_OPT = '--version' diff --git a/yt_dlp/options.py b/yt_dlp/options.py index e9766c02d7..5bbb292dee 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -464,14 +464,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'allowed_values': { 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', - 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', - 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', + 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', + 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', }, 'aliases': { 'youtube-dl': ['all', '-multistreams'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': [], + '2022': ['no-external-downloader-progress'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 5af176b364..45a7e6eaa5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5243,6 +5243,15 @@ def random_birthday(year_field, month_field, day_field): } +def find_available_port(interface=''): + try: + with socket.socket() as sock: + sock.bind((interface, 0)) + return sock.getsockname()[1] + except OSError: + return None + + # Templates for internet shortcut files, which are plain text files. DOT_URL_LINK_TEMPLATE = '''\ [InternetShortcut] From e756f45ba0648f972be71ce328419a623e381028 Mon Sep 17 00:00:00 2001 From: Matthew Date: Mon, 2 Jan 2023 04:55:11 +0000 Subject: [PATCH 51/84] Improve handling for overriding extractors with plugins (#5916) * Extractors replaced with plugin extractors now show in debug output * Better testcase handling * Added documentation Authored by: coletdjnz, pukkandan --- README.md | 9 ++++++--- yt_dlp/YoutubeDL.py | 22 +++++++++++++++------- yt_dlp/extractor/common.py | 13 +++++++++++-- yt_dlp/extractor/extractors.py | 2 ++ yt_dlp/extractor/testurl.py | 11 ++++++----- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 83e69a236b..c4bd6ef0c7 100644 --- a/README.md +++ b/README.md @@ -1841,7 +1841,7 @@ ## Installing Plugins * Source: where `/yt_dlp/__main__.py`, `/yt-dlp-plugins//yt_dlp_plugins/` 3. **pip and other locations in `PYTHONPATH`** - * Plugin packages can be installed and managed using `pip`. See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. + * Plugin packages can be installed and managed using `pip`. See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. * Note: plugin files between plugin packages installed with pip must have unique filenames * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. * Note: This does not apply for Pyinstaller/py2exe builds. @@ -1854,9 +1854,12 @@ ## Installing Plugins ## Developing Plugins -See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. +See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. -All public classes with a name ending in `IE` are imported from each file. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) +All public classes with a name ending in `IE`/`PP` are imported from each file for extractors and postprocessors repectively. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) + +To replace an existing extractor with a subclass of one, set the `plugin_name` class keyword argument (e.g. `MyPluginIE(ABuiltInIE, plugin_name='myplugin')` will replace `ABuiltInIE` with `MyPluginIE`). +Due to the mechanics behind this, you should exclude the subclass extractor from being imported separately by making it private using one of the methods described above. If you are a plugin author, add [yt-dlp-plugins](https://github.com/topics/yt-dlp-plugins) as a topic to your repository for discoverability diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8ce71a2dc6..e7b4690590 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -33,7 +33,7 @@ from .extractor.openload import PhantomJSwrapper from .minicurses import format_text from .plugins import directories as plugin_directories -from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors +from .postprocessor import _PLUGIN_CLASSES as plugin_pps from .postprocessor import ( EmbedThumbnailPP, FFmpegFixupDuplicateMoovPP, @@ -3730,7 +3730,10 @@ def print_debug_header(self): # These imports can be slow. So import them only as needed from .extractor.extractors import _LAZY_LOADER - from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors + from .extractor.extractors import ( + _PLUGIN_CLASSES as plugin_ies, + _PLUGIN_OVERRIDES as plugin_ie_overrides + ) def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) @@ -3808,12 +3811,17 @@ def get_encoding(stream): proxy_map.update(handler.proxies) write_debug(f'Proxy map: {proxy_map}') - for plugin_type, plugins in {'Extractor': plugin_extractors, 'Post-Processor': plugin_postprocessors}.items(): - if not plugins: - continue - write_debug(f'{plugin_type} Plugins: %s' % (', '.join(sorted(('%s%s' % ( + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): + display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in plugins.items()))))) + for name, klass in plugins.items()] + if plugin_type == 'Extractor': + display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' + for parent, plugins in plugin_ie_overrides.items()) + if not display_list: + continue + write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}') + plugin_dirs = plugin_directories() if plugin_dirs: write_debug(f'Plugin directories: {plugin_dirs}') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 9031f3c116..f48b97a6b6 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3442,13 +3442,17 @@ def get_testcases(cls, include_onlymatching=False): continue t['name'] = cls.ie_key() yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_testcases(include_onlymatching) @classmethod def get_webpage_testcases(cls): tests = vars(cls).get('_WEBPAGE_TESTS', []) for t in tests: t['name'] = cls.ie_key() - return tests + yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_webpage_testcases() @classproperty(cache=True) def age_limit(cls): @@ -3710,10 +3714,12 @@ def __init_subclass__(cls, *, plugin_name=None, **kwargs): if plugin_name: mro = inspect.getmro(cls) super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] - cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key + cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}' while getattr(super_class, '__wrapped__', None): super_class = super_class.__wrapped__ setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + _PLUGIN_OVERRIDES[super_class].append(cls) return super().__init_subclass__(**kwargs) @@ -3770,3 +3776,6 @@ class UnsupportedURLIE(InfoExtractor): def _real_extract(self, url): raise UnsupportedError(url) + + +_PLUGIN_OVERRIDES = collections.defaultdict(list) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index beda02917e..baa69d2421 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -24,3 +24,5 @@ globals().update(_PLUGIN_CLASSES) _ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() + +from .common import _PLUGIN_OVERRIDES # noqa: F401 diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py index dccca10046..0da01aa53e 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -23,11 +23,12 @@ def _real_extract(self, url): if len(matching_extractors) == 0: raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True) elif len(matching_extractors) > 1: - try: # Check for exact match - extractor = next( - ie for ie in matching_extractors - if ie.IE_NAME.lower() == extractor_id.lower()) - except StopIteration: + extractor = next(( # Check for exact match + ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower() + ), None) or next(( # Check for exact match without plugin suffix + ie for ie in matching_extractors if ie.IE_NAME.split('+')[0].lower() == extractor_id.lower() + ), None) + if not extractor: raise ExtractorError( 'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors), expected=True) From b23b503e22ff577d23920e877ee73da478bb4c6f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:44:54 +0000 Subject: [PATCH 52/84] [extractor/odnoklassniki] Extract subtitles (#5920) Closes #5744 Authored by: bashonly --- yt_dlp/extractor/odnoklassniki.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 4f325f0878..4b73eed37e 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -11,6 +11,7 @@ int_or_none, qualities, smuggle_url, + traverse_obj, unescapeHTML, unified_strdate, unsmuggle_url, @@ -153,6 +154,26 @@ class OdnoklassnikiIE(InfoExtractor): 'title': 'Быковское крещение', 'duration': 3038.181, }, + 'skip': 'HTTP Error 400', + }, { + 'note': 'subtitles', + 'url': 'https://ok.ru/video/4249587550747', + 'info_dict': { + 'id': '4249587550747', + 'ext': 'mp4', + 'title': 'Small Country An African Childhood (2020) (1080p) +subtitle', + 'uploader': 'Sunflower Movies', + 'uploader_id': '595802161179', + 'upload_date': '20220816', + 'duration': 6728, + 'age_limit': 0, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', + 'like_count': int, + 'subtitles': dict, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -202,6 +223,7 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': 0, 'duration': 10444, }, + 'skip': 'Site no longer embeds', }] @classmethod @@ -294,6 +316,16 @@ def _extract_desktop(self, url): like_count = int_or_none(metadata.get('likeCount')) + subtitles = {} + for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('language') or 'en', []).append({ + 'url': sub_url, + 'ext': 'vtt', + }) + info = { 'id': video_id, 'title': title, @@ -305,6 +337,7 @@ def _extract_desktop(self, url): 'like_count': like_count, 'age_limit': age_limit, 'start_time': start_time, + 'subtitles': subtitles, } # pladform From 13f930abc0c91d8e50336488e4c55defe97aa588 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:46:06 +0000 Subject: [PATCH 53/84] [extractor/fifa] Fix Preplay extraction (#5921) Closes #5839 Authored by: dirkf --- yt_dlp/extractor/fifa.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index dc00edcb31..8b4db3a8ae 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -17,8 +17,10 @@ class FifaIE(InfoExtractor): 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', 'ext': 'mp4', 'categories': ['FIFA Tournaments'], - 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero', 'duration': 8165, + 'release_timestamp': 1152403200, + 'release_date': '20060709', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -54,7 +56,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) preconnect_link = self._search_regex( - r']+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + r']+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') video_details = self._download_json( f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) @@ -62,22 +64,9 @@ def _real_extract(self, url): preplay_parameters = self._download_json( f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] - cid = preplay_parameters['contentId'] content_data = self._download_json( - f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ - 'v': preplay_parameters['preplayAPIVersion'], - 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], - 'rn': preplay_parameters['randomNumber'], - 'exp': preplay_parameters['tokenExpirationDate'], - 'ct': preplay_parameters['contentType'], - 'cid': cid, - 'mbtracks': preplay_parameters['tracksAssetNumber'], - 'ad': preplay_parameters['adConfiguration'], - 'ad.preroll': int(preplay_parameters['adPreroll']), - 'ad.cmsid': preplay_parameters['adCMSSourceId'], - 'ad.vid': preplay_parameters['adSourceVideoID'], - 'sig': preplay_parameters['signature'], - }) + 'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters), + video_id, 'Downloading Content Data') formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) From d7f98714696a4c9691ed28fb9b63395b9227646a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:50:37 +0000 Subject: [PATCH 54/84] [extractor/iqiyi] Fix `Iq` JS regex (#5922) Closes #5702 Authored by: bashonly --- yt_dlp/extractor/iqiyi.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index dbc688fb92..eba89f787e 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -527,11 +527,14 @@ def _extract_vms_player_js(self, webpage, video_id): webpack_js_url = self._proto_relative_url(self._search_regex( r').*?\.setup\s*\((?P[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -3237,19 +3243,20 @@ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - entries = [] + if not isinstance(jwplayer_data, dict): + return entries - # JWPlayer backward compatibility: single playlist item + playlist_items = jwplayer_data.get('playlist') + # JWPlayer backward compatibility: single playlist item/flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if not isinstance(playlist_items, list): + playlist_items = (playlist_items or jwplayer_data, ) - for video_data in jwplayer_data['playlist']: + for video_data in playlist_items: + if not isinstance(video_data, dict): + continue # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: @@ -3287,6 +3294,13 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... + 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -3304,7 +3318,7 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -3313,14 +3327,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -3335,13 +3349,12 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), @@ -3349,6 +3362,7 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'filesize': int_or_none(source.get('filesize')), 'ext': ext, + 'format_id': format_id } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ffc2790230..14d492f075 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -32,6 +32,7 @@ unified_timestamp, unsmuggle_url, url_or_none, + urljoin, variadic, xpath_attr, xpath_text, @@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'description': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Untested major version'], }, { # KVS Player 'url': 'https://www.kvs-demo.com/embed/105/', @@ -1880,35 +1883,12 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July / Embed Player', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, }, { - # KVS Player - 'url': 'https://thisvid.com/videos/french-boy-pantsed/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player - 'url': 'https://thisvid.com/embed/2400174/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player 'url': 'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { @@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://youix.com/embed/18485', @@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Ленинград - ЗОЖ', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', 'md5': '94166bdb26b4cb1fb9214319a629fc51', 'info_dict': { 'id': '21217', - 'display_id': '40-nochey-40-nights-2016', + 'display_id': '40-nochey-2016', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', - } + }, }, { # KVS Player (for sites that serve kt_player.js via non-https urls) @@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor): 'id': '389508', 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', 'ext': 'mp4', - 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', - 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', - } + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, }, { # Reddit-hosted video that will redirect and be processed by RedditIE @@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor): 'direct': True, 'age_limit': 0, } - } + }, + { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg', + 'height': 720, + 'age_limit': 18, + }, + }, ] def report_following_redirect(self, new_url): @@ -2235,43 +2229,87 @@ def itunes(key): 'entries': entries, } - def _kvs_getrealurl(self, video_url, license_code): + @classmethod + def _kvs_get_real_url(cls, video_url, license_code): if not video_url.startswith('function/0/'): return video_url # not obfuscated - url_path, _, url_query = video_url.partition('?') - urlparts = url_path.split('/')[2:] - license = self._kvs_getlicensetoken(license_code) - newmagic = urlparts[5][:32] + parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) + license = cls._kvs_get_license_token(license_code) + urlparts = parsed.path.split('/') - for o in range(len(newmagic) - 1, -1, -1): - new = '' - l = (o + sum(int(n) for n in license[o:])) % 32 + HASH_LENGTH = 32 + hash = urlparts[3][:HASH_LENGTH] + indices = list(range(HASH_LENGTH)) - for i in range(0, len(newmagic)): - if i == o: - new += newmagic[l] - elif i == l: - new += newmagic[o] - else: - new += newmagic[i] - newmagic = new + # Swap indices of hash according to the destination calculated from the license token + accum = 0 + for src in reversed(range(HASH_LENGTH)): + accum += license[src] + dest = (src + accum) % HASH_LENGTH + indices[src], indices[dest] = indices[dest], indices[src] - urlparts[5] = newmagic + urlparts[5][32:] - return '/'.join(urlparts) + '?' + url_query + urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:] + return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) - def _kvs_getlicensetoken(self, license): - modlicense = license.replace('$', '').replace('0', '1') - center = int(len(modlicense) / 2) + @staticmethod + def _kvs_get_license_token(license): + license = license.replace('$', '') + license_values = [int(char) for char in license] + + modlicense = license.replace('0', '1') + center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) + modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1] - modlicense = str(4 * abs(fronthalf - backhalf)) - retval = '' - for o in range(0, center + 1): - for i in range(1, 5): - retval += str((int(license[o + i]) + int(modlicense[o])) % 10) - return retval + return [ + (license_values[index + offset] + current) % 10 + for index, current in enumerate(map(int, modlicense)) + for offset in range(4) + ] + + def _extract_kvs(self, url, webpage, video_id): + flashvars = self._search_json( + r'(?s:]*>.*?var\s+flashvars\s*=)', + webpage, 'flashvars', video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:' + r'|)', + webpage, 'display_id', fatal=False) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(f'{key}_text', key) + formats.append({ + 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])), + 'format_id': format_id, + 'ext': 'mp4', + **(parse_resolution(format_id) or parse_resolution(flashvars[key])), + 'http_headers': {'Referer': url}, + }) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } def _real_extract(self, url): if url.startswith('//'): @@ -2580,6 +2618,17 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): self.report_detected('video.js embed') return [{'formats': formats, 'subtitles': subtitles}] + # Look for generic KVS player (before json-ld bc of some urls that break otherwise) + found = self._search_regex(( + r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) + if found: + self.report_detected('KWS Player') + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning(f'Untested major version ({found}) in player engine - download may fail.') + return [self._extract_kvs(url, webpage, video_id)] + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): @@ -2622,52 +2671,6 @@ def filter_video(urls): ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) if found: self.report_detected('JW Player embed') - if not found: - # Look for generic KVS player - found = re.search(r'', webpage) - flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) - - # extract the part after the last / as the display_id from the - # canonical URL. - display_id = self._search_regex( - r'(?:' - r'|)', - webpage, 'display_id', fatal=False - ) - title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') - - thumbnail = flashvars['preview_url'] - if thumbnail.startswith('//'): - protocol, _, _ = url.partition('/') - thumbnail = protocol + thumbnail - - url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys())) - formats = [] - for key in url_keys: - if '/get_file/' not in flashvars[key]: - continue - format_id = flashvars.get(f'{key}_text', key) - formats.append({ - 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), - 'format_id': format_id, - 'ext': 'mp4', - **(parse_resolution(format_id) or parse_resolution(flashvars[key])) - }) - if not formats[-1].get('height'): - formats[-1]['quality'] = 1 - - return [{ - 'id': flashvars['video_id'], - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - }] if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index 2d9b9a7425..d1fc058b92 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -1,71 +1,128 @@ +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) -class PeekVidsIE(InfoExtractor): +class PeekVidsBaseIE(InfoExtractor): + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).group('domain', 'id') + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}', + video_id=video_id, expected=True) + + title = self._html_search_regex(r'(?s)]*>(.+?)

', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + f'https://www.{domain}/v-alt/{video_id}', video_id, + note='Downloading list of source files') + + formats = [] + for k, v in srcs.items(): + f_url = url_or_none(v) + if not f_url: + continue + + height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None) + if not height: + continue + + formats.append({ + 'url': f_url, + 'format_id': height, + 'height': int_or_none(height), + }) + + if not formats: + formats = [{'url': url} for url in srcs.values()] + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) + + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = (get_element_by_class('detail-video-block', webpage) + or get_element_by_class('detail-block', webpage) or '') + info['description'] = self._html_search_regex( + rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|]*>\s*{re.escape(name)}\s*:\s*(.+?)', + html, name, default='') + return list(filter(None, re.split(r'\s+', l))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PeekVidsIE(PeekVidsBaseIE): _VALID_URL = r'''(?x) - https?://(?:www\.)?peekvids\.com/ + https?://(?:www\.)?(?Ppeekvids\.com)/ (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) (?P[^/?&#]*) ''' _TESTS = [{ 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', - 'md5': 'a00940646c428e232407e3e62f0e8ef5', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', 'info_dict': { - 'id': 'BSyLMbN0YCd', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1642579329, 'upload_date': '20220119', 'duration': 416, 'view_count': int, 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, }, }] - _DOMAIN = 'www.peekvids.com' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - short_video_id = self._html_search_regex(r'