From c4d95f67ddc522297bb1fea875255cf94b34d595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kotiuk?= Date: Sun, 20 Oct 2024 23:16:22 +0200 Subject: [PATCH] [ie/cda] Support folders (#10786) Closes #5429 Authored by: pktiuk --- yt_dlp/extractor/_extractors.py | 5 +++- yt_dlp/extractor/cda.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4b1f4c316..8d5936094 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -363,7 +363,10 @@ ) from .ccma import CCMAIE from .cctv import CCTVIE -from .cda import CDAIE +from .cda import ( + CDAIE, + CDAFolderIE, +) from .cellebrite import CellebriteIE from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 62ee8b17f..b2738e492 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -12,6 +12,7 @@ from ..compat import compat_ord from ..utils import ( ExtractorError, + OnDemandPagedList, float_or_none, int_or_none, merge_dicts, @@ -351,3 +352,50 @@ def extract_format(page, version): extract_format(webpage, resolution) return merge_dicts(info_dict, info) + + +class CDAFolderIE(InfoExtractor): + _MAX_PAGE_SIZE = 36 + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P\w+)/folder/(?P\d+)' + _TESTS = [ + { + 'url': 'https://www.cda.pl/domino264/folder/31188385', + 'info_dict': { + 'id': '31188385', + 'title': 'SERIA DRUGA', + }, + 'playlist_mincount': 13, + }, + { + 'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm', + 'info_dict': { + 'id': '2664592', + 'title': 'VideoDowcipy - wszystkie odcinki', + }, + 'playlist_mincount': 71, + }, + { + 'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm', + 'info_dict': { + 'id': '19129979', + 'title': 'TESTY KOSMETYKĂ“W', + }, + 'playlist_mincount': 139, + }] + + def _real_extract(self, url): + folder_id, channel = self._match_valid_url(url).group('id', 'channel') + + webpage = self._download_webpage(url, folder_id) + + def extract_page_entries(page): + webpage = self._download_webpage( + f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id, + f'Downloading page {page + 1}', expected_status=404) + items = re.findall(r']+href="/video/([0-9a-z]+)"', webpage) + for video_id in items: + yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id) + + return self.playlist_result( + OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE), + folder_id, self._og_search_title(webpage))