From 503406d4bc838b51c9b1adf0d3fd4a9efda26d30 Mon Sep 17 00:00:00 2001 From: lkho Date: Fri, 28 Aug 2020 23:44:50 +0800 Subject: [PATCH 1/8] [duboku] Add new extractor www.duboku.co --- youtube_dl/extractor/duboku.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/duboku.py diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py new file mode 100644 index 000000000..3e4cf8d5b --- /dev/null +++ b/youtube_dl/extractor/duboku.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import * + + +class DubokuIE(InfoExtractor): + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P[0-9\-]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'info_dict': { + 'id': '1575-1-1', + 'title': '白色月光', + 'season': 1, + 'episode': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*(.*)', html) + if mobj: + href = extract_attributes(mobj.group(0)).get('href') + if href: + mobj1 = re.search(r'/(\d+)\.html', href) + if mobj1 and mobj1.group(1) == series_id: + series_title = clean_html(mobj.group(0)) + series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title) + title = clean_html(html) + title = re.sub(r'[\s\r\n\t]+', ' ', title) + break + + data_url = player_data['url'] + assert data_url + data_from = player_data.get('from') + + # if it is an embedded iframe, maybe it's an external source + if data_from == 'iframe': + # use _type url_transparent to retain the meaningful details + # of the video. + return { + '_type': 'url_transparent', + 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + } + + formats = self._extract_m3u8_formats(data_url, video_id, 'ts') + + return { + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4b3092028..e6c008b6f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -282,6 +282,7 @@ ) from .dtube import DTubeIE from .dvtv import DVTVIE +from .duboku import DubokuIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE From de4144a4aedd6ab9f24ffa1a777bce99e019468e Mon Sep 17 00:00:00 2001 From: lkho Date: Sat, 29 Aug 2020 15:04:16 +0800 Subject: [PATCH 2/8] [duboku] add playlist extractor --- youtube_dl/extractor/duboku.py | 93 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 3e4cf8d5b..4db81a665 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -4,10 +4,49 @@ import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import * +def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + """Return the content of the tag with the specified attribute in the passed HTML document""" + + if tag is None: + tag = '[a-zA-Z0-9:._-]+' + if attribute is None: + attribute = '' + else: + attribute = r'\s+(?P%s)' % re.escape(attribute) + if value is None: + value = '' + else: + value = re.escape(value) if escape_value else value + value = '=[\'"]?(?P%s)[\'"]?' % value + + retlist = [] + for m in re.finditer(r'''(?xs) + <(?P%s) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + %s%s + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s*> + (?P.*?) + + ''' % (tag, attribute, value), html): + retlist.append(m) + + return retlist + + +def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value) + return retval[0] if retval else None + + class DubokuIE(InfoExtractor): + IE_NAME = 'duboku' + IE_DESC = 'www.duboku.co' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P[0-9\-]+)\.html.*' _TESTS = [{ 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', @@ -90,3 +129,57 @@ def _real_extract(self, url): 'episode_id': episode_id, 'formats': formats, } + + +class DubokuPlaylistIE(InfoExtractor): + IE_NAME = 'duboku:list' + IE_DESC = 'www.duboku.co entire series' + + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P[0-9]+)\.html.*' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + series_id = mobj.group('id') + fragment = compat_urlparse.urlparse(url).fragment + + webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_html = self._download_webpage(webpage_url, series_id) + + # extract title + + title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title') + title = unescapeHTML(title.group('content')) if title else None + if not title: + title = self._html_search_meta('keywords', webpage_html) + if not title: + title = _get_element_by_tag_and_attrib(webpage_html, 'title') + title = unescapeHTML(title.group('content')) if title else None + + # extract playlists + + playlists = {} + for div in _get_elements_by_tag_and_attrib( + webpage_html, attribute='id', value='playlist\\d+', escape_value=False): + playlist_id = div.group('value') + playlist = [] + for a in _get_elements_by_tag_and_attrib( + div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False): + playlist.append({ + 'href': unescapeHTML(a.group('value')), + 'title': unescapeHTML(a.group('content')) + }) + playlists[playlist_id] = playlist + + # select the specified playlist if url fragment exists + playlist = playlists.get(fragment) if fragment else next(iter(playlists.values())) + if not playlist: + raise ExtractorError( + 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist') + + # return url results + return self.playlist_result([ + self.url_result( + 'https://www.duboku.co' + x['href'], video_title=x.get('title')) + for x in playlist], series_id, title) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e6c008b6f..407701717 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -282,7 +282,10 @@ ) from .dtube import DTubeIE from .dvtv import DVTVIE -from .duboku import DubokuIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE From d82b6697c2d2143cff25029d4c79cb152fdff316 Mon Sep 17 00:00:00 2001 From: lkho Date: Sat, 29 Aug 2020 15:23:43 +0800 Subject: [PATCH 3/8] [duboku] add tests --- youtube_dl/extractor/duboku.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 4db81a665..27b2c9dc4 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -136,6 +136,26 @@ class DubokuPlaylistIE(InfoExtractor): IE_DESC = 'www.duboku.co entire series' _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P[0-9]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/vodplay/1575.html', + 'info_dict': { + 'id': '1575#playlist1', + 'title': '白色月光', + }, + 'playlist_count': 12, + }, { + 'url': 'https://www.duboku.co/vodplay/1554.html', + 'info_dict': { + 'id': '1554#playlist1', + 'title': '以家人之名', + }, + }, { + 'url': 'https://www.duboku.co/vodplay/1554.html#playlist2', + 'info_dict': { + 'id': '1554#playlist2', + 'title': '以家人之名', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -173,7 +193,15 @@ def _real_extract(self, url): playlists[playlist_id] = playlist # select the specified playlist if url fragment exists - playlist = playlists.get(fragment) if fragment else next(iter(playlists.values())) + playlist = None + playlist_id = None + if fragment: + playlist = playlists.get(fragment) + playlist_id = fragment + else: + first = next(iter(playlists.items())) + if first: + (playlist_id, playlist) = first if not playlist: raise ExtractorError( 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist') @@ -181,5 +209,5 @@ def _real_extract(self, url): # return url results return self.playlist_result([ self.url_result( - 'https://www.duboku.co' + x['href'], video_title=x.get('title')) - for x in playlist], series_id, title) + 'https://www.duboku.co' + x['href'], DubokuIE.IE_NAME, video_title=x.get('title')) + for x in playlist], series_id + '#' + playlist_id, title) From a8f88d2fece3b883f6a597c63273bf05df77ddca Mon Sep 17 00:00:00 2001 From: lkho Date: Sat, 29 Aug 2020 15:44:56 +0800 Subject: [PATCH 4/8] [duboku] fix test_no_duplicates --- youtube_dl/extractor/duboku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 27b2c9dc4..821585698 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -47,7 +47,7 @@ class DubokuIE(InfoExtractor): IE_NAME = 'duboku' IE_DESC = 'www.duboku.co' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P[0-9\-]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P[0-9]+-[0-9-]+)\.html.*' _TESTS = [{ 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', 'info_dict': { From 7cc9d5b32167f8080a5eba499643ab7b1e347ee9 Mon Sep 17 00:00:00 2001 From: lkho Date: Sat, 29 Aug 2020 16:25:42 +0800 Subject: [PATCH 5/8] [duboku] replace import *, fix tests --- youtube_dl/extractor/duboku.py | 44 +++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 821585698..cd92f5cf1 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -5,7 +5,16 @@ from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import * +from ..utils import ( + clean_html, + extract_attributes, + ExtractorError, + get_elements_by_class, + int_or_none, + js_to_json, + smuggle_url, + unescapeHTML, +) def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): @@ -52,9 +61,24 @@ class DubokuIE(InfoExtractor): 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', 'info_dict': { 'id': '1575-1-1', - 'title': '白色月光', - 'season': 1, - 'episode': 1, + 'ext': 'ts', + 'series': '白色月光', + 'title': 'contains:白色月光', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }, { + 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'info_dict': { + 'id': '1588-1-1', + 'ext': 'ts', + 'series': '亲爱的自己', + 'title': 'contains:预告片', + 'season_number': 1, + 'episode_number': 1, }, 'params': { 'skip_download': 'm3u8 download', @@ -137,24 +161,26 @@ class DubokuPlaylistIE(InfoExtractor): _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P[0-9]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/vodplay/1575.html', + 'url': 'https://www.duboku.co/voddetail/1575.html', 'info_dict': { - 'id': '1575#playlist1', + 'id': 'startswith:1575', 'title': '白色月光', }, 'playlist_count': 12, }, { - 'url': 'https://www.duboku.co/vodplay/1554.html', + 'url': 'https://www.duboku.co/voddetail/1554.html', 'info_dict': { - 'id': '1554#playlist1', + 'id': 'startswith:1554', 'title': '以家人之名', }, + 'playlist_mincount': 30, }, { - 'url': 'https://www.duboku.co/vodplay/1554.html#playlist2', + 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', 'info_dict': { 'id': '1554#playlist2', 'title': '以家人之名', }, + 'playlist_mincount': 27, }] def _real_extract(self, url): From bf7392922f801227b95103925e9df3ccc764a000 Mon Sep 17 00:00:00 2001 From: lkho Date: Sun, 30 Aug 2020 15:53:07 +0800 Subject: [PATCH 6/8] [duboku] fix list results, minor error checking --- youtube_dl/extractor/duboku.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index cd92f5cf1..136ee392e 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -101,7 +101,7 @@ def _real_extract(self, url): player_data = self._search_regex( self._PLAYER_DATA_PATTERN, webpage_html, 'player_data') - player_data = self._parse_json(js_to_json(player_data), video_id) + player_data = self._parse_json(player_data, video_id, js_to_json) # extract title @@ -121,8 +121,9 @@ def _real_extract(self, url): title = re.sub(r'[\s\r\n\t]+', ' ', title) break - data_url = player_data['url'] - assert data_url + data_url = player_data.get('url') + if not data_url: + raise ExtractorError('Cannot find url in player_data') data_from = player_data.get('from') # if it is an embedded iframe, maybe it's an external source @@ -225,7 +226,7 @@ def _real_extract(self, url): playlist = playlists.get(fragment) playlist_id = fragment else: - first = next(iter(playlists.items())) + first = next(iter(playlists.items()), None) if first: (playlist_id, playlist) = first if not playlist: @@ -235,5 +236,6 @@ def _real_extract(self, url): # return url results return self.playlist_result([ self.url_result( - 'https://www.duboku.co' + x['href'], DubokuIE.IE_NAME, video_title=x.get('title')) + compat_urlparse.urljoin('https://www.duboku.co', x['href']), + ie=DubokuIE.ie_key(), video_title=x.get('title')) for x in playlist], series_id + '#' + playlist_id, title) From 1b8805f8310fc472e627926d5c5a9446babd1cc3 Mon Sep 17 00:00:00 2001 From: lkho Date: Mon, 7 Sep 2020 21:03:39 +0800 Subject: [PATCH 7/8] [duboku] add referer header --- youtube_dl/extractor/duboku.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 136ee392e..3d3b24d80 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -153,6 +153,7 @@ def _real_extract(self, url): 'episode_number': int_or_none(episode_id), 'episode_id': episode_id, 'formats': formats, + 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} } From b0f50733a12a3196f4ab29064804e065b6d045c1 Mon Sep 17 00:00:00 2001 From: lkho Date: Wed, 9 Sep 2020 03:15:08 +0800 Subject: [PATCH 8/8] [duboku] change ext to mp4 --- youtube_dl/extractor/duboku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/duboku.py b/youtube_dl/extractor/duboku.py index 3d3b24d80..fdc695bf4 100644 --- a/youtube_dl/extractor/duboku.py +++ b/youtube_dl/extractor/duboku.py @@ -142,7 +142,7 @@ def _real_extract(self, url): 'episode_id': episode_id, } - formats = self._extract_m3u8_formats(data_url, video_id, 'ts') + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') return { 'id': video_id,