From c6ddbdb66c5d6ead5e198013c54ef53d641063f1 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 12:30:07 +1200 Subject: [PATCH 1/7] [voicerepublic] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/voicerepublic.py | 55 +++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/voicerepublic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f117578a26..5cb3c304d1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -634,6 +634,7 @@ VKUserVideosIE, ) from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py new file mode 100644 index 0000000000..1a90693cb7 --- /dev/null +++ b/youtube_dl/extractor/voicerepublic.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import ( + compat_urllib_request, +) + + +class VoiceRepublicIE(InfoExtractor): + _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' + _TEST = { + 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + 'md5': '0554a24d1657915aa8e8f84e15dc9353', + 'info_dict': { + 'id': '2296', + 'ext': 'm4a', + 'title': 'Watching the Watchers: Building a Sousveillance State', + 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', + 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'creator': 'M. C. McGrath', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + req = compat_urllib_request.Request(url) + # Older versions of Firefox get redirected to an "upgrade browser" page + req.add_header('User-Agent', 'youtube-dl') + webpage = self._download_webpage(req, display_id) + thumbnail = self._og_search_thumbnail(webpage) + video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') + + if '
', webpage, 'author', fatal=False), + } From f900dc3fb9e17e399b0f33925ee239696cc46010 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:01:58 +1200 Subject: [PATCH 2/7] [voicerepublic] Extract author using _html_search_meta --- youtube_dl/extractor/voicerepublic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 1a90693cb7..7d255d6fad 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -51,5 +51,5 @@ def _real_extract(self, url): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._search_regex(r'', webpage, 'author', fatal=False), + 'creator': self._html_search_meta('author', webpage), } From 03f760b1c0478c1f65cf6e978d7592be46873313 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:40:09 +1200 Subject: [PATCH 3/7] [voicerepublic] Remove creator field --- youtube_dl/extractor/voicerepublic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 7d255d6fad..960974e167 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -19,7 +19,6 @@ class VoiceRepublicIE(InfoExtractor): 'title': 'Watching the Watchers: Building a Sousveillance State', 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', 'description': 'md5:715ba964958afa2398df615809cfecb1', - 'creator': 'M. C. McGrath', } } @@ -51,5 +50,4 @@ def _real_extract(self, url): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._html_search_meta('author', webpage), } From f03a8a3c4ec4dc95164c12181ffc1ddcb7583ef6 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:12:29 +1200 Subject: [PATCH 4/7] [voicerepublic] Raise ExtractorError if audio is still being processed --- youtube_dl/extractor/voicerepublic.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 960974e167..d3e35a815b 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -2,10 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, -) +from ..compat import compat_urllib_request +from ..utils import ExtractorError class VoiceRepublicIE(InfoExtractor): @@ -31,17 +29,16 @@ def _real_extract(self, url): thumbnail = self._og_search_thumbnail(webpage) video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if '
Queued for processing, please stand by...' in webpage: + raise ExtractorError('Audio is still queued for processing') + + formats = [{ + 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } for ext in ['m4a', 'mp3', 'ogg']] + self._sort_formats(formats) return { 'id': video_id, From 28ebef0b1b1b7b97137fbd8e093c09cb51954606 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:03:09 +1200 Subject: [PATCH 5/7] [voicerepublic] Detect list of available formats from the web page --- youtube_dl/extractor/voicerepublic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d3e35a815b..d150b5b5e2 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ExtractorError @@ -32,12 +34,15 @@ def _real_extract(self, url): if 'Queued for processing, please stand by...' in webpage: raise ExtractorError('Audio is still queued for processing') + ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) + exts = [match.group(1) for match in ext_matches] + formats = [{ 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in ['m4a', 'mp3', 'ogg']] + } for ext in exts] self._sort_formats(formats) return { From 1dcb52188d3709711b3ea5ae1ff6bdb985e79c62 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:38:26 +1200 Subject: [PATCH 6/7] [voicerepublic] Remove hardcoded paths to media files --- youtube_dl/extractor/voicerepublic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d150b5b5e2..a3e40b9401 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -34,15 +34,12 @@ def _real_extract(self, url): if 'Queued for processing, please stand by...' in webpage: raise ExtractorError('Audio is still queued for processing') - ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) - exts = [match.group(1) for match in ext_matches] - formats = [{ - 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'url': 'https://voicerepublic.com' + path, 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in exts] + } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)] self._sort_formats(formats) return { From a6762c4a22325b5b69770de82df8725d2eb5c3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 18:29:15 +0600 Subject: [PATCH 7/7] [voicerepublic] Make more robust and extract more metadata --- youtube_dl/extractor/voicerepublic.py | 95 ++++++++++++++++++++------- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index a3e40b9401..1106c655b8 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,52 +1,99 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_request -from ..utils import ExtractorError +from ..compat import ( + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, +) class VoiceRepublicIE(InfoExtractor): - _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' - _TEST = { - 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P[0-9a-z-]+)' + _TESTS = [{ + 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', 'md5': '0554a24d1657915aa8e8f84e15dc9353', 'info_dict': { 'id': '2296', + 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', - 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', + 'duration': 1800, + 'view_count': int, } - } + }, { + 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - req = compat_urllib_request.Request(url) + + req = compat_urllib_request.Request( + compat_urlparse.urljoin(url, '/talks/%s' % display_id)) # Older versions of Firefox get redirected to an "upgrade browser" page req.add_header('User-Agent', 'youtube-dl') webpage = self._download_webpage(req, display_id) - thumbnail = self._og_search_thumbnail(webpage) - video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if 'Queued for processing, please stand by...' in webpage: - raise ExtractorError('Audio is still queued for processing') + if '>Queued for processing, please stand by...<' in webpage: + raise ExtractorError( + 'Audio is still queued for processing', expected=True) - formats = [{ - 'url': 'https://voicerepublic.com' + path, - 'ext': ext, - 'format_id': ext, - 'vcodec': 'none', - } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)] + data = self._parse_json( + self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None), + display_id, fatal=False) + + if data: + title = data['title'] + description = data.get('teaser') + talk_id = data.get('talk_id') or display_id + talk = data['talk'] + duration = int_or_none(talk.get('duration')) + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['links'].items()] + else: + title = self._og_search_title(webpage) + description = self._html_search_regex( + r"(?s)
]*>(.+?)
", + webpage, 'description', fatal=False) + talk_id = self._search_regex( + [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], + webpage, 'talk id', default=None) or display_id + duration = None + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)] self._sort_formats(formats) + thumbnail = self._og_search_thumbnail(webpage) + view_count = int_or_none(self._search_regex( + r"class='play-count[^']*'>\s*(\d+) plays", + webpage, 'play count', fatal=False)) + return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'formats': formats, - 'url': self._og_search_url(webpage), + 'id': talk_id, + 'display_id': display_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, - 'description': self._og_search_description(webpage), + 'duration': duration, + 'view_count': view_count, + 'formats': formats, }