From 87f89dacddfa46399aea9252ca078f5f386dce38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jun 2018 02:55:20 +0700 Subject: [PATCH] [pbs] Improve extraction (closes #16623, closes #16684) --- youtube_dl/extractor/pbs.py | 57 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -375,6 +376,35 @@ class PBSIE(InfoExtractor): }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -438,6 +468,7 @@ def _extract_webpage(self, url): r'', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r']+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r']+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -472,7 +503,8 @@ def _extract_webpage(self, url): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -482,13 +514,27 @@ def _extract_webpage(self, url): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'