[extractor/cbsnews] Overhaul extractors (#6681)

Closes #6565
Authored by: bashonly
This commit is contained in:
bashonly 2023-05-29 05:07:35 -05:00 committed by GitHub
parent fd5d93f704
commit f6e43d6fa9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 346 additions and 170 deletions

View File

@ -313,14 +313,14 @@
CBSIE, CBSIE,
ParamountPressExpressIE, ParamountPressExpressIE,
) )
from .cbslocal import (
CBSLocalIE,
CBSLocalArticleIE,
)
from .cbsinteractive import CBSInteractiveIE from .cbsinteractive import CBSInteractiveIE
from .cbsnews import ( from .cbsnews import (
CBSNewsEmbedIE, CBSNewsEmbedIE,
CBSNewsIE, CBSNewsIE,
CBSLocalIE,
CBSLocalArticleIE,
CBSLocalLiveIE,
CBSNewsLiveIE,
CBSNewsLiveVideoIE, CBSNewsLiveVideoIE,
) )
from .cbssports import ( from .cbssports import (

View File

@ -336,7 +336,7 @@ def _get_anvato_videos(self, access_key, video_id, token):
elif media_format == 'm3u8-variant' or ext == 'm3u8': elif media_format == 'm3u8-variant' or ext == 'm3u8':
# For some videos the initial m3u8 URL returns JSON instead # For some videos the initial m3u8 URL returns JSON instead
manifest_json = self._download_json( manifest_json = self._download_json(
video_url, video_id, note='Downloading manifest JSON', errnote=False) video_url, video_id, note='Downloading manifest JSON', fatal=False)
if manifest_json: if manifest_json:
video_url = manifest_json.get('master_m3u8') video_url = manifest_json.get('master_m3u8')
if not video_url: if not video_url:
@ -392,14 +392,6 @@ def _extract_from_webpage(cls, url, webpage):
url = smuggle_url(url, {'token': anvplayer_data['token']}) url = smuggle_url(url, {'token': anvplayer_data['token']})
yield cls.url_result(url, AnvatoIE, video_id) yield cls.url_result(url, AnvatoIE, video_id)
def _extract_anvato_videos(self, webpage, video_id):
    """Extract the Anvato video described by the player data embedded in `webpage`.

    The player JSON is located with `self._ANVP_RE` (named group `anvp`);
    its `accessKey` and `video` entries are handed to `_get_anvato_videos`.
    """
    raw_player_json = self._html_search_regex(
        self._ANVP_RE, webpage, 'Anvato player data', group='anvp')
    player = self._parse_json(raw_player_json, video_id)
    access_key, anvato_video = player['accessKey'], player['video']
    # cbslocal token = 'default'
    return self._get_anvato_videos(access_key, anvato_video, 'default')
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
self._initialize_geo_bypass({ self._initialize_geo_bypass({

View File

@ -1,116 +0,0 @@
from .anvato import AnvatoIE
from .sendtonews import SendtoNewsIE
from ..compat import compat_urlparse
from ..utils import (
parse_iso8601,
unified_timestamp,
)
class CBSLocalIE(AnvatoIE):  # XXX: Do not subclass from concrete IE
    # Matches <station>.cbslocal.com/video/<numeric id> pages and hands the
    # numeric ID over to the Anvato extractor.
    _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
    _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'

    _TESTS = [
        {
            'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
            'info_dict': {
                'id': '3580809',
                'ext': 'mp4',
                'title': 'A Very Blue Anniversary',
                'description': 'CBS2s Cindy Hsu has more.',
                'thumbnail': 're:^https?://.*',
                'timestamp': int,
                'upload_date': r're:^\d{8}$',
                'uploader': 'CBS',
                'subtitles': {'en': 'mincount:5'},
                'categories': [
                    'Stations\\Spoken Word\\WCBSTV',
                    'Syndication\\AOL',
                    'Syndication\\MSN',
                    'Syndication\\NDN',
                    'Syndication\\Yahoo',
                    'Content\\News',
                    'Content\\News\\Local News',
                ],
                'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
            },
            'params': {'skip_download': True},
        },
    ]

    def _real_extract(self, url):
        """Delegate to the Anvato extractor using the numeric ID from the URL."""
        video_id = self._match_id(url)
        anvato_url = 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + video_id
        return self.url_result(anvato_url, 'Anvato', video_id)
class CBSLocalArticleIE(AnvatoIE):  # XXX: Do not subclass from concrete IE
    # Matches dated article pages (<station>.cbslocal.com/YYYY/MM/DD/<slug>).
    _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'

    _TESTS = [
        {
            # Anvato backend
            'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
            'md5': 'f0ee3081e3843f575fccef901199b212',
            'info_dict': {
                'id': '3401037',
                'ext': 'mp4',
                'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
                'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
                'thumbnail': 're:^https?://.*',
                'timestamp': 1463440500,
                'upload_date': '20160516',
                'uploader': 'CBS',
                'subtitles': {'en': 'mincount:5'},
                'categories': [
                    'Stations\\Spoken Word\\KCBSTV',
                    'Syndication\\MSN',
                    'Syndication\\NDN',
                    'Syndication\\AOL',
                    'Syndication\\Yahoo',
                    'Syndication\\Tribune',
                    'Syndication\\Curb.tv',
                    'Content\\News',
                ],
                'tags': ['CBS 2 News Evening'],
            },
        },
        {
            # SendtoNews embed
            'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
            'info_dict': {
                'id': 'GxfCe0Zo7D-175909-5588',
            },
            'playlist_count': 9,
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
    ]

    def _real_extract(self, url):
        """Extract the article's video: a SendtoNews embed if present, else Anvato."""
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)

        # A SendtoNews embed takes precedence over the page's Anvato player
        embed_url = SendtoNewsIE._extract_url(page)
        if embed_url:
            absolute_url = compat_urlparse.urljoin(url, embed_url)
            return self.url_result(absolute_url, ie=SendtoNewsIE.ie_key())

        result = self._extract_anvato_videos(page, slug)

        # Prefer the date printed in the article markup; only consult the
        # uploadDate meta tag when that yields nothing
        date_text = self._html_search_regex(
            r'class="(?:entry|post)-date"[^>]*>([^<]+)', page,
            'released date', default=None)
        released = unified_timestamp(date_text)
        if not released:
            released = parse_iso8601(self._html_search_meta('uploadDate', page))

        result.update({
            'display_id': slug,
            'timestamp': released,
        })
        return result

View File

@ -1,36 +1,153 @@
import base64
import re import re
import urllib.error
import urllib.parse
import zlib import zlib
from .anvato import AnvatoIE
from .common import InfoExtractor from .common import InfoExtractor
from .cbs import CBSIE from .paramountplus import ParamountPlusIE
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
ExtractorError,
HEADRequest,
UserNotLive,
determine_ext,
float_or_none,
format_field,
int_or_none,
make_archive_id,
mimetype2ext,
parse_duration, parse_duration,
smuggle_url,
traverse_obj,
url_or_none,
) )
class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE class CBSNewsBaseIE(InfoExtractor):
# Local-station URL slug -> suffix that CBSLocalLiveIE formats into 'CBSN-%s';
# 'atlanta' has no suffix defined (None).
_LOCALES = {
    'atlanta': None,
    'baltimore': 'BAL',
    'boston': 'BOS',
    'chicago': 'CHI',
    'colorado': 'DEN',
    'detroit': 'DET',
    'losangeles': 'LA',
    'miami': 'MIA',
    'minnesota': 'MIN',
    'newyork': 'NY',
    'philadelphia': 'PHI',
    'pittsburgh': 'PIT',
    'sacramento': 'SAC',
    'sanfrancisco': 'SF',
    'texas': 'DAL',
}
# Regex alternation of every known station slug, for use inside _VALID_URL patterns
_LOCALE_RE = '|'.join(re.escape(slug) for slug in _LOCALES)
# Access key placed in 'anvato:<key>:<id>' URLs built by the extractors below
_ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl'
def _get_item(self, webpage, display_id):
    """Return the first entry of the page's `CBSNEWS.defaultPayload` items.

    Falls back to an empty dict when nothing usable is found, so callers
    can use `.get()` on the result unconditionally.
    """
    payload = self._search_json(
        r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id, default={})
    first_item = traverse_obj(payload, ('items', 0, {dict}))
    return first_item or {}
def _get_video_url(self, item):
    """Look up the item's media URL under the `video` key, then `video2`."""
    media_url = traverse_obj(item, 'video', 'video2', expected_type=url_or_none)
    return media_url
def _extract_playlist(self, webpage, playlist_id):
    """Build a playlist from the cbsnews.com embed iframes found in `webpage`.

    Returns None when the page contains no such iframes.
    """
    embed_urls = re.findall(
        r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)
    if not embed_urls:
        return None

    entries = []
    for embed_url in embed_urls:
        entries.append(self.url_result(embed_url, CBSNewsEmbedIE))

    playlist_title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
    playlist_description = self._html_search_meta(
        ['og:description', 'twitter:description', 'description'], webpage)
    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
def _extract_video(self, item, video_url, video_id):
    """Build the info dict for `item`, whose media lives at `video_url`.

    MP4 media is used directly as a single format. Anything else is
    downloaded as an m3u8 manifest; if the manifest text contains an
    `anvato-<digits>` marker, extraction is delegated to AnvatoIE instead.
    """
    media_ext = mimetype2ext(item.get('format'), default=determine_ext(video_url))
    if media_ext != 'mp4':
        manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information')

        anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None)
        # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source
        if anvato_id:
            anvato_url = smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'})
            return self.url_result(
                anvato_url, AnvatoIE, url_transparent=True,
                _old_archive_ids=[make_archive_id(self, anvato_id)])

        # Subtitles found in the manifest are discarded; only `captions` from the item is used
        formats, _ = self._parse_m3u8_formats_and_subtitles(
            manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id)
    else:
        formats = [{'url': video_url, 'ext': 'mp4'}]

    def captions_to_subtitles(captions_url):
        # Wrap a valid captions URL as a single English track; anything else yields None
        if not url_or_none(captions_url):
            return None
        return {
            'en': [{
                'url': captions_url,
                'ext': 'dfxp',  # TTAF1
            }],
        }

    info = {
        'id': video_id,
        'formats': formats,
    }
    info.update(traverse_obj(item, {
        'title': (None, ('fulltitle', 'title')),
        'description': 'dek',
        'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}),
        'duration': ('duration', {float_or_none}),
        'subtitles': ('captions', {captions_to_subtitles}),
        'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),
        'is_live': ('type', {lambda x: x == 'live'}),
    }, get_all=False))

    # Season/episode numbers are only taken from items flagged as full episodes
    if item.get('isFullEpisode'):
        info.update(traverse_obj(item, {
            'season_number': ('season', {int_or_none}),
            'episode_number': ('episode', {int_or_none}),
        }))
    return info
class CBSNewsEmbedIE(CBSNewsBaseIE):
IE_NAME = 'cbsnews:embed' IE_NAME = 'cbsnews:embed'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4
xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPg
jzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A',
'only_matching': True, 'info_dict': {
'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA',
'ext': 'mp4',
'title': 'Cops investigate gorilla incident at Cincinnati Zoo',
'description': 'md5:fee7441ab8aaeb3c693482394738102b',
'duration': 350,
'timestamp': 1464719713,
'upload_date': '20160531',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
item = self._parse_json(zlib.decompress(compat_b64decode( item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode(
compat_urllib_parse_unquote(self._match_id(url))), urllib.parse.unquote(self._match_id(url))),
-zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {}
return self._extract_video_info(item['mpxRefId'], 'cbsnews')
video_id = item['mpxRefId']
video_url = self._get_video_url(item)
if not video_url:
# Old embeds redirect user to ParamountPlus but most links are 404
pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}'
try:
self._request_webpage(HEADRequest(pplus_url), video_id)
return self.url_result(pplus_url, ParamountPlusIE)
except ExtractorError:
self.raise_no_formats('This video is no longer available', True, video_id)
return self._extract_video(item, video_url, video_id)
class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE class CBSNewsIE(CBSNewsBaseIE):
IE_NAME = 'cbsnews' IE_NAME = 'cbsnews'
IE_DESC = 'CBS News' IE_DESC = 'CBS News'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\w-]+)'
_TESTS = [ _TESTS = [
{ {
@ -47,10 +164,7 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE
'timestamp': 1476046464, 'timestamp': 1476046464,
'upload_date': '20161009', 'upload_date': '20161009',
}, },
'params': { 'skip': 'This video is no longer available',
# rtmp download
'skip_download': True,
},
}, },
{ {
'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
'upload_date': '20140404', 'upload_date': '20140404',
'timestamp': 1396650660, 'timestamp': 1396650660,
'uploader': 'CBSI-NEW',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 205, 'duration': 205,
'subtitles': { 'subtitles': {
'en': [{ 'en': [{
'ext': 'ttml', 'ext': 'dfxp',
}], }],
}, },
}, },
'params': { 'params': {
# m3u8 download 'skip_download': 'm3u8',
'skip_download': True,
}, },
}, },
{ {
# 48 hours # 48 hours
'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
'info_dict': { 'info_dict': {
'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved',
'title': 'Cold as Ice', 'title': 'Cold as Ice',
'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?',
}, },
'playlist_mincount': 7, 'playlist_mincount': 7,
}, },
{
'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/',
'info_dict': {
'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE',
'ext': 'mp4',
'title': 'CBS Evening News, March 28, 2023',
'description': 'md5:db20615aae54adc1d55a1fd69dc75d13',
'duration': 1189,
'timestamp': 1680042600,
'upload_date': '20230328',
'season': 'Season 2023',
'season_number': 2023,
'episode': 'Episode 83',
'episode_number': 83,
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': 'm3u8',
},
},
] ]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
entries = [] playlist = self._extract_playlist(webpage, display_id)
for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): if playlist:
entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) return playlist
if entries:
return self.playlist_result(
entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
item = self._parse_json(self._html_search_regex( item = self._get_item(webpage, display_id)
r'CBSNEWS\.defaultPayload\s*=\s*({.+})', video_id = item.get('mpxRefId') or display_id
webpage, 'video JSON info'), display_id)['items'][0] video_url = self._get_video_url(item)
return self._extract_video_info(item['mpxRefId'], 'cbsnews') if not video_url:
self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
return self._extract_video(item, video_url, video_id)
class CBSLocalBaseIE(CBSNewsBaseIE):
    # Shared extraction logic for local-station pages hosted on cbsnews.com.

    def _real_extract(self, url):
        """Extract the video (or playlist of embeds) from a local-station page.

        Sources are tried in this order:
          1. a media URL in the page's defaultPayload item,
          2. an Anvato player iframe (its base64 `key` parameter carries the
             Anvato video ID and, optionally, access key and token),
          3. cbsnews.com embed iframes, returned as a playlist.
        If none is present, `raise_no_formats` is invoked.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        item = self._get_item(webpage, display_id)
        video_id = item.get('mpxRefId') or display_id
        video_url = self._get_video_url(item)
        if video_url:
            return self._extract_video(item, video_url, video_id)

        anv_params = self._search_regex(
            r'<iframe[^>]+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"',
            webpage, 'Anvato URL', default=None)

        if not anv_params:
            playlist = self._extract_playlist(webpage, display_id)
            if playlist:
                return playlist
            self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
            # raise_no_formats() can return instead of raising (it only warns when the
            # user chose to ignore missing formats). Without this early return the
            # code below would try to base64-decode the text 'None===' and crash.
            return {
                'id': video_id,
                'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
                'formats': [],
            }

        # The extra '===' makes sure the key is sufficiently padded for the decoder
        anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id)
        anvato_id = anv_data['v']
        return self.url_result(
            smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', {
                'token': anv_data.get('token') or 'default',
            }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)])
class CBSLocalIE(CBSLocalBaseIE):
    # Local-station video pages: cbsnews.com/<station>/video/<slug> and
    # cbsnews.com/<station>/live/video/<slug>
    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P<id>[\w-]+)'

    _TESTS = [
        {
            # Anvato video via defaultPayload JSON
            'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/',
            'info_dict': {
                'id': '6376747',
                'ext': 'mp4',
                'title': '1st cannabis dispensary opens in Queens',
                'description': 'The dispensary is women-owned and located in Jamaica.',
                'uploader': 'CBS',
                'duration': 20,
                'timestamp': 1680193657,
                'upload_date': '20230330',
                'categories': [
                    'Stations\\Spoken Word\\WCBSTV',
                    'Content\\Google',
                    'Content\\News',
                    'Content\\News\\Local News',
                ],
                'tags': 'count:11',
                'thumbnail': 're:^https?://.*',
                '_old_archive_ids': ['cbslocal 6376747'],
            },
            'params': {'skip_download': 'm3u8'},
        },
        {
            # cbsnews.com video via defaultPayload JSON
            'url': 'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/',
            'info_dict': {
                'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3',
                'ext': 'mp4',
                'title': 'the city is sounding the alarm on dangerous social media challenges',
                'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6',
                'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg',
                'duration': 41.0,
                'timestamp': 1680196615,
                'upload_date': '20230330',
            },
            'params': {'skip_download': 'm3u8'},
        },
    ]
class CBSLocalArticleIE(CBSLocalBaseIE):
    # Local-station article pages: cbsnews.com/<station>/news/<slug>
    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P<id>[\w-]+)'

    _TESTS = [
        {
            # Anvato video via iframe embed
            'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/',
            'playlist_count': 2,
            'info_dict': {
                'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service',
                'title': 'MTA station agents begin leaving their booths to provide more direct customer service',
                'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.',
            },
        },
        {
            'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/',
            'md5': 'f0ee3081e3843f575fccef901199b212',
            'info_dict': {
                'id': '3401037',
                'ext': 'mp4',
                'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
                'thumbnail': 're:^https?://.*',
                'timestamp': 1463440500,
                'upload_date': '20160516',
            },
            'skip': 'Video has been removed',
        },
    ]
class CBSNewsLiveBaseIE(CBSNewsBaseIE):
    # Shared livestream extraction; subclasses supply the edition ID via _get_id().

    def _get_id(self, url):
        """Return the edition ID to query for `url`; subclasses must override."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _real_extract(self, url):
        """Query the rundown feed for the edition and extract its HLS livestream.

        Raises ExtractorError when `_get_id` yields no edition ID, and
        UserNotLive when the feed entry carries no usable stream URL.
        """
        edition_id = self._get_id(url)
        if not edition_id:
            raise ExtractorError('Livestream is not available', expected=True)

        rundown = self._download_json(
            'https://feeds-cbsn.cbsnews.com/2.0/rundown/', edition_id, query={
                'partner': 'cbsnsite',
                'edition': edition_id,
                'type': 'live',
            })
        data = traverse_obj(rundown, ('navigation', 'data', 0, {dict}))

        # 'videoUrlDAI' is listed ahead of 'videoUrl' -> 'base'
        stream_url = traverse_obj(
            data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False)
        if not stream_url:
            raise UserNotLive(video_id=edition_id)

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            stream_url, edition_id, 'mp4', m3u8_id='hls')

        info = {
            'id': edition_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
        info.update(traverse_obj(data, {
            'title': 'headline',
            'description': 'rundown_slug',
            'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}),
        }))
        return info
class CBSLocalLiveIE(CBSNewsLiveBaseIE):
    # Local-station livestream landing pages: cbsnews.com/<station>/live/
    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P<id>{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)'

    _TESTS = [
        {
            'url': 'https://www.cbsnews.com/losangeles/live/',
            'info_dict': {
                'id': 'CBSN-LA',
                'ext': 'mp4',
                'title': str,
                'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+',
                'thumbnail': r're:^https?://.*\.jpg$',
                'live_status': 'is_live',
            },
            'params': {'skip_download': 'm3u8'},
        },
    ]

    def _get_id(self, url):
        """Format the station's `_LOCALES` entry for the URL slug into 'CBSN-%s'."""
        station_slug = self._match_id(url)
        return format_field(self._LOCALES, station_slug, 'CBSN-%s')
class CBSNewsLiveIE(CBSNewsLiveBaseIE):
    # The cbsnews.com/live/ landing page, served under the fixed 'CBSN-US' edition
    IE_NAME = 'cbsnews:live'
    IE_DESC = 'CBS News Livestream'
    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)'

    _TESTS = [
        {
            'url': 'https://www.cbsnews.com/live/',
            'info_dict': {
                'id': 'CBSN-US',
                'ext': 'mp4',
                'title': str,
                'description': r're:\w+ \w+ CRISPIN RUNDOWN',
                'thumbnail': r're:^https?://.*\.jpg$',
                'live_status': 'is_live',
            },
            'params': {'skip_download': 'm3u8'},
        },
    ]

    def _get_id(self, url):
        """Return the constant edition ID; the URL carries no edition information."""
        edition_id = 'CBSN-US'
        return edition_id
class CBSNewsLiveVideoIE(InfoExtractor): class CBSNewsLiveVideoIE(InfoExtractor):
@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)'
# Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
_TEST = { _TESTS = [{
'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
'info_dict': { 'info_dict': {
'id': 'clinton-sanders-prepare-to-face-off-in-nh', 'id': 'clinton-sanders-prepare-to-face-off-in-nh',
@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
'duration': 334, 'duration': 334,
}, },
'skip': 'Video gone', 'skip': 'Video gone',
} }]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
@ -131,13 +431,13 @@ def _real_extract(self, url):
'dvr_slug': display_id, 'dvr_slug': display_id,
}) })
formats = self._extract_akamai_formats(video_info['url'], display_id)
return { return {
'id': display_id, 'id': display_id,
'display_id': display_id, 'display_id': display_id,
'title': video_info['headline'], 'formats': self._extract_akamai_formats(video_info['url'], display_id),
'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), **traverse_obj(video_info, {
'duration': parse_duration(video_info.get('segmentDur')), 'title': 'headline',
'formats': formats, 'thumbnail': ('thumbnail_url_hd', {url_or_none}),
'duration': ('segmentDur', {parse_duration}),
}),
} }