[RadioFrance] support pages with embedded playback info

This commit is contained in:
lonm 2024-10-15 16:28:49 +01:00
parent 0fb8bc11ed
commit 9e3ac89514

View File

@ -1,4 +1,3 @@
import itertools
import re
from .common import InfoExtractor
@ -261,6 +260,9 @@ def _call_api(self, station, content_id, cursor):
def _generate_playlist_entries(self, station, content_id, content_response):
while True:
for entry in content_response['items']:
if entry['link'] == '':
yield entry
else:
yield self.url_result(
f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
'title': 'title',
@ -274,6 +276,25 @@ def _generate_playlist_entries(self, station, content_id, content_response):
else:
break
def _extract_embedded_episodes(self, item, webpage, content_id):
"""Certain episdoes data are embedded directly in the page, use these if the link is missing"""
links = item['playerInfo']['media']['sources']
item['formats'] = []
for linkkey in links:
url = self._search_regex(linkkey+r'\.url="([^"]+)";', webpage, content_id)
dur = int(self._search_regex(linkkey+r'\.duration=(\d+);', webpage, content_id))
preset = self._search_json(linkkey+r'\.preset=', webpage, content_id, content_id, contains_pattern=r'\{.+\}', transform_source=js_to_json)
item['formats'].append({
'format_id': preset['id'],
'url': url,
'vcodec': 'none',
'acodec': preset['encoding'],
'quality': preset['bitrate'],
'duration': dur
})
item['duration'] = dur
return item
def _real_extract(self, url):
playlist_id = self._match_id(url)
# If it is a podcast playlist, get the name of the station it is on
@ -343,6 +364,16 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 321,
}, {
'url': 'http://www.radiofrance.fr/franceculture/podcasts/serie-les-aventures-de-tintin-les-cigares-du-pharaon',
'info_dict': {
'id': '01b096c6-e7f8-49c4-8319-dd399221885b',
'display_id': 'serie-les-aventures-de-tintin-les-cigares-du-pharaon',
'title': 'Les Cigares du Pharaon\xa0: les Aventures de Tintin',
'description': 'md5:1c5b6d010b2aaeb0d90b2c233b5f7b15',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_count': 5
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
'only_matching': True,
@ -359,12 +390,19 @@ def _call_api(self, station, podcast_id, cursor):
webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
resp = {}
resp['items'] = []
# _search_json cannot parse the data as it contains javascript
# Therefore, parse the episodes objects array separately
resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
itemlist = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
contains_pattern=r'\[.+\]', transform_source=js_to_json)
for item in itemlist:
if item['model'] == 'Expression':
if item['link'] == '':
item = self._extract_embedded_episodes(item, webpage, podcast_id)
resp['items'].append(item)
# the pagination data is stored in a javascript object 'a'
lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
hasMorePages = cursor < lastPage
@ -426,6 +464,8 @@ def _call_api(self, station, profile_id, cursor):
# Not every item on the page is audio/video, so keep only 'Expression' items.
for item in pagedata['items']:
    if item['model'] == 'Expression':
        # Items are subscripted as mappings in this loop, so the link must
        # be read by key; `item.link` would raise AttributeError on a dict.
        # An empty link means the playback data is embedded in the page.
        if item['link'] == '':
            item = self._extract_embedded_episodes(item, webpage, profile_id)
        resp['items'].append(item)
resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,