From 9e3ac8951450b727d56ce5ed1eecef5d24414e75 Mon Sep 17 00:00:00 2001
From: lonm
Date: Tue, 15 Oct 2024 16:28:49 +0100
Subject: [PATCH] [RadioFrance] support pages with embedded playback info

---
 yt_dlp/extractor/radiofrance.py | 62 +++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 7d47ba686f..f7ee6ad6ad 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -1,4 +1,3 @@
-import itertools
 import re
 
 from .common import InfoExtractor
@@ -261,19 +260,41 @@ def _call_api(self, station, content_id, cursor):
     def _generate_playlist_entries(self, station, content_id, content_response):
         while True:
             for entry in content_response['items']:
-                yield self.url_result(
-                    f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
-                        'title': 'title',
-                        'description': 'standFirst',
-                        'timestamp': ('publishedDate', {int_or_none}),
-                        'thumbnail': ('visual', 'src'),
-                    }))
+                if entry['link'] == '':
+                    yield entry
+                else:
+                    yield self.url_result(
+                        f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
+                            'title': 'title',
+                            'description': 'standFirst',
+                            'timestamp': ('publishedDate', {int_or_none}),
+                            'thumbnail': ('visual', 'src'),
+                        }))
 
             if content_response['next']:
                 content_response = self._call_api(station, content_id, content_response['next'])
             else:
                 break
 
+    def _extract_embedded_episodes(self, item, webpage, content_id):
+        """Some episodes' data is embedded directly in the page; use it when the link is missing"""
+        links = item['playerInfo']['media']['sources']
+        item['formats'] = []
+        for linkkey in links:
+            url = self._search_regex(linkkey + r'\.url="([^"]+)";', webpage, content_id)
+            dur = int(self._search_regex(linkkey + r'\.duration=(\d+);', webpage, content_id))
+            preset = self._search_json(linkkey + r'\.preset=', webpage, content_id, content_id, contains_pattern=r'\{.+\}', transform_source=js_to_json)
+            item['formats'].append({
+                'format_id': preset['id'],
+                'url': url,
+                'vcodec': 'none',
+                'acodec': preset['encoding'],
+                'quality': preset['bitrate'],
+                'duration': dur,
+            })
+        item['duration'] = dur
+        return item
+
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
         # If it is a podcast playlist, get the name of the station it is on
@@ -343,6 +364,16 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
         'playlist_mincount': 321,
+    }, {
+        'url': 'http://www.radiofrance.fr/franceculture/podcasts/serie-les-aventures-de-tintin-les-cigares-du-pharaon',
+        'info_dict': {
+            'id': '01b096c6-e7f8-49c4-8319-dd399221885b',
+            'display_id': 'serie-les-aventures-de-tintin-les-cigares-du-pharaon',
+            'title': 'Les Cigares du Pharaon\xa0: les Aventures de Tintin',
+            'description': 'md5:1c5b6d010b2aaeb0d90b2c233b5f7b15',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_count': 5,
     }, {
         'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
         'only_matching': True,
@@ -359,12 +390,19 @@ def _call_api(self, station, podcast_id, cursor):
         webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
 
         resp = {}
+        resp['items'] = []
         # _search_json cannot parse the data as it contains javascript
         # Therefore, parse the episodes objects array separately
-        resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
+        itemlist = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
                                           contains_pattern=r'\[.+\]', transform_source=js_to_json)
+        for item in itemlist:
+            if item['model'] == 'Expression':
+                if item['link'] == '':
+                    item = self._extract_embedded_episodes(item, webpage, podcast_id)
+                resp['items'].append(item)
+
         # the pagination data is stored in a javascript object 'a'
         lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
         hasMorePages = cursor < lastPage
@@ -416,7 +454,7 @@ def _call_api(self, station, profile_id, cursor):
         pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
                                      transform_source=js_to_json)
 
-        # get thepage data
+        # get the page data
         pagekey = pagedata['pagination']
         hasMorePages = False
         lastPage = int(self._search_regex(pagekey+r'\.lastPage=(\d+);', webpage, profile_id, '0'))
@@ -425,7 +463,9 @@ def _call_api(self, station, profile_id, cursor):
 
         # get episode data, note, not all will be A/V, so filter for 'expression'
         for item in pagedata['items']:
-            if item['model']=='Expression':
+            if item['model'] == 'Expression':
+                if item['link'] == '':
+                    item = self._extract_embedded_episodes(item, webpage, profile_id)
                 resp['items'].append(item)
 
         resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
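
Note for reviewers: below is a minimal standalone sketch of the inline-JS parsing that _extract_embedded_episodes performs. Everything in the sample (the source0 key, URL, and preset values) is made up for illustration; the patch itself matches the real page with _search_regex/_search_json plus js_to_json, whereas this sketch uses plain re and json.loads on strict-JSON sample data.

import json
import re

# Made-up sample of the inline player JavaScript; real pages embed similar
# `<key>.url` / `<key>.duration` / `<key>.preset` assignments.
SAMPLE_JS = '''
source0.url="https://media.example.com/episode.mp3";
source0.duration=1337;
source0.preset={"id": "19", "encoding": "mp3", "bitrate": 128};
'''

def extract_embedded_formats(webpage, source_keys):
    """Collect one audio format per embedded source, mirroring the patch."""
    formats = []
    for key in source_keys:
        # Pull the direct media URL and the duration out of the inline JS
        url = re.search(re.escape(key) + r'\.url="([^"]+)";', webpage).group(1)
        duration = int(re.search(re.escape(key) + r'\.duration=(\d+);', webpage).group(1))
        # The extractor uses _search_json with js_to_json here; the sample is
        # strict JSON, so json.loads suffices for the sketch
        preset = json.loads(re.search(re.escape(key) + r'\.preset=(\{.+?\});', webpage).group(1))
        formats.append({
            'format_id': preset['id'],
            'url': url,
            'vcodec': 'none',
            'acodec': preset['encoding'],
            'quality': preset['bitrate'],
            'duration': duration,
        })
    return formats

print(extract_embedded_formats(SAMPLE_JS, ['source0']))

Running this prints a single mp3 format dict; the patch builds the same structure into item['formats'] so yt-dlp's normal format selection can consume it.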