[youtube] Correct invalid JSON (Fixes #2353)

2024-11-25 00:00:48 +01:00 · 2014-02-09 17:56:10 +01:00 · 2014-02-09 17:56:10 +01:00 · 81c2f20b53
commit 81c2f20b53
parent 1afe753462
3 changed files with 15 additions and 6 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -271,8 +271,11 @@ def _download_xml(self, url_or_request, video_id,

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -34,6 +34,7 @@
    unified_strdate,
    orderedSet,
    write_json_file,
+    uppercase_escape,
 )

 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -1590,10 +1591,9 @@ def _real_extract(self, url):
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
-                page = self._download_webpage(url, channel_id,
-                                              u'Downloading page #%s' % pagenum)
-    
-                page = json.loads(page)
+                page = self._download_json(
+                    url, channel_id, note=u'Downloading page #%s' % pagenum,
+                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1214,3 +1214,9 @@ def getslice(self, start=0, end=None):
            if end == nextfirstid:
                break
        return res
+
+
+def uppercase_escape(s):
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: compat_chr(int(m.group(1), base=16)), s)