[fd/hls] Support --write-pages for m3u8 media playlists (#12333)

Authored by: bashonly
2025-02-22 03:09:30 +01:00 · 2025-02-18 20:23:42 -06:00 · 2025-02-18 20:23:42 -06:00 · be69468752
commit be69468752
parent 5271ef48c6
3 changed files with 35 additions and 22 deletions
--- a/yt_dlp/downloader/hls.py
+++ b/yt_dlp/downloader/hls.py
@ -16,6 +16,7 @@ from ..utils import (
    update_url_query,
    urljoin,
 )
 from ..utils._utils import _request_dump_filename
 class HlsFD(FragmentFD):
@ -80,7 +81,15 @@ class HlsFD(FragmentFD):
            self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest')
            urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
            man_url = urlh.url
-            s = urlh.read().decode('utf-8', 'ignore')
+            s_bytes = urlh.read()
            if self.params.get('write_pages'):
                dump_filename = _request_dump_filename(
                    man_url, info_dict['id'], None,
                    trim_length=self.params.get('trim_file_name'))
                self.to_screen(f'[{self.FD_NAME}] Saving request to {dump_filename}')
                with open(dump_filename, 'wb') as outf:
                    outf.write(s_bytes)
            s = s_bytes.decode('utf-8', 'ignore')
        can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
        if can_download:
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -2,7 +2,6 @@ import base64
 import collections
 import functools
 import getpass
 import hashlib
 import http.client
 import http.cookiejar
 import http.cookies
@ -78,7 +77,6 @@ from ..utils import (
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
@ -100,6 +98,7 @@ from ..utils import (
    xpath_text,
    xpath_with_ns,
 )
 from ..utils._utils import _request_dump_filename
 class InfoExtractor:
@ -1022,23 +1021,6 @@ class InfoExtractor:
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
    def _request_dump_filename(self, url, video_id, data=None):
        """Build the ``.dump`` filename for a request's saved response.

        The base name joins ``video_id``, the MD5 hex digest of ``data`` (only
        when ``data`` is not None) and ``url`` with ``_``. If it is longer than
        the ``trim_file_name`` param (240 when that is unset or 0), it is cut
        and an MD5-based suffix of the full base name is appended.

        Returns the sanitized filename. On Windows, if its absolute path is
        longer than 259 characters, the extended-length form of that absolute
        path is returned instead.
        """
        if data is not None:
            # Only a digest of the request body goes into the name
            data = hashlib.md5(data).hexdigest()
        basen = join_nonempty(video_id, data, url, delim='_')
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            # '___' plus 32 hex digits: the suffix is always 35 characters
            h = '___' + hashlib.md5(basen.encode()).hexdigest()
            # NOTE(review): for trim_length < 35 the slice end is negative, so
            # most of basen is kept rather than cut - confirm this is intended
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if os.name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename
    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
@ -1067,7 +1049,9 @@ class InfoExtractor:
        if self.get_param('write_pages'):
            if isinstance(url_or_request, Request):
                data = self._create_request(url_or_request, data).data
-            filename = self._request_dump_filename(urlh.url, video_id, data)
+            filename = _request_dump_filename(
                urlh.url, video_id, data,
                trim_length=self.get_param('trim_file_name'))
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
@ -1128,7 +1112,9 @@ class InfoExtractor:
                             impersonate=None, require_impersonation=False):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
+                filename = _request_dump_filename(
                    url_or_request.url, video_id, url_or_request.data,
                    trim_length=self.get_param('trim_file_name'))
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -5631,6 +5631,24 @@ def filesize_from_tbr(tbr, duration):
    return int(duration * tbr * (1000 / 8))
 def _request_dump_filename(url, video_id, data=None, trim_length=None):
    """Build the ``.dump`` filename used to save or reload a request's response.

    The base name joins ``video_id``, the MD5 hex digest of ``data`` (only when
    ``data`` is not None) and ``url`` with ``_``. A base name longer than
    ``trim_length`` is cut and given an MD5-based suffix of the full base name,
    so different long names do not collapse into the same file.

    ``url`` is the request URL, ``video_id`` the ID the request belongs to and
    ``data`` the request body as bytes (or None). ``trim_length`` is the
    maximum length of the base name before ``.dump`` is appended; None or 0
    means 240.

    Returns the sanitized filename. On Windows, if its absolute path is longer
    than 259 characters, the extended-length form of that absolute path is
    returned instead.
    """
    if data is not None:
        # Only a digest of the request body goes into the name
        data = hashlib.md5(data).hexdigest()
    basen = join_nonempty(video_id, data, url, delim='_')
    trim_length = trim_length or 240
    if len(basen) > trim_length:
        # '___' plus 32 hex digits: the suffix is always 35 characters
        h = '___' + hashlib.md5(basen.encode()).hexdigest()
        # Clamp at 0: a trim_length shorter than the suffix would otherwise
        # give a negative slice end, which keeps most of basen and makes the
        # "trimmed" name longer than the untrimmed one
        basen = basen[:max(trim_length - len(h), 0)] + h
    filename = sanitize_filename(f'{basen}.dump', restricted=True)
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    if os.name == 'nt':
        absfilepath = os.path.abspath(filename)
        if len(absfilepath) > 259:
            filename = fR'\\?\{absfilepath}'
    return filename
 # XXX: Temporary
 class _YDLLogger:
    def __init__(self, ydl=None):