[networking] Add support for zstandard content-encoding

Supported by urllib/requests/curl_cffi

Authored-by: coletdjnz
coletdjnz committed 2024-07-14 18:20:03 +12:00
parent 8531d2b03b
commit aec3cc3218
GPG Key ID: 91984263BB39894A
6 changed files with 97 additions and 14 deletions
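
The change itself is a plain compress/decompress round trip with the zstandard package on both ends: the test server compresses, the urllib handler decompresses, and requests/curl_cffi lean on urllib3 and curl respectively (per the version gates in the tests). A minimal sketch of that round trip, not part of the diff, assuming zstandard>=0.22.0 is installed:

import zstandard

payload = b'<html><video src="/vid.mp4" /></html>'

# what the test server below does for 'ytdl-encoding: zstd'
compressed = zstandard.compress(payload)

# what the urllib handler below does for 'Content-Encoding: zstd'
assert zstandard.ZstdDecompressor().decompress(compressed) == payload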

pyproject.toml

@@ -57,6 +57,9 @@ curl-cffi = [
     "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'",
     "curl-cffi>=0.5.10,!=0.6.*,<0.8; os_name!='nt' and implementation_name=='cpython'",
 ]
+zstd = [
+    "zstandard>=0.22.0",
+]
 secretstorage = [
     "cffi",
     "secretstorage",

test/test_networking.py

@@ -2,6 +2,7 @@
 # Allow direct execution
 import os
+import re
 import sys
 
 import pytest
@@ -36,7 +37,7 @@
     verify_address_availability,
 )
 from yt_dlp.cookies import YoutubeDLCookieJar
-from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3
+from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3, zstandard
 from yt_dlp.networking import (
     HEADRequest,
     PUTRequest,
@@ -62,7 +63,7 @@
     ImpersonateTarget,
 )
 from yt_dlp.utils import YoutubeDLError
-from yt_dlp.utils._utils import _YDLLogger as FakeLogger
+from yt_dlp.utils._utils import _YDLLogger as FakeLogger, int_or_none
 from yt_dlp.utils.networking import HTTPHeaderDict, std_headers
 
 TEST_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -217,6 +218,7 @@ def do_GET(self):
             self.end_headers()
         elif self.path == '/content-encoding':
             encodings = self.headers.get('ytdl-encoding', '')
+            content_encoding_header = self.headers.get('ytdl-encoding-header', encodings)
             payload = b'<html><video src="/vid.mp4" /></html>'
             for encoding in filter(None, (e.strip() for e in encodings.split(','))):
                 if encoding == 'br' and brotli:
@@ -228,6 +230,8 @@ def do_GET(self):
                     payload = buf.getvalue()
                 elif encoding == 'deflate':
                     payload = zlib.compress(payload)
+                elif encoding == 'zstd':
+                    payload = zstandard.compress(payload)
                 elif encoding == 'unsupported':
                     payload = b'raw'
                     break
@@ -235,7 +239,7 @@ def do_GET(self):
                     self._status(415)
                     return
             self.send_response(200)
-            self.send_header('Content-Encoding', encodings)
+            self.send_header('Content-Encoding', content_encoding_header)
             self.send_header('Content-Length', str(len(payload)))
             self.end_headers()
             self.wfile.write(payload)
@@ -622,7 +626,7 @@ def test_gzip_trailing_garbage(self, handler):
         assert data == '<html><video src="/vid.mp4" /></html>'
 
     @pytest.mark.skip_handler('CurlCFFI', 'not applicable to curl-cffi')
-    @pytest.mark.skipif(not brotli, reason='brotli support is not installed')
+    @pytest.mark.skipif(not brotli, reason='brotli not available')
     def test_brotli(self, handler):
         with handler() as rh:
             res = validate_and_send(
@@ -632,6 +636,52 @@ def test_brotli(self, handler):
         assert res.headers.get('Content-Encoding') == 'br'
         assert res.read() == b'<html><video src="/vid.mp4" /></html>'
 
+    @pytest.mark.skipif(not brotli, reason='brotli not available')
+    def test_brotli_error(self, handler):
+        with handler() as rh:
+            with pytest.raises(TransportError):
+                # depending on implementation, error may be raised at request time or read time
+                res = validate_and_send(
+                    rh, Request(
+                        f'http://127.0.0.1:{self.http_port}/content-encoding',
+                        headers={'ytdl-encoding': 'deflate', 'ytdl-encoding-header': 'br'}))
+                res.read()
+
+    # TODO: implement centralised version parser
+    @pytest.mark.skip_handler_if(
+        'CurlCFFI',
+        lambda _: tuple(map(int, re.split(r'\D+', curl_cffi.__version__)[:3])) < (0, 7, 0),
+        'zstd not supported by curl_cffi < 0.7.0')
+    @pytest.mark.skip_handler_if(
+        'Requests',
+        lambda _: tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.')) < (2, 0, 0),
+        'zstd not supported by urllib3 < 2.0.0')
+    @pytest.mark.skipif(not zstandard, reason='zstandard not available')
+    def test_zstd(self, handler):
+        with handler() as rh:
+            res = validate_and_send(
+                rh, Request(
+                    f'http://127.0.0.1:{self.http_port}/content-encoding',
+                    headers={'ytdl-encoding': 'zstd'}))
+            assert res.headers.get('Content-Encoding') == 'zstd'
+            assert res.read() == b'<html><video src="/vid.mp4" /></html>'
+
+    # TODO: implement centralised version parser
+    @pytest.mark.skip_handler_if(
+        'Requests',
+        lambda _: tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.')) < (2, 0, 0),
+        'zstd not supported by urllib3 < 2.0.0')
+    @pytest.mark.skipif(not zstandard, reason='zstandard not available')
+    def test_zstd_error(self, handler):
+        with handler() as rh:
+            with pytest.raises(TransportError):
+                # depending on implementation, error may be raised at request time or read time
+                res = validate_and_send(
+                    rh, Request(
+                        f'http://127.0.0.1:{self.http_port}/content-encoding',
+                        headers={'ytdl-encoding': 'unsupported', 'ytdl-encoding-header': 'zstd'}))
+                res.read()
+
     def test_deflate(self, handler):
         with handler() as rh:
             res = validate_and_send(
@@ -641,6 +691,16 @@ def test_deflate(self, handler):
         assert res.headers.get('Content-Encoding') == 'deflate'
         assert res.read() == b'<html><video src="/vid.mp4" /></html>'
 
+    def test_deflate_error(self, handler):
+        with handler() as rh:
+            with pytest.raises(TransportError):
+                # depending on implementation, error may be raised at request time or read time
+                res = validate_and_send(
+                    rh, Request(
+                        f'http://127.0.0.1:{self.http_port}/content-encoding',
+                        headers={'ytdl-encoding': 'gzip', 'ytdl-encoding-header': 'deflate'}))
+                res.read()
+
     def test_gzip(self, handler):
         with handler() as rh:
             res = validate_and_send(
@@ -650,6 +710,16 @@ def test_gzip(self, handler):
         assert res.headers.get('Content-Encoding') == 'gzip'
         assert res.read() == b'<html><video src="/vid.mp4" /></html>'
 
+    def test_gzip_error(self, handler):
+        with handler() as rh:
+            with pytest.raises(TransportError):
+                # depending on implementation, error may be raised at request time or read time
+                res = validate_and_send(
+                    rh, Request(
+                        f'http://127.0.0.1:{self.http_port}/content-encoding',
+                        headers={'ytdl-encoding': 'unsupported', 'ytdl-encoding-header': 'gzip'}))
+                res.read()
+
     def test_multiple_encodings(self, handler):
         with handler() as rh:
             for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
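
The skip_handler_if gates above hinge on tolerant version parsing (hence the TODO about a centralised version parser). A standalone sketch of the same idea, using a plain-Python stand-in for yt-dlp's int_or_none (names hypothetical, not part of the diff):

import re

def parse_version(version_str):
    # keep the first three numeric components; anything non-numeric counts as 0
    parts = re.split(r'\D+', version_str)[:3]
    return tuple(int(p) if p.isdigit() else 0 for p in parts)

# mirrors the curl_cffi gate: zstd needs curl_cffi >= 0.7.0
assert parse_version('0.7.0b4') >= (0, 7, 0)
# mirrors the urllib3 gate: zstd needs urllib3 >= 2.0.0
assert parse_version('1.26.18') < (2, 0, 0)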

yt_dlp/dependencies/__init__.py

@@ -22,6 +22,10 @@
     if not _path_exists(certifi.where()):
         certifi = None
 
+try:
+    import zstandard
+except ImportError:
+    zstandard = None
 
 try:
     import mutagen

yt_dlp/networking/_curlcffi.py

@@ -29,7 +29,7 @@
     raise ImportError('curl_cffi is not installed')
 
-curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3]))
+curl_cffi_version = tuple(map(int, re.split(r'\D+', curl_cffi.__version__)[:3]))
 
 if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)):
     curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
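
The regex tweak is cosmetic: \D is the built-in shorthand for [^\d], so both patterns split a version string on the same runs of non-digits. A quick sanity check (sketch, not part of the diff):

import re

for version in ('0.5.10', '0.7.0b4', '0.7.0'):
    assert re.split(r'[^\d]+', version)[:3] == re.split(r'\D+', version)[:3]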

yt_dlp/networking/_requests.py

@@ -8,7 +8,7 @@
 import socket
 import warnings
 
-from ..dependencies import brotli, requests, urllib3
+from ..dependencies import requests, urllib3
 from ..utils import bug_reports_message, int_or_none, variadic
 from ..utils.networking import normalize_url
@@ -59,12 +59,7 @@
 )
 from ..socks import ProxyError as SocksProxyError
 
-SUPPORTED_ENCODINGS = [
-    'gzip', 'deflate',
-]
-
-if brotli is not None:
-    SUPPORTED_ENCODINGS.append('br')
+SUPPORTED_ENCODINGS = urllib3.util.request.ACCEPT_ENCODING.split(',')
 
 '''
 Override urllib3's behavior to not convert lower-case percent-encoded characters
@@ -259,7 +254,6 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
     https://github.com/psf/requests
     """
     _SUPPORTED_URL_SCHEMES = ('http', 'https')
-    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
     RH_NAME = 'requests'
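
Deriving SUPPORTED_ENCODINGS from urllib3's own ACCEPT_ENCODING constant means the requests handler advertises exactly what the installed urllib3 can decode: brotli when a brotli backend is importable, and zstd on urllib3 >= 2.0 when zstandard is importable. A quick way to inspect that locally (sketch, assuming urllib3 is installed):

import urllib3
import urllib3.util.request

# e.g. 'gzip,deflate', optionally extended with ',br' and ',zstd'
print(urllib3.__version__, urllib3.util.request.ACCEPT_ENCODING)
print(urllib3.util.request.ACCEPT_ENCODING.split(','))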

yt_dlp/networking/_urllib.py

@@ -38,7 +38,7 @@
     SSLError,
     TransportError,
 )
-from ..dependencies import brotli
+from ..dependencies import brotli, zstandard
 from ..socks import ProxyError as SocksProxyError
 from ..utils import update_url_query
 from ..utils.networking import normalize_url
@@ -50,6 +50,10 @@
     SUPPORTED_ENCODINGS.append('br')
     CONTENT_DECODE_ERRORS.append(brotli.error)
 
+if zstandard:
+    SUPPORTED_ENCODINGS.append('zstd')
+    CONTENT_DECODE_ERRORS.append(zstandard.ZstdError)
+
 
 def _create_http_connection(http_class, source_address, *args, **kwargs):
     hc = http_class(*args, **kwargs)
@@ -118,6 +122,12 @@ def brotli(data):
             return data
         return brotli.decompress(data)
 
+    @staticmethod
+    def zstd(data):
+        if not data:
+            return data
+        return zstandard.ZstdDecompressor().decompress(data)
+
     @staticmethod
     def gz(data):
         # There may be junk added the end of the file
@@ -158,6 +168,8 @@ def http_response(self, req, resp):
                 decoded_response = self.deflate(decoded_response or resp.read())
             elif encoding == 'br' and brotli:
                 decoded_response = self.brotli(decoded_response or resp.read())
+            elif encoding == 'zstd' and zstandard:
+                decoded_response = self.zstd(decoded_response or resp.read())
 
         if decoded_response is not None:
             resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
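
For the urllib path, the mislabelled payloads in the new error tests ultimately surface as zstandard.ZstdError, which is why it is appended to CONTENT_DECODE_ERRORS above. A minimal sketch of that failure mode outside yt-dlp:

import zlib
import zstandard

# deflate-compressed data mislabelled as zstd, in the spirit of the new error tests
mislabelled = zlib.compress(b'<html><video src="/vid.mp4" /></html>')
try:
    zstandard.ZstdDecompressor().decompress(mislabelled)
except zstandard.ZstdError as exc:
    print('decode failed as expected:', exc)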