From 36e6f62cd0883f0f486d1666d010e5d9e6d515bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 25 Oct 2015 20:04:55 +0100 Subject: [PATCH 1/4] Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178) Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes). --- test/test_compat.py | 7 +++++++ test/test_utils.py | 11 +++++++---- youtube_dl/compat.py | 25 +++++++++++++++++++++++++ youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/bbc.py | 8 +++++--- youtube_dl/extractor/bilibili.py | 6 ++++-- youtube_dl/extractor/brightcove.py | 4 ++-- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/crunchyroll.py | 4 ++-- youtube_dl/extractor/vevo.py | 6 +++--- youtube_dl/utils.py | 3 ++- 11 files changed, 61 insertions(+), 21 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 4ee0dc99d0..2b0860479e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,8 +13,10 @@ from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_etree_fromstring, compat_expanduser, compat_shlex_split, + compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, ) @@ -71,5 +73,10 @@ def test_compat_urllib_parse_unquote_plus(self): def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + def test_compat_etree_fromstring(self): + xml = '' + doc = compat_etree_fromstring(xml.encode('utf-8')) + self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 918a7a9ef8..a9e0fed7e9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -68,6 +68,9 @@ cli_valueless_option, cli_bool_option, ) +from youtube_dl.compat import ( + compat_etree_fromstring, +) class TestUtil(unittest.TestCase): @@ -242,7 +245,7 @@ def test_find_xpath_attr(self): ''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) @@ -263,7 +266,7 @@ def test_xpath_with_ns(self): http://server.com/download.mp3 ''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) self.assertTrue(find('media:song') is not None) self.assertEqual(find('media:song/media:author').text, 'The Author') @@ -285,7 +288,7 @@ def test_xpath_text(self):

Foo

''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertTrue(xpath_text(doc, 'div/bar') is None) @@ -297,7 +300,7 @@ def test_xpath_attr(self):

Foo

''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d103ab9adf..cf10835ca5 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -14,6 +14,7 @@ import subprocess import sys import itertools +import xml.etree.ElementTree try: @@ -212,6 +213,29 @@ def data_open(self, req): except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: + compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: + # on python 2.x the the attributes of a node are str objects instead of + # unicode + etree = xml.etree.ElementTree + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=etree.TreeBuilder()) + parser.feed(text) + return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + el.set(k, v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) try: from urllib.parse import parse_qs as compat_parse_qs @@ -507,6 +531,7 @@ def compat_itertools_count(start=0, step=1): 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_etree_fromstring', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 7f6143954d..6170cc1552 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,10 +5,10 @@ import itertools import os import time -import xml.etree.ElementTree as etree from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, @@ -290,7 +290,7 @@ def real_download(self, filename, info_dict): man_url = urlh.geturl() manifest = urlh.read() - doc = etree.fromstring(manifest) + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2cdce1eb95..a55a6dbc9d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -14,7 +13,10 @@ remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): @@ -344,7 +346,7 @@ def _download_media_selector_url(self, url, programme_id=None): url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebebc..6c66a12368 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,11 @@ import re import itertools import json -import xml.etree.ElementTree as ET from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, +) from ..utils import ( int_or_none, unified_strdate, @@ -88,7 +90,7 @@ def _real_extract(self, url): except ValueError: pass - lq_doc = ET.fromstring(lq_page) + lq_doc = compat_etree_fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c22930..1686cdde14 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,10 +3,10 @@ import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, @@ -119,7 +119,7 @@ def _build_brighcove_url(cls, object_str): object_str = fix_xml_ampersands(object_str) try: - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f9..52523d7b24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,7 +10,6 @@ import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, @@ -23,6 +22,7 @@ compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -461,7 +461,7 @@ def _download_xml(self, url_or_request, video_id, return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111d..0c9b8ca024 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import json import base64 import zlib -import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, @@ -234,7 +234,7 @@ def ass_bool(strvalue): return output def _extract_subtitles(self, subtitle): - sub_root = xml.etree.ElementTree.fromstring(subtitle) + sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c17094f819..4c0de354f7 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_request, ) from ..utils import ( @@ -97,7 +97,7 @@ def _formats_from_json(self, video_info): if last_version['version'] == -1: raise ExtractorError('Unable to extract last version of the video') - renditions = xml.etree.ElementTree.fromstring(last_version['data']) + renditions = compat_etree_fromstring(last_version['data']) formats = [] # Already sorted from worst to best quality for rend in renditions.findall('rendition'): @@ -114,7 +114,7 @@ def _formats_from_json(self, video_info): def _formats_from_smil(self, smil_xml): formats = [] - smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8')) els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e476467..7d846d6808 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -36,6 +36,7 @@ from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -1974,7 +1975,7 @@ def parse_node(node): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') From 387db16a789fea25795433538d80513c18d0f699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 25 Oct 2015 20:30:54 +0100 Subject: [PATCH 2/4] [compat] compat_etree_fromstring: only decode bytes objects --- test/test_compat.py | 3 ++- youtube_dl/compat.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 2b0860479e..834f4bc552 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,9 +74,10 @@ def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '' + xml = '' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index cf10835ca5..f39d4e9a9e 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,7 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node are str objects instead of - # unicode + # on python 2.x the the attributes of a node aren't always unicode objects etree = xml.etree.ElementTree # on 2.6 XML doesn't have a parser argument, function copied from CPython @@ -231,7 +230,8 @@ def _XML(text, parser=None): def _element_factory(*args, **kwargs): el = etree.Element(*args, **kwargs) for k, v in el.items(): - el.set(k, v.decode('utf-8')) + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) return el def compat_etree_fromstring(text): From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [PATCH 3/4] [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because it also does it. --- test/test_compat.py | 11 ++++++++++- youtube_dl/compat.py | 18 ++++++++++++++++-- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/utils.py | 23 ----------------------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 834f4bc552..b6bfad05e3 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,10 +74,19 @@ def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '' + xml = ''' + + foo + 中文 + spam + + ''' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) + self.assertTrue(isinstance(doc.find('normal').text, compat_str)) + self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f39d4e9a9e..2d43ec852c 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,9 +216,19 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node aren't always unicode objects + # on python 2.x the attributes and text of a node aren't always unicode + # objects etree = xml.etree.ElementTree + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + # on 2.6 XML doesn't have a parser argument, function copied from CPython # 2.7 source def _XML(text, parser=None): @@ -235,7 +245,11 @@ def _element_factory(*args, **kwargs): return el def compat_etree_fromstring(text): - return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b4..73be6d2040 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ def _real_extract(self, url): raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb26..1de96b268c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -21,7 +22,6 @@ HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@ -1237,7 +1237,7 @@ def _real_extract(self, url): # Is it an RSS feed, a SMIL file or a XSPF playlist? try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d6808..c761ea22a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'): return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree - - US_RATINGS = { 'G': 0, 'PG': 10, From ae37338e681319a28d98dc551253d9fa1830969a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Oct 2015 13:58:40 +0100 Subject: [PATCH 4/4] [compat] compat_etree_fromstring: clarify comment --- youtube_dl/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2d43ec852c..a3e85264ac 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,8 @@ def data_open(self, req): if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the attributes and text of a node aren't always unicode - # objects + # python 2.x tries to encode unicode strings with ascii (see the + # XMLParser._fixtext method) etree = xml.etree.ElementTree try: