import textwrap
import unittest
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
HTMLIgnoreRanges,
HTMLTagParser,
)
# Module-level shortcuts for the MatchingElementParser helpers exercised by the
# tests below, so they can be called as plain functions.
(
    extract_attributes,
    get_element_by_attribute,
    get_element_by_class,
    get_element_html_by_attribute,
    get_element_html_by_class,
    get_element_text_and_html_by_tag,
    get_elements_by_attribute,
    get_elements_by_class,
    get_elements_html_by_attribute,
    get_elements_html_by_class,
    get_elements_text_and_html_by_attribute,
    get_elements_text_and_html_by_tag,
) = (
    MatchingElementParser.extract_attributes,
    MatchingElementParser.get_element_by_attribute,
    MatchingElementParser.get_element_by_class,
    MatchingElementParser.get_element_html_by_attribute,
    MatchingElementParser.get_element_html_by_class,
    MatchingElementParser.get_element_text_and_html_by_tag,
    MatchingElementParser.get_elements_by_attribute,
    MatchingElementParser.get_elements_by_class,
    MatchingElementParser.get_elements_html_by_attribute,
    MatchingElementParser.get_elements_html_by_class,
    MatchingElementParser.get_elements_text_and_html_by_attribute,
    MatchingElementParser.get_elements_text_and_html_by_tag,
)
class TestParsing(unittest.TestCase):
def test_extract_attributes(self):
    """Check extract_attributes against a single start tag per assertion.

    The inputs cover quoting styles, numeric and named character references,
    valueless attributes, duplicate and mixed-case names, embedded newlines,
    non-ASCII values and one malformed tag.
    """
    # Quoting styles
    self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
    self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
    self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
    # Character references
    self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
    self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
    self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
    self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
    self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
    self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
    self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
    # Valueless attributes, spacing and duplicates (the last occurrence wins)
    self.assertEqual(extract_attributes('<e x >'), {'x': None})
    self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
    self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
    # Newlines around and inside values
    self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
    self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
    self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
    self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
    self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
    self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
    self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
    # Non-ASCII values
    self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
    self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
    # Python 3's chr() accepts every code point up to 0x10FFFF, so the
    # astral-plane case needs no "narrow build" capability probe.
    self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
    # Malformed HTML should not break attributes extraction on older Python
    self.assertEqual(extract_attributes('<mal"formed/>'), {})
# Shared fixture for the single-element class/attribute lookup tests.
# NOTE(review): those tests look up class values 'foo' / 'foo bar' (and a
# tag='div' variant), yet this literal holds only bare text - the element
# markup appears to be missing; confirm against the canonical fixture.
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
nice
also nice
'''
def test_get_element_by_class(self):
    """Expect 'nice' for class 'foo' and None for a class that is absent."""
    fixture = self.GET_ELEMENT_BY_CLASS_TEST_STRING
    found = get_element_by_class('foo', fixture)
    self.assertEqual(found, 'nice')
    missing = get_element_by_class('no-such-class', fixture)
    self.assertEqual(missing, None)
def test_get_element_html_by_class(self):
    """Check get_element_html_by_class for a present and an absent class.

    Both assertions target get_element_html_by_class, so the no-match path
    of the HTML-returning helper is exercised here rather than that of the
    text-returning get_element_by_class.
    """
    html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
    self.assertEqual(get_element_html_by_class('foo', html),
                     'nice')
    self.assertIsNone(get_element_html_by_class('no-such-class', html))
# Fixture for the itemprop='author' lookups in the *_by_attribute tests.
# NOTE(review): the literal contains no element carrying an itemprop
# attribute - the markup appears to be missing; confirm against the
# canonical fixture.
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
foo
'''
def test_get_element_by_attribute(self):
    """Look elements up by attribute value, optionally restricted by tag.

    Expects 'foo bar' to match while 'foo' alone and an unknown value
    yield None.
    """
    class_fixture = self.GET_ELEMENT_BY_CLASS_TEST_STRING
    for value, expected in (('foo bar', 'nice'), ('foo', None), ('no-such-foo', None)):
        self.assertEqual(get_element_by_attribute('class', value, class_fixture), expected)
    # Restricting the search to <div> is expected to pick the second element.
    self.assertEqual(
        get_element_by_attribute('class', 'foo bar', class_fixture, tag='div'), 'also nice')

    attr_fixture = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
    self.assertEqual(get_element_by_attribute('itemprop', 'author', attr_fixture), 'foo')
def test_get_element_html_by_attribute(self):
    """Same lookups as the text variant, via get_element_html_by_attribute."""
    class_fixture = self.GET_ELEMENT_BY_CLASS_TEST_STRING
    for value, expected in (('foo bar', 'nice'), ('foo', None), ('no-such-foo', None)):
        self.assertEqual(get_element_html_by_attribute('class', value, class_fixture), expected)

    # For the itemprop fixture the whole (stripped) fixture is the expected result.
    attr_fixture = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
    self.assertEqual(
        get_element_html_by_attribute('itemprop', 'author', attr_fixture), attr_fixture.strip())
# Fixture for the multi-element (get_elements_*) tests, plus the list those
# tests expect from the HTML-returning helpers.
# NOTE(review): the tests look up class 'foo' / 'foo bar' here, and
# GET_ELEMENTS_BY_CLASS_RES is identical to the expected *text* results -
# the element markup appears to be missing from both; confirm against the
# canonical fixture.
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
nice
also nice
'''
GET_ELEMENTS_BY_CLASS_RES = [
'nice',
'also nice'
]
def test_get_elements_by_class(self):
    """Expect every 'foo' element's text, and an empty list when nothing matches."""
    fixture = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
    for class_name, expected in (('foo', ['nice', 'also nice']), ('no-such-class', [])):
        self.assertEqual(get_elements_by_class(class_name, fixture), expected)
def test_get_elements_html_by_class(self):
    """Expect GET_ELEMENTS_BY_CLASS_RES for 'foo' and [] for an unknown class."""
    fixture = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
    matches = get_elements_html_by_class('foo', fixture)
    self.assertEqual(matches, self.GET_ELEMENTS_BY_CLASS_RES)
    no_matches = get_elements_html_by_class('no-such-class', fixture)
    self.assertEqual(no_matches, [])
def test_get_elements_by_attribute(self):
    """Expect both texts for class 'foo bar'; 'foo' alone and unknown values give []."""
    fixture = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
    cases = (
        ('foo bar', ['nice', 'also nice']),
        ('foo', []),
        ('no-such-foo', []),
    )
    for value, expected in cases:
        self.assertEqual(get_elements_by_attribute('class', value, fixture), expected)
def test_get_elements_html_by_attribute(self):
    """HTML-returning counterpart of test_get_elements_by_attribute."""
    fixture = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
    full_match = get_elements_html_by_attribute('class', 'foo bar', fixture)
    self.assertEqual(full_match, self.GET_ELEMENTS_BY_CLASS_RES)
    for unmatched_value in ('foo', 'no-such-foo'):
        self.assertEqual(get_elements_html_by_attribute('class', unmatched_value, fixture), [])
def test_get_elements_text_and_html_by_attribute(self):
    """Expect (text, html) pairs per match, [] without a match, and tag filtering."""
    fixture = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
    pairs = get_elements_text_and_html_by_attribute('class', 'foo bar', fixture)
    expected_pairs = list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))
    self.assertEqual(pairs, expected_pairs)

    for unmatched_value in ('foo', 'no-such-foo'):
        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', unmatched_value, fixture), [])

    # With tag='a' only a single pair is expected from the inline snippet.
    only_anchor = get_elements_text_and_html_by_attribute(
        'class', 'foo', 'nicenot nice', tag='a')
    self.assertEqual(only_anchor, [('nice', 'nice')])
def test_get_element_text_and_html_by_tag(self):
    """Check by-tag lookup for 'div' and 'span', and None for a missing tag.

    The expected values are sliced out of the fixture by fixed offsets.
    NOTE(review): those offsets presuppose a specific fixture layout - confirm
    they still line up with the literal below.
    """
    raw_fixture = '''
random text lorem ipsum
this should be returned
this should also be returned
this should also be returned
closing tag above should not trick, so this should also be returned
but this text should not be returned
'''
    html = textwrap.indent(textwrap.dedent(raw_fixture), ' ' * 4)
    stripped = html.strip()

    # Outer element: the text is its HTML minus the opening/closing tag widths.
    div_html = stripped[32:276]
    div_text = div_html[5:-6]
    # Inner element, sliced the same way.
    span_html = stripped[78:119]
    span_text = span_html[6:-7]

    self.assertEqual(get_element_text_and_html_by_tag('div', html), (div_text, div_html))
    self.assertEqual(get_element_text_and_html_by_tag('span', html), (span_text, span_html))
    self.assertIsNone(get_element_text_and_html_by_tag('article', html))
def test_get_elements_text_and_html_by_tag(self):
    """Check plural by-tag lookup for 'img', plus a STRICT subclass lookup of 'use'."""
    class StrictParser(MatchingElementParser):
        STRICT = True

    fixture = '''
ignore
'''
    self.assertEqual(
        get_elements_text_and_html_by_tag('img', fixture),
        [('', ''), ('', '')])

    strict_result = StrictParser.get_element_text_and_html_by_tag('use', '')
    self.assertEqual(strict_result, ('', ''))
def test_get_element_text_and_html_by_tag_malformed(self):
# Exercises get_element_text_and_html_by_tag on a document assembled from
# f-string pieces: lookups for 'div', 'malnested_a', 'malnested_b', 'orphan'
# and a case-insensitive 'span'.
# NOTE(review): the f-strings below contain no tags at all, and the
# outerdiv_html literal is split across two lines (not valid Python) - the
# markup appears to have been lost; restore it before relying on this test.
# NOTE(review): the two 'orphan' calls are now identical yet expect
# ('', '') and None respectively; presumably their inputs differed - confirm.
inner_text = 'inner text'
malnested_elements = f'{inner_text}'
commented_html = ''
outerdiv_html = f'{malnested_elements}
'
html = f'{commented_html}{outerdiv_html}'
self.assertEqual(
get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
self.assertEqual(
get_element_text_and_html_by_tag('malnested_a', html),
(f'{inner_text}',
f'{inner_text}'))
self.assertEqual(
get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}',
f'{inner_text}'))
self.assertEqual(
get_element_text_and_html_by_tag('orphan', f'{html}'), ('', ''))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
# ignore case on tags
ci_html = f'{html}'
self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))
def test_strict_html_parsing(self):
# Uses an HTMLTagParser subclass with STRICT = True and expects taglist()
# to raise compat_HTMLParseError with specific messages for stray,
# unclosed, malnested and malformed closing tags.
# NOTE(review): the HTML inputs passed to taglist() (and part of two
# expected-message patterns) are empty or split across lines, which is not
# valid Python - the markup appears to have been lost; restore it before
# relying on this test.
class StrictTagParser(HTMLTagParser):
STRICT = True
parser = StrictTagParser()
with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
parser.taglist('', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
parser.taglist('', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"):
parser.taglist('
', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after ''"):
parser.taglist('
', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
parser.taglist('
', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
parser.taglist('
must be empty', reset=True)
def test_relaxed_html_parsing(self):
# Uses a plain HTMLTagParser (no STRICT override in this test) and expects
# taglist() to return Tag lists - [div, p] or [img] - whose
# text_and_html() pairs are then checked, instead of raising.
# NOTE(review): nearly every HTML literal below is empty or split across
# lines, which is not valid Python - the markup appears to have been lost;
# restore it before relying on this test.
Tag = HTMLTagParser.Tag
parser = HTMLTagParser()
self.assertEqual(parser.taglist('', reset=True), [])
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('', '
'))
self.assertEqual(tags[1].text_and_html(), ('', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('
', '
'))
self.assertEqual(tags[1].text_and_html(), ('
', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('
/p>', '
'))
self.assertEqual(tags[1].text_and_html(), ('', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(),
('
paragraph
', '
'))
self.assertEqual(tags[1].text_and_html(), ('paragraph', '
paragraph
'))
tags = parser.taglist('
must be empty', reset=True)
self.assertEqual(tags, [Tag('img')])
self.assertEqual(tags[0].text_and_html(), ('', '
'))
def test_compliant_html_parsing(self):
    """Expect two 'img' tags from the first document and a round-trip of the second."""
    # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
    Tag = HTMLTagParser.Tag
    void_tag_doc = '''
no error without closing tag:
self closing is ok:
'''
    parser = HTMLTagParser()
    self.assertEqual(
        parser.taglist(void_tag_doc, reset=True),
        [Tag('img'), Tag('img')])

    # don't get fooled by '>' in attributes
    attr_doc = '''
'''
    first_tag = parser.taglist(attr_doc, reset=True)[0]
    self.assertEqual(first_tag.text_and_html(), ('', attr_doc))
def test_tag_return_order(self):
    """Compare the str() of taglist() results for the three depth_first modes.

    Default order is expected as t0..t8, depth_first=True as closing order,
    and depth_first=None as a nested list structure.
    """
    Tag = HTMLTagParser.Tag
    document = '''
'''
    parser = HTMLTagParser()

    # default ordering
    rendered = str(parser.taglist(document, reset=True))
    in_document_order = [Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
                         Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]
    self.assertEqual(rendered, str(in_document_order))

    # depth_first=True
    rendered = str(parser.taglist(document, reset=True, depth_first=True))
    in_depth_first_order = [Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
                            Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]
    self.assertEqual(rendered, str(in_depth_first_order))

    # return tags in nested order
    rendered = str(parser.taglist(document, reset=True, depth_first=None))
    nested = [
        [Tag('t0'),
         [Tag('t1'),
          [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
         [Tag('t5'), [Tag('t6')]]],
        [Tag('t7'), [Tag('t8')]]]
    self.assertEqual(rendered, str(nested))
def test_html_ignored_ranges(self):
    """Check which character positions HTMLIgnoreRanges reports as ignored.

    The fixture is read as alternating pairs: a source line followed by a
    marker line with '^' under ignored positions and '-' elsewhere.
    NOTE(review): the fixture has two consecutive marker lines near the end,
    so a source line seems to be missing and the pairing may be off - confirm.
    """
    def mark_comments(text, char='^', nochar='-'):
        # Render one marker character per position of `text`.
        ignored = HTMLIgnoreRanges(text)
        return ''.join([char if pos in ignored else nochar for pos, _ in enumerate(text)])

    fixture = '''
no comments in this line
---------------------------------------------------------------------
----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before after
-----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
this is a leftover comment --> and end
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
--------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
'''
    rows = textwrap.dedent(fixture).strip().splitlines()
    for source_line, expected_marks in zip(rows[::2], rows[1::2]):
        # Including the source line in both tuples makes failures self-describing.
        self.assertEqual(
            (source_line, mark_comments(source_line)),
            (source_line, expected_marks))

    # yet we must be able to match script elements
    script_doc = ''''''
    script_match = get_element_text_and_html_by_tag('script', script_doc)
    self.assertEqual(script_match, ("var foo = 'bar';", script_doc))