"""Unit tests for the HTML helpers exposed by ``yt_dlp.parsing``."""
import textwrap
import unittest

from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
    MatchingElementParser,
    HTMLIgnoreRanges,
    HTMLTagParser,
)

# NOTE(review): most string fixtures in this file are empty or hold only bare
# text/newlines, while the expected values imply HTML tags and attributes.
# The markup appears to have been lost before this file reached review and
# cannot be reconstructed from what is visible here, so every literal below is
# reproduced exactly as found (raw line breaks inside single-line literals are
# spelled as ``\n``).  TODO: restore the fixtures from the authoritative copy
# before relying on these tests.

# Module-level shortcuts so the tests can call the parser helpers directly.
extract_attributes = MatchingElementParser.extract_attributes
get_element_by_attribute = MatchingElementParser.get_element_by_attribute
get_element_by_class = MatchingElementParser.get_element_by_class
get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
get_element_html_by_class = MatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
get_elements_by_class = MatchingElementParser.get_elements_by_class
get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag


class TestParsing(unittest.TestCase):
    """Exercises ``MatchingElementParser``, ``HTMLTagParser`` and ``HTMLIgnoreRanges``."""

    def test_extract_attributes(self):
        """``extract_attributes`` returns the attribute dict expected for each input."""
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(""), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': "a 'b' c"})
        self.assertEqual(extract_attributes(''), {'x': 'a "b" c'})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': '&'})  # XML
        self.assertEqual(extract_attributes(''), {'x': '"'})
        self.assertEqual(extract_attributes(''), {'x': '£'})  # HTML 3.2
        self.assertEqual(extract_attributes(''), {'x': 'λ'})  # HTML 4.0
        self.assertEqual(extract_attributes(''), {'x': '&foo'})
        self.assertEqual(extract_attributes(''), {'x': "'"})
        self.assertEqual(extract_attributes(''), {'x': '"'})
        self.assertEqual(extract_attributes(''), {'x': None})
        self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': 'y'})
        self.assertEqual(extract_attributes(""), {'x': 'y'})
        self.assertEqual(extract_attributes(''), {'x': '\ny\n'})
        self.assertEqual(extract_attributes(''), {'caps': 'x'})  # Names lowercased
        self.assertEqual(extract_attributes(''), {'x': '2'})
        self.assertEqual(extract_attributes(''), {'x': '2'})
        self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'})
        self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'})
        self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'})
        # "Narrow" Python builds don't support unicode code points outside BMP.
        try:
            chr(0x10000)
            supports_outside_bmp = True
        except ValueError:
            supports_outside_bmp = False
        if supports_outside_bmp:
            self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'})
        # Malformed HTML should not break attributes extraction on older Python
        self.assertEqual(extract_attributes(''), {})

    # Shared fixture for the single-element class/attribute lookups.
    GET_ELEMENT_BY_CLASS_TEST_STRING = ''' nice
also nice
'''

    def test_get_element_by_class(self):
        """A class lookup yields the element text, or ``None`` when nothing matches."""
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_by_class('foo', html), 'nice')
        self.assertEqual(get_element_by_class('no-such-class', html), None)

    def test_get_element_html_by_class(self):
        """The HTML-returning class lookup yields ``None`` when nothing matches."""
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_html_by_class('foo', html), 'nice')
        # The no-match case must go through the function under test; the
        # previous revision called get_element_by_class here by mistake.
        self.assertEqual(get_element_html_by_class('no-such-class', html), None)

    # Fixture for the ``itemprop`` attribute lookups.
    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' '''

    def test_get_element_by_attribute(self):
        """Attribute lookups need the full attribute value; ``tag=`` narrows the match."""
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
        self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')

        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')

    def test_get_element_html_by_attribute(self):
        """Same lookups as above, but returning the element's HTML."""
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), 'nice')
        self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)

        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
        self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())

    # Shared fixture and expected HTML results for the multi-element lookups.
    GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' nice also nice '''
    GET_ELEMENTS_BY_CLASS_RES = ['nice', 'also nice']

    def test_get_elements_by_class(self):
        """All matching elements are returned as a list; no match gives ``[]``."""
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_class('no-such-class', html), [])

    def test_get_elements_html_by_class(self):
        """HTML variant of the multi-element class lookup."""
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_class('no-such-class', html), [])

    def test_get_elements_by_attribute(self):
        """Multi-element attribute lookup; a partial attribute value does not match."""
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_html_by_attribute(self):
        """HTML variant of the multi-element attribute lookup."""
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_text_and_html_by_attribute(self):
        """Results are ``(text, html)`` pairs; ``tag=`` restricts which elements match."""
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', 'foo bar', html),
            list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
        self.assertEqual(get_elements_text_and_html_by_attribute(
            'class', 'foo', 'nicenot nice', tag='a'), [('nice', 'nice')])

    def test_get_element_text_and_html_by_tag(self):
        """Tag lookup returns ``(text, html)`` for the first match and ``None`` otherwise."""
        get_element_by_tag_test_string = ''' random text lorem ipsum

this should be returned this should also be returned
this should also be returned
closing tag above should not trick, so this should also be returned
but this text should not be returned '''
        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
        # The expected values are sliced out of the fixture by fixed offsets,
        # so they are only meaningful against the intact fixture text.
        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]

        self.assertEqual(
            get_element_text_and_html_by_tag('div', html),
            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('span', html),
            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
        self.assertIsNone(get_element_text_and_html_by_tag('article', html))

    def test_get_elements_text_and_html_by_tag(self):
        """Tag lookup for ``img``, and for ``use`` with a ``STRICT`` parser subclass."""
        class StrictParser(MatchingElementParser):
            STRICT = True

        test_string = ''' ignore '''
        items = get_elements_text_and_html_by_tag('img', test_string)
        self.assertEqual(items, [('', ''), ('', '')])

        self.assertEqual(
            StrictParser.get_element_text_and_html_by_tag('use', ''),
            ('', ''))

    def test_get_element_text_and_html_by_tag_malformed(self):
        """Tag lookup still works on malnested, commented and orphaned markup."""
        inner_text = 'inner text'
        malnested_elements = f'{inner_text}'
        commented_html = ''
        outerdiv_html = f'\n{malnested_elements}\n'
        html = f'{commented_html}{outerdiv_html}'

        self.assertEqual(
            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_a', html),
            (f'{inner_text}', f'{inner_text}'))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_b', html),
            (f'{inner_text}', f'{inner_text}'))
        self.assertEqual(
            get_element_text_and_html_by_tag('orphan', f'{html}'), ('', ''))
        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))

        # ignore case on tags
        ci_html = f'{html}'
        self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))

    def test_strict_html_parsing(self):
        """With ``STRICT = True`` the tag parser raises ``compat_HTMLParseError``."""
        class StrictTagParser(HTMLTagParser):
            STRICT = True

        parser = StrictTagParser()
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
            parser.taglist('\n\n', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
            parser.taglist('\n\n', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '\n\n'"):
            parser.taglist('\n\n', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '\n\n'"):
            parser.taglist('\n\n/p>\n\n', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
            parser.taglist('\n\n', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
            parser.taglist('must be empty', reset=True)

    def test_relaxed_html_parsing(self):
        """The default (non-strict) tag parser returns tags instead of raising."""
        Tag = HTMLTagParser.Tag
        parser = HTMLTagParser()

        self.assertEqual(parser.taglist('\n\n', reset=True), [])

        tags = parser.taglist('\n\n', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('', '\n\n'))
        self.assertEqual(tags[1].text_and_html(), ('', '\n\n'))

        tags = parser.taglist('\n\n', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('\n\n', '\n\n'))
        self.assertEqual(tags[1].text_and_html(), ('\n', '\n\n'))

        tags = parser.taglist('\n\n/p>\n\n', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('\n\n/p>', '\n\n/p>\n\n'))
        self.assertEqual(tags[1].text_and_html(), ('', '\n\n'))

        tags = parser.taglist('\n\nparagraph\n\n', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('\n\nparagraph', '\n\nparagraph\n\n'))
        self.assertEqual(tags[1].text_and_html(), ('paragraph', '\n\nparagraph'))

        tags = parser.taglist('must be empty', reset=True)
        self.assertEqual(tags, [Tag('img')])
        self.assertEqual(tags[0].text_and_html(), ('', ''))

    def test_compliant_html_parsing(self):
        """Void elements parse without a closing tag, in either spelling."""
        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
        Tag = HTMLTagParser.Tag
        html = ''' no error without closing tag: self closing is ok: '''
        parser = HTMLTagParser()
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags, [Tag('img'), Tag('img')])

        # don't get fooled by '>' in attributes
        html = ''''''
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags[0].text_and_html(), ('', html))

    def test_tag_return_order(self):
        """``depth_first`` selects document order, depth-first order or a nested list."""
        Tag = HTMLTagParser.Tag
        html = ''' '''
        parser = HTMLTagParser()

        tags = parser.taglist(html, reset=True)
        self.assertEqual(
            str(tags),
            str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
                 Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))

        tags = parser.taglist(html, reset=True, depth_first=True)
        self.assertEqual(
            str(tags),
            str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
                 Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))

        # return tags in nested order
        tags = parser.taglist(html, reset=True, depth_first=None)
        self.assertEqual(
            str(tags),
            str([
                [Tag('t0'),
                 [Tag('t1'),
                  [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
                 [Tag('t5'), [Tag('t6')]]],
                [Tag('t7'), [Tag('t8')]]]))

    def test_html_ignored_ranges(self):
        """``HTMLIgnoreRanges`` membership is compared against hand-drawn marker lines."""
        def mark_comments(_string, char='^', nochar='-'):
            # Draw ``char`` under every index that HTMLIgnoreRanges reports as
            # contained, and ``nochar`` under every other index.
            cmts = HTMLIgnoreRanges(_string)
            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))

        html_string = ''' no comments in this line --------------------------------------------------------------------- ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--- before after -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------- this is a leftover comment --> and end ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^--------- --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^--------- '''

        # Even-indexed lines are inputs, odd-indexed lines are their markers.
        lines = textwrap.dedent(html_string).strip().splitlines()
        for line, marker in zip(lines[0::2], lines[1::2]):
            # ``line`` is on both sides so a failure message shows the input too.
            self.assertEqual((line, mark_comments(line)), (line, marker))

        # yet we must be able to match script elements
        test_string = ''''''
        items = get_element_text_and_html_by_tag('script', test_string)
        self.assertEqual(items, ("var foo = 'bar';", test_string))