Fixed search plugin crashing on nested headlines

2025-02-18 19:14:10 +01:00 · 2023-01-08 09:39:05 +01:00 · 2023-01-08 09:39:05 +01:00 · 81e7b8c7fc
commit 81e7b8c7fc
parent c4d61cdc41
2 changed files with 76 additions and 28 deletions
--- a/material/plugins/search/plugin.py
+++ b/material/plugins/search/plugin.py
@@ -266,6 +266,10 @@ class Element:
        self.tag   = tag
        self.attrs = attrs

+    # String representation
+    def __repr__(self):
+        return self.tag
+
    # Support comparison (compare by tag only)
    def __eq__(self, other):
        if other is Element:
@@ -291,12 +295,22 @@ class Section:
    """

    # Initialize HTML section
-    def __init__(self, el):
+    def __init__(self, el, depth = 0):
        self.el = el
+        self.depth = depth
+
+        # Initialize section data
        self.text  = []
        self.title = []
        self.id = None

+    # String representation
+    def __repr__(self):
+        if self.id:
+            return "#".join([self.el.tag, self.id])
+        else:
+            return self.el.tag
+
    # Check whether the section should be excluded
    def is_excluded(self):
        return self.el.is_excluded()
@@ -350,15 +364,16 @@ class Parser(HTMLParser):

        # Handle headings
        if tag in ([f"h{x}" for x in range(1, 7)]):
+            depth = len(self.context)
            if "id" in attrs:

                # Ensure top-level section
                if tag != "h1" and not self.data:
-                    self.section = Section(Element("hx"))
+                    self.section = Section(Element("hx"), depth)
                    self.data.append(self.section)

                # Set identifier, if not first section
-                self.section = Section(el)
+                self.section = Section(el, depth)
                if self.data:
                    self.section.id = attrs["id"]

@@ -398,6 +413,20 @@ class Parser(HTMLParser):
        if not self.context or self.context[-1] != tag:
            return

+        # Check whether we're exiting the current context, which happens when
+        # a headline is nested in another element. In that case, we close the
+        # current section, continuing to append data to the previous section,
+        # which could also be a nested section – see https://bit.ly/3IxxIJZ
+        if self.section.depth > len(self.context):
+            for section in reversed(self.data):
+                if section.depth and section.depth <= len(self.context):
+
+                    # Set depth to 0 in order to denote that the current section
+                    # is exited and must not be considered again.
+                    self.section.depth = 0
+                    self.section = section
+                    break
+
        # Remove element from skip list
        el = self.context.pop()
        if el in self.skip:
@@ -407,18 +436,13 @@ class Parser(HTMLParser):
        # Render closing tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:
+
+                # Check whether we're inside the section title
                data = self.section.text
-                if self.section.el in reversed(self.context):
+                if self.section.el in self.context:
                    data = self.section.title

-                # Remove element if empty (or only whitespace)
-                if data[-1] == f"<{tag}>":
-                    del data[-1:]
-                elif data[-1].isspace() and data[-2] == f"<{tag}>":
-                    del data[-2:]
-
                # Append to section title or text
-                else:
                data.append(f"</{tag}>")

    # Called for the text contents of each tag
@@ -439,7 +463,7 @@ class Parser(HTMLParser):
            self.data.append(self.section)

        # Handle section headline
-        if self.section.el in reversed(self.context):
+        if self.section.el in self.context:
            permalink = False
            for el in self.context:
                if el.tag == "a" and el.attrs.get("class") == "headerlink":
--- a/src/plugins/search/plugin.py
+++ b/src/plugins/search/plugin.py
@@ -266,6 +266,10 @@ class Element:
        self.tag   = tag
        self.attrs = attrs

+    # String representation
+    def __repr__(self):
+        return self.tag
+
    # Support comparison (compare by tag only)
    def __eq__(self, other):
        if other is Element:
@@ -291,12 +295,22 @@ class Section:
    """

    # Initialize HTML section
-    def __init__(self, el):
+    def __init__(self, el, depth = 0):
        self.el = el
+        self.depth = depth
+
+        # Initialize section data
        self.text  = []
        self.title = []
        self.id = None

+    # String representation
+    def __repr__(self):
+        if self.id:
+            return "#".join([self.el.tag, self.id])
+        else:
+            return self.el.tag
+
    # Check whether the section should be excluded
    def is_excluded(self):
        return self.el.is_excluded()
@@ -350,15 +364,16 @@ class Parser(HTMLParser):

        # Handle headings
        if tag in ([f"h{x}" for x in range(1, 7)]):
+            depth = len(self.context)
            if "id" in attrs:

                # Ensure top-level section
                if tag != "h1" and not self.data:
-                    self.section = Section(Element("hx"))
+                    self.section = Section(Element("hx"), depth)
                    self.data.append(self.section)

                # Set identifier, if not first section
-                self.section = Section(el)
+                self.section = Section(el, depth)
                if self.data:
                    self.section.id = attrs["id"]

@@ -398,6 +413,20 @@ class Parser(HTMLParser):
        if not self.context or self.context[-1] != tag:
            return

+        # Check whether we're exiting the current context, which happens when
+        # a headline is nested in another element. In that case, we close the
+        # current section, continuing to append data to the previous section,
+        # which could also be a nested section – see https://bit.ly/3IxxIJZ
+        if self.section.depth > len(self.context):
+            for section in reversed(self.data):
+                if section.depth and section.depth <= len(self.context):
+
+                    # Set depth to 0 in order to denote that the current section
+                    # is exited and must not be considered again.
+                    self.section.depth = 0
+                    self.section = section
+                    break
+
        # Remove element from skip list
        el = self.context.pop()
        if el in self.skip:
@@ -407,18 +436,13 @@ class Parser(HTMLParser):
        # Render closing tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:
+
+                # Check whether we're inside the section title
                data = self.section.text
-                if self.section.el in reversed(self.context):
+                if self.section.el in self.context:
                    data = self.section.title

-                # Remove element if empty (or only whitespace)
-                if data[-1] == f"<{tag}>":
-                    del data[-1:]
-                elif data[-1].isspace() and data[-2] == f"<{tag}>":
-                    del data[-2:]
-
                # Append to section title or text
-                else:
                data.append(f"</{tag}>")

    # Called for the text contents of each tag
@@ -439,7 +463,7 @@ class Parser(HTMLParser):
            self.data.append(self.section)

        # Handle section headline
-        if self.section.el in reversed(self.context):
+        if self.section.el in self.context:
            permalink = False
            for el in self.context:
                if el.tag == "a" and el.attrs.get("class") == "headerlink":