Fixed blog readtime calculation to ignore non-content text (#7370)

2025-02-18 19:14:10 +01:00 · 2024-07-16 15:49:13 +02:00 · 2024-07-16 15:49:13 +02:00 · 6b13c560f5
commit 6b13c560f5
parent 4f8081c268
2 changed files with 64 additions and 6 deletions
--- a/material/plugins/blog/readtime/parser.py
+++ b/material/plugins/blog/readtime/parser.py
@@ -20,6 +20,10 @@

 from html.parser import HTMLParser

+# TODO: Refactor the `void` set into a common module and import it from there
+# and not from the search plugin.
+from material.plugins.search.plugin import void
+
 # -----------------------------------------------------------------------------
 # Classes
 # -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs = True)

+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Current context
+        self.context = []
+
        # Keep track of text and images
        self.text   = []
        self.images = 0

-    # Collect images
+    # Called at the start of every HTML tag
    def handle_starttag(self, tag, attrs):
+        # Collect images
        if tag == "img":
            self.images += 1

-    # Collect text
+        # Ignore self-closing tags
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
    def handle_data(self, data):
-        self.text.append(data)
+        # Collect text if not inside skip context
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()
--- a/src/plugins/blog/readtime/parser.py
+++ b/src/plugins/blog/readtime/parser.py
@@ -20,6 +20,10 @@

 from html.parser import HTMLParser

+# TODO: Refactor the `void` set into a common module and import it from there
+# and not from the search plugin.
+from material.plugins.search.plugin import void
+
 # -----------------------------------------------------------------------------
 # Classes
 # -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs = True)

+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Current context
+        self.context = []
+
        # Keep track of text and images
        self.text   = []
        self.images = 0

-    # Collect images
+    # Called at the start of every HTML tag
    def handle_starttag(self, tag, attrs):
+        # Collect images
        if tag == "img":
            self.images += 1

-    # Collect text
+        # Ignore self-closing tags
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
    def handle_data(self, data):
-        self.text.append(data)
+        # Collect text if not inside skip context
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()