Slight refactoring of tokenizer

2024-11-23 23:21:00 +01:00 · 2022-12-18 20:51:39 +01:00 · 2022-12-18 20:51:39 +01:00 · b64d0a6993
commit b64d0a6993
parent 8d190ef150
10 changed files with 27 additions and 24 deletions
--- a/material/assets/javascripts/workers/search.d78809e2.min.js
+++ b/material/assets/javascripts/workers/search.d78809e2.min.js
--- a/material/assets/javascripts/workers/search.d78809e2.min.js.map
+++ b/material/assets/javascripts/workers/search.d78809e2.min.js.map
--- a/material/assets/stylesheets/extra.82c6347d.min.css
+++ b/material/assets/stylesheets/extra.82c6347d.min.css
--- a/material/assets/stylesheets/extra.82c6347d.min.css.map
+++ b/material/assets/stylesheets/extra.82c6347d.min.css.map
--- a/material/assets/stylesheets/extra.d6bc9295.min.css
+++ b/material/assets/stylesheets/extra.d6bc9295.min.css
--- a/material/assets/stylesheets/extra.d6bc9295.min.css.map
+++ b/material/assets/stylesheets/extra.d6bc9295.min.css.map
--- a/material/base.html
+++ b/material/base.html
@@ -212,7 +212,7 @@
        "base": base_url,
        "features": features,
        "translations": {},
-        "search": "assets/javascripts/workers/search.3de43c86.min.js" | url
+        "search": "assets/javascripts/workers/search.d78809e2.min.js" | url
      } -%}
      {%- if config.extra.version -%}
        {%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -246,7 +246,7 @@
      {% endfor %}
    {% endblock %}
    {% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
-      <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d6bc9295.min.css' | url }}">
+      <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.82c6347d.min.css' | url }}">
      <script src="{{ 'assets/javascripts/extra/bundle.cfb3feee.min.js' | url }}" defer></script>
    {% endif %}
  </body>
--- a/src/assets/javascripts/integrations/search/_/index.ts
+++ b/src/assets/javascripts/integrations/search/_/index.ts
@@ -79,7 +79,7 @@ function extractor(table: Map<string, PositionTable>) {
      if (typeof doc[name] === "undefined")
        return undefined

-      /* Compute identifier and initiable table */
+      /* Compute identifier and initialize table */
      const id = [doc.location, name].join(":")
      table.set(id, lunr.tokenizer.table = [])

@@ -162,6 +162,11 @@ export class Search {
      this.tokenizer = tokenize as typeof lunr.tokenizer
      lunr.tokenizer.separator = new RegExp(config.separator)

+      /* Set up custom segmenter, if loaded */
+      lunr.segmenter = "TinySegmenter" in lunr
+        ? new lunr.TinySegmenter()
+        : undefined
+
      /* Compute functions to be removed from the pipeline */
      const fns = difference([
        "trimmer", "stopWordFilter", "stemmer"
--- a/src/assets/javascripts/integrations/search/internal/tokenize/index.ts
+++ b/src/assets/javascripts/integrations/search/internal/tokenize/index.ts
@@ -54,11 +54,6 @@ export function tokenize(
  if (typeof input === "undefined")
    return tokens

-  /* Initialize segmenter, if loaded */
-  const segmenter = "TinySegmenter" in lunr
-    ? new lunr.TinySegmenter()
-    : undefined
-
  /* Tokenize strings one after another */
  const inputs = Array.isArray(input) ? input : [input]
  for (let i = 0; i < inputs.length; i++) {
@@ -67,13 +62,12 @@ export function tokenize(

    /* Split string into sections and tokenize content blocks */
    extract(inputs[i], (block, type, start, end) => {
-      block += total
+      table[block += total] ||= []
      switch (type) {

        /* Handle markup */
        case Extract.TAG_OPEN:
        case Extract.TAG_CLOSE:
-          table[block] ||= []
          table[block].push(
            start       << 12 |
            end - start <<  2 |
@@ -91,10 +85,10 @@ export function tokenize(
             * also split words at word boundaries, which is not what we want,
             * so we need to check if we can somehow mitigate this behavior.
             */
-            if (typeof segmenter !== "undefined") {
+            if (typeof lunr.segmenter !== "undefined") {
              const subsection = section.slice(index, until)
-              if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
-                const segments = segmenter.segment(subsection)
+              if (/^[MHIK]$/.test(lunr.segmenter.ctype_(subsection))) {
+                const segments = lunr.segmenter.segment(subsection)
                for (let s = 0, l = 0; s < segments.length; s++) {

                  /* Add block to section */
@@ -120,7 +114,6 @@ export function tokenize(
            }

            /* Add block to section */
-            table[block] ||= []
            table[block].push(
              start + index << 12 |
              until - index <<  2 |
--- a/typings/lunr/index.d.ts
+++ b/typings/lunr/index.d.ts
@@ -78,6 +78,11 @@ declare global {
      let table: number[][]
    }

+    /**
+     * Segmenter
+     */
+    let segmenter: TinySegmenter | undefined
+
    /**
     * Lexeme type
     */