1
0
mirror of https://github.com/squidfunk/mkdocs-material.git synced 2024-11-24 07:30:12 +01:00

Fixed highlighting of tags

This commit is contained in:
squidfunk 2022-12-11 15:55:12 +01:00
parent ee1496499a
commit 24a3be8f04
15 changed files with 252 additions and 190 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -211,7 +211,7 @@
"base": base_url, "base": base_url,
"features": features, "features": features,
"translations": {}, "translations": {},
"search": "assets/javascripts/workers/search.208e55ea.min.js" | url "search": "assets/javascripts/workers/search.f5389c75.min.js" | url
} -%} } -%}
{%- if config.extra.version -%} {%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%} {%- set _ = app.update({ "version": config.extra.version }) -%}
@ -239,13 +239,13 @@
</script> </script>
{% endblock %} {% endblock %}
{% block scripts %} {% block scripts %}
<script src="{{ 'assets/javascripts/bundle.f1ef77e2.min.js' | url }}"></script> <script src="{{ 'assets/javascripts/bundle.ce0331ff.min.js' | url }}"></script>
{% for path in config.extra_javascript %} {% for path in config.extra_javascript %}
<script src="{{ path | url }}"></script> <script src="{{ path | url }}"></script>
{% endfor %} {% endfor %}
{% endblock %} {% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %} {% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.b3906f4e.min.css' | url }}"> <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d35223bf.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script> <script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script>
{% endif %} {% endif %}
</body> </body>

View File

@ -30,6 +30,7 @@ import {
Position, Position,
PositionTable, PositionTable,
highlight, highlight,
highlightAll,
tokenize tokenize
} from "../internal" } from "../internal"
import { import {
@ -46,7 +47,9 @@ import {
/** /**
* Search item * Search item
*/ */
export interface SearchItem extends SearchDocument { export interface SearchItem
extends SearchDocument
{
score: number /* Score (relevance) */ score: number /* Score (relevance) */
terms: SearchQueryTerms /* Search query terms */ terms: SearchQueryTerms /* Search query terms */
} }
@ -213,6 +216,8 @@ export class Search {
.reduce<SearchItem[]>((item, { ref, score, matchData }) => { .reduce<SearchItem[]>((item, { ref, score, matchData }) => {
let doc = this.map.get(ref) let doc = this.map.get(ref)
if (typeof doc !== "undefined") { if (typeof doc !== "undefined") {
/* Shallow copy document */
doc = { ...doc } doc = { ...doc }
if (doc.tags) if (doc.tags)
doc.tags = [...doc.tags] doc.tags = [...doc.tags]
@ -223,39 +228,29 @@ export class Search {
Object.keys(matchData.metadata) Object.keys(matchData.metadata)
) )
// we must collect all positions for each term! /* Highlight matches in fields */
// we now take the keys of the index
for (const field of this.index.fields) { for (const field of this.index.fields) {
if (!(field in doc)) if (typeof doc[field] === "undefined")
continue continue
/* Collect matches */ /* Collect positions from matches */
const positions: Position[] = [] const positions: Position[] = []
for (const match of Object.values(matchData.metadata)) for (const match of Object.values(matchData.metadata))
if (field in match) if (typeof match[field] !== "undefined")
positions.push(...match[field].position) positions.push(...match[field].position)
/* Skip field, if no highlighting is necessary */ /* Skip highlighting, if no positions were collected */
if (!positions.length) if (!positions.length)
continue continue
// @ts-expect-error - @todo fix typings /* Load table and determine highlighting method */
if (Array.isArray(doc[field])) { const table = this.table.get([doc.location, field].join(":"))!
// @ts-expect-error - @todo fix typings const fn = Array.isArray(doc[field])
for (let i = 0; i < doc[field].length; i++) { ? highlightAll
// @ts-expect-error - @todo fix typings : highlight
doc[field][i] = highlight(doc[field][i],
this.table.get([doc.location, field].join(":"))!, // @ts-expect-error - stop moaning, TypeScript!
positions doc[field] = fn(doc[field], table, positions)
)
}
} else {
// @ts-expect-error - @todo fix typings
doc[field] = highlight(doc[field],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
} }
/* Highlight title and text and apply post-query boosts */ /* Highlight title and text and apply post-query boosts */

View File

@ -41,15 +41,12 @@ type VisitorFn = (
/** /**
* Split a string using the given separator * Split a string using the given separator
* *
* This function intentionally expects a visitor function argument, as opposed * @param input - Input value
* to collecting and returning all sections, for better memory efficiency.
*
* @param value - String value
* @param separator - Separator * @param separator - Separator
* @param fn - Visitor function * @param fn - Visitor function
*/ */
export function split( export function split(
value: string, separator: RegExp, fn: VisitorFn input: string, separator: RegExp, fn: VisitorFn
): void { ): void {
separator = new RegExp(separator, "g") separator = new RegExp(separator, "g")
@ -57,10 +54,10 @@ export function split(
let match: RegExpExecArray | null let match: RegExpExecArray | null
let index = 0 let index = 0
do { do {
match = separator.exec(value) match = separator.exec(input)
/* Emit non-empty range */ /* Emit non-empty range */
const until = match?.index ?? value.length const until = match?.index ?? input.length
if (index < until) if (index < until)
fn(index, until) fn(index, until)

View File

@ -20,6 +20,24 @@
* IN THE SOFTWARE. * IN THE SOFTWARE.
*/ */
/* ----------------------------------------------------------------------------
* Types
* ------------------------------------------------------------------------- */
/**
* Extraction type
*
* This type defines the possible values that are encoded into the lowest two
* bits of a section that is part of the blocks of a tokenization table. There
* are three types of interest: HTML opening and closing tags, as well as the
* actual text content we need to extract for indexing.
*/
export const enum Extract {
TAG_OPEN = 0, /* HTML opening tag */
TEXT = 1, /* Text content */
TAG_CLOSE = 2 /* HTML closing tag */
}
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* Helper types * Helper types
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
@ -28,12 +46,12 @@
* Visitor function * Visitor function
* *
* @param block - Block index * @param block - Block index
* @param operation - Operation index * @param type - Extraction type
* @param start - Start offset * @param start - Start offset
* @param end - End offset * @param end - End offset
*/ */
type VisitorFn = ( type VisitorFn = (
block: number, operation: number, start: number, end: number block: number, type: Extract, start: number, end: number
) => void ) => void
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
@ -41,18 +59,18 @@ type VisitorFn = (
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Extract all non-HTML parts of a string * Split a string into markup and text sections
* *
* This function preprocesses the given string by isolating all non-HTML parts, * This function scans a string and divides it up into sections of markup and
* in order to ensure that HTML tags are removed before indexing. Note that it * text. For each section, it invokes the given visitor function with the block
* intentionally expects a visitor function argument, as opposed to collecting * index, extraction type, as well as start and end offsets. Using a visitor
* and returning all sections, for better memory efficiency. * function (= streaming data) is ideal for minimizing pressure on the GC.
* *
* @param value - String value * @param input - Input value
* @param fn - Visitor function * @param fn - Visitor function
*/ */
export function extract( export function extract(
value: string, fn: VisitorFn input: string, fn: VisitorFn
): void { ): void {
let block = 0 /* Current block */ let block = 0 /* Current block */
@ -60,22 +78,22 @@ export function extract(
let end = 0 /* Current end offset */ let end = 0 /* Current end offset */
/* Split string into sections */ /* Split string into sections */
for (let stack = 0; end < value.length; end++) { for (let stack = 0; end < input.length; end++) {
/* Tag start after non-empty section */ /* Opening tag after non-empty section */
if (value.charAt(end) === "<" && end > start) { if (input.charAt(end) === "<" && end > start) {
fn(block, 1, start, start = end) fn(block, Extract.TEXT, start, start = end)
/* Tag end */ /* Closing tag */
} else if (value.charAt(end) === ">") { } else if (input.charAt(end) === ">") {
if (value.charAt(start + 1) === "/") { if (input.charAt(start + 1) === "/") {
if (--stack === 0) if (--stack === 0)
fn(block++, 2, start, end + 1) fn(block++, Extract.TAG_CLOSE, start, end + 1)
/* Tag is not self-closing */ /* Tag is not self-closing */
} else if (value.charAt(end - 1) !== "/") { } else if (input.charAt(end - 1) !== "/") {
if (stack++ === 0) if (stack++ === 0)
fn(block, 0, start, end + 1) fn(block, Extract.TAG_OPEN, start, end + 1)
} }
/* New section */ /* New section */
@ -85,5 +103,5 @@ export function extract(
/* Add trailing section */ /* Add trailing section */
if (end > start) if (end > start)
fn(block, 1, start, end) fn(block, Extract.TEXT, start, end)
} }

View File

@ -25,7 +25,7 @@
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Table for indexing * Position table
*/ */
export type PositionTable = number[][] export type PositionTable = number[][]
@ -46,21 +46,55 @@ export type Position = number
* when executing the query. It then highlights all occurrences, and returns * when executing the query. It then highlights all occurrences, and returns
* their concatenation. In case of multiple blocks, two are returned. * their concatenation. In case of multiple blocks, two are returned.
* *
* @param value - String value * @param input - Input value
* @param table - Table for indexing * @param table - Table for indexing
* @param positions - Occurrences * @param positions - Occurrences
* *
* @returns Highlighted string value * @returns Highlighted string value
*/ */
export function highlight( export function highlight(
value: string, table: PositionTable, positions: Position[] input: string, table: PositionTable, positions: Position[]
): string { ): string {
return highlightAll([input], table, positions).pop()!
}
/**
* Highlight all occurrences in a set of strings
*
* @param inputs - Input values
* @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string values
*/
export function highlightAll(
inputs: string[], table: PositionTable, positions: Position[]
): string[] {
/* Map blocks to input values */
const mapping = [0]
for (let t = 1; t < table.length; t++) {
const prev = table[t - 1]
const next = table[t]
/* Check if table points to new block */
const p = prev[prev.length - 1] >>> 2 & 0x3FF
const q = next[0] >>> 12
/* Add block to mapping */
mapping.push(+(p > q) + mapping[mapping.length - 1])
}
/* Highlight strings one after another */
return inputs.map((input, i) => {
/* Map occurrences to blocks */ /* Map occurrences to blocks */
const blocks = new Map<number, number[]>() const blocks = new Map<number, number[]>()
for (const i of positions.sort((a, b) => a - b)) { for (const p of positions.sort((a, b) => a - b)) {
const block = i >>> 20 const index = p & 0xFFFFF
const index = i & 0xFFFFF const block = p >>> 20
if (mapping[block] !== i)
continue
/* Ensure presence of block group */ /* Ensure presence of block group */
let group = blocks.get(block) let group = blocks.get(block)
@ -71,6 +105,10 @@ export function highlight(
group.push(index) group.push(index)
} }
/* Just return string, if no occurrences */
if (blocks.size === 0)
return input
/* Compute slices */ /* Compute slices */
const slices: string[] = [] const slices: string[] = []
for (const [block, indexes] of blocks) { for (const [block, indexes] of blocks) {
@ -81,18 +119,20 @@ export function highlight(
const end = t[t.length - 1] >>> 12 const end = t[t.length - 1] >>> 12
const length = t[t.length - 1] >>> 2 & 0x3FF const length = t[t.length - 1] >>> 2 & 0x3FF
/* Extract and highlight slice/block */ /* Extract and highlight slice */
let slice = value.slice(start, end + length) let slice = input.slice(start, end + length)
for (const i of indexes.sort((a, b) => b - a)) { for (const j of indexes.sort((a, b) => b - a)) {
/* Retrieve offset and length of match */ /* Retrieve offset and length of match */
const p = (t[i] >>> 12) - start const p = (t[j] >>> 12) - start
const q = (t[i] >>> 2 & 0x3FF) + p const q = (t[j] >>> 2 & 0x3FF) + p
/* Wrap occurrence */ /* Wrap occurrence */
slice = [ slice = [
slice.slice(0, p), slice.slice(0, p),
"<mark>", slice.slice(p, q), "</mark>", "<mark>",
slice.slice(p, q),
"</mark>",
slice.slice(q) slice.slice(q)
].join("") ].join("")
} }
@ -102,6 +142,7 @@ export function highlight(
break break
} }
/* Return highlighted string value */ /* Return highlighted slices */
return slices.join("") return slices.join("")
})
} }

View File

@ -21,19 +21,29 @@
*/ */
import { split } from "../_" import { split } from "../_"
import { extract } from "../extract" import {
Extract,
extract
} from "../extract"
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* Functions * Functions
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Split a string into tokens * Split a string or set of strings into tokens
* *
* This tokenizer supersedes the default tokenizer that is provided by Lunr.js, * This tokenizer supersedes the default tokenizer that is provided by Lunr.js,
* as it is aware of HTML tags and allows for multi-character splitting. * as it is aware of HTML tags and allows for multi-character splitting.
* *
* @param input - String value or token * It takes the given inputs, splits each of them into markup and text sections,
* tokenizes and segments (if necessary) each of them, and then indexes them in
* a table by using a compact bit representation. Bitwise techniques are used
* to write and read from the table during indexing and querying.
*
* @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
*
* @param input - Input value(s)
* *
* @returns Tokens * @returns Tokens
*/ */
@ -41,67 +51,75 @@ export function tokenize(
input?: string | string[] input?: string | string[]
): lunr.Token[] { ): lunr.Token[] {
const tokens: lunr.Token[] = [] const tokens: lunr.Token[] = []
if (typeof input === "undefined")
return tokens
/** /* Initialize segmenter, if loaded */
* Initialize segmenter, if loaded
*
* Note that doing this here is not ideal, but it's okay as we just test it
* before bringing the new search implementation in its final shape.
*/
const segmenter = "TinySegmenter" in lunr const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter() ? new lunr.TinySegmenter()
: undefined : undefined
/* Tokenize an array of string values */ /* Tokenize strings one after another */
if (Array.isArray(input)) { const inputs = Array.isArray(input) ? input : [input]
// @todo: handle multi-valued fields (e.g. tags) for (let i = 0; i < inputs.length; i++) {
for (const value of input)
tokens.push(...tokenize(value))
/* Tokenize a string value */
} else if (input) {
const table = lunr.tokenizer.table const table = lunr.tokenizer.table
const total = table.length
/* Split string into sections and tokenize content blocks */ /* Split string into sections and tokenize content blocks */
extract(input, (block, type, start, end) => { extract(inputs[i], (block, type, start, end) => {
if (type & 1) { block += total
const section = input.slice(start, end) switch (type) {
/* Handle markup */
case Extract.TAG_OPEN:
case Extract.TAG_CLOSE:
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
type
)
break
/* Handle text content */
case Extract.TEXT:
const section = inputs[i].slice(start, end)
split(section, lunr.tokenizer.separator, (index, until) => { split(section, lunr.tokenizer.separator, (index, until) => {
/** /**
* Apply segmenter after tokenization. Note that the segmenter will * Apply segmenter after tokenization. Note that the segmenter will
* also split words at word boundaries, which is not what we want, so * also split words at word boundaries, which is not what we want,
* we need to check if we can somehow mitigate this behavior. * so we need to check if we can somehow mitigate this behavior.
*/ */
if (typeof segmenter !== "undefined") { if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until) const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) { if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection) const segments = segmenter.segment(subsection)
for (let i = 0, l = 0; i < segments.length; i++) { for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to table */ /* Add block to section */
table[block] ||= [] table[block] ||= []
table[block].push( table[block].push(
start + index + l << 12 | start + index + l << 12 |
segments[i].length << 2 | segments[s].length << 2 |
type type
) )
/* Add block as token */ /* Add token with position */
tokens.push(new lunr.Token( tokens.push(new lunr.Token(
segments[i].toLowerCase(), { segments[s].toLowerCase(), {
position: block << 20 | table[block].length - 1 position: block << 20 | table[block].length - 1
} }
)) ))
/* Keep track of length */ /* Keep track of length */
l += segments[i].length l += segments[s].length
} }
return // combine segmenter with other approach!? return
} }
} }
/* Add block to table */ /* Add block to section */
table[block] ||= [] table[block] ||= []
table[block].push( table[block].push(
start + index << 12 | start + index << 12 |
@ -109,22 +127,13 @@ export function tokenize(
type type
) )
/* Add block as token */ /* Add token with position */
tokens.push(new lunr.Token( tokens.push(new lunr.Token(
section.slice(index, until).toLowerCase(), { section.slice(index, until).toLowerCase(), {
position: block << 20 | table[block].length - 1 position: block << 20 | table[block].length - 1
} }
)) ))
}) })
/* Add non-content block to table */
} else {
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
type
)
} }
}) })
} }

View File

@ -26,15 +26,17 @@ import lunr from "lunr"
* Global types * Global types
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
type Fields = "text" | "title" | "tags"
declare global { declare global {
namespace lunr { namespace lunr {
/** /**
* Index - expose inverted index * Index - expose inverted index
*/ */
interface Index { interface Index { // this is defined in the actual interface...
invertedIndex: Record<string, unknown> invertedIndex: Record<string, unknown>
fields: string[] // @todo: make typing generic? fields: Fields[]
} }
interface Builder { interface Builder {