1
0
mirror of synced 2024-11-15 02:37:40 +01:00

Added 'Fuzzy Match' operation

This commit is contained in:
n1474335 2021-02-12 13:51:51 +00:00
parent 5893ac1a37
commit 8ad18bc7db
9 changed files with 196 additions and 39 deletions

View File

@ -19,7 +19,7 @@ jobs:
- name: Install
run: |
npm install
export NODE_OPTIONS=--max_old_space_size=2048
npm run setheapsize
- name: Lint
run: npx grunt lint

View File

@ -18,7 +18,7 @@ jobs:
- name: Install
run: |
npm install
export NODE_OPTIONS=--max_old_space_size=2048
npm run setheapsize
- name: Lint
run: npx grunt lint

View File

@ -19,7 +19,7 @@ jobs:
- name: Install
run: |
npm install
export NODE_OPTIONS=--max_old_space_size=2048
npm run setheapsize
- name: Lint
run: npx grunt lint

View File

@ -173,6 +173,8 @@
"testuidev": "npx nightwatch --env=dev",
"lint": "npx grunt lint",
"postinstall": "npx grunt exec:fixCryptoApiImports",
"newop": "node --experimental-modules src/core/config/scripts/newOperation.mjs"
"newop": "node --experimental-modules src/core/config/scripts/newOperation.mjs",
"getheapsize": "node -e 'console.log(`node heap limit = ${require(\"v8\").getHeapStatistics().heap_size_limit / (1024 * 1024)} Mb`)'",
"setheapsize": "export NODE_OPTIONS=--max_old_space_size=2048"
}
}

View File

@ -238,6 +238,7 @@
"Pad lines",
"Find / Replace",
"Regular expression",
"Fuzzy Match",
"Offset checker",
"Hamming Distance",
"Convert distance",

View File

@ -16,40 +16,72 @@
* Anurag Awasthi - updated to 0.2.0
*/
const SEQUENTIAL_BONUS = 15; // bonus for adjacent matches
const SEPARATOR_BONUS = 30; // bonus if match occurs after a separator
const CAMEL_BONUS = 30; // bonus if match is uppercase and prev is lower
const FIRST_LETTER_BONUS = 15; // bonus if the first letter is matched
export const DEFAULT_WEIGHTS = {
sequentialBonus: 15, // bonus for adjacent matches
separatorBonus: 30, // bonus if match occurs after a separator
camelBonus: 30, // bonus if match is uppercase and prev is lower
firstLetterBonus: 15, // bonus if the first letter is matched
const LEADING_LETTER_PENALTY = -5; // penalty applied for every letter in str before the first match
const MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters
const UNMATCHED_LETTER_PENALTY = -1;
leadingLetterPenalty: -5, // penalty applied for every letter in str before the first match
maxLeadingLetterPenalty: -15, // maximum penalty for leading letters
unmatchedLetterPenalty: -1
};
/**
* Does a fuzzy search to find pattern inside a string.
* @param {*} pattern string pattern to search for
* @param {*} str string string which is being searched
* @param {string} pattern pattern to search for
* @param {string} str string which is being searched
* @param {boolean} global whether to search for all matches or just one
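* @param {Object} weights bonus and penalty values used for scoring (defaults to DEFAULT_WEIGHTS)
* @returns {Array} [matched, score, matchIndexes] when global is false, or an array of
* such [matched, score, matchIndexes] triples (one per match found) when global is true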
*/
export function fuzzyMatch(pattern, str) {
export function fuzzyMatch(pattern, str, global=false, weights=DEFAULT_WEIGHTS) {
const recursionCount = 0;
const recursionLimit = 10;
const matches = [];
const maxMatches = 256;
if (!global) {
return fuzzyMatchRecursive(
pattern,
str,
0 /* patternCurIndex */,
0 /* strCurrIndex */,
null /* srcMatces */,
null /* srcMatches */,
matches,
maxMatches,
0 /* nextMatch */,
recursionCount,
recursionLimit
recursionLimit,
weights
);
}
// Return all matches
let foundMatch = true,
score,
idxs,
strCurrIndex = 0;
const results = [];
while (foundMatch) {
[foundMatch, score, idxs] = fuzzyMatchRecursive(
pattern,
str,
0 /* patternCurIndex */,
strCurrIndex,
null /* srcMatches */,
matches,
maxMatches,
0 /* nextMatch */,
recursionCount,
recursionLimit,
weights
);
if (foundMatch) results.push([foundMatch, score, [...idxs]]);
strCurrIndex = idxs[idxs.length - 1] + 1;
}
return results;
}
/**
@ -65,7 +97,8 @@ function fuzzyMatchRecursive(
maxMatches,
nextMatch,
recursionCount,
recursionLimit
recursionLimit,
weights
) {
let outScore = 0;
@ -110,7 +143,8 @@ function fuzzyMatchRecursive(
maxMatches,
nextMatch,
recursionCount,
recursionLimit
recursionLimit,
weights
);
if (matched) {
@ -134,16 +168,16 @@ function fuzzyMatchRecursive(
outScore = 100;
// Apply leading letter penalty
let penalty = LEADING_LETTER_PENALTY * matches[0];
let penalty = weights.leadingLetterPenalty * matches[0];
penalty =
penalty < MAX_LEADING_LETTER_PENALTY ?
MAX_LEADING_LETTER_PENALTY :
penalty < weights.maxLeadingLetterPenalty ?
weights.maxLeadingLetterPenalty :
penalty;
outScore += penalty;
// Apply unmatched penalty
const unmatched = str.length - nextMatch;
outScore += UNMATCHED_LETTER_PENALTY * unmatched;
outScore += weights.unmatchedLetterPenalty * unmatched;
// Apply ordering bonuses
for (let i = 0; i < nextMatch; i++) {
@ -152,7 +186,7 @@ function fuzzyMatchRecursive(
if (i > 0) {
const prevIdx = matches[i - 1];
if (currIdx === prevIdx + 1) {
outScore += SEQUENTIAL_BONUS;
outScore += weights.sequentialBonus;
}
}
@ -165,15 +199,15 @@ function fuzzyMatchRecursive(
neighbor !== neighbor.toUpperCase() &&
curr !== curr.toLowerCase()
) {
outScore += CAMEL_BONUS;
outScore += weights.camelBonus;
}
const isNeighbourSeparator = neighbor === "_" || neighbor === " ";
if (isNeighbourSeparator) {
outScore += SEPARATOR_BONUS;
outScore += weights.separatorBonus;
}
} else {
// First letter
outScore += FIRST_LETTER_BONUS;
outScore += weights.firstLetterBonus;
}
}

View File

@ -0,0 +1,120 @@
/**
* @author n1474335 [n1474335@gmail.com]
* @copyright Crown Copyright 2021
* @license Apache-2.0
*/
import Operation from "../Operation.mjs";
import {fuzzyMatch, calcMatchRanges, DEFAULT_WEIGHTS} from "../lib/FuzzyMatch.mjs";
/**
 * Characters that must be replaced before text is placed inside HTML markup.
 */
const HTML_ESCAPES = Object.freeze({
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "\"": "&quot;",
    "'": "&#x27;"
});

/**
 * Escapes HTML metacharacters so that a slice of the input is rendered as
 * literal text rather than interpreted as markup.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeHtml(str) {
    return str.replace(/[&<>"']/g, chr => HTML_ESCAPES[chr]);
}

/**
 * Fuzzy Match operation
 *
 * Runs a global fuzzy search for the "Search" argument over the input and
 * returns the input as HTML with every matched run of characters in bold and
 * each match wrapped in an alternating highlight span.
 */
class FuzzyMatch extends Operation {

    /**
     * FuzzyMatch constructor
     */
    constructor() {
        super();

        this.name = "Fuzzy Match";
        this.module = "Default";
        this.description = "Conducts a fuzzy search to find a pattern within the input based on weighted criteria.<br><br>e.g. A search for <code>dpan</code> will match on <code><b>D</b>on't <b>Pan</b>ic</code>";
        this.infoURL = "https://wikipedia.org/wiki/Fuzzy_matching_(computer-assisted_translation)";
        this.inputType = "string";
        this.outputType = "html";
        this.args = [
            {
                name: "Search",
                type: "binaryString",
                value: ""
            },
            {
                name: "Sequential bonus",
                type: "number",
                value: DEFAULT_WEIGHTS.sequentialBonus,
                hint: "Bonus for adjacent matches"
            },
            {
                name: "Separator bonus",
                type: "number",
                value: DEFAULT_WEIGHTS.separatorBonus,
                hint: "Bonus if match occurs after a separator"
            },
            {
                name: "Camel bonus",
                type: "number",
                value: DEFAULT_WEIGHTS.camelBonus,
                hint: "Bonus if match is uppercase and previous is lower"
            },
            {
                name: "First letter bonus",
                type: "number",
                value: DEFAULT_WEIGHTS.firstLetterBonus,
                hint: "Bonus if the first letter is matched"
            },
            {
                name: "Leading letter penalty",
                type: "number",
                value: DEFAULT_WEIGHTS.leadingLetterPenalty,
                hint: "Penalty applied for every letter in the input before the first match"
            },
            {
                name: "Max leading letter penalty",
                type: "number",
                value: DEFAULT_WEIGHTS.maxLeadingLetterPenalty,
                hint: "Maximum penalty for leading letters"
            },
            {
                name: "Unmatched letter penalty",
                type: "number",
                value: DEFAULT_WEIGHTS.unmatchedLetterPenalty,
                hint: "Penalty for every letter that doesn't match"
            },
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args - search string followed by the seven scoring
     *     weights, in the order declared in the constructor
     * @returns {html}
     */
    run(input, args) {
        const [
            searchStr,
            sequentialBonus,
            separatorBonus,
            camelBonus,
            firstLetterBonus,
            leadingLetterPenalty,
            maxLeadingLetterPenalty,
            unmatchedLetterPenalty
        ] = args;
        const weights = {
            sequentialBonus,
            separatorBonus,
            camelBonus,
            firstLetterBonus,
            leadingLetterPenalty,
            maxLeadingLetterPenalty,
            unmatchedLetterPenalty
        };

        const matches = fuzzyMatch(searchStr, input, true, weights);

        // In global mode fuzzyMatch returns an array, which is truthy even
        // when empty, so the length has to be checked explicitly.
        if (!matches || matches.length === 0) {
            return "No matches.";
        }

        let result = "",
            pos = 0,
            hlClass = "hl1";

        // Each entry is [matched, score, idxs]; only the indexes are needed here.
        matches.forEach(([, , idxs]) => {
            const matchRanges = calcMatchRanges(idxs);

            // Without ranges no <span> is opened, so nothing must be closed.
            if (matchRanges.length === 0) return;

            matchRanges.forEach(([start, length], i) => {
                // Unmatched text between the previous range and this one.
                // The output is rendered as HTML, so input text is escaped.
                result += escapeHtml(input.slice(pos, start));
                if (i === 0) result += `<span class="${hlClass}">`;
                pos = start + length;
                result += `<b>${escapeHtml(input.slice(start, pos))}</b>`;
            });
            result += "</span>";

            // Alternate highlight classes so adjacent matches are distinguishable.
            hlClass = hlClass === "hl1" ? "hl2" : "hl1";
        });

        // Trailing text after the final match.
        result += escapeHtml(input.slice(pos, input.length));

        return result;
    }

}
export default FuzzyMatch;

View File

@ -185,7 +185,7 @@ class RegularExpression extends Operation {
* @param {boolean} captureGroups - Display each of the capture groups separately
* @returns {string}
*/
function regexList (input, regex, displayTotal, matches, captureGroups) {
function regexList(input, regex, displayTotal, matches, captureGroups) {
let output = "",
total = 0,
match;
@ -225,7 +225,7 @@ function regexList (input, regex, displayTotal, matches, captureGroups) {
* @param {boolean} displayTotal
* @returns {string}
*/
function regexHighlight (input, regex, displayTotal) {
function regexHighlight(input, regex, displayTotal) {
let output = "",
title = "",
hl = 1,

View File

@ -6,7 +6,7 @@
import HTMLOperation from "../HTMLOperation.mjs";
import Sortable from "sortablejs";
import {fuzzyMatch, calcMatchRanges} from "../../core/lib/FuzzySearch.mjs";
import {fuzzyMatch, calcMatchRanges} from "../../core/lib/FuzzyMatch.mjs";
/**