Last active
November 7, 2024 21:06
-
-
Save notriddle/aa8edfe3c1b0a4f5e777d6db179464a2 to your computer and use it in GitHub Desktop.
Revisions
-
notriddle revised this gist
Nov 7, 2024 . 1 changed file with 9 additions and 8 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -25,29 +25,30 @@ class EditDistanceCalculator { // Fast path were we already know the correct regex const c = esc[i]; current[i] = dist === 2 ? ("(?:(:?" + c + ".{0,2})|(?:." + c + ".?)|(?:.{0,2}" + c + "?))") : ("(?:(:?" + c + ".)|(?:." + c + ")|.?)"); continue; } let result = "("; let close = ")"; for (let j = i; j < l; j += 1) { // substitution | deletion result += "(?:.?" + prev[j + 1] + ")|"; // insertion result += "(?:." + prev[j] + ")|"; // transposition if (j + 1 < l && string[j + 1] !== string[j]) { result += "(?:" + esc[j + 1] + esc[j] + prev[j + 2] + ")|"; } // verbatim result += "(?:" + esc[j] + "(?:"; close += "))"; } // insertion after verbatim if (dist > 1) { result += ".{0," + dist + "}"; } else { result += ".?"; } result += close; current[i] = result; -
notriddle revised this gist
Nov 7, 2024 . 1 changed file with 14 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,11 +13,22 @@ class EditDistanceCalculator { let prev = []; let current = []; const l = string.length; const esc = []; for (let i = 0; i < l; i += 1) { esc[i] = string[i].replace(REGEX_SPECIALS, x => `\\${x}`); prev[i] = string.substring(i).replace(REGEX_SPECIALS, x => `\\${x}`); } prev[l] = ""; for (let dist = 1; dist <= limit; dist += 1) { for (let i = 0; i <= l; i += 1) { if (dist <= 2 && l - i === 1) { // Fast path were we already know the correct regex const c = esc[i]; current[i] = dist === 2 ? ("((" + c + ".{0,2})|(." + c + ".?)|(.{0,2}" + c + "?))") : ("((" + c + ".)|(." + c + ")|.?)"); continue; } let result = "("; let close = ")"; for (let j = i; j < l; j += 1) { @@ -27,11 +38,10 @@ class EditDistanceCalculator { result += "(." + prev[j] + ")|"; // transposition if (j + 1 < l && string[j + 1] !== string[j]) { result += "(" + esc[j + 1] + esc[j] + prev[j + 2] + ")|"; } // verbatim result += "(" + esc[j] + "("; close += "))"; } if (dist > 1) { -
notriddle revised this gist
Nov 7, 2024 . 1 changed file with 37 additions and 25 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,5 @@ // need to escape specials // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions const REGEX_SPECIALS = /[\$\(\)\*\+\.\/\?\[\\\]\^\{\|\}]/; class EditDistanceCalculator { constructor(string, limit) { @@ -8,33 +10,43 @@ class EditDistanceCalculator { this.matcherRegExp = new RegExp("^" + this.buildMatcherRegExp(string, limit) + "$"); } buildMatcherRegExp(string, limit) { let prev = []; let current = []; const l = string.length; for (let i = 0; i <= l; i += 1) { prev[i] = string.substring(i).replace(REGEX_SPECIALS, x => `\\${x}`); } for (let dist = 1; dist <= limit; dist += 1) { for (let i = 0; i <= l; i += 1) { let result = "("; let close = ")"; for (let j = i; j < l; j += 1) { // substitution | deletion result += "(.?" + prev[j + 1] + ")|"; // insertion result += "(." + prev[j] + ")|"; // transposition if (j + 1 < l && string[j + 1] !== string[j]) { result += "(" + (string[j + 1] + string[j]).replace(REGEX_SPECIALS, x => `\\${x}`) + prev[j + 2] + ")|"; } // verbatim result += "(" + string[j].replace(REGEX_SPECIALS, x => `\\${x}`) + "("; close += "))"; } if (dist > 1) { result += ".{0," + dist + "}"; // insertion } else { result += ".?"; // insertion } result += close; current[i] = result; } const tmp = prev; prev = current; current = prev; } return prev[0]; } /* * This function was translated, mostly line-for-line, from @@ -157,4 +169,4 @@ for (let limit = 0; limit < 4; limit += 1) { } } } } -
notriddle revised this gist
Nov 7, 2024 . 1 changed file with 2 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,10 +16,8 @@ class EditDistanceCalculator { let result = "("; let close = ")"; while (string !== "") { // substitution | deletion result += "(.?" + this.buildMatcherRegExp(string.substring(1), limit - 1) + ")|"; // insertion result += "(." + this.buildMatcherRegExp(string, limit - 1) + ")|"; // transposition -
notriddle created this gist
Nov 7, 2024 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,162 @@ const REGEX_SPECIALS = /[\$\(\)\*\+\.\/\?\[\\\]\^\{\|\}]/; class EditDistanceCalculator { constructor(string, limit) { this.string = string; this.current = []; this.prev = []; this.prevPrev = []; this.matcherRegExp = new RegExp("^" + this.buildMatcherRegExp(string, limit) + "$"); } buildMatcherRegExp(string, limit) { if (limit <= 0) { // need to escape specials // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions return string.replace(REGEX_SPECIALS, x => `\\${x}`); } let result = "("; let close = ")"; while (string !== "") { // substitution result += "(." + this.buildMatcherRegExp(string.substring(1), limit - 1) + ")|"; // deletion result += "(" + this.buildMatcherRegExp(string.substring(1), limit - 1) + ")|"; // insertion result += "(." + this.buildMatcherRegExp(string, limit - 1) + ")|"; // transposition if (string.length >= 2) { result += "(" + (string[1] + string[0]).replace(REGEX_SPECIALS, x => `\\${x}`) + this.buildMatcherRegExp(string.substring(2), limit - 1) + ")|"; } // verbatim result += "(" + string[0].replace(REGEX_SPECIALS, x => `\\${x}`) + "("; close += "))"; string = string.substring(1); } for (let i = 0; i < limit; ++i) { result += ".?"; // insertion } result += close; return result; } /* * This function was translated, mostly line-for-line, from * https://github.com/rust-lang/rust/blob/ff4b772f805ec1e/compiler/rustc_span/src/edit_distance.rs * * The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted * because it does not permit modifying characters that have already been transposed. The specific * algorithm should not matter to the caller of the methods, which is why it is not noted in the * documentation. */ calculate(b, limit) { let a = this.string; // Ensure that `b` is the shorter string, minimizing memory use. if (a.length < b.length) { const aTmp = a; a = b; b = aTmp; } const minDist = a.length - b.length; // If we know the limit will be exceeded, we can return early. if (minDist > limit) { return limit + 1; } // Strip common prefix. // We know that `b` is the shorter string, so we don't need to check // `a.length`. while (b.length > 0 && b[0] === a[0]) { a = a.substring(1); b = b.substring(1); } // Strip common suffix. while (b.length > 0 && b[b.length - 1] === a[a.length - 1]) { a = a.substring(0, a.length - 1); b = b.substring(0, b.length - 1); } // If either string is empty, the distance is the length of the other. // We know that `b` is the shorter string, so we don't need to check `a`. if (b.length === 0) { return minDist; } const aLength = a.length; const bLength = b.length; for (let i = 0; i <= bLength; ++i) { this.current[i] = 0; this.prev[i] = i; this.prevPrev[i] = Number.MAX_VALUE; } // row by row for (let i = 1; i <= aLength; ++i) { this.current[0] = i; const aIdx = i - 1; // column by column for (let j = 1; j <= bLength; ++j) { const bIdx = j - 1; // There is no cost to substitute a character with itself. const substitutionCost = a[aIdx] === b[bIdx] ? 0 : 1; this.current[j] = Math.min( // deletion this.prev[j] + 1, // insertion this.current[j - 1] + 1, // substitution this.prev[j - 1] + substitutionCost, ); if ((i > 1) && (j > 1) && (a[aIdx] === b[bIdx - 1]) && (a[aIdx - 1] === b[bIdx])) { // transposition this.current[j] = Math.min( this.current[j], this.prevPrev[j - 2] + 1, ); } } // Rotate the buffers, reusing the memory const prevPrevTmp = this.prevPrev; this.prevPrev = this.prev; this.prev = this.current; this.current = prevPrevTmp; } // `prev` because we already rotated the buffers. const distance = this.prev[bLength]; return distance <= limit ? distance : (limit + 1); } } function buildString(seed) { let string = ""; let alphabet = "ABCD"; let seed_builder = seed; while (seed_builder > 0) { string += alphabet[seed_builder & 3]; seed_builder = seed_builder >> 2; } return string; } for (let limit = 0; limit < 4; limit += 1) { console.log(limit); for (let seed = 0; seed < 1024; seed += 1) { let string1 = buildString(seed); for (let seed2 = 0; seed2 < 1024; seed2 += 1) { let string2 = buildString(seed2); let calculator = new EditDistanceCalculator(string1, limit); let matcherPasses = calculator.matcherRegExp.test(string2); let calculatePasses = calculator.calculate(string2, limit) < limit + 1; if (matcherPasses !== calculatePasses) { console.log(calculator.matcherRegExp); throw new Error("failed string1=" + string1 + " string2=" + string2 + " limit=" + limit + " matcherPasses=" + matcherPasses + " calculatePasses=" + calculatePasses); } } } }