From 63e9220e290624041c8c84e15bd766914f5df4ca Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 26 Mar 2026 09:10:56 -0500 Subject: [PATCH 01/16] change(web): simplify model.predict() calls Rather than copying over part of the existing context just to delete it, we can simplify prediction calls by just pre-deleting the current token, then applying any relevant deleteLeft transform component afterward to resulting predictions. Build-bot: skip build:web Test-bot: skip --- .../worker-thread/src/main/predict-helpers.ts | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 891dd923cc8..f1ddfe38792 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -321,6 +321,7 @@ export function determineContextTransition( return transition; } +// TODO: Remove this and its associated unit tests! /** * Determines where the context for prediction-generation should be rooted and how * much of the context it should replace. @@ -428,8 +429,6 @@ export function determineSuggestionRange( } } - tokensToPredict.reverse(); - // Can occur when backspacing to the end of a previous word. if(tokensToPredict.length == 0) { if(tokenSetA.length == 0 || tokenSetB.length == 0) { @@ -439,6 +438,7 @@ export function determineSuggestionRange( tokensToPredict.push(tokenSetB.pop()); } + tokensToPredict.reverse(); tokensToRemove.reverse(); return { @@ -465,33 +465,37 @@ export function buildAndMapPredictions( ): CorrectionPredictionTuple[] { const model = transition.final.model; - // No matter the prediction, once we know the root of the prediction, we'll - // always 'replace' the same amount of text. We can handle this before the - // big 'prediction root' loop. 
- const { predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, model); + const applicationTarget = transition.base.displayTokenization; + const { tokensToRemove, tokensToPredict } = determineSuggestionRange(applicationTarget, tokenization); - let correction = match.matchString; - let rootCost = match.totalCost; + const deleteLeft = tokensToPredict.length > 1 ? 0 : tokensToRemove.reduce((prev, curr) => prev + curr.searchModule.codepointLength, 0); + + // Exists to be extended by the 'correctionTransfrom' below. + const emptyContext: Context = { + left: '', + startOfBuffer: false, + endOfBuffer: false + }; // Replace the existing context with the correction. const correctionTransform: Transform = { - insert: correction, // insert correction string - deleteLeft: deleteLeft, + insert: match.matchString, // insert correction string + deleteLeft: 0, id: transition.transitionId // The correction should always be based on the most recent external transform/transcription ID. } + const rootCost = match.totalCost; const predictionRoot = { sample: correctionTransform, p: Math.exp(-rootCost * costFactor) }; - // Worth considering: extend Traversal to allow direct prediction lookups? - // let traversal = match.finalTraversal; // ... - let predictions = predictFromCorrections(model, [predictionRoot], predictionContext); + let predictions = predictFromCorrections(model, [predictionRoot], emptyContext); predictions.forEach((entry) => { entry.preservationTransform = tokenization.taillessTrueKeystroke; // // Will need an extra lookup layer if the suggestion is generated from within a cluster. // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization); + entry.prediction.sample.transform.deleteLeft = deleteLeft; }); return predictions; @@ -1069,13 +1073,10 @@ export function finalizeSuggestions( // // Note: may need adjustment if/when supporting phrase-level correction. 
if(tuple.preservationTransform) { - const presDL = tuple.preservationTransform.deleteLeft; - const mergedTransform = models.buildMergedTransform(tuple.preservationTransform, prediction.sample.transform); - // Any preserved delete-left is applied early because it directly affects the suggestion - // root; we need to remove that preserved delete-left here. - if(presDL > 0) { - mergedTransform.deleteLeft -= presDL; - } + const mergedTransform = { + ...models.buildMergedTransform(tuple.preservationTransform, {...prediction.sample.transform, deleteLeft: 0}), + deleteLeft: prediction.sample.transform.deleteLeft + }; // Temporarily and locally drops 'readonly' semantics so that we can reassign the transform. // See https://www.typescriptlang.org/docs/handbook/release-notes/typescript-2-8.html#improved-control-over-mapped-type-modifiers From 3f8323757f9f5ebf5c1ccb27ba023d2baa5051d9 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Mon, 27 Apr 2026 16:38:40 -0500 Subject: [PATCH 02/16] change(web): remove determineSuggestionAlignment method in favor of determineSuggestionRange --- .../worker-thread/src/main/predict-helpers.ts | 70 --------------- .../determine-suggestion-alignment.tests.ts | 90 ------------------- 2 files changed, 160 deletions(-) delete mode 100644 web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index f1ddfe38792..ce6cea75695 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -321,76 +321,6 @@ export function determineContextTransition( return transition; } -// TODO: Remove this and its associated unit tests! 
-/** - * Determines where the context for prediction-generation should be rooted and how - * much of the context it should replace. - * @param transition - * @param lexicalModel - * @returns - */ -export function determineSuggestionAlignment( - transition: ContextTransition, - tokenization: ContextTokenization, - lexicalModel: LexicalModel -): { - /** - * The context to use directly for generating predictions from the model. - */ - predictionContext: Context, - /** - * The total number of characters to delete for generated suggestions - * in order to replace the prediction root token entirely. - */ - deleteLeft: number -} { - const transitionEdits = tokenization.transitionEdits; - const context = transition.base.context; - const postContext = transition.final.context; - const inputTransform = transition.inputDistribution[0].sample; - let deleteLeft: number; - - // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist. - const wordbreak = determineModelWordbreaker(lexicalModel); - - // Is the token under construction newly-constructed / is there no pre-existing root? - if(tokenization.taillessTrueKeystroke && transitionEdits?.addedNewTokens) { - return { - // If the new token is due to whitespace or due to a different input type - // that would likely imply a tokenization boundary, infer 'new word' mode. - // Apply any part of the context change that is not considered to be up - // for correction. - predictionContext: models.applyTransform(tokenization.taillessTrueKeystroke, context), - // As the word/token being corrected/predicted didn't originally exist, - // there's no part of it to 'replace'. (Suggestions are applied to the - // pre-transform state.) - deleteLeft: 0 - }; - // If the tokenized context length is shorter... sounds like a backspace (or similar). - } else if (transitionEdits?.removedOldTokens) { - /* Ooh, we've dropped context here. Almost certainly from a backspace or - * similar effect. 
Even if we drop multiple tokens... well, we know exactly - * how many chars were actually deleted - `inputTransform.deleteLeft`. Since - * we replace a word being corrected/predicted, we take length of the - * remaining context's tail token in addition to however far was deleted to - * reach that state. - */ - deleteLeft = KMWString.length(wordbreak(postContext)) + inputTransform.deleteLeft; - } else { - // Suggestions are applied to the pre-input context, so get the token's original length. - // We're on the same token, so just delete its text for the replacement op. - deleteLeft = KMWString.length(wordbreak(context)); - } - - // Did the wordbreaker (or similar) append a blank token before the caret? If so, - // preserve that by preventing corrections from triggering left-deletion. - if(tokenization.tail.isEmptyToken) { - deleteLeft = 0; - } - - return { predictionContext: context, deleteLeft }; -} - /** * Given two ContextTokenizations related by context transition, this function * determines the tail-end range of the tokenization affected by the transition. 
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts deleted file mode 100644 index be2d1177571..00000000000 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { assert } from 'chai'; - -import { LexicalModelTypes } from '@keymanapp/common-types'; -import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; -import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; - -import { ContextState, ContextTransition, determineSuggestionAlignment, models } from "@keymanapp/lm-worker/test-index"; - -import CasingFunction = LexicalModelTypes.CasingFunction; -import Context = LexicalModelTypes.Context; -import TrieModel = models.TrieModel; - -const plainApplyCasing: CasingFunction = function(caseToApply, text) { - switch(caseToApply) { - case 'lower': - return text.toLowerCase(); - case 'upper': - return text.toUpperCase(); - case 'initial': - return plainApplyCasing('upper', text.charAt(0)) . concat(text.substring(1)); - default: - return text; - } -}; - -const plainCasedModel = new TrieModel( - jsonFixture('models/tries/english-1000'), { - languageUsesCasing: true, - applyCasing: plainApplyCasing, - wordBreaker: defaultBreaker, - searchTermToKey: function(text: string) { - // We're dealing with very simple English text; no need to normalize or remove diacritics here. 
- return plainApplyCasing('lower', text); - } - } -); - -describe('determineSuggestionAlignment', () => { - it('handles standard cases well - same token, no preservationTransforms', () => { - const context: Context = { - left: 'this is techn', - startOfBuffer: true, - endOfBuffer: true - }; - const baseState = new ContextState(context, plainCasedModel); - - const transition = new ContextTransition(baseState, 0); - transition.finalize(transition.base, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]); - - // transition, model - const results = determineSuggestionAlignment(transition, transition.final.displayTokenization, plainCasedModel); - - assert.deepEqual(results.predictionContext, context); - assert.equal(results.deleteLeft, "techn".length); - }); - - it('handles extension of prior token after backspace', () => { - const context: Context = { - left: 'this is tech ', - startOfBuffer: true, - endOfBuffer: true - }; - const baseState = new ContextState(context, plainCasedModel); - - const transition = baseState.analyzeTransition(context, [{sample: { insert: '', deleteLeft: 1 }, p: 1}]) - - // transition, model - const results = determineSuggestionAlignment(transition, transition.final.displayTokenization, plainCasedModel); - - assert.deepEqual(results.predictionContext, context); - assert.equal(results.deleteLeft, "tech".length + 1 /* for the deleted whitespace */); - }); - - it('handles extension of prior token after complex input with delete-left', () => { - const context: Context = { - left: 'this is tech ', - startOfBuffer: true, - endOfBuffer: true - }; - const baseState = new ContextState(context, plainCasedModel); - - const transition = baseState.analyzeTransition(context, [{sample: { insert: 'n', deleteLeft: 1 }, p: 1}]) - - // transition, model - const results = determineSuggestionAlignment(transition, transition.final.displayTokenization, plainCasedModel); - - assert.deepEqual(results.predictionContext, context); - assert.equal(results.deleteLeft, 
"techn".length + 1 /* for the deleted whitespace */); - }); -}); \ No newline at end of file From d558c868e3335cf821bac237ec01dc153262c46f Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 14 Apr 2026 14:42:12 -0500 Subject: [PATCH 03/16] feat(web): add prepareTokenizationSearch helper method This method is designed to determine which range of tokens, within each context variant, should be eligible for correction when generating predictions and corrections. Build-bot: skip build:web Test-bot: skip --- .../main/correction/tokenization-corrector.ts | 2 +- .../worker-thread/src/main/predict-helpers.ts | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index e47c6b04fad..b111aa9d394 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -154,7 +154,7 @@ export class TokenizationCorrector implements CorrectionSearchable boolean + filterClosure: (token: ContextToken, index?: number) => boolean ) { this.tokenization = tokenization; this.tailCorrectionLength = tailCorrectionLength; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index ce6cea75695..4b87f0c3e39 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -13,6 +13,7 @@ import { ContextTransition } from './correction/context-transition.js'; import { ExecutionTimer } from './correction/execution-timer.js'; import ModelCompositor from './model-compositor.js'; import { getBestTokenMatches } from 
'./correction/distance-modeler.js'; +import { TokenizationCorrector } from './correction/tokenization-corrector.js'; import { TokenResultMapping } from './correction/token-result-mapping.js'; const searchForProperty = defaultWordbreaker.searchForProperty; @@ -431,6 +432,39 @@ export function buildAndMapPredictions( return predictions; } +export function prepareTokenizationSearch( + transition: ContextTransition, + tokenizations: ContextTokenization[] +) { + // Goal - determine what parts of each tokenization are searchable & prep them for correction-search. + const tokenizationAnalyses = tokenizations.map((tokenization) => { + return { + tokenization: tokenization, + analysis: determineSuggestionRange(transition.base.displayTokenization, tokenization) + }; + }); + + const biggestCommonRemoval = tokenizationAnalyses.reduce( + (biggest, current) => biggest.length > current.analysis.tokensToRemove.length ? biggest : current.analysis.tokensToRemove, + [] as ContextToken[] + ); + + const tokenizationSetup = tokenizationAnalyses.map((tuple) => { + // These tokens are unaffected by the input whatsoever, though their + // probability may affect thresholding for the non-locked tokens. 
+ const unaffectedTokenCount = biggestCommonRemoval.length - tuple.analysis.tokensToRemove.length; + + const mutatedLength = tuple.analysis.tokensToPredict.length; + return new TokenizationCorrector(tuple.tokenization, mutatedLength, (token, index) => { + return index >= unaffectedTokenCount // is a modified token + && index == mutatedLength - 1 // TEMP: adjacent to the caret (TO BE REMOVED) + && correctionValidForAutoSelect(token.exampleInput); // and is eligible text-correction + }); + }); + + return tokenizationSetup; +} + /** * This method performs the correction-search and model-lookup operations for * prediction generation by using the user's context state and potential From fdd65c0f7476ab21ce84a5c13f0a636557c2f0b5 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 30 Apr 2026 08:28:43 -0500 Subject: [PATCH 04/16] feat(web): rework buildAndMapPredictions for multi-token predictions Build-bot: skip build:web Test-bot: skip --- .../main/correction/token-result-mapping.ts | 4 + .../main/correction/tokenization-corrector.ts | 13 +- .../worker-thread/src/main/predict-helpers.ts | 144 +++++++++++++++--- 3 files changed, 132 insertions(+), 29 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts index c85e9e9b6d4..862ac1f740f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/token-result-mapping.ts @@ -75,6 +75,10 @@ export class TokenResultMapping implements CorrectionResultMapping, return this.node; } + get inputCount(): number { + return this.matchingSpace.inputCount; + } + get inputSequence(): ProbabilityMass[] { return this.node.priorInput; } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts 
b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index b111aa9d394..3b71b2eb4f8 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -35,7 +35,8 @@ export type TokenResult = { matchString: string, inputSamplingCost: number, knownCost: number, - totalCost: number + totalCost: number, + inputCount: number } /** @@ -195,9 +196,10 @@ export class TokenizationCorrector implements CorrectionSearchable, - costFactor: number + tokenizationCorrection: TokenizationResultMapping, ): CorrectionPredictionTuple[] { const model = transition.final.model; + const tokenization = tokenizationCorrection.matchingSpace.tokenization; const applicationTarget = transition.base.displayTokenization; const { tokensToRemove, tokensToPredict } = determineSuggestionRange(applicationTarget, tokenization); @@ -408,28 +407,123 @@ export function buildAndMapPredictions( endOfBuffer: false }; - // Replace the existing context with the correction. - const correctionTransform: Transform = { - insert: match.matchString, // insert correction string - deleteLeft: 0, - id: transition.transitionId // The correction should always be based on the most recent external transform/transcription ID. - } + const correctionTransforms = tokenizationCorrection.matchedResult.map((correction, i) => { + return { + insert: correction.matchString, // insert correction string + deleteLeft: i == 0 ? deleteLeft : 0, + id: transition.transitionId // The correction should always be based on the most recent external transform/transcription ID. 
+ }; + }); - const rootCost = match.totalCost; - const predictionRoot = { - sample: correctionTransform, - p: Math.exp(-rootCost * costFactor) - }; + const correctionCost = tokenizationCorrection.matchedResult.map((correction) => { + let rootCost = correction.totalCost; + /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost + * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if + * there are significantly more likely words. We only need this to allow very minor fat-finger + * adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on + * key borders. + * + * Technically, the probabilities this produces won't be normalized as-is... but there's no + * true NEED to do so for it, even if it'd be 'nice to have'. Consistently tracking when + * to apply it could become tricky, so it's simpler to leave out. + * + * Worst-case, it's possible to temporarily add normalization if a code deep-dive + * is needed in the future. + */ + if(correction.inputCount <= 1) { + /* Suppose a key distribution: most likely with p=0.5, second-most with 0.4 - a pretty + * ambiguous case that would only arise very near the center of the boundary between two keys. + * Raising (0.5/0.4)^16 ~= 35.53. (At time of writing, SINGLE_CHAR_KEY_PROB_EXPONENT = 16.) + * That seems 'within reason' for correction very near boundaries. + * + * So, with the second-most-likely key being that close in probability, its best suggestion + * must be ~ 35.5x more likely than that of the truly-most-likely key to "win". So, it's not + * a HARD cutoff, but more of a 'soft' one. Keeping the principles in mind documented above, + * it's possible to tweak this to a more harsh or lenient setting if desired, rather than + * being totally "all or nothing" on which key is taken for highly-ambiguous keypresses. + */ + rootCost *= ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT; // note the `Math.exp` below. 
+ } + + return Math.exp(-rootCost); + }).reduce((accum, curr) => accum * curr, 1); - let predictions = predictFromCorrections(model, [predictionRoot], emptyContext); - predictions.forEach((entry) => { - entry.preservationTransform = tokenization.taillessTrueKeystroke; - // // Will need an extra lookup layer if the suggestion is generated from within a cluster. - // entry.baseTokenization = transition.final.tokenizationSourceMap.get(tokenization); - entry.prediction.sample.transform.deleteLeft = deleteLeft; + const predictionComponents = correctionTransforms.map((correctionTransform, i) => { + const predictions = model.predict(correctionTransform, emptyContext); + + // Failsafe: if there are no matching predictions, create a fake prediction + // matching the original text. + if(predictions.length == 0) { + predictions.push({ + sample: { + transform: correctionTransform, + displayAs: correctionTransform.insert + }, + // It's not found in the lexicon, so we'll take a low probability for it. + // + // Edit penalties will be applied via the correction component separately later on. + p: -Math.exp(EDIT_DISTANCE_COST_SCALE) + }); + } + + // Regardless of origin, overwrite the transform's deleteLeft value with what it should actually hold. + predictions.forEach((entry) => { + entry.sample.transform.deleteLeft = deleteLeft; + }); + + // Use traversals if possible - extract the most likely entry that is on the traversal, + // rather than predicting (and possibly extending) tokens not adjacent to the caret. + // + // Also, fall back to the actual correction string should prediction not be valid here. + return i == correctionTransforms.length - 1 ? predictions : [predictions[0]]; + }); + + // Constructs a common prefix for all but the final token's component. 
+ const predictionPrefix = predictionComponents + .slice(0, predictionComponents.length-1) + .reduce((accum, curr) => models.buildMergedTransform(accum, curr[0].sample.transform), { insert: '', deleteLeft: 0 }); + const prefixProb = predictionComponents + .slice(0, predictionComponents.length-1) + .reduce((accum, curr) => accum * curr[0].p, 1) + + const completePredictionTuples: CorrectionPredictionTuple[] = predictionComponents[predictionComponents.length-1].map((prediction) => { + const predictionCost = prediction.p * prefixProb; + return { + // Will need to do this differently. We want to have each component + // individualized b/c casing. Case should be maintained for prior tokens + // and managed independently for each. + // + // detectCurrentCasing is designed to determine casing based on context; + // makes sense for 'context up to each token'. + // + // applySuggestionCasing applies onto suggestions, so we'll want to build + // the FULL suggestion AFTER applying casing changes (to each token's + // suggestion component). + prediction: { + sample: { + transformId: transition.transitionId, + transform: models.buildMergedTransform(predictionPrefix, prediction.sample.transform), + displayAs: models.buildMergedTransform(predictionPrefix, prediction.sample.transform).insert // should composite the displayAs strings instead... + }, + p: predictionCost, + }, + correction: { + // Is used partly for word-casing, partly for auto-select enabling. + sample: '', // plain correction string instead... + p: correctionCost + }, + totalProb: predictionCost * correctionCost, + matchLevel: SuggestionSimilarity.none, + // Long-term, we shouldn't have `.preservationTransform` here. + // + // Needed for now until the search actually operates based on + // TokenizationCorrector, rather than the half-converted use currently in + // place. 
+ preservationTransform: tokenization.taillessTrueKeystroke + } }); - return predictions; + return completePredictionTuples; } export function prepareTokenizationSearch( @@ -576,7 +670,9 @@ export async function correctAndEnumerate( */ const costFactor = (tokenization.tail.inputCount <= 1) ? ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1; - const predictions = buildAndMapPredictions(transition, tokenization, match, costFactor); + const suggestionRange = determineSuggestionRange(transition.base.displayTokenization, tokenization) + const corrector = new TokenizationCorrector(tokenization, suggestionRange.tokensToPredict.length, () => true); + const predictions = buildAndMapPredictions(transition, new TokenizationResultMapping([match], corrector)); // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions. if(predictions.length > 0 && bestCorrectionCost === undefined) { From 6c1170d27a3dfc108201e3ce2ae52a2fcbc30bd5 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 6 May 2026 13:21:08 -0500 Subject: [PATCH 05/16] change(web): simplify mapWhitespacedTokenization requirements To better handle inputs that shift the word-boundary in some custom models and models released before Keyman 14.0, this PR provides generalized re-use of the whitespace-based token-transition algorithm used for our most prominently-supported models. 
Build-bot: skip build:web Test-bot: skip --- .../main/correction/context-tokenization.ts | 341 ++++++++++-------- 1 file changed, 187 insertions(+), 154 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index fc2f81615c1..95d3125c2a9 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -334,7 +334,7 @@ export class ContextTokenization { } /** - * Given the existing tokenization and an incoming input `Transform`, this + * Given this existing tokenization and an incoming input `Transform`, this * method precomputes how both the current, pre-application tokenization will * be altered and how the incoming Transform will be tokenized. * @@ -351,158 +351,7 @@ export class ContextTokenization { transform: Transform, edgeOptions?: EdgeWindowOptions ): TokenizationTransitionEdits { - // Step 4: now that our window's been properly updated, determine what the - // input's effects on the context is. - // - // Context does not slide within this function. - // - // Assumption: this alignment cannot fail; we KNOW there's a solid - // before-and-after relationship here, and we can base it on the results of - // a prior syncToSourceWindow call. - // - // We don't wish to do the full tokenization here - we only want to check - // over the last few tokens that might reasonably shift. We also want to - // batch effects. - - // Do not mutate the original transform; it can cause unexpected assertion - // effects in unit tests. 
- const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; - const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions); - const { - retokenizationText, - editBoundary, - sliceIndex: edgeSliceIndex - } = edgeWindow; - // Prevent mutation of the original return property. - const stackedDeletes = edgeWindow.deleteLengths.slice(); - - const tokenize = determineModelTokenizer(lexicalModel); - const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); - if(postTokenization.length == 0) { - postTokenization.push(''); - } - const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); - - // What does the edge's retokenization look like when we remove the inserted portions? - const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); - const insertBoundaryToken = postTokenization[firstInsertPostIndex]; - - // Note: requires that helpers have not mutated `stackedInserts`. - const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); - - // Do not preserve empty tokens here, even if tokenization normally would produce one. - // It's redundant and replaceable for tokenization batching efforts. - if(uninsertedBoundaryToken != '') { - retokenizedEdge.push(uninsertedBoundaryToken); - } - - // We've found the root token within the root context state to which deletes (and inserts) - // may be applied. - // We've also found the last post-application token to which transform changes contributed. - // How do these indices line up - we need to properly construct and index our transforms, - // but 'merge' and 'split' edits can mess up that indexing. 
- - const currentTokens = this.tokens; - const preTokenization = currentTokens - .slice(edgeSliceIndex, editBoundary.tokenIndex+1) - .map(t => t.exampleInput); - - // Determine the effects of splits & merges as applied to the original - // cached context state. - const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( - preTokenization, - postTokenization.slice(0, firstInsertPostIndex+1) - ); - - /* - * Final steps: We can now safely index the transforms. Let's do it! - * 1. Determine the first index a Transform may align to - * 2. Build the transforms - * - * Notes: - * - text applied to the end of a 'merged' token at the tail: should have - * index 0, not -1. - * - pretokenization index will mismatch by -1: -SUM(merge size - 1) - * - Ex: can + ' + t => can't - * -1 0 0 - * - text applied to the end of a 'split' token at the tail: should also - * have index 0, not 1. - * - posttokenization index will mismatch by +1: SUM(split size - 1) - * - new token after 'split': index 1 - * - Ex: can' + ? => can + ' + ? - * 0 -1 0 1 - * - * The first transform applies at the end of the retokenized zone and its - * associated index. The question: were there deletes that occurred? - */ - - const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; - let shiftDeletes = false; - // first popped entry == 0 - a delete no-op. - if(stackedDeletes[stackedDeletes.length - 1] == 0) { - // the boundary indices found by both methods above differ - if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { - shiftDeletes = true; - } - - // there are no inserts, so we don't affect the boundary token we landed on. - if(stackedDeletes.length > 1 && transform.insert == '') { - shiftDeletes = true; - } - } - - if(shiftDeletes) { - // Do not add a zero-length delete if we're not actually altering the - // corresponding token at all. - stackedDeletes.pop(); - } - - // The first delete always applies to index 0. 
If the built edge window - // omits a context-final empty-string, adjust the tokenization indices - // accordingly. - const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); - // Mutates stackedInserts, stackedDeletes. - const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); - const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); - - // If there's an empty transform in the 0 position and we already know we're - // dropping tokens - and only deleting - we're dropping an - // otherwise-untracked empty token - make sure it's included! - const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); - // Past that, if we have more delete entries than insert entries for our transforms, we - // dropped some tokens outright. - const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); - - // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' - // and not caused by transforms. All transforms always apply in sequence at the end. - const unmappedEdits: EditTuple[] = []; - for(let i = 0; i < editPath.length - transformMap.size; i++) { - const op = editPath[i].op; - switch(op) { - case 'merge': - case 'split': - // already calculated - // can fall through to the `continue;` line. - case 'match': - continue; - default: - // Should only be substitutions here. - // We may wish to add extra analysis in the future when supporting - // prediction from multiple competing tokenizations. 
- unmappedEdits.push(editPath[i] as EditTuple); - } - } - - return { - alignment: { - edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, - merges, - splits, - unmappedEdits, - removedTokenCount - }, - tokenizedTransform: transformMap, - }; + return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions); } /** @@ -763,6 +612,190 @@ interface RetokenizedEdgeWindow extends EdgeWindow { retokenization: string[]; } +export interface ContextTokenLike { + exampleInput: string; + isPartial?: boolean; + sourceRangeKey?: string; +} + +/** + * Given an existing tokenization and an incoming input `Transform`, this + * method precomputes how both the current, pre-application tokenization will + * be altered and how the incoming Transform will be tokenized. + * + * This function is able to operate with a reduced interface, not requiring + * the full ContextToken/ContextState/etc subsystem and its related + * SearchQuotientNode requirements. + * + * Note that this method is designed for use with languages that employ + * classical space-based wordbreaking. Do not use it for languages that need + * dictionary-based wordbreaking support! + * @param tokens + * @param lexicalModel + * @param transform + * @param edgeOptions + * @returns + */ +export function mapWhitespacedTokenization( + tokens: ContextTokenLike[], + lexicalModel: LexicalModel, + transform: Transform, + edgeOptions?: EdgeWindowOptions +): TokenizationTransitionEdits { + // Step 4: now that our window's been properly updated, determine what the + // input's effects on the context is. + // + // Context does not slide within this function. + // + // Assumption: this alignment cannot fail; we KNOW there's a solid + // before-and-after relationship here, and we can base it on the results of + // a prior syncToSourceWindow call. + // + // We don't wish to do the full tokenization here - we only want to check + // over the last few tokens that might reasonably shift. 
We also want to + // batch effects. + + // Do not mutate the original transform; it can cause unexpected assertion + // effects in unit tests. + const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; + const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions); + const { + retokenizationText, + editBoundary, + sliceIndex: edgeSliceIndex + } = edgeWindow; + // Prevent mutation of the original return property. + const stackedDeletes = edgeWindow.deleteLengths.slice(); + + const tokenize = determineModelTokenizer(lexicalModel); + const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); + if(postTokenization.length == 0) { + postTokenization.push(''); + } + const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); + + // What does the edge's retokenization look like when we remove the inserted portions? + const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); + const insertBoundaryToken = postTokenization[firstInsertPostIndex]; + + // Note: requires that helpers have not mutated `stackedInserts`. + const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); + + // Do not preserve empty tokens here, even if tokenization normally would produce one. + // It's redundant and replaceable for tokenization batching efforts. + if(uninsertedBoundaryToken != '') { + retokenizedEdge.push(uninsertedBoundaryToken); + } + + // We've found the root token within the root context state to which deletes (and inserts) + // may be applied. + // We've also found the last post-application token to which transform changes contributed. + // How do these indices line up - we need to properly construct and index our transforms, + // but 'merge' and 'split' edits can mess up that indexing. 
+ + const currentTokens = tokens; + const preTokenization = currentTokens + .slice(edgeSliceIndex, editBoundary.tokenIndex+1) + .map(t => t.exampleInput); + + // Determine the effects of splits & merges as applied to the original + // cached context state. + const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( + preTokenization, + postTokenization.slice(0, firstInsertPostIndex+1) + ); + + /* + * Final steps: We can now safely index the transforms. Let's do it! + * 1. Determine the first index a Transform may align to + * 2. Build the transforms + * + * Notes: + * - text applied to the end of a 'merged' token at the tail: should have + * index 0, not -1. + * - pretokenization index will mismatch by -1: -SUM(merge size - 1) + * - Ex: can + ' + t => can't + * -1 0 0 + * - text applied to the end of a 'split' token at the tail: should also + * have index 0, not 1. + * - posttokenization index will mismatch by +1: SUM(split size - 1) + * - new token after 'split': index 1 + * - Ex: can' + ? => can + ' + ? + * 0 -1 0 1 + * + * The first transform applies at the end of the retokenized zone and its + * associated index. The question: were there deletes that occurred? + */ + + const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; + let shiftDeletes = false; + // first popped entry == 0 - a delete no-op. + if(stackedDeletes[stackedDeletes.length - 1] == 0) { + // the boundary indices found by both methods above differ + if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { + shiftDeletes = true; + } + + // there are no inserts, so we don't affect the boundary token we landed on. + if(stackedDeletes.length > 1 && transform.insert == '') { + shiftDeletes = true; + } + } + + if(shiftDeletes) { + // Do not add a zero-length delete if we're not actually altering the + // corresponding token at all. + stackedDeletes.pop(); + } + + // The first delete always applies to index 0. 
If the built edge window + // omits a context-final empty-string, adjust the tokenization indices + // accordingly. + const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); + // Mutates stackedInserts, stackedDeletes. + const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); + const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); + + // If there's an empty transform in the 0 position and we already know we're + // dropping tokens - and only deleting - we're dropping an + // otherwise-untracked empty token - make sure it's included! + const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); + // Past that, if we have more delete entries than insert entries for our transforms, we + // dropped some tokens outright. + const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); + + // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' + // and not caused by transforms. All transforms always apply in sequence at the end. + const unmappedEdits: EditTuple[] = []; + for(let i = 0; i < editPath.length - transformMap.size; i++) { + const op = editPath[i].op; + switch(op) { + case 'merge': + case 'split': + // already calculated + // can fall through to the `continue;` line. + case 'match': + continue; + default: + // Should only be substitutions here. + // We may wish to add extra analysis in the future when supporting + // prediction from multiple competing tokenizations. + unmappedEdits.push(editPath[i] as EditTuple); + } + } + + return { + alignment: { + edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, + merges, + splits, + unmappedEdits, + removedTokenCount + }, + tokenizedTransform: transformMap, + }; +} + /** * Constructs a window on one side of the represented context that is aligned to * existing tokenization. 
@@ -777,7 +810,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow { * @returns */ export function buildEdgeWindow( - currentTokens: ContextToken[], + currentTokens: ContextTokenLike[], // Requires deleteRight be explicitly set. transform: Transform & { deleteRight: number }, applyAtFront: boolean, From daea6e561f016f41f9d60d4c4b95dbaef4a7ff94 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 6 May 2026 15:07:43 -0500 Subject: [PATCH 06/16] change(web): rework traversalless prediction, add mild whitespace-correction Build-bot: skip build:web Test-bot: skip --- .../templates/src/tokenization.ts | 4 + .../worker-thread/src/main/model-helpers.ts | 3 +- .../worker-thread/src/main/predict-helpers.ts | 131 +++++++----------- .../predict-from-corrections.tests.ts | 8 +- 4 files changed, 58 insertions(+), 88 deletions(-) diff --git a/web/src/engine/predictive-text/templates/src/tokenization.ts b/web/src/engine/predictive-text/templates/src/tokenization.ts index fd8ed28d5ca..47ef927fa5b 100644 --- a/web/src/engine/predictive-text/templates/src/tokenization.ts +++ b/web/src/engine/predictive-text/templates/src/tokenization.ts @@ -95,6 +95,10 @@ export function tokenize( currentIndex = nextIndex; } + if(tokenization.left.length == 0) { + tokenization.left.push({text: '', isWhitespace: false}); + } + // New step 2: handle any rejoins needed. 
// Handle any desired special handling for directly-pre-caret scenarios - where for this diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts index 071cad588c5..7b5115e308f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-helpers.ts @@ -71,7 +71,8 @@ export function determineModelTokenizer(model: LexicalModel) { if(model.wordbreaker) { return models.tokenize(model.wordbreaker, context); } else { - return null; + // Not ideal for pre-14.0 models, but it'll do for now. + return models.tokenize(wordBreakers.default, context); } } } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index ba892d76062..4d73e53e8d3 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -5,7 +5,7 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre import TransformUtils from './transformUtils.js'; import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; -import { ContextTokenization } from './correction/context-tokenization.js'; +import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; import { ContextToken } from './correction/context-token.js'; import { ContextState, determineContextSlideTransform } from './correction/context-state.js'; @@ -155,77 +155,6 @@ export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: Correctio return b.totalProb - a.totalProb; } -export async function correctAndEnumerateWithoutTraversals( - lexicalModel: LexicalModel, - 
transformDistribution: Distribution, - context: Context -): Promise<{ - /** - * For models that support correction-search caching, this provides the - * cached object corresponding to this method's operation. - * - * Otherwise, is `null`. - */ - postContextState?: ContextState; - - /** - * The suggestions generated based on the user's input state. - */ - rawPredictions: CorrectionPredictionTuple[]; - - /** - * The id of a prior ContextTransition event that triggered a Suggestion found - * at the end of the Context. Will be undefined if no edits have occurred - * since the Suggestion was applied. - */ - revertableTransitionId?: number -}> { - const inputTransform = transformDistribution[0].sample; - let rawPredictions: CorrectionPredictionTuple[] = []; - - let predictionRoots: ProbabilityMass[]; - - // Only allow new-word suggestions if space was the most likely keypress. - const allowSpace = TransformUtils.isWhitespace(inputTransform); - const allowBksp = TransformUtils.isBackspace(inputTransform); - - // Generates raw prediction distributions for each valid input. Can only 'correct' - // against the final input. - // - // This is the old, 12.0-13.0 'correction' style. - if(allowSpace) { - // Detect start of new word; prevent whitespace loss here. - predictionRoots = [{sample: inputTransform, p: 1.0}]; - } else { - predictionRoots = transformDistribution.map((alt) => { - let transform = alt.sample; - - // Filter out special keys unless they're expected. - if(TransformUtils.isWhitespace(transform) && !allowSpace) { - return null; - } else if(TransformUtils.isBackspace(transform) && !allowBksp) { - return null; - } - - return alt; - }); - } - - // Remove `null` entries. - predictionRoots = predictionRoots.filter(tuple => !!tuple); - - // Running in bulk over all suggestions, duplicate entries may be possible. 
- rawPredictions = predictFromCorrections(lexicalModel, predictionRoots, context); - if(allowSpace) { - rawPredictions.forEach((entry) => entry.preservationTransform = inputTransform); - } - - return { - postContextState: null, - rawPredictions: rawPredictions - }; -} - /** * Determines the most recent ContextState corresponding to the incoming * Context, assuming no context-reset operations have occurred. Their contents @@ -602,7 +531,10 @@ export async function correctAndEnumerate( // It's mostly here to support models compiled before Keyman 14.0, which was // when the `LexiconTraversal` pattern was established. if(!contextTracker) { - return correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context); + return { + postContextState: null, + rawPredictions: correctAndEnumerateWithoutTraversals(lexicalModel, transformDistribution, context) + }; } // 'else': the current, 14.0+ pattern, which is able to leverage @@ -744,35 +676,68 @@ export function shouldStopSearchingEarly( * @param context * @returns */ -export function predictFromCorrections( +export function correctAndEnumerateWithoutTraversals( lexicalModel: LexicalModel, corrections: ProbabilityMass[], context: Context ): CorrectionPredictionTuple[] { let returnedPredictions: CorrectionPredictionTuple[] = []; + const wordbreak = determineModelWordbreaker(lexicalModel); + const tokenizer = determineModelTokenizer(lexicalModel); + const tokenization = tokenizer(context); for(let correction of corrections) { - let predictions = lexicalModel.predict(correction.sample, context); + // Step 1: determine tokenization effects. We can't use the + // ContextTokenization pattern due to the model's lack of LexiconTraversal + // support, though. 
+ + const tokenizedCorrection = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text} }), lexicalModel, correction.sample).tokenizedTransform; + const deleteLeft = [...tokenizedCorrection.values()].reduce((total, curr) => total + curr.deleteLeft, 0); + + const tokenizedCorrectionEntries = [...tokenizedCorrection.entries()]; + const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).map((e) => e[1]).reduce((accum, curr) => { + return models.buildMergedTransform(accum, {...curr, deleteLeft: 0}); + }, { insert: '', deleteLeft: 0, id: correction.sample.id}); + preservationTransform.deleteLeft = deleteLeft; + + // Step 2: predict based on the final token. + const emptyContext: Context = { + left: '', + startOfBuffer: true, + endOfBuffer: true + }; - const { sample: correctionTransform, p: correctionProb } = correction; - const correctionRoot = wordbreak(models.applyTransform(correction.sample, context)); + const tailCorrection = tokenizedCorrectionEntries[tokenizedCorrectionEntries.length-1][1]; + let predictions = lexicalModel.predict(tailCorrection, emptyContext); + // Step 3: create the intermediate prediction data entries for each generated prediction let predictionSet = predictions.map((pair: ProbabilityMass) => { // Let's not rely on the model to copy transform IDs. // Only bother is there IS an ID to copy. - if(correctionTransform.id !== undefined) { - pair.sample.transformId = correctionTransform.id; + if(correction.sample.id !== undefined) { + pair.sample.transformId = correction.sample.id; + } + + let correctionText: string; + if(tokenizedCorrectionEntries.length != 1) { + correctionText = correction.sample.insert; + // deleteLeft: 0; it's pre-applied within preservationTransform. + } else { + // Use the deleteLeft & tokenize. 
+ const postContext = models.applyTransform(correction.sample, context); + correctionText = wordbreak(postContext); } let tuple: CorrectionPredictionTuple = { prediction: pair, correction: { - sample: correctionRoot, - p: correctionProb + sample: correctionText, + p: correction.p }, - totalProb: pair.p * correctionProb, - matchLevel: SuggestionSimilarity.none + totalProb: pair.p * correction.p, + matchLevel: SuggestionSimilarity.none, + preservationTransform }; return tuple; }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts index 54eec729554..a99187defa5 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts @@ -4,7 +4,7 @@ import { assert } from 'chai'; import { deepCopy } from "@keymanapp/web-utils"; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { models, predictFromCorrections, tupleDisplayOrderSort } from "@keymanapp/lm-worker/test-index"; +import { models, correctAndEnumerateWithoutTraversals, tupleDisplayOrderSort } from "@keymanapp/lm-worker/test-index"; import CasingFunction = LexicalModelTypes.CasingFunction; import Context = LexicalModelTypes.Context; @@ -112,7 +112,7 @@ describe('predictFromCorrections', () => { futureSuggestions: [ dummied_suggestions ] }); - const predictions = predictFromCorrections(model, correctionDistribution, context); + const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); predictions.forEach((entry) => assert.equal(entry.correction.sample, 'Its')); predictions.forEach((entry) => assert.equal(entry.correction.p, 0.6)); predictions.sort(tupleDisplayOrderSort); @@ -164,7 
+164,7 @@ describe('predictFromCorrections', () => { futureSuggestions: [ dummied_suggestions ] }); - const predictions = predictFromCorrections(model, correctionDistribution, context); + const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); predictions.forEach((entry) => assert.equal(entry.correction.sample, 'Its')); predictions.forEach((entry) => assert.equal(entry.correction.p, 0.6)); predictions.sort(tupleDisplayOrderSort); @@ -247,7 +247,7 @@ describe('predictFromCorrections', () => { futureSuggestions: dummied_suggestions }); - const predictions = predictFromCorrections(model, correctionDistribution, context); + const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); predictions.sort(tupleDisplayOrderSort); assert.sameOrderedMembers(predictions.map((entry) => entry.prediction.sample.displayAs), ["is", "it's", "isn't", "its"]); From 9364e0018196caa4636aea5858011446a5a279f9 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 7 May 2026 16:36:58 -0500 Subject: [PATCH 07/16] fix(web): implement prediction-data correction string Lack of this string can break auto-correction and casing behaviors - and actually _did_ within engine/main! --- .../predictive-text/worker-thread/src/main/predict-helpers.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index ba892d76062..ee7595d0eed 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -509,7 +509,7 @@ export function buildAndMapPredictions( }, correction: { // Is used partly for word-casing, partly for auto-select enabling. - sample: '', // plain correction string instead... 
+ sample: correctionTransforms[correctionTransforms.length-1].insert, // plain correction string instead... p: correctionCost }, totalProb: predictionCost * correctionCost, From 698be9bc70c19dc4558e4a78f791574932945b72 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 7 May 2026 16:38:21 -0500 Subject: [PATCH 08/16] docs(web): extend comment for last commit's change --- .../predictive-text/worker-thread/src/main/predict-helpers.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index ee7595d0eed..8611c6be249 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -509,7 +509,8 @@ export function buildAndMapPredictions( }, correction: { // Is used partly for word-casing, partly for auto-select enabling. - sample: correctionTransforms[correctionTransforms.length-1].insert, // plain correction string instead... + // Is already the full word, as that's what is provided by TokenizationCorrector. 
+ sample: correctionTransforms[correctionTransforms.length-1].insert, p: correctionCost }, totalProb: predictionCost * correctionCost, From dacfc136ceb70146dc46bb45ddd793b9439351aa Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Fri, 8 May 2026 15:45:19 -0500 Subject: [PATCH 09/16] fix(web): adjust tokenization unit test expectations to match --- .../predictive-text/templates/tokenization.tests.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts index 3bc636c4128..0aa4f9551ed 100644 --- a/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/templates/tokenization.tests.ts @@ -175,7 +175,7 @@ describe('Tokenization functions', function() { }); it('properly handles empty-context cases', function() { - // Wordbreaking on a empty space => no word. + // Wordbreaking on a empty space => no word, but empty initial token. let context = { left: '', startOfBuffer: true, right: '', endOfBuffer: true @@ -184,7 +184,7 @@ describe('Tokenization functions', function() { let tokenization = models.tokenize(wordBreakers.default, context); let expectedResult: models.Tokenization = { - left: [], + left: [{text: '', isWhitespace: false}], right: [], caretSplitsToken: false }; @@ -193,11 +193,11 @@ describe('Tokenization functions', function() { }); it('properly handles null context cases', function() { - // Wordbreaking on a empty space => no word. + // Wordbreaking on a empty space => no word, but empty initial token. 
let tokenization = models.tokenize(wordBreakers.default, null); let expectedResult: models.Tokenization = { - left: [], + left: [{text: '', isWhitespace: false}], right: [], caretSplitsToken: false }; From 023a560afeeb427342532ba67fbcb023643649d4 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 12 May 2026 11:57:47 -0500 Subject: [PATCH 10/16] fix(web): fix bugs in createDefaultKeep, extend unit testing It turns out that #15766 did not perfectly address all cases for generation of default "keep" suggestions. This PR will remedy the situation. Build-bot: skip build:web Test-bot: skip --- .../src/main/model-compositor.ts | 2 +- .../worker-thread/src/main/predict-helpers.ts | 16 +- .../create-default-keep.tests.ts | 212 +++++++++++++++++- 3 files changed, 219 insertions(+), 11 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts index a518642d2ce..8a4a53035cf 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts @@ -177,7 +177,7 @@ export class ModelCompositor { // the token, also add a 'keep' suggestion (with `.matchesModel = false`) // matching it. if(!hasExistingKeep) { - const baseTuple = createDefaultKeep(this.lexicalModel, context, transformDistribution[0]); + const baseTuple = createDefaultKeep(this.lexicalModel, postContext, transformDistribution[0]); // Will be re-sorted shortly after this; just use the simple O(1) method here // and let sorting put it in place. 
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 28968916e44..c8a953b79c2 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -909,27 +909,29 @@ export function processSimilarity( * This method is designed for use when no appropriate 'keep' suggestion was * generated by the correction-search process. * @param lexicalModel - * @param context + * @param postContext * @param trueInput * @returns */ export function createDefaultKeep( lexicalModel: LexicalModel, - context: Context, + postContext: Context, trueInput: ProbabilityMass ): CorrectionPredictionTuple { const { sample: inputTransform, p: inputTransformProb } = trueInput; const wordbreak = determineModelWordbreaker(lexicalModel); - const postContext = models.applyTransform(inputTransform, context); const truePrefix = wordbreak(postContext); + const truePrefixLen = KMWString.length(truePrefix); + const inputInsertLen = KMWString.length(trueInput.sample.insert) + const tokenPrefixLen = truePrefixLen - Math.max(0, inputInsertLen - trueInput.sample.deleteLeft); - // Generate a full-word 'keep' replacement like other suggestions when one is not otherwise - // produced; we want to replace the full token in the same manner used for other suggestions. - const basePrefixLength = KMWString.length(truePrefix) - KMWString.length(inputTransform.insert) + inputTransform.deleteLeft; + // Generate a full-word 'keep' replacement like other suggestions when one is + // not otherwise produced; we want to replace the full token in the same + // manner used for other suggestions. const keepTransform = { insert: truePrefix, - deleteLeft: basePrefixLength + deleteLeft: Math.max(0, trueInput.sample.deleteLeft - inputInsertLen) + (tokenPrefixLen < 0 ? 
truePrefixLen : tokenPrefixLen) }; let keepSuggestion = models.transformToSuggestion(keepTransform); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts index 6b5981c8343..40fd2dafc6f 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts @@ -11,6 +11,7 @@ import { assert } from 'chai'; import { LexicalModelTypes } from "@keymanapp/common-types"; import * as wordBreakers from '@keymanapp/models-wordbreakers'; +import { applyTransform } from '@keymanapp/models-templates'; import { CorrectionPredictionTuple, createDefaultKeep, models, SuggestionSimilarity } from "@keymanapp/lm-worker/test-index"; @@ -91,8 +92,8 @@ const testModelWithCasing = new DummyModel({ // No suggestions needed here, so we don't define any. 
}); -describe('produceKeep', () => { - it(`creates an 'exact'-match suggestion based on primary input and current context`, () => { +describe('createDefaultKeep', () => { + it(`creates an 'exact'-match suggestion based on simple primary input`, () => { const context: Context = { left: 'iphon', right: '', @@ -129,7 +130,212 @@ describe('produceKeep', () => { matchLevel: SuggestionSimilarity.exact }; - const tuple = createDefaultKeep(testModelWithCasing, context, trueInput); + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); + assert.deepEqual(tuple, expectedKeep); + }); + + it(`creates an 'exact'-match suggestion based on full word after a backspace`, () => { + const context: Context = { + left: 'iphone ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: '', + deleteLeft: 1 + }, + p: 1 + }; + + const expectedKeep: CorrectionPredictionTuple = { + correction: { + sample: 'iphone', + p: 1 + }, + prediction: { + sample: { + transform: { + insert: 'iphone', + deleteLeft: 7 + }, + displayAs: '', + matchesModel: false, + tag: 'keep' + }, + p: 1 + }, + totalProb: 1, + matchLevel: SuggestionSimilarity.exact + }; + + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); + assert.deepEqual(tuple, expectedKeep); + }); + + it(`creates an 'exact'-match suggestion based on complex deletion`, () => { + const context: Context = { + left: 'iphone a', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'e', + deleteLeft: 3 + }, + p: 1 + }; + + const expectedKeep: CorrectionPredictionTuple = { + correction: { + sample: 'iphone', + p: 1 + }, + prediction: { + sample: { + transform: { + insert: 'iphone', + deleteLeft: 8 + }, + displayAs: '', + matchesModel: false, + tag: 'keep' + }, + p: 1 + }, + totalProb: 1, + matchLevel: 
SuggestionSimilarity.exact + }; + + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); + assert.deepEqual(tuple, expectedKeep); + }); + + it(`creates an 'exact'-match suggestion based on complex insertion`, () => { + const context: Context = { + left: 'iphon', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'es and', + deleteLeft: 0 + }, + p: 1 + }; + + const expectedKeep: CorrectionPredictionTuple = { + correction: { + sample: 'and', + p: 1 + }, + prediction: { + sample: { + transform: { + insert: 'and', + deleteLeft: 3 + }, + displayAs: '', + matchesModel: false, + tag: 'keep' + }, + p: 1 + }, + totalProb: 1, + matchLevel: SuggestionSimilarity.exact + }; + + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); + assert.deepEqual(tuple, expectedKeep); + }); + + it(`creates an 'exact'-match suggestion based on complex replacement`, () => { + const context: Context = { + left: 'iphone ', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 's', + deleteLeft: 1 + }, + p: 1 + }; + + const expectedKeep: CorrectionPredictionTuple = { + correction: { + sample: 'iphones', + p: 1 + }, + prediction: { + sample: { + transform: { + insert: 'iphones', + deleteLeft: 7 + }, + displayAs: '', + matchesModel: false, + tag: 'keep' + }, + p: 1 + }, + totalProb: 1, + matchLevel: SuggestionSimilarity.exact + }; + + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); + assert.deepEqual(tuple, expectedKeep); + }); + + it(`creates an empty 'exact'-match suggestion after adding a wordbreak`, () => { + const context: Context = { + left: 'iphon', + right: '', + startOfBuffer: true, + endOfBuffer: true + }; + + const trueInput: ProbabilityMass = { + sample: { + insert: 'e ', + 
deleteLeft: 0 + }, + p: 1 + }; + + const expectedKeep: CorrectionPredictionTuple = { + correction: { + sample: '', + p: 1 + }, + prediction: { + sample: { + transform: { + insert: '', + deleteLeft: 0 + }, + displayAs: '<>', + matchesModel: false, + tag: 'keep' + }, + p: 1 + }, + totalProb: 1, + matchLevel: SuggestionSimilarity.exact + }; + + const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); assert.deepEqual(tuple, expectedKeep); }); }); \ No newline at end of file From b4a87712fb977d590f916051efe70f58e9c403b5 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 12 May 2026 14:15:17 -0500 Subject: [PATCH 11/16] refactor(web): refactor intermediate composited prediction type This reorganizes the type formerly known as CorrectionPredictionTuple, preparing it to share similarities with a new incoming type handling an earlier, tokenized intermediate stage that will be needed for some aspects of suggestion generation. Build-bot: skip build:web Test-bot: skip --- .../main/correction/tokenization-corrector.ts | 8 +- .../src/main/model-compositor.ts | 9 +- .../worker-thread/src/main/predict-helpers.ts | 295 +++++----- .../early-correction-search-stopping.tests.ts | 29 +- .../prediction-helpers/auto-correct.tests.ts | 544 ++++++++++-------- .../create-default-keep.tests.ts | 140 +++-- .../predict-from-corrections.tests.ts | 38 +- .../suggestion-deduplication.tests.ts | 90 +-- .../suggestion-finalization.tests.ts | 98 ++-- .../suggestion-similarity.tests.ts | 188 +++--- .../worker-custom-punctuation.tests.ts | 14 + .../worker-model-compositor.tests.ts | 4 + 12 files changed, 801 insertions(+), 656 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index 3b71b2eb4f8..885cdb0ed2b 100644 --- 
a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -58,6 +58,7 @@ export class TokenizationCorrector implements CorrectionSearchable; private tokenCostMap: Map; private tokenLookupMap: Map; @@ -172,13 +173,16 @@ export class TokenizationCorrector implements CorrectionSearchable { // New issue: this mangles the space IDs! We almost certainly need some // sort of proper map to the source token. const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1); this.tokenLookupMap.set(searchModule.spaceId, token); - if(!filterClosure(token)) { + const passesFilter = filterClosure(token); + modelsCorrectables ||= passesFilter; + if(!passesFilter) { this._uncorrectables.push(searchModule); } else if(index == tailCorrectionLength - 1) { // The sole assignment case for this field. It may only be assigned for @@ -189,6 +193,8 @@ export class TokenizationCorrector implements CorrectionSearchable, + prediction: Suggestion | Keep; /** * The correction upon which the Suggestion (or Keep) is based */ - correction: ProbabilityMass, + correction: string +} + +export interface PredictionProbabilities { /** - * The likelihood of the prediction - its lexical-model likelihood multiplied - * by the keystroke-sequence + correction likelihood. + * The probability of the word itself, separate from corrections, as + * determined by the LexicalModel itself. */ - totalProb: number; + prediction: number; + + /** + * The probability of text-correction steps taken to build the correction upon + * which the prediction is based. + */ + correction: number; + + /** + * The likelihood of the represented prediction, combining both the + * `prediction` and `correction` components into a single value. 
+ */ + total: number; +} + +/** + * Tracks common intermediate prediction data, such as its underlying probabilities and its similarity to the actual context. + */ +export interface PredictionMetadata { + /** + * Tracks the relevant probability components contributing to a generated + * prediction. + */ + probabilities: PredictionProbabilities; + + /** + * Indicates that the 'suggestion' represents context changes that qualify for + * auto-selection. + */ + autoSelectable: boolean; + /** * How directly the prediction matches the current token in the context. * @@ -101,12 +130,26 @@ export type CorrectionPredictionTuple = { * available upon initial construction of this type. */ matchLevel?: SuggestionSimilarity; + /** * Text from the triggering input that should _not_ be affected by the * prediction. */ preservationTransform?: Transform; -}; +} + +export interface IntermediateCompositedPrediction { + /** + * Contains the fully composited predictive-text Suggestion and its underlying correction string. + */ + components: CompositedPredictionData; + /** + * Tracks common intermediate prediction data, such as its underlying probabilities and its similarity to the actual context. + */ + metadata: PredictionMetadata; +} + +type IntermediatePrediction = IntermediateCompositedPrediction; /** * An enum to be used when categorizing the level of similarity between @@ -144,15 +187,15 @@ export enum SuggestionSimilarity { exact = 3 } -export function tupleDisplayOrderSort(a: CorrectionPredictionTuple, b: CorrectionPredictionTuple) { +export function tupleDisplayOrderSort(a: IntermediatePrediction, b: IntermediatePrediction) { // Similarity distance - const simDist = (b.matchLevel ?? 0) - (a.matchLevel ?? 0); + const simDist = (b.metadata.matchLevel ?? 0) - (a.metadata.matchLevel ?? 
0); if(simDist != 0) { return simDist; } // Probability distance - return b.totalProb - a.totalProb; + return b.metadata.probabilities.total - a.metadata.probabilities.total; } /** @@ -320,7 +363,7 @@ export function determineSuggestionRange( export function buildAndMapPredictions( transition: ContextTransition, tokenizationCorrection: TokenizationResultMapping, -): CorrectionPredictionTuple[] { +): IntermediateCompositedPrediction[] { const model = transition.final.model; const tokenization = tokenizationCorrection.matchingSpace.tokenization; @@ -398,6 +441,10 @@ export function buildAndMapPredictions( // Regardless of origin, overwrite the transform's deleteLeft value with what it should actually hold. predictions.forEach((entry) => { entry.sample.transform.deleteLeft = deleteLeft; + if(transition.transitionId !== undefined) { + entry.sample.transformId = transition.transitionId; + entry.sample.transform.id = transition.transitionId; + } }); // Use traversals if possible - extract the most likely entry that is on the traversal, @@ -415,8 +462,9 @@ export function buildAndMapPredictions( .slice(0, predictionComponents.length-1) .reduce((accum, curr) => accum * curr[0].p, 1) - const completePredictionTuples: CorrectionPredictionTuple[] = predictionComponents[predictionComponents.length-1].map((prediction) => { + const completePredictionTuples: IntermediateCompositedPrediction[] = predictionComponents[predictionComponents.length-1].map((prediction) => { const predictionCost = prediction.p * prefixProb; + return { // Will need to do this differently. We want to have each component // individualized b/c casing. Case should be maintained for prior tokens @@ -428,28 +476,30 @@ export function buildAndMapPredictions( // applySuggestionCasing applies onto suggestions, so we'll want to build // the FULL suggestion AFTER applying casing changes (to each token's // suggestion component). 
- prediction: { - sample: { + components: { + prediction: { transformId: transition.transitionId, transform: models.buildMergedTransform(predictionPrefix, prediction.sample.transform), displayAs: models.buildMergedTransform(predictionPrefix, prediction.sample.transform).insert // should composite the displayAs strings instead... }, - p: predictionCost, + correction: correctionTransforms[correctionTransforms.length-1].insert }, - correction: { - // Is used partly for word-casing, partly for auto-select enabling. - // Is already the full word, as that's what is provided by TokenizationCorrector. - sample: correctionTransforms[correctionTransforms.length-1].insert, - p: correctionCost - }, - totalProb: predictionCost * correctionCost, - matchLevel: SuggestionSimilarity.none, - // Long-term, we shouldn't have `.preservationTransform` here. - // - // Needed for now until the search actually operates based on - // TokenizationCorrector, rather than the half-converted use currently in - // place. - preservationTransform: tokenization.taillessTrueKeystroke + metadata: { + probabilities: { + prediction: predictionCost, + correction: correctionCost, + total: predictionCost * correctionCost + }, + matchLevel: SuggestionSimilarity.none, + autoSelectable: tokenizationCorrection.matchingSpace.modelsCorrectables, + + // Long-term, we shouldn't have `.preservationTransform` here. + // + // Needed for now until the search actually operates based on + // TokenizationCorrector, rather than the half-converted use currently in + // place. + preservationTransform: tokenization.taillessTrueKeystroke + } } }); @@ -515,7 +565,7 @@ export async function correctAndEnumerate( /** * The suggestions generated based on the user's input state. 
*/ - rawPredictions: CorrectionPredictionTuple[]; + rawPredictions: IntermediateCompositedPrediction[]; /** * The id of a prior ContextTransition event that triggered a Suggestion found @@ -567,9 +617,8 @@ export async function correctAndEnumerate( const searchModules = tokenizations.map(t => t.tail.searchModule); // Only run the correction search when corrections are enabled. - let rawPredictions: CorrectionPredictionTuple[] = []; + let rawPredictions: IntermediateCompositedPrediction[] = []; let bestCorrectionCost: number; - const correctionPredictionMap: Record> = {}; for await(const match of getBestTokenMatches(searchModules, timer)) { // Corrections obtained: now to predict from them! const tokenization = tokenizations.find(t => t.spaceId == match.spaceId); @@ -588,38 +637,15 @@ export async function correctAndEnumerate( continue; } - /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost - * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if - * there are significantly more likely words. We only need this to allow very minor fat-finger - * adjustments for 100% keystroke-sequence corrections in order to prevent finickiness on - * key borders. - * - * Technically, the probabilities this produces won't be normalized as-is... but there's no - * true NEED to do so for it, even if it'd be 'nice to have'. Consistently tracking when - * to apply it could become tricky, so it's simpler to leave out. - * - * Worst-case, it's possible to temporarily add normalization if a code deep-dive - * is needed in the future. - */ - const costFactor = (tokenization.tail.inputCount <= 1) ? 
ModelCompositor.SINGLE_CHAR_KEY_PROB_EXPONENT : 1; - const suggestionRange = determineSuggestionRange(transition.base.displayTokenization, tokenization) const corrector = new TokenizationCorrector(tokenization, suggestionRange.tokensToPredict.length, () => true); const predictions = buildAndMapPredictions(transition, new TokenizationResultMapping([match], corrector)); // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions. if(predictions.length > 0 && bestCorrectionCost === undefined) { - bestCorrectionCost = match.totalCost * costFactor; - } - - // If we're getting the same prediction again, it's lower-cost. Update! - let oldPredictionSet = correctionPredictionMap[match.matchString]; - if(oldPredictionSet) { - rawPredictions = rawPredictions.filter((entry) => !oldPredictionSet.find((match) => entry.prediction.sample == match.sample)); + bestCorrectionCost = predictions[0].metadata.probabilities.correction; } - correctionPredictionMap[match.matchString] = predictions.map((entry) => entry.prediction); - rawPredictions = rawPredictions.concat(predictions); if(shouldStopSearchingEarly(bestCorrectionCost, match.totalCost, rawPredictions)) { @@ -640,7 +666,7 @@ export async function correctAndEnumerate( export function shouldStopSearchingEarly( bestCorrectionCost: number, currentCorrectionCost: number, - rawPredictions: CorrectionPredictionTuple[] + rawPredictions: IntermediateCompositedPrediction[] ) { if(currentCorrectionCost >= bestCorrectionCost + CORRECTION_SEARCH_THRESHOLDS.MAX_SEARCH_THRESHOLD) { return true; @@ -656,7 +682,7 @@ export function shouldStopSearchingEarly( // If the best suggestion from the search's current tier fails to beat the worst // pending suggestion from previous tiers, assume all further corrections will // similarly fail to win; terminate the search-loop. 
- if(rawPredictions[ModelCompositor.MAX_SUGGESTIONS-1].totalProb > Math.exp(-currentCorrectionCost)) { + if(rawPredictions[ModelCompositor.MAX_SUGGESTIONS-1].metadata.probabilities.total > Math.exp(-currentCorrectionCost)) { return true; } } @@ -681,8 +707,8 @@ export function correctAndEnumerateWithoutTraversals( lexicalModel: LexicalModel, corrections: ProbabilityMass[], context: Context -): CorrectionPredictionTuple[] { - let returnedPredictions: CorrectionPredictionTuple[] = []; +): IntermediateCompositedPrediction[] { + let returnedPredictions: IntermediateCompositedPrediction[] = []; const wordbreak = determineModelWordbreaker(lexicalModel); const tokenizer = determineModelTokenizer(lexicalModel); @@ -730,15 +756,21 @@ export function correctAndEnumerateWithoutTraversals( correctionText = wordbreak(postContext); } - let tuple: CorrectionPredictionTuple = { - prediction: pair, - correction: { - sample: correctionText, - p: correction.p + let tuple: IntermediateCompositedPrediction = { + components: { + prediction: pair.sample, + correction: correctionText }, - totalProb: pair.p * correction.p, - matchLevel: SuggestionSimilarity.none, - preservationTransform + metadata: { + probabilities: { + prediction: pair.p, + correction: correction.p, + total: pair.p * correction.p + }, + autoSelectable: correctionValidForAutoSelect(tailCorrection.insert), + matchLevel: SuggestionSimilarity.none, + preservationTransform + } }; return tuple; }); @@ -784,17 +816,17 @@ export function applySuggestionCasing(suggestion: Suggestion, baseWord: string, */ export function dedupeSuggestions( lexicalModel: LexicalModel, - rawPredictions: CorrectionPredictionTuple[], + rawPredictions: IntermediateCompositedPrediction[], context: Context ) { const wordbreak = determineModelWordbreaker(lexicalModel); - let suggestionDistribMap: {[key: string]: CorrectionPredictionTuple} = {}; - let suggestionDistribution: CorrectionPredictionTuple[] = []; + let suggestionDistribMap: {[key: string]: 
IntermediateCompositedPrediction} = {}; + let suggestionDistribution: IntermediateCompositedPrediction[] = []; // Deduplicator + annotator of 'keep' suggestions. for(let tuple of rawPredictions) { - const predictedWord = wordbreak(models.applyTransform(tuple.prediction.sample.transform, context)); + const predictedWord = wordbreak(models.applyTransform(tuple.components.prediction.transform, context)); // Assumption: suggestions that have the same net result should have the // same displayAs string. (We could try to pick the one with highest net @@ -804,7 +836,7 @@ export function dedupeSuggestions( // Merge 'em! const existingSuggestion = suggestionDistribMap[predictedWord]; if(existingSuggestion) { - existingSuggestion.totalProb += tuple.totalProb; + existingSuggestion.metadata.probabilities.total += tuple.metadata.probabilities.total; } else { suggestionDistribMap[predictedWord] = tuple; } @@ -832,15 +864,16 @@ export function dedupeSuggestions( * current text * - any other suggestion * + * @param lexicalModel * @param suggestionDistribution - * @param context - * @param trueInput inputTransform + its assigned probability + * @param baseContext + * @param finalContext * @returns true if an existing suggestion fulfills the role of 'keep'; * otherwise, false. */ export function processSimilarity( lexicalModel: LexicalModel, - suggestionDistribution: CorrectionPredictionTuple[], + suggestionDistribution: IntermediateCompositedPrediction[], context: Context, trueInput: ProbabilityMass ): boolean { @@ -860,38 +893,38 @@ export function processSimilarity( for(let tuple of suggestionDistribution) { // Don't set it unnecessarily; this can have side-effects in some automated tests. 
if(inputTransform.id !== undefined) { - tuple.prediction.sample.transformId = inputTransform.id; + tuple.components.prediction.transformId = inputTransform.id; } - const predictedWord = wordbreak(models.applyTransform(tuple.prediction.sample.transform, context)); + const predictedWord = wordbreak(models.applyTransform(tuple.components.prediction.transform, context)); // Is the suggestion an exact match (or, "similar enough") to the // actually-typed context? If so, we wish to note this fact and to // prioritize such a suggestion over suggestions that are not. - if(keyed(tuple.correction.sample) == keyedPrefix) { + if(keyed(tuple.components.correction) == keyedPrefix) { if(predictedWord == truePrefix) { // Exact match: it's a perfect 'keep' suggestion. - tuple.matchLevel = SuggestionSimilarity.exact; - keepOption = toAnnotatedSuggestion(lexicalModel, tuple.prediction.sample, 'keep', models.QuoteBehavior.noQuotes); + tuple.metadata.matchLevel = SuggestionSimilarity.exact; + keepOption = toAnnotatedSuggestion(lexicalModel, tuple.components.prediction, 'keep', models.QuoteBehavior.noQuotes); // Indicates that this suggestion exists directly within the lexical // model as a valid suggestion. (We actively display it if it's an // exact match, but hide it if not, only preserving it for reversions // if/when needed.) keepOption.matchesModel = true; - Object.assign(tuple.prediction.sample, keepOption); - keepOption = tuple.prediction.sample as Outcome; + Object.assign(tuple.components.prediction, keepOption); + keepOption = tuple.components.prediction as Outcome; } else if(keyCased(predictedWord) == lowercasedPrefix) { // Case-insensitive match. No diacritic differences; the ONLY difference is casing. - tuple.matchLevel = SuggestionSimilarity.sameText; + tuple.metadata.matchLevel = SuggestionSimilarity.sameText; } else if(keyed(predictedWord) == keyedPrefix) { // Diacritic-insensitive / exact-key match. 
- tuple.matchLevel = SuggestionSimilarity.sameKey; + tuple.metadata.matchLevel = SuggestionSimilarity.sameKey; } else { - tuple.matchLevel = SuggestionSimilarity.none; + tuple.metadata.matchLevel = SuggestionSimilarity.none; } } else { - tuple.matchLevel = SuggestionSimilarity.none; + tuple.metadata.matchLevel = SuggestionSimilarity.none; } } @@ -917,7 +950,7 @@ export function createDefaultKeep( lexicalModel: LexicalModel, postContext: Context, trueInput: ProbabilityMass -): CorrectionPredictionTuple { +): IntermediateCompositedPrediction { const { sample: inputTransform, p: inputTransformProb } = trueInput; const wordbreak = determineModelWordbreaker(lexicalModel); @@ -946,19 +979,19 @@ export function createDefaultKeep( // Insert our synthetic keepOption as a prediction tuple. return { - // Product of the two p's below. - totalProb: inputTransformProb * MAX_PROB, - prediction: { - sample: keepOption, - // We always show the keep option if it doesn't directly match, - // so max probability is fine. 
- p: MAX_PROB, + components: { + prediction: keepOption, + correction: truePrefix }, - correction: { - sample: truePrefix, - p: inputTransformProb * MAX_PROB - }, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: MAX_PROB, + correction: inputTransformProb, + total: inputTransformProb * MAX_PROB + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; } @@ -991,12 +1024,12 @@ export function correctionValidForAutoSelect(correction: string) { return false; } -export function predictionAutoSelect(suggestionDistribution: CorrectionPredictionTuple[]) { +export function predictionAutoSelect(suggestionDistribution: IntermediateCompositedPrediction[]) { if(suggestionDistribution.length == 0) { return; } - const keepOption = suggestionDistribution[0].prediction.sample as Outcome; + const keepOption = suggestionDistribution[0].components.prediction as Outcome; if(keepOption.tag == 'keep' && keepOption.matchesModel) { // Auto-select it for auto-acceptance; we don't correct away from perfectly-valid // lexical entries, even if they are comparatively low-frequency. @@ -1010,19 +1043,19 @@ export function predictionAutoSelect(suggestionDistribution: CorrectionPredictio if(suggestionDistribution.length == 1) { // Prevent auto-acceptance when the root doesn't meet validation criteria. - if(!correctionValidForAutoSelect(suggestionDistribution[0].correction.sample)) { + if(!suggestionDistribution[0].metadata.autoSelectable) { return; } // Mark for auto-acceptance; there are no alternatives. - suggestionDistribution[0].prediction.sample.autoAccept = true; + suggestionDistribution[0].components.prediction.autoAccept = true; return; } // Is it reasonable to auto-accept any of our suggestions? 
const bestSuggestion = suggestionDistribution[0]; - const baseCorrection = bestSuggestion.correction.sample; + const baseCorrection = bestSuggestion.components.correction; if(baseCorrection.length == 0) { // If the correction is rooted on an empty root, there's no basis for // auto-correcting to this suggestion. @@ -1031,8 +1064,8 @@ export function predictionAutoSelect(suggestionDistribution: CorrectionPredictio // Find the highest probability for any correction that led to a valid prediction. // No need to full-on re-sort everything, though. - const bestCorrection = suggestionDistribution.reduce((prev, current) => prev?.correction.p > current.correction.p ? prev : current, null).correction; - if(bestCorrection.p > bestSuggestion.correction.p) { + const bestCorrectionP = suggestionDistribution.reduce((prev, current) => Math.max(prev, current.metadata.probabilities.correction), 0); + if(bestCorrectionP > bestSuggestion.metadata.probabilities.correction) { // Here, the best suggestion didn't come from the best correction. // Is it actually reasonable to auto-correct? We're probably just very // biased toward its frequency. (Maybe a threshold should be considered?) @@ -1043,28 +1076,28 @@ export function predictionAutoSelect(suggestionDistribution: CorrectionPredictio // - such as replacing `cant` with `can't` if the latter is much more frequent - // we may wish to group matchLevel values below by 'mapping' them with an appropriate // function. (Both on the next line and within the reduce functor.) - const bestSuggestionTier = bestSuggestion.matchLevel; + const bestSuggestionTier = bestSuggestion.metadata.matchLevel; // compare best vs other probabilities of compatible tier. const probSum = suggestionDistribution.reduce((accum, current) => { // If the suggestion is from a different similarity tier, do not count it against // the required auto-select probability ratio threshold. That threshold should // only apply within the suggestion's tier. 
- return accum + (current.matchLevel == bestSuggestionTier ? current.totalProb : 0) + return accum + (current.metadata.matchLevel == bestSuggestionTier ? current.metadata.probabilities.total : 0) }, 0); - const proportionOfBest = bestSuggestion.totalProb / probSum; + const proportionOfBest = bestSuggestion.metadata.probabilities.total / probSum; if(proportionOfBest < AUTOSELECT_PROPORTION_THRESHOLD) { return; } - if(!correctionValidForAutoSelect(bestSuggestion.correction.sample)) { + if(!bestSuggestion.metadata.autoSelectable) { return; } // compare correction-cost aspects? We disable if the base correction is lower than best, // but should we do other comparisons too? - bestSuggestion.prediction.sample.autoAccept = true; + bestSuggestion.components.prediction.autoAccept = true; } /** @@ -1085,7 +1118,7 @@ export function predictionAutoSelect(suggestionDistribution: CorrectionPredictio */ export function finalizeSuggestions( lexicalModel: LexicalModel, - deduplicatedSuggestionTuples: CorrectionPredictionTuple[], + deduplicatedSuggestionTuples: IntermediateCompositedPrediction[], context: Context, inputTransform: Transform, verbose?: boolean @@ -1094,42 +1127,44 @@ export function finalizeSuggestions( const tokenize = determineModelTokenizer(lexicalModel); const suggestions = deduplicatedSuggestionTuples.map((tuple) => { - const prediction = tuple.prediction; + const prediction = tuple.components.prediction; // If this is a suggestion after any form of wordbreak input, make sure we preserve any components // from prior tokens! // // Note: may need adjustment if/when supporting phrase-level correction. 
- if(tuple.preservationTransform) { + if(tuple.metadata.preservationTransform) { const mergedTransform = { - ...models.buildMergedTransform(tuple.preservationTransform, {...prediction.sample.transform, deleteLeft: 0}), - deleteLeft: prediction.sample.transform.deleteLeft + ...models.buildMergedTransform(tuple.metadata.preservationTransform, {...prediction.transform, deleteLeft: 0}), + deleteLeft: prediction.transform.deleteLeft }; // Temporarily and locally drops 'readonly' semantics so that we can reassign the transform. // See https://www.typescriptlang.org/docs/handbook/release-notes/typescript-2-8.html#improved-control-over-mapped-type-modifiers - let mutableSuggestion = prediction.sample as {-readonly [transform in keyof Suggestion]: Suggestion[transform]}; + let mutableSuggestion = prediction as {-readonly [transform in keyof Suggestion]: Suggestion[transform]}; // Assignment via by-reference behavior, as suggestion is an object mutableSuggestion.transform = mergedTransform; } // Is sometimes not set during unit tests. 
- if(prediction.sample.transformId !== undefined) { - prediction.sample.transform.id = prediction.sample.transformId; + if(prediction.transformId) { + prediction.transform.id = prediction.transformId; } + const probs = tuple.metadata.probabilities; + if(!verbose) { return { - ...prediction.sample, - p: tuple.totalProb + ...prediction, + p: probs.total }; } else { const sample: Outcome = { - ...prediction.sample, - p: tuple.totalProb, - "lexical-p": prediction.p, - "correction-p": tuple.correction.p + ...prediction, + p: probs.total, + "lexical-p": probs.prediction, + "correction-p": probs.correction } return sample; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts index bf45e3b94e1..430d9c6c7e0 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts @@ -1,6 +1,16 @@ import { assert } from 'chai'; -import { CORRECTION_SEARCH_THRESHOLDS, CorrectionPredictionTuple, ModelCompositor, shouldStopSearchingEarly } from "@keymanapp/lm-worker/test-index"; +import { CORRECTION_SEARCH_THRESHOLDS, IntermediateCompositedPrediction, ModelCompositor, shouldStopSearchingEarly } from "@keymanapp/lm-worker/test-index"; + +function mockIntermediatePrediction(value: number) { + return { + metadata: { + probabilities: { + total: value + } + } + } as IntermediateCompositedPrediction +} describe('correction-search: shouldStopSearchingEarly', () => { it('stops early once new corrections are less likely than currently discovered predictions', () => { @@ -12,12 +22,7 @@ describe('correction-search: shouldStopSearchingEarly', () => { assert.equal(predictionProbs.length, 
ModelCompositor.MAX_SUGGESTIONS, "test setup no longer valid"); // The only part for each entry we actually care about here: .totalProb. - /** @type {import('#./predict-helpers.js').CorrectionPredictionTuple[]} */ - const predictions = predictionProbs.map((entry) => { - return { - totalProb: entry - } as CorrectionPredictionTuple - }); + const predictions = predictionProbs.map((entry) => mockIntermediatePrediction(entry)); // Thresholding is performed in log-space. // 0.0501 and 0.0499 are offset on each side of 0.05, the last value in the array defined above. @@ -33,8 +38,8 @@ describe('correction-search: shouldStopSearchingEarly', () => { // // Can technically run the method with an empty array, but the actual scenario would have // at least one prediction present in the "found predictions" array. - assert.isFalse(shouldStopSearchingEarly(baseCost, baseCost + expectedThreshold - 0.01, [{ totalProb: Math.exp(-1) } as CorrectionPredictionTuple])); - assert.isTrue(shouldStopSearchingEarly( baseCost, baseCost + expectedThreshold + 0.01, [{ totalProb: Math.exp(-1) } as CorrectionPredictionTuple])); + assert.isFalse(shouldStopSearchingEarly(baseCost, baseCost + expectedThreshold - 0.01, [mockIntermediatePrediction(Math.exp(-1))])); + assert.isTrue(shouldStopSearchingEarly( baseCost, baseCost + expectedThreshold + 0.01, [mockIntermediatePrediction(Math.exp(-1))])); }); it('stops checking corrections earlier when enough predictions have been found', () => { @@ -43,11 +48,7 @@ describe('correction-search: shouldStopSearchingEarly', () => { // The only part for each entry we actually care about here: .totalProb. 
/** @type {import('#./predict-helpers.js').CorrectionPredictionTuple[]} */ - const predictions = predictionProbs.map((entry) => { - return { - totalProb: entry - } as CorrectionPredictionTuple - }); + const predictions = predictionProbs.map((entry) => mockIntermediatePrediction(entry)); const baseCost = 1; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/auto-correct.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/auto-correct.tests.ts index d32326e8436..b55886bb42f 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/auto-correct.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/auto-correct.tests.ts @@ -1,6 +1,6 @@ import { assert } from 'chai'; -import { AUTOSELECT_PROPORTION_THRESHOLD, CorrectionPredictionTuple, predictionAutoSelect, SuggestionSimilarity, tupleDisplayOrderSort } from "@keymanapp/lm-worker/test-index"; +import { AUTOSELECT_PROPORTION_THRESHOLD, IntermediateCompositedPrediction, predictionAutoSelect, SuggestionSimilarity, tupleDisplayOrderSort } from "@keymanapp/lm-worker/test-index"; /* * Preconditions: * - there should always be a 'keep' option. 
Now, whether or not that option @@ -9,7 +9,7 @@ import { AUTOSELECT_PROPORTION_THRESHOLD, CorrectionPredictionTuple, predictionA */ describe('predictionAutoSelect', () => { it(`does not throw when no suggestions are available`, () => { - const predictions: CorrectionPredictionTuple[] = []; + const predictions: IntermediateCompositedPrediction[] = []; const originalPredictions = [].concat(predictions); assert.doesNotThrow(() => predictionAutoSelect(predictions)); @@ -17,14 +17,10 @@ describe('predictionAutoSelect', () => { }); it(`selects solitary 'keep' suggestion that does match the model`, () => { - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ { - correction: { - sample: 'apple', - p: 1 - }, - prediction: { - sample: { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'e', @@ -33,9 +29,16 @@ describe('predictionAutoSelect', () => { matchesModel: true, displayAs: 'apple' }, - p: 1 + correction: 'apple', }, - totalProb: 1 + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 + }, + autoSelectable: true + } } ]; @@ -43,19 +46,15 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepOrderedMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.isOk(autoselected); }); it(`does not select suggestions if the root correction has no letters`, () => { - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ { - correction: { - sample: '5', - p: 1 - }, - prediction: { - sample: { + components: { + prediction: { tag: 'keep', transform: { insert: '5', @@ -64,17 +63,20 @@ describe('predictionAutoSelect', () => { matchesModel: false, displayAs: '5' }, 
- p: 0.01 + correction: '5' }, - totalProb: 0.01 + metadata: { + probabilities: { + prediction: 0.01, + correction: 1, + total: 0.01 + }, + autoSelectable: false + } }, { - correction: { - sample: '5', - p: 1 - }, - prediction: { - sample: { + components: { + prediction: { transform: { insert: '5th', deleteLeft: 0 @@ -82,9 +84,16 @@ describe('predictionAutoSelect', () => { matchesModel: true, displayAs: '5th' }, - p: 0.8 + correction: '5' }, - totalProb: 0.8 + metadata: { + probabilities: { + prediction: 0.8, + correction: 1, + total: 0.8 + }, + autoSelectable: false + } } ]; @@ -92,19 +101,15 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepOrderedMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.isNotOk(autoselected); }); it(`does not select solitary 'keep' suggestion that doesn't match the model`, () => { - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ { - correction: { - sample: 'appl', - p: 1 - }, - prediction: { - sample: { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'l', @@ -113,9 +118,16 @@ describe('predictionAutoSelect', () => { matchesModel: false, displayAs: '"appl"' }, - p: 1 + correction: 'appl' }, - totalProb: 1 + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 + }, + autoSelectable: true + } } ]; @@ -123,18 +135,14 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepOrderedMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); 
assert.isNotOk(autoselected); }); it(`selects 'keep' suggestion that does match the model over any alternatives`, () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'n', @@ -143,65 +151,81 @@ describe('predictionAutoSelect', () => { matchesModel: true, displayAs: 'thin' }, - p: .05 + correction: 'thin' }, - totalProb: .04 + metadata: { + probabilities: { + prediction: .05, + correction: .8, + total: .05 * .8 + }, + autoSelectable: true + } } - const highestNonKeepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const highestNonKeepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'nk', deleteLeft: 0 }, displayAs: 'think' }, - p: .55 + correction: 'thin' }, - totalProb: .44 + metadata: { + probabilities: { + prediction: .55, + correction: .8, + total: .55 * .8 + }, + autoSelectable: true + } }; - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, highestNonKeepSuggestion, { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ng', deleteLeft: 0 }, displayAs: 'thing' }, - p: .4 + correction: 'thin' }, - totalProb: .32 + metadata: { + probabilities: { + prediction: .4, + correction: .8, + total: .4 * .8 + }, + autoSelectable: true + } }, { - correction: { - sample: 'thic', - p: .2 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ck', deleteLeft: 0 }, displayAs: 'thick' }, - p: 1 + correction: 'thic' }, - totalProb: .2 + metadata: { + 
probabilities: { + prediction: 1, + correction: .2, + total: 1 * .2 + }, + autoSelectable: true + } } ]; @@ -209,18 +233,14 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.equal(autoselected, keepSuggestion); }); it(`selects solitary non-'keep' suggestion when 'keep' does not match model`, () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'n', @@ -229,40 +249,50 @@ describe('predictionAutoSelect', () => { displayAs: '"thin"', matchesModel: false }, - p: .05 + correction: 'thin' }, - totalProb: .04 + metadata: { + probabilities: { + prediction: .05, + correction: .8, + total: .8 * .05 + }, + autoSelectable: true + } } // To 'win', a suggestion (currently) needs at least twice the probability of the sum of all alternatives. // This threshold may be subject to change. // // Refer to AUTOSELECT_PROPORTION_THRESHOLD in predict-helpers.ts. 
- const onlyNonKeepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const onlyNonKeepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'nk', deleteLeft: 0 }, displayAs: 'think' }, - p: .01 + correction: 'thin' }, - totalProb: .008 + metadata: { + probabilities: { + prediction: .01, + correction: .8, + total: .01 * .8 + }, + autoSelectable: true + } }; - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, onlyNonKeepSuggestion ]; - const totalProb = predictions.reduce((accum, current) => accum + current.totalProb, 0); - assert.isBelow(onlyNonKeepSuggestion.totalProb, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); + const totalProb = predictions.reduce((accum, current) => accum + current.metadata.probabilities.total, 0); + assert.isBelow(onlyNonKeepSuggestion.metadata.probabilities.total, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); predictions.sort(tupleDisplayOrderSort); @@ -270,18 +300,14 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepOrderedMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.equal(autoselected, onlyNonKeepSuggestion); }); it(`does not select non-'keep' without sufficient winning probability`, () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'n', @@ -290,74 +316,90 @@ 
describe('predictionAutoSelect', () => { displayAs: '"thin"', matchesModel: false }, - p: .05 + correction: 'thin' }, - totalProb: .04 + metadata: { + probabilities: { + prediction: .05, + correction: .8, + total: .05 * .8 + }, + autoSelectable: true + } } // To 'win', a suggestion (currently) needs at least twice the probability of the sum of all alternatives. // This threshold may be subject to change. // // Refer to AUTOSELECT_PROPORTION_THRESHOLD in predict-helpers.ts. - const highestNonKeepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const highestNonKeepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'nk', deleteLeft: 0 }, displayAs: 'think' }, - p: .55 + correction: 'thin' }, - totalProb: .44 + metadata: { + probabilities: { + prediction: .55, + correction: .8, + total: .55 * .8 + }, + autoSelectable: true + } }; - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, highestNonKeepSuggestion, { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ng', deleteLeft: 0 }, displayAs: 'thing' }, - p: .4 + correction: 'thin' }, - totalProb: .32 + metadata: { + probabilities: { + prediction: .4, + correction: .8, + total: .4 * .8 + }, + autoSelectable: true + } }, { - correction: { - sample: 'thic', - p: .2 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ck', deleteLeft: 0 }, displayAs: 'thick' }, - p: 1 + correction: 'thic' }, - totalProb: .2 + metadata: { + probabilities: { + prediction: 1, + correction: .2, + total: 1 * .2 + }, + autoSelectable: true + } } ]; - const totalProb = predictions.reduce((accum, current) => accum + current.totalProb, 0); - 
assert.isBelow(highestNonKeepSuggestion.totalProb, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); + const totalProb = predictions.reduce((accum, current) => accum + current.metadata.probabilities.total, 0); + assert.isBelow(highestNonKeepSuggestion.metadata.probabilities.total, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); predictions.sort(tupleDisplayOrderSort); @@ -365,18 +407,14 @@ describe('predictionAutoSelect', () => { assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepOrderedMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.isNotOk(autoselected); }); it(`does select non-'keep' with sufficient winning probability`, () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .8 - }, - prediction: { - sample: { + const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'n', @@ -385,87 +423,99 @@ describe('predictionAutoSelect', () => { displayAs: '"thin"', matchesModel: false }, - p: .05 + correction: 'thin' }, - totalProb: .04 + metadata: { + probabilities: { + prediction: .05, + correction: .8, + total: .05 * .8 + }, + autoSelectable: true + } } - const highestNonKeepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thin', - p: .9 - }, - prediction: { - sample: { + const highestNonKeepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'nk', deleteLeft: 0 }, displayAs: 'think' }, - p: .75 + correction: 'thin' }, - totalProb: .675 + metadata: { + probabilities: { + prediction: .75, + correction: .9, + total: .75 * .9 + }, + autoSelectable: true + } }; - const 
predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, highestNonKeepSuggestion, { - correction: { - sample: 'thin', - p: .9 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ng', deleteLeft: 0 }, displayAs: 'thing' }, - p: .2 + correction: 'thin' }, - totalProb: .18 + metadata: { + probabilities: { + prediction: .2, + correction: .9, + total: .2 * .9 + }, + autoSelectable: true + } }, { - correction: { - sample: 'thic', - p: .1 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'ck', deleteLeft: 0 }, displayAs: 'thick' }, - p: 1 + correction: 'thic' }, - totalProb: .1 + metadata: { + probabilities: { + prediction: 1, + correction: .1, + total: 1 * .1 + }, + autoSelectable: true + } } ]; - const totalProb = predictions.reduce((accum, current) => accum + current.totalProb, 0); - assert.isAbove(highestNonKeepSuggestion.totalProb, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); + const totalProb = predictions.reduce((accum, current) => accum + current.metadata.probabilities.total, 0); + assert.isAbove(highestNonKeepSuggestion.metadata.probabilities.total, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); const originalPredictions = [].concat(predictions); assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.equal(autoselected, highestNonKeepSuggestion); }); it('ignores non key-matched suggestions when key-matched suggestions exist', () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'cant', - p: 1 - }, - prediction: { - sample: { 
+ const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 't', @@ -474,51 +524,64 @@ describe('predictionAutoSelect', () => { displayAs: '"cant"', matchesModel: false }, - p: 1 + correction: 'cant' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: true, + matchLevel: SuggestionSimilarity.exact + } } - const expectedSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'cant', - p: 1 - }, - prediction: { - sample: { + const expectedSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: '\'t', deleteLeft: 0 }, displayAs: "can't" }, - p: .2 + correction: 'cant' }, - totalProb: .2, - matchLevel: SuggestionSimilarity.sameKey + metadata: { + probabilities: { + prediction: .2, + correction: 1, + total: .2 * 1 + }, + autoSelectable: true, + matchLevel: SuggestionSimilarity.sameKey + } }; - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, expectedSuggestion, { - correction: { - sample: 'cant', - p: 1 - }, - prediction: { - sample: { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'teen', deleteLeft: 0 }, displayAs: 'canteen' }, - p: .8 + correction: 'cant' }, - totalProb: .8, - matchLevel: SuggestionSimilarity.none + metadata: { + probabilities: { + prediction: .8, + correction: 1, + total: .8 * 1 + }, + autoSelectable: true, + matchLevel: SuggestionSimilarity.none + } } ]; @@ -527,20 +590,16 @@ describe('predictionAutoSelect', () => { assert.sameDeepMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => 
entry.components.prediction.autoAccept); assert.equal(autoselected, expectedSuggestion); }); // The idea: avoid "over-correcting" when a potential correction has a // super-high-frequency word. it('does not auto-select suggestion if its root correction is not most likely', () => { - const keepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thi', - p: .7 - }, - prediction: { - sample: { + const keepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { tag: 'keep', transform: { // can be null / "mocked out" insert: 'i', @@ -549,61 +608,74 @@ describe('predictionAutoSelect', () => { displayAs: '"thi"', matchesModel: false }, - p: .05 + correction: 'thi' }, - totalProb: .035 + metadata: { + probabilities: { + prediction: .05, + correction: .7, + total: .05 * .7 + }, + autoSelectable: true + } } - const highestCorrectionSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'thi', - p: .7 - }, - prediction: { - sample: { + const highestCorrectionSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'in', deleteLeft: 0 }, displayAs: 'thin' }, - p: .1 + correction: 'thi', }, - totalProb: .07 + metadata: { + probabilities: { + prediction: .1, + correction: .7, + total: .1 * .7 + }, + autoSelectable: true + } }; - const highestNonKeepSuggestion: CorrectionPredictionTuple = { - correction: { - sample: 'the', - p: .3 - }, - prediction: { - sample: { + const highestNonKeepSuggestion: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { // can be null / "mocked out" insert: 'e', deleteLeft: 0 }, displayAs: 'the' }, - p: 1 + correction: 'the' }, - totalProb: .3 + metadata: { + probabilities: { + prediction: 1, + correction: .3, + total: 1 * .3 + }, + autoSelectable: true + } }; - const predictions: CorrectionPredictionTuple[] = [ + const predictions: IntermediateCompositedPrediction[] = [ keepSuggestion, 
highestNonKeepSuggestion, highestCorrectionSuggestion ]; - const totalProb = predictions.reduce((accum, current) => accum + current.totalProb, 0); - assert.isAbove(highestNonKeepSuggestion.totalProb, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); + const totalProb = predictions.reduce((accum, current) => accum + current.metadata.probabilities.total, 0); + assert.isAbove(highestNonKeepSuggestion.metadata.probabilities.total, totalProb * AUTOSELECT_PROPORTION_THRESHOLD, 'test setup is no longer valid'); const originalPredictions = [].concat(predictions); assert.doesNotThrow(() => predictionAutoSelect(predictions)); assert.sameDeepMembers(predictions, originalPredictions); - const autoselected = predictions.find((entry) => entry.prediction.sample.autoAccept); + const autoselected = predictions.find((entry) => entry.components.prediction.autoAccept); assert.isNotOk(autoselected); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts index 40fd2dafc6f..a86048b33d4 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/create-default-keep.tests.ts @@ -13,7 +13,7 @@ import { LexicalModelTypes } from "@keymanapp/common-types"; import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { applyTransform } from '@keymanapp/models-templates'; -import { CorrectionPredictionTuple, createDefaultKeep, models, SuggestionSimilarity } from "@keymanapp/lm-worker/test-index"; +import { IntermediateCompositedPrediction, createDefaultKeep, models, SuggestionSimilarity } from "@keymanapp/lm-worker/test-index"; import CasingFunction = LexicalModelTypes.CasingFunction; import Context = 
LexicalModelTypes.Context; @@ -109,13 +109,9 @@ describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: 'iphone', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'iphone', deleteLeft: 5 @@ -124,10 +120,17 @@ describe('createDefaultKeep', () => { matchesModel: false, tag: 'keep' }, - p: 1 + correction: 'iphone' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); @@ -150,13 +153,9 @@ describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: 'iphone', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'iphone', deleteLeft: 7 @@ -165,10 +164,17 @@ describe('createDefaultKeep', () => { matchesModel: false, tag: 'keep' }, - p: 1 + correction: 'iphone' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); @@ -191,13 +197,9 @@ describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: 'iphone', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'iphone', deleteLeft: 8 @@ -206,10 +208,17 @@ describe('createDefaultKeep', () => { 
matchesModel: false, tag: 'keep' }, - p: 1 + correction: 'iphone' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); @@ -232,13 +241,9 @@ describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: 'and', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'and', deleteLeft: 3 @@ -247,10 +252,17 @@ describe('createDefaultKeep', () => { matchesModel: false, tag: 'keep' }, - p: 1 + correction: 'and' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); @@ -273,13 +285,9 @@ describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: 'iphones', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'iphones', deleteLeft: 7 @@ -288,10 +296,17 @@ describe('createDefaultKeep', () => { matchesModel: false, tag: 'keep' }, - p: 1 + correction: 'iphones' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); @@ -314,13 +329,9 @@ 
describe('createDefaultKeep', () => { p: 1 }; - const expectedKeep: CorrectionPredictionTuple = { - correction: { - sample: '', - p: 1 - }, - prediction: { - sample: { + const expectedKeep: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: '', deleteLeft: 0 @@ -329,10 +340,17 @@ describe('createDefaultKeep', () => { matchesModel: false, tag: 'keep' }, - p: 1 + correction: '' }, - totalProb: 1, - matchLevel: SuggestionSimilarity.exact + metadata: { + probabilities: { + prediction: 1, + correction: 1, + total: 1 * 1 + }, + autoSelectable: false, + matchLevel: SuggestionSimilarity.exact + } }; const tuple = createDefaultKeep(testModelWithCasing, applyTransform(trueInput.sample, context), trueInput); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts index a99187defa5..8234c6ba2a9 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts @@ -71,7 +71,7 @@ const DUMMY_MODEL_CONFIG = { languageUsesCasing: true }; -describe('predictFromCorrections', () => { +describe('correctAndEnumerateWithoutTraversals', () => { it('handles a single correction prefixing multiple entries - no transform ID', () => { const context: Context = { left: 'It', @@ -113,14 +113,15 @@ describe('predictFromCorrections', () => { }); const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); - predictions.forEach((entry) => assert.equal(entry.correction.sample, 'Its')); - predictions.forEach((entry) => assert.equal(entry.correction.p, 0.6)); + + predictions.forEach((entry) => assert.equal(entry.components.correction, 'Its')); + 
predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, 0.6)); predictions.sort(tupleDisplayOrderSort); - assert.sameDeepOrderedMembers(predictions.map((entry) => entry.prediction.sample), dummied_suggestions); + assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions); - assert.approximately(predictions[0].totalProb, 0.18 * 0.6, 0.00001); - assert.approximately(predictions[1].totalProb, 0.02 * 0.6, 0.00001); + assert.approximately(predictions[0].metadata.probabilities.total, 0.18 * 0.6, 0.00001); + assert.approximately(predictions[1].metadata.probabilities.total, 0.02 * 0.6, 0.00001); }); it('handles a single correction prefixing multiple entries - with transform ID', () => { @@ -165,19 +166,20 @@ describe('predictFromCorrections', () => { }); const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); - predictions.forEach((entry) => assert.equal(entry.correction.sample, 'Its')); - predictions.forEach((entry) => assert.equal(entry.correction.p, 0.6)); + + predictions.forEach((entry) => assert.equal(entry.components.correction, 'Its')); + predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, 0.6)); predictions.sort(tupleDisplayOrderSort); - assert.sameOrderedMembers(predictions.map((entry) => entry.prediction.sample.displayAs), ["it's", "its"]); - assert.sameDeepOrderedMembers(predictions.map((entry) => entry.prediction.sample), dummied_suggestions.map((entry) => { + assert.sameOrderedMembers(predictions.map((entry) => entry.components.prediction.displayAs), ["it's", "its"]); + assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions.map((entry) => { entry = deepCopy(entry); entry.transformId = 314159; return entry; })); - assert.approximately(predictions[0].totalProb, 0.18 * 0.6, 0.00001); - assert.approximately(predictions[1].totalProb, 0.02 * 0.6, 0.00001); + 
assert.approximately(predictions[0].metadata.probabilities.total, 0.18 * 0.6, 0.00001); + assert.approximately(predictions[1].metadata.probabilities.total, 0.02 * 0.6, 0.00001); }); it('handles multiple corrections at once', () => { @@ -250,12 +252,12 @@ describe('predictFromCorrections', () => { const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); predictions.sort(tupleDisplayOrderSort); - assert.sameOrderedMembers(predictions.map((entry) => entry.prediction.sample.displayAs), ["is", "it's", "isn't", "its"]); - assert.sameDeepMembers(predictions.map((entry) => entry.prediction.sample), dummied_suggestions.flatMap((entry) => entry)); + assert.sameOrderedMembers(predictions.map((entry) => entry.components.prediction.displayAs), ["is", "it's", "isn't", "its"]); + assert.sameDeepMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions.flatMap((entry) => entry)); - assert.approximately(predictions[0].totalProb, 0.4 * 0.4, 0.00001); - assert.approximately(predictions[1].totalProb, 0.18 * 0.6, 0.00001); - assert.approximately(predictions[2].totalProb, 0.4 * 0.2, 0.00001); - assert.approximately(predictions[3].totalProb, 0.02 * 0.6, 0.00001); + assert.approximately(predictions[0].metadata.probabilities.total, 0.4 * 0.4, 0.00001); + assert.approximately(predictions[1].metadata.probabilities.total, 0.18 * 0.6, 0.00001); + assert.approximately(predictions[2].metadata.probabilities.total, 0.4 * 0.2, 0.00001); + assert.approximately(predictions[3].metadata.probabilities.total, 0.02 * 0.6, 0.00001); }); }); \ No newline at end of file diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-deduplication.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-deduplication.tests.ts index eea66d8ad0a..d1aa6df257e 100644 --- 
a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-deduplication.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-deduplication.tests.ts @@ -4,7 +4,7 @@ import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { deepCopy } from '@keymanapp/web-utils'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { CorrectionPredictionTuple, dedupeSuggestions, models } from "@keymanapp/lm-worker/test-index"; +import { IntermediateCompositedPrediction, dedupeSuggestions, models } from "@keymanapp/lm-worker/test-index"; import Context = LexicalModelTypes.Context; import DummyModel = models.DummyModel; @@ -24,77 +24,89 @@ const testModel = new DummyModel({ * @returns */ const build_its_is_set = () => { - const its: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const its: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 0 }, displayAs: 'its' }, - p: 0.2 + correction: 'its' }, - totalProb: 0.16 - // matchLevel does not yet exist. + metadata: { + probabilities: { + prediction: .2, + correction: .8, + total: .2 * .8 + }, + autoSelectable: true + // matchLevel does not yet exist. 
+ } }; - const it_is: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const it_is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: '\'s', deleteLeft: 0 }, displayAs: 'it\'s' }, - p: 0.8 + correction: 'its' }, - totalProb: 0.64 + metadata: { + probabilities: { + prediction: .8, + correction: .8, + total: .8 * .8 + }, + autoSelectable: true + } }; - const is: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 1 }, displayAs: 'is' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; - const is_not: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is_not: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'sn\'t', deleteLeft: 1 }, displayAs: 'isn\'t' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; return { @@ -145,7 +157,7 @@ describe('dedupeSuggestions', () => { // There's no mathematically safe way to combine the components if the // underlying correction sources differ between duplicated suggestions, // though it's mathematically safe to combine their product. - expected.forEach((entry) => entry.totalProb *= (entry.prediction.sample.transform.insert == '\'s') ? 3 : 2); + expected.forEach((entry) => entry.metadata.probabilities.total *= (entry.components.prediction.transform.insert == '\'s') ? 
3 : 2); assert.deepEqual(deduplicated, expected); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-finalization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-finalization.tests.ts index 4e63055a101..c1c2ecacc5c 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-finalization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-finalization.tests.ts @@ -5,7 +5,7 @@ import { deepCopy } from '@keymanapp/web-utils'; import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { CorrectionPredictionTuple, finalizeSuggestions, models } from "@keymanapp/lm-worker/test-index"; +import { IntermediateCompositedPrediction, finalizeSuggestions, models } from "@keymanapp/lm-worker/test-index"; import DummyModel = models.DummyModel; import Outcome = LexicalModelTypes.Outcome; @@ -39,6 +39,7 @@ const testModelWithoutSpacing = new DummyModel({ } }); + /** * Builds a fresh copy of test values useful for suggestion-similarity * testing. @@ -47,78 +48,89 @@ const testModelWithoutSpacing = new DummyModel({ */ const build_its_is_set = (verbose?: string) => { const verboseFlag = (verbose == 'verbose' ? true : false); - - const its: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const its: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 0 }, displayAs: 'its' }, - p: 0.2 + correction: 'its' }, - totalProb: 0.16 - // matchLevel does not yet exist. + metadata: { + probabilities: { + prediction: .2, + correction: .8, + total: .2 * .8 + }, + autoSelectable: true + // matchLevel does not yet exist. 
+ } }; - const it_is: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const it_is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: '\'s', deleteLeft: 0 }, displayAs: 'it\'s' }, - p: 0.8 + correction: 'its' }, - totalProb: 0.64 + metadata: { + probabilities: { + prediction: .8, + correction: .8, + total: .8 * .8 + }, + autoSelectable: true + } }; - const is: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 1 }, displayAs: 'is' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; - const is_not: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is_not: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'sn\'t', deleteLeft: 1 }, displayAs: 'isn\'t' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; const baseDefinitions = { @@ -132,13 +144,13 @@ const build_its_is_set = (verbose?: string) => { const expected = unfinalized.map((entry) => { const mapped: Outcome = { - ...deepCopy(entry.prediction.sample), - p: entry.totalProb + ...deepCopy(entry.components.prediction), + p: entry.metadata.probabilities.total }; if(verboseFlag) { - mapped['correction-p'] = entry.correction.p; - mapped['lexical-p'] = entry.prediction.p; + mapped['correction-p'] = entry.metadata.probabilities.correction; + mapped['lexical-p'] = entry.metadata.probabilities.prediction; } return mapped; diff --git 
a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts index 72911d11281..e4cbfc81b1d 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts @@ -5,7 +5,7 @@ import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { deepCopy } from '@keymanapp/web-utils'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { CorrectionPredictionTuple, models, processSimilarity, SuggestionSimilarity, toAnnotatedSuggestion } from "@keymanapp/lm-worker/test-index"; +import { IntermediateCompositedPrediction, models, processSimilarity, SuggestionSimilarity, toAnnotatedSuggestion } from "@keymanapp/lm-worker/test-index"; import CasingFunction = LexicalModelTypes.CasingFunction; import Context = LexicalModelTypes.Context; @@ -109,77 +109,89 @@ const testModelWithCasing = new DummyModel({ * @returns */ const build_its_is_set = () => { - const its: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const its: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 0 }, displayAs: 'its' }, - p: 0.2 + correction: 'its' }, - totalProb: 0.16 - // matchLevel does not yet exist. + metadata: { + probabilities: { + prediction: .2, + correction: .8, + total: .2 * .8 + }, + autoSelectable: true + // matchLevel does not yet exist. 
+ } }; - const it_is: CorrectionPredictionTuple = { - correction: { - sample: 'its', - p: 0.8 - }, - prediction: { - sample: { + const it_is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: '\'s', deleteLeft: 0 }, displayAs: 'it\'s' }, - p: 0.8 + correction: 'its' }, - totalProb: 0.64 + metadata: { + probabilities: { + prediction: .8, + correction: .8, + total: .8 * .8 + }, + autoSelectable: true + } }; - const is: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 's', deleteLeft: 1 }, displayAs: 'is' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; - const is_not: CorrectionPredictionTuple = { - correction: { - sample: 'is', - p: 0.2 - }, - prediction: { - sample: { + const is_not: IntermediateCompositedPrediction = { + components: { + prediction: { transform: { insert: 'sn\'t', deleteLeft: 1 }, displayAs: 'isn\'t' }, - p: 0.5 + correction: 'is' }, - totalProb: 0.1 + metadata: { + probabilities: { + prediction: .5, + correction: .2, + total: .5 * .2 + }, + autoSelectable: true + } }; return { @@ -210,32 +222,22 @@ describe('processSimilarity', () => { const testSet = build_its_is_set(); const distribution = [...Object.values(testSet)]; - const expectation: CorrectionPredictionTuple[] = [ - { - ...testSet.its, - matchLevel: SuggestionSimilarity.exact - }, { - ...testSet.it_is, - matchLevel: SuggestionSimilarity.sameKey - }, { - ...testSet.is, - matchLevel: SuggestionSimilarity.none - }, { - ...testSet.is_not, - matchLevel: SuggestionSimilarity.none - } - ]; + const expectation: IntermediateCompositedPrediction[] = [...Object.values(testSet)]; + expectation[0].metadata.matchLevel = SuggestionSimilarity.exact; // its + 
expectation[1].metadata.matchLevel = SuggestionSimilarity.sameKey; // it_is + expectation[2].metadata.matchLevel = SuggestionSimilarity.none; // is + expectation[3].metadata.matchLevel = SuggestionSimilarity.none; // is_not const its = testSet.its; const original_its = deepCopy(its); - const keep_its = toAnnotatedSuggestion(testModelWithCasing, original_its.prediction.sample, 'keep', QuoteBehavior.noQuotes); + const keep_its = toAnnotatedSuggestion(testModelWithCasing, original_its.components.prediction, 'keep', QuoteBehavior.noQuotes); keep_its.matchesModel = true; processSimilarity(testModelWithCasing, distribution, context, trueInput); assert.sameDeepMembers(distribution, expectation); - assert.equal(its.prediction.sample.tag, 'keep'); - assert.deepEqual(its.prediction.sample, keep_its); + assert.equal(its.components.prediction.tag, 'keep'); + assert.deepEqual(its.components.prediction, keep_its); }); it(`selects contraction as 'more similar' than same-keyed non-contraction when context is contraction`, () => { @@ -257,32 +259,22 @@ describe('processSimilarity', () => { const testSet = build_its_is_set(); const distribution = [...Object.values(testSet)]; - const expectation: CorrectionPredictionTuple[] = [ - { - ...testSet.its, - matchLevel: SuggestionSimilarity.sameKey - }, { - ...testSet.it_is, - matchLevel: SuggestionSimilarity.exact - }, { - ...testSet.is, - matchLevel: SuggestionSimilarity.none - }, { - ...testSet.is_not, - matchLevel: SuggestionSimilarity.none - } - ]; + const expectation: IntermediateCompositedPrediction[] = [...Object.values(testSet)]; + expectation[0].metadata.matchLevel = SuggestionSimilarity.sameKey; // its + expectation[1].metadata.matchLevel = SuggestionSimilarity.exact; // it_is + expectation[2].metadata.matchLevel = SuggestionSimilarity.none; // is + expectation[3].metadata.matchLevel = SuggestionSimilarity.none; // is_not const it_is = testSet.it_is; const original_it_is = deepCopy(it_is); - const keep_it_is = 
toAnnotatedSuggestion(testModelWithCasing, original_it_is.prediction.sample, 'keep', QuoteBehavior.noQuotes); + const keep_it_is = toAnnotatedSuggestion(testModelWithCasing, original_it_is.components.prediction, 'keep', QuoteBehavior.noQuotes); keep_it_is.matchesModel = true; processSimilarity(testModelWithCasing, distribution, context, trueInput); assert.sameDeepMembers(distribution, expectation); - assert.equal(it_is.prediction.sample.tag, 'keep'); - assert.deepEqual(it_is.prediction.sample, keep_it_is); + assert.equal(it_is.components.prediction.tag, 'keep'); + assert.deepEqual(it_is.components.prediction, keep_it_is); }); describe('with casing', () => { @@ -314,34 +306,22 @@ describe('processSimilarity', () => { // Have the predictions replace existing context parts with the lowercased equivalents. Object.values(testSet).forEach((entry) => { - const transform = entry.prediction.sample.transform; + const transform = entry.components.prediction.transform; transform.insert = transform.deleteLeft == 0 ? `it${transform.insert}` : `i${transform.insert}`; transform.deleteLeft = 2; }); const distribution = [...Object.values(testSet)]; - const expectation: CorrectionPredictionTuple[] = [ - { - ...testSet.its, - matchLevel: SuggestionSimilarity.sameKey - }, { - ...testSet.it_is, - // case mismatch, detectable because we have access to a lowercasing/uppercasing function. 
- matchLevel: SuggestionSimilarity.sameText - }, { - ...testSet.is, - matchLevel: SuggestionSimilarity.none - }, { - ...testSet.is_not, - matchLevel: SuggestionSimilarity.none - } - ]; - + const expectation: IntermediateCompositedPrediction[] = [...Object.values(testSet)]; + expectation[0].metadata.matchLevel = SuggestionSimilarity.sameKey; // its + expectation[1].metadata.matchLevel = SuggestionSimilarity.sameText; // it_is + expectation[2].metadata.matchLevel = SuggestionSimilarity.none; // is + expectation[3].metadata.matchLevel = SuggestionSimilarity.none; // is_not processSimilarity(testModelWithCasing, distribution, context, trueInput); // Because we mucked with the casing here, there is no perfect 'keep' match. - const keep = distribution.find((entry) => entry.prediction.sample.tag == 'keep'); + const keep = distribution.find((entry) => entry.components.prediction.tag == 'keep'); assert.isNotOk(keep); assert.sameDeepMembers(distribution, expectation); }); @@ -368,34 +348,20 @@ describe('processSimilarity', () => { // Have the predictions replace existing context parts with the lowercased equivalents. Object.values(testSet).forEach((entry) => { - const transform = entry.prediction.sample.transform; + const transform = entry.components.prediction.transform; transform.insert = transform.deleteLeft == 0 ? `it${transform.insert}` : `i${transform.insert}`; transform.deleteLeft = 2; }); const distribution = [...Object.values(testSet)]; - const expectation: CorrectionPredictionTuple[] = [ - { - ...testSet.its, - matchLevel: SuggestionSimilarity.none - }, { - ...testSet.it_is, - // case mismatch, detectable because we have access to a lowercasing/uppercasing function. 
- matchLevel: SuggestionSimilarity.none - }, { - ...testSet.is, - matchLevel: SuggestionSimilarity.none - }, { - ...testSet.is_not, - matchLevel: SuggestionSimilarity.none - } - ]; + const expectation: IntermediateCompositedPrediction[] = [...Object.values(testSet)]; + expectation.forEach((entry) => entry.metadata.matchLevel = SuggestionSimilarity.none); processSimilarity(testModelWithoutCasing, distribution, context, trueInput); // Because we mucked with the casing here, there is no perfect 'keep' match. - const keep = distribution.find((entry) => entry.prediction.sample.tag == 'keep'); + const keep = distribution.find((entry) => entry.components.prediction.tag == 'keep'); assert.isNotOk(keep); assert.sameDeepMembers(distribution, expectation); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts index 9b9ab2c3121..4bac59cafc4 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-custom-punctuation.tests.ts @@ -81,6 +81,20 @@ describe('Custom Punctuation', function () { open: "'", close: "'" } + }, + // Some of the suggestions above actually wordbreak differently from + // what might be expected. So, we override the wordbreaker to ensure + // the tests run smoothly. 
+ wordbreaker: (text) => { + const textLen = text.length; + if(text.charAt(textLen - 1) == " ") { + return [ + {text: text.substring(0, textLen-1), start: 0, end: textLen-1, length: textLen-1}, + {text: text.substring(textLen-1), start: textLen-1, end: textLen, length: 1} + ]; + } else { + return [{text, start: 0, end: textLen, length: textLen}]; + } } }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-model-compositor.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-model-compositor.tests.ts index 928b6e75c47..8c20df9626e 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-model-compositor.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/worker-model-compositor.tests.ts @@ -869,6 +869,9 @@ describe('ModelCompositor', function() { deleteLeft: 1 } + // Future adjustment: add the 'baseSuggestion' to DummyModel so that it actually + // returns the suggestion again. + // `new models.DummyModel(..., futureSuggestions: [[baseSuggestion]])` let model = new models.DummyModel({punctuation: englishPunctuation}); let compositor = new ModelCompositor(model, true); @@ -883,6 +886,7 @@ describe('ModelCompositor', function() { // As this test is a bit... 'hard-wired', we only get the 'keep' suggestion. // It should still be accurate, though. + // Can be fixed via the "Future adjustment" noted above. 
assert.equal(suggestions.length, 1); let expectedTransform = { From 61fc801f1480e805a7555b3c6aba2eeae9986ca8 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 12 May 2026 14:15:17 -0500 Subject: [PATCH 12/16] change(web): support multi-token suggestion similarity Build-bot: skip build:web Test-bot: skip --- .../src/main/model-compositor.ts | 2 +- .../worker-thread/src/main/predict-helpers.ts | 36 +++++++++---------- .../suggestion-similarity.tests.ts | 8 ++--- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts index 08e0f8b9f89..1d4893aab49 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts @@ -174,7 +174,7 @@ export class ModelCompositor { const deduplicatedSuggestionTuples = dedupeSuggestions(this.lexicalModel, rawPredictions, context); // Needs "casing" to be applied first. 
- const hasExistingKeep = processSimilarity(this.lexicalModel, deduplicatedSuggestionTuples, context, transformDistribution[0]); + const hasExistingKeep = processSimilarity(this.lexicalModel, deduplicatedSuggestionTuples, context, postContext); // If no existing suggestion directly matches the user-visible version of // the token, also add a 'keep' suggestion (with `.matchesModel = false`) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 177eef0a694..176c80e09f0 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -874,35 +874,33 @@ export function dedupeSuggestions( export function processSimilarity( lexicalModel: LexicalModel, suggestionDistribution: IntermediateCompositedPrediction[], - context: Context, - trueInput: ProbabilityMass + baseContext: Context, + finalContext: Context ): boolean { - const { sample: inputTransform } = trueInput; const wordbreak = determineModelWordbreaker(lexicalModel); - const postContext = models.applyTransform(inputTransform, context); - const truePrefix = wordbreak(postContext); - const keyed = (text: string) => lexicalModel.toKey ? lexicalModel.toKey(text) : text; const keyCased = (text: string) => lexicalModel.applyCasing ? lexicalModel.applyCasing('lower', text) : text; - const keyedPrefix = keyed(truePrefix); - const lowercasedPrefix = keyCased(truePrefix); + const keyedTarget = keyed(finalContext.left); + const lowercasedTarget = keyCased(finalContext.left); let keepOption: Outcome; - for(let tuple of suggestionDistribution) { - // Don't set it unnecessarily; this can have side-effects in some automated tests. 
- if(inputTransform.id !== undefined) { - tuple.components.prediction.transformId = inputTransform.id; - } + // If there are no suggestions found, we can't validate that the underlying + // correction was an empty token. + let allCorrectionsEmpty: boolean = suggestionDistribution.length > 0 + ? true + : wordbreak(finalContext) == ''; - const predictedWord = wordbreak(models.applyTransform(tuple.components.prediction.transform, context)); + for(let tuple of suggestionDistribution) { + const appliedContext = models.applyTransform(tuple.components.prediction.transform, baseContext); + allCorrectionsEmpty &&= tuple.components.correction == ''; // Is the suggestion an exact match (or, "similar enough") to the // actually-typed context? If so, we wish to note this fact and to // prioritize such a suggestion over suggestions that are not. - if(keyed(tuple.components.correction) == keyedPrefix) { - if(predictedWord == truePrefix) { + if(keyed(tuple.components.correction) == keyedTarget) { + if(appliedContext.left == finalContext.left) { // Exact match: it's a perfect 'keep' suggestion. tuple.metadata.matchLevel = SuggestionSimilarity.exact; keepOption = toAnnotatedSuggestion(lexicalModel, tuple.components.prediction, 'keep', models.QuoteBehavior.noQuotes); @@ -914,10 +912,10 @@ export function processSimilarity( keepOption.matchesModel = true; Object.assign(tuple.components.prediction, keepOption); keepOption = tuple.components.prediction as Outcome; - } else if(keyCased(predictedWord) == lowercasedPrefix) { + } else if(keyCased(appliedContext.left) == lowercasedTarget) { // Case-insensitive match. No diacritic differences; the ONLY difference is casing. tuple.metadata.matchLevel = SuggestionSimilarity.sameText; - } else if(keyed(predictedWord) == keyedPrefix) { + } else if(keyed(appliedContext.left) == keyedTarget) { // Diacritic-insensitive / exact-key match. 
tuple.metadata.matchLevel = SuggestionSimilarity.sameKey; } else { @@ -932,7 +930,7 @@ export function processSimilarity( // // No actual 'keep' needed if the current context token is empty, so we say we // have a 'keep' for that case, even though there isn't really one. - return !!(keepOption || truePrefix == ''); + return !!(keepOption || allCorrectionsEmpty); } /** diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts index e4cbfc81b1d..a485fecd053 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/suggestion-similarity.tests.ts @@ -233,7 +233,7 @@ describe('processSimilarity', () => { const keep_its = toAnnotatedSuggestion(testModelWithCasing, original_its.components.prediction, 'keep', QuoteBehavior.noQuotes); keep_its.matchesModel = true; - processSimilarity(testModelWithCasing, distribution, context, trueInput); + processSimilarity(testModelWithCasing, distribution, context, models.applyTransform(trueInput.sample, context)); assert.sameDeepMembers(distribution, expectation); assert.equal(its.components.prediction.tag, 'keep'); @@ -270,7 +270,7 @@ describe('processSimilarity', () => { const keep_it_is = toAnnotatedSuggestion(testModelWithCasing, original_it_is.components.prediction, 'keep', QuoteBehavior.noQuotes); keep_it_is.matchesModel = true; - processSimilarity(testModelWithCasing, distribution, context, trueInput); + processSimilarity(testModelWithCasing, distribution, context, models.applyTransform(trueInput.sample, context)); assert.sameDeepMembers(distribution, expectation); assert.equal(it_is.components.prediction.tag, 'keep'); @@ -318,7 +318,7 @@ describe('processSimilarity', () => { 
expectation[1].metadata.matchLevel = SuggestionSimilarity.sameText; // it_is expectation[2].metadata.matchLevel = SuggestionSimilarity.none; // is expectation[3].metadata.matchLevel = SuggestionSimilarity.none; // is_not - processSimilarity(testModelWithCasing, distribution, context, trueInput); + processSimilarity(testModelWithCasing, distribution, context, models.applyTransform(trueInput.sample, context)); // Because we mucked with the casing here, there is no perfect 'keep' match. const keep = distribution.find((entry) => entry.components.prediction.tag == 'keep'); @@ -358,7 +358,7 @@ describe('processSimilarity', () => { const expectation: IntermediateCompositedPrediction[] = [...Object.values(testSet)]; expectation.forEach((entry) => entry.metadata.matchLevel = SuggestionSimilarity.none); - processSimilarity(testModelWithoutCasing, distribution, context, trueInput); + processSimilarity(testModelWithoutCasing, distribution, context, models.applyTransform(trueInput.sample, context)); // Because we mucked with the casing here, there is no perfect 'keep' match. const keep = distribution.find((entry) => entry.components.prediction.tag == 'keep'); From ff53eed4a2295242f018127ef286bcf2d291a356 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Mon, 4 May 2026 14:39:18 -0500 Subject: [PATCH 13/16] change(web): add tokenized prediction intermediate type for whitespace correction support Converts early uses of CompositedPredictionData to TokenizedPredictionData to facilitate important token-based aspects of whitespace correction support, such as case-handling. 
Build-bot: skip build:web Test-bot: skip --- .../src/main/model-compositor.ts | 35 +-- .../worker-thread/src/main/predict-helpers.ts | 206 +++++++++++++----- 2 files changed, 158 insertions(+), 83 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts index 1d4893aab49..71c7704c144 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts @@ -1,15 +1,13 @@ import * as models from '@keymanapp/models-templates'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { applySuggestionCasing, correctAndEnumerate, createDefaultKeep, dedupeSuggestions, finalizeSuggestions, predictionAutoSelect, processSimilarity, toAnnotatedSuggestion, tupleDisplayOrderSort } from './predict-helpers.js'; -import { detectCurrentCasing, determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; -import TransformUtils from './transformUtils.js'; - import * as correction from './correction/index.js' +import { applySuggestionCasing, compositeIntermediatePredictions, correctAndEnumerate, createDefaultKeep, dedupeSuggestions, finalizeSuggestions, predictionAutoSelect, processSimilarity, toAnnotatedSuggestion, tupleDisplayOrderSort } from './predict-helpers.js'; +import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; + import { ContextTracker } from './correction/context-tracker.js'; import { DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL } from './correction/distance-modeler.js'; -import CasingForm = LexicalModelTypes.CasingForm; import Configuration = LexicalModelTypes.Configuration; import Context = LexicalModelTypes.Context; import Distribution = LexicalModelTypes.Distribution; @@ -125,24 +123,6 @@ export class ModelCompositor { const 
transformId = inputTransform.id; this.initContextTracker(context, transformId); - const allowBksp = TransformUtils.isBackspace(inputTransform); - const allowWhitespace = TransformUtils.isWhitespace(inputTransform); - - const postContext = models.applyTransform(inputTransform, context); - - // TODO: It would be best for the correctAndEnumerate method to return the - // suggestion's prefix, as it already has lots of logic oriented to this. - // The context-tracker used there with v14+ models can determine this more - // robustly. - const truePrefix = this.wordbreak(postContext); - // Only use of `truePrefix`. - const basePrefix = (allowBksp || allowWhitespace) ? truePrefix : this.wordbreak(context); - - // Used to restore whitespaces if operations would remove them. - const currentCasing: CasingForm = lexicalModel.languageUsesCasing - ? detectCurrentCasing(lexicalModel, postContext) - : null; - // Section 1: determine 'prediction roots' - enumerate corrections from most to least likely, // searching for results that yield viable predictions from the model. @@ -160,9 +140,9 @@ export class ModelCompositor { // Properly capitalizes the suggestions based on the existing context casing state. // This may result in duplicates if multiple casing options exist within the // lexicon for a word. (Example: "Apple" the company vs "apple" the fruit.) - for(let tuple of rawPredictions) { - if(currentCasing && currentCasing != 'lower') { - applySuggestionCasing(tuple.components.prediction, basePrefix, this.lexicalModel, currentCasing); + if(lexicalModel.languageUsesCasing) { + for(let tuple of rawPredictions) { + tuple.components.forEach((component) => applySuggestionCasing(component, this.lexicalModel)); } } @@ -171,9 +151,10 @@ export class ModelCompositor { // We want to dedupe before trimming the list so that we can present a full set // of viable distinct suggestions if available. 
- const deduplicatedSuggestionTuples = dedupeSuggestions(this.lexicalModel, rawPredictions, context); + const deduplicatedSuggestionTuples = dedupeSuggestions(this.lexicalModel, compositeIntermediatePredictions(rawPredictions), context); // Needs "casing" to be applied first. + const postContext = postContextState?.context ?? models.applyTransform(inputTransform, context); const hasExistingKeep = processSimilarity(this.lexicalModel, deduplicatedSuggestionTuples, context, postContext); // If no existing suggestion directly matches the user-visible version of diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 176c80e09f0..567166264fc 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -4,7 +4,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types'; import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbreakers'; import TransformUtils from './transformUtils.js'; -import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; +import { detectCurrentCasing, determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; import { ContextTokenization, mapWhitespacedTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; import { ContextToken } from './correction/context-token.js'; @@ -76,6 +76,24 @@ export const CORRECTION_SEARCH_THRESHOLDS = { REPLACEMENT_SEARCH_THRESHOLD: 4 as const // e^-4 = 0.0183156388. Allows "80%" of an extra edit. 
} +export interface TokenizedPredictionData { + /** + * The potential Suggestion + */ + prediction: Suggestion, + /** + * The correction upon which the Suggestion is based + */ + correction: string, + /** + * The ContextToken underlying the correction/prediction. + * + * May be undefined, especially for models that do not leverage the + * LexiconTraversal pattern. + */ + source: ContextToken // useful for getting the unkeyed, original version of the text (in model-compositor, where casing is applied) +} + export interface CompositedPredictionData { /** * The potential Suggestion (or Keep) @@ -138,6 +156,19 @@ export interface PredictionMetadata { preservationTransform?: Transform; } +export interface IntermediateTokenizedPrediction { + /** + * Contains the tokenized components to be used to construct a full + * predictive-text Suggestion, as well as data about the source for each + * component. + */ + components: TokenizedPredictionData[]; + /** + * Tracks common intermediate prediction data, such as its underlying probabilities and its similarity to the actual context. + */ + metadata: PredictionMetadata; +} + export interface IntermediateCompositedPrediction { /** * Contains the fully composited predictive-text Suggestion and its underlying correction string. 
@@ -149,7 +180,7 @@ export interface IntermediateCompositedPrediction { metadata: PredictionMetadata; } -type IntermediatePrediction = IntermediateCompositedPrediction; +type IntermediatePrediction = IntermediateCompositedPrediction | IntermediateTokenizedPrediction; /** * An enum to be used when categorizing the level of similarity between @@ -363,7 +394,7 @@ export function determineSuggestionRange( export function buildAndMapPredictions( transition: ContextTransition, tokenizationCorrection: TokenizationResultMapping, -): IntermediateCompositedPrediction[] { +): IntermediateTokenizedPrediction[] { const model = transition.final.model; const tokenization = tokenizationCorrection.matchingSpace.tokenization; @@ -451,21 +482,28 @@ export function buildAndMapPredictions( // rather than predicting (and possibly extending) tokens not adjacent to the caret. // // Also, fall back to the actual correction string should prediction not be valid here. - return i == correctionTransforms.length - 1 ? predictions : [predictions[0]]; + const predictionsToReturn = i == correctionTransforms.length - 1 ? predictions : [predictions[0]]; + + return predictionsToReturn.map((prediction) => { + return { + prediction, + correction: correctionTransform.insert + }; + }); }); // Constructs a common prefix for all but the final token's component. 
const predictionPrefix = predictionComponents .slice(0, predictionComponents.length-1) - .reduce((accum, curr) => models.buildMergedTransform(accum, curr[0].sample.transform), { insert: '', deleteLeft: 0 }); + .map((arr) => arr[0]); const prefixProb = predictionComponents .slice(0, predictionComponents.length-1) - .reduce((accum, curr) => accum * curr[0].p, 1) + .reduce((accum, curr) => accum * curr[0].prediction.p, 1) - const completePredictionTuples: IntermediateCompositedPrediction[] = predictionComponents[predictionComponents.length-1].map((prediction) => { - const predictionCost = prediction.p * prefixProb; + const completePredictionTuples: IntermediateTokenizedPrediction[] = predictionComponents[predictionComponents.length-1].map((tuple) => { + const predictionCost = tuple.prediction.p * prefixProb; - return { + const returnVal: IntermediateTokenizedPrediction = { // Will need to do this differently. We want to have each component // individualized b/c casing. Case should be maintained for prior tokens // and managed independently for each. @@ -476,23 +514,15 @@ export function buildAndMapPredictions( // applySuggestionCasing applies onto suggestions, so we'll want to build // the FULL suggestion AFTER applying casing changes (to each token's // suggestion component). - components: { - prediction: { - transformId: transition.transitionId, - transform: models.buildMergedTransform(predictionPrefix, prediction.sample.transform), - displayAs: models.buildMergedTransform(predictionPrefix, prediction.sample.transform).insert // should composite the displayAs strings instead... 
- }, - correction: correctionTransforms[correctionTransforms.length-1].insert - }, + components: [], metadata: { probabilities: { prediction: predictionCost, correction: correctionCost, total: predictionCost * correctionCost }, - matchLevel: SuggestionSimilarity.none, autoSelectable: tokenizationCorrection.matchingSpace.modelsCorrectables, - + matchLevel: SuggestionSimilarity.none, // Long-term, we shouldn't have `.preservationTransform` here. // // Needed for now until the search actually operates based on @@ -501,6 +531,21 @@ export function buildAndMapPredictions( preservationTransform: tokenization.taillessTrueKeystroke } } + + // Iteratively add the components into the return value here. + const orderedTokens = tokenizationCorrection.matchingSpace.orderedTokens; + const reportTokenizedPrediction = (tuple: typeof predictionPrefix[0], index: number) => { + returnVal.components.push({ + prediction: tuple.prediction.sample, + correction: tuple.correction, + source: orderedTokens[index] + }); + }; + // Also gets the (changing) tail portion. + predictionPrefix.forEach((tuple, index) => reportTokenizedPrediction(tuple, index)); + reportTokenizedPrediction(tuple, orderedTokens.length - 1); + + return returnVal; }); return completePredictionTuples; @@ -565,7 +610,7 @@ export async function correctAndEnumerate( /** * The suggestions generated based on the user's input state. */ - rawPredictions: IntermediateCompositedPrediction[]; + rawPredictions: IntermediateTokenizedPrediction[]; /** * The id of a prior ContextTransition event that triggered a Suggestion found @@ -617,7 +662,7 @@ export async function correctAndEnumerate( const searchModules = tokenizations.map(t => t.tail.searchModule); // Only run the correction search when corrections are enabled. 
- let rawPredictions: IntermediateCompositedPrediction[] = []; + let rawPredictions: IntermediateTokenizedPrediction[] = []; let bestCorrectionCost: number; for await(const match of getBestTokenMatches(searchModules, timer)) { // Corrections obtained: now to predict from them! @@ -666,7 +711,7 @@ export async function correctAndEnumerate( export function shouldStopSearchingEarly( bestCorrectionCost: number, currentCorrectionCost: number, - rawPredictions: IntermediateCompositedPrediction[] + rawPredictions: IntermediateTokenizedPrediction[] ) { if(currentCorrectionCost >= bestCorrectionCost + CORRECTION_SEARCH_THRESHOLDS.MAX_SEARCH_THRESHOLD) { return true; @@ -707,9 +752,8 @@ export function correctAndEnumerateWithoutTraversals( lexicalModel: LexicalModel, corrections: ProbabilityMass[], context: Context -): IntermediateCompositedPrediction[] { - let returnedPredictions: IntermediateCompositedPrediction[] = []; - +): IntermediateTokenizedPrediction[] { + let returnedPredictions: IntermediateTokenizedPrediction[] = []; const wordbreak = determineModelWordbreaker(lexicalModel); const tokenizer = determineModelTokenizer(lexicalModel); @@ -720,13 +764,28 @@ export function correctAndEnumerateWithoutTraversals( // support, though. const tokenizedCorrection = mapWhitespacedTokenization(tokenization.left.map((t) => { return {exampleInput: t.text} }), lexicalModel, correction.sample).tokenizedTransform; - const deleteLeft = [...tokenizedCorrection.values()].reduce((total, curr) => total + curr.deleteLeft, 0); + const deleteLeft = tokenization.left.length > 1 ? 0 : tokenization.left.reduce((prev, curr) => prev + KMWString.length(curr.text), 0); + + const intermediateTokens: TokenizedPredictionData[] = []; + [...tokenizedCorrection.entries()].forEach((entry, index) => { + let dl = index == 0 ? 
deleteLeft: 0; + let text: string; + + if(index != 0) { + text = entry[1].insert; + } else { + text = wordbreak(models.applyTransform(entry[1], context)); + } - const tokenizedCorrectionEntries = [...tokenizedCorrection.entries()]; - const preservationTransform = tokenizedCorrectionEntries.slice(0, -1).map((e) => e[1]).reduce((accum, curr) => { - return models.buildMergedTransform(accum, {...curr, deleteLeft: 0}); - }, { insert: '', deleteLeft: 0, id: correction.sample.id}); - preservationTransform.deleteLeft = deleteLeft; + intermediateTokens.push({ + prediction: { + transform: { insert: text, deleteLeft: dl }, + displayAs: text + }, + correction: text, + source: null + }) + }); // Step 2: predict based on the final token. const emptyContext: Context = { @@ -735,32 +794,28 @@ export function correctAndEnumerateWithoutTraversals( endOfBuffer: true }; - const tailCorrection = tokenizedCorrectionEntries[tokenizedCorrectionEntries.length-1][1]; + const tailCorrection = { insert: intermediateTokens[intermediateTokens.length-1].correction, deleteLeft: 0}; let predictions = lexicalModel.predict(tailCorrection, emptyContext); // Step 3: create the intermediate prediction data entries for each generated prediction let predictionSet = predictions.map((pair: ProbabilityMass) => { + + // Overwrite the last entry with the prediction. + const components = [...intermediateTokens]; + + components[components.length - 1] = { + ...components[components.length - 1], + prediction: pair.sample + }; + // Let's not rely on the model to copy transform IDs. // Only bother is there IS an ID to copy. if(correction.sample.id !== undefined) { - pair.sample.transformId = correction.sample.id; - } - - let correctionText: string; - if(tokenizedCorrectionEntries.length != 1) { - correctionText = correction.sample.insert; - // deleteLeft: 0; it's pre-applied within preservationTransform. - } else { - // Use the deleteLeft & tokenize. 
- const postContext = models.applyTransform(correction.sample, context); - correctionText = wordbreak(postContext); + components.forEach((c) => c.prediction.transformId = correction.sample.id); } - let tuple: IntermediateCompositedPrediction = { - components: { - prediction: pair.sample, - correction: correctionText - }, + let tuple: IntermediateTokenizedPrediction = { + components, metadata: { probabilities: { prediction: pair.p, @@ -768,8 +823,7 @@ export function correctAndEnumerateWithoutTraversals( total: pair.p * correction.p }, autoSelectable: correctionValidForAutoSelect(tailCorrection.insert), - matchLevel: SuggestionSimilarity.none, - preservationTransform + matchLevel: SuggestionSimilarity.none } }; return tuple; @@ -789,20 +843,60 @@ export function correctAndEnumerateWithoutTraversals( * @param lexicalModel * @param casingForm */ -export function applySuggestionCasing(suggestion: Suggestion, baseWord: string, lexicalModel: LexicalModel, casingForm: CasingForm) { - // Step 1: does the suggestion replace the whole word? If not, we should extend the suggestion to do so. - let unchangedLength = KMWString.length(baseWord) - suggestion.transform.deleteLeft; +export function applySuggestionCasing(predictionToken: TokenizedPredictionData, lexicalModel: LexicalModel) { + const suggestion = predictionToken.prediction; + + // Step 0: our pattern for generating predictions and corrections already + // enforces them to encompass the whole word. - if(unchangedLength > 0) { - suggestion.transform.deleteLeft += unchangedLength; - suggestion.transform.insert = KMWString.substr(baseWord, 0, unchangedLength) + suggestion.transform.insert; + // Step 1: detect the original token's casing + let casingForm: CasingForm; + + // If we are using the context-tracking engine (when traversals are enabled), + // we just leverage the context token's exampleInput to determine casing. 
+ // + // If it's not available, the correction entry reflects a word-broken piece of + // the original context, with its original casing - so we use that instead. + let casingRoot = predictionToken.source ? predictionToken.source.exampleInput : predictionToken.correction; + if(!casingRoot) { + // There's no text in place to verify casing expectations; just leave it + // unchanged. + return; } + casingForm = detectCurrentCasing(lexicalModel, { + left: casingRoot, + startOfBuffer: true, + endOfBuffer: true + }); + // Step 2: Now that the transform affects the whole word, we may safely apply casing rules. suggestion.transform.insert = lexicalModel.applyCasing(casingForm, suggestion.transform.insert); suggestion.displayAs = lexicalModel.applyCasing(casingForm, suggestion.displayAs); } +export function compositeIntermediatePredictions(predictions: IntermediateTokenizedPrediction[]): IntermediateCompositedPrediction[] { + return predictions.map((predictionData) => { + const components = predictionData.components; + + return { + components: components.reduce((total, current) => { + const mergedTransform = models.buildMergedTransform(total.prediction.transform, current.prediction.transform); + const mergedDisplayAs = total.prediction.displayAs + current.prediction.displayAs + + return { + prediction: {...total.prediction, transform: mergedTransform, displayAs: mergedDisplayAs}, + correction: total.correction + current.correction + } + }, { + prediction: {...components[0].prediction, transform: { insert: '', deleteLeft: 0 }, displayAs: ''}, + correction: '' + }), + metadata: predictionData.metadata + }; + }); +} + /** * Given an array of suggestions output from the correction and model-lookup processes, * this function checks for any duplicate suggestions and merges them. 
From d79d54afc33bbbb4edcb3bd91ab444c609e40afc Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Mon, 11 May 2026 16:24:59 -0500 Subject: [PATCH 14/16] fix(web): adjusts existing unit tests to match new intermediate-prediction-data format --- .../early-correction-search-stopping.tests.ts | 14 +- .../predict-from-corrections.tests.ts | 17 +- .../worker-thread/suggestion-casing.tests.ts | 208 ++++++++++-------- 3 files changed, 131 insertions(+), 108 deletions(-) diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts index 430d9c6c7e0..9595f15527a 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/early-correction-search-stopping.tests.ts @@ -1,15 +1,15 @@ import { assert } from 'chai'; -import { CORRECTION_SEARCH_THRESHOLDS, IntermediateCompositedPrediction, ModelCompositor, shouldStopSearchingEarly } from "@keymanapp/lm-worker/test-index"; +import { CORRECTION_SEARCH_THRESHOLDS, IntermediateTokenizedPrediction, ModelCompositor, shouldStopSearchingEarly } from "@keymanapp/lm-worker/test-index"; -function mockIntermediatePrediction(value: number) { +function mockTokenizedPrediction(value: number) { return { metadata: { probabilities: { total: value } } - } as IntermediateCompositedPrediction + } as IntermediateTokenizedPrediction } describe('correction-search: shouldStopSearchingEarly', () => { @@ -22,7 +22,7 @@ describe('correction-search: shouldStopSearchingEarly', () => { assert.equal(predictionProbs.length, ModelCompositor.MAX_SUGGESTIONS, "test setup no longer valid"); // The only part for each entry we actually care about here: .totalProb. 
- const predictions = predictionProbs.map((entry) => mockIntermediatePrediction(entry)); + const predictions = predictionProbs.map((entry) => mockTokenizedPrediction(entry)); // Thresholding is performed in log-space. // 0.0501 and 0.0499 are offset on each side of 0.05, the last value in the array defined above. @@ -38,8 +38,8 @@ describe('correction-search: shouldStopSearchingEarly', () => { // // Can technically run the method with an empty array, but the actual scenario would have // at least one prediction present in the "found predictions" array. - assert.isFalse(shouldStopSearchingEarly(baseCost, baseCost + expectedThreshold - 0.01, [mockIntermediatePrediction(Math.exp(-1))])); - assert.isTrue(shouldStopSearchingEarly( baseCost, baseCost + expectedThreshold + 0.01, [mockIntermediatePrediction(Math.exp(-1))])); + assert.isFalse(shouldStopSearchingEarly(baseCost, baseCost + expectedThreshold - 0.01, [mockTokenizedPrediction(Math.exp(-1))])); + assert.isTrue(shouldStopSearchingEarly( baseCost, baseCost + expectedThreshold + 0.01, [mockTokenizedPrediction(Math.exp(-1))])); }); it('stops checking corrections earlier when enough predictions have been found', () => { @@ -48,7 +48,7 @@ describe('correction-search: shouldStopSearchingEarly', () => { // The only part for each entry we actually care about here: .totalProb. 
/** @type {import('#./predict-helpers.js').CorrectionPredictionTuple[]} */ - const predictions = predictionProbs.map((entry) => mockIntermediatePrediction(entry)); + const predictions = predictionProbs.map((entry) => mockTokenizedPrediction(entry)); const baseCost = 1; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts index 8234c6ba2a9..d18a2d92e43 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-corrections.tests.ts @@ -114,11 +114,12 @@ describe('correctAndEnumerateWithoutTraversals', () => { const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); - predictions.forEach((entry) => assert.equal(entry.components.correction, 'Its')); + predictions.forEach((entry) => assert.equal(entry.components.length, 1)); + predictions.forEach((entry) => assert.equal(entry.components[0].correction, 'Its')); predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, 0.6)); predictions.sort(tupleDisplayOrderSort); - assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions); + assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components[0].prediction), dummied_suggestions); assert.approximately(predictions[0].metadata.probabilities.total, 0.18 * 0.6, 0.00001); assert.approximately(predictions[1].metadata.probabilities.total, 0.02 * 0.6, 0.00001); @@ -167,12 +168,13 @@ describe('correctAndEnumerateWithoutTraversals', () => { const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); - predictions.forEach((entry) => 
assert.equal(entry.components.correction, 'Its')); + predictions.forEach((entry) => assert.equal(entry.components.length, 1)); + predictions.forEach((entry) => assert.equal(entry.components[0].correction, 'Its')); predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, 0.6)); predictions.sort(tupleDisplayOrderSort); - assert.sameOrderedMembers(predictions.map((entry) => entry.components.prediction.displayAs), ["it's", "its"]); - assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions.map((entry) => { + assert.sameOrderedMembers(predictions.map((entry) => entry.components[0].prediction.displayAs), ["it's", "its"]); + assert.sameDeepOrderedMembers(predictions.map((entry) => entry.components[0].prediction), dummied_suggestions.map((entry) => { entry = deepCopy(entry); entry.transformId = 314159; return entry; @@ -252,8 +254,9 @@ describe('correctAndEnumerateWithoutTraversals', () => { const predictions = correctAndEnumerateWithoutTraversals(model, correctionDistribution, context); predictions.sort(tupleDisplayOrderSort); - assert.sameOrderedMembers(predictions.map((entry) => entry.components.prediction.displayAs), ["is", "it's", "isn't", "its"]); - assert.sameDeepMembers(predictions.map((entry) => entry.components.prediction), dummied_suggestions.flatMap((entry) => entry)); + predictions.forEach((entry) => assert.equal(entry.components.length, 1)); + assert.sameOrderedMembers(predictions.map((entry) => entry.components[0].prediction.displayAs), ["is", "it's", "isn't", "its"]); + assert.sameDeepMembers(predictions.map((entry) => entry.components[0].prediction), dummied_suggestions.flatMap((entry) => entry)); assert.approximately(predictions[0].metadata.probabilities.total, 0.4 * 0.4, 0.00001); assert.approximately(predictions[1].metadata.probabilities.total, 0.18 * 0.6, 0.00001); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/suggestion-casing.tests.ts 
b/web/src/test/auto/headless/engine/predictive-text/worker-thread/suggestion-casing.tests.ts index dd586eab646..7de4abac395 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/suggestion-casing.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/suggestion-casing.tests.ts @@ -13,7 +13,7 @@ import * as wordBreakers from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { applySuggestionCasing, models } from '@keymanapp/lm-worker/test-index'; +import { TokenizedPredictionData, applySuggestionCasing, models } from '@keymanapp/lm-worker/test-index'; import CasingFunction = LexicalModelTypes.CasingFunction; import TrieModel = models.TrieModel; @@ -45,117 +45,137 @@ describe('applySuggestionCasing', function() { ); it('properly cases suggestions with no suggestion root', function() { - let suggestion = { - transform: { - insert: 'the', - deleteLeft: 0 + let suggestion: TokenizedPredictionData[] = [{ + prediction: { + transform: { + insert: 'the', + deleteLeft: 0 + }, + displayAs: 'the' }, - displayAs: 'the' - }; - - applySuggestionCasing(suggestion, '', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'The'); - assert.equal(suggestion.transform.insert, 'The'); - - suggestion = { - transform: { - insert: 'thE', - deleteLeft: 0 - }, - displayAs: 'thE' - }; - - applySuggestionCasing(suggestion, '', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'ThE'); - assert.equal(suggestion.transform.insert, 'ThE'); - - suggestion = { - transform: { - insert: 'the', - deleteLeft: 0 + correction: '', + source: null + }]; + + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'the'); + assert.equal(suggestion[0].prediction.transform.insert, 'the'); + + suggestion = [{ + prediction: { + transform: { + insert: 'ThE', 
+ deleteLeft: 0 + }, + displayAs: 'ThE' }, - displayAs: 'the' - }; + correction: '', + source: null + }]; - applySuggestionCasing(suggestion, '', plainCasedModel, 'upper'); - assert.equal(suggestion.displayAs, 'THE'); - assert.equal(suggestion.transform.insert, 'THE'); + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'ThE'); + assert.equal(suggestion[0].prediction.transform.insert, 'ThE'); }); it('properly cases suggestions that fully replace the suggestion root', function() { - let suggestion = { - transform: { - insert: 'therefore', - deleteLeft: 3 + let suggestion: TokenizedPredictionData[] = [{ + prediction: { + transform: { + insert: 'therefore', + deleteLeft: 3 + }, + displayAs: 'therefore' }, - displayAs: 'therefore' - }; - - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'Therefore'); - assert.equal(suggestion.transform.insert, 'Therefore'); - - suggestion = { - transform: { - insert: 'thereFore', - deleteLeft: 3 + correction: 'The', + source: null + }]; + + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'Therefore'); + assert.equal(suggestion[0].prediction.transform.insert, 'Therefore'); + + suggestion = [{ + prediction: { + transform: { + insert: 'thereFore', + deleteLeft: 3 + }, + displayAs: 'thereFore' }, - displayAs: 'thereFore' - }; - - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'ThereFore'); - assert.equal(suggestion.transform.insert, 'ThereFore'); - - suggestion = { - transform: { - insert: 'therefore', - deleteLeft: 3 + correction: 'The', + source: null + }]; + + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'ThereFore'); + assert.equal(suggestion[0].prediction.transform.insert, 'ThereFore'); + + suggestion = [{ + prediction: { + transform: { + insert: 
'therefore', + deleteLeft: 3 + }, + displayAs: 'therefore' }, - displayAs: 'therefore' - }; + correction: 'THE', + source: null + }]; - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'upper'); - assert.equal(suggestion.displayAs, 'THEREFORE'); - assert.equal(suggestion.transform.insert, 'THEREFORE'); + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'THEREFORE'); + assert.equal(suggestion[0].prediction.transform.insert, 'THEREFORE'); }); it('properly cases suggestions that do not fully replace the suggestion root', function() { - let suggestion = { - transform: { - insert: 'erefore', - deleteLeft: 1 + let suggestion: TokenizedPredictionData[] = [{ + prediction: { + transform: { + insert: 'therefore', + deleteLeft: 3 + }, + displayAs: 'therefore' }, - displayAs: 'therefore' - }; + correction: 'The', + source: null + }]; // When integrated, the 'the' string comes from a wordbreak operation on the current context. - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'Therefore'); - assert.equal(suggestion.transform.insert, 'Therefore'); - - suggestion = { - transform: { - insert: 'ereFore', - deleteLeft: 1 + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'Therefore'); + assert.equal(suggestion[0].prediction.transform.insert, 'Therefore'); + + suggestion = [{ + prediction: { + transform: { + insert: 'ThereFore', + deleteLeft: 3 + }, + displayAs: 'thereFore' }, - displayAs: 'thereFore' - }; - - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'initial'); - assert.equal(suggestion.displayAs, 'ThereFore'); - assert.equal(suggestion.transform.insert, 'ThereFore'); - - suggestion = { - transform: { - insert: 'erefore', - deleteLeft: 1 + correction: 'The', + source: null + }]; + + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 
'ThereFore'); + assert.equal(suggestion[0].prediction.transform.insert, 'ThereFore'); + + suggestion = [{ + prediction: { + transform: { + insert: 'therefore', + deleteLeft: 3 + }, + displayAs: 'therefore' }, - displayAs: 'therefore' - }; + correction: 'THE', + source: null + }]; - applySuggestionCasing(suggestion, 'the', plainCasedModel, 'upper'); - assert.equal(suggestion.displayAs, 'THEREFORE'); - assert.equal(suggestion.transform.insert, 'THEREFORE'); + applySuggestionCasing(suggestion[0], plainCasedModel); + assert.equal(suggestion[0].prediction.displayAs, 'THEREFORE'); + assert.equal(suggestion[0].prediction.transform.insert, 'THEREFORE'); }); }); \ No newline at end of file From b763609f46ce8af4a741a16f62150d6d3ec56210 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 13 May 2026 14:18:17 -0500 Subject: [PATCH 15/16] fix(web): apply original casing-application logic on a per-token basis --- .../worker-thread/src/main/predict-helpers.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 567166264fc..5f825e4b9d9 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -871,8 +871,10 @@ export function applySuggestionCasing(predictionToken: TokenizedPredictionData, }); // Step 2: Now that the transform affects the whole word, we may safely apply casing rules. 
- suggestion.transform.insert = lexicalModel.applyCasing(casingForm, suggestion.transform.insert); - suggestion.displayAs = lexicalModel.applyCasing(casingForm, suggestion.displayAs); + if(casingForm && casingForm != 'lower') { + suggestion.transform.insert = lexicalModel.applyCasing(casingForm, suggestion.transform.insert); + suggestion.displayAs = lexicalModel.applyCasing(casingForm, suggestion.displayAs); + } } export function compositeIntermediatePredictions(predictions: IntermediateTokenizedPrediction[]): IntermediateCompositedPrediction[] { From 1e505feac056c6b73e1ed12b5b51f427959417cb Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 13 May 2026 15:13:42 -0500 Subject: [PATCH 16/16] change(web): adjust TokenizationCorrector spec Build-bot: skip build:web Test-bot: skip --- .../main/correction/tokenization-corrector.ts | 65 ++++++++++++++----- .../correction/tokenization-result-mapping.ts | 20 ++++-- .../worker-thread/src/main/predict-helpers.ts | 18 +++-- .../tokenization-corrector.tests.ts | 52 +++++---------- 4 files changed, 92 insertions(+), 63 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index 885cdb0ed2b..93267f5e86b 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -14,9 +14,10 @@ import { ContextToken } from "./context-token.js"; import { CorrectionSearchable, PathResult } from "./correction-searchable.js"; import { ContextTokenization } from "./context-tokenization.js"; import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js"; -import { TokenizationResultMapping } from "./tokenization-result-mapping.js"; +import { TokenizationResult, TokenizationResultMapping } from "./tokenization-result-mapping.js"; import { 
EDIT_DISTANCE_COST_SCALE } from "./distance-modeler.js"; import { MAX_EDIT_THRESHOLD_FACTOR } from "./search-quotient-spur.js"; +import { TokenResultMapping } from "./token-result-mapping.js"; // PathResult needs to be generic: // - a result for correcting a single Token - "TokenResult"? @@ -46,7 +47,7 @@ export type TokenResult = { * all correctable tokens, generating corrections for the full represented * range. */ -export class TokenizationCorrector implements CorrectionSearchable, TokenizationResultMapping> { +export class TokenizationCorrector implements CorrectionSearchable { public readonly tokenization: ContextTokenization; private readonly tailCorrectionLength: number; @@ -56,6 +57,7 @@ export class TokenizationCorrector implements CorrectionSearchable; private _previousResults: TokenizationResultMapping[] = []; + private _correctableCodepointLength: number = 0; // fully private public readonly modelsCorrectables: boolean; @@ -65,6 +67,7 @@ export class TokenizationCorrector implements CorrectionSearchable { // New issue: this mangles the space IDs! We almost certainly need some // sort of proper map to the source token. const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1); this.tokenLookupMap.set(searchModule.spaceId, token); - const passesFilter = filterClosure(token); + // Index within the token subset being examined. + const passesFilter = filterClosure(token, index); modelsCorrectables ||= passesFilter; if(!passesFilter) { this._uncorrectables.push(searchModule); - } else if(index == tailCorrectionLength - 1) { + return; + } + + this.matchableTokenCount++; + this._correctableCodepointLength += searchModule.codepointLength; + if(index == tailCorrectionLength - 1) { // The sole assignment case for this field. It may only be assigned for // the final token, and only if its text is of a form considered // correctable by the filter. 
@@ -249,6 +259,10 @@ export class TokenizationCorrector implements CorrectionSearchable r instanceof TokenResultMapping).length; + } + // The actual method used to iteratively search for tokenization-level corrections. handleNextNode(): PathResult { // Notable states: @@ -272,11 +286,17 @@ export class TokenizationCorrector implements CorrectionSearchable 0) { + return { + 'type': 'complete', + cost: this.lastTotalCost, + mapping: results + }; + } else { + return { type: 'none' }; + } } } @@ -284,7 +304,6 @@ export class TokenizationCorrector implements CorrectionSearchable { if(correctableToUpdate != this._predictable) { // Lock the 'correctable' token now that either a valid correction for @@ -298,8 +317,12 @@ export class TokenizationCorrector implements CorrectionSearchable c == undefined) != -1) { + // If any token lacks a matching lookup value, abort. + if([...this.tokenLookupMap.keys()].find((k) => !this._generatedTokenResults.has(k))) { return { type: 'intermediate', cost: tokenizationCost }; } + const correctionResults = this.collateResults(); // Determine the proper return type and construct the proper return object accordingly. 
// @@ -373,11 +397,18 @@ export class TokenizationCorrector implements CorrectionSearchable 0) { + return { + type: 'complete', + cost: tokenizationCost, + mapping: correctionResults + }; + } else { + return { + type: 'none' + } + } } else { return { type: 'none' diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts index 32e0fb48fce..c6fd8db93f3 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts @@ -1,13 +1,23 @@ import { CorrectionResultMapping } from "./correction-result-mapping.js"; import { TokenizationCorrector, TokenResult } from './tokenization-corrector.js'; -export class TokenizationResultMapping implements CorrectionResultMapping> { +export interface TokenizationResult { + tokenCorrections: ReadonlyArray, + totalEditCount: number, + totalEditableCodepoints: number +} + +export class TokenizationResultMapping implements CorrectionResultMapping { readonly matchingSpace: TokenizationCorrector; - readonly matchedResult: ReadonlyArray; + readonly matchedResult: TokenizationResult; constructor(tokenization: TokenResult[], corrector: TokenizationCorrector) { this.matchingSpace = corrector; - this.matchedResult = tokenization; + this.matchedResult = { + tokenCorrections: tokenization, + totalEditCount: tokenization.reduce((accum, curr) => accum + curr.knownCost, 0), + totalEditableCodepoints: 0 //corrector. 
+ } } get spaceId(): number { @@ -22,7 +32,7 @@ export class TokenizationResultMapping implements CorrectionResultMapping accum + curr.knownCost, 0); // } // /** @@ -40,6 +50,6 @@ export class TokenizationResultMapping implements CorrectionResultMapping total + curr.totalCost, 0); + return this.matchedResult.tokenCorrections.reduce((total, curr) => total + curr.totalCost, 0); } } \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 5f825e4b9d9..171c3d675ff 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -410,7 +410,7 @@ export function buildAndMapPredictions( endOfBuffer: false }; - const correctionTransforms = tokenizationCorrection.matchedResult.map((correction, i) => { + const correctionTransforms = tokenizationCorrection.matchedResult.tokenCorrections.map((correction, i) => { return { insert: correction.matchString, // insert correction string deleteLeft: i == 0 ? 
deleteLeft : 0, @@ -418,7 +418,7 @@ export function buildAndMapPredictions( }; }); - const correctionCost = tokenizationCorrection.matchedResult.map((correction) => { + const correctionCost = tokenizationCorrection.matchedResult.tokenCorrections.map((correction) => { let rootCost = correction.totalCost; /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if @@ -452,7 +452,16 @@ export function buildAndMapPredictions( }).reduce((accum, curr) => accum * curr, 1); const predictionComponents = correctionTransforms.map((correctionTransform, i) => { - const predictions = model.predict(correctionTransform, emptyContext); + let predictions = model.predict(correctionTransform, emptyContext); + + // Ensure codepointLength == prediction codepoint length if i does not match the tail! + // Filter out cases that do not conform to this condition. + if(i != correctionTransforms.length - 1) { + predictions = predictions.filter((p) => { + const codepointLength = tokenizationCorrection.matchingSpace.orderedTokens[i].searchModule.codepointLength; + return KMWString.length(p.sample.transform.insert) == codepointLength; + }); + } // Failsafe: if there are no matching predictions, create a fake prediction // matching the original text. 
@@ -577,7 +586,8 @@ export function prepareTokenizationSearch( return new TokenizationCorrector(tuple.tokenization, mutatedLength, (token, index) => { return index >= unaffectedTokenCount // is a modified token && index == mutatedLength - 1 // TEMP: adjacent to the caret (TO BE REMOVED) - && correctionValidForAutoSelect(token.exampleInput); // and is eligible text-correction + // and is eligible for text-correction + && (token.searchModule.codepointLength == 0 || correctionValidForAutoSelect(token.exampleInput)); }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts index a53c3a4a4b7..804cf43d40d 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts @@ -29,7 +29,8 @@ import { SubstitutionQuotientSpur, TokenizationCorrector, TokenResult, - TokenizationResultMapping + TokenizationResultMapping, + TokenizationResult } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; @@ -302,7 +303,7 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'complete'); if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; assert.isNotNaN(searchResult.cost); assert.equal(searchResult.cost, searchResult.mapping.totalCost); assert.equal(tokenResults.length, 1); @@ -327,7 +328,7 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'none'); }); - it('finds a default correction for a single correctable token without a model match', () => { + it('returns no result when a single correctable 
token lacks a model match', () => { const fixture = buildFixture_therefore(); const theref = fixture.theref.tail; @@ -371,23 +372,6 @@ describe('TokenizationCorrector', () => { searchResult = instance.handleNextNode(); } while(searchResult.type == 'intermediate'); - assert.equal(searchResult.type, 'complete'); - if(searchResult.type == 'complete') { - const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; - assert.isNotNaN(searchResult.cost); - assert.equal(searchResult.cost, searchResult.mapping.totalCost); - assert.equal(tokenResults.length, 1); - assert.sameOrderedMembers(tokenResults.map((r) => r.matchString), ['therefxyz']); - - // Now that an entry has been found, verify the corrector's state. - assert.isNotOk(instance.predictableToken); // should become an uncorrectable. - assert.isTrue(instance.generatedTokenResults.has(therefxyz)); - assert.equal(instance.generatedTokenResults.get(therefxyz), tokenResults[0]); - } - - // There should be no further possible suggestions. - searchResult = instance.handleNextNode(); assert.equal(searchResult.type, 'none'); }); @@ -411,7 +395,7 @@ describe('TokenizationCorrector', () => { let firstResults: ReadonlyArray; if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; firstResults = tokenResults; assert.isNotNaN(searchResult.cost); assert.equal(searchResult.cost, searchResult.mapping.totalCost); @@ -434,7 +418,7 @@ describe('TokenizationCorrector', () => { searchResult = instance.handleNextNode(); if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; // Verify that the first (bound) token is not altered further. // It should receive no further correction attempts. 
@@ -445,7 +429,7 @@ describe('TokenizationCorrector', () => { } while(searchResult.type != 'none'); }); - it('immediately returns a single result when the only represented token is uncorrectable', () => { + it('immediately returns with no result when the only represented token is uncorrectable', () => { const fixture = buildFixture_terminalWhitespace(); const tokenization = fixture.spaceOnly; @@ -457,13 +441,7 @@ describe('TokenizationCorrector', () => { ); const searchResult = instance.handleNextNode(); - assert.equal(searchResult.type, 'complete'); - if(searchResult.type == 'complete') { - assert.equal(searchResult.mapping.matchedResult[0].matchString, ' '); - } - - const nilResult = instance.handleNextNode(); - assert.equal(nilResult.type, 'none'); + assert.equal(searchResult.type, 'none'); }); it('returns a single result when the final token is uncorrectable', () => { @@ -484,8 +462,8 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'complete'); if(searchResult.type == 'complete') { - assert.equal(searchResult.mapping.matchedResult[0].matchString, 'space'); - assert.equal(searchResult.mapping.matchedResult[1].matchString, ' '); + assert.equal(searchResult.mapping.matchedResult.tokenCorrections[0].matchString, 'space'); + assert.equal(searchResult.mapping.matchedResult.tokenCorrections[1].matchString, ' '); } const nilResult = instance.handleNextNode(); @@ -502,20 +480,20 @@ describe('TokenizationCorrector', () => { let haveSeenSingleTokenCorrection = false; let haveSeenThreeTokenCorrection = false; for await(let phraseMatch of getBestMatches< - ReadonlyArray, + TokenizationResult, TokenizationResultMapping, TokenizationCorrector >(correctors, buildTestTimer())) { - if(phraseMatch.matchedResult.length == 1) { + if(phraseMatch.matchedResult.tokenCorrections.length == 1) { if(!haveSeenSingleTokenCorrection) { - assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['theref' /* -ore */]); + 
assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['theref' /* -ore */]); } haveSeenSingleTokenCorrection = true; - } else if(phraseMatch.matchedResult.length == 3) { + } else if(phraseMatch.matchedResult.tokenCorrections.length == 3) { if(!haveSeenThreeTokenCorrection) { - assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]); + assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]); } haveSeenThreeTokenCorrection = true; }