refactor(knowledge): centralize tokenizer mapping on EmbeddingModelInfo

waleedlatif1 · claude · waleedlatif1 · commit 14538a1df2ed · 2026-04-29T19:54:05.000-07:00
Add tokenizerProvider directly to EmbeddingModelInfo so callers read it
from the registry instead of reimplementing the gemini→google / openai→openai
map at each call site. Removes the local helper in chunks/service.ts and
the inline ternary in search/route.ts.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/apps/sim/app/api/knowledge/search/route.ts b/apps/sim/app/api/knowledge/search/route.ts
@@ -402,11 +402,10 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
       let tokenCount = null
       if (hasQuery) {
         try {
-          // Use the tokenizer matching the actual embedding provider so token counts
-          // (and the input cost derived from them) reflect how the provider tokenizes.
-          const tokenizerProvider =
-            getEmbeddingModelInfo(queryEmbeddingModel).provider === 'gemini' ? 'google' : 'openai'
-          tokenCount = estimateTokenCount(validatedData.query!, tokenizerProvider)
+          tokenCount = estimateTokenCount(
+            validatedData.query!,
+            getEmbeddingModelInfo(queryEmbeddingModel).tokenizerProvider
+          )
           cost = calculateCost(queryEmbeddingModel, tokenCount.count, 0, false)
         } catch (error) {
           logger.warn(`[${requestId}] Failed to calculate cost for search query`, {
diff --git a/apps/sim/lib/knowledge/chunks/service.ts b/apps/sim/lib/knowledge/chunks/service.ts
@@ -15,15 +15,6 @@ import { getEmbeddingModelInfo } from '@/lib/knowledge/embedding-models'
 import { generateEmbeddings } from '@/lib/knowledge/embeddings'
 import { estimateTokenCount } from '@/lib/tokenization/estimators'
 
-/**
- * Map embedding model provider → tokenization provider id used by
- * `estimateTokenCount`. Keeps stored token counts (and any cost computed
- * from them) consistent with how the embedding provider tokenizes.
- */
-function tokenizerProviderForEmbeddingModel(model: string): 'openai' | 'google' {
-  return getEmbeddingModelInfo(model).provider === 'gemini' ? 'google' : 'openai'
-}
-
 const logger = createLogger('ChunksService')
 
 /**
@@ -136,10 +127,9 @@ export async function createChunk(
     workspaceId
   )
 
-  // Calculate accurate token count using the tokenizer matching the KB's embedding provider.
   const tokenCount = estimateTokenCount(
     chunkData.content,
-    tokenizerProviderForEmbeddingModel(kbEmbeddingModel)
+    getEmbeddingModelInfo(kbEmbeddingModel).tokenizerProvider
   )
 
   const chunkId = generateId()
@@ -398,10 +388,9 @@ export async function updateChunk(
         }
         const { embeddings } = await generateEmbeddings([content], chunkEmbeddingModel, workspaceId)
 
-        // Calculate accurate token count using the tokenizer matching the KB's embedding provider.
         const tokenCount = estimateTokenCount(
           content,
-          tokenizerProviderForEmbeddingModel(chunkEmbeddingModel)
+          getEmbeddingModelInfo(chunkEmbeddingModel).tokenizerProvider
         )
 
         dbUpdateData.content = content
diff --git a/apps/sim/lib/knowledge/embedding-models.ts b/apps/sim/lib/knowledge/embedding-models.ts
@@ -9,12 +9,16 @@ export const DEFAULT_EMBEDDING_MODEL = 'text-embedding-3-small'
 
 export type EmbeddingProviderKind = 'openai' | 'azure-openai' | 'gemini'
 
+export type TokenizerProviderId = 'openai' | 'google'
+
 export interface EmbeddingModelInfo {
   provider: EmbeddingProviderKind
   /** Whether the provider supports requesting a custom output dimensionality. */
   supportsCustomDimensions: boolean
   /** Pricing/billing label — must match an entry in EMBEDDING_MODEL_PRICING when billed. */
   pricingId: string
+  /** Provider id for `estimateTokenCount` so token counts match the embedding provider's tokenization. */
+  tokenizerProvider: TokenizerProviderId
   label: string
   /** Short user-facing description shown in the KB creation UI. */
   description: string
@@ -25,20 +29,23 @@ export const SUPPORTED_EMBEDDING_MODELS: Record<string, EmbeddingModelInfo> = {
     provider: 'openai',
     supportsCustomDimensions: true,
     pricingId: 'text-embedding-3-small',
+    tokenizerProvider: 'openai',
     label: 'OpenAI text-embedding-3-small',
     description: 'Cheapest. Good for English-heavy retrieval at low cost.',
   },
   'text-embedding-3-large': {
     provider: 'openai',
     supportsCustomDimensions: true,
     pricingId: 'text-embedding-3-large',
+    tokenizerProvider: 'openai',
     label: 'OpenAI text-embedding-3-large',
     description: 'Slightly better quality than 3-small at ~6.5× the cost.',
   },
   'gemini-embedding-001': {
     provider: 'gemini',
     supportsCustomDimensions: true,
     pricingId: 'gemini-embedding-001',
+    tokenizerProvider: 'google',
     label: 'Google gemini-embedding-001',
     description: 'Strong multilingual retrieval. Good cost/quality balance.',
   },