Skip to content

Commit 14538a1

Browse files
waleedlatif1 and claude committed
refactor(knowledge): centralize tokenizer mapping on EmbeddingModelInfo
Add tokenizerProvider directly to EmbeddingModelInfo so callers read it from the registry instead of reimplementing the gemini→google / openai→openai map at each call site. Removes the local helper in chunks/service.ts and the inline ternary in search/route.ts. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 96cf4dd commit 14538a1

3 files changed

Lines changed: 13 additions & 18 deletions

File tree

apps/sim/app/api/knowledge/search/route.ts

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -402,11 +402,10 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
402402
let tokenCount = null
403403
if (hasQuery) {
404404
try {
405-
// Use the tokenizer matching the actual embedding provider so token counts
406-
// (and the input cost derived from them) reflect how the provider tokenizes.
407-
const tokenizerProvider =
408-
getEmbeddingModelInfo(queryEmbeddingModel).provider === 'gemini' ? 'google' : 'openai'
409-
tokenCount = estimateTokenCount(validatedData.query!, tokenizerProvider)
405+
tokenCount = estimateTokenCount(
406+
validatedData.query!,
407+
getEmbeddingModelInfo(queryEmbeddingModel).tokenizerProvider
408+
)
410409
cost = calculateCost(queryEmbeddingModel, tokenCount.count, 0, false)
411410
} catch (error) {
412411
logger.warn(`[${requestId}] Failed to calculate cost for search query`, {

apps/sim/lib/knowledge/chunks/service.ts

Lines changed: 2 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -15,15 +15,6 @@ import { getEmbeddingModelInfo } from '@/lib/knowledge/embedding-models'
1515
import { generateEmbeddings } from '@/lib/knowledge/embeddings'
1616
import { estimateTokenCount } from '@/lib/tokenization/estimators'
1717

18-
/**
19-
* Map embedding model provider → tokenization provider id used by
20-
* `estimateTokenCount`. Keeps stored token counts (and any cost computed
21-
* from them) consistent with how the embedding provider tokenizes.
22-
*/
23-
function tokenizerProviderForEmbeddingModel(model: string): 'openai' | 'google' {
24-
return getEmbeddingModelInfo(model).provider === 'gemini' ? 'google' : 'openai'
25-
}
26-
2718
const logger = createLogger('ChunksService')
2819

2920
/**
@@ -136,10 +127,9 @@ export async function createChunk(
136127
workspaceId
137128
)
138129

139-
// Calculate accurate token count using the tokenizer matching the KB's embedding provider.
140130
const tokenCount = estimateTokenCount(
141131
chunkData.content,
142-
tokenizerProviderForEmbeddingModel(kbEmbeddingModel)
132+
getEmbeddingModelInfo(kbEmbeddingModel).tokenizerProvider
143133
)
144134

145135
const chunkId = generateId()
@@ -398,10 +388,9 @@ export async function updateChunk(
398388
}
399389
const { embeddings } = await generateEmbeddings([content], chunkEmbeddingModel, workspaceId)
400390

401-
// Calculate accurate token count using the tokenizer matching the KB's embedding provider.
402391
const tokenCount = estimateTokenCount(
403392
content,
404-
tokenizerProviderForEmbeddingModel(chunkEmbeddingModel)
393+
getEmbeddingModelInfo(chunkEmbeddingModel).tokenizerProvider
405394
)
406395

407396
dbUpdateData.content = content

apps/sim/lib/knowledge/embedding-models.ts

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,12 +9,16 @@ export const DEFAULT_EMBEDDING_MODEL = 'text-embedding-3-small'
99

1010
export type EmbeddingProviderKind = 'openai' | 'azure-openai' | 'gemini'
1111

12+
export type TokenizerProviderId = 'openai' | 'google'
13+
1214
export interface EmbeddingModelInfo {
1315
provider: EmbeddingProviderKind
1416
/** Whether the provider supports requesting a custom output dimensionality. */
1517
supportsCustomDimensions: boolean
1618
/** Pricing/billing label — must match an entry in EMBEDDING_MODEL_PRICING when billed. */
1719
pricingId: string
20+
/** Provider id for `estimateTokenCount` so token counts match the embedding provider's tokenization. */
21+
tokenizerProvider: TokenizerProviderId
1822
label: string
1923
/** Short user-facing description shown in the KB creation UI. */
2024
description: string
@@ -25,20 +29,23 @@ export const SUPPORTED_EMBEDDING_MODELS: Record<string, EmbeddingModelInfo> = {
2529
provider: 'openai',
2630
supportsCustomDimensions: true,
2731
pricingId: 'text-embedding-3-small',
32+
tokenizerProvider: 'openai',
2833
label: 'OpenAI text-embedding-3-small',
2934
description: 'Cheapest. Good for English-heavy retrieval at low cost.',
3035
},
3136
'text-embedding-3-large': {
3237
provider: 'openai',
3338
supportsCustomDimensions: true,
3439
pricingId: 'text-embedding-3-large',
40+
tokenizerProvider: 'openai',
3541
label: 'OpenAI text-embedding-3-large',
3642
description: 'Slightly better quality than 3-small at ~6.5× the cost.',
3743
},
3844
'gemini-embedding-001': {
3945
provider: 'gemini',
4046
supportsCustomDimensions: true,
4147
pricingId: 'gemini-embedding-001',
48+
tokenizerProvider: 'google',
4249
label: 'Google gemini-embedding-001',
4350
description: 'Strong multilingual retrieval. Good cost/quality balance.',
4451
},

0 commit comments

Comments
 (0)