-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmodels.toml
More file actions
101 lines (82 loc) · 2.72 KB
/
models.toml
File metadata and controls
101 lines (82 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Embedding Benchmark Models Configuration
#
# MistralRS models (Metal GPU): 7 active configurations, plus an optional
# 8th (Qwen3-8B) commented out below. FastEmbed models (CPU) follow as
# comparison baselines.

# EmbeddingGemma 300M - Google's small but high-quality model.
# 768 dims, ~0.6GB, previous benchmark winner.

# F16 - full precision baseline
[[mistralrs]]
model = "embedding-gemma"
quantization = "none"
device = "metal"

# Q8_0 - 8-bit quantization
[[mistralrs]]
model = "embedding-gemma"
quantization = "q8"
device = "metal"

# Q4K - 4-bit quantization (lowest memory)
[[mistralrs]]
model = "embedding-gemma"
quantization = "q4k"
device = "metal"

# Qwen3-Embedding 0.6B - Alibaba's compact embedding model.
# 1024 dims, ~1.2GB.

# F16 - full precision baseline
[[mistralrs]]
model = "qwen3-0.6b"
quantization = "none"
device = "metal"

# Q8_0 - 8-bit quantization
[[mistralrs]]
model = "qwen3-0.6b"
quantization = "q8"
device = "metal"

# Q4K - 4-bit quantization (lowest memory)
[[mistralrs]]
model = "qwen3-0.6b"
quantization = "q4k"
device = "metal"

# Qwen3-Embedding 4B - larger model for quality comparison.
# 2560 dims, ~8GB - may require significant VRAM.
# Q4K only - F16 would need ~16GB VRAM.
[[mistralrs]]
model = "qwen3-4b"
quantization = "q4k"
device = "metal"

# Qwen3-Embedding 8B - largest model (optional).
# 4096 dims, ~16GB - requires 32GB+ unified memory.
# Uncomment if you have 32GB+ RAM.
# [[mistralrs]]
# model = "qwen3-8b"
# quantization = "q4k"
# device = "metal"

# FastEmbed models (ONNX Runtime / CPU) - good baselines for comparison.

# 384 dims, fast baseline
[[fastembed]]
model = "BAAI/bge-small-en-v1.5"

# 768 dims, balanced
[[fastembed]]
model = "BAAI/bge-base-en-v1.5"

# 768 dims, code-specialized
[[fastembed]]
model = "jinaai/jina-embeddings-v2-base-code"

# 1024 dims, high quality
[[fastembed]]
model = "Alibaba-NLP/gte-large-en-v1.5"

# 768 dims, good general purpose
[[fastembed]]
model = "nomic-ai/nomic-embed-text-v1.5"

# 768 dims, recent high performer
[[fastembed]]
model = "Snowflake/snowflake-arctic-embed-m"