Skip to content

Commit 891a6f4

Browse files
author
Chris Warren-Smith
committed
LLAMA: RAG experiment to increase domain knowledge of a particular lib
1 parent 6c278bd commit 891a6f4

9 files changed

Lines changed: 293 additions & 395 deletions

File tree

llama/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ add_subdirectory(${LLAMA_DIR})
113113
set(PLUGIN_SOURCES
114114
main.cpp
115115
llama-sb.cpp
116+
llama-sb-rag.cpp
116117
../include/param.cpp
117118
../include/hashmap.cpp
118119
../include/apiexec.cpp
@@ -193,6 +194,17 @@ set_target_properties(rag_index PROPERTIES
193194
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
194195
)
195196

197+
# -----------------------------
198+
# Header preparation for RAG indexer
199+
# -----------------------------
200+
add_executable(chunk_headers
201+
chunk_headers.cpp
202+
)
203+
204+
set_target_properties(chunk_headers PROPERTIES
205+
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
206+
)
207+
196208
# ------------------------------------------------------------------
197209
# Android native library
198210
# ------------------------------------------------------------------

llama/RAG.md

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ notcurses headers
2121
chunk_headers ← semantic chunker, outputs chunks.jsonl
2222
2323
24-
rag_index ← embeds each chunk via nomic-embed-text GGUF
24+
rag_index ← embeds each chunk via qwen3-embedding-0.6b-q4_k_m.gguf
2525
2626
2727
notcurses.db ← binary vector store (embeddings + text)
@@ -64,9 +64,7 @@ history ← appended for next turn (KV cache intact)
6464

6565
- [llama.cpp](https://github.com/ggerganov/llama.cpp) — `libllama` + `llama.h`
6666
- A GGUF **inference model** — tested with `Qwen3.5-9B-Q4_K_M.gguf`
67-
- A GGUF **embedding model**
68-
`nomic-embed-text-v1.5.Q4_K_M.gguf`
69-
([download](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF))
67+
- A GGUF **embedding model** — `qwen3-embedding-0.6b-q4_k_m.gguf`
7068
- C++17 compiler (gcc 8+, clang 7+, MSVC 2019+)
7169

7270
---
@@ -114,7 +112,7 @@ head -5 chunks.jsonl | python3 -m json.tool
114112

115113
```bash
116114
./rag_index \
117-
--model nomic-embed-text-v1.5.Q4_K_M.gguf \
115+
--model qwen3-embedding-0.6b-q4_k_m.gguf \
118116
--input chunks.jsonl \
119117
--output notcurses.db
120118
```
@@ -127,7 +125,7 @@ until the library changes.
127125
```bash
128126
./example \
129127
--model Qwen3.5-9B-Q4_K_M.gguf \
130-
--embed nomic-embed-text-v1.5.Q4_K_M.gguf \
128+
--embed qwen3-embedding-0.6b-q4_k_m.gguf \
131129
--db notcurses.db
132130
```
133131

llama/chunk_headers.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <sstream>
2121
#include <string>
2222
#include <vector>
23+
#include <algorithm>
2324
#include <filesystem>
2425

2526
namespace fs = std::filesystem;

llama/llama-sb-rag.cpp

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
// This file is part of SmallBASIC
2+
//
3+
// This program is distributed under the terms of the GPL v2.0 or later
4+
// Download the GNU Public License (GPL) from www.gnu.org
5+
//
6+
// Copyright(C) 2026 Chris Warren-Smith
7+
8+
#include "llama-sb.h"
9+
10+
#include <algorithm>
11+
#include <cmath>
12+
#include <cstdint>
13+
#include <fstream>
14+
#include <iostream>
15+
#include <numeric>
16+
#include <sstream>
17+
#include <string>
18+
#include <vector>
19+
20+
//
// one record of the vector store: the chunk itself plus its embedding
//
struct RagChunk {
  std::string text{};             /* chunk body, emitted verbatim into the context */
  std::string source{};           /* origin label, printed after "// source: " */
  std::string type{};             /* chunk category, printed in [brackets] */
  std::vector<float> embedding{}; /* vector scored against the query embedding */
};
26+
27+
struct RagDB {
28+
std::vector<RagChunk> chunks;
29+
int embed_dim = 0;
30+
31+
int size() const { return (int)chunks.size(); }
32+
bool empty() const { return chunks.empty(); }
33+
};
34+
35+
//
// per-session deduplication + token budget
//
// Tracks which chunks have already been handed out and a rough running count
// of the tokens they cost, so repeated retrievals do not resend a chunk or
// overrun the context window.
//
struct RagSession {
  std::vector<bool> seen;          /* one flag per chunk, sized to db.size() on init */
  int tokens_used = 0;             /* estimated tokens charged so far */
  int tokens_max = 0;              /* set to your n_ctx; 0 disables the budget check */
  float score_threshold = 0.60f;   /* skip weak matches */

  /* start a fresh session covering n_chunks chunks and a ctx_size token window */
  void init(int n_chunks, int ctx_size) {
    /* a negative count would wrap to an enormous size inside assign() */
    seen.assign(std::max(n_chunks, 0), false);
    tokens_used = 0;
    tokens_max = ctx_size;
  }

  /* forget what was seen and charged, keeping the size and limits */
  void reset() {
    std::fill(seen.begin(), seen.end(), false);
    tokens_used = 0;
  }

  /* out-of-range indices (negative included) are reported as not seen */
  bool is_seen(int idx) const { return in_range(idx) && seen[idx]; }

  /* out-of-range indices (negative included) are ignored */
  void mark(int idx) { if (in_range(idx)) seen[idx] = true; }

  /* rough token estimate: 1 token ≈ 4 chars */
  static int estimate_tokens(const std::string &text) {
    return (int)(text.size() / 4);
  }

  /* true while adding text keeps usage under 85% of tokens_max (or no limit is set) */
  bool budget_ok(const std::string &text) const {
    return tokens_max == 0 ||
           (tokens_used + estimate_tokens(text)) < (int)(tokens_max * 0.85f);
  }

  /* add the estimated cost of text to the running total */
  void charge(const std::string &text) {
    tokens_used += estimate_tokens(text);
  }

  /* true when idx addresses an existing flag in `seen` */
  bool in_range(int idx) const {
    return idx >= 0 && idx < (int)seen.size();
  }
};
68+
69+
bool Llama::embed_text(const std::string &text, std::vector<float> &out, int embed_dim) {
70+
vector<llama_token> tokens = tokenize(text);
71+
if (tokens.size() == 0) {
72+
return false;
73+
}
74+
75+
// truncate to context window
76+
int n_ctx = llama_n_ctx(_ctx);
77+
int n = tokens.size();
78+
if (n > n_ctx) {
79+
_last_error = std::format("warning: chunk truncated {} -> {} tokens ", n, n_ctx);
80+
n = n_ctx;
81+
tokens.resize(n);
82+
}
83+
84+
llama_memory_clear(llama_get_memory(_ctx), true);
85+
86+
if (!batch_decode_tokens(tokens)) {
87+
return false;
88+
}
89+
90+
float *emb = llama_get_embeddings_seq(_ctx, 0);
91+
if (!emb) {
92+
emb = llama_get_embeddings_ith(_ctx, n - 1);
93+
}
94+
95+
if (!emb) {
96+
_last_error = "no embedding returned\n";
97+
return false;
98+
}
99+
100+
out.assign(emb, emb + embed_dim);
101+
102+
/* L2 normalize */
103+
float norm = 0.0f;
104+
for (float v : out) {
105+
norm += v * v;
106+
}
107+
norm = std::sqrt(norm);
108+
if (norm > 1e-9f) {
109+
for (float &v : out) {
110+
v /= norm;
111+
}
112+
}
113+
114+
return true;
115+
}
116+
117+
//
// Load a binary vector store from `path` into `db`, replacing its contents.
//
// Layout as read here (integers in host byte order):
//   u32 magic (0x52414744), u32 version (2), u32 chunk count, u32 embed dim
//   per chunk: u32 text_len + text, u16 src_len + source,
//              u8 type_len + type, embed_dim floats
//
// Every length field is checked against the bytes actually left in the file
// before anything is allocated, so a truncated or corrupt file is rejected
// instead of triggering huge allocations.
//
// Returns true on success. On failure returns false, sets _last_error and
// leaves `db` empty.
//
bool Llama::rag_load(RagDB &db, const std::string &path) {
  db.chunks.clear();
  db.embed_dim = 0;

  // open positioned at the end so the file size can be read straight away
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    _last_error = std::format("rag_load: cannot open {}", path);
    return false;
  }

  auto fail = [&](const std::string &msg) -> bool {
    db.chunks.clear();
    db.embed_dim = 0;
    _last_error = msg;
    return false;
  };

  const std::streamoff file_size = f.tellg();
  f.seekg(0, std::ios::beg);
  if (!f || file_size < 0) {
    return fail("rag_load: read error");
  }

  // bytes not yet consumed; every read is validated against this first
  uint64_t remaining = (uint64_t)file_size;

  auto read_raw = [&](void *dst, uint64_t len) -> bool {
    if (len > remaining) {
      return false;
    }
    f.read(static_cast<char *>(dst), (std::streamsize)len);
    if (!f) {
      return false;
    }
    remaining -= len;
    return true;
  };
  auto read_str = [&](std::string &dst, uint64_t len) -> bool {
    if (len > remaining) {
      return false;
    }
    dst.assign(static_cast<std::string::size_type>(len), '\0');
    return read_raw(&dst[0], len);
  };

  uint32_t magic = 0;
  uint32_t version = 0;
  uint32_t n = 0;
  uint32_t edim = 0;
  const bool header_ok = read_raw(&magic, sizeof(magic)) &&
                         read_raw(&version, sizeof(version)) &&
                         read_raw(&n, sizeof(n)) &&
                         read_raw(&edim, sizeof(edim));

  if (magic != 0x52414744) {
    return fail("rag_load: bad magic");
  }
  if (version != 2) {
    return fail(std::format("rag_load: unsupported version {} (expected 2)", version));
  }
  if (!header_ok) {
    return fail("rag_load: read error");
  }

  // a record is at least its three length fields plus the embedding, so the
  // declared chunk count cannot exceed what the rest of the file can hold
  const uint64_t emb_bytes = (uint64_t)edim * sizeof(float);
  const uint64_t min_record = sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint8_t) + emb_bytes;
  if (n > remaining / min_record) {
    return fail("rag_load: read error");
  }

  db.embed_dim = (int)edim;

  for (uint32_t i = 0; i < n; i++) {
    db.chunks.emplace_back();
    RagChunk &c = db.chunks.back();

    uint32_t text_len = 0;
    uint16_t src_len = 0;
    uint8_t type_len = 0;

    bool ok = read_raw(&text_len, sizeof(text_len)) && read_str(c.text, text_len) &&
              read_raw(&src_len, sizeof(src_len)) && read_str(c.source, src_len) &&
              read_raw(&type_len, sizeof(type_len)) && read_str(c.type, type_len) &&
              emb_bytes <= remaining;
    if (ok) {
      c.embedding.resize(edim);
      ok = read_raw(c.embedding.data(), emb_bytes);
    }
    if (!ok) {
      return fail("rag_load: read error");
    }
  }

  std::cerr << "rag: loaded " << db.chunks.size()
            << " chunks (dim=" << db.embed_dim
            << ") from " << path << "\n";
  return true;
}
182+
183+
//
// dot product over the common prefix of two vectors; this is the cosine
// similarity when both inputs are already L2-normalized
//
static float rag_cosine(const std::vector<float> &a,
                        const std::vector<float> &b) {
  const size_t shared_len = std::min(a.size(), b.size());
  const float *lhs = a.data();
  const float *rhs = b.data();
  return std::inner_product(lhs, lhs + shared_len, rhs, 0.0f);
}
195+
196+
//
197+
// build context string from ranked results
198+
//
199+
static std::string rag_build_context(const RagDB &db,
200+
const std::vector<int> &indices,
201+
const std::vector<float> &scores) {
202+
std::ostringstream out;
203+
for (size_t i = 0; i < indices.size(); i++) {
204+
const RagChunk &c = db.chunks[indices[i]];
205+
out << "// source: " << c.source
206+
<< " [" << c.type << "]"
207+
<< " (score: " << scores[i] << ")\n"
208+
<< c.text << "\n---\n";
209+
}
210+
return out.str();
211+
}
212+
213+
//
214+
// retrieve with session
215+
//
216+
std::string Llama::rag_retrieve(const RagDB &db,
217+
const std::string &query,
218+
int top_k,
219+
RagSession &session) {
220+
if (db.empty()) {
221+
return {};
222+
}
223+
224+
std::vector<float> qvec;
225+
std::string text = "Instruct: Given a programming question, retrieve relevant API documentation\nQuery: " + query;
226+
if (!embed_text(text, qvec, db.embed_dim)) {
227+
return {};
228+
}
229+
230+
// score all chunks
231+
std::vector<int> order(db.size());
232+
std::iota(order.begin(), order.end(), 0);
233+
std::vector<float> scores(db.size());
234+
for (int i = 0; i < db.size(); i++)
235+
scores[i] = rag_cosine(qvec, db.chunks[i].embedding);
236+
237+
std::sort(order.begin(), order.end(),
238+
[&](int a, int b){ return scores[a] > scores[b]; });
239+
240+
// collect top_k unseen, within budget, above threshold
241+
std::vector<int> result_idx;
242+
std::vector<float> result_scores;
243+
244+
for (int idx : order) {
245+
if ((int)result_idx.size() >= top_k) break;
246+
if (session.is_seen(idx)) continue;
247+
if (scores[idx] < session.score_threshold) break; /* sorted, so stop */
248+
if (!session.budget_ok(db.chunks[idx].text)) break;
249+
250+
result_idx.push_back(idx);
251+
result_scores.push_back(scores[idx]);
252+
session.mark(idx);
253+
session.charge(db.chunks[idx].text);
254+
}
255+
256+
return rag_build_context(db, result_idx, result_scores);
257+
}

llama/llama-sb.cpp

Lines changed: 2 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -203,66 +203,14 @@ bool Llama::load_embedding_model(string model_path) {
203203
_ctx = llama_init_from_model(_model, cparams);
204204
if (!_ctx) {
205205
set_last_error("Create context");
206+
} else {
207+
_vocab = llama_model_get_vocab(_model);
206208
}
207209
}
208210

209211
return _last_error.empty();
210212
}
211213

212-
int Llama::get_embed_dim() {
213-
return _model != nullptr ? llama_model_n_embd(_model) : 0;
214-
}
215-
216-
bool Llama::embed_text(const std::string &text, std::vector<float> &out, int embed_dim) {
217-
std::string prefixed = "search_document: " + text;
218-
219-
vector<llama_token> tokens = tokenize(prefixed);
220-
if (tokens.size() == 0) {
221-
return false;
222-
}
223-
224-
// truncate to context window
225-
int n_ctx = llama_n_ctx(_ctx);
226-
int n = tokens.size();
227-
if (n > n_ctx) {
228-
_last_error = std::format("warning: chunk truncated {} -> {} tokens ", n, n_ctx);
229-
n = n_ctx;
230-
tokens.resize(n);
231-
}
232-
233-
llama_memory_clear(llama_get_memory(_ctx), true);
234-
235-
if (!batch_decode_tokens(tokens)) {
236-
return false;
237-
}
238-
239-
float *emb = llama_get_embeddings_seq(_ctx, 0);
240-
if (!emb) {
241-
emb = llama_get_embeddings_ith(_ctx, n - 1);
242-
}
243-
244-
if (!emb) {
245-
_last_error = "no embedding returned\n";
246-
return false;
247-
}
248-
249-
out.assign(emb, emb + embed_dim);
250-
251-
/* L2 normalize */
252-
float norm = 0.0f;
253-
for (float v : out) {
254-
norm += v * v;
255-
}
256-
norm = std::sqrt(norm);
257-
if (norm > 1e-9f) {
258-
for (float &v : out) {
259-
v /= norm;
260-
}
261-
}
262-
263-
return true;
264-
}
265-
266214
void Llama::set_grammar(const string &src, const string &root) {
267215
_grammar_src = src;
268216
_grammar_root = root;

0 commit comments

Comments
 (0)