forked from TheTom/turboquant_plus
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathturbo-quality-gate.sh
More file actions
executable file
·80 lines (70 loc) · 2.86 KB
/
turbo-quality-gate.sh
File metadata and controls
executable file
·80 lines (70 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# TurboQuant quality + speed gate — run BEFORE pushing any changes
# Checks: (1) perplexity within 5% of q8_0, (2) context scaling ratio > 0.95
#
# Usage: bash scripts/turbo-quality-gate.sh
# Exit 0 = PASS, Exit 1 = FAIL
#
# Environment overrides (each falls back to the default assigned below):
#   LLAMA  directory that contains the llama-perplexity binary
#   MODEL  model file handed to llama-perplexity via -m
#   WIKI   evaluation text file handed to llama-perplexity via -f
set -e

# The defaults are deliberately left unquoted: bash performs tilde expansion
# on the default word of ${VAR:-word} only when the expansion is not quoted.
LLAMA=${LLAMA:-~/local_llms/llama.cpp/build-turbo/bin}
MODEL=${MODEL:-~/local_llms/models/Qwen3.5-35B-A3B-Q8_0.gguf}
WIKI=${WIKI:-~/local_llms/llama.cpp/wikitext-2-raw/wiki.test.raw}

# Fetch the evaluation text when it is missing. A failing helper is handled
# explicitly so the script honours its "Exit 1 = FAIL" contract with a clear
# message instead of letting `set -e` abort with the helper's own status.
if [ ! -f "$WIKI" ]; then
  echo "Downloading wikitext-2..."
  if ! bash ~/local_llms/llama.cpp/scripts/get-wikitext-2.sh; then
    echo "FAIL: could not download wikitext-2" >&2
    exit 1
  fi
  # NOTE(review): where the helper stores the dataset is not visible from
  # this script; re-check so a location mismatch with $WIKI is reported here
  # rather than as an opaque llama-perplexity failure later.
  if [ ! -f "$WIKI" ]; then
    echo "FAIL: $WIKI is still missing after the download" >&2
    exit 1
  fi
fi

# Set to 1 by any failing check; the summary at the end turns it into the
# script's exit status.
FAIL=0
# Opening banner: rule, title, rule, then one blank spacer line.
printf '%s\n' \
  "========================================" \
  " TurboQuant Quality + Speed Gate" \
  "========================================" \
  ""
# --- Test 1: Perplexity ---
# Runs llama-perplexity over 8 chunks with -c 512 and the turbo3 K/V cache
# types, then requires the reported final PPL to be below the q8_0 baseline
# plus 5%.
#
# Reads:  LLAMA, MODEL, WIKI; BASELINE_PPL (optional override, default 6.111)
# Writes: PPL_TURBO, BASELINE_PPL, MAX_PPL, PPL_OK; sets FAIL=1 on failure

# Print the number from the "Final ... PPL = <n>" line of the llama-perplexity
# output read on stdin, or nothing when there is no such line.
# Always returns 0: a grep that finds nothing exits 1, and under `set -e` that
# status would abort the script before the FAIL branch below could report it.
extract_final_ppl() {
  grep "Final" | grep -oE 'PPL = [0-9.]+' | grep -oE '[0-9.]+' | tail -n 1 || true
}

echo "[1/2] Running perplexity check (8 chunks)..."
# Paths are quoted so directories containing spaces work; stderr is merged
# into the pipe so the parser sees everything llama-perplexity prints.
PPL_TURBO=$("$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 512 \
  -ctk turbo3 -ctv turbo3 -fa on --chunks 8 -ngl 99 2>&1 | extract_final_ppl) || true
if [ -z "$PPL_TURBO" ]; then
  echo " FAIL: Could not get turbo3 perplexity (crash or timeout)"
  FAIL=1
else
  # NOTE(review): 6.111 is presumably the q8_0 PPL measured on the default
  # MODEL — set BASELINE_PPL in the environment when gating another model.
  BASELINE_PPL=${BASELINE_PPL:-6.111}
  MAX_PPL=$(echo "$BASELINE_PPL * 1.05" | bc)
  # bc prints 1 when the comparison holds and 0 otherwise.
  PPL_OK=$(echo "$PPL_TURBO < $MAX_PPL" | bc)
  if [ "$PPL_OK" -eq 1 ]; then
    echo " PASS: turbo3 PPL = $PPL_TURBO (< $MAX_PPL, within 5% of q8_0 $BASELINE_PPL)"
  else
    echo " FAIL: turbo3 PPL = $PPL_TURBO (> $MAX_PPL, exceeds 5% threshold)"
    FAIL=1
  fi
fi
echo ""
# --- Test 2: Context Scaling ---
# Measures prompt-eval throughput of llama-perplexity at -c 4096 (4 chunks)
# once with the turbo3 K/V cache types and once with q8_0, then requires the
# turbo3/q8_0 ratio to be above 0.95.
#
# Reads:  LLAMA, MODEL, WIKI
# Writes: TURBO_TPS, Q8_TPS, RATIO, RATIO_OK; sets FAIL=1 on failure

# Print the "<n> tokens per second" figure from the "prompt eval" line of the
# llama-perplexity output read on stdin, or nothing when it is absent.
# Always returns 0 so a missing match cannot abort the script under `set -e`
# before the FAIL branch below reports it.
extract_prefill_tps() {
  grep "prompt eval" | grep -oE '[0-9.]+ tokens per second' | grep -oE '[0-9.]+' | tail -n 1 || true
}

# Run the 4K measurement with cache type $1 used for both -ctk and -ctv and
# print the resulting tokens-per-second figure (empty on failure).
measure_prefill_tps() {
  local cache_type=$1
  "$LLAMA/llama-perplexity" -m "$MODEL" -f "$WIKI" -c 4096 \
    -ctk "$cache_type" -ctv "$cache_type" -fa on --chunks 4 -ngl 99 2>&1 \
    | extract_prefill_tps
}

echo "[2/2] Running context scaling check (4K prefill)..."
TURBO_TPS=$(measure_prefill_tps turbo3) || true
Q8_TPS=$(measure_prefill_tps q8_0) || true
if [ -z "$TURBO_TPS" ] || [ -z "$Q8_TPS" ]; then
  echo " FAIL: Could not measure speed (crash or timeout)"
  echo " turbo3=$TURBO_TPS q8_0=$Q8_TPS"
  FAIL=1
else
  # scale=4 keeps four decimal places in the quotient.
  RATIO=$(echo "scale=4; $TURBO_TPS / $Q8_TPS" | bc)
  # bc prints 1 when the comparison holds and 0 otherwise.
  RATIO_OK=$(echo "$RATIO > 0.95" | bc)
  if [ "$RATIO_OK" -eq 1 ]; then
    echo " PASS: turbo3/q8_0 = ${RATIO}x at 4K context (> 0.95 threshold)"
    echo " turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
  else
    echo " FAIL: turbo3/q8_0 = ${RATIO}x at 4K context (< 0.95 threshold)"
    echo " turbo3 = $TURBO_TPS tok/s, q8_0 = $Q8_TPS tok/s"
    echo " Context scaling regression detected!"
    FAIL=1
  fi
fi
echo ""
# --- Summary ---
# Print the overall verdict between two rules and exit with the matching
# status: 0 only when FAIL compares equal to 0, otherwise 1.
rule="========================================"
echo "$rule"
# Start from the failing outcome and upgrade it only on a clean run.
verdict=" CHECKS FAILED — DO NOT PUSH"
status=1
if [ "$FAIL" -eq 0 ]; then
  verdict=" ALL CHECKS PASSED"
  status=0
fi
printf '%s\n' "$verdict" "$rule"
exit "$status"