-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark.py
More file actions
68 lines (56 loc) · 2.46 KB
/
benchmark.py
File metadata and controls
68 lines (56 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
import time
from fireform.extractor import extract_incident_data as real_extract
from fireform.models import ExtractionResult
from tests.fixtures_incidents import INCIDENTS
def mock_extract(text, schema=None, model="llama3.1", temperature=0.0):
    """Deterministic stand-in for the real extractor.

    Mirrors the real extractor's signature (schema/model/temperature are
    accepted but ignored), sleeps to simulate model latency, and returns the
    canned expected output for any input found in the INCIDENTS fixtures.
    """
    time.sleep(1.8)  # simulate model latency
    # Look up the fixture whose input text matches exactly; None if absent.
    hit = next((case for case in INCIDENTS if case["input"] == text), None)
    if hit is not None:
        # Mock a successful extraction using the fixture's expected payload.
        return ExtractionResult(data=hit["expected"], attempts=1)
    return ExtractionResult(data={}, attempts=1)
def run_benchmarks(models=None, use_mock=False):
    """Benchmark extraction accuracy and latency across models.

    Args:
        models: list of model names to benchmark. Defaults to
            ["llama3.1", "mistral"] when None.
        use_mock: if True, use mock_extract instead of the real extractor.

    Prints per-model accuracy (judged on the em:IncidentCategoryCode field
    only) and average wall-clock time per extraction.
    """
    # Fix: the original used a mutable list as the default argument, which is
    # shared across calls and can be mutated by callers. Use None sentinel.
    if models is None:
        models = ["llama3.1", "mistral"]
    extract_func = mock_extract if use_mock else real_extract
    with open("schemas/incident_schema.json") as f:
        schema = json.load(f)
    print("Model Benchmarking Suite")
    print("=" * 40)
    for model in models:
        print(f"\nModel: {model}")
        success_count = 0
        total_time = 0.0
        for inc in INCIDENTS:
            # perf_counter is the correct clock for measuring intervals
            # (monotonic, higher resolution than time.time()).
            start = time.perf_counter()
            try:
                result = extract_func(text=inc["input"], schema=schema, model=model)
                if result.error:
                    print(f"Error internally: {result.error}")
                data = result.data or {}
                total_time += time.perf_counter() - start
                expected = inc["expected"]
                # Accuracy is judged on the category code alone; empty data
                # never counts as a success.
                match = (
                    data.get("em:IncidentCategoryCode")
                    == expected.get("em:IncidentCategoryCode")
                )
                if match and data:
                    success_count += 1
            except Exception as e:
                # Best-effort: a failing extraction shouldn't abort the suite.
                print(f"Error for model {model}: {e}")
        if INCIDENTS:
            acc = (success_count / len(INCIDENTS)) * 100
            avg_time = total_time / len(INCIDENTS)
            print(f"Accuracy: {success_count}/{len(INCIDENTS)} ({acc:.1f}%)")
            print(f"Average Time: {avg_time:.2f}s per extraction")
if __name__ == "__main__":
    # Probe the local Ollama endpoint; any failure (connection refused,
    # timeout, missing httpx) falls back to the deterministic mock run.
    real_ollama = False
    try:
        import httpx

        response = httpx.get("http://localhost:11434/", timeout=1.0)
        if response.status_code == 200:
            real_ollama = True
    except Exception:
        real_ollama = False

    bench_models = ["llama3.1", "mistral", "phi3"]
    if real_ollama:
        print("Real Ollama instance detected! Running actual bench.")
        run_benchmarks(bench_models, use_mock=False)
    else:
        print("Note: Local Ollama not running. Using deterministic mocking for demo.")
        run_benchmarks(bench_models, use_mock=True)