-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathrun.py
More file actions
121 lines (101 loc) · 4.96 KB
/
run.py
File metadata and controls
121 lines (101 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import os
import shlex
import subprocess
import sys
import time

import psutil

from utils.benchmark import parse_threads_range
def get_file_dir():
    """Return the directory that contains this script, with symlinks resolved."""
    resolved_script = os.path.realpath(__file__)
    script_dir, _script_name = os.path.split(resolved_script)
    return script_dir
def docker_init(docker_image):
    """Create a fresh, detached benchmark container and return its name.

    Any existing container named ``llama_benchmark`` is force-removed first.
    The new container is started privileged, with this script's directory
    mounted at ``/runner``, ``/bin/bash`` as entrypoint, and a memory limit of
    the total system RAM (in whole GiB) minus 30 GiB that are left for the OS.

    Args:
        docker_image: Docker image to start the container from.

    Returns:
        The name of the started container.

    Exits the process with status 1 when 10 GiB or less would be left for the
    container, or when ``docker run`` fails.
    """
    # An explicit image pull used to live here and is currently disabled:
    # if subprocess.run(
    #         ["docker", "pull", docker_image]).returncode != 0:
    #     print("Docker pull process failed!")
    #     sys.exit(1)

    container_name = "llama_benchmark"
    # Return code deliberately ignored: there may be no previous container to remove.
    subprocess.run(["docker", "rm", "-f", container_name])

    memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30GB for OS
    # Explicit check instead of `assert`, which would be stripped under `python -O`
    # and let an invalid (possibly negative) memory limit reach `docker run`.
    if memory <= 10:
        print("less than 10GB of memory available on the system for llama.cpp")
        sys.exit(1)

    run_cmd = [
        "docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{memory}g", "-v",
        f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image,
    ]
    if subprocess.run(run_cmd).returncode != 0:
        print("Docker run process failed!")
        sys.exit(1)
    return container_name
def docker_restart(docker_name):
    """Stop and then start a docker container, retrying each step until it succeeds.

    A failing ``docker stop`` / ``docker start`` is retried indefinitely with a
    15 second pause between attempts. Retries are done in a loop rather than by
    recursion, so a long outage cannot exhaust the interpreter's recursion limit.

    Args:
        docker_name: Name of the container to restart.
    """
    break_time = 15  # seconds to wait between attempts

    def run_until_success(action, progressive):
        # `action` is the docker sub-command; `progressive` is its form used in the log line.
        while subprocess.run(["docker", action, docker_name]).returncode != 0:
            print(f"{progressive} docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)

    print(f"\nRestarting docker container {docker_name} ...")
    run_until_success("stop", "Stopping")
    run_until_success("start", "Starting")
def _run_case(cmd, timeout):
    """Run ``cmd`` and return True only if it exits with status 0 within ``timeout`` seconds."""
    process = subprocess.Popen(cmd, start_new_session=True)
    try:
        return process.wait(timeout=timeout) == 0
    except subprocess.TimeoutExpired:
        # The case is abandoned: stop the local client process and reap it so it
        # neither keeps running in the background nor lingers as a zombie.
        process.kill()
        process.wait()
        return False


def benchmark(docker_container_name, args):
    """Run every requested benchmark case inside the given docker container.

    One case is executed for each combination of model, prompt size, batch
    size and per-process thread count (sizes and thread counts in ascending
    order). Each case runs ``utils/benchmark.py`` from ``/runner`` through
    ``docker exec`` with ``available_threads // num_threads`` processes, and
    is reported as ``SUCCESS`` when the command exits with status 0 within
    ``args.timeout`` seconds, otherwise as ``FAIL``.

    Args:
        docker_container_name: Name of the container to execute the cases in.
        args: Parsed command line options; reads ``model_names``,
            ``prompt_sizes``, ``batch_sizes``, ``num_threads``,
            ``threads_range`` and ``timeout``.

    Exits the process with status 1 when a requested thread count is below 1
    or exceeds the number of threads described by ``args.threads_range``.
    """
    # The number of entries returned by parse_threads_range is treated as the thread budget.
    num_available_threads = len(parse_threads_range(args.threads_range))
    max_requested = max(args.num_threads)
    if num_available_threads < max_requested:
        print(
            f"Requested number of threads ({max_requested}) exceeds threads available ({num_available_threads})")
        sys.exit(1)
    min_requested = min(args.num_threads)
    if min_requested < 1:
        # Guard the division below against zero / negative thread counts.
        print(f"Requested number of threads must be positive (got {min_requested})")
        sys.exit(1)

    prompt_sizes = sorted(args.prompt_sizes)
    batch_sizes = sorted(args.batch_sizes)
    thread_counts = sorted(args.num_threads)
    # Quoted because these values are interpolated into a `bash -c` command string;
    # typical values (plain file names, '0-63,128-191') are left unchanged by quoting.
    threads_range = shlex.quote(args.threads_range)

    docker_restart(docker_container_name)
    for model in args.model_names:
        model_path = shlex.quote(f"models/{model}")
        for prompt_size in prompt_sizes:
            for batch_size in batch_sizes:
                for num_threads in thread_counts:
                    num_processes = num_available_threads // num_threads
                    case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                    print(f"\nRunning {case}")

                    cmd = (f"cd /runner; python3 utils/benchmark.py -m {model_path} -n {num_processes} "
                           f"-t {num_threads} -b {batch_size} -p {prompt_size} -r {threads_range}")
                    cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]
                    print(f"Executing: {' '.join(cmd)}")

                    if _run_case(cmd, args.timeout):
                        print(f"SUCCESS: {case}")
                    else:
                        print(f"FAIL: {case}")
                        # NOTE(review): restart taken to belong to the failure path only
                        # (presumably to clear whatever the failed case left running in
                        # the container) - confirm against the intended flow.
                        docker_restart(docker_container_name)
def _positive_int(value):
    """argparse ``type`` callable: parse ``value`` as an int and reject anything below 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv=None):
    """Parse the benchmark runner's command line options.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``; the default ``None`` keeps the original behavior.

    Returns:
        An ``argparse.Namespace`` with ``model_names``, ``docker_image``,
        ``num_threads``, ``batch_sizes``, ``prompt_sizes``, ``threads_range``
        and ``timeout``.

    Exits with status 2 (argparse's usage error) on missing or invalid options.
    """
    parser = argparse.ArgumentParser(description="Run set of benchmarks.")
    parser.add_argument("-m", "--model_names",
                        type=str, required=True, nargs="+",
                        help="model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
    parser.add_argument("-d", "--docker_image",
                        type=str, required=True,
                        help="Docker image to use for benchmarking")
    # Thread counts are used as a divisor when computing the process count, so
    # zero and negative values are rejected here with a clear usage error.
    parser.add_argument("-t", "--num_threads",
                        type=_positive_int, required=True, nargs="+",
                        help="number of threads per process to use")
    parser.add_argument("-b", "--batch_sizes",
                        type=int, required=True, nargs="+",
                        help="batch sizes to cover")
    parser.add_argument("-p", "--prompt_sizes",
                        type=int, required=True, nargs="+",
                        help="prompt sizes to cover")
    parser.add_argument("-r", "--threads_range",
                        type=str, required=True,
                        help="range of threads to use in offline mode, e.g. '0-63,128-191', threads will be divided "
                             "between processes - hint: 'lscpu | grep NUMA'")
    # The timeout is compared against elapsed wall-clock time, i.e. it is in seconds.
    parser.add_argument("--timeout",
                        type=float, default=900,
                        help="timeout to apply per single benchmark case")
    return parser.parse_args(argv)
def main():
    """Script entry point: parse the CLI options, create the container, run all benchmark cases."""
    cli_args = parse_args()
    container = docker_init(cli_args.docker_image)
    benchmark(container, cli_args)


# Only run the benchmarks when executed as a script, not when imported.
if __name__ == "__main__":
    main()