-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathrun.py
More file actions
129 lines (106 loc) · 5.22 KB
/
run.py
File metadata and controls
129 lines (106 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import sys
import time
import psutil
import argparse
import subprocess
from utils.benchmark import parse_threads_range
def get_file_dir():
    """Return the absolute directory containing this file, with symlinks resolved."""
    resolved_path = os.path.realpath(__file__)
    return os.path.dirname(resolved_path)
def docker_init(node):
    """Pull the llama.cpp image and start a detached benchmark container.

    Any existing container with the same name is force-removed first. The
    new container has its memory allocation restricted to ``node`` via
    ``--cpuset-mems``, a memory limit of total system RAM minus 30 GiB, and
    this file's directory bind-mounted at ``/runner``.

    Args:
        node: value passed to ``docker run --cpuset-mems``; also used to
            build the container name.

    Returns:
        The container name, ``llama_benchmark_n<node>``.

    Exits the process with status 1 if the image pull fails, if 10 GiB or
    less would remain for the container, or if ``docker run`` fails.
    """
    tag = "amperecomputingai/llama.cpp:3.2.0"
    if subprocess.run(["docker", "pull", tag]).returncode != 0:
        print("Docker pull process failed!")
        sys.exit(1)

    container_name = f"llama_benchmark_n{node}"
    # Best effort: the container may simply not exist yet, so the result is ignored.
    subprocess.run(["docker", "rm", "-f", container_name])

    memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30GB for OS
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let a too-small or negative limit reach docker.
    if memory <= 10:
        print("less than 10GB of memory available on the system for llama.cpp")
        sys.exit(1)

    run_cmd = [
        "docker", "run", "--privileged=true",
        "--cpuset-mems", str(node),
        "--name", container_name,
        "-d",
        "-m", f"{memory}g",
        "-v", f"{get_file_dir()}:/runner",
        "--entrypoint", "/bin/bash",
        "-it", tag,
    ]
    if subprocess.run(run_cmd).returncode != 0:
        print("Docker run process failed!")
        sys.exit(1)
    return container_name
def docker_restart(docker_name):
    """Stop and then start a docker container, retrying each step until it succeeds.

    A failed ``docker stop`` / ``docker start`` is reported and retried after
    a fixed pause. Retries are unbounded, so this call blocks until both
    commands have returned exit status 0.

    Args:
        docker_name: name of the container to restart.
    """
    break_time = 15  # seconds to wait between attempts

    def run_until_success(action, description):
        # Iterative rather than self-recursive: with unbounded retries a
        # recursive retry would eventually hit the interpreter recursion limit.
        while subprocess.run(["docker", action, docker_name]).returncode != 0:
            print(f"{description} docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)

    print(f"\nRestarting docker container {docker_name} ...")
    run_until_success("stop", "Stopping")
    run_until_success("start", "Starting")
def _run_with_timeout(cmd, timeout):
    """Run ``cmd`` and return True only if it exits with status 0 within ``timeout`` seconds."""
    proc = subprocess.Popen(cmd, start_new_session=True)
    try:
        return proc.wait(timeout=timeout) == 0
    except subprocess.TimeoutExpired:
        # Terminate and reap the timed-out client so it is not left running
        # (or as a zombie) while the remaining cases execute.
        proc.kill()
        proc.wait()
        return False


def benchmark(docker_container_name, args):
    """Run every requested benchmark case inside the given docker container.

    Cases are the combinations of model, prompt size, batch size and threads
    per process (the numeric lists are iterated in ascending order). For each
    case the available threads are split into as many whole processes as fit,
    and ``utils/benchmark.py`` is launched through ``docker exec``. A case
    that fails or exceeds ``args.timeout`` seconds is reported as ``FAIL``
    and the container is restarted before the next case.

    Args:
        docker_container_name: name of the container to execute cases in.
        args: parsed command-line namespace; reads ``threads_range``,
            ``num_threads``, ``model_names``, ``prompt_sizes``,
            ``batch_sizes``, ``fa`` and ``timeout``.

    Exits the process with status 1 if a requested thread count is below 1
    or exceeds the number of entries parsed from ``args.threads_range``.
    """
    num_available_threads = len(parse_threads_range(args.threads_range))
    min_requested = min(args.num_threads)
    max_requested = max(args.num_threads)
    if min_requested < 1:
        # Guards the division below; a zero would otherwise crash mid-sweep.
        print(f"Requested number of threads ({min_requested}) must be at least 1")
        sys.exit(1)
    if num_available_threads < max_requested:
        print(f"Requested number of threads ({max_requested}) exceeds threads available ({num_available_threads})")
        sys.exit(1)

    docker_restart(docker_container_name)
    for model in args.model_names:
        for prompt_size in sorted(args.prompt_sizes):
            for batch_size in sorted(args.batch_sizes):
                for num_threads in sorted(args.num_threads):
                    # Whole processes only; leftover threads stay unused.
                    num_processes = num_available_threads // num_threads
                    case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                    print(f"\nRunning {case}")

                    inner_cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {num_processes} "
                                 f"-t {num_threads} -b {batch_size} -p {prompt_size} -r {args.threads_range}")
                    if args.fa != 0:
                        inner_cmd += " -fa 1"
                    cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", inner_cmd]
                    print(f"Executing: {' '.join(cmd)}")

                    if _run_with_timeout(cmd, args.timeout):
                        print(f"SUCCESS: {case}")
                    else:
                        print(f"FAIL: {case}")
                        # Restart the container after a failed or timed-out case.
                        docker_restart(docker_container_name)
def parse_args():
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``model_names``, ``num_threads``,
        ``batch_sizes``, ``prompt_sizes`` (lists, each required with at least
        one value), ``threads_range`` (required string), ``timeout`` (float,
        default 900), ``numa`` (int, default 0) and ``fa`` (0 or 1, default 0).
    """
    cli = argparse.ArgumentParser(description="Run set of benchmarks.")

    # Required options that each take one or more values.
    multi_value_options = (
        ("-m", "--model_names", str, "model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'"),
        ("-t", "--num_threads", int, "number of threads per process to use"),
        ("-b", "--batch_sizes", int, "batch sizes to cover"),
        ("-p", "--prompt_sizes", int, "prompt sizes to cover"),
    )
    for short_flag, long_flag, value_type, description in multi_value_options:
        cli.add_argument(short_flag, long_flag,
                         type=value_type, required=True, nargs="+",
                         help=description)

    threads_range_help = ("range of threads to use in offline mode, e.g. '0-63,128-191', threads will be divided "
                          "between processes - hint: 'lscpu | grep NUMA'")
    cli.add_argument("-r", "--threads_range", type=str, required=True, help=threads_range_help)

    # Optional settings with defaults.
    cli.add_argument("--timeout", type=float, default=900,
                     help="timeout to apply per single benchmark case")
    cli.add_argument("-n", "--numa", type=int, default=0,
                     help="numa mode of the docker container")
    cli.add_argument("-fa", type=int, default=0, choices=range(0, 2),
                     help="enable flash attention")

    return cli.parse_args()
def main():
    """Parse the command line, start the benchmark container and run all cases."""
    cli_args = parse_args()
    container_name = docker_init(cli_args.numa)
    benchmark(container_name, cli_args)
# Run the benchmark sweep only when this file is executed as a script.
if __name__ == "__main__":
    main()