"""Utility functions for microbenchmarking."""
import datetime
import os
from typing import Dict, Any
import glob
import jax
import jsonlines
import numpy as np
import random
import string
import pathlib
import gzip
import json
import re
from collections import defaultdict
import subprocess
import shutil
# Maps a JAX collective op's task name to a regex for its main HLO operation.
TARGET_TASK_NAME_COLLECTIVES_MAP = {
    "all_to_all_ici_op": r"all-to-all\.[0-9]+",
    "all_gather_ici_op": r"all-gather\.[0-9]+",
    "psum_ici_op": r"all-reduce\.[0-9]+",
    "ppermute_ici_op": r"collective-permute\.[0-9]+",
}
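
# For example (hypothetical trace event name), a psum benchmark is matched
# against HLO events such as "all-reduce.1":
#
#   re.match(TARGET_TASK_NAME_COLLECTIVES_MAP["psum_ici_op"], "all-reduce.1")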


def simple_timeit(f, *args, matrix_dim=None, tries=10, task=None, trace_dir=None) -> list[float]:
  """Simple utility to time a function over multiple runs."""
  assert task is not None

  if trace_dir:
    try:
      outcomes_ms = timeit_from_trace(
          f, *args, matrix_dim=matrix_dim, tries=tries, task=task, trace_dir=trace_dir
      )
      if outcomes_ms:
        return outcomes_ms
      print("Warning: timeit_from_trace returned empty results. Falling back to manual timing.")
    except Exception as e:
      print(f"Warning: Failed to get metrics from trace due to: {e}. Falling back to manual timing.")

  outcomes_ms = []
  jax.block_until_ready(f(*args))  # Warm up the function.
  for _ in range(tries):
    jax.devices()  # Force synchronization across devices.
    s = datetime.datetime.now()
    jax.block_until_ready(f(*args))
    e = datetime.datetime.now()
    outcomes_ms.append(1000 * (e - s).total_seconds())
  return outcomes_ms
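
# Example (hypothetical usage sketch, assuming a jitted matmul):
#
#   import jax.numpy as jnp
#   f = jax.jit(lambda x: x @ x)
#   x = jnp.ones((1024, 1024))
#   times_ms = simple_timeit(f, x, matrix_dim=1024, tries=5, task="matmul_op")
#   print(f"median: {np.percentile(times_ms, 50):.3f} ms")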


def get_trace(log_dir: str) -> dict[str, Any]:
  """Extracts the trace object from the log directory.

  Returns:
    A trace object in JSON format.
  """
  # Navigate to the folder with the latest trace dump to find `trace.json.gz`.
  trace_folders = (pathlib.Path(log_dir).absolute() / "plugins" / "profile").iterdir()
  latest_trace_folder = max(trace_folders, key=os.path.getmtime)
  trace_jsons = latest_trace_folder.glob("*.trace.json.gz")
  try:
    (trace_json,) = trace_jsons
  except ValueError as value_error:
    raise ValueError(
        f"Expected exactly one trace file in {latest_trace_folder}"
    ) from value_error
  with gzip.open(trace_json, "rb") as f:
    trace = json.load(f)
  return trace
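
# Note: jax.profiler.trace typically produces a layout like
#   <log_dir>/plugins/profile/<session>/<host>.trace.json.gz
# which is the structure get_trace() navigates above.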


def get_metrics_from_trace(trace: dict[str, Any], task: str) -> list[float]:
  """Extracts per-run durations (in ms) for the given task from a trace."""
  # Check whether the given task name is a collective with a corresponding TPU
  # operation. This is a workaround and should be reverted or refactored in the
  # future. If the task is not present in the map, fall back to the default
  # behavior of measuring the timing from the CPU end.
  if task in TARGET_TASK_NAME_COLLECTIVES_MAP:
    try:
      task = TARGET_TASK_NAME_COLLECTIVES_MAP[task]
      return get_metrics_from_trace_tpu(trace, task)
    except Exception:
      return None

  event_matcher = re.compile(task)
  if "traceEvents" not in trace:
    raise KeyError("Key 'traceEvents' not found in trace.")
  events = []
  for e in trace["traceEvents"]:
    if "name" in e and event_matcher.match(e["name"]):
      events.append(e)

  # Group the matched events by run, so repeated runs yield one duration each.
  events_by_run_id = defaultdict(list)
  for e in events:
    run_id = e["args"]["run_id"] if "args" in e and "run_id" in e["args"] else "0"
    events_by_run_id[run_id].append(e)

  durations_ms = []
  try:
    # Event durations are reported in us; convert to ms.
    durations_ms = [
        max([e["dur"] for e in es]) / 1e3 for run_id, es in events_by_run_id.items()
    ]
  except KeyError:
    print("KeyError: Key 'dur' not found in the event object")
    raise
  return durations_ms
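
# Example (hypothetical trace excerpt): two events of the same run reduce to
# one duration, the max "dur" per run_id, converted from us to ms:
#
#   trace = {"traceEvents": [
#       {"name": "matmul_op", "dur": 1500, "args": {"run_id": "1"}},
#       {"name": "matmul_op", "dur": 1200, "args": {"run_id": "1"}},
#   ]}
#   get_metrics_from_trace(trace, "matmul_op")  # -> [1.5]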


def get_metrics_from_trace_tpu(trace: dict[str, Any], task: str) -> list[float]:
  """Extracts per-run device durations (in ms) for the given HLO op regex."""
  event_matcher = re.compile(task)
  if "traceEvents" not in trace:
    raise KeyError("Key 'traceEvents' not found in trace.")
  events = []
  for e in trace["traceEvents"]:
    if "name" in e and event_matcher.match(e["name"]):
      events.append(e)

  # For each trace, treat the TPU with the smallest `pid` value as TPU-0 and
  # only report its events.
  min_pid = min([e["pid"] for e in events])
  events_from_min_pid = [e for e in events if e["pid"] == min_pid]
  try:
    # Device durations are reported in ps; convert to ms.
    durations_ms = [float(e["args"]["device_duration_ps"]) / 1e9 for e in events_from_min_pid]
  except KeyError:
    print("KeyError: Key 'device_duration_ps' not found in the event object")
    raise
  return durations_ms


def is_local_directory_path(path: str) -> bool:
  """Returns True if the path looks like a local filesystem path."""
  if not path:  # Handle None or the empty string.
    return False
  # Heuristic: absolute or relative local paths, as opposed to e.g. gs:// URIs.
  return path.startswith("/") or path.startswith("./") or path.startswith("../")
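
# For example, "/tmp/trace" and "./out" are treated as local paths, while
# "gs://bucket/trace" is not.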


def timeit_from_trace(f, *args, matrix_dim=None, tries=10, task=None, trace_dir=None) -> list[float]:
  """Times a function with jax.profiler and reads the run times from the trace."""
  LOCAL_TRACE_DIR = "/tmp/microbenchmarks_tmptrace"
  jax.block_until_ready(f(*args))  # Warm up the function.
  if matrix_dim is not None:
    trace_name = f"{task}_dim_{matrix_dim}"
  else:
    trace_name = f"t_{task}_" + "".join(
        random.choices(string.ascii_uppercase + string.digits, k=10)
    )
  trace_full_dir = f"{trace_dir}/{trace_name}"
  tmp_trace_dir = trace_full_dir
  # If trace_dir isn't a local path, dump the trace to a local directory first
  # so it can be parsed for metrics, then upload it.
  if trace_dir and not is_local_directory_path(trace_dir):
    tmp_trace_dir = f"{LOCAL_TRACE_DIR}/{trace_name}"
  with jax.profiler.trace(tmp_trace_dir):
    for _ in range(tries):
      jax.devices()  # Force synchronization across devices.
      with jax.profiler.TraceAnnotation(task):
        jax.block_until_ready(f(*args))
  trace = get_trace(tmp_trace_dir)
  if trace_full_dir != tmp_trace_dir:
    # Upload the traces to the desired location.
    upload_to_storage(trace_dir=trace_full_dir, local_file=tmp_trace_dir)
  return get_metrics_from_trace(trace, task)
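
# Example (hypothetical bucket name): profiling into GCS writes the trace to a
# local temp dir first, parses it, then uploads it:
#
#   times_ms = timeit_from_trace(
#       f, x, matrix_dim=1024, tries=5, task="matmul_op",
#       trace_dir="gs://my-bucket/traces",  # hypothetical destination
#   )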


def maybe_write_metrics_file(
    metrics_dir, metrics, metadata, test_name, test_start_time, test_end_time
):
  """Writes metrics to a JSONL file to be consumed by the XLML metrics pipeline."""
  # Only write metrics from one host.
  if jax.process_index() != 0:
    return

  jsonl_name = "metrics_report.jsonl"
  jsonl_path = os.path.join(metrics_dir, jsonl_name)
  metadata.update(
      {
          "testsuite": "microbenchmark",
          "test_name": f"{test_name}",
          "test_start_timestamp": f"{test_start_time}",
          "test_end_timestamp": f"{test_end_time}",
      }
  )
  # Make sure every metadata value is a string.
  for key, value in metadata.items():
    metadata[key] = str(value)
  metrics_data = {
      "metrics": metrics,
      "dimensions": metadata,
  }
  # Ensure the output directory exists.
  os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)
  print(f"Writing metrics to JSONL file: {jsonl_path}")
  with jsonlines.open(jsonl_path, mode="a") as writer:
    writer.write(metrics_data)
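
# Example (hypothetical values): record stats for one benchmark run; the
# `metrics` dict can come from MetricsStatistics.serialize_statistics() below:
#
#   maybe_write_metrics_file(
#       metrics_dir="/tmp/metrics",  # hypothetical output directory
#       metrics={"latency_ms_p50": 1.3},
#       metadata={"matrix_dim": 1024},
#       test_name="matmul_op",
#       test_start_time="2025-01-01T00:00:00",
#       test_end_time="2025-01-01T00:01:00",
#   )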


def upload_to_storage(trace_dir: str, local_file: str):
  """Uploads a local file to the specified storage location."""
  if trace_dir.startswith("gs://"):  # Google Cloud Storage (GCS).
    try:
      subprocess.run(
          ["gsutil", "cp", "-r", local_file, trace_dir],
          check=True,
          capture_output=True,
      )
    except subprocess.CalledProcessError as e:
      print(
          f"Failed to upload '{local_file}' to GCS: '{trace_dir}'. Error: {e.stderr.decode()}"
      )
  else:
    raise ValueError(f"{trace_dir} is not a valid GCS path.")


class MetricsStatistics:
  """Represents summary statistics for a list of metrics."""

  def __init__(self, metrics_list, metrics_name: str):
    self.metrics_list = metrics_list
    self.metrics_name = metrics_name
    self.statistics = self._calculate_statistics()

  def _calculate_statistics(self) -> Dict[str, float]:
    """Calculates the statistics of the metrics list."""
    if not self.metrics_list:
      return {}  # Return an empty dict if metrics_list is empty.
    return {
        "p50": np.percentile(self.metrics_list, 50),
        "p90": np.percentile(self.metrics_list, 90),
        "p95": np.percentile(self.metrics_list, 95),
        "p99": np.percentile(self.metrics_list, 99),
        "avg": np.mean(self.metrics_list),
    }

  def __repr__(self):
    return (
        f"MetricsStatistics(metrics_name='{self.metrics_name}', "
        f"statistics={self.statistics})"
    )

  def serialize_statistics(self):
    """Flattens the statistics into a single dict keyed by metric name."""
    serialized = {}
    for stat_name, stat_value in self.statistics.items():
      serialized[f"{self.metrics_name}_{stat_name}"] = stat_value
    return serialized
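
# Example:
#
#   stats = MetricsStatistics([1.2, 1.4, 1.3], metrics_name="latency_ms")
#   stats.statistics["p50"]       # -> 1.3
#   stats.serialize_statistics()  # -> {"latency_ms_p50": 1.3, ...}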


def rename_xla_dump(
    tmp_xla_dump_dir: str,
    dest_xla_dump_dir: str,
    benchmark_name: str,
    benchmark_param: Dict[str, Any],
):
  """Renames XLA dump files after the benchmark that produced them.

  Finds the latest XLA dump file matching '*jit_f*before_optimizations*.txt',
  then identifies all other files that share the same 'jit_f.[unique_id]'
  identifier and renames them to
  'benchmark_name_serialized_params.original_suffix_with_extension'.
  """
  serialized_benchmark_param = "_".join(
      f"{key}_{value}" for key, value in benchmark_param.items()
  )
  anchor_pattern = os.path.join(tmp_xla_dump_dir, "*jit_f*before_optimizations*.txt")
  matching_anchor_files = glob.glob(anchor_pattern)
  if not matching_anchor_files:
    print(
        f"No files found for anchor pattern: '{anchor_pattern}'. No files will be renamed."
    )
    return

  # Sort anchor files by modification time (latest first).
  matching_anchor_files.sort(key=os.path.getmtime, reverse=True)
  latest_anchor_file = matching_anchor_files[0]

  # Example: 'module_0080.jit_f.cl_747713181.before_optimizations.txt'
  # yields 'module_0080.jit_f.cl_747713181'.
  filename_base = os.path.basename(latest_anchor_file)
  jit_id_match = re.search(r"(module.*jit_f\.[^.]+)", filename_base)
  if not jit_id_match:
    print(
        f"Could not extract 'jit_f.[unique_id]' from '{filename_base}'. Cannot proceed with renaming."
    )
    return
  common_jit_id_prefix = jit_id_match.group(1)

  # Find all files in the directory that contain this common_jit_id_prefix.
  all_related_files_pattern = os.path.join(
      tmp_xla_dump_dir, f"*{common_jit_id_prefix}*"
  )
  all_related_files = glob.glob(all_related_files_pattern)
  if not all_related_files:
    print(
        f"No files found containing '{common_jit_id_prefix}'. This is unexpected if an anchor was found."
    )
    return

  new_base_name = f"{benchmark_name}_{serialized_benchmark_param}"
  for original_filepath in all_related_files:
    original_filename = os.path.basename(original_filepath)
    # Capture the suffix *after* the common_jit_id_prefix, including its
    # leading dot. For example, if original_filename is
    # 'module_0080.jit_f.cl_747713181.after_codegen.txt' and the prefix is
    # 'jit_f.cl_747713181', we want to capture '.after_codegen.txt'.
    suffix_match = re.search(
        re.escape(common_jit_id_prefix) + r"(\..*)", original_filename
    )
    if suffix_match:
      original_suffix_with_extension = suffix_match.group(1)  # e.g., '.after_codegen.txt'
      new_filename = f"{new_base_name}{original_suffix_with_extension}"
      new_filepath = os.path.join(dest_xla_dump_dir, new_filename)
      if original_filepath == new_filepath:
        print(
            f"Skipping: '{original_filename}' already has the desired name or path."
        )
        continue
      # Copy the renamed file to the desired location.
      if is_local_directory_path(dest_xla_dump_dir):
        try:
          os.makedirs(dest_xla_dump_dir, exist_ok=True)
          shutil.copy(original_filepath, new_filepath)
        except Exception as e:
          print(
              f"An unexpected error occurred while copying '{original_filepath}': {e}"
          )
      else:
        upload_to_storage(trace_dir=new_filepath, local_file=original_filepath)
  print(f"The XLA dump is stored in {dest_xla_dump_dir}")