Skip to content

Commit 4b5347d

Browse files
committed
Adding README back into the gemm directory and integrating new preshuffle functions
1 parent 2e0ac15 commit 4b5347d

20 files changed

Lines changed: 732 additions & 380 deletions

Jenkinsfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,7 +1644,7 @@ pipeline {
16441644
-D GEMM_PRESHUFFLE_LAYOUT="rcr" \
16451645
-D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
16461646
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
1647-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1647+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
16481648
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
16491649
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
16501650
}
@@ -1685,7 +1685,7 @@ pipeline {
16851685
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
16861686
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
16871687
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
1688-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1688+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
16891689
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
16901690
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
16911691
}
@@ -1710,7 +1710,7 @@ pipeline {
17101710
-D GEMM_UNIVERSAL_DATATYPE="fp16" \
17111711
-D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \
17121712
ninja -j${nthreads()} benchmark_gemm_universal_all && \
1713-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
1713+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
17141714
}
17151715
steps{
17161716
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)

test/ck_tile/gemm_tile_engine/CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# ============================================================================
1111

1212
# Locate tile_engine GEMM scripts directory
13-
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm")
13+
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal")
1414

1515
if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR})
1616
message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}")
@@ -32,11 +32,11 @@ endif()
3232
# config_json - Full path to JSON configuration file
3333
# ============================================================================
3434
function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json)
35-
set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
35+
set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
3636
set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}")
3737

3838
# Generated header path (already created during cmake configuration)
39-
set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
39+
set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
4040
set(test_params_header "${working_path}/test_params.hpp")
4141

4242
# Verify header exists (should have been generated during cmake configuration)
@@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name)
118118

119119
# STEP 1: Discovery phase - list all valid kernel configurations
120120
execute_process(
121-
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
121+
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
122122
--working_path ${working_path}
123123
--datatype ${datatype}
124124
--layout ${layout}
@@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name)
178178

179179
# Generate header using --gen_single
180180
execute_process(
181-
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
181+
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
182182
--working_path ${working_path}
183183
--gpu_target "${GEMM_TEST_GPU_TARGETS}"
184184
--datatype ${datatype}

tile_engine/ops/common/benchmark_utils.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,16 @@
22
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
33
# SPDX-License-Identifier: MIT
44

5-
import sys
65
import json
76
import subprocess
8-
import argparse
97
import csv
10-
import time
118
from pathlib import Path
12-
from typing import List, Dict, Tuple, Optional
9+
from typing import List, Dict, Optional
1310

1411

15-
def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]:
12+
def run_kernel(
13+
build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False
14+
) -> Optional[Dict]:
1615
"""Run a single kernel with given parameters and save output to individual JSON file"""
1716
# Create results directory
1817
results_dir = build_dir / "results"
@@ -59,6 +58,7 @@ def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbo
5958
print(f"Error running {kernel_path.name}: {e}")
6059
return None
6160

61+
6262
def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
6363
"""Parse JSON data from individual kernel output file"""
6464
try:
@@ -88,9 +88,8 @@ def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
8888
print(f"Error reading JSON file {json_file}: {e}")
8989
return None
9090

91-
def find_best_kernel(
92-
results: List[Dict], metric: str = "tflops"
93-
) -> Optional[Dict]:
91+
92+
def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
9493
"""Find the best performing kernel based on metric"""
9594
if not results:
9695
return None
@@ -126,7 +125,8 @@ def export_csv(results: List[Dict], filename: str, verbose: bool = False):
126125

127126
print(f"Results exported to {filename}")
128127

129-
def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False):
128+
129+
def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False):
130130
"""Export best kernel selections to file"""
131131
with open(filename, "w") as f:
132132
f.write("# Best kernel selections\n")
@@ -141,7 +141,10 @@ def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = Fals
141141

142142
print(f"Best kernels exported to {filename}")
143143

144-
def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False):
144+
145+
def export_json(
146+
results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False
147+
):
145148
"""Export all results and best kernels to JSON with comprehensive metadata"""
146149
from datetime import datetime
147150

@@ -223,9 +226,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
223226
"benchmark_metadata": {
224227
"timestamp": datetime.now().isoformat(),
225228
"total_kernels_tested": len(results),
226-
"unique_kernels": len(
227-
set(r.get("name", "unknown") for r in results)
228-
),
229+
"unique_kernels": len(set(r.get("name", "unknown") for r in results)),
229230
"successful_runs": len(successful_results),
230231
"failed_runs": len(results) - len(successful_results),
231232
},
@@ -265,9 +266,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
265266
"by_scheduler": scheduler_stats,
266267
"by_data_type": data_type_stats,
267268
},
268-
"total_problem_configurations": len(best_kernels)
269-
if best_kernels
270-
else 0,
269+
"total_problem_configurations": len(best_kernels) if best_kernels else 0,
271270
},
272271
"kernel_results": results,
273272
"best_kernels_by_problem": best_kernels or {},
@@ -282,4 +281,3 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
282281
print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}")
283282
print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
284283
print(f" - Best latency: {min(latency_values, default=0):.2f}ms")
285-

tile_engine/ops/common/utils.hpp

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout)
2020
return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
2121
}
2222

23-
// Structure to hold kernel traits for dispatcher
24-
struct KernelTraits
25-
{
26-
std::string pipeline; // compv3, compv4, mem
27-
std::string scheduler; // intrawave, interwave
28-
std::string epilogue; // cshuffle, default
29-
bool pad_m;
30-
bool pad_n;
31-
bool pad_k;
32-
bool persistent;
33-
34-
// Constructor with defaults
35-
KernelTraits()
36-
: pipeline("compv3"),
37-
scheduler("intrawave"),
38-
epilogue("cshuffle"),
39-
pad_m(false),
40-
pad_n(false),
41-
pad_k(false),
42-
persistent(false)
43-
{
44-
}
45-
};
46-
47-
48-
// Create argument parser
49-
inline auto create_args(int argc, char* argv[])
50-
{
51-
ck_tile::ArgParser arg_parser;
52-
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
53-
.insert("n", "4096", "The value for n dimension. Default is 4096.")
54-
.insert("k", "2048", "The value for k dimension. Default is 2048.")
55-
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
56-
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
57-
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
58-
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
59-
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
60-
.insert("verify",
61-
"2",
62-
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
63-
"for validation on GPU. Default is 2, GPU validation.")
64-
.insert("log",
65-
"false",
66-
"Whether output kernel instance information or not. Possible values are true or "
67-
"false. Default is false")
68-
.insert(
69-
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
70-
.insert(
71-
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
72-
.insert("timer",
73-
"true",
74-
"Whether if the timer is gpu timer or not. Possible values are false or true. "
75-
"Default is true.")
76-
.insert("init",
77-
"0",
78-
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
79-
"for constant(1). Default is 0, random.")
80-
.insert("flush_cache",
81-
"true",
82-
"To flush cache, possible values are true or false. "
83-
"Default is false.")
84-
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
85-
.insert("metric",
86-
"0",
87-
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
88-
"tflops, or 2 for bandwidth. Default is 0, latency.")
89-
.insert("csv_filename",
90-
"",
91-
"The filename of benchmark result. Default is empty (no CSV output).")
92-
.insert("structured_sparsity",
93-
"false",
94-
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
95-
"false")
96-
.insert("json_output",
97-
"false",
98-
"Whether to output results in JSON format only. Possible values are true or false. "
99-
"Default is "
100-
"false");
101-
102-
bool result = arg_parser.parse(argc, argv);
103-
return std::make_tuple(result, arg_parser);
104-
}
105-
10623
enum class Metric
10724
{
10825
LATENCY = 0,

0 commit comments

Comments
 (0)