[AArch64] Add intrinsics for multi-vector FEAT_SVE_BFSCALE instructions#163536
[AArch64] Add intrinsics for multi-vector FEAT_SVE_BFSCALE instructions#163536
Conversation
|
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-aarch64 Author: None (Lukacma) ChangesThis patch add intrinsics support for multi-vector BFMUL and BFSCALE instruction based on ARM-software/acle#410 ACLE specification proposal Patch is 31.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163536.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..96b5e55beca6d 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2082,6 +2082,13 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,sve-b16b16"in {
defm SVBFMAXNM : BfSingleMultiVector<"maxnm">;
}
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,sve-bfscale" in {
+ // BFMUL
+ defm SVBFMUL : BfSingleMultiVector<"mul">;
+ // BFSCALE
+ defm SVBFSCALE : BfSingleMultiVector<"scale">;
+}
+
let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
// == ADD (vectors) ==
def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c
new file mode 100644
index 0000000000000..187e9390f742c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfmul.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_single_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svmul_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svmul_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svmul,_single_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_single_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svmul_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svmul_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svmul,_single_bf16_x4)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z18test_svmul_bf16_x214svbfloat16x2_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svmul_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svmul,_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svmul_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z18test_svmul_bf16_x414svbfloat16x4_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fmul.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svmul_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svmul,_bf16_x4)(zdn, zm);
+}
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c
new file mode 100644
index 0000000000000..6f8606c22954f
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bfscale.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_single_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z27test_svscale_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svscale_single_bf16_x2(svbfloat16x2_t zdn, svbfloat16_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svscale,_single_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_single_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z27test_svscale_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svscale_single_bf16_x4(svbfloat16x4_t zdn, svbfloat16_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svscale,_single_bf16_x4)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z20test_svscale_bf16_x214svbfloat16x2_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_svscale_bf16_x2(svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svscale,_bf16_x2)(zdn, zm);
+}
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svscale_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z20test_svscale_bf16_x414svbfloat16x4_tS_(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[ZDN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZDN_COERCE3:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE2:%.*]], <vscale x 8 x bfloat> [[ZM_COERCE3:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fscale.x4.nxv8bf16(<vscale x 8 x bfloat> [[ZDN_COERCE0]], <vscale x 8 x bfloat> [[ZDN_COERCE1]], <vscale x 8 x bfloat> [[ZDN_COERCE2]], <vscale x 8 x bfloat> [[ZDN_COERCE3]], <vscale x 8 x bfloat> [[ZM_COERCE0]], <vscale x 8 x bfloat> [[ZM_COERCE1]], <vscale x 8 x bfloat> [[ZM_COERCE2]], <vscale x 8 x bfloat> [[ZM_COERCE3]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svscale_bf16_x4(svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming{
+ return SVE_ACLE_FUNC(svscale,_bf16_x4)(zdn, zm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b0269eec3347a..09cc158dc9767 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3627,10 +3627,10 @@ let TargetPrefix = "aarch64" in {
}
//
- // Multi-vector floating point min/max number
+ // Multi-vector floating point min/max number, scale, and multiply
//
- foreach instr = ["fmaxnm", "fminnm"] in {
+ foreach instr = ["fmaxnm", "fminnm", "fscale", "fmul"] in {
def int_aarch64_sve_ # instr # _single_x2 : SVE2_VG2_Multi_Single_Intr...
[truncated]
|
|
Same patch as #163346. Just had to revert it due to ACLE not being finilized yet. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
CarolineConcatto
left a comment
There was a problem hiding this comment.
LGTM.
I would only update the commit message to add the C intrinsics:
These are the intrinsics prototype implemented, according to [1]
//BFMUL:
svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) __arm_streaming;
svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm) __arm_streaming;
svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) __arm_streaming;
svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm) __arm_streaming;
//BFSCALE
svbfloat16x2_t svscale[_bf16_x2](svbfloat16x2_t zdn, svint16x2_t zm) __arm_streaming;
svbfloat16x2_t svscale[_single_bf16_x2](svbfloat16x2_t zn, svint16_t zm) __arm_streaming;
svbfloat16x4_t svscale[_bf16_x4](svbfloat16x4_t zdn, svint16x4_t zm) __arm_streaming;
svbfloat16x4_t svscale[_single_bf16_x4](svbfloat16x4_t zn, svint16_t zm) __arm_streaming;
[1]ARM-software/acle#410
…ns (llvm#163536) These are the intrinsics prototype implemented, according to [1] //BFMUL: svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) __arm_streaming; svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm) __arm_streaming; svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) __arm_streaming; svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm) __arm_streaming; //BFSCALE svbfloat16x2_t svscale[_bf16_x2](svbfloat16x2_t zdn, svint16x2_t zm) __arm_streaming; svbfloat16x2_t svscale[_single_bf16_x2](svbfloat16x2_t zn, svint16_t zm) __arm_streaming; svbfloat16x4_t svscale[_bf16_x4](svbfloat16x4_t zdn, svint16x4_t zm) __arm_streaming; svbfloat16x4_t svscale[_single_bf16_x4](svbfloat16x4_t zn, svint16_t zm) __arm_streaming; [1]ARM-software/acle#410
This patch adds intrinsics for multi-vector FMUL instructions introduced in armv9.6, based on [this](ARM-software/acle#412) ACLE proposal. Depends on #163536 Intrinsics implemented: //BFMUL: svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) __arm_streaming; svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm) __arm_streaming; svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) __arm_streaming; svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm) __arm_streaming;
This patch add intrinsics support for multi-vector BFMUL and BFSCALE instruction based on ARM-software/acle#410 ACLE specification proposal