diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 06512de6d66..4767571209a 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -817,6 +817,12 @@ list(
 file(
   GLOB_RECURSE CC_SRCS RELATIVE ${CMAKE_SOURCE_DIR}
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc
   runtime/runtime.cc
   runtime/iluvatar_context.cc
   common/*.cc
diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
new file mode 100644
index 00000000000..96cca79f412
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import unittest
+
+import numpy as np
+import paddle
+import paddle.incubate.cc as pcc
+import paddle.incubate.cc.typing as pct
+
+
+def GetPirProgram(fused_func, tensor_args):
+    """Return the forward PIR program compiled for tensor_args' dtypes."""
+    dtypes = tuple(tensor.dtype for tensor in tensor_args)
+    func = fused_func.func_overload_ctx.dtypes2func.get(dtypes, None)
+    return str(func.infer_program.forward_program)
+
+
+# Problem sizes: x is [BS, MS, KS], y is [KS, NS], epilogue tensors [BS, MS, NS].
+DT = 'float16'
+BS = 4
+MS = 784
+NS = 192
+KS = 768
+
+
+class TestMatmulEpilogue(unittest.TestCase):
+    def setUp(self):
+        dtype = DT
+
+        def _stopped_randn(shape):
+            # Inference-only inputs: gradients are never needed.
+            t = paddle.randn(shape, dtype=dtype)
+            t.stop_gradient = True
+            return t
+
+        self.x = _stopped_randn([BS, MS, KS])
+        self.y = _stopped_randn([KS, NS])
+        self.b = _stopped_randn([BS, MS, NS])
+        self.b1 = _stopped_randn([1])
+        self.bias = _stopped_randn([NS])
+        self.residual = _stopped_randn([BS, MS, NS])
+        self.mask = _stopped_randn([BS, MS, NS])
+
+    def get_subgraph(self):
+        """Return the matmul+epilogue subgraph to be compiled by the AP pass."""
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        # Alternative epilogue kept for reference; not compiled below.
+        def matmul_add_act(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            b: pct.Tensor([B, M, N], T),
+            b1: pct.Tensor([1], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + b
+            return paddle.nn.functional.relu(out)
+
+        def matmul_add_divide_multipy_add_S1(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            bias: pct.Tensor([N], T),
+            residual: pct.Tensor([B, M, N], T),
+            mask: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + bias
+            # out = out / 1.2
+            out = out * mask
+            return residual + out
+
+        return matmul_add_divide_multipy_add_S1
+
+    def test_subgraph(self):
+        foo = self.get_subgraph()
+        fused_foo = pcc.compile(
+            foo, ap_path=f"{os.path.dirname(paddle.__file__)}/apy/matmul_pass"
+        )
+
+        # Every call below must match the compiled subgraph's signature:
+        # (x, y, bias, residual, mask).
+        args = (self.x, self.y, self.bias, self.residual, self.mask)
+        ap_outs = fused_foo(*args)
+        dy_outs = foo(*args)
+
+        generated_pir_program = GetPirProgram(fused_foo, args)
+        assert (
+            'pd_op.ap_variadic' in generated_pir_program
+        ), "fusion failed, excludes pd_op.ap_variadic"
+
+        iters = 10
+        # Warm up both paths before timing.
+        _ = fused_foo(*args)
+        _ = foo(*args)
+
+        paddle.device.synchronize()
+        start = time.time()
+        for _ in range(iters):
+            _ = fused_foo(*args)
+        paddle.device.synchronize()
+        avg_time = (time.time() - start) / iters
+        print(f"[Performance] Avg latency per run: {avg_time:.6f} s")
+
+        for dy_out, ap_out in zip(dy_outs, ap_outs):
+            np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.sh b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.sh
new file mode 100644
index 00000000000..fd7430883f2
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+export FLAGS_prim_enable_dynamic=true
+export FLAGS_prim_all=true
+
+# CINN related FLAG
+export FLAGS_use_cinn=false
+export FLAGS_group_schedule_tiling_first=true
+# PIR mode
+export FLAGS_enable_pir_api=true
+
+# print Program IR
+export FLAGS_print_ir=true
+
+# debug log
+export GLOG_v=0
+export GLOG_vmodule=ap_generic_drr_pass=6
+
+export CUDA_VISIBLE_DEVICES=11
+export FLAGS_enable_ap=1
+
+PADDLE_ROOT="${PADDLE_ROOT:-/path/to/your/paddle/build}"
+export PYTHONPATH="${PADDLE_ROOT}/python:$PYTHONPATH"
+export AP_WORKSPACE_DIR="/tmp/ap_workspace"
+export AP_PATH="${PADDLE_ROOT}/python/paddle/apy/sys:${PADDLE_ROOT}/python/paddle/apy/matmul_pass:$AP_PATH"
+
+python test_ap_matmul_epilogue_iluvatar.py 2>&1 | tee output.log