Skip to content

Commit 4edb011

Browse files
committed
[NE16] Add GAP9_w_NE16 platform: NE16 accelerator Engine on GAP9
Mirrors the Siracusa_w_neureka pattern. NE16Platform extends GAP9Platform with engines=[NE16Engine, GAP9ClusterEngine]; NE16Deployer extends GAP9Deployer (reuses ClDma transformers via GAP9Bindings). New Target: Deeploy/Targets/NE16/ (Platform, Engine, Bindings, Parsers, Tiler, Deployer, Templates, TileConstraints, TopologyOptimizationPasses). The _weightEncode function is ported from pulp-nnx/test/Ne16Weight.py (single CIN_SUBTILE=16 mode, no 1x1 vs 3x3 split). ConvTemplate subtile constants set per ne16_task_defs.h (output 3x3, weight stride bytes PW=16 DW/Dense=144). New test infrastructure: - DeeployTest/deeployRunner_tiled_gap9_w_ne16.py - DeeployTest/test_gap9_ne16_tiled_config.py (PW/DW/Dense RQ Conv) DeeployTest wiring: - testUtils/platformMapping.py: register GAP9_w_NE16 in the platforms list, mapPlatform, setupMemoryPlatform, mapDeployer. - testMVP.py: include GAP9_w_NE16 in the EngineColoringDeployerWrapper branch (without it NE16AdjustWeightMemoryLayoutPass never fires and parsing backtracks to exhaustion). - testUtils/core/execution.py: build the GAP9 SDK 'image' target for GAP9_w_NE16 too (so chip.soc.mram.bin is produced before gvsoc run). - CMakeLists.txt, DeeployTest/CMakeLists.txt: accept GAP9_w_NE16 alongside GAP9 in the platform branches. - TargetLibraries/GAP9/CMakeLists.txt: for GAP9_w_NE16 platform, add_subdirectory on pulp-nnx with USE_NE16=ON and link it into deeploygap9. Fix: Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py referenced an undefined symbol float32_tPtr from Deeploy.AbstractDataTypes; define it locally via PointerClass(float32_t) to unblock the import chain reached by NE16Platform. Verified on gvsoc gap9.evk: PW 1x1 RQ (Regular_RQ): 0/1152 errors, 901917 cycles DW 3x3 RQ (DW_2D_RQ): 0/1280 errors, 27339 cycles (--enable-3x3) Dense 3x3 (Regular_2D_RQ): 0/6372 errors, 244595 cycles (--enable-3x3)
1 parent 8595539 commit 4edb011

32 files changed

Lines changed: 2448 additions & 11 deletions
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
---
6+
name: _runner-gap9-w-ne16-tiled
7+
8+
"on":
9+
workflow_call:
10+
inputs:
11+
runner:
12+
required: true
13+
type: string
14+
docker-image:
15+
required: true
16+
type: string
17+
pytest-markers:
18+
required: true
19+
type: string
20+
21+
jobs:
22+
test-runner-gap9-w-ne16-tiled:
23+
runs-on: ${{ inputs.runner }}
24+
container:
25+
image: ${{ inputs.docker-image }}
26+
steps:
27+
- name: Checkout Repo
28+
uses: actions/checkout@v4
29+
with:
30+
submodules: recursive
31+
- name: Build Deeploy
32+
shell: bash
33+
run: |
34+
source /app/install/gap9-sdk/.gap9-venv/bin/activate
35+
source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
36+
pip install -e . || true
37+
deactivate
38+
- name: Cache ccache
39+
uses: actions/cache/restore@v4
40+
with:
41+
path: /app/.ccache
42+
key: ccache-gap9
43+
- name: Run Test
44+
run: |
45+
source /app/install/gap9-sdk/.gap9-venv/bin/activate
46+
source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
47+
export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
48+
export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
49+
cd DeeployTest
50+
mkdir -p /app/.ccache
51+
export CCACHE_DIR=/app/.ccache
52+
pytest test_platforms.py -v -m "${{ inputs.pytest-markers }}"
53+
deactivate
54+
shell: bash
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
---
6+
name: CI • GAP9 + NE16 (Tiled)
7+
8+
"on":
9+
push:
10+
branches:
11+
- "**"
12+
tags:
13+
- "v*.*.*"
14+
pull_request:
15+
workflow_dispatch:
16+
inputs:
17+
docker_image_deeploy:
18+
description: "Deeploy Image to use"
19+
required: false
20+
default: "ghcr.io/pulp-platform/deeploy-gap9:latest"
21+
22+
concurrency:
23+
group: ${{ github.workflow }}-${{ github.ref }}
24+
cancel-in-progress: true
25+
26+
jobs:
27+
select-env:
28+
uses: ./.github/workflows/_select-env.yml
29+
with:
30+
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || github.repository == 'pulp-platform/Deeploy' && 'ghcr.io/pulp-platform/deeploy-gap9:latest'}}
31+
32+
gap9-w-ne16-kernels-tiled-singlebuffer-L2:
33+
needs: select-env
34+
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
35+
with:
36+
runner: ${{ needs.select-env.outputs.runner }}
37+
docker-image: ${{ needs.select-env.outputs.image }}
38+
pytest-markers: "gap9_w_ne16_tiled and kernels and singlebuffer and l2"
39+
40+
gap9-w-ne16-kernels-tiled-doublebuffer-L2:
41+
needs: select-env
42+
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
43+
with:
44+
runner: ${{ needs.select-env.outputs.runner }}
45+
docker-image: ${{ needs.select-env.outputs.image }}
46+
pytest-markers: "gap9_w_ne16_tiled and kernels and doublebuffer and l2"

CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
2020
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
2121
endif()
2222

23-
set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
24-
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
23+
set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, GAP9_w_NE16, Generic, Snitch)")
24+
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 GAP9_w_NE16 Generic Snitch)
2525

2626
if(platform STREQUAL MemPool)
2727
message(STATUS "Building for platform 'MemPool'")
@@ -33,8 +33,8 @@ elseif(platform STREQUAL Siracusa_w_neureka)
3333
message(STATUS "Building for platform 'Siracusa_w_neureka'")
3434
elseif(platform STREQUAL PULPOpen)
3535
message(STATUS "Building for platform 'PULP-Open'")
36-
elseif(platform STREQUAL GAP9)
37-
message(STATUS "Building for platform 'GAP9'")
36+
elseif(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
37+
message(STATUS "Building for platform '${platform}'")
3838
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
3939
set(ENV{KCONFIG_CONFIG} DeeployTest/Platforms/GAP9/sdk.config)
4040
include($ENV{GAP_SDK_HOME}/utils/cmake/setup.cmake)
@@ -53,7 +53,7 @@ endif()
5353
# Import useful functions / macros
5454
include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake)
5555
# Only if not a GAP9-based platform (GAP9 or GAP9_w_NE16)
56-
if(NOT platform STREQUAL GAP9)
56+
if(NOT platform STREQUAL GAP9 AND NOT platform STREQUAL GAP9_w_NE16)
5757
include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake)
5858
endif()
5959
include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake)
@@ -222,7 +222,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor
222222

223223
endif()
224224

225-
if(platform STREQUAL GAP9)
225+
if(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
226226
project(${TESTNAME} LANGUAGES C ASM)
227227
include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_gvsoc.cmake)
228228
add_compile_options(

Deeploy/Targets/NE16/Bindings.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from Deeploy.AbstractDataTypes import PointerClass
6+
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
7+
from Deeploy.DeeployTypes import NodeBinding
8+
from Deeploy.Targets.GAP9.Bindings import GAP9ClusterTransformer as ClusterTransformer
9+
from Deeploy.Targets.Generic.TypeCheckers import ConvChecker
10+
from Deeploy.Targets.NE16.Templates.ConvTemplate import NE16DenseConv2D_Template, NE16DWConv2D_Template, \
11+
NE16PWConv2D_Template, NE16RqntDenseConv2D_Template, NE16RqntDWConv2D_Template, NE16RqntPWConv2D_Template
12+
from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker
13+
14+
# Activation / weight element types enumerated for every NE16 binding list.
_NE16_BYTE_TYPES = [uint8_t, int8_t]


def _ne16RequantizedBindings(template):
    """Build the bindings of a requantized NE16 convolution for ``template``.

    One binding is produced per (input type, output type, weight type)
    combination, iterating input types outermost and weight types innermost.
    Each checker takes four inputs (data, weight and two int32 pointers) and
    one output.
    """
    bindings = []
    for data_in_type in _NE16_BYTE_TYPES:
        for data_out_type in _NE16_BYTE_TYPES:
            for weight_type in _NE16_BYTE_TYPES:
                inputPointers = [
                    PointerClass(data_in_type),
                    PointerClass(weight_type),
                    PointerClass(int32_t),
                    PointerClass(int32_t),
                ]
                outputPointers = [PointerClass(data_out_type)]
                checker = PULPConvChecker(inputPointers, outputPointers)
                bindings.append(NodeBinding(checker, template, ClusterTransformer))
    return bindings


def _ne16PlainBindings(template):
    """Build the bindings of a non-requantized NE16 convolution for ``template``.

    One binding is produced per (input type, weight type) combination,
    iterating input types outermost. Each checker takes three inputs (data,
    weight and one int32 pointer) and a single int32 output.
    """
    bindings = []
    for data_in_type in _NE16_BYTE_TYPES:
        for weight_type in _NE16_BYTE_TYPES:
            inputPointers = [
                PointerClass(data_in_type),
                PointerClass(weight_type),
                PointerClass(int32_t),
            ]
            outputPointers = [PointerClass(int32_t)]
            checker = ConvChecker(inputPointers, outputPointers)
            bindings.append(NodeBinding(checker, template, ClusterTransformer))
    return bindings


# Pointwise convolution
NE16RQSPWConv2DBindings = _ne16RequantizedBindings(NE16RqntPWConv2D_Template)
NE16PWConv2DBindings = _ne16PlainBindings(NE16PWConv2D_Template)

# Depthwise convolution
NE16RQSDWConv2DBindings = _ne16RequantizedBindings(NE16RqntDWConv2D_Template)
NE16DWConv2DBindings = _ne16PlainBindings(NE16DWConv2D_Template)

# Dense convolution
NE16RQSDenseConv2DBindings = _ne16RequantizedBindings(NE16RqntDenseConv2D_Template)
NE16DenseConv2DBindings = _ne16PlainBindings(NE16DenseConv2D_Template)

Deeploy/Targets/NE16/Deployer.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Callable, Dict, Type
6+
7+
import onnx_graphsurgeon as gs
8+
9+
from Deeploy.AbstractDataTypes import Pointer
10+
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
11+
NCHWtoNHWCPass, PULPNCHWtoNHWCPass
12+
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
13+
from Deeploy.Targets.GAP9.Deployer import GAP9Deployer
14+
from Deeploy.Targets.NE16.TopologyOptimizationPasses.Passes import ConvEngineDiscolorationPass, NE16OptimizationPass
15+
16+
17+
class NE16Deployer(GAP9Deployer):
    """GAP9 deployer extended with the NE16-specific lowering passes.

    After the regular ``GAP9Deployer`` setup the lowering optimizer is adjusted:

    * if the first platform engine has ``enable3x3`` set, every
      ``PULPNCHWtoNHWCPass`` in the lowering optimizer is replaced by a plain
      ``NCHWtoNHWCPass``;
    * ``ConvEngineDiscolorationPass`` and ``NE16OptimizationPass`` are appended
      to the lowering optimizer.
    """

    def __init__(self,
                 graph: gs.Graph,
                 deploymentPlatform: DeploymentPlatform,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: TopologyOptimizer,
                 scheduler: Callable = lambda graph: list(graph.nodes),
                 name: str = 'DeeployNetwork',
                 default_channels_first = False,
                 deeployStateDir: str = "DeeployStateDir",
                 inputOffsets = None):
        """Set up the GAP9 deployer and register the NE16 lowering passes.

        Args:
            graph: ONNX graph to deploy.
            deploymentPlatform: Target platform; its first engine must expose
                an ``enable3x3`` attribute.
            inputTypes: Pointer type of each network input, keyed by name.
            loweringOptimizer: Optimizer whose ``passes`` list is modified in
                place by this constructor.
            scheduler: Callable returning the node execution order.
            name: Name of the generated network.
            default_channels_first: Forwarded to the parent and to the NE16
                passes.
            deeployStateDir: Directory forwarded to the parent deployer.
            inputOffsets: Per-input offsets forwarded to the parent. ``None``
                (the default) stands for an empty dict.
        """
        # A literal ``{}`` default is created once and shared by every call;
        # hand the parent a fresh dict so nothing written into it can leak
        # into later deployer instances.
        if inputOffsets is None:
            inputOffsets = {}

        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
                         default_channels_first, deeployStateDir, inputOffsets)

        # NOTE(review): relies on the NE16 engine being the first platform engine.
        if self.Platform.engines[0].enable3x3:
            passes = self.loweringOptimizer.passes
            # Replace in place so other holders of the same list see the change.
            for idx, loweringPass in enumerate(passes):
                if isinstance(loweringPass, PULPNCHWtoNHWCPass):
                    passes[idx] = NCHWtoNHWCPass(self.default_channels_first)

        self.loweringOptimizer.passes += [
            ConvEngineDiscolorationPass(),
            NE16OptimizationPass(self.default_channels_first, "NE16")
        ]

Deeploy/Targets/NE16/Engine.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import List
6+
7+
import onnx_graphsurgeon as gs
8+
9+
from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper
10+
from Deeploy.Targets.Generic.Layers import ConvLayer
11+
from Deeploy.Targets.NE16.Parsers import NE16DenseConv2DParser, NE16DWConv2DParser, NE16PWConv2DParser, \
12+
NE16RQSDenseConv2DParser, NE16RQSDWConv2DParser, NE16RQSPWConv2DParser
13+
from Deeploy.Targets.NE16.Tiler import NE16DenseConv2DTilingReadyBindings, NE16DWConv2DTilingReadyBindings, \
14+
NE16PWConv2DTilingReadyBindings, NE16RQSDenseConv2DTilingReadyBindings, NE16RQSDWConv2DTilingReadyBindings, \
15+
NE16RQSPWConv2DTilingReadyBindings
16+
from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer
17+
18+
# Node mappers: each pairs an NE16 parser with its tiling-ready bindings.
# Requantized variants
NE16RqntPWConv2DMapper = NodeMapper(NE16RQSPWConv2DParser(), NE16RQSPWConv2DTilingReadyBindings)
NE16RqntDWConv2DMapper = NodeMapper(NE16RQSDWConv2DParser(), NE16RQSDWConv2DTilingReadyBindings)
NE16RqntDenseConv2DMapper = NodeMapper(NE16RQSDenseConv2DParser(), NE16RQSDenseConv2DTilingReadyBindings)

# Non-requantized variants
NE16PWConv2DMapper = NodeMapper(NE16PWConv2DParser(), NE16PWConv2DTilingReadyBindings)
NE16DWConv2DMapper = NodeMapper(NE16DWConv2DParser(), NE16DWConv2DTilingReadyBindings)
NE16DenseConv2DMapper = NodeMapper(NE16DenseConv2DParser(), NE16DenseConv2DTilingReadyBindings)

# Operator name -> layer; the mappers are listed pointwise, depthwise, dense.
NE16Mapping = dict(
    RequantizedConv = PULPRQSConvLayer([
        NE16RqntPWConv2DMapper,
        NE16RqntDWConv2DMapper,
        NE16RqntDenseConv2DMapper,
    ]),
    Conv = ConvLayer([
        NE16PWConv2DMapper,
        NE16DWConv2DMapper,
        NE16DenseConv2DMapper,
    ]),
)

# Headers passed to the engine as its include list.
_includeList = [
    "pulp_nnx_ne16.h",
    "pulp_nnx_util.h",
    "ne16_pulp_bsp.h",
    "ne16.h",
    "ne16_task.h",
]

# C snippet passed to the engine as its initialization code.
_ne16InitCode = r"""
ne16_pulp_conf_t conf = {.max_stall = 8};
ne16_nnx_init(ne16_pulp_get_dev(), &conf);
"""
38+
39+
40+
class NE16Engine(DeploymentEngine):
    """Deployment engine that decides which convolution nodes run on NE16.

    A node qualifies only if it is a ``Conv`` or ``RequantizedConv`` with a
    constant weight input, unit dilation and either unit strides or
    ``enableStrides`` set. 1x1 kernels are always accepted; 3x3 dense and
    depthwise kernels are accepted only when ``enable3x3`` is set.
    """

    def __init__(self,
                 name: str,
                 Mapping = NE16Mapping,
                 initCode: str = _ne16InitCode,
                 includeList: List[str] = _includeList,
                 enable3x3: bool = False,
                 enableStrides: bool = False) -> None:
        super().__init__(name, Mapping, initCode, includeList)

        self.enable3x3 = enable3x3
        self.enableStrides = enableStrides

    def _isUndilatedConstConv(self, node, kernelShape) -> bool:
        """Check op type, constant weights, kernel shape and unit dilation, in that order."""
        if node.op not in ["Conv", "RequantizedConv"]:
            return False
        if not isinstance(node.inputs[1], gs.Constant):
            return False
        if not (node.attrs['kernel_shape'] == kernelShape):
            return False
        return node.attrs['dilations'] == [1, 1]

    def _stridesSupported(self, node):
        """Unit strides are always fine; anything else needs ``enableStrides``."""
        return node.attrs['strides'] == [1, 1] or self.enableStrides

    def isDenseConv(self, node) -> bool:
        """Return whether ``node`` is a supported 3x3 convolution with ``group == 1``."""
        return self._isUndilatedConstConv(node, [3, 3]) and \
            node.attrs['group'] == 1 and \
            self._stridesSupported(node)

    def isPWConv(self, node) -> bool:
        """Return whether ``node`` is a supported 1x1 (pointwise) convolution."""
        return self._isUndilatedConstConv(node, [1, 1]) and self._stridesSupported(node)

    def isDWConv(self, node) -> bool:
        """Return whether ``node`` is a supported 3x3 convolution with ``group != 1``."""
        return self._isUndilatedConstConv(node, [3, 3]) and \
            node.attrs['group'] != 1 and \
            self._stridesSupported(node)

    def canExecute(self, node: gs.Node) -> bool:
        """Return whether this engine accepts ``node``."""
        # Without 3x3 support only pointwise convolutions are taken.
        if not self.enable3x3:
            return self.isPWConv(node)
        return self.isPWConv(node) or self.isDWConv(node) or self.isDenseConv(node)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Tuple
6+
7+
import numpy as np
8+
import onnx_graphsurgeon as gs
9+
10+
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import SequentialPass
11+
from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext
12+
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel
13+
14+
15+
class AnnotateNE16WeightMemoryLevel(SequentialPass):
    """Place weights of NE16 convolutions into a dedicated weight memory level.

    Weight buffers of ``Conv`` / ``RequantizedConv`` nodes whose ``engine``
    attribute equals ``ne16EngineName`` are annotated with the weight memory
    level, in graph order, for as long as the accumulated size stays strictly
    below the level's capacity.
    """

    def __init__(self, ne16EngineName: str, weightMemoryLevel: MemoryLevel):
        """Store the engine name to match and the target weight memory level."""
        self._weightMemoryLevel = weightMemoryLevel
        self.ne16EngineName = ne16EngineName
        super().__init__()

    def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]:
        """Annotate fitting NE16 weight buffers and return ``(ctxt, graph)``.

        Only the ``_memoryLevel`` attribute of buffers in ``ctxt`` is modified;
        the graph is returned as is.
        """

        def _ne16WeightBufferSize(buffer: ConstantBuffer) -> int:
            return int(np.prod(buffer.shape))  # Weights are encoded as bytes so no need to check for typeWidth

        levelName = self._weightMemoryLevel.name
        weightMemoryOccupation = 0

        # Current weight memory occupation
        for buffer in {**ctxt.globalObjects, **ctxt.localObjects}.values():
            if hasattr(buffer, "_memoryLevel") and buffer._memoryLevel == levelName:
                weightMemoryOccupation += _ne16WeightBufferSize(buffer)

        for node in graph.nodes:
            # Nodes without an engine annotation are simply not NE16 nodes.
            if node.attrs.get("engine") != self.ne16EngineName:
                continue
            if node.op not in ["Conv", "RequantizedConv"]:
                continue

            weightName = node.inputs[1].name
            if not (ctxt.is_local(weightName) or ctxt.is_global(weightName)):
                continue

            buffer = ctxt.lookup(weightName)

            # A buffer already living in the weight memory level (annotated
            # earlier, or shared with a previously visited node) is already
            # part of the occupation; counting it again would wrongly shrink
            # the space left for the remaining weights.
            if hasattr(buffer, "_memoryLevel") and buffer._memoryLevel == levelName:
                continue

            bufferSize = _ne16WeightBufferSize(buffer)
            if weightMemoryOccupation + bufferSize < self._weightMemoryLevel.size:
                buffer._memoryLevel = levelName
                weightMemoryOccupation += bufferSize
        return ctxt, graph
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from . import *

0 commit comments

Comments
 (0)