Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/workflows/_runner-gap9-w-ne16-tiled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

---
name: _runner-gap9-w-ne16-tiled

# Reusable workflow: the caller supplies the runner label, the container
# image and the pytest marker expression used to select tests.
"on":
  workflow_call:
    inputs:
      runner:
        type: string
        required: true
      docker-image:
        type: string
        required: true
      pytest-markers:
        type: string
        required: true

jobs:
  test-runner-gap9-w-ne16-tiled:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.docker-image }}
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Editable install inside the GAP9 SDK virtualenv. Sourcing the SDK
      # config script and the pip install are both allowed to fail
      # (`|| true`), so this step does not abort the job on those errors.
      - name: Build Deeploy
        shell: bash
        run: |
          source /app/install/gap9-sdk/.gap9-venv/bin/activate
          source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
          pip install -e . || true
          deactivate

      # Restore-only cache action; this workflow defines no matching
      # save step.
      - name: Cache ccache
        uses: actions/cache/restore@v4
        with:
          key: ccache-gap9
          path: /app/.ccache

      # Runs the platform tests selected by the caller's marker expression,
      # with ccache pointed at the directory restored above.
      - name: Run Test
        shell: bash
        run: |
          source /app/install/gap9-sdk/.gap9-venv/bin/activate
          source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
          export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
          export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
          cd DeeployTest
          mkdir -p /app/.ccache
          export CCACHE_DIR=/app/.ccache
          pytest test_platforms.py -v -m "${{ inputs.pytest-markers }}"
          deactivate
4 changes: 2 additions & 2 deletions .github/workflows/_runner-snitch-tiled-sequential.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ jobs:
- name: Build Deeploy
shell: bash
run: pip install -e .
- name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores.
- name: Run Test # 2-way parallel: 4-way OOMs the GitHub runner on the FP32 GEMM/TransB build.
run: |
cd DeeployTest
mkdir -p /app/.ccache
export CCACHE_DIR=/app/.ccache
pytest test_platforms.py -v -n 4 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
pytest test_platforms.py -v -n 2 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
shell: bash
3 changes: 3 additions & 0 deletions .github/workflows/ci-platform-gap9-tiled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ concurrency:

jobs:
select-env:
# ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
# self-hosted runners have credentials. Skip cleanly on forks.
if: github.repository == 'pulp-platform/Deeploy'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
Expand Down
52 changes: 52 additions & 0 deletions .github/workflows/ci-platform-gap9-w-ne16-tiled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

---
name: CI • GAP9 + NE16 (Tiled)

# Triggers: pushes to any branch or to version tags, pull requests, and
# manual dispatch with an optional container image override.
"on":
  push:
    branches:
      - "**"
    tags:
      - "v*.*.*"
  pull_request:
  workflow_dispatch:
    inputs:
      docker_image_deeploy:
        description: "Deeploy Image to use"
        default: "ghcr.io/pulp-platform/deeploy-gap9:devel"
        required: false

# One active run per workflow and ref; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  select-env:
    # The GAP9 + NE16 image is hosted in pulp-platform's private ghcr.io
    # registry; only upstream's self-hosted runners have credentials to
    # pull it. On forks the docker pull always returns "denied", so skip
    # the whole pipeline cleanly there. (Same constraint as the existing
    # ci-platform-gap9{,-tiled}.yml jobs.)
    if: github.repository == 'pulp-platform/Deeploy'
    uses: ./.github/workflows/_select-env.yml
    with:
      docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

  # The two jobs below call the same reusable runner workflow and differ
  # only in the pytest marker expression (singlebuffer vs. doublebuffer).
  gap9-w-ne16-kernels-tiled-singlebuffer-L2:
    needs: select-env
    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
    with:
      pytest-markers: "gap9_w_ne16_tiled and kernels and singlebuffer and l2"
      runner: ${{ needs.select-env.outputs.runner }}
      docker-image: ${{ needs.select-env.outputs.image }}

  gap9-w-ne16-kernels-tiled-doublebuffer-L2:
    needs: select-env
    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
    with:
      pytest-markers: "gap9_w_ne16_tiled and kernels and doublebuffer and l2"
      runner: ${{ needs.select-env.outputs.runner }}
      docker-image: ${{ needs.select-env.outputs.image }}
3 changes: 3 additions & 0 deletions .github/workflows/ci-platform-gap9.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ concurrency:

jobs:
select-env:
# ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
# self-hosted runners have credentials. Skip cleanly on forks.
if: github.repository == 'pulp-platform/Deeploy'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, GAP9_w_NE16, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 GAP9_w_NE16 Generic Snitch)

if(platform STREQUAL MemPool)
message(STATUS "Building for platform 'MemPool'")
Expand All @@ -33,8 +33,8 @@ elseif(platform STREQUAL Siracusa_w_neureka)
message(STATUS "Building for platform 'Siracusa_w_neureka'")
elseif(platform STREQUAL PULPOpen)
message(STATUS "Building for platform 'PULP-Open'")
elseif(platform STREQUAL GAP9)
message(STATUS "Building for platform 'GAP9'")
elseif(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
message(STATUS "Building for platform '${platform}'")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

# Select SDK config based on simulator type
Expand Down Expand Up @@ -62,7 +62,7 @@ endif()
# Import useful functions / macros
include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake)
# Only if not GAP9
if(NOT platform STREQUAL GAP9)
if(NOT platform STREQUAL GAP9 AND NOT platform STREQUAL GAP9_w_NE16)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake)
endif()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake)
Expand Down Expand Up @@ -231,7 +231,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor

endif()

if(platform STREQUAL GAP9)
if(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
project(${TESTNAME} LANGUAGES C ASM)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_gvsoc.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_board.cmake)
Expand Down
42 changes: 33 additions & 9 deletions Deeploy/Targets/GAP9/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma
from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
from Deeploy.Targets.GAP9.Templates import GAP9SDKDequantQuantTemplate, NE16GEMMTemplate
# Import templates from PULPOpen and Generic
from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
FloatReduceSumTemplate, GatherTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
Expand Down Expand Up @@ -57,7 +58,7 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand All @@ -76,7 +77,7 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand Down Expand Up @@ -183,6 +184,26 @@
GAP9Transformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t])
]

GAP9NE16RQSGEMMBindings = [
NodeBinding(
PULPLinearChecker([
PointerClass(type1),
PointerClass(int8_t),
PointerClass(int32_t),
PointerClass(uint8_t),
PointerClass(uint8_t)
], [PointerClass(type2)]), NE16GEMMTemplate.referenceTemplate, GAP9ClusterTransformer)
for type1 in [int8_t, uint8_t]
for type2 in [int8_t, uint8_t]
]

GAP9NE16GEMMInt32Bindings = [
NodeBinding(
GEMMChecker([PointerClass(type1), PointerClass(int8_t),
PointerClass(int32_t)], [PointerClass(int32_t)]), NE16GEMMTemplate.int32OutputTemplate,
GAP9ClusterTransformer) for type1 in [int8_t, uint8_t]
]

GAP9FloatGEMMBindings = [
NodeBinding(
GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
Expand Down Expand Up @@ -386,14 +407,17 @@
]

GAP9QuantBindings = [
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]), QuantTemplate.referenceTemplate,
GAP9Transformer),
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]),
GAP9SDKDequantQuantTemplate.fp32QuantI8Template, GAP9Transformer),
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(uint8_t)]),
GAP9SDKDequantQuantTemplate.fp32QuantU8Template, GAP9Transformer),
]

GAP9DequantBindings = [
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
GAP9Transformer),
] + [
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]),
GAP9SDKDequantQuantTemplate.fp32DequantI8Template, GAP9Transformer),
NodeBinding(DequantChecker([PointerClass(uint8_t)], [PointerClass(float32_t)]),
GAP9SDKDequantQuantTemplate.fp32DequantU8Template, GAP9Transformer),
NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
GAP9Transformer),
]
9 changes: 2 additions & 7 deletions Deeploy/Targets/GAP9/DMA/L3Dma.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from typing import Dict, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
PerTensorWaitingStrategy
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy


class GAP9L3DmaFuture(Future):
Expand All @@ -29,7 +28,7 @@ class GAP9L3Dma(AsyncDma):
_transferTemplates = {
2:
NodeTemplate(
"pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
"pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, (void *)${loc}, (uint32_t)${transfer_size}, (uint32_t)${stride}, (uint32_t)${length}, ${ext2loc}, &${future});"
)
}
_waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
Expand Down Expand Up @@ -58,7 +57,3 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
"stride": strideExt[0],
})
return operatorRepresentation


# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())
37 changes: 37 additions & 0 deletions Deeploy/Targets/GAP9/Parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

from typing import Tuple

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import NetworkContext
from Deeploy.Targets.Generic.Parsers import GEMMParser, RQSParserInterface


class NE16GEMMParser(GEMMParser, RQSParserInterface):
    """GEMM parser variant for NE16 RequantizedGemm nodes.

    A node is accepted only when both parent parsers accept it, it carries a
    ``shift`` attribute and it has exactly five inputs. The inputs are bound,
    in order, to ``A``, ``B``, ``C``, ``mul`` and ``scale_n``.
    """

    def __init__(self):
        super().__init__(noBiasHoisting = True)

    def parseNode(self, node: gs.Node) -> bool:
        """Check the node's structure and record its ``shift`` attribute.

        Args:
            node: Graph node to inspect.

        Returns:
            ``True`` when every check passes, ``False`` otherwise. On success
            ``operatorRepresentation['shift']`` holds the attribute as ``int``.
        """
        # Every condition is evaluated up front: both parent parsers are
        # always invoked, whatever the outcome of the other checks.
        rqsAccepted = RQSParserInterface.parseNode(self, node)
        gemmAccepted = GEMMParser.parseNode(self, node)
        hasShift = 'shift' in node.attrs
        hasFiveInputs = len(node.inputs) == 5

        if not (rqsAccepted and gemmAccepted and hasShift and hasFiveInputs):
            return False

        self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)
        return True

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Bind the node's input tensors to their operator-representation keys.

        Args:
            ctxt: Network context handed to the GEMM parser.
            node: Graph node whose inputs are bound.
            channels_first: Forwarded unchanged to ``GEMMParser.parseNodeCtxt``.

        Returns:
            ``(newCtxt, True)`` with the context returned by the GEMM parser
            on success, or ``(ctxt, False)`` with the original context when
            the GEMM parser rejects the node.
        """
        updatedCtxt, gemmOk = GEMMParser.parseNodeCtxt(self, ctxt, node, channels_first)
        if not gemmOk:
            return ctxt, False

        # Positional mapping: the i-th node input is stored under the i-th key.
        tensorKeys = ('A', 'B', 'C', 'mul', 'scale_n')
        for position, tensor in enumerate(node.inputs):
            self.operatorRepresentation[tensorKeys[position]] = updatedCtxt.lookup(tensor.name).name
        return updatedCtxt, True
Loading
Loading