Skip to content

Commit cc03ebd

Browse files
committed
small changes
1 parent 72cd3bd commit cc03ebd

1 file changed

Lines changed: 10 additions & 9 deletions

File tree

20241129_viz.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,14 @@ will allow us to see the generated code, and NOOPT tells tinygrad not to enable
2020
see the generated metal/cuda code:
2121

2222
```c++
23-
kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
24-
float acc0 = 0.0f;
25-
for (int ridx0 = 0; ridx0 < 4; ridx0++) {
26-
float val0 = *(data1+ridx0);
27-
acc0 = (acc0+val0);
23+
kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
24+
float acc0[1];
25+
*(acc0+0) = 0.0f;
26+
for (int ridx1000 = 0; ridx1000 < 4; ridx1000++) {
27+
float val0 = (*(data1_4+ridx1000));
28+
*(acc0+0) = ((*(acc0+0))+val0);
2829
}
29-
*(data0+0) = acc0;
30+
*(data0_1+0) = (*(acc0+0));
3031
}
3132
```
3233
@@ -35,9 +36,9 @@ of vectorized data type, and remove the loop. In fact, that's what the optimizat
3536
on this time `DEBUG=5 python script.py`:
3637
3738
```c++
38-
kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
39-
float4 val0 = *((device float4*)((data1+0)));
40-
*(data0+0) = (val0.w+val0.z+val0.x+val0.y);
39+
kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
40+
float4 val0 = (*((device float4*)((data1_4+0))));
41+
*(data0_1+0) = (val0.x+val0.y+val0.z+val0.w);
4142
}
4243
```
4344

0 commit comments

Comments
 (0)