@@ -20,13 +20,14 @@ will allow us to see the generated code, and NOOPT tells tinygrad not to enable
20 20 see the generated metal/cuda code:
2121
2222``` c++
23- kernel void r_4 (device float* data0, device float* data1, uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
24- float acc0 = 0.0f;
25- for (int ridx0 = 0; ridx0 < 4; ridx0++) {
26- float val0 = * (data1+ridx0);
27- acc0 = (acc0+val0);
23+ kernel void r_4 (device float* data0_1, device float* data1_4, uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
24+ float acc0[ 1] ;
25+ * (acc0+0) = 0.0f;
26+ for (int ridx1000 = 0; ridx1000 < 4; ridx1000++) {
27+ float val0 = (* (data1_4+ridx1000));
28+ * (acc0+0) = ((* (acc0+0))+val0);
2829 }
29- * (data0 +0) = acc0;
30+ * (data0_1 +0) = ( * ( acc0+0)) ;
3031}
3132```
3233
@@ -35,9 +36,9 @@ of vectorized data type, and remove the loop. In fact, that's what the optimizat
35 36 on this time `DEBUG=5 python script.py`:
3637
3738```c++
38- kernel void r_4(device float* data0 , device float* data1 , uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
39- float4 val0 = *((device float4*)((data1+0 )));
40- *(data0 +0) = (val0.w +val0.z +val0.x +val0.y );
39+ kernel void r_4(device float* data0_1 , device float* data1_4 , uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
40+ float4 val0 = ( *((device float4*)((data1_4+0) )));
41+ *(data0_1 +0) = (val0.x +val0.y +val0.z +val0.w );
4142}
4243```
4344
0 commit comments