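small changes: refresh the generated Metal kernel listings in 20241129_viz.md (suffixed buffer names, array-style accumulator, renamed loop index, x/y/z/w sum order)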

Bchass · Bchass · commit cc03ebd784d8 · 2025-11-13T10:22:01.000-05:00
diff --git a/20241129_viz.md b/20241129_viz.md
@@ -20,13 +20,14 @@ will allow us to see the generated code, and NOOPT tells tinygrad not to enable
 see the generated metal/cuda code:
 
 ```c++
-kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
-  float acc0 = 0.0f;
-  for (int ridx0 = 0; ridx0 < 4; ridx0++) {
-    float val0 = *(data1+ridx0);
-    acc0 = (acc0+val0);
+kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
+  float acc0[1];
+  *(acc0+0) = 0.0f;
+  for (int ridx1000 = 0; ridx1000 < 4; ridx1000++) {
+    float val0 = (*(data1_4+ridx1000));
+    *(acc0+0) = ((*(acc0+0))+val0);
   }
-  *(data0+0) = acc0;
+  *(data0_1+0) = (*(acc0+0));
 }
 ```
 
@@ -35,9 +36,9 @@ of vectorized data type, and remove the loop. In fact, that's what the optimizat
 on this time `DEBUG=5 python script.py`:
 
 ```c++
-kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
-  float4 val0 = *((device float4*)((data1+0)));
-  *(data0+0) = (val0.w+val0.z+val0.x+val0.y);
+kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {
+  float4 val0 = (*((device float4*)((data1_4+0))));
+  *(data0_1+0) = (val0.x+val0.y+val0.z+val0.w);
 }
 ```