@@ -170,23 +170,20 @@ progressively "lowered" into a form that can be used for actual code generation.
170170## Code generation
171171
172172The UOp used for code generation contains much more detail and is of a lower level. Here I have built an example that you can use
173- to play around. If things go out of date, commit id is: ae00fa3b2833dbe0595d54d5fb0b679e1731ae01
173+ to play around. If things go out of date, please refer to commit `tinygrad/tinygrad@2893feb9f6f3c7eed825494e51a9a9e84c6b8a2e`.
174174
175175Suppose we just want to add two numbers:
176176
177177``` python
178178from tinygrad.renderer.cstyle import MetalRenderer
179- from tinygrad.ops import UOp, Ops
180- from tinygrad import dtypes
179+ from tinygrad.uop import Ops
180+ from tinygrad import UOP , dtypes
181181
182182const = UOp(Ops.CONST , dtypes.float, arg = 1.0 )
183183add = UOp(Ops.ADD , dtypes.float, src = (const, const), arg = None )
184184
185185print (add)
186- print (MetalRenderer().render(" example" , [
187- const,
188- add
189- ]))
186+ print (MetalRenderer().render([const, add]))
190187```
191188
192189The `add` variable shows something like:
@@ -203,7 +200,7 @@ let's see the rendered code:
203200``` c++
204201#include < metal_stdlib>
205202using namespace metal ;
206- kernel void example (uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
203+ kernel void test (uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
207204 float alu0 = (1.0f+1.0f);
208205}
209206```
@@ -212,18 +209,15 @@ Let me show you the CUDA version also, where you would replace the import:
212209
213210```python
214211from tinygrad.renderer.cstyle import CUDARenderer
215- from tinygrad.ops import UOp, Ops
216- from tinygrad import dtypes
212+ from tinygrad.uop import Ops
213+ from tinygrad import UOP, dtypes
217214
218215const = UOp(Ops.CONST, dtypes.float, arg=1.0)
219216add = UOp(Ops.ADD, dtypes.float, src=(const, const), arg=None)
220217
221218print(add)
222219
223- print(CUDARenderer("sm_50").render("example", [
224- const,
225- add
226- ]))
220+ print(CUDARenderer("sm_50").render([const, add]))
227221```
228222
229223Note that you have to pass in the "architecture" as an argument; it affects the compiler. This value is set automatically
@@ -232,7 +226,7 @@ by querying `cuDeviceComputeCapability`, for our render purpose, pass in just tw
232226``` c++
233227#define INFINITY (__ int_as_float(0x7f800000))
234228#define NAN (__ int_as_float(0x7fffffff))
235- extern "C" __ global__ void __ launch_bounds__ (1) example () {
229+ extern "C" __ global__ void __ launch_bounds__ (1) test () {
236230 float alu0 = (1.0f+1.0f);
237231}
238232```
@@ -242,15 +236,13 @@ two constants is "folded" before the render stage, so you get the value 2, inste
242236optimization techniques. Let's see another example that renders the thread position:
243237
244238```python
245- MetalRenderer().render("example", [
246- UOp(Ops.SPECIAL, dtypes.int, arg=("gidx0", 16))
247- ])
239+ print(MetalRenderer().render([UOp(Ops.SPECIAL, dtypes.int, arg=("gidx0", 16))]))
248240```
249241
250242```c++
251243#include < metal_stdlib>
252244using namespace metal ;
253- kernel void example (uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
245+ kernel void test (uint3 gid [[ threadgroup_position_in_grid]] , uint3 lid [[ thread_position_in_threadgroup]] ) {
254246 int gidx0 = gid.x; /* 16 */
255247}
256248```
@@ -260,7 +252,7 @@ On CUDA:
260252```c++
261253#define INFINITY (__int_as_float(0x7f800000))
262254#define NAN (__int_as_float(0x7fffffff))
263- extern "C" __global__ void __launch_bounds__(1) example () {
255+ extern "C" __global__ void __launch_bounds__(1) test () {
264256 int gidx0 = blockIdx.x; /* 16 */
265257}
266258```
@@ -270,14 +262,16 @@ also handle the count, so it renders `.x` `.y` automatically if you pass more than one
270262
271263
272264``` python
273- print (CUDARenderer(" sm_50" ).render(" example " , [
265+ print (CUDARenderer(" sm_50" ).render([
274266 UOp(Ops.SPECIAL , dtypes.int, arg = (" gidx0" , 16 )),
275267 UOp(Ops.SPECIAL , dtypes.int, arg = (" gidx1" , 16 ))
276268]))
277269```
278270
279271``` c++
280- extern "C" __ global__ void __ launch_bounds__ (1) example() {
272+ #define INFINITY (__ int_as_float(0x7f800000))
273+ #define NAN (__ int_as_float(0x7fffffff))
274+ extern "C" __ global__ void __ launch_bounds__ (1) test() {
281275 int gidx0 = blockIdx.x; /* 16 */
282276 int gidx1 = blockIdx.y; /* 16 */
283277}
0 commit comments