carrot/tinygrad_repo/examples/torch_cuda_kernel.py

#!POPCORN leaderboard grayscale
#!POPCORN gpu A100
# not a stable API, but works

import torch, functools
from tinygrad import Tensor, TinyJit, Device
from tinygrad.engine.realize import CompiledRunner
from tinygrad.helpers import get_single_element, Context, OSX
from tinygrad.dtype import _from_torch_dtype

@TinyJit
def f(tg_out, tg_data):
  """Write the weighted sum of the first three channels of `tg_data` into `tg_out`.

  `tg_data` is indexed as [:, :, c] for c in 0..2, so it must have at least three
  entries along its last axis. The result of the in-place `assign(...).realize()`
  on `tg_out` is returned.
  """
  # Channel views along the last axis (R, G, B order presumably — confirm with caller).
  chan0 = tg_data[:, :, 0]
  chan1 = tg_data[:, :, 1]
  chan2 = tg_data[:, :, 2]
  # Grayscale weights; same left-to-right summation order as a single expression.
  gray = chan0 * 0.2989 + chan1 * 0.5870 + chan2 * 0.1140
  return tg_out.assign(gray).realize()

def custom_kernel(data: torch.Tensor, device="CUDA") -> torch.Tensor:
  """Compute a grayscale image from `data` by running the jitted kernel `f` on torch memory.

  Both the input and a freshly allocated torch output are handed to tinygrad via
  `Tensor.from_blob` using their raw data pointers.

  Args:
    data: float32 torch tensor indexed as [row, col, channel] with at least three
      channels; it lives on the torch device matching `device`.
    device: tinygrad device name used for the wrapped tensors and the final
      synchronize (e.g. "CUDA" or "METAL").

  Returns:
    A torch tensor of shape (data.shape[0], data.shape[1]) with the dtype and
    device of `data`, holding the weighted channel sum.

  Raises:
    AssertionError: if `data` is not float32.
  """
  assert data.dtype == torch.float32

  # from_blob only receives a base pointer and a shape (no strides), so the memory
  # must be densely packed in row-major order. contiguous() returns the tensor
  # itself when that already holds and otherwise makes a packed copy, which stays
  # alive through the local name until the kernel has finished.
  data = data.contiguous()
  tg_data = Tensor.from_blob(data.data_ptr(), data.shape, dtype=_from_torch_dtype(data.dtype), device=device)

  out = torch.empty((data.shape[0], data.shape[1]), dtype=data.dtype, device=data.device)
  tg_out = Tensor.from_blob(out.data_ptr(), out.shape, dtype=_from_torch_dtype(out.dtype), device=device)

  # Need to sync torch to make sure the data is valid before tinygrad reads it.
  if data.device.type == "mps": torch.mps.synchronize()
  else: torch.cuda.synchronize()

  with Context(BEAM=2): f(tg_out, tg_data)

  # Wait for the computation to finish so the contents of `out` are valid.
  Device[device].synchronize()

  return out

if __name__ == "__main__":
  # Smoke test: run the kernel on random input and compare against plain torch.
  # Pick the torch device / tinygrad device pair once; OSX does not change between runs.
  torch_device_name, tinygrad_device = ("mps", "METAL") if OSX else ("cuda", "CUDA")

  # Several runs, presumably so the TinyJit-wrapped kernel is exercised beyond
  # its first call — confirm against tinygrad's JIT behavior.
  for i in range(3):
    inp = torch.rand(16, 16, 3, device=torch.device(torch_device_name))
    out = custom_kernel(inp, device=tinygrad_device)

    # Reference result computed with the same weights directly in torch.
    expected = inp[:, :, 0] * 0.2989 + inp[:, :, 1] * 0.5870 + inp[:, :, 2] * 0.1140
    assert torch.allclose(out, expected)
update 250418 2025-04-18 20:38:55 +09:00			`#!POPCORN leaderboard grayscale`
			`#!POPCORN gpu A100`
			`# not a stable API, but works`

			`import torch, functools`
			`from tinygrad import Tensor, TinyJit, Device`
			`from tinygrad.engine.realize import CompiledRunner`
			`from tinygrad.helpers import get_single_element, Context, OSX`
			`from tinygrad.dtype import _from_torch_dtype`

			`@TinyJit`
			`def f(tg_out, tg_data): return tg_out.assign(tg_data[:, :, 0] * 0.2989 + tg_data[:, :, 1] * 0.5870 + tg_data[:, :, 2] * 0.1140).realize()`

			`def custom_kernel(data: torch.Tensor, device="CUDA") -> torch.Tensor:`
			`assert data.dtype == torch.float32`
			`tg_data = Tensor.from_blob(data.data_ptr(), data.shape, dtype=_from_torch_dtype(data.dtype), device=device)`

			`out = torch.empty((data.shape[0], data.shape[1]), dtype=data.dtype, device=data.device)`
			`tg_out = Tensor.from_blob(out.data_ptr(), out.shape, dtype=_from_torch_dtype(out.dtype), device=device)`

			`# Need to sync torch to make sure the data is valid.`
			`if data.device.type == "mps": torch.mps.synchronize()`
			`else: torch.cuda.synchronize()`

			`with Context(BEAM=2): f(tg_out, tg_data)`

			`# Wait for computation to finish and the data is valid.`
			`Device[device].synchronize()`

			`return out`

			`if __name__ == "__main__":`
			`for i in range(3):`
			`if OSX:`
			`out = custom_kernel(inp:=torch.rand(16, 16, 3, device=torch.device("mps")), device="METAL")`
			`else:`
			`out = custom_kernel(inp:=torch.rand(16, 16, 3, device=torch.device("cuda")), device="CUDA")`
			`assert torch.allclose(out, inp[:, :, 0] * 0.2989 + inp[:, :, 1] * 0.5870 + inp[:, :, 2] * 0.1140)`