carrot/tinygrad_repo/test/external/external_multi_gpu.py

#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored
from tinygrad.helpers import Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  # Two-GPU transfer/sum benchmark: build two tensors on the CPU, push one to each GPU,
  # cross-copy them, add them on one device and across devices, then pull the sums back.
  sz = 1024*1024*256  # elements per tensor; sz*4 bytes is 1 GB
  #sz = 1024*64
  nbytes = sz*4
  dev0, dev1 = f'{device}:0', f'{device}:1'

  def rate(total_bytes):
    # on_exit callback factory: formats total_bytes divided by whatever value Timing passes in.
    # NOTE(review): the "GB/sec" label presumes that value is elapsed nanoseconds -- confirm.
    return lambda elapsed: f", {total_bytes/elapsed:.2f} GB/sec"

  # both source tensors are created inside one timed region, hence twice the byte count
  with Timing("CPU creation: ", on_exit=rate(nbytes*2)):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  # host -> device uploads; CL.synchronize() is called inside each timed region
  # (presumably so the measurement includes the queued GPU work -- confirm)
  with Timing("CPU -> 0: ", on_exit=rate(nbytes)):
    a0 = c0.to(dev0).realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=rate(nbytes)):
    b1 = c1.to(dev1).realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=rate(nbytes)):
    a1 = a0.to(dev1).realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=rate(nbytes)):
    b0 = b1.to(dev0).realize()
    CL.synchronize()

  # sum, with both operands taken from the same device's copies
  with Timing("0+0 -> 0 (sum): ", on_exit=rate(nbytes)):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=rate(nbytes)):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=rate(nbytes)):
    abx0 = (a0 + b1).realize()
    CL.synchronize()

  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=rate(nbytes)):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=rate(nbytes)):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=rate(nbytes)):
    cc1 = ab1.numpy()

  # the two same-device sums must agree once they are back on the host
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # print all four result tensors (the original comment labels this "devices")
  for result in (ab0, ab1, abx0, abx1):
    print(result)
openpilot v0.9.7 release date: 2024-03-17T10:14:38 master commit: 7e9a909e0e57ecb31df4c87c5b9a06b1204fd034 2024-03-18 06:57:41 -07:00			`#!/usr/bin/env python3`
			`# cd disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git`
			`# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py`
			`import numpy as np`
			`from tinygrad.tensor import Tensor`
			`from tinygrad.helpers import colored`
			`from tinygrad.helpers import Timing`
			`from tinygrad.runtime.ops_gpu import CL`

			`# TODO: support multidevice in cuda`
			`device = 'gpu'`

			`if __name__ == "__main__":`
			`sz = 1024*1024*256 # 1 GB`
			`#sz = 1024*64`

			`with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):`
			`c0 = Tensor.ones(sz, device="cpu").realize()`
			`c1 = (Tensor.ones(sz, device="cpu")/2).realize()`

			`with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`a0 = c0.to(f'{device}:0').realize()`
			`CL.synchronize()`
			`with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`b1 = c1.to(f'{device}:1').realize()`
			`CL.synchronize()`

			`# cross copy. this is going through the CPU`
			`with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`a1 = a0.to(f'{device}:1').realize()`
			`CL.synchronize()`
			`with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`b0 = b1.to(f'{device}:0').realize()`
			`CL.synchronize()`

			`# sum`
			`with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`ab0 = (a0 + b0).realize()`
			`CL.synchronize()`
			`with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`ab1 = (a1 + b1).realize()`
			`CL.synchronize()`

			`# cross device sum (does this work?)`
			`# is this making a copy first? is that copy through the CPU?`
			`# the slowness comes from the *blocking* clprg call, is this pyopencl?`
			`with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`abx0 = (a0 + b1).realize()`
			`CL.synchronize()`

			`with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`abx1 = (b1 + a0).realize()`
			`CL.synchronize()`

			`# copy back`
			`# NOTE: half of this slowness is caused by allocating memory on the CPU`
			`with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`cc0 = ab0.numpy()`
			`with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):`
			`cc1 = ab1.numpy()`

			`# same`
			`print("testing")`
			`np.testing.assert_allclose(cc0, cc1)`

			`# devices`
			`print(ab0)`
			`print(ab1)`
			`print(abx0)`
			`print(abx1)`