carrot/tinygrad_repo/test/external/external_multi_gpu.py

71 lines
2.4 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored
from tinygrad.helpers import Timing
from tinygrad.runtime.ops_gpu import CL
# TODO: support multidevice in cuda
device = 'gpu'
if __name__ == "__main__":
sz = 1024*1024*256 # 1 GB
#sz = 1024*64
with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
c0 = Tensor.ones(sz, device="cpu").realize()
c1 = (Tensor.ones(sz, device="cpu")/2).realize()
with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
a0 = c0.to(f'{device}:0').realize()
CL.synchronize()
with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
b1 = c1.to(f'{device}:1').realize()
CL.synchronize()
# cross copy. this is going through the CPU
with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
a1 = a0.to(f'{device}:1').realize()
CL.synchronize()
with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
b0 = b1.to(f'{device}:0').realize()
CL.synchronize()
# sum
with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
ab0 = (a0 + b0).realize()
CL.synchronize()
with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
ab1 = (a1 + b1).realize()
CL.synchronize()
# cross device sum (does this work?)
# is this making a copy first? is that copy through the CPU?
# the slowness comes from the *blocking* clprg call, is this pyopencl?
with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
abx0 = (a0 + b1).realize()
CL.synchronize()
with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
abx1 = (b1 + a0).realize()
CL.synchronize()
# copy back
# NOTE: half of this slowness is caused by allocating memory on the CPU
with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
cc0 = ab0.numpy()
with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
cc1 = ab1.numpy()
# same
print("testing")
np.testing.assert_allclose(cc0, cc1)
# devices
print(ab0)
print(ab1)
print(abx0)
print(abx1)