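# round-trip send/recv test for tinygrad's experimental distributed support
# (extra.dist): rank 0 sends a random tensor to rank 1, rank 1 echoes it back,
# and rank 0 checks the round trip, with both transfers wrapped in TinyJit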
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

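# the remaining imports run only after dist.preinit() (assumption: preinit must
# happen before anything touches a GPU runtime, so each spawned worker can
# still bind its own device)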
from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

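# TinyJit captures the GPU kernels of an early call and replays them on later
# calls; .realize() forces execution inside the jitted function. cache_id is
# forwarded to world.send/world.recv (assumption: it keys cached transfer
# state so distinct transfers don't collide)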
@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

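# use a tiny tensor in CI so the test stays fast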
SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

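    # world.recv fills the preallocated tensor it is handed, so the receiving
    # rank allocates an empty buffer first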
    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

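    # the echo leg uses a different cache_id ("test2") so its cached transfer
    # state stays separate from the first leg's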
    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

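# parent process: pick one device per rank (in CI both ranks share gpu:0 since
# only a single GPU is available)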
if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

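  # assumption: init_oob brings up the out-of-band (CPU-side) channel the
  # ranks use to coordinate before any GPU traffic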
  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)