carrot/tinygrad_repo/test/external/external_multi_gpu.py
carrot 9c7833faf9
KerryGold Model, AGNOS12.4, AdjustLaneChange, EnglishSound (#182)
* Vegetarian Filet o Fish model

* fix.. atc..

* test cluster_speed_limit

* fix.. cluster_speed_limit.. 2

* fix.. clusterspeedlimit3

* cruise speed to roadlimit speed

* fix..

* fix.. eng

* deltaUp/Down for lanechange

* fix.. atc desire...

* fix..

* ff

* ff

* fix..

* fix.. eng

* fix engsound

* Update desire_helper.py

* fix.. connect...

* fix curve_min speed

* Revert "fix curve_min speed"

This reverts commit fcc9c2eb14eb3504abef3e420db93e8882e56f37.

* Reapply "fix curve_min speed"

This reverts commit 2d2bba476c58a7b4e13bac3c3ad0e4694c95515d.

* fix.. auto speed up.. roadlimit

* fix.. atc auto lanechange...

* Update desire_helper.py

* Update cruise.py

* debug atc...

* fix.. waze alert offset..

* fix..

* test atc..

* fix..

* fix.. atc

* atc test..

* fix.. atc

* fix.. atc2

* fix.. atc3

* KerryGold Model.  latsmooth_sec = 0.0

* lat smooth seconds 0.13

* fix comment

* fix.. auto cruise, and speed unit

* change lanemode switching.

* erase mazda lkas button.
2025-06-22 10:51:42 +09:00

80 lines
2.5 KiB
Python

#!/usr/bin/env python3
# cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing, getenv
from tinygrad.device import Device
d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
def sync():
  """Call synchronize() on both benchmark devices, d0 first and then d1."""
  for dev_name in (d0, d1):
    Device[dev_name].synchronize()
if __name__ == "__main__":
  # Benchmark driver: times host->device copies, device<->device copies, same-device and
  # cross-device sums, and device->host copies, then checks all results agree.
  print("GPU devices", d0, d1)
  sz = getenv("N", 1024*1024*256)  # 1 GB
  buf_bytes = sz*4  # the rate figures below count 4 bytes per element

  def rate(nbytes):
    # Build the on_exit callback that appends a throughput figure to a Timing line.
    # NOTE(review): presumably Timing passes elapsed nanoseconds, which makes
    # bytes/elapsed read as GB/sec -- confirm against tinygrad.helpers.Timing.
    return lambda elapsed: f", {nbytes/elapsed:.2f} GB/sec"

  with Timing("GPU initial sync: "):
    sync()

  # two host buffers are created inside one timed region, hence twice the byte count
  with Timing("CPU creation: ", on_exit=rate(buf_bytes*2)):
    c0 = (Tensor.ones(sz, device="CPU")/2).realize()
    c1 = (Tensor.ones(sz, device="CPU")/4).realize()
  for host_tensor in (c0, c1):
    print(host_tensor.uop.base.realized)

  # host -> device uploads; sync() sits inside each timed region
  with Timing("CPU -> 0: ", on_exit=rate(buf_bytes)):
    a0 = c0.to(d0).realize()
    sync()
  with Timing("CPU -> 1: ", on_exit=rate(buf_bytes)):
    b1 = c1.to(d1).realize()
    sync()

  # cross copy. this is (sometimes) going through the CPU
  with Timing("0 -> 1: ", on_exit=rate(buf_bytes)):
    a1 = a0.to(d1).realize()
    sync()
  with Timing("1 -> 0: ", on_exit=rate(buf_bytes)):
    b0 = b1.to(d0).realize()
    sync()

  # sum of two tensors that already live on the same device
  with Timing("0+0 -> 0 (sum): ", on_exit=rate(buf_bytes)):
    ab0 = (a0 + b0).realize()
    sync()
  with Timing("1+1 -> 1 (sum): ", on_exit=rate(buf_bytes)):
    ab1 = (a1 + b1).realize()
    sync()

  # cross device sum (does this work?)
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=rate(buf_bytes)):
    abx0 = (a0 + b1.to(d0)).realize()
    sync()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=rate(buf_bytes)):
    abx1 = (b1 + a0.to(d1)).realize()
    sync()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=rate(buf_bytes)):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=rate(buf_bytes)):
    cc1 = ab1.numpy()

  # the two same-device sums must match each other
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # the cross-device sums must match the same-device result as well
  print("testing (cross)")
  for crossed in (abx0, abx1):
    np.testing.assert_allclose(cc0, crossed.numpy())

  # print the result tensors (the original labels this section "devices")
  for result in (ab0, ab1, abx0, abx1):
    print(result)