carrot/tinygrad_repo/extra/fastvits/fastvits_speed.py
FrogAi 659adb6457 openpilot v0.9.7 release
date: 2024-03-17T10:14:38
master commit: 7e9a909e0e57ecb31df4c87c5b9a06b1204fd034
2024-05-24 17:43:27 -07:00

54 lines
2.5 KiB
Python

"""
61: op Conv shape [(1, 256, 32, 64), (64, 256, 1, 1), (64,)] opt {'dilations': (1, 1), 'group': 1, 'kernel_shape': (1, 1), 'pads': (0, 0, 0, 0), 'strides': (1, 1)}
62: op Mul shape [(1, 64, 32, 64), (64, 1, 1)] opt {}
63: op Add shape [(1, 64, 32, 64), (1, 64, 32, 64)] opt {}
64: op Conv shape [(1, 64, 32, 64), (64, 1, 3, 3), (64,)] opt {'dilations': (1, 1), 'group': 64, 'kernel_shape': (3, 3), 'pads': (1, 1, 1, 1), 'strides': (1, 1)}
65: op Conv shape [(1, 64, 32, 64), (64, 1, 7, 7), (64,)] opt {'dilations': (1, 1), 'group': 64, 'kernel_shape': (7, 7), 'pads': (3, 3, 3, 3), 'strides': (1, 1)}
66: op Conv shape [(1, 64, 32, 64), (256, 64, 1, 1), (256,)] opt {'dilations': (1, 1), 'group': 1, 'kernel_shape': (1, 1), 'pads': (0, 0, 0, 0), 'strides': (1, 1)}
"""
import pathlib
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
from tinygrad.realize import run_schedule
from tinygrad.helpers import partition, GlobalCounters, Context, getenv, prod, dtypes
from tinygrad.runtime.ops_gpu import CLBuffer, CLProgram
from tinygrad.ops import LoadOps, ReduceOps
def single_kernel():
    """Time the hand-written ``conv1_reorder.cl`` kernel, print results, and exit.

    Allocates three ``CLBuffer`` objects with ``dtypes.imageh`` dtypes (output,
    input and weights), builds the ``r_32_16_16_64_4_4_4`` program from the
    ``conv1_reorder.cl`` file that sits next to this script, launches it five
    times and prints the list of timings followed by a throughput figure
    computed from the fastest launch.

    This function never returns: it raises ``SystemExit(0)`` after printing.
    """
    # Shapes for the output, input and weight buffers, in that order.
    # NOTE(review): presumably (height, width, 4-channel) image layouts -- confirm.
    sz1, sz2, sz3 = (32, 1024, 4), (32, 4096, 4), (16, 256, 4)
    out = CLBuffer(prod(sz1), dtypes.imageh(sz1))
    x = CLBuffer(prod(sz2), dtypes.imageh(sz2))
    w = CLBuffer(prod(sz3), dtypes.imageh(sz3))

    # read_text() opens and closes the file itself, so no handle is left
    # dangling the way a bare open(...).read() would leave one.
    kernel_src = (pathlib.Path(__file__).parent / "conv1_reorder.cl").read_text()
    old = CLProgram("r_32_16_16_64_4_4_4", kernel_src)

    # NOTE(review): assumes the two lists are the global and local work sizes
    # and that wait=True makes the call return elapsed seconds (so *1e6 yields
    # microseconds) -- confirm against CLProgram.
    old_tms = [old([1, 1, 32], [16, 16, 1], out, x, w, wait=True) * 1e6 for _ in range(5)]

    # NOTE(review): 67.107 looks like the MFLOP count of the first 1x1 conv
    # (2*256*64*32*64 ~= 67.1e6), which would make the second printed value
    # GFLOPS for the best run -- confirm.
    print(old_tms, 67.107 / min(old_tms) * 1e3)

    # Raise SystemExit directly instead of calling the site-provided exit()
    # helper, which is intended for the interactive shell and may be absent.
    raise SystemExit(0)
# CONV=0 PYTHONPATH="." LATEDEBUG=5 OPT=99 IMAGE=2 FLOAT16=1 NOLOCALS=1 python3 extra/fastvits/fastvits_speed.py
if __name__ == "__main__":
    # Uncomment to time the hand-written kernel instead of the scheduled ones.
    # single_kernel()

    # This is stage 1 in fastvits. The list literal builds the layers top to
    # bottom, i.e. in the same order they are applied below.
    stage1_layers = [
        Conv2d(256, 64, (1, 1), bias=False),
        Conv2d(64, 64, (3, 3), groups=64, padding=1, bias=False),
        Conv2d(64, 64, (7, 7), groups=64, padding=3, bias=False),
        Conv2d(64, 256, (1, 1), bias=False),
        Conv2d(256, 64, (1, 1), bias=False),
    ]

    # TODO: the elementwise ops shouldn't rerun with normal realize
    inp = Tensor.randn(1, 256, 32, 64)
    result = inp.sequential(stage1_layers)

    def has_reduce(item):
        """Return True for a schedule item that is not a load op and contains a reduce op."""
        if item.ast.op in LoadOps:
            return False
        return any(node.op in ReduceOps for node in item.ast.get_lazyops())

    # NOTE(review): assumes partition() returns (items matching the predicate,
    # remaining items) in that order -- confirm against tinygrad.helpers.
    kernels, prereqs = partition(result.lazydata.schedule(), has_reduce)

    # Run everything that is not under test first, then the reduce kernels
    # that precede the one selected by the CONV environment variable.
    run_schedule(prereqs)
    conv_idx = getenv("CONV")
    run_schedule(kernels[:conv_idx])
    print("*** init done ***")

    # Reset the counters so only the selected kernel is accounted for, then
    # run just that one kernel with the LATEDEBUG / LATEBEAM settings applied.
    GlobalCounters.reset()
    with Context(DEBUG=getenv("LATEDEBUG", 2), BEAM=getenv("LATEBEAM")):
        run_schedule(kernels[conv_idx:conv_idx + 1])