carrot/tinygrad_repo/test/external/external_test_speed_llama.py
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
from examples.llama import Transformer, MODEL_PARAMS
from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.device import Allocator, Compiled
from tinygrad.engine.realize import method_cache
from tinygrad.helpers import Profiling
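
# FakeProgram and FakeAllocator stand in for the real backend: kernel launches
# become no-ops and no device memory is allocated, so only codegen is exercised.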
class FakeProgram:
  def __init__(self, name:str, prg:bytes): pass
  def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass

class FakeAllocator(Allocator[Compiled]):
  def _alloc(self, sz, options): return None
  def _copyin(self, dest, src:memoryview): pass

class TestLLaMASpeed(unittest.TestCase):
  def test_llama_compile(self):
    backup_program = Device[Device.DEFAULT].runtime
    backup_allocator = Device[Device.DEFAULT].allocator
    backup_compiler = Device[Device.DEFAULT].compiler
    Device[Device.DEFAULT].runtime = FakeProgram
    Device[Device.DEFAULT].allocator = FakeAllocator(Device.default)

    print("testing llama python run time")
    model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
    print("built model")
    # assign fake tensors to the values
    for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
    print("assigned empty tensors, doing warmup")
    def run_llama(st, empty_method_cache=True):
      if empty_method_cache: method_cache.clear()
      tms = [time.perf_counter()]
      for i in range(5):
        model(Tensor([[1,2,3,4]]), i).realize()
        tms.append(time.perf_counter())
      timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
      print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen(0)")
run_llama("codegen(1)")

    # test no compiler use for this
    Device[Device.DEFAULT].compiler = None
    run_llama("methodcache", False)
    with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof", ts=5):
      run_llama("profile", False)
    Device[Device.DEFAULT].runtime = backup_program
    Device[Device.DEFAULT].allocator = backup_allocator
    Device[Device.DEFAULT].compiler = backup_compiler

if __name__ == '__main__':
  TestLLaMASpeed().test_llama_compile()
  #unittest.main()