carrot efee1712aa
KerryGoldModel, AGNOS12.3, ButtonMode3, autoDetectLFA2, (#181)
* fix.. speed_limit error...

* draw tpms settings.

* fix.. traffic light stopping only..

* fix.. waze cam

* fix.. waze...

* add setting (Enable comma connect )

* auto detect LFA2

* fix.. cruisespeed1

* vff2 driving model.

* fix..

* agnos 12.3

* fix..

* ff

* ff

* test

* ff

* fix.. drawTurnInfo..

* Update drive_helpers.py

* fix..

support English voice

eng sounds

fix settings... English

fix.. mph..

fix.. roadlimit speed bug..

* new vff model.. 250608

* fix soundd..

* fix safe exit speed..

* fix.. sounds.

* fix.. radar timeStep..

* KerryGold model

* Update drive_helpers.py

* fix.. model.

* fix..

* fix..

* Revert "fix.."

This reverts commit b09ec459afb855c533d47fd7e8a1a6b1a09466e7.

* Revert "fix.."

This reverts commit 290bec6b83a4554ca232d531a911edccf94a2156.

* fix esim

* add more acc table. 10kph

* kg update..

* fix cruisebutton mode3

* test atc..cond.

* fix.. canfd

* fix.. angle control limit
2025-06-13 15:59:36 +09:00

79 lines
3.9 KiB
Python

import pathlib
from tinygrad.device import Device
from tinygrad.runtime.ops_amd import AMDProgram, HIPCompiler
import time
import os
# Launch geometry used by launchBenchmark: the grid is NUM_WORKGROUPS wide and
# each workgroup holds WAVE_SIZE * NUM_WAVES threads.
NUM_WORKGROUPS = 96
NUM_WAVES = 2
WAVE_SIZE = 32

# Number of copies of the benchmarked instruction pasted into the kernel source.
INSTRUCTIONS_PER_LOOP = 1000
# Multiplier applied when totalling the work of one launch.
# NOTE(review): presumably the loop trip count inside template.s -- confirm they match.
INTERNAL_LOOP = 10**6

# Operations credited to one 16x16x16 matrix instruction.
# NOTE(review): the factor 2 presumably counts the multiply and the add separately.
FLOPS_PER_MATMUL = 2 * 16**3
# Kernel source skeleton, loaded once from the "template.s" file that sits next to
# this script; launchBenchmark substitutes its "INSTRUCTION" placeholder.
assemblyTemplate = pathlib.Path(__file__).with_name("template.s").read_text()
def launchBenchmark(instruction, vgprIndices, dense = True):
    """Build, run and time a kernel that repeats a single matrix instruction.

    The instruction line is repeated INSTRUCTIONS_PER_LOOP times, substituted for
    the "INSTRUCTION" placeholder in the assembly template, compiled with the
    module-level COMPILER and launched once on the module-level DEV. The measured
    throughput is printed; nothing is returned.

    Args:
        instruction: Mnemonic placed at the start of every generated line; also
            used as the label of the printed result.
        vgprIndices: Register indices used to fill in the operand ranges.
            Dense form reads entries 0..2: entry 0 is the upper bound of the
            first operand range ``v[0:x]``, entries 1 and 2 bound the range that
            is used for BOTH following operands. Sparse form reads entries 0..5:
            entries 1-2 and 3-4 bound the two following operand ranges and
            entry 5 names a single register given as the final operand.
        dense: Selects the dense form (final operand is the literal ``1``) when
            true, otherwise the sparse form described above.
    """
    if dense:
        line_format = "{} v[0:{}], v[{}:{}], v[{}:{}], 1\n"
        operands = (vgprIndices[0],
                    vgprIndices[1], vgprIndices[2],
                    vgprIndices[1], vgprIndices[2])
    else:
        line_format = "{} v[0:{}], v[{}:{}], v[{}:{}], v{}\n"
        operands = (vgprIndices[0],
                    vgprIndices[1], vgprIndices[2],
                    vgprIndices[3], vgprIndices[4],
                    vgprIndices[5])
    unrolled = line_format.format(instruction, *operands) * INSTRUCTIONS_PER_LOOP

    binary = COMPILER.compile(assemblyTemplate.replace("INSTRUCTION", unrolled))
    kernel = AMDProgram(DEV, "matmul", binary)

    # Wall-clock timing is used on purpose: the time returned by the call itself
    # was observed to be very small after the first kernel execution.
    t_begin = time.perf_counter()
    kernel(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True)
    elapsed = time.perf_counter() - t_begin

    # Total work = per-instruction FLOPs x waves x workgroups x loop count x unroll factor.
    total_flops = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
    tera_per_second = round(total_flops/elapsed/10**12, 2)
    print("{:<29} : {} T(FL)OPS".format(instruction, tera_per_second))
if __name__=="__main__":
    # Script entry point: open the AMD device selected by the DEVICENUM environment
    # variable (default "0"), then run the instruction list matching its architecture.
    # DEV, COMPILER, NUM_WORKGROUPS and FLOPS_PER_MATMUL are module-level names that
    # launchBenchmark reads, so they must be (re)bound before the calls below.
    DEVICENUM = os.getenv("DEVICENUM", "0")
    try:
        DEV = Device['AMD:' + DEVICENUM]
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
        # and chain the original error so the real cause stays in the traceback.
        raise RuntimeError("Error while initiating AMD device") from err

    if (ARCH := DEV.arch) not in ['gfx1100', 'gfx1201']:
        raise RuntimeError("only gfx1100 and gfx1201 supported")

    COMPILER = HIPCompiler(ARCH)
    if ARCH == 'gfx1100':
        launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
        launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
        launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
        launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
        launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
        launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
    elif ARCH == 'gfx1201':
        # gfx1201 runs use a smaller grid than the default.
        # NOTE(review): presumably chosen to match this part's hardware size -- confirm.
        NUM_WORKGROUPS = 64
        # Dense 16x16x16 instructions (FLOPS_PER_MATMUL still at its 16x16x16 default).
        launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
        launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
        launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,11))
        launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,11))
        launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,8))
        launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,9))
        launchBenchmark("v_wmma_f32_16x16x16_fp8_fp8", (7,8,9))
        launchBenchmark("v_wmma_f32_16x16x16_fp8_bf8", (7,8,9))
        launchBenchmark("v_wmma_f32_16x16x16_bf8_fp8", (7,8,9))
        launchBenchmark("v_wmma_f32_16x16x16_bf8_bf8", (7,8,9))
        # 16x16x32 instructions: credit twice the operations per instruction.
        FLOPS_PER_MATMUL = 16*16*32*2
        # NOTE(review): this mnemonic is spelled with upper-case "X", unlike every
        # other entry -- confirm the assembler accepts it as written.
        launchBenchmark("v_wmma_i32_16X16X32_iu4", (7,8,9))
        # Sparse (swmmac) variants use the six-index operand form (dense=False).
        launchBenchmark("v_swmmac_f32_16x16x32_f16", (7,8,11,12,19,20), False)
        launchBenchmark("v_swmmac_f32_16x16x32_bf16", (7,8,11,12,19,20), False)
        launchBenchmark("v_swmmac_f16_16x16x32_f16", (3,4,7,8,15,16), False)
        launchBenchmark("v_swmmac_bf16_16x16x32_bf16", (3,4,7,8,15,16), False)
        launchBenchmark("v_swmmac_i32_16x16x32_iu8", (7,8,9,10,13,14), False)
        launchBenchmark("v_swmmac_i32_16x16x32_iu4", (7,8,8,9,10,11), False)
        launchBenchmark("v_swmmac_f32_16x16x32_fp8_fp8", (7,8,9,10,13,14), False)
        launchBenchmark("v_swmmac_f32_16x16x32_fp8_bf8", (7,8,9,10,13,14), False)
        launchBenchmark("v_swmmac_f32_16x16x32_bf8_fp8", (7,8,9,10,13,14), False)
        launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
        # 16x16x64 instruction: operations per instruction double again.
        FLOPS_PER_MATMUL = 16*16*64*2
        launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)