KerryGold Model, AGNOS12.4, AdjustLaneChange, EnglighSound (#182)

* Vegetarian Filet o Fish model

* fix.. atc..

* test cluster_speed_limit

* fix.. cluster_speed_limit.. 2

* fix.. clusterspeedlimit3

* cruise speed to roadlimit speed

* fix..

* fix.. eng

* deltaUp/Down for lanechange

* fix.. atc desire...

* fix..

* ff

* ff

* fix..

* fix.. eng

* fix engsound

* Update desire_helper.py

* fix.. connect...

* fix curve_min speed

* Revert "fix curve_min speed"

This reverts commit fcc9c2eb14eb3504abef3e420db93e8882e56f37.

* Reapply "fix curve_min speed"

This reverts commit 2d2bba476c58a7b4e13bac3c3ad0e4694c95515d.

* fix.. auto speed up.. roadlimit

* fix.. atc auto lanechange...

* Update desire_helper.py

* Update cruise.py

* debug atc...

* fix.. waze alert offset..

* fix..

* test atc..

* fix..

* fix.. atc

* atc test..

* fix.. atc

* fix.. atc2

* fix.. atc3

* KerryGold Model.  latsmooth_sec = 0.0

* lat smooth seconds 0.13

* fix comment

* fix.. auto cruise, and speed unit

* change lanemode switching.

* erase mazda lkas button.
This commit is contained in:
carrot 2025-06-22 10:51:42 +09:00 committed by GitHub
parent efee1712aa
commit 9c7833faf9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
385 changed files with 12951 additions and 12621 deletions

View File

@ -236,7 +236,6 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
{"HapticFeedbackWhenSpeedCamera", PERSISTENT},
{"UseLaneLineSpeed", PERSISTENT},
{"UseLaneLineCurveSpeed", PERSISTENT},
{"UseLaneLineSpeedApply", PERSISTENT},
{"AdjustLaneOffset", PERSISTENT},
{"LaneChangeNeedTorque", PERSISTENT},
{"LaneChangeDelay", PERSISTENT },
@ -261,6 +260,8 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
{"CustomSteerMax", PERSISTENT},
{"CustomSteerDeltaUp", PERSISTENT},
{"CustomSteerDeltaDown", PERSISTENT},
{"CustomSteerDeltaUpLC", PERSISTENT},
{"CustomSteerDeltaDownLC", PERSISTENT},
{"SpeedFromPCM", PERSISTENT},
{"MaxTimeOffroadMin", PERSISTENT},
{"DisableDM", PERSISTENT},

View File

@ -7,7 +7,7 @@ export OPENBLAS_NUM_THREADS=1
export VECLIB_MAXIMUM_THREADS=1
if [ -z "$AGNOS_VERSION" ]; then
export AGNOS_VERSION="12.3"
export AGNOS_VERSION="12.4"
fi
export STAGING_ROOT="/data/safe_staging"

View File

@ -246,6 +246,7 @@ struct CarState {
speedLimitDistance @65 :Float32;
gearStep @66 :Int16;
tpms @67 : Tpms;
useLaneLineSpeed @68 : Float32;
struct Tpms {
fl @0 :Float32;

View File

@ -96,6 +96,9 @@ class CarController(CarControllerBase):
self.activeCarrot = 0
self.camera_scc_params = Params().get_int("HyundaiCameraSCC")
self.steerDeltaUpOrg = self.steerDeltaUp = self.steerDeltaUpLC = self.params.STEER_DELTA_UP
self.steerDeltaDownOrg = self.steerDeltaDown = self.steerDeltaDownLC = self.params.STEER_DELTA_DOWN
def update(self, CC, CS, now_nanos):
if self.frame % 50 == 0:
@ -104,14 +107,30 @@ class CarController(CarControllerBase):
steerMax = params.get_int("CustomSteerMax")
steerDeltaUp = params.get_int("CustomSteerDeltaUp")
steerDeltaDown = params.get_int("CustomSteerDeltaDown")
steerDeltaUpLC = params.get_int("CustomSteerDeltaUpLC")
steerDeltaDownLC = params.get_int("CustomSteerDeltaDownLC")
if steerMax > 0:
self.params.STEER_MAX = steerMax
if steerDeltaUp > 0:
self.params.STEER_DELTA_UP = steerDeltaUp
self.steerDeltaUp = steerDeltaUp
#self.params.ANGLE_TORQUE_UP_RATE = steerDeltaUp
else:
self.steerDeltaUp = self.steerDeltaUpOrg
if steerDeltaDown > 0:
self.params.STEER_DELTA_DOWN = steerDeltaDown
self.steerDeltaDown = steerDeltaDown
#self.params.ANGLE_TORQUE_DOWN_RATE = steerDeltaDown
else:
self.steerDeltaDown = self.steerDeltaDownOrg
if steerDeltaUpLC > 0:
self.steerDeltaUpLC = steerDeltaUpLC
else:
self.steerDeltaUpLC = self.steerDeltaUp
if steerDeltaDownLC > 0:
self.steerDeltaDownLC = steerDeltaDownLC
else:
self.steerDeltaDownLC = self.steerDeltaDown
self.soft_hold_mode = 1 if params.get_int("AutoCruiseControl") > 1 else 2
self.hapticFeedbackWhenSpeedCamera = int(params.get_int("HapticFeedbackWhenSpeedCamera"))
@ -126,6 +145,13 @@ class CarController(CarControllerBase):
actuators = CC.actuators
hud_control = CC.hudControl
if hud_control.modelDesire in [3,4]:
self.params.STEER_DELTA_UP = self.steerDeltaUpLC
self.params.STEER_DELTA_DOWN = self.steerDeltaDownLC
else:
self.params.STEER_DELTA_UP = self.steerDeltaUp
self.params.STEER_DELTA_DOWN = self.steerDeltaDown
angle_control = self.CP.flags & HyundaiFlags.ANGLE_CONTROL
# steering torque

View File

@ -76,6 +76,7 @@ class CarState(CarStateBase):
self.cruise_buttons_msg = None
self.hda2_lfa_block_msg = None
self.cluster_speed_limit_msg = None
# On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
self.cluster_speed = 0
@ -461,6 +462,9 @@ class CarState(CarStateBase):
if "TCS" in cp.vl:
self.tcs_info_373 = copy.copy(cp.vl.get("TCS", {}))
if "CLUSTER_SPEED_LIMIT" in cp.vl:
self.cluster_speed_limit_msg = copy.copy(cp.vl.get("CLUSTER_SPEED_LIMIT", {}))
if "GEAR" in cp.vl:
ret.gearStep = cp.vl["GEAR"]["GEAR_STEP"]
elif "GEAR_ALT" in cp.vl:
@ -596,6 +600,8 @@ class CarState(CarStateBase):
# 어떤차는 bus2에 있음, 내차는 bus0에 있는데.... 이건 옆두부와 관련이 없나?
#if CP.flags & HyundaiFlags.CANFD_HDA2:
# pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))
if Params().get_int("CanfdDebug") > 0:
pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))
cam_messages = []
if CP.flags & HyundaiFlags.CANFD_HDA2 and not (CP.flags & HyundaiFlags.CAMERA_SCC.value):

View File

@ -598,8 +598,13 @@ def create_ccnc_messages(CP, packer, CAN, frame, CC, CS, hud_control, disp_angle
# ADAS 콤마연결하면.. 0번에서.. (카메라혹은 다른곳에서)
# 카메라 콤마연결+롱컨개조 하면.. 2번에서 데이터가 나옴..(카메라혹은 ADAS)
if frame % 10 == 0:
pass
if CS.cluster_speed_limit_msg is not None:
values = CS.cluster_speed_limit_msg
values["SPEED_LIMIT_1"] = 100
values["SPEED_LIMIT_2"] = 100
values["SPEED_LIMIT_3"] = 105
#values["COUNTER"] = (values["COUNTER"] + 1) % 256
ret.append(packer.make_can_msg("CLUSTER_SPEED_LIMIT", CAN.CAM, values))
return ret

View File

@ -141,7 +141,7 @@ class CarState(CarStateBase):
ret.buttonEvents = [
*create_button_events(self.cruise_buttons, self.prev_cruise_buttons, BUTTONS_DICT),
*create_button_events(self.distance_button, self.prev_distance_button, {1: ButtonType.gapAdjustCruise}),
*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}),
#*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}),
]
return ret

View File

@ -81,7 +81,7 @@ const CanMsg HYUNDAI_CANFD_HDA2_LONG_TX_MSGS[] = {
{203, 0, 24}, // CB
{373, 2, 24}, // TCS(0x175)
//{506, 2, 32}, // CLUSTER_SPEED_LIMIT
{506, 2, 32}, // CLUSTER_SPEED_LIMIT
{234, 2, 24}, // MDPS
{687, 2, 8}, // STEER_TOUCH_2AF
};

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -219,6 +219,7 @@ class Car:
CS.softHoldActive = self.v_cruise_helper._soft_hold_active
CS.activateCruise = self.v_cruise_helper._activate_cruise
CS.latEnabled = self.v_cruise_helper._lat_enabled
CS.useLaneLineSpeed = self.v_cruise_helper.useLaneLineSpeedApply
self.CI.CS.softHoldActive = CS.softHoldActive
return CS, RD

View File

@ -218,7 +218,7 @@ class VCruiseCarrot:
self.AutoSpeedUptoRoadSpeedLimit = 0.0
self.useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
self.params.put_int("UseLaneLineSpeedApply", self.useLaneLineSpeed)
self.useLaneLineSpeedApply = self.useLaneLineSpeed
@property
@ -237,16 +237,19 @@ class VCruiseCarrot:
self._log_timer = self._log_timeout
def update_params(self, is_metric):
unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
if self.frame % 10 == 0:
self.autoCruiseControl = self.params.get_int("AutoCruiseControl")
self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed")
self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed")
self.autoCruiseControl = self.params.get_int("AutoCruiseControl") * unit_factor
self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") * unit_factor
self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") * unit_factor
self.autoSpeedUptoRoadSpeedLimit = self.params.get_float("AutoSpeedUptoRoadSpeedLimit") * 0.01
self.autoRoadSpeedAdjust = self.params.get_float("AutoRoadSpeedAdjust") * 0.01
useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") * unit_factor
if self.useLaneLineSpeed != useLaneLineSpeed:
self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed)
self.useLaneLineSpeedApply = useLaneLineSpeed
self.useLaneLineSpeed = useLaneLineSpeed
self.speed_from_pcm = self.params.get_int("SpeedFromPCM")
self._cruise_speed_unit = self.params.get_int("CruiseSpeedUnit")
self._paddle_mode = self.params.get_int("PaddleMode")
@ -255,7 +258,6 @@ class VCruiseCarrot:
self.autoRoadSpeedLimitOffset = self.params.get_int("AutoRoadSpeedLimitOffset")
self.autoNaviSpeedSafetyFactor = self.params.get_float("AutoNaviSpeedSafetyFactor") * 0.01
self.cruiseOnDist = self.params.get_float("CruiseOnDist") * 0.01
unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
cruiseSpeed1 = self.params.get_float("CruiseSpeed1") * unit_factor
cruiseSpeed2 = self.params.get_float("CruiseSpeed2") * unit_factor
cruiseSpeed3 = self.params.get_float("CruiseSpeed3") * unit_factor
@ -552,7 +554,7 @@ class VCruiseCarrot:
self.params.put_int_nonblocking("MyDrivingMode", self.params.get_int("MyDrivingMode") % 4 + 1) # 1,2,3,4 (1:eco, 2:safe, 3:normal, 4:high speed)
elif button_type == ButtonType.lfaButton:
useLaneLineSpeed = max(1, self.useLaneLineSpeed)
self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed if self.params.get_int("UseLaneLineSpeedApply") == 0 else 0)
self.useLaneLineSpeedApply = useLaneLineSpeed if self.useLaneLineSpeedApply == 0 else 0
elif button_type == ButtonType.cancel:
self._cruise_cancel_state = True
@ -594,15 +596,20 @@ class VCruiseCarrot:
return v_cruise_kph
def _auto_speed_up(self, v_cruise_kph):
if self._pause_auto_speed_up:
return v_cruise_kph
#if self._pause_auto_speed_up:
# return v_cruise_kph
road_limit_kph = self.nRoadLimitSpeed * self.autoSpeedUptoRoadSpeedLimit
if road_limit_kph < 1.0:
return v_cruise_kph
if self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60:
if not self._pause_auto_speed_up and self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60:
v_cruise_kph = min(v_cruise_kph + 5, road_limit_kph)
elif self.autoRoadSpeedAdjust < 0 and self.nRoadLimitSpeed != self.nRoadLimitSpeed_last: # 도로제한속도가 바뀌면, 바뀐속도로 속도를 바꿈.
if self.autoRoadSpeedLimitOffset < 0:
v_cruise_kph = self.nRoadLimitSpeed * self.autoNaviSpeedSafetyFactor
else:
v_cruise_kph = self.nRoadLimitSpeed + self.autoRoadSpeedLimitOffset
elif self.nRoadLimitSpeed < self.nRoadLimitSpeed_last and self.autoRoadSpeedAdjust > 0:
new_road_limit_kph = self.nRoadLimitSpeed * self.autoRoadSpeedAdjust + v_cruise_kph * (1 - self.autoRoadSpeedAdjust)
self._add_log(f"AutoSpeed change {v_cruise_kph} -> {new_road_limit_kph}")
@ -681,11 +688,11 @@ class VCruiseCarrot:
elif self.xState == 3:
v_cruise_kph = self.v_ego_kph_set
self._cruise_control(-1, 3, "Cruise off (traffic sign)")
elif self.v_ego_kph_set >= 30 and not CC.enabled:
elif self.v_ego_kph_set >= self.autoGasTokSpeed and not CC.enabled:
v_cruise_kph = self.v_ego_kph_set
self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (gas pressed)")
elif self._brake_pressed_count == -1 and self._soft_hold_active == 0:
if self.v_ego_kph_set > 40:
if self.v_ego_kph_set > self.autoGasTokSpeed:
v_cruise_kph = self.v_ego_kph_set
self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (speed)")
elif abs(CS.steeringAngleDeg) < 20:

View File

@ -1561,7 +1561,9 @@ class CarrotServ:
xSpdType = 100
if xSpdType >= 0:
self.xSpdLimit = self.nRoadLimitSpeed
offset = 5 if self.is_metric else 5 * CV.MPH_TO_KPH
self.xSpdLimit = self.nRoadLimitSpeed + offset
self.xSpdDist = distance
self.xSpdType =xSpdType
@ -1685,11 +1687,12 @@ class CarrotServ:
if self.turnSpeedControlMode in [1,2]:
speed_n_sources.append((max(abs(vturn_speed), self.autoCurveSpeedLowerLimit), "vturn"))
route_speed = max(route_speed * self.mapTurnSpeedFactor, self.autoCurveSpeedLowerLimit)
if self.turnSpeedControlMode == 2:
if 0 < self.xDistToTurn < 300:
speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route"))
speed_n_sources.append((route_speed, "route"))
elif self.turnSpeedControlMode == 3:
speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route"))
speed_n_sources.append((route_speed, "route"))
#speed_n_sources.append((self.calculate_current_speed(dist, speed * self.mapTurnSpeedFactor, 0, 1.2), "route"))
desired_speed, source = min(speed_n_sources, key=lambda x: x[0])

View File

@ -235,6 +235,32 @@
"default": 0,
"unit": 1
},
{
"group": "조향튜닝",
"name": "CustomSteerDeltaUpLC",
"title": "_CustomSteerDeltaUpLC(0)",
"descr": "차선변경시 적용, 토크조향",
"egroup": "LAT",
"etitle": "_CustomSteerDeltaUpLC(0)",
"edescr": "for LaneChange, torque steer only",
"min": 0,
"max": 50,
"default": 0,
"unit": 1
},
{
"group": "조향튜닝",
"name": "CustomSteerDeltaDownLC",
"title": "_CustomSteerDeltaDownLC(0)",
"descr": "차선변경시 적용, 토크조향",
"egroup": "LAT",
"etitle": "_CustomSteerDeltaDownLC(0)",
"edescr": "for LaneChange, torque steer only",
"min": 0,
"max": 50,
"default": 0,
"unit": 1
},
{
"group": "조향튜닝",
"name": "SteerActuatorDelay",
@ -736,7 +762,7 @@
"descr": "1:SOFTHOLD, Auto Cruise, 2:SoftHold오류시",
"egroup": "START",
"etitle": "Auto Cruise control(HKG only)",
"edescr": "Softhold, Auto Cruise ON/OFF control, 2:if softhold error",
"edescr": "1:Softhold, Auto Cruise ON/OFF control, 2:if softhold error",
"min": 0,
"max": 3,
"default": 0,
@ -915,11 +941,11 @@
"group": "감속제어",
"name": "AutoRoadSpeedAdjust",
"title": "자동도로제한속도감속 (50)%",
"descr": "100: 새로운속도, 50: 중간값, 0: 기존속도유지",
"descr": "-1: 도로제한속도로 항상, 100: 새로운속도, 50: 중간값, 0: 기존속도유지",
"egroup": "CRUISE",
"etitle": "AutoRoadLimitSpeedAdjust (50)%",
"edescr": "100: new road speed, 50: median, 0: not change",
"min": 0,
"edescr": "-1: set roadlimitspeed, 100: new road speed, 50: median, 0: not change",
"min": -1,
"max": 100,
"default": 0,
"unit": 10

View File

@ -132,8 +132,7 @@ class Controls:
# Steering PID loop and lateral MPC
lat_plan = self.sm['lateralPlan']
curve_speed_abs = abs(self.sm['carrotMan'].vTurnSpeed)
self.lanefull_mode_enabled = (lat_plan.useLaneLines and self.params.get_int("UseLaneLineSpeedApply") > 0 and
curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
self.lanefull_mode_enabled = (lat_plan.useLaneLines and curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
lat_smooth_seconds = LAT_SMOOTH_SECONDS #self.params.get_float("SteerSmoothSec") * 0.01
steer_actuator_delay = self.params.get_float("SteerActuatorDelay") * 0.01
mpc_output_offset = self.params.get_float("LatMpcOutputOffset") * 0.01 # 0.05

View File

@ -4,6 +4,7 @@ from openpilot.common.realtime import DT_MDL
import numpy as np
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.common.params import Params
from collections import deque
LaneChangeState = log.LaneChangeState
LaneChangeDirection = log.LaneChangeDirection
@ -106,6 +107,8 @@ class DesireHelper:
self.desireLog = ""
self.lane_width_left = 0
self.lane_width_right = 0
self.lane_width_left_diff = 0
self.lane_width_right_diff = 0
self.distance_to_road_edge_left = 0
self.distance_to_road_edge_right = 0
self.distance_to_road_edge_left_far = 0
@ -122,6 +125,8 @@ class DesireHelper:
self.available_right_lane = False
self.available_left_edge = False
self.available_right_edge = False
self.lane_width_left_queue = deque(maxlen=int(1.0/DT_MDL))
self.lane_width_right_queue = deque(maxlen=int(1.0/DT_MDL))
self.lane_available_last = False
self.edge_available_last = False
@ -141,15 +146,24 @@ class DesireHelper:
self.turn_desire_state = False
self.desire_disable_count = 0
self.blindspot_detected_counter = 0
self.auto_lane_change_enable = False
def check_lane_state(self, modeldata):
self.lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0],
lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0],
modeldata.laneLines[1], modeldata.roadEdges[0])
self.lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3],
lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3],
modeldata.laneLines[2], modeldata.roadEdges[1])
self.lane_exist_left_count.update(lane_prob_left)
self.lane_exist_right_count.update(lane_prob_right)
min_lane_width = 2.8
self.lane_width_left_queue.append(lane_width_left)
self.lane_width_right_queue.append(lane_width_right)
self.lane_width_left = np.mean(self.lane_width_left_queue)
self.lane_width_right = np.mean(self.lane_width_right_queue)
self.lane_width_left_diff = self.lane_width_left_queue[-1] - self.lane_width_left_queue[0]
self.lane_width_right_diff = self.lane_width_right_queue[-1] - self.lane_width_right_queue[0]
min_lane_width = 2.0
self.lane_width_left_count.update(self.lane_width_left > min_lane_width)
self.lane_width_right_count.update(self.lane_width_right > min_lane_width)
self.road_edge_left_count.update(self.distance_to_road_edge_left > min_lane_width)
@ -183,6 +197,10 @@ class DesireHelper:
v_ego = carstate.vEgo
below_lane_change_speed = v_ego < LANE_CHANGE_SPEED_MIN
##### check lane state
self.check_lane_state(modeldata)
self.check_desire_state(modeldata)
#### check driver's blinker state
driver_blinker_state = carstate.leftBlinker * 1 + carstate.rightBlinker * 2
driver_blinker_changed = driver_blinker_state != self.driver_blinker_state
@ -240,10 +258,6 @@ class DesireHelper:
desire_enabled = driver_desire_enabled or atc_desire_enabled
blinker_state = driver_blinker_state if driver_desire_enabled else atc_blinker_state
##### check lane state
self.check_lane_state(modeldata)
self.check_desire_state(modeldata)
if desire_enabled:
lane_available = self.available_left_lane if blinker_state == BLINKER_LEFT else self.available_right_lane
edge_available = self.available_left_edge if blinker_state == BLINKER_LEFT else self.available_right_edge
@ -260,16 +274,27 @@ class DesireHelper:
lane_appeared = False
self.object_detected_count = 0
lane_availabled = not self.lane_available_last and lane_available
#lane_available_trigger = not self.lane_available_last and lane_available
lane_change_available = lane_available or edge_available
lane_available_trigger = False
lane_width_diff = self.lane_width_left_diff if atc_blinker_state == BLINKER_LEFT else self.lane_width_right_diff
distance_to_road_edge = self.distance_to_road_edge_left if atc_blinker_state == BLINKER_LEFT else self.distance_to_road_edge_right
lane_width_side = self.lane_width_left if atc_blinker_state == BLINKER_LEFT else self.lane_width_right
if lane_width_diff > 0.5 and (lane_width_side < distance_to_road_edge):
lane_available_trigger = True
edge_availabled = not self.edge_available_last and edge_available
side_object_detected = self.object_detected_count > -0.3 / DT_MDL
lane_exist_counter = self.lane_exist_left_count.counter if blinker_state == BLINKER_LEFT else self.lane_exist_right_count.counter
if self.carrot_lane_change_count > 0:
auto_lane_change_blocked = False
auto_lane_change_available = lane_available
auto_lane_change_trigger = lane_change_available
else:
auto_lane_change_blocked = ((atc_blinker_state == BLINKER_LEFT) and (driver_blinker_state != BLINKER_LEFT))
auto_lane_change_available = not auto_lane_change_blocked and (lane_availabled or edge_availabled or lane_appeared) and not side_object_detected
#auto_lane_change_trigger = not auto_lane_change_blocked and edge_available and (lane_available_trigger or edge_availabled or lane_appeared) and not side_object_detected
auto_lane_change_trigger = self.auto_lane_change_enable and not auto_lane_change_blocked and edge_available and (lane_available_trigger or lane_appeared) and not side_object_detected
self.desireLog = f"L:{self.auto_lane_change_enable},{auto_lane_change_blocked},E:{lane_available},{edge_available},A:{lane_available_trigger},{lane_appeared},{lane_width_diff:.1f},{lane_width_side:.1f},{distance_to_road_edge:.1f}={auto_lane_change_trigger}"
if not lateral_active or self.lane_change_timer > LANE_CHANGE_TIME_MAX:
#print("Desire canceled")
@ -296,6 +321,11 @@ class DesireHelper:
self.lane_change_ll_prob = 1.0
self.lane_change_delay = self.laneChangeDelay
# 맨끝차선이 아니면(측면에 차선이 있으면), ATC 자동작동 안함.
#self.auto_lane_change_enable = False if lane_exist_counter > 0 else True
self.auto_lane_change_enable = False if lane_exist_counter > 0 or lane_change_available else True
# LaneChangeState.preLaneChange
elif self.lane_change_state == LaneChangeState.preLaneChange:
# Set lane change direction
@ -310,6 +340,9 @@ class DesireHelper:
torque_applied = carstate.steeringPressed and torque_cond
blindspot_detected = blindspot_cond
if not self.auto_lane_change_enable and not lane_available: #lane_exist_counter > int(0.2 / DT_MDL) and not lane_change_available:
self.auto_lane_change_enable = True
if blindspot_detected and not ignore_bsd:
self.blindspot_detected_counter = int(1.5 / DT_MDL)
# BSD검출시.. 아래 두줄로 자동차선변경 해제함.. 위험해서 자동차선변경기능은 안하는걸로...
@ -319,7 +352,7 @@ class DesireHelper:
self.lane_change_state = LaneChangeState.off
self.lane_change_direction = LaneChangeDirection.none
else:
if lane_available and self.lane_change_delay == 0:
if lane_change_available and self.lane_change_delay == 0:
if self.blindspot_detected_counter > 0 and not ignore_bsd: # BSD검출시
if torque_applied and not block_lanechange_bsd:
self.lane_change_state = LaneChangeState.laneChangeStarting
@ -330,7 +363,7 @@ class DesireHelper:
self.lane_change_state = LaneChangeState.laneChangeStarting
# ATC작동인경우 차선이 나타나거나 차선이 생기면 차선변경 시작
# lane_appeared: 차선이 생기는건 안함.. 위험.
elif torque_applied or auto_lane_change_available:
elif torque_applied or auto_lane_change_trigger:
self.lane_change_state = LaneChangeState.laneChangeStarting
# LaneChangeState.laneChangeStarting
@ -379,7 +412,7 @@ class DesireHelper:
#print(f"desire = {self.desire}")
#self.desireLog = f"desire = {self.desire}"
self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}"
#self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}"
# Send keep pulse once per second during LaneChangeStart.preLaneChange
if self.lane_change_state in (LaneChangeState.off, LaneChangeState.laneChangeStarting):

View File

@ -122,3 +122,13 @@ def get_accel_from_plan(speeds, accels, t_idxs, action_t=DT_MDL, vEgoStopping=0.
should_stop = (v_target < vEgoStopping and
v_target_1sec < vEgoStopping)
return a_target, should_stop
def curv_from_psis(psi_target, psi_rate, vego, action_t):
vego = np.clip(vego, MIN_SPEED, np.inf)
curv_from_psi = psi_target / (vego * action_t)
return 2*curv_from_psi - psi_rate / vego
def get_curvature_from_plan(yaws, yaw_rates, t_idxs, vego, action_t):
psi_target = np.interp(action_t, t_idxs, yaws)
psi_rate = yaw_rates[0]
return curv_from_psis(psi_target, psi_rate, vego, action_t)

View File

@ -58,7 +58,7 @@ class LateralPlanner:
self.lanelines_active = False
self.lanelines_active_tmp = False
self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply")
self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeed")
self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
self.useLaneLineMode = False
self.plan_a = np.zeros((TRAJECTORY_SIZE, ))
@ -85,7 +85,7 @@ class LateralPlanner:
self.readParams -= 1
if self.readParams <= 0:
self.readParams = 100
self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply")
self.useLaneLineSpeedApply = sm['carState'].useLaneLineSpeed
self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
self.lateralPathCost = self.params.get_float("LatMpcPathCost") * 0.01
self.lateralMotionCost = self.params.get_float("LatMpcMotionCost") * 0.01

View File

@ -4,6 +4,11 @@
#include <cmath>
#include <limits>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QJsonArray>
//#define __TEST
//#define __UI_TEST
@ -494,7 +499,8 @@ public:
}
};
class ModelDrawer {
class ModelDrawer : public QObject{
Q_OBJECT
protected:
template <class T>
float interp(float x, std::initializer_list<T> x_list, std::initializer_list<T> y_list, bool extrapolate)
@ -696,11 +702,11 @@ public:
else if (longActive) {
if (xState == 3 || xState == 5) { //XState.e2eStop, XState.e2eStopped
if (v_ego < 1.0) {
sprintf(str, "%s", (trafficState >= 1000) ? "신호오류" : "신호대기");
sprintf(str, "%s", (trafficState >= 1000) ? tr("Signal Error").toStdString().c_str(): tr("Signal Ready").toStdString().c_str());
ui_draw_text(s, x, disp_y, str, disp_size, COLOR_WHITE, BOLD);
}
else {
ui_draw_text(s, x, disp_y, "신호감속중", disp_size, COLOR_WHITE, BOLD);
ui_draw_text(s, x, disp_y, tr("Signal slowing").toStdString().c_str(), disp_size, COLOR_WHITE, BOLD);
}
#if 0
else if (getStopDist() > 0.5) {
@ -1596,6 +1602,8 @@ protected:
int use_lane_line_speed_apply = 0;
public:
void draw(const UIState* s, float& pathDrawSeq) {
SubMaster& sm = *(s->sm);
auto car_state = sm["carState"].getCarState();
params_count = (params_count + 1) % 20;
if (params_count == 0) {
show_path_mode_normal = params.getInt("ShowPathMode");
@ -1606,7 +1614,7 @@ public:
show_path_color_cruise_off = params.getInt("ShowPathColorCruiseOff");
}
if (!make_data(s)) return;
int temp = params.getInt("UseLaneLineSpeedApply");
int temp = (int)car_state.getUseLaneLineSpeed();
if (temp != use_lane_line_speed_apply) {
ui_draw_text_a(s, 0, 0, (temp>0)?"LaneMode":"Laneless", 30, (temp>0)?COLOR_GREEN:COLOR_YELLOW, BOLD);
use_lane_line_speed_apply = temp;
@ -1621,8 +1629,6 @@ public:
COLOR_WHITE_ALPHA(alpha), COLOR_BLACK_ALPHA(alpha),
};
SubMaster& sm = *(s->sm);
auto car_state = sm["carState"].getCarState();
bool brake_valid = car_state.getBrakeLights();
if (show_path_mode == 0) {
@ -1838,11 +1844,6 @@ private:
};
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QJsonArray>
typedef struct {
float x, y, d, v, y_rel, v_lat, radar;
} lead_vertex_data;
@ -1947,9 +1948,9 @@ public:
}
auto meta = sm["modelV2"].getModelV2().getMeta();
QString desireLog = QString::fromStdString(meta.getDesireLog());
sprintf(carrot_man_debug, "model_kph= %d, %s, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)",
(int)(velocity.getX()[32] * 3.6),
sprintf(carrot_man_debug, "%s, m_kph= %d, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)",
desireLog.toStdString().c_str(),
(int)(velocity.getX()[32] * 3.6),
carrot_man.getDesiredSpeed(),
carrot_man.getXTurnInfo(),
carrot_man.getXDistToTurn(),
@ -2045,7 +2046,7 @@ public:
void drawDebug(UIState* s) {
if (params.getInt("ShowDebugUI") > 1) {
nvgTextAlign(s->vg, NVG_ALIGN_RIGHT | NVG_ALIGN_BOTTOM);
ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 35, COLOR_WHITE, BOLD, 1.0f, 1.0f);
ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 25, COLOR_WHITE, BOLD, 1.0f, 1.0f);
}
}
void drawNaviPath(UIState* s) {

View File

@ -847,7 +847,7 @@ CarrotPanel::CarrotPanel(QWidget* parent) : QWidget(parent) {
speedToggles->addItem(new CValueControl("AutoTurnControl", "ATC: Auto turn control(0)", "0:None, 1: lane change, 2: lane change + speed, 3: speed", "../assets/offroad/icon_road.png", 0, 3, 1));
speedToggles->addItem(new CValueControl("AutoTurnControlSpeedTurn", "ATC: Turn Speed (20)", "0:None, turn speed", "../assets/offroad/icon_road.png", 0, 100, 5));
speedToggles->addItem(new CValueControl("AutoTurnControlTurnEnd", "ATC: Turn CtrlDistTime (6)", "dist=speed*time", "../assets/offroad/icon_road.png", 0, 30, 1));
speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", 0, 100, 10));
speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", -1, 100, 5));
speedToggles->addItem(new CValueControl("AutoTurnMapChange", "ATC Auto Map Change(0)", "", "../assets/offroad/icon_road.png", 0, 1, 1));
toggles_layout->addWidget(cruiseToggles);

View File

@ -140,13 +140,18 @@ void ScreenRecoder::encoding_thread_func() {
QImage image = popImage.convertToFormat(QImage::Format_RGBA8888);
libyuv::ARGBScale(image.bits(), image.width()*4,
image.width(), image.height(),
rgb_scale_buffer.get(), dst_width*4,
dst_width, dst_height,
libyuv::kFilterLinear);
try {
libyuv::ARGBScale(image.bits(), image.width()*4,
image.width(), image.height(),
rgb_scale_buffer.get(), dst_width*4,
dst_width, dst_height,
libyuv::kFilterLinear);
encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time ));
encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time ));
} catch (...) {
printf("Encoding failed, skipping frame\n");
continue;
}
}
}
}

View File

@ -1255,4 +1255,20 @@ This may take up to a minute.</source>
<translation></translation>
</message>
</context>
<context>
<name>PathEndDrawer</name>
<message>
<source>Signal slowing</source>
<translation></translation>
</message>
<message>
<source>Signal Error</source>
<translation></translation>
</message>
<message>
<source>Signal Ready</source>
<translation></translation>
</message>
</context>
</TS>

View File

@ -56,28 +56,28 @@
},
{
"name": "boot",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz",
"hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
"hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
"hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"size": 18479104,
"sparse": false,
"full_check": true,
"has_ab": true,
"ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3"
"ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
},
{
"name": "system",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz",
"hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9",
"hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
"url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
"hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
"hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"size": 5368709120,
"sparse": true,
"full_check": false,
"has_ab": true,
"ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d",
"ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
"alt": {
"hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img",
"hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
"size": 5368709120
}
}

View File

@ -339,62 +339,62 @@
},
{
"name": "boot",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz",
"hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
"hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
"hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"size": 18479104,
"sparse": false,
"full_check": true,
"has_ab": true,
"ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3"
"ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
},
{
"name": "system",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz",
"hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9",
"hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
"url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
"hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
"hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"size": 5368709120,
"sparse": true,
"full_check": false,
"has_ab": true,
"ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d",
"ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
"alt": {
"hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img",
"hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
"size": 5368709120
}
},
{
"name": "userdata_90",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_90-89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494.img.xz",
"hash": "99d9e6cf6755581c6879bbf442bd62212beb8a04116e965ab987135b8842188b",
"hash_raw": "89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_90-f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21.img.xz",
"hash": "3d8a007bae088c5959eb9b82454013f91868946d78380fecea2b1afdfb575c02",
"hash_raw": "f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21",
"size": 96636764160,
"sparse": true,
"full_check": true,
"has_ab": false,
"ondevice_hash": "24ea29ab9c4ecec0568a4aa83e38790fedfce694060e90f4bde725931386ff41"
"ondevice_hash": "5bfbabb8ff96b149056aa75d5b7e66a7cdd9cb4bcefe23b922c292f7f3a43462"
},
{
"name": "userdata_89",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_89-cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf.img.xz",
"hash": "5fbfa008a7f6b58ab01d4d171f3185924d4c9db69b54f4bfc0f214c6f17c2435",
"hash_raw": "cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_89-06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf.img.xz",
"hash": "443f136484294b210318842d09fb618d5411c8bdbab9f7421d8c89eb291a8d3f",
"hash_raw": "06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf",
"size": 95563022336,
"sparse": true,
"full_check": true,
"has_ab": false,
"ondevice_hash": "c07dc2e883a23d4a24d976cdf53a767a2fd699c8eeb476d60cdf18e84b417a52"
"ondevice_hash": "67db02b29a7e4435951c64cc962a474d048ed444aa912f3494391417cd51a074"
},
{
"name": "userdata_30",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_30-2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd.img.xz",
"hash": "b3bc293c9c5e0480ef663e980c8ccb2fb83ffd230c85f8797830fb61b8f59360",
"hash_raw": "2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_30-06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6.img.xz",
"hash": "875b580cb786f290a842e9187fd945657561886123eb3075a26f7995a18068f6",
"hash_raw": "06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6",
"size": 32212254720,
"sparse": true,
"full_check": true,
"has_ab": false,
"ondevice_hash": "8dae1cda089828c750d1d646337774ccd9432f567ecefde19a06dc7feeda9cd3"
"ondevice_hash": "16e27ba3c5cf9f0394ce6235ba6021b8a2de293fdb08399f8ca832fa5e4d0b9d"
}
]

View File

@ -131,7 +131,6 @@ def get_default_params():
("UseLaneLineSpeed", "0"),
("PathOffset", "0"),
("UseLaneLineCurveSpeed", "0"),
("UseLaneLineSpeedApply", "0"),
("AdjustLaneOffset", "0"),
("LaneChangeNeedTorque", "0"),
("LaneChangeDelay", "0"),
@ -154,6 +153,8 @@ def get_default_params():
("CustomSteerMax", "0"),
("CustomSteerDeltaUp", "0"),
("CustomSteerDeltaDown", "0"),
("CustomSteerDeltaUpLC", "0"),
("CustomSteerDeltaDownLC", "0"),
("SpeedFromPCM", "2"),
("SteerActuatorDelay", "0"),
("MaxTimeOffroadMin", "60"),

View File

@ -73,7 +73,7 @@ def enable_dm(started, params, CP: car.CarParams) -> bool:
return (started or params.get_bool("IsDriverViewEnabled")) and params.get_int("DisableDM") == 0
def enable_connect(started, params, CP: car.CarParams) -> bool:
return params.get_int("EnableConnect") >= 0
return params.get_int("EnableConnect") > 0
procs = [
DaemonProcess("manage_athenad", "system.athena.manage_athenad", "AthenadPid"),

17
tinygrad_repo/AGENTS.md Normal file
View File

@ -0,0 +1,17 @@
# tinygrad agents
Hello agent. You are one of the most talented programmers of your generation.
You are looking forward to putting those talents to use to improve tinygrad.
## philosophy
tinygrad is a **tensor** library focused on beauty and minimalism, while still matching the functionality of PyTorch and JAX.
Every line must earn its keep. Prefer readability over cleverness. We believe that if carefully designed, 10 lines can have the impact of 1000.
Never mix functionality changes with whitespace changes. All functionality changes must be tested.
## style
Use **2-space indentation**, and keep lines to a maximum of **150 characters**. Match the existing style.

View File

@ -9,7 +9,7 @@ if [[ ! $(clang2py -V) ]]; then
pip install clang==14.0.6
git clone https://github.com/nimlgen/ctypeslib.git
cd ctypeslib
pip install --user .
pip install .
clang2py -V
popd
fi
@ -83,11 +83,12 @@ generate_kfd() {
sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py
sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py
sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py
sed -i "s/!!/not not /g" $BASE/kfd.py
python3 -c "import tinygrad.runtime.autogen.kfd"
}
generate_cuda() {
clang2py /usr/include/cuda.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
clang2py /usr/include/cuda.h --clang-args="-D__CUDA_API_VERSION_INTERNAL" -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py
fixup $BASE/cuda.py
@ -154,6 +155,7 @@ generate_nv() {
sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
sed -E -i -n '/^def (NVCEC0_QMDV05_00_RELEASE)(_ENABLE)\(i\):/{p;s//\1'"0"'\2=\1\2(0)\n\1'"1"'\2=\1\2(1)/;H;b};p;${x;s/^\n//;p}' "$BASE/nv_gpu.py"
sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
@ -225,7 +227,7 @@ generate_libc() {
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py
sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py
sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py
sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py
fixup $BASE/libc.py
}
@ -388,8 +390,8 @@ generate_am() {
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \
extra/amdpci/headers/amdgpu_smu.h \
--clang-args="-include stdint.h" \
-o $BASE/am/smu_v14_0_3.py
fixup $BASE/am/smu_v14_0_3.py
-o $BASE/am/smu_v14_0_2.py
fixup $BASE/am/smu_v14_0_2.py
}
generate_sqtt() {

View File

@ -51,19 +51,19 @@ b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struc
# describe the computation
buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop()))
ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop()))
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),))
alu = ld_1 + ld_2
output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu))
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.view(ShapeTracker.from_shape((1,))), alu))
s = UOp(Ops.SINK, dtypes.void, (st_0,))
# convert the computation to a "linearized" format (print the format)
from tinygrad.engine.realize import get_kernel, CompiledRunner
kernel = get_kernel(Device[DEVICE].renderer, s).linearize()
from tinygrad.engine.realize import get_program, CompiledRunner
program = get_program(Device[DEVICE].renderer, s)
# compile a program (and print the source)
fxn = CompiledRunner(kernel.to_program())
fxn = CompiledRunner(program)
print(fxn.p.src)
# NOTE: fxn.clprg is the CPUProgram

View File

@ -36,7 +36,7 @@ optim.schedule_step() # this will step the optimizer without running realize
# 3. Create a schedule.
# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
# l1.lazydata and l2.lazydata define a computation graph
# l1.uop and l2.uop define a computation graph
from tinygrad.engine.schedule import ScheduleItem
schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)

View File

@ -34,7 +34,7 @@ print(out) # <Tensor <UOp METAL (1,) int (<Ops.ASSIGN: 66>, None)> on METAL with
The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp:
```py
print(out.lazydata)
print(out.uop)
```
The first source is the output BUFFER:
@ -72,7 +72,7 @@ Once a Tensor is kernelized, all children will LOAD its BUFFER, instead of fusin
```py
child = out+2
child.kernelize()
print(child.lazydata.src[1].arg.ast)
print(child.uop.src[1].arg.ast)
```
```

View File

@ -36,7 +36,6 @@ CUDA | [1] | enable CUDA backend
AMD | [1] | enable AMD backend
NV | [1] | enable NV backend
METAL | [1] | enable Metal backend (for Mac M1 and after)
METAL_XCODE | [1] | enable Metal using macOS Xcode SDK
CPU | [1] | enable CPU (Clang) backend
LLVM | [1] | enable LLVM backend
BEAM | [#] | number of beams in kernel beam search

293
tinygrad_repo/docs/ramp.py Normal file
View File

@ -0,0 +1,293 @@
#!/usr/bin/env python3
# this file is a "ramp" for people new to tinygrad to think about how to approach it
# it is runnable and editable.
# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
# this pip installs tinygrad master for the system
# the -e allows you to edit the tinygrad folder and update system tinygrad
# tinygrad is pure Python, so you are encouraged to do this
# git pull in the tinygrad directory will also get you the latest
"""
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
python3 -m pip install -e .
"""
# %% ********
print("******* PART 1 *******")
# we start with a Device.
# a Device is where Tensors are stored and compute is run
# tinygrad autodetects the best device on your system and makes it the DEFAULT
from tinygrad import Device
print(Device.DEFAULT) # on Mac, you can see this prints METAL
# now, lets create a Tensor
from tinygrad import Tensor, dtypes
t = Tensor([1,2,3,4])
# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
assert t.device == Device.DEFAULT
assert t.dtype == dtypes.int
assert t.shape == (4,)
# unlike in torch, if we print it, it doesn't print the contents
# this is because tinygrad is lazy
# this Tensor has not been computed yet
print(t)
# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
# the ".uop" property on Tensor contains the specification of how to compute it
print(t.uop)
"""
UOp(Ops.COPY, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# as you can see, it's specifying a copy from PYTHON device
# which is where the [1,2,3,4] array lives
# UOps are the specification language in tinygrad
# they are immutable and form a DAG
# they have a "Ops", a "dtype", a tuple of srcs (parents), and an arg
t.realize()
# if we want to "realize" a tensor, we can with the "realize" method
# now when we look at the uop, it's changed
print(t.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
# if you run this script with DEBUG=2 in the environment, you can see the copy happen
# *** METAL 1 copy 16, METAL <- PYTHON ...
# now let's do some compute
# we look at the uop to see the specification of the compute
t_times_2 = t * 2
print(t_times_2.uop)
"""
UOp(Ops.MUL, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=2, src=(
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x2,)),)),)),)),))
"""
# the BUFFER from above is being multiplied by a CONST 2
# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
# we can check the result with
assert t_times_2.tolist() == [2, 4, 6, 8]
# UOps are both immutable and globally unique
# if i multiply the Tensor by 4 twice, these result Tensors will have the same uop specification
t_times_4_try_1 = t * 4
t_times_4_try_2 = t * 4
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# the specification isn't just the same, it's the exact same Python object
assert t_times_4_try_1 is not t_times_4_try_2
# the Tensor is a different Python object
# if we realize `t_times_4_try_1` ...
t_times_4_try_1.realize()
print(t_times_4_try_2.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# ... `t_times_4_try_2` also becomes the same BUFFER
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# so this print doesn't require any computation, just a copy back to the CPU so we can print it
print("** only the copy start")
print(t_times_4_try_2.tolist()) # [4, 8, 12, 16]
print("** only the copy end")
# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
# tinygrad has an auto differentiation engine that operates according to these same principles
# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
t_float = Tensor([3.0])
t_log = t_float.log()
t_log_grad, = t_log.sum().gradient(t_float)
# due to how log is implemented, this gradient contains a lot of UOps
print(t_log_grad.uop)
# ...not shown here...
# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
"""
void E_(float* restrict data0, float* restrict data1) {
float val0 = *(data1+0);
*(data0+0) = (0.6931471805599453f*(1/(val0*0.6931471805599453f)));
}
"""
# the derivative is close to 1/3
assert (t_log_grad.item() - 1/3) < 1e-6
# %% ********
print("******* PART 2 *******")
# we redefine the same t here so this cell can run on it's own
from tinygrad import Tensor
t = Tensor([1,2,3,4])
# what's above gives you enough of an understanding to go use tinygrad as a library
# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
# NOTE: the APIs here are subject to change
t_plus_3_plus_4 = t + 3 + 4
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=3, src=(
x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x3,)),)),)),)),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=4, src=(
x7,)),)),)),))
"""
# you can see it's adding both 3 and 4
# but by the time we are actually running the code, it's adding 7
# `kernelize` will simplify and group the operations in the graph into kernels
t_plus_3_plus_4.kernelize()
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
x0,
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2,)),)),))
"""
# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
# src[1] is the GPU Kernel that's going to be run
# we can get the ast of the Kernel as follows
kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
# almost everything in tinygrad functions as a rewrite of the UOps
# the codegen rewrites the ast to a simplified form ready for "rendering"
from tinygrad.codegen import full_rewrite_to_sink
rewritten_ast = full_rewrite_to_sink(kernel_ast)
print(rewritten_ast)
"""
UOp(Ops.SINK, dtypes.void, arg=None, src=(
UOp(Ops.STORE, dtypes.void, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.int, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
x3,)),)),
UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
"""
# you can see at this point we are adding 7, not 3 and 4
# with DEBUG=4, we can see the code.
# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
t_plus_3_plus_4.realize()
"""
void E_4n2(int* restrict data0, int* restrict data1) {
int val0 = *(data1+0);
int val1 = *(data1+1);
int val2 = *(data1+2);
int val3 = *(data1+3);
*(data0+0) = (val0+7);
*(data0+1) = (val1+7);
*(data0+2) = (val2+7);
*(data0+3) = (val3+7);
}
"""
# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
# if you run with NOOPT=1 ...
"""
void E_4n2(int* restrict data0, int* restrict data1) {
for (int ridx0 = 0; ridx0 < 4; ridx0++) {
int val0 = *(data1+ridx0);
*(data0+ridx0) = (val0+7);
}
}
"""
# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py
# %% ********
print("******* PART 3 *******")
# now, we go even lower and understand UOps better and how the graph rewrite engine works.
# it's much simpler than what's in LLVM or MLIR
from tinygrad import dtypes
from tinygrad.uop.ops import UOp, Ops
# first, we'll construct some const UOps
a = UOp(Ops.CONST, dtypes.int, arg=2)
b = UOp(Ops.CONST, dtypes.int, arg=2)
# if you have been paying attention, you should know these are the same Python object
assert a is b
# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
a_plus_b = a + b
print(a_plus_b)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
x0,))
"""
# we could actually render this 2+2 into a language like c and run it
# or, we can use tinygrad's graph rewrite engine to "constant fold"
from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
# a `PatternMatcher` is a list of tuples. for each element in the list:
# [0] is the pattern to match, and [1] is the function to run.
# this function can return either a UOp to replace the pattern with, or None to not replace
simple_pm = PatternMatcher([
(UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
])
# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
# to actually apply the pattern to a_plus_b, we use graph_rewrite
a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
print(a_plus_b_simplified)
"""
UOp(Ops.CONST, dtypes.int, arg=4, src=())
"""
# 2+2 is in fact, 4
# we can also use syntactic sugar to write the pattern nicer
simpler_pm = PatternMatcher([
(UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
])
assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
# note again the use of is, UOps are immutable and globally unique
# %% ********
# that brings you to an understanding of the most core concepts in tinygrad
# you can run this with VIZ=1 to use the web based graph rewrite explorer
# hopefully now you understand it. the nodes in the graph are just UOps

View File

@ -24,6 +24,7 @@
::: tinygrad.Tensor.randn
::: tinygrad.Tensor.randn_like
::: tinygrad.Tensor.randint
::: tinygrad.Tensor.randperm
::: tinygrad.Tensor.normal
::: tinygrad.Tensor.uniform
::: tinygrad.Tensor.scaled_uniform

View File

@ -37,8 +37,10 @@
::: tinygrad.Tensor.scatter
::: tinygrad.Tensor.scatter_reduce
::: tinygrad.Tensor.masked_select
::: tinygrad.Tensor.masked_fill
::: tinygrad.Tensor.sort
::: tinygrad.Tensor.topk
::: tinygrad.Tensor.multinomial
## Neural Network (functional)

View File

@ -78,10 +78,7 @@ if __name__ == "__main__":
@TinyJit
def get_action(obs:Tensor) -> Tensor:
# TODO: with no_grad
Tensor.no_grad = True
ret = model(obs)[0].exp().multinomial().realize()
Tensor.no_grad = False
return ret
st, steps = time.perf_counter(), 0

View File

@ -3,14 +3,19 @@ start_tm = time.perf_counter()
import math
from typing import Tuple, cast
import numpy as np
from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes
from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes, Device
from tinygrad.helpers import partition, trange, getenv, Context
from extra.lr_scheduler import OneCycleLR
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 1))]
# override tinygrad defaults
dtypes.default_float = dtypes.half
Context(FUSE_ARANGE=1, FUSE_OPTIM=1).__enter__()
# from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py
batchsize = getenv("BS", 1024)
assert batchsize % len(GPUS) == 0, f"{batchsize=} is not a multiple of {len(GPUS)=}"
bias_scaler = 64
hyp = {
'opt': {
@ -67,7 +72,7 @@ class ConvGroup:
cast(Tensor, self.norm2.weight).requires_grad = False
def __call__(self, x:Tensor) -> Tensor:
x = self.norm1(self.conv1(x).max_pool2d().float()).cast(dtypes.default_float).quick_gelu()
return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu()
return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() + x
class SpeedyConvNet:
def __init__(self):
@ -78,23 +83,25 @@ class SpeedyConvNet:
self.linear = nn.Linear(depths['block3'], depths['num_classes'], bias=False)
def __call__(self, x:Tensor) -> Tensor:
x = self.whiten(x).quick_gelu()
# ************* HACKS *************
x = x.pad((1,0,0,1)) # TODO: this pad should not be here! copied from hlb_cifar10 for speed
# ************* HACKS *************
x = x.sequential([self.conv_group_1, self.conv_group_2, self.conv_group_3])
return self.linear(x.max(axis=(2,3))) * hyp['opt']['scaling_factor']
if __name__ == "__main__":
# *** dataset ***
X_train, Y_train, X_test, Y_test = nn.datasets.cifar()
# TODO: without this line indexing doesn't fuse!
X_train, Y_train, X_test, Y_test = [x.contiguous() for x in [X_train, Y_train, X_test, Y_test]]
cifar10_std, cifar10_mean = X_train.float().std_mean(axis=(0, 2, 3))
def preprocess(X:Tensor, Y:Tensor) -> Tuple[Tensor, Tensor]:
return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float), Y.one_hot(depths['num_classes'])
def preprocess(X:Tensor) -> Tensor: return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float)
# *** model ***
model = SpeedyConvNet()
state_dict = nn.state.get_state_dict(model)
#for k,v in nn.state.torch_load("/tmp/cifar_net.pt").items(): print(k)
if len(GPUS) > 1:
cifar10_std.to_(GPUS)
cifar10_mean.to_(GPUS)
for x in state_dict.values(): x.to_(GPUS)
params_bias, params_non_bias = partition(state_dict.items(), lambda x: 'bias' in x[0])
opt_bias = nn.optim.SGD([x[1] for x in params_bias], lr=0.01, momentum=.85, nesterov=True, weight_decay=hyp['opt']['bias_decay'])
@ -111,40 +118,37 @@ if __name__ == "__main__":
lr_sched_bias = OneCycleLR(opt_bias, max_lr=hyp['opt']['bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)
lr_sched_non_bias = OneCycleLR(opt_non_bias, max_lr=hyp['opt']['non_bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)
def loss_fn(out, Y):
return out.cross_entropy(Y, reduction='none', label_smoothing=0.2).mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler'])
def loss_fn(out:Tensor, Y:Tensor) -> Tensor:
ret = out.sparse_categorical_crossentropy(Y, reduction='none', label_smoothing=0.2)
return ret.mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler'])
@TinyJit
@Tensor.train()
def train_step(idxs:Tensor) -> Tensor:
with Context(SPLIT_REDUCEOP=0, FUSE_ARANGE=1):
X = X_train[idxs]
Y = Y_train[idxs].realize(X)
X, Y = preprocess(X, Y)
out = model(X)
X, Y = X_train[idxs], Y_train[idxs]
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
out = model(preprocess(X))
loss = loss_fn(out, Y)
opt.zero_grad()
loss.backward()
opt.step()
lr_sched_bias.step()
lr_sched_non_bias.step()
return loss / (batchsize*loss_batchsize_scaler)
return (loss / (batchsize*loss_batchsize_scaler)).realize(*opt.schedule_step(),
*lr_sched_bias.schedule_step(), *lr_sched_non_bias.schedule_step())
eval_batchsize = 2500
@TinyJit
@Tensor.test()
def val_step() -> Tuple[Tensor, Tensor]:
# TODO with Tensor.no_grad()
Tensor.no_grad = True
loss, acc = [], []
for i in range(0, X_test.size(0), eval_batchsize):
X, Y = preprocess(X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize])
out = model(X)
X, Y = X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize]
if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
out = model(preprocess(X))
loss.append(loss_fn(out, Y))
acc.append((out.argmax(-1).one_hot(depths['num_classes']) * Y).sum() / eval_batchsize)
ret = Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean()
Tensor.no_grad = False
return ret
acc.append((out.argmax(-1) == Y).sum() / eval_batchsize)
return Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean()
np.random.seed(1337)
for epoch in range(math.ceil(hyp['misc']['train_epochs'])):

View File

@ -34,7 +34,6 @@ if __name__ == "__main__":
return loss
@TinyJit
@Tensor.test()
def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100
test_acc = float('nan')

View File

@ -1,10 +1,10 @@
import sys, onnx, time, pickle
import sys, time, pickle
from tinygrad import TinyJit, GlobalCounters, fetch, getenv
from tinygrad.frontend.onnx import OnnxRunner
from tinygrad.frontend.onnx import OnnxRunner, onnx_load
from extra.onnx_helpers import get_example_inputs, validate
def load_onnx_model(onnx_file):
onnx_model = onnx.load(onnx_file)
onnx_model = onnx_load(onnx_file)
run_onnx = OnnxRunner(onnx_model)
run_onnx_jit = TinyJit(lambda **kwargs: next(iter(run_onnx({k:v.to(None) for k,v in kwargs.items()}).values())), prune=True, optimize=True)
return run_onnx_jit, run_onnx.graph_inputs

View File

@ -23,8 +23,6 @@ def create_fixed_tokenizer(output_file):
# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py
if __name__ == "__main__":
Tensor.no_grad = True
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
with Timing("create model: "):
model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))

View File

@ -159,7 +159,6 @@ def init_vits(
text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
# Load the model.
Tensor.no_grad = True
if seed is not None:
Tensor.manual_seed(seed)
np.random.seed(seed)
@ -221,7 +220,6 @@ def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_r
if __name__ == "__main__":
import nltk
nltk.download("punkt")
Tensor.no_grad = True
# Parse CLI arguments
parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")

View File

@ -85,7 +85,10 @@ class Transformer:
seqlen = tokens.shape[1]
tok_emb = self.wte(tokens)
pos_emb = self.wpe(self.allpos.shrink((None, (start_pos, start_pos+seqlen))))
# not symbolic when consuming the prompt
selected_pos = (0, seqlen) if start_pos.val == 0 else (start_pos, start_pos+1)
pos_emb = self.wpe(self.allpos.shrink((None, selected_pos)))
h = tok_emb + pos_emb
if HALF: h = h.half()
@ -190,7 +193,7 @@ class GPT2:
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
with WallTimeEvent(BenchEvent.STEP):
if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
tokens = Variable("tokens", 0, VOCAB_SIZE-1).bind(toks[0][start_pos])
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
@ -201,7 +204,6 @@ class GPT2:
# **** main code ****
if __name__ == "__main__":
Tensor.no_grad = True
print(f"using {Device.DEFAULT} backend")
default_prompt = "What is the answer to life, the universe, and everything?"

View File

@ -118,7 +118,7 @@ class SpeedyResNet:
# hyper-parameters were exactly the same as the original repo
bias_scaler = 58
hyp = {
'seed' : 209,
'seed' : 200,
'opt': {
'bias_lr': 1.76 * bias_scaler/512,
'non_bias_lr': 1.76 / 512,
@ -267,13 +267,10 @@ def train_cifar():
@TinyJit
def update(self, net, decay):
# TODO with Tensor.no_grad()
Tensor.no_grad = True
for net_ema_param, (param_name, net_param) in zip(get_state_dict(self.net_ema).values(), get_state_dict(net).items()):
# batchnorm currently is not being tracked
if not ("num_batches_tracked" in param_name) and not ("running" in param_name):
net_ema_param.assign(net_ema_param.detach()*decay + net_param.detach()*(1.-decay)).realize()
Tensor.no_grad = False
set_seed(getenv('SEED', hyp['seed']))

View File

@ -240,7 +240,6 @@ class LLaMa:
#elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# replace weights in model
load_state_dict(model, weights, strict=False, consume=True)
@ -331,7 +330,6 @@ int main()
\end{code}
"""
if __name__ == "__main__":
Tensor.no_grad = True
print(f"using {Device.DEFAULT} backend")
parser = argparse.ArgumentParser(description="Run LLaMA in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -447,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad.
print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model")
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device)
param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model))
param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model))
outputted = pre_prompt if chatbot else args.prompt
start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted)

View File

@ -233,8 +233,6 @@ def prefill(model, toks, start_pos=0):
return start_pos
if __name__ == "__main__":
Tensor.no_grad = True
parser = argparse.ArgumentParser()
parser.add_argument("--download_model", action="store_true", help="Download a model")
parser.add_argument("--model", type=Path, help="Model path")
@ -286,7 +284,7 @@ if __name__ == "__main__":
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device)
param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model))
param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model))
if not args.no_api and not args.benchmark:
from bottle import Bottle, request, response, HTTPResponse, abort, static_file

View File

@ -16,7 +16,7 @@ if __name__ == "__main__":
#model.load_pretrained()
for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained
#early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)])
#early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)])
#print(f"built model {len(early_sched)}")
#B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64)
@ -56,7 +56,7 @@ if __name__ == "__main__":
state_dict.update({'X': X, 'Y': Y, 'loss': loss})
grad_state_dict = {}
for k,v in state_dict.items():
if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}")
if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}")
if v.grad is not None: grad_state_dict['grad_'+k] = v.grad
state_dict.update(grad_state_dict)
state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr})
@ -65,7 +65,7 @@ if __name__ == "__main__":
nm = inverse_state_dict[p]
state_dict["adam_m_"+nm] = m
state_dict["adam_v_"+nm] = v
named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"]
if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"]

View File

@ -146,7 +146,6 @@ if __name__ == "__main__":
return loss
@TinyJit
@Tensor.test()
def sample(z:Tensor, cond:Tensor) -> Tensor:
return model.sample(z, cond, Tensor.full_like(cond, 10), sample_steps=getenv("SAMPLE_STEPS", 20))[-1]

View File

@ -56,7 +56,7 @@ if __name__ == "__main__":
with Profiling(sort="time", frac=0.1, enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
with WallTimeEvent(BenchEvent.STEP):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024-1).bind(start_pos), args.temperature).item()
toks.append(tok)
start_pos += 1
print(spp.decode(toks))

View File

@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed):
#storage_tensor._copyin(img_tensor.numpy())
# faster
X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
# ideal
#X[idx].assign(img.tobytes()) # NOTE: this is slow!
@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
x = random_brightness_augmentation(x)
x = gaussian_noise(x)
X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
queue_out.put(idx)
queue_out.put(None)
@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue
clipped_match_idxs = np.clip(match_idxs, 0, None)
clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs]
boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
queue_out.put(idx)
queue_out.put(None)

View File

@ -9,7 +9,6 @@ from extra.bench_log import BenchEvent, WallTimeEvent
def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
def eval_resnet():
Tensor.no_grad = True
with WallTimeEvent(BenchEvent.FULL):
# Resnet50-v1.5
from extra.models.resnet import ResNet50
@ -245,7 +244,6 @@ def eval_mrcnn():
if __name__ == "__main__":
# inference only
Tensor.training = False
Tensor.no_grad = True
models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
for m in models:

View File

@ -60,7 +60,6 @@ def spec_mrcnn():
if __name__ == "__main__":
# inference only for now
Tensor.training = False
Tensor.no_grad = True
for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","):
nm = f"spec_{m}"

View File

@ -608,7 +608,7 @@ def train_retinanet():
if getenv("RESET_STEP", 1): _train_step.reset()
with Tensor.train(mode=False), Tensor.test():
with Tensor.train(mode=False):
if not RUNMLPERF:
i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True))
else:
@ -791,7 +791,6 @@ def train_unet3d():
return loss.realize()
@Tensor.train(mode=False)
@Tensor.test()
def eval_step(model, x, y):
y_hat, y = sliding_window_inference(model, x, y, gpus=GPUS)
y_hat, y = Tensor(y_hat), Tensor(y, requires_grad=False)

View File

@ -5,7 +5,7 @@
"system_name": "tinybox 8xMI300X",
"number_of_nodes": "1",
"host_processors_per_node": "2",
"host_processor_model_name": "AMD EPYC 9354 32-Core Processor",
"host_processor_model_name": "AMD EPYC 9354",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",
@ -18,7 +18,7 @@
"host_networking_topology": "",
"host_memory_configuration": "24x 96GB DDR5",
"accelerators_per_node": "8",
"accelerator_model_name": "AMD Instinct MI300X",
"accelerator_model_name": "AMD Instinct MI300X 192GB HBM3",
"accelerator_host_interconnect": "PCIe 5.0 x16",
"accelerator_frequency": "",
"accelerator_on-chip_memories": "",
@ -30,10 +30,9 @@
"hw_notes": "",
"framework": "tinygrad, branch mlperf_training_v5.0",
"other_software_stack": {
"python": "3.10.16",
"ROCm": "3.0.0+94441cb"
"python": "3.10.16",
"ROCm": "3.0.0+94441cb"
},
"operating_system": "Ubuntu 24.04.1 LTS",
"sw_notes": ""
}

View File

@ -5,7 +5,7 @@
"system_name": "tinybox green",
"number_of_nodes": "1",
"host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
"host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",

View File

@ -5,7 +5,7 @@
"system_name": "tinybox red",
"number_of_nodes": "1",
"host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
"host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32",
"host_processor_vcpu_count": "64",
"host_processor_frequency": "",

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
# export BEAM_LOG_SURPASS_MAX=1
# export BASEDIR="/raid/datasets/wiki"
export RESET_STEP=1
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,14 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,17 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,29 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_8xMI300X"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log"
# init # TODO: without DEBUG=2 it hangs
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,16 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,27 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." NV=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,17 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export RESET_STEP=1
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,32 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_red_${DATETIME}_${SEED}.log"
export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang?
# init
sleep 5 && sudo rmmod amdgpu || true
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
# TODO: AM driver resulted in nan
sudo modprobe amdgpu
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

View File

@ -0,0 +1,13 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
export BENCHMARK=10 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
export EVAL_START_EPOCH=3 EVAL_FREQ=4
export WANDB=1 PARALLEL=0
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,25 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." NV=1
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
# pip install -e ".[mlperf]"
export LOGMLPERF=${LOGMLPERF:-1}
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

Some files were not shown because too many files have changed in this diff Show More