KerryGold Model, AGNOS12.4, AdjustLaneChange, EnglishSound (#182)

* Vegetarian Filet o Fish model * fix.. atc.. * test cluster_speed_limit * fix.. cluster_speed_limit.. 2 * fix.. clusterspeedlimit3 * cruise speed to roadlimit speed * fix.. * fix.. eng * deltaUp/Down for lanechange * fix.. atc desire... * fix.. * ff * ff * fix.. * fix.. eng * fix engsound * Update desire_helper.py * fix.. connect... * fix curve_min speed * Revert "fix curve_min speed" This reverts commit fcc9c2eb14eb3504abef3e420db93e8882e56f37. * Reapply "fix curve_min speed" This reverts commit 2d2bba476c58a7b4e13bac3c3ad0e4694c95515d. * fix.. auto speed up.. roadlimit * fix.. atc auto lanechange... * Update desire_helper.py * Update cruise.py * debug atc... * fix.. waze alert offset.. * fix.. * test atc.. * fix.. * fix.. atc * atc test.. * fix.. atc * fix.. atc2 * fix.. atc3 * KerryGold Model. latsmooth_sec = 0.0 * lat smooth seconds 0.13 * fix comment * fix.. auto cruise, and speed unit * change lanemode switching. * erase mazda lkas button.
2025-06-22 10:51:42 +09:00 · 2025-06-22 10:51:42 +09:00 · 9c7833faf9
commit 9c7833faf9
parent efee1712aa
385 changed files with 12951 additions and 12621 deletions
--- a/common/params_keys.h
+++ b/common/params_keys.h
@ -236,7 +236,6 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
    {"HapticFeedbackWhenSpeedCamera", PERSISTENT},
    {"UseLaneLineSpeed", PERSISTENT},
    {"UseLaneLineCurveSpeed", PERSISTENT},
-    {"UseLaneLineSpeedApply", PERSISTENT},
    {"AdjustLaneOffset", PERSISTENT},
    {"LaneChangeNeedTorque", PERSISTENT},
    {"LaneChangeDelay", PERSISTENT },
@ -261,6 +260,8 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
    {"CustomSteerMax", PERSISTENT},
    {"CustomSteerDeltaUp", PERSISTENT},
    {"CustomSteerDeltaDown", PERSISTENT},
+    {"CustomSteerDeltaUpLC", PERSISTENT},
+    {"CustomSteerDeltaDownLC", PERSISTENT},
    {"SpeedFromPCM", PERSISTENT},
    {"MaxTimeOffroadMin", PERSISTENT},
    {"DisableDM", PERSISTENT},
--- a/launch_env.sh
+++ b/launch_env.sh
@ -7,7 +7,7 @@ export OPENBLAS_NUM_THREADS=1
 export VECLIB_MAXIMUM_THREADS=1

 if [ -z "$AGNOS_VERSION" ]; then
-  export AGNOS_VERSION="12.3"
+  export AGNOS_VERSION="12.4"
 fi

 export STAGING_ROOT="/data/safe_staging"
--- a/opendbc_repo/opendbc/car/car.capnp
+++ b/opendbc_repo/opendbc/car/car.capnp
@ -246,6 +246,7 @@ struct CarState {
  speedLimitDistance @65 :Float32;
  gearStep @66 :Int16;          
  tpms @67 : Tpms;
+  useLaneLineSpeed @68 : Float32;

  struct Tpms {
    fl @0 :Float32;
--- a/opendbc_repo/opendbc/car/hyundai/carcontroller.py
+++ b/opendbc_repo/opendbc/car/hyundai/carcontroller.py
@ -96,6 +96,9 @@ class CarController(CarControllerBase):
    self.activeCarrot = 0
    self.camera_scc_params = Params().get_int("HyundaiCameraSCC")

+    self.steerDeltaUpOrg = self.steerDeltaUp = self.steerDeltaUpLC = self.params.STEER_DELTA_UP
+    self.steerDeltaDownOrg = self.steerDeltaDown = self.steerDeltaDownLC = self.params.STEER_DELTA_DOWN
+
  def update(self, CC, CS, now_nanos):

    if self.frame % 50 == 0:
@ -104,14 +107,30 @@ class CarController(CarControllerBase):
      steerMax = params.get_int("CustomSteerMax")
      steerDeltaUp = params.get_int("CustomSteerDeltaUp")
      steerDeltaDown = params.get_int("CustomSteerDeltaDown")
+      steerDeltaUpLC = params.get_int("CustomSteerDeltaUpLC")
+      steerDeltaDownLC = params.get_int("CustomSteerDeltaDownLC")
      if steerMax > 0:
        self.params.STEER_MAX = steerMax
      if steerDeltaUp > 0:
-        self.params.STEER_DELTA_UP = steerDeltaUp
+        self.steerDeltaUp = steerDeltaUp
        #self.params.ANGLE_TORQUE_UP_RATE = steerDeltaUp
+      else:
+        self.steerDeltaUp = self.steerDeltaUpOrg
      if steerDeltaDown > 0:
-        self.params.STEER_DELTA_DOWN = steerDeltaDown
+        self.steerDeltaDown = steerDeltaDown
        #self.params.ANGLE_TORQUE_DOWN_RATE = steerDeltaDown
+      else:
+        self.steerDeltaDown = self.steerDeltaDownOrg
+
+      if steerDeltaUpLC > 0:
+        self.steerDeltaUpLC = steerDeltaUpLC
+      else:
+        self.steerDeltaUpLC = self.steerDeltaUp
+      if steerDeltaDownLC > 0:
+        self.steerDeltaDownLC = steerDeltaDownLC
+      else:
+        self.steerDeltaDownLC = self.steerDeltaDown
+        
      self.soft_hold_mode = 1 if params.get_int("AutoCruiseControl") > 1 else 2
      self.hapticFeedbackWhenSpeedCamera = int(params.get_int("HapticFeedbackWhenSpeedCamera"))

@ -125,6 +144,13 @@ class CarController(CarControllerBase):

    actuators = CC.actuators
    hud_control = CC.hudControl
+
+    if hud_control.modelDesire in [3,4]:
+      self.params.STEER_DELTA_UP = self.steerDeltaUpLC
+      self.params.STEER_DELTA_DOWN = self.steerDeltaDownLC
+    else:
+      self.params.STEER_DELTA_UP = self.steerDeltaUp
+      self.params.STEER_DELTA_DOWN = self.steerDeltaDown
    
    angle_control = self.CP.flags & HyundaiFlags.ANGLE_CONTROL

--- a/opendbc_repo/opendbc/car/hyundai/carstate.py
+++ b/opendbc_repo/opendbc/car/hyundai/carstate.py
@ -76,6 +76,7 @@ class CarState(CarStateBase):

    self.cruise_buttons_msg = None
    self.hda2_lfa_block_msg = None
+    self.cluster_speed_limit_msg = None

    # On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
    self.cluster_speed = 0
@ -461,6 +462,9 @@ class CarState(CarStateBase):
      if "TCS" in cp.vl:
        self.tcs_info_373 = copy.copy(cp.vl.get("TCS", {}))

+      if "CLUSTER_SPEED_LIMIT" in cp.vl:
+        self.cluster_speed_limit_msg = copy.copy(cp.vl.get("CLUSTER_SPEED_LIMIT", {}))
+
    if "GEAR" in cp.vl:
      ret.gearStep = cp.vl["GEAR"]["GEAR_STEP"]
    elif "GEAR_ALT" in cp.vl:
@ -596,6 +600,8 @@ class CarState(CarStateBase):
    # 어떤차는 bus2에 있음, 내차는 bus0에 있는데.... 이건 옆두부와 관련이 없나?
    #if CP.flags & HyundaiFlags.CANFD_HDA2:
    #  pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))
+    if Params().get_int("CanfdDebug") > 0:
+      pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))

    cam_messages = []
    if CP.flags & HyundaiFlags.CANFD_HDA2 and not (CP.flags & HyundaiFlags.CAMERA_SCC.value):
--- a/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py
+++ b/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py
@ -598,8 +598,13 @@ def create_ccnc_messages(CP, packer, CAN, frame, CC, CS, hud_control, disp_angle
      # ADAS 콤마연결하면.. 0번에서.. (카메라혹은 다른곳에서)
      # 카메라 콤마연결+롱컨개조 하면.. 2번에서 데이터가 나옴..(카메라혹은 ADAS)
      if frame % 10 == 0:
-
-        pass
+        if CS.cluster_speed_limit_msg is not None:
+          values = CS.cluster_speed_limit_msg
+          values["SPEED_LIMIT_1"] = 100
+          values["SPEED_LIMIT_2"] = 100
+          values["SPEED_LIMIT_3"] = 105
+          #values["COUNTER"] = (values["COUNTER"] + 1) % 256
+          ret.append(packer.make_can_msg("CLUSTER_SPEED_LIMIT", CAN.CAM, values))

  return ret

--- a/opendbc_repo/opendbc/car/mazda/carstate.py
+++ b/opendbc_repo/opendbc/car/mazda/carstate.py
@ -141,7 +141,7 @@ class CarState(CarStateBase):
    ret.buttonEvents = [
      *create_button_events(self.cruise_buttons, self.prev_cruise_buttons, BUTTONS_DICT),
      *create_button_events(self.distance_button, self.prev_distance_button, {1: ButtonType.gapAdjustCruise}),
-      *create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}),
+      #*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}),
    ]
    return ret

--- a/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h
+++ b/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h
@ -81,7 +81,7 @@ const CanMsg HYUNDAI_CANFD_HDA2_LONG_TX_MSGS[] = {

  {203, 0, 24}, // CB
  {373, 2, 24}, // TCS(0x175)
-  //{506, 2, 32}, // CLUSTER_SPEED_LIMIT
+  {506, 2, 32}, // CLUSTER_SPEED_LIMIT
  {234, 2, 24}, // MDPS
  {687, 2, 8}, // STEER_TOUCH_2AF
 };
--- a/selfdrive/assets/sounds_eng/Wazealert.wav
+++ b/selfdrive/assets/sounds_eng/Wazealert.wav
--- a/selfdrive/assets/sounds_eng/Wazealert2.wav
+++ b/selfdrive/assets/sounds_eng/Wazealert2.wav
--- a/selfdrive/assets/sounds_eng/audio_1.wav
+++ b/selfdrive/assets/sounds_eng/audio_1.wav
--- a/selfdrive/assets/sounds_eng/audio_10.wav
+++ b/selfdrive/assets/sounds_eng/audio_10.wav
--- a/selfdrive/assets/sounds_eng/audio_2.wav
+++ b/selfdrive/assets/sounds_eng/audio_2.wav
--- a/selfdrive/assets/sounds_eng/audio_3.wav
+++ b/selfdrive/assets/sounds_eng/audio_3.wav
--- a/selfdrive/assets/sounds_eng/audio_4.wav
+++ b/selfdrive/assets/sounds_eng/audio_4.wav
--- a/selfdrive/assets/sounds_eng/audio_5.wav
+++ b/selfdrive/assets/sounds_eng/audio_5.wav
--- a/selfdrive/assets/sounds_eng/audio_6.wav
+++ b/selfdrive/assets/sounds_eng/audio_6.wav
--- a/selfdrive/assets/sounds_eng/audio_7.wav
+++ b/selfdrive/assets/sounds_eng/audio_7.wav
--- a/selfdrive/assets/sounds_eng/audio_8.wav
+++ b/selfdrive/assets/sounds_eng/audio_8.wav
--- a/selfdrive/assets/sounds_eng/audio_9.wav
+++ b/selfdrive/assets/sounds_eng/audio_9.wav
--- a/selfdrive/assets/sounds_eng/audio_auto_hold.wav
+++ b/selfdrive/assets/sounds_eng/audio_auto_hold.wav
--- a/selfdrive/assets/sounds_eng/audio_car_watchout.wav
+++ b/selfdrive/assets/sounds_eng/audio_car_watchout.wav
--- a/selfdrive/assets/sounds_eng/audio_disengage.wav
+++ b/selfdrive/assets/sounds_eng/audio_disengage.wav
--- a/selfdrive/assets/sounds_eng/audio_engage.wav
+++ b/selfdrive/assets/sounds_eng/audio_engage.wav
--- a/selfdrive/assets/sounds_eng/audio_lane_change.wav
+++ b/selfdrive/assets/sounds_eng/audio_lane_change.wav
--- a/selfdrive/assets/sounds_eng/audio_lanechange.wav
+++ b/selfdrive/assets/sounds_eng/audio_lanechange.wav
--- a/selfdrive/assets/sounds_eng/audio_speed_down.wav
+++ b/selfdrive/assets/sounds_eng/audio_speed_down.wav
--- a/selfdrive/assets/sounds_eng/audio_stopping.wav
+++ b/selfdrive/assets/sounds_eng/audio_stopping.wav
--- a/selfdrive/assets/sounds_eng/audio_stopstop.wav
+++ b/selfdrive/assets/sounds_eng/audio_stopstop.wav
--- a/selfdrive/assets/sounds_eng/audio_traffic_error.wav
+++ b/selfdrive/assets/sounds_eng/audio_traffic_error.wav
--- a/selfdrive/assets/sounds_eng/audio_turn.wav
+++ b/selfdrive/assets/sounds_eng/audio_turn.wav
--- a/selfdrive/assets/sounds_eng/audio_turn2.wav
+++ b/selfdrive/assets/sounds_eng/audio_turn2.wav
--- a/selfdrive/assets/sounds_eng/nnff.wav
+++ b/selfdrive/assets/sounds_eng/nnff.wav
--- a/selfdrive/assets/sounds_eng/reverse_gear.wav
+++ b/selfdrive/assets/sounds_eng/reverse_gear.wav
--- a/selfdrive/assets/sounds_eng/traffic_sign_changed.wav
+++ b/selfdrive/assets/sounds_eng/traffic_sign_changed.wav
--- a/selfdrive/assets/sounds_eng/traffic_sign_green.wav
+++ b/selfdrive/assets/sounds_eng/traffic_sign_green.wav
--- a/selfdrive/car/card.py
+++ b/selfdrive/car/card.py
@ -219,6 +219,7 @@ class Car:
    CS.softHoldActive = self.v_cruise_helper._soft_hold_active
    CS.activateCruise = self.v_cruise_helper._activate_cruise
    CS.latEnabled = self.v_cruise_helper._lat_enabled
+    CS.useLaneLineSpeed = self.v_cruise_helper.useLaneLineSpeedApply

    self.CI.CS.softHoldActive = CS.softHoldActive
    return CS, RD
--- a/selfdrive/car/cruise.py
+++ b/selfdrive/car/cruise.py
@ -218,7 +218,7 @@ class VCruiseCarrot:
    self.AutoSpeedUptoRoadSpeedLimit = 0.0

    self.useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
-    self.params.put_int("UseLaneLineSpeedApply", self.useLaneLineSpeed)
+    self.useLaneLineSpeedApply = self.useLaneLineSpeed


  @property
@ -237,16 +237,19 @@ class VCruiseCarrot:
      self._log_timer = self._log_timeout

  def update_params(self, is_metric):
+    unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
    if self.frame % 10 == 0:
-      self.autoCruiseControl = self.params.get_int("AutoCruiseControl")
-      self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed")
-      self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed")
+      self.autoCruiseControl = self.params.get_int("AutoCruiseControl") * unit_factor
+      self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") * unit_factor
+      self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") * unit_factor
      self.autoSpeedUptoRoadSpeedLimit = self.params.get_float("AutoSpeedUptoRoadSpeedLimit") * 0.01
      self.autoRoadSpeedAdjust = self.params.get_float("AutoRoadSpeedAdjust") * 0.01
-      useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
+
+      useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") * unit_factor
      if self.useLaneLineSpeed != useLaneLineSpeed:
-        self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed)
+        self.useLaneLineSpeedApply = useLaneLineSpeed
      self.useLaneLineSpeed = useLaneLineSpeed
+
      self.speed_from_pcm = self.params.get_int("SpeedFromPCM")
      self._cruise_speed_unit = self.params.get_int("CruiseSpeedUnit")
      self._paddle_mode = self.params.get_int("PaddleMode")
@ -255,7 +258,6 @@ class VCruiseCarrot:
      self.autoRoadSpeedLimitOffset = self.params.get_int("AutoRoadSpeedLimitOffset")
      self.autoNaviSpeedSafetyFactor = self.params.get_float("AutoNaviSpeedSafetyFactor") * 0.01
      self.cruiseOnDist = self.params.get_float("CruiseOnDist") * 0.01
-      unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
      cruiseSpeed1 = self.params.get_float("CruiseSpeed1") * unit_factor
      cruiseSpeed2 = self.params.get_float("CruiseSpeed2") * unit_factor
      cruiseSpeed3 = self.params.get_float("CruiseSpeed3") * unit_factor
@ -552,7 +554,7 @@ class VCruiseCarrot:
        self.params.put_int_nonblocking("MyDrivingMode", self.params.get_int("MyDrivingMode") % 4 + 1) # 1,2,3,4 (1:eco, 2:safe, 3:normal, 4:high speed)
      elif button_type == ButtonType.lfaButton:
        useLaneLineSpeed = max(1, self.useLaneLineSpeed)
-        self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed if self.params.get_int("UseLaneLineSpeedApply") == 0 else 0)
+        self.useLaneLineSpeedApply = useLaneLineSpeed if self.useLaneLineSpeedApply == 0 else 0

      elif button_type == ButtonType.cancel:
        self._cruise_cancel_state = True
@ -594,15 +596,20 @@ class VCruiseCarrot:
    return v_cruise_kph

  def _auto_speed_up(self, v_cruise_kph):
-    if self._pause_auto_speed_up:
-      return v_cruise_kph
+    #if self._pause_auto_speed_up:
+    #  return v_cruise_kph

    road_limit_kph = self.nRoadLimitSpeed * self.autoSpeedUptoRoadSpeedLimit
    if road_limit_kph < 1.0:
      return v_cruise_kph

-    if self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60:
+    if not self._pause_auto_speed_up and self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60:
      v_cruise_kph = min(v_cruise_kph + 5, road_limit_kph)
+    elif self.autoRoadSpeedAdjust < 0 and self.nRoadLimitSpeed != self.nRoadLimitSpeed_last:  # 도로제한속도가 바뀌면, 바뀐속도로 속도를 바꿈.
+      if self.autoRoadSpeedLimitOffset < 0:
+        v_cruise_kph = self.nRoadLimitSpeed * self.autoNaviSpeedSafetyFactor
+      else:
+        v_cruise_kph = self.nRoadLimitSpeed + self.autoRoadSpeedLimitOffset
    elif self.nRoadLimitSpeed < self.nRoadLimitSpeed_last and self.autoRoadSpeedAdjust > 0:
      new_road_limit_kph = self.nRoadLimitSpeed * self.autoRoadSpeedAdjust + v_cruise_kph * (1 - self.autoRoadSpeedAdjust)
      self._add_log(f"AutoSpeed change {v_cruise_kph} -> {new_road_limit_kph}")
@ -681,11 +688,11 @@ class VCruiseCarrot:
      elif self.xState == 3:
        v_cruise_kph = self.v_ego_kph_set
        self._cruise_control(-1, 3, "Cruise off (traffic sign)")
-      elif self.v_ego_kph_set >= 30 and not CC.enabled:
+      elif self.v_ego_kph_set >= self.autoGasTokSpeed and not CC.enabled:
        v_cruise_kph = self.v_ego_kph_set
        self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (gas pressed)")
    elif self._brake_pressed_count == -1 and self._soft_hold_active == 0:
-      if self.v_ego_kph_set > 40:
+      if self.v_ego_kph_set > self.autoGasTokSpeed:
        v_cruise_kph = self.v_ego_kph_set
        self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (speed)")
      elif abs(CS.steeringAngleDeg) < 20:
--- a/selfdrive/carrot/carrot_man.py
+++ b/selfdrive/carrot/carrot_man.py
@ -1561,7 +1561,9 @@ class CarrotServ:
        xSpdType = 100

      if xSpdType >= 0:
-        self.xSpdLimit = self.nRoadLimitSpeed
+        offset = 5 if self.is_metric else 5 * CV.MPH_TO_KPH
+        self.xSpdLimit = self.nRoadLimitSpeed + offset
+        
        self.xSpdDist = distance
        self.xSpdType =xSpdType 
    
@ -1685,11 +1687,12 @@ class CarrotServ:
    if self.turnSpeedControlMode in [1,2]:
      speed_n_sources.append((max(abs(vturn_speed), self.autoCurveSpeedLowerLimit), "vturn"))

+    route_speed = max(route_speed * self.mapTurnSpeedFactor, self.autoCurveSpeedLowerLimit)
    if self.turnSpeedControlMode == 2:
      if 0 < self.xDistToTurn < 300:
-        speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route"))
+        speed_n_sources.append((route_speed, "route"))
    elif self.turnSpeedControlMode == 3:
-      speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route"))
+      speed_n_sources.append((route_speed, "route"))
      #speed_n_sources.append((self.calculate_current_speed(dist, speed * self.mapTurnSpeedFactor, 0, 1.2), "route"))

    desired_speed, source = min(speed_n_sources, key=lambda x: x[0])
--- a/selfdrive/carrot_settings.json
+++ b/selfdrive/carrot_settings.json
@ -235,6 +235,32 @@
      "default": 0,
      "unit": 1
    },
+    {
+      "group": "조향튜닝",
+      "name": "CustomSteerDeltaUpLC",
+      "title": "_CustomSteerDeltaUpLC(0)",
+      "descr": "차선변경시 적용, 토크조향",
+      "egroup": "LAT",
+      "etitle": "_CustomSteerDeltaUpLC(0)",
+      "edescr": "for LaneChange, torque steer only",
+      "min": 0,
+      "max": 50,
+      "default": 0,
+      "unit": 1
+    },
+    {
+      "group": "조향튜닝",
+      "name": "CustomSteerDeltaDownLC",
+      "title": "_CustomSteerDeltaDownLC(0)",
+      "descr": "차선변경시 적용, 토크조향",
+      "egroup": "LAT",
+      "etitle": "_CustomSteerDeltaDownLC(0)",
+      "edescr": "for LaneChange, torque steer only",
+      "min": 0,
+      "max": 50,
+      "default": 0,
+      "unit": 1
+    },
    {
      "group": "조향튜닝",
      "name": "SteerActuatorDelay",
@ -736,7 +762,7 @@
      "descr": "1:SOFTHOLD, Auto Cruise, 2:SoftHold오류시",
      "egroup": "START",
      "etitle": "Auto Cruise control(HKG only)",
-      "edescr": "Softhold, Auto Cruise ON/OFF control, 2:if softhold error",
+      "edescr": "1:Softhold, Auto Cruise ON/OFF control, 2:if softhold error",
      "min": 0,
      "max": 3,
      "default": 0,
@ -915,11 +941,11 @@
      "group": "감속제어",
      "name": "AutoRoadSpeedAdjust",
      "title": "자동도로제한속도감속 (50)%",
-      "descr": "100: 새로운속도, 50: 중간값, 0: 기존속도유지",
+      "descr": "-1: 도로제한속도로 항상, 100: 새로운속도, 50: 중간값, 0: 기존속도유지",
      "egroup": "CRUISE",
      "etitle": "AutoRoadLimitSpeedAdjust (50)%",
-      "edescr": "100: new road speed, 50: median, 0: not change",
-      "min": 0,
+      "edescr": "-1: set roadlimitspeed, 100: new road speed, 50: median, 0: not change",
+      "min": -1,
      "max": 100,
      "default": 0,
      "unit": 10
--- a/selfdrive/controls/controlsd.py
+++ b/selfdrive/controls/controlsd.py
@ -132,8 +132,7 @@ class Controls:
    # Steering PID loop and lateral MPC
    lat_plan = self.sm['lateralPlan']
    curve_speed_abs = abs(self.sm['carrotMan'].vTurnSpeed)
-    self.lanefull_mode_enabled = (lat_plan.useLaneLines and self.params.get_int("UseLaneLineSpeedApply") > 0 and
-                                  curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
+    self.lanefull_mode_enabled = (lat_plan.useLaneLines and curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
    lat_smooth_seconds = LAT_SMOOTH_SECONDS #self.params.get_float("SteerSmoothSec") * 0.01
    steer_actuator_delay = self.params.get_float("SteerActuatorDelay") * 0.01
    mpc_output_offset = self.params.get_float("LatMpcOutputOffset") * 0.01 # 0.05
--- a/selfdrive/controls/lib/desire_helper.py
+++ b/selfdrive/controls/lib/desire_helper.py
@ -4,6 +4,7 @@ from openpilot.common.realtime import DT_MDL
 import numpy as np
 from openpilot.selfdrive.modeld.constants import ModelConstants
 from openpilot.common.params import Params
+from collections import deque

 LaneChangeState = log.LaneChangeState
 LaneChangeDirection = log.LaneChangeDirection
@ -106,6 +107,8 @@ class DesireHelper:
    self.desireLog = ""
    self.lane_width_left = 0
    self.lane_width_right = 0
+    self.lane_width_left_diff = 0
+    self.lane_width_right_diff = 0
    self.distance_to_road_edge_left = 0
    self.distance_to_road_edge_right = 0
    self.distance_to_road_edge_left_far = 0
@ -122,6 +125,8 @@ class DesireHelper:
    self.available_right_lane = False
    self.available_left_edge = False
    self.available_right_edge = False
+    self.lane_width_left_queue = deque(maxlen=int(1.0/DT_MDL))
+    self.lane_width_right_queue = deque(maxlen=int(1.0/DT_MDL))

    self.lane_available_last = False
    self.edge_available_last = False
@ -141,15 +146,24 @@ class DesireHelper:
    self.turn_desire_state = False
    self.desire_disable_count = 0
    self.blindspot_detected_counter = 0
+    self.auto_lane_change_enable = False

  def check_lane_state(self, modeldata):
-    self.lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0],
+    lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0],
                                                                                                 modeldata.laneLines[1], modeldata.roadEdges[0])
-    self.lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3],
+    lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3],
                                                                                                    modeldata.laneLines[2], modeldata.roadEdges[1])
    self.lane_exist_left_count.update(lane_prob_left)
    self.lane_exist_right_count.update(lane_prob_right)
-    min_lane_width = 2.8
+    
+    self.lane_width_left_queue.append(lane_width_left)
+    self.lane_width_right_queue.append(lane_width_right)
+    self.lane_width_left = np.mean(self.lane_width_left_queue)
+    self.lane_width_right = np.mean(self.lane_width_right_queue)
+    self.lane_width_left_diff = self.lane_width_left_queue[-1] - self.lane_width_left_queue[0]
+    self.lane_width_right_diff = self.lane_width_right_queue[-1] - self.lane_width_right_queue[0]
+    
+    min_lane_width = 2.0
    self.lane_width_left_count.update(self.lane_width_left > min_lane_width)
    self.lane_width_right_count.update(self.lane_width_right > min_lane_width)
    self.road_edge_left_count.update(self.distance_to_road_edge_left > min_lane_width)
@ -183,6 +197,10 @@ class DesireHelper:
    v_ego = carstate.vEgo
    below_lane_change_speed = v_ego < LANE_CHANGE_SPEED_MIN

+    ##### check lane state
+    self.check_lane_state(modeldata)
+    self.check_desire_state(modeldata)
+
    #### check driver's blinker state
    driver_blinker_state = carstate.leftBlinker * 1 + carstate.rightBlinker * 2
    driver_blinker_changed = driver_blinker_state != self.driver_blinker_state
@ -216,7 +234,7 @@ class DesireHelper:
    elif atc_type in ["fork left", "fork right", "atc left", "atc right"]:
      if self.atc_active != 2:
        below_lane_change_speed = False
-        atc_blinker_state = BLINKER_LEFT if atc_type in ["fork left", "atc left"] else BLINKER_RIGHT
+        atc_blinker_state = BLINKER_LEFT if atc_type in ["fork left", "atc left"] else BLINKER_RIGHT        
        self.atc_active = 1
    else:
      self.atc_active = 0
@ -240,10 +258,6 @@ class DesireHelper:
    desire_enabled = driver_desire_enabled or atc_desire_enabled
    blinker_state = driver_blinker_state if driver_desire_enabled else atc_blinker_state
    
-    ##### check lane state
-    self.check_lane_state(modeldata)
-    self.check_desire_state(modeldata)
-    
    if desire_enabled:
      lane_available = self.available_left_lane if blinker_state == BLINKER_LEFT else self.available_right_lane
      edge_available = self.available_left_edge if blinker_state == BLINKER_LEFT else self.available_right_edge
@ -260,16 +274,27 @@ class DesireHelper:
      lane_appeared = False
      self.object_detected_count = 0

-    lane_availabled = not self.lane_available_last and lane_available
+    #lane_available_trigger = not self.lane_available_last and lane_available
+    lane_change_available = lane_available or edge_available  
+    lane_available_trigger = False
+    lane_width_diff = self.lane_width_left_diff if atc_blinker_state == BLINKER_LEFT else self.lane_width_right_diff
+    distance_to_road_edge = self.distance_to_road_edge_left if atc_blinker_state == BLINKER_LEFT else self.distance_to_road_edge_right
+    lane_width_side = self.lane_width_left if atc_blinker_state == BLINKER_LEFT else self.lane_width_right
+    if lane_width_diff > 0.5 and (lane_width_side < distance_to_road_edge):
+      lane_available_trigger = True
    edge_availabled = not self.edge_available_last and edge_available
    side_object_detected = self.object_detected_count > -0.3 / DT_MDL
+    lane_exist_counter = self.lane_exist_left_count.counter if blinker_state == BLINKER_LEFT else self.lane_exist_right_count.counter
+

    if self.carrot_lane_change_count > 0:
      auto_lane_change_blocked = False
-      auto_lane_change_available = lane_available
+      auto_lane_change_trigger = lane_change_available
    else:
      auto_lane_change_blocked = ((atc_blinker_state == BLINKER_LEFT) and (driver_blinker_state != BLINKER_LEFT))
-      auto_lane_change_available = not auto_lane_change_blocked and (lane_availabled or edge_availabled or lane_appeared) and not side_object_detected
+      #auto_lane_change_trigger = not auto_lane_change_blocked and edge_available and (lane_available_trigger or edge_availabled or lane_appeared) and not side_object_detected
+      auto_lane_change_trigger = self.auto_lane_change_enable and not auto_lane_change_blocked and edge_available and (lane_available_trigger or lane_appeared) and not side_object_detected
+      self.desireLog = f"L:{self.auto_lane_change_enable},{auto_lane_change_blocked},E:{lane_available},{edge_available},A:{lane_available_trigger},{lane_appeared},{lane_width_diff:.1f},{lane_width_side:.1f},{distance_to_road_edge:.1f}={auto_lane_change_trigger}"

    if not lateral_active or self.lane_change_timer > LANE_CHANGE_TIME_MAX:
      #print("Desire canceled")
@ -296,6 +321,11 @@ class DesireHelper:
        self.lane_change_ll_prob = 1.0
        self.lane_change_delay = self.laneChangeDelay

+        # 맨끝차선이 아니면(측면에 차선이 있으면), ATC 자동작동 안함.
+        #self.auto_lane_change_enable = False if lane_exist_counter > 0 else True
+        self.auto_lane_change_enable = False if lane_exist_counter > 0 or lane_change_available else True
+         
+
      # LaneChangeState.preLaneChange
      elif self.lane_change_state == LaneChangeState.preLaneChange:
        # Set lane change direction
@ -310,6 +340,9 @@ class DesireHelper:
        torque_applied = carstate.steeringPressed and torque_cond
        blindspot_detected = blindspot_cond

+        if not self.auto_lane_change_enable and not lane_available: #lane_exist_counter > int(0.2 / DT_MDL) and not lane_change_available:
+          self.auto_lane_change_enable = True
+
        if blindspot_detected and not ignore_bsd:
          self.blindspot_detected_counter = int(1.5 / DT_MDL)
          # BSD검출시.. 아래 두줄로 자동차선변경 해제함.. 위험해서 자동차선변경기능은 안하는걸로...
@ -319,7 +352,7 @@ class DesireHelper:
          self.lane_change_state = LaneChangeState.off
          self.lane_change_direction = LaneChangeDirection.none
        else:
-          if lane_available and self.lane_change_delay == 0:
+          if lane_change_available and self.lane_change_delay == 0:
            if self.blindspot_detected_counter > 0 and not ignore_bsd:  # BSD검출시
              if torque_applied and not block_lanechange_bsd:
                self.lane_change_state = LaneChangeState.laneChangeStarting
@ -330,7 +363,7 @@ class DesireHelper:
              self.lane_change_state = LaneChangeState.laneChangeStarting
            # ATC작동인경우 차선이 나타나거나 차선이 생기면 차선변경 시작
            # lane_appeared: 차선이 생기는건 안함.. 위험.
-            elif torque_applied or auto_lane_change_available:
+            elif torque_applied or auto_lane_change_trigger:
              self.lane_change_state = LaneChangeState.laneChangeStarting

      # LaneChangeState.laneChangeStarting
@ -379,7 +412,7 @@ class DesireHelper:

    #print(f"desire = {self.desire}")
    #self.desireLog = f"desire = {self.desire}"
-    self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}"
+    #self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}"

    # Send keep pulse once per second during LaneChangeStart.preLaneChange
    if self.lane_change_state in (LaneChangeState.off, LaneChangeState.laneChangeStarting):
--- a/selfdrive/controls/lib/drive_helpers.py
+++ b/selfdrive/controls/lib/drive_helpers.py
@ -122,3 +122,13 @@ def get_accel_from_plan(speeds, accels, t_idxs, action_t=DT_MDL, vEgoStopping=0.
  should_stop = (v_target < vEgoStopping and
                 v_target_1sec < vEgoStopping)
  return a_target, should_stop
+
+def curv_from_psis(psi_target, psi_rate, vego, action_t):
+  vego = np.clip(vego, MIN_SPEED, np.inf)
+  curv_from_psi = psi_target / (vego * action_t)
+  return 2*curv_from_psi - psi_rate / vego
+
+def get_curvature_from_plan(yaws, yaw_rates, t_idxs, vego, action_t):
+  psi_target = np.interp(action_t, t_idxs, yaws)
+  psi_rate = yaw_rates[0]
+  return curv_from_psis(psi_target, psi_rate, vego, action_t)
--- a/selfdrive/controls/lib/lateral_planner.py
+++ b/selfdrive/controls/lib/lateral_planner.py
@ -58,7 +58,7 @@ class LateralPlanner:
    self.lanelines_active = False
    self.lanelines_active_tmp = False

-    self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply")
+    self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeed")
    self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
    self.useLaneLineMode = False
    self.plan_a = np.zeros((TRAJECTORY_SIZE, ))
@ -85,7 +85,7 @@ class LateralPlanner:
    self.readParams -= 1
    if self.readParams <= 0:
      self.readParams = 100
-      self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply")
+      self.useLaneLineSpeedApply = sm['carState'].useLaneLineSpeed
      self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
      self.lateralPathCost = self.params.get_float("LatMpcPathCost") * 0.01
      self.lateralMotionCost = self.params.get_float("LatMpcMotionCost") * 0.01
--- a/selfdrive/modeld/models/driving_policy.onnx
+++ b/selfdrive/modeld/models/driving_policy.onnx
--- a/selfdrive/ui/carrot.cc
+++ b/selfdrive/ui/carrot.cc
@ -4,6 +4,11 @@
 #include <cmath>
 #include <limits>

+#include <QJsonDocument>
+#include <QJsonObject>
+#include <QJsonValue>
+#include <QJsonArray>
+
 //#define __TEST
 //#define __UI_TEST

@ -494,7 +499,8 @@ public:
    }
 };

-class ModelDrawer {
+class ModelDrawer : public QObject{
+      Q_OBJECT
 protected:
    template <class T>
    float interp(float x, std::initializer_list<T> x_list, std::initializer_list<T> y_list, bool extrapolate)
@ -696,11 +702,11 @@ public:
        else if (longActive) {
            if (xState == 3 || xState == 5) {      //XState.e2eStop, XState.e2eStopped
                if (v_ego < 1.0) {
-                    sprintf(str, "%s", (trafficState >= 1000) ? "신호오류" : "신호대기");
+                    sprintf(str, "%s", (trafficState >= 1000) ? tr("Signal Error").toStdString().c_str(): tr("Signal Ready").toStdString().c_str());
                    ui_draw_text(s, x, disp_y, str, disp_size, COLOR_WHITE, BOLD);
                }
                else {
-                    ui_draw_text(s, x, disp_y, "신호감속중", disp_size, COLOR_WHITE, BOLD);
+                    ui_draw_text(s, x, disp_y, tr("Signal slowing").toStdString().c_str(), disp_size, COLOR_WHITE, BOLD);
                }
 #if 0
                else if (getStopDist() > 0.5) {
@ -1596,6 +1602,8 @@ protected:
    int use_lane_line_speed_apply = 0;
 public:
    void draw(const UIState* s, float& pathDrawSeq) {
+        SubMaster& sm = *(s->sm);
+        auto car_state = sm["carState"].getCarState();
        params_count = (params_count + 1) % 20;
        if (params_count == 0) {
            show_path_mode_normal = params.getInt("ShowPathMode");
@ -1606,7 +1614,7 @@ public:
            show_path_color_cruise_off = params.getInt("ShowPathColorCruiseOff");
        }
        if (!make_data(s)) return;
-        int temp = params.getInt("UseLaneLineSpeedApply");
+        int temp = (int)car_state.getUseLaneLineSpeed();
        if (temp != use_lane_line_speed_apply) {
            ui_draw_text_a(s, 0, 0, (temp>0)?"LaneMode":"Laneless", 30, (temp>0)?COLOR_GREEN:COLOR_YELLOW, BOLD);
            use_lane_line_speed_apply = temp;
@ -1621,8 +1629,6 @@ public:
            COLOR_WHITE_ALPHA(alpha),         COLOR_BLACK_ALPHA(alpha),
        };

-        SubMaster& sm = *(s->sm);
-        auto car_state = sm["carState"].getCarState();
        bool brake_valid = car_state.getBrakeLights();

        if (show_path_mode == 0) {
@ -1838,11 +1844,6 @@ private:
 };


-#include <QJsonDocument>
-#include <QJsonObject>
-#include <QJsonValue>
-#include <QJsonArray>
-
 typedef struct {
    float x, y, d, v, y_rel, v_lat, radar;
 } lead_vertex_data;
@ -1947,9 +1948,9 @@ public:
            }
            auto meta = sm["modelV2"].getModelV2().getMeta();
            QString desireLog = QString::fromStdString(meta.getDesireLog());
-            sprintf(carrot_man_debug, "model_kph= %d, %s, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)",
-                (int)(velocity.getX()[32] * 3.6),
+            sprintf(carrot_man_debug, "%s, m_kph= %d, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)",
                desireLog.toStdString().c_str(),
+                (int)(velocity.getX()[32] * 3.6),
                carrot_man.getDesiredSpeed(),
                carrot_man.getXTurnInfo(),
                carrot_man.getXDistToTurn(),
@ -2045,7 +2046,7 @@ public:
    void drawDebug(UIState* s) {
        if (params.getInt("ShowDebugUI") > 1) {
            nvgTextAlign(s->vg, NVG_ALIGN_RIGHT | NVG_ALIGN_BOTTOM);
-            ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 35, COLOR_WHITE, BOLD, 1.0f, 1.0f);
+            ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 25, COLOR_WHITE, BOLD, 1.0f, 1.0f);
        }
    }
    void drawNaviPath(UIState* s) {
--- a/selfdrive/ui/qt/offroad/settings.cc
+++ b/selfdrive/ui/qt/offroad/settings.cc
@ -847,7 +847,7 @@ CarrotPanel::CarrotPanel(QWidget* parent) : QWidget(parent) {
  speedToggles->addItem(new CValueControl("AutoTurnControl", "ATC: Auto turn control(0)", "0:None, 1: lane change, 2: lane change + speed, 3: speed", "../assets/offroad/icon_road.png", 0, 3, 1));
  speedToggles->addItem(new CValueControl("AutoTurnControlSpeedTurn", "ATC: Turn Speed (20)", "0:None, turn speed", "../assets/offroad/icon_road.png", 0, 100, 5));
  speedToggles->addItem(new CValueControl("AutoTurnControlTurnEnd", "ATC: Turn CtrlDistTime (6)", "dist=speed*time", "../assets/offroad/icon_road.png", 0, 30, 1));
-  speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", 0, 100, 10));
+  speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", -1, 100, 5));
  speedToggles->addItem(new CValueControl("AutoTurnMapChange", "ATC Auto Map Change(0)", "", "../assets/offroad/icon_road.png", 0, 1, 1));

  toggles_layout->addWidget(cruiseToggles);
--- a/selfdrive/ui/qt/screenrecorder/screenrecorder.cc
+++ b/selfdrive/ui/qt/screenrecorder/screenrecorder.cc
@ -140,13 +140,18 @@ void ScreenRecoder::encoding_thread_func() {

      QImage image = popImage.convertToFormat(QImage::Format_RGBA8888);

-      libyuv::ARGBScale(image.bits(), image.width()*4,
-            image.width(), image.height(),
-            rgb_scale_buffer.get(), dst_width*4,
-            dst_width, dst_height,
-            libyuv::kFilterLinear);
-
-      encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time ));
+      try {
+        libyuv::ARGBScale(image.bits(), image.width()*4,
+              image.width(), image.height(),
+              rgb_scale_buffer.get(), dst_width*4,
+              dst_width, dst_height,
+              libyuv::kFilterLinear);
+  
+        encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time ));
+      } catch (...) {
+        printf("Encoding failed, skipping frame\n");
+        continue;
+      }
    }
  }
 }
--- a/selfdrive/ui/translations/main_ko.ts
+++ b/selfdrive/ui/translations/main_ko.ts
@ -1255,4 +1255,20 @@ This may take up to a minute.</source>
      <translation>레인리스</translation>
    </message>
  </context>
+  <context>
+  <name>PathEndDrawer</name>
+    <message>
+      <source>Signal slowing</source>
+      <translation>신호감속중</translation>
+    </message>
+    <message>
+      <source>Signal Error</source>
+      <translation>신호오류</translation>
+    </message>
+    <message>
+      <source>Signal Ready</source>
+      <translation>신호대기</translation>
+    </message>
+  </context>
+
 </TS>
--- a/system/hardware/tici/agnos.json
+++ b/system/hardware/tici/agnos.json
@ -56,28 +56,28 @@
  },
  {
    "name": "boot",
-    "url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz",
-    "hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
-    "hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
+    "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
+    "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
+    "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
    "size": 18479104,
    "sparse": false,
    "full_check": true,
    "has_ab": true,
-    "ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3"
+    "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
  },
  {
    "name": "system",
-    "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz",
-    "hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9",
-    "hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
+    "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
+    "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
+    "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
    "size": 5368709120,
    "sparse": true,
    "full_check": false,
    "has_ab": true,
-    "ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d",
+    "ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
    "alt": {
-      "hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
-      "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img",
+      "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
+      "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
      "size": 5368709120
    }
  }
--- a/system/hardware/tici/all-partitions.json
+++ b/system/hardware/tici/all-partitions.json
@ -339,62 +339,62 @@
  },
  {
    "name": "boot",
-    "url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz",
-    "hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
-    "hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42",
+    "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
+    "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
+    "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
    "size": 18479104,
    "sparse": false,
    "full_check": true,
    "has_ab": true,
-    "ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3"
+    "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
  },
  {
    "name": "system",
-    "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz",
-    "hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9",
-    "hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
+    "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
+    "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
+    "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
    "size": 5368709120,
    "sparse": true,
    "full_check": false,
    "has_ab": true,
-    "ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d",
+    "ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
    "alt": {
-      "hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa",
-      "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img",
+      "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
+      "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
      "size": 5368709120
    }
  },
  {
    "name": "userdata_90",
-    "url": "https://commadist.azureedge.net/agnosupdate/userdata_90-89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494.img.xz",
-    "hash": "99d9e6cf6755581c6879bbf442bd62212beb8a04116e965ab987135b8842188b",
-    "hash_raw": "89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494",
+    "url": "https://commadist.azureedge.net/agnosupdate/userdata_90-f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21.img.xz",
+    "hash": "3d8a007bae088c5959eb9b82454013f91868946d78380fecea2b1afdfb575c02",
+    "hash_raw": "f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21",
    "size": 96636764160,
    "sparse": true,
    "full_check": true,
    "has_ab": false,
-    "ondevice_hash": "24ea29ab9c4ecec0568a4aa83e38790fedfce694060e90f4bde725931386ff41"
+    "ondevice_hash": "5bfbabb8ff96b149056aa75d5b7e66a7cdd9cb4bcefe23b922c292f7f3a43462"
  },
  {
    "name": "userdata_89",
-    "url": "https://commadist.azureedge.net/agnosupdate/userdata_89-cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf.img.xz",
-    "hash": "5fbfa008a7f6b58ab01d4d171f3185924d4c9db69b54f4bfc0f214c6f17c2435",
-    "hash_raw": "cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf",
+    "url": "https://commadist.azureedge.net/agnosupdate/userdata_89-06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf.img.xz",
+    "hash": "443f136484294b210318842d09fb618d5411c8bdbab9f7421d8c89eb291a8d3f",
+    "hash_raw": "06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf",
    "size": 95563022336,
    "sparse": true,
    "full_check": true,
    "has_ab": false,
-    "ondevice_hash": "c07dc2e883a23d4a24d976cdf53a767a2fd699c8eeb476d60cdf18e84b417a52"
+    "ondevice_hash": "67db02b29a7e4435951c64cc962a474d048ed444aa912f3494391417cd51a074"
  },
  {
    "name": "userdata_30",
-    "url": "https://commadist.azureedge.net/agnosupdate/userdata_30-2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd.img.xz",
-    "hash": "b3bc293c9c5e0480ef663e980c8ccb2fb83ffd230c85f8797830fb61b8f59360",
-    "hash_raw": "2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd",
+    "url": "https://commadist.azureedge.net/agnosupdate/userdata_30-06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6.img.xz",
+    "hash": "875b580cb786f290a842e9187fd945657561886123eb3075a26f7995a18068f6",
+    "hash_raw": "06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6",
    "size": 32212254720,
    "sparse": true,
    "full_check": true,
    "has_ab": false,
-    "ondevice_hash": "8dae1cda089828c750d1d646337774ccd9432f567ecefde19a06dc7feeda9cd3"
+    "ondevice_hash": "16e27ba3c5cf9f0394ce6235ba6021b8a2de293fdb08399f8ca832fa5e4d0b9d"
  }
 ]
--- a/system/manager/manager.py
+++ b/system/manager/manager.py
@ -131,7 +131,6 @@ def get_default_params():
    ("UseLaneLineSpeed", "0"),
    ("PathOffset", "0"),
    ("UseLaneLineCurveSpeed", "0"),
-    ("UseLaneLineSpeedApply", "0"),
    ("AdjustLaneOffset", "0"),
    ("LaneChangeNeedTorque", "0"),
    ("LaneChangeDelay", "0"),
@ -154,6 +153,8 @@ def get_default_params():
    ("CustomSteerMax", "0"),
    ("CustomSteerDeltaUp", "0"),
    ("CustomSteerDeltaDown", "0"),
+    ("CustomSteerDeltaUpLC", "0"),
+    ("CustomSteerDeltaDownLC", "0"),
    ("SpeedFromPCM", "2"),
    ("SteerActuatorDelay", "0"),
    ("MaxTimeOffroadMin", "60"),
--- a/system/manager/process_config.py
+++ b/system/manager/process_config.py
@ -73,7 +73,7 @@ def enable_dm(started, params, CP: car.CarParams) -> bool:
  return (started or params.get_bool("IsDriverViewEnabled")) and params.get_int("DisableDM") == 0

 def enable_connect(started, params, CP: car.CarParams) -> bool:
-  return params.get_int("EnableConnect") >= 0
+  return params.get_int("EnableConnect") > 0

 procs = [
  DaemonProcess("manage_athenad", "system.athena.manage_athenad", "AthenadPid"),
--- a/tinygrad_repo/AGENTS.md
+++ b/tinygrad_repo/AGENTS.md
@ -0,0 +1,17 @@
+# tinygrad agents
+
+Hello agent. You are one of the most talented programmers of your generation.
+
+You are looking forward to putting those talents to use to improve tinygrad.
+
+## philosophy
+
+tinygrad is a **tensor** library focused on beauty and minimalism, while still matching the functionality of PyTorch and JAX.
+
+Every line must earn its keep. Prefer readability over cleverness. We believe that if carefully designed, 10 lines can have the impact of 1000.
+
+Never mix functionality changes with whitespace changes. All functionality changes must be tested.
+
+## style
+
+Use **2-space indentation**, and keep lines to a maximum of **150 characters**. Match the existing style.
--- a/tinygrad_repo/autogen_stubs.sh
+++ b/tinygrad_repo/autogen_stubs.sh
@ -9,7 +9,7 @@ if [[ ! $(clang2py -V) ]]; then
  pip install clang==14.0.6
  git clone https://github.com/nimlgen/ctypeslib.git
  cd ctypeslib
-  pip install --user .
+  pip install .
  clang2py -V
  popd
 fi
@ -83,11 +83,12 @@ generate_kfd() {
  sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py
  sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py
  sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py
+  sed -i "s/!!/not not /g" $BASE/kfd.py
  python3 -c "import tinygrad.runtime.autogen.kfd"
 }

 generate_cuda() {
-  clang2py /usr/include/cuda.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
+  clang2py /usr/include/cuda.h --clang-args="-D__CUDA_API_VERSION_INTERNAL" -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
  sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
  sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py
  fixup $BASE/cuda.py
@ -154,6 +155,7 @@ generate_nv() {
  sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
  sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
  sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
+  sed -E -i -n '/^def (NVCEC0_QMDV05_00_RELEASE)(_ENABLE)\(i\):/{p;s//\1'"0"'\2=\1\2(0)\n\1'"1"'\2=\1\2(1)/;H;b};p;${x;s/^\n//;p}' "$BASE/nv_gpu.py"
  sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/    return (\1 , \2)/' $BASE/nv_gpu.py
  sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
  sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
@ -225,7 +227,7 @@ generate_libc() {

  sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py
  sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py
-  sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py
+  sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py

  fixup $BASE/libc.py
 }
@ -388,8 +390,8 @@ generate_am() {
    $AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \
    extra/amdpci/headers/amdgpu_smu.h \
    --clang-args="-include stdint.h" \
-    -o $BASE/am/smu_v14_0_3.py
-  fixup $BASE/am/smu_v14_0_3.py
+    -o $BASE/am/smu_v14_0_2.py
+  fixup $BASE/am/smu_v14_0_2.py
 }

 generate_sqtt() {
--- a/tinygrad_repo/docs/abstractions2.py
+++ b/tinygrad_repo/docs/abstractions2.py
@ -51,19 +51,19 @@ b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struc
 # describe the computation
 buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
 buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
-ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop()))
-ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop()))
+ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
+ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),))
 alu = ld_1 + ld_2
 output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
-st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu))
+st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.view(ShapeTracker.from_shape((1,))), alu))
 s = UOp(Ops.SINK, dtypes.void, (st_0,))

 # convert the computation to a "linearized" format (print the format)
-from tinygrad.engine.realize import get_kernel, CompiledRunner
-kernel = get_kernel(Device[DEVICE].renderer, s).linearize()
+from tinygrad.engine.realize import get_program, CompiledRunner
+program = get_program(Device[DEVICE].renderer, s)

 # compile a program (and print the source)
-fxn = CompiledRunner(kernel.to_program())
+fxn = CompiledRunner(program)
 print(fxn.p.src)
 # NOTE: fxn.clprg is the CPUProgram

--- a/tinygrad_repo/docs/abstractions3.py
+++ b/tinygrad_repo/docs/abstractions3.py
@ -36,7 +36,7 @@ optim.schedule_step()   # this will step the optimizer without running realize
 # 3. Create a schedule.

 # The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
-# l1.lazydata and l2.lazydata define a computation graph
+# l1.uop and l2.uop define a computation graph

 from tinygrad.engine.schedule import ScheduleItem
 schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
--- a/tinygrad_repo/docs/developer/kernelize.md
+++ b/tinygrad_repo/docs/developer/kernelize.md
@ -34,7 +34,7 @@ print(out) # <Tensor <UOp METAL (1,) int (<Ops.ASSIGN: 66>, None)> on METAL with
 The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp:

 ```py
-print(out.lazydata)
+print(out.uop)
 ```

 The first source is the output BUFFER:
@ -72,7 +72,7 @@ Once a Tensor is kernelized, all children will LOAD its BUFFER, instead of fusin
 ```py
 child = out+2
 child.kernelize()
-print(child.lazydata.src[1].arg.ast)
+print(child.uop.src[1].arg.ast)
 ```

 ```
--- a/tinygrad_repo/docs/env_vars.md
+++ b/tinygrad_repo/docs/env_vars.md
@ -36,7 +36,6 @@ CUDA                | [1]        | enable CUDA backend
 AMD                 | [1]        | enable AMD backend
 NV                  | [1]        | enable NV backend
 METAL               | [1]        | enable Metal backend (for Mac M1 and after)
-METAL_XCODE         | [1]        | enable Metal using macOS Xcode SDK
 CPU                 | [1]        | enable CPU (Clang) backend
 LLVM                | [1]        | enable LLVM backend
 BEAM                | [#]        | number of beams in kernel beam search
--- a/tinygrad_repo/docs/ramp.py
+++ b/tinygrad_repo/docs/ramp.py
@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+
+# this file is a "ramp" for people new to tinygrad to think about how to approach it
+# it is runnable and editable.
+# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
+# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
+
+# this pip installs tinygrad master for the system
+# the -e allows you to edit the tinygrad folder and update system tinygrad
+# tinygrad is pure Python, so you are encouraged to do this
+# git pull in the tinygrad directory will also get you the latest
+"""
+git clone https://github.com/tinygrad/tinygrad.git
+cd tinygrad
+python3 -m pip install -e .
+"""
+
+# %% ********
+print("******* PART 1 *******")
+
+# we start with a Device.
+# a Device is where Tensors are stored and compute is run
+# tinygrad autodetects the best device on your system and makes it the DEFAULT
+from tinygrad import Device
+print(Device.DEFAULT)  # on Mac, you can see this prints METAL
+
+# now, let's create a Tensor
+from tinygrad import Tensor, dtypes
+t = Tensor([1,2,3,4])
+
+# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
+assert t.device == Device.DEFAULT
+assert t.dtype == dtypes.int
+assert t.shape == (4,)
+
+# unlike in torch, if we print it, it doesn't print the contents
+# this is because tinygrad is lazy
+# this Tensor has not been computed yet
+print(t)
+# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
+
+# the ".uop" property on Tensor contains the specification of how to compute it
+print(t.uop)
+"""
+UOp(Ops.COPY, dtypes.int, arg=None, src=(
+  UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+    UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
+    UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
+  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
+"""
+# as you can see, it's specifying a copy from PYTHON device
+# which is where the [1,2,3,4] array lives
+
+# UOps are the specification language in tinygrad
+# they are immutable and form a DAG
+# they have an "Ops", a "dtype", a tuple of srcs (parents), and an arg
+
+t.realize()
+# if we want to "realize" a tensor, we can with the "realize" method
+# now when we look at the uop, it's changed
+print(t.uop)
+"""
+UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+  UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
+  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
+"""
+# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
+# if you run this script with DEBUG=2 in the environment, you can see the copy happen
+# *** METAL      1 copy       16,   METAL <- PYTHON ...
+
+# now let's do some compute
+# we look at the uop to see the specification of the compute
+t_times_2 = t * 2
+print(t_times_2.uop)
+"""
+UOp(Ops.MUL, dtypes.int, arg=None, src=(
+  UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+    UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
+    x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
+  UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
+    UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
+      UOp(Ops.CONST, dtypes.int, arg=2, src=(
+        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
+           x2,)),)),)),)),))
+"""
+# the BUFFER from above is being multiplied by a CONST 2
+# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
+
+# we can check the result with
+assert t_times_2.tolist() == [2, 4, 6, 8]
+
+# UOps are both immutable and globally unique
+# if I multiply the Tensor by 4 twice, these result Tensors will have the same uop specification
+t_times_4_try_1 = t * 4
+t_times_4_try_2 = t * 4
+assert t_times_4_try_1.uop is t_times_4_try_2.uop
+# the specification isn't just the same, it's the exact same Python object
+assert t_times_4_try_1 is not t_times_4_try_2
+# the Tensor is a different Python object
+
+# if we realize `t_times_4_try_1` ...
+t_times_4_try_1.realize()
+print(t_times_4_try_2.uop)
+"""
+UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+  UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
+  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
+"""
+# ... `t_times_4_try_2` also becomes the same BUFFER
+assert t_times_4_try_1.uop is t_times_4_try_2.uop
+# so this print doesn't require any computation, just a copy back to the CPU so we can print it
+print("** only the copy start")
+print(t_times_4_try_2.tolist())  # [4, 8, 12, 16]
+print("** only the copy end")
+# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
+
+# tinygrad has an auto differentiation engine that operates according to these same principles
+# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
+t_float = Tensor([3.0])
+t_log = t_float.log()
+t_log_grad, = t_log.sum().gradient(t_float)
+# due to how log is implemented, this gradient contains a lot of UOps
+print(t_log_grad.uop)
+# ...not shown here...
+# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
+"""
+void E_(float* restrict data0, float* restrict data1) {
+  float val0 = *(data1+0);
+  *(data0+0) = (0.6931471805599453f*(1/(val0*0.6931471805599453f)));
+}
+"""
+# the derivative is close to 1/3
+assert (t_log_grad.item() - 1/3) < 1e-6
+
+# %% ********
+print("******* PART 2 *******")
+
+# we redefine the same t here so this cell can run on its own
+from tinygrad import Tensor
+t = Tensor([1,2,3,4])
+
+# what's above gives you enough of an understanding to go use tinygrad as a library
+# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
+# NOTE: the APIs here are subject to change
+
+t_plus_3_plus_4 = t + 3 + 4
+print(t_plus_3_plus_4.uop)
+"""
+UOp(Ops.ADD, dtypes.int, arg=None, src=(
+  UOp(Ops.ADD, dtypes.int, arg=None, src=(
+    UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+      UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
+      x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
+    UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
+      UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
+        UOp(Ops.CONST, dtypes.int, arg=3, src=(
+          x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
+             x3,)),)),)),)),)),
+  UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
+    UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
+      UOp(Ops.CONST, dtypes.int, arg=4, src=(
+         x7,)),)),)),))
+"""
+# you can see it's adding both 3 and 4
+
+# but by the time we are actually running the code, it's adding 7
+# `kernelize` will simplify and group the operations in the graph into kernels
+t_plus_3_plus_4.kernelize()
+print(t_plus_3_plus_4.uop)
+"""
+UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
+  x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+    UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
+    x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
+  UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
+     x0,
+    UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
+      UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
+       x2,)),)),))
+"""
+# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
+# src[1] is the GPU Kernel that's going to be run
+# we can get the ast of the Kernel as follows
+kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
+
+# almost everything in tinygrad functions as a rewrite of the UOps
+# the codegen rewrites the ast to a simplified form ready for "rendering"
+from tinygrad.codegen import full_rewrite_to_sink
+rewritten_ast = full_rewrite_to_sink(kernel_ast)
+print(rewritten_ast)
+"""
+UOp(Ops.SINK, dtypes.void, arg=None, src=(
+  UOp(Ops.STORE, dtypes.void, arg=None, src=(
+    UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
+      UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
+      x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
+    UOp(Ops.ADD, dtypes.int, arg=None, src=(
+      UOp(Ops.LOAD, dtypes.int, arg=None, src=(
+        UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
+          UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
+           x3,)),)),
+      UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
+"""
+# you can see at this point we are adding 7, not 3 and 4
+
+# with DEBUG=4, we can see the code.
+# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
+t_plus_3_plus_4.realize()
+"""
+void E_4n2(int* restrict data0, int* restrict data1) {
+  int val0 = *(data1+0);
+  int val1 = *(data1+1);
+  int val2 = *(data1+2);
+  int val3 = *(data1+3);
+  *(data0+0) = (val0+7);
+  *(data0+1) = (val1+7);
+  *(data0+2) = (val2+7);
+  *(data0+3) = (val3+7);
+}
+"""
+# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
+# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
+# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
+# if you run with NOOPT=1 ...
+"""
+void E_4n2(int* restrict data0, int* restrict data1) {
+  for (int ridx0 = 0; ridx0 < 4; ridx0++) {
+    int val0 = *(data1+ridx0);
+    *(data0+ridx0) = (val0+7);
+  }
+}
+"""
+# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py
+
+# %% ********
+print("******* PART 3 *******")
+
+# now, we go even lower and understand UOps better and how the graph rewrite engine works.
+# it's much simpler than what's in LLVM or MLIR
+
+from tinygrad import dtypes
+from tinygrad.uop.ops import UOp, Ops
+
+# first, we'll construct some const UOps
+a = UOp(Ops.CONST, dtypes.int, arg=2)
+b = UOp(Ops.CONST, dtypes.int, arg=2)
+
+# if you have been paying attention, you should know these are the same Python object
+assert a is b
+
+# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
+a_plus_b = a + b
+print(a_plus_b)
+"""
+UOp(Ops.ADD, dtypes.int, arg=None, src=(
+  x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
+   x0,))
+"""
+
+# we could actually render this 2+2 into a language like c and run it
+# or, we can use tinygrad's graph rewrite engine to "constant fold"
+
+from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
+
+# a `PatternMatcher` is a list of tuples. for each element in the list:
+# [0] is the pattern to match, and [1] is the function to run.
+# this function can return either a UOp to replace the pattern with, or None to not replace
+simple_pm = PatternMatcher([
+  (UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
+   lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
+])
+# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
+
+# to actually apply the pattern to a_plus_b, we use graph_rewrite
+a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
+print(a_plus_b_simplified)
+"""
+UOp(Ops.CONST, dtypes.int, arg=4, src=())
+"""
+# 2+2 is in fact, 4
+
+# we can also use syntactic sugar to write the pattern nicer
+simpler_pm = PatternMatcher([
+  (UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
+])
+assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
+# note again the use of is, UOps are immutable and globally unique
+
+# %% ********
+
+# that brings you to an understanding of the most core concepts in tinygrad
+# you can run this with VIZ=1 to use the web based graph rewrite explorer
+# hopefully now you understand it. the nodes in the graph are just UOps
--- a/tinygrad_repo/docs/tensor/creation.md
+++ b/tinygrad_repo/docs/tensor/creation.md
@ -24,6 +24,7 @@
 ::: tinygrad.Tensor.randn
 ::: tinygrad.Tensor.randn_like
 ::: tinygrad.Tensor.randint
+::: tinygrad.Tensor.randperm
 ::: tinygrad.Tensor.normal
 ::: tinygrad.Tensor.uniform
 ::: tinygrad.Tensor.scaled_uniform
--- a/tinygrad_repo/docs/tensor/ops.md
+++ b/tinygrad_repo/docs/tensor/ops.md
@ -37,8 +37,10 @@
 ::: tinygrad.Tensor.scatter
 ::: tinygrad.Tensor.scatter_reduce
 ::: tinygrad.Tensor.masked_select
+::: tinygrad.Tensor.masked_fill
 ::: tinygrad.Tensor.sort
 ::: tinygrad.Tensor.topk
+::: tinygrad.Tensor.multinomial

 ## Neural Network (functional)

--- a/tinygrad_repo/examples/beautiful_cartpole.py
+++ b/tinygrad_repo/examples/beautiful_cartpole.py
@ -78,10 +78,7 @@ if __name__ == "__main__":

  @TinyJit
  def get_action(obs:Tensor) -> Tensor:
-    # TODO: with no_grad
-    Tensor.no_grad = True
    ret = model(obs)[0].exp().multinomial().realize()
-    Tensor.no_grad = False
    return ret

  st, steps = time.perf_counter(), 0
--- a/tinygrad_repo/examples/beautiful_cifar.py
+++ b/tinygrad_repo/examples/beautiful_cifar.py
@ -3,14 +3,19 @@ start_tm = time.perf_counter()
 import math
 from typing import Tuple, cast
 import numpy as np
-from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes
+from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes, Device
 from tinygrad.helpers import partition, trange, getenv, Context
 from extra.lr_scheduler import OneCycleLR

+GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 1))]
+
+# override tinygrad defaults
 dtypes.default_float = dtypes.half
+Context(FUSE_ARANGE=1, FUSE_OPTIM=1).__enter__()

 # from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py
 batchsize = getenv("BS", 1024)
+assert batchsize % len(GPUS) == 0, f"{batchsize=} is not a multiple of {len(GPUS)=}"
 bias_scaler = 64
 hyp = {
  'opt': {
@ -67,7 +72,7 @@ class ConvGroup:
    cast(Tensor, self.norm2.weight).requires_grad = False
  def __call__(self, x:Tensor) -> Tensor:
    x =    self.norm1(self.conv1(x).max_pool2d().float()).cast(dtypes.default_float).quick_gelu()
-    return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu()
+    return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() + x

 class SpeedyConvNet:
  def __init__(self):
@ -78,23 +83,25 @@ class SpeedyConvNet:
    self.linear = nn.Linear(depths['block3'], depths['num_classes'], bias=False)
  def __call__(self, x:Tensor) -> Tensor:
    x = self.whiten(x).quick_gelu()
+    # ************* HACKS *************
+    x = x.pad((1,0,0,1)) # TODO: this pad should not be here! copied from hlb_cifar10 for speed
+    # ************* HACKS *************
    x = x.sequential([self.conv_group_1, self.conv_group_2, self.conv_group_3])
    return self.linear(x.max(axis=(2,3))) * hyp['opt']['scaling_factor']

 if __name__ == "__main__":
  # *** dataset ***
  X_train, Y_train, X_test, Y_test = nn.datasets.cifar()
-  # TODO: without this line indexing doesn't fuse!
-  X_train, Y_train, X_test, Y_test = [x.contiguous() for x in [X_train, Y_train, X_test, Y_test]]
  cifar10_std, cifar10_mean = X_train.float().std_mean(axis=(0, 2, 3))
-  def preprocess(X:Tensor, Y:Tensor) -> Tuple[Tensor, Tensor]:
-    return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float), Y.one_hot(depths['num_classes'])
+  def preprocess(X:Tensor) -> Tensor: return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float)

  # *** model ***
  model = SpeedyConvNet()
  state_dict = nn.state.get_state_dict(model)
-
-  #for k,v in nn.state.torch_load("/tmp/cifar_net.pt").items(): print(k)
+  if len(GPUS) > 1:
+    cifar10_std.to_(GPUS)
+    cifar10_mean.to_(GPUS)
+    for x in state_dict.values(): x.to_(GPUS)

  params_bias, params_non_bias = partition(state_dict.items(), lambda x: 'bias' in x[0])
  opt_bias     = nn.optim.SGD([x[1] for x in params_bias],     lr=0.01, momentum=.85, nesterov=True, weight_decay=hyp['opt']['bias_decay'])
@ -111,40 +118,37 @@ if __name__ == "__main__":
  lr_sched_bias     = OneCycleLR(opt_bias,     max_lr=hyp['opt']['bias_lr'],     pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)
  lr_sched_non_bias = OneCycleLR(opt_non_bias, max_lr=hyp['opt']['non_bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)

-  def loss_fn(out, Y):
-    return out.cross_entropy(Y, reduction='none', label_smoothing=0.2).mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler'])
+  def loss_fn(out:Tensor, Y:Tensor) -> Tensor:
+    ret = out.sparse_categorical_crossentropy(Y, reduction='none', label_smoothing=0.2)
+    return ret.mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler'])

  @TinyJit
  @Tensor.train()
  def train_step(idxs:Tensor) -> Tensor:
-    with Context(SPLIT_REDUCEOP=0, FUSE_ARANGE=1):
-      X = X_train[idxs]
-      Y = Y_train[idxs].realize(X)
-    X, Y = preprocess(X, Y)
-    out = model(X)
+    X, Y = X_train[idxs], Y_train[idxs]
+    if len(GPUS) > 1:
+      X.shard_(GPUS, axis=0)
+      Y.shard_(GPUS, axis=0)
+    out = model(preprocess(X))
    loss = loss_fn(out, Y)
    opt.zero_grad()
    loss.backward()
-    opt.step()
-    lr_sched_bias.step()
-    lr_sched_non_bias.step()
-    return loss / (batchsize*loss_batchsize_scaler)
+    return (loss / (batchsize*loss_batchsize_scaler)).realize(*opt.schedule_step(),
+                                                              *lr_sched_bias.schedule_step(), *lr_sched_non_bias.schedule_step())

  eval_batchsize = 2500
  @TinyJit
-  @Tensor.test()
  def val_step() -> Tuple[Tensor, Tensor]:
-    # TODO with Tensor.no_grad()
-    Tensor.no_grad = True
    loss, acc = [], []
    for i in range(0, X_test.size(0), eval_batchsize):
-      X, Y = preprocess(X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize])
-      out = model(X)
+      X, Y = X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize]
+      if len(GPUS) > 1:
+        X.shard_(GPUS, axis=0)
+        Y.shard_(GPUS, axis=0)
+      out = model(preprocess(X))
      loss.append(loss_fn(out, Y))
-      acc.append((out.argmax(-1).one_hot(depths['num_classes']) * Y).sum() / eval_batchsize)
-    ret = Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean()
-    Tensor.no_grad = False
-    return ret
+      acc.append((out.argmax(-1) == Y).sum() / eval_batchsize)
+    return Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean()

  np.random.seed(1337)
  for epoch in range(math.ceil(hyp['misc']['train_epochs'])):
--- a/tinygrad_repo/examples/beautiful_mnist.py
+++ b/tinygrad_repo/examples/beautiful_mnist.py
@ -34,7 +34,6 @@ if __name__ == "__main__":
    return loss

  @TinyJit
-  @Tensor.test()
  def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100

  test_acc = float('nan')
--- a/tinygrad_repo/examples/benchmark_onnx.py
+++ b/tinygrad_repo/examples/benchmark_onnx.py
@ -1,10 +1,10 @@
-import sys, onnx, time, pickle
+import sys, time, pickle
 from tinygrad import TinyJit, GlobalCounters, fetch, getenv
-from tinygrad.frontend.onnx import OnnxRunner
+from tinygrad.frontend.onnx import OnnxRunner, onnx_load
 from extra.onnx_helpers import get_example_inputs, validate

 def load_onnx_model(onnx_file):
-  onnx_model = onnx.load(onnx_file)
+  onnx_model = onnx_load(onnx_file)
  run_onnx = OnnxRunner(onnx_model)
  run_onnx_jit = TinyJit(lambda **kwargs: next(iter(run_onnx({k:v.to(None) for k,v in kwargs.items()}).values())), prune=True, optimize=True)
  return run_onnx_jit, run_onnx.graph_inputs
--- a/tinygrad_repo/examples/coder.py
+++ b/tinygrad_repo/examples/coder.py
@ -23,8 +23,6 @@ def create_fixed_tokenizer(output_file):
 # echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py

 if __name__ == "__main__":
-  Tensor.no_grad = True
-
  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
  with Timing("create model: "):
    model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))
--- a/tinygrad_repo/examples/conversation.py
+++ b/tinygrad_repo/examples/conversation.py
@ -159,7 +159,6 @@ def init_vits(
  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)

  # Load the model.
-  Tensor.no_grad = True
  if seed is not None:
    Tensor.manual_seed(seed)
    np.random.seed(seed)
@ -221,7 +220,6 @@ def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_r
 if __name__ == "__main__":
  import nltk
  nltk.download("punkt")
-  Tensor.no_grad = True
  # Parse CLI arguments
  parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")

--- a/tinygrad_repo/examples/gpt2.py
+++ b/tinygrad_repo/examples/gpt2.py
@ -85,7 +85,10 @@ class Transformer:
      seqlen = tokens.shape[1]
      tok_emb = self.wte(tokens)

-    pos_emb = self.wpe(self.allpos.shrink((None, (start_pos, start_pos+seqlen))))
+    # not symbolic when consuming the prompt
+    selected_pos = (0, seqlen) if start_pos.val == 0 else (start_pos, start_pos+1)
+    pos_emb = self.wpe(self.allpos.shrink((None, selected_pos)))
+
    h = tok_emb + pos_emb

    if HALF: h = h.half()
@ -190,7 +193,7 @@ class GPT2:
                  (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
        with WallTimeEvent(BenchEvent.STEP):
          if batch_size == 1 and len(toks[0][start_pos:]) == 1:
-            tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos])
+            tokens = Variable("tokens", 0, VOCAB_SIZE-1).bind(toks[0][start_pos])
          else:
            tokens = Tensor([x[start_pos:] for x in toks])
          tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
@ -201,7 +204,6 @@ class GPT2:
 # **** main code ****

 if __name__ == "__main__":
-  Tensor.no_grad = True
  print(f"using {Device.DEFAULT} backend")
  default_prompt = "What is the answer to life, the universe, and everything?"

--- a/tinygrad_repo/examples/hlb_cifar10.py
+++ b/tinygrad_repo/examples/hlb_cifar10.py
@ -118,7 +118,7 @@ class SpeedyResNet:
 # hyper-parameters were exactly the same as the original repo
 bias_scaler = 58
 hyp = {
-  'seed' : 209,
+  'seed' : 200,
  'opt': {
    'bias_lr':            1.76 * bias_scaler/512,
    'non_bias_lr':        1.76 / 512,
@ -267,13 +267,10 @@ def train_cifar():

    @TinyJit
    def update(self, net, decay):
-      # TODO with Tensor.no_grad()
-      Tensor.no_grad = True
      for net_ema_param, (param_name, net_param) in zip(get_state_dict(self.net_ema).values(), get_state_dict(net).items()):
        # batchnorm currently is not being tracked
        if not ("num_batches_tracked" in param_name) and not ("running" in param_name):
          net_ema_param.assign(net_ema_param.detach()*decay + net_param.detach()*(1.-decay)).realize()
-      Tensor.no_grad = False

  set_seed(getenv('SEED', hyp['seed']))

--- a/tinygrad_repo/examples/llama.py
+++ b/tinygrad_repo/examples/llama.py
@ -240,7 +240,6 @@ class LLaMa:
            #elif k.endswith('.weight'): v.shard_(device, axis=-1)
            #elif 'norm.' in k: v.shard_(device, axis=-1)
            else: v.shard_(device, axis=None)
-            #print(k, v.shape, v.lazydata.axis)

        # replace weights in model
        load_state_dict(model, weights, strict=False, consume=True)
@ -331,7 +330,6 @@ int main()
 \end{code}
 """
 if __name__ == "__main__":
-  Tensor.no_grad = True
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description="Run LLaMA in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -447,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad.
  print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model")
  device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
  llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device)
-  param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model))
+  param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model))

  outputted = pre_prompt if chatbot else args.prompt
  start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted)
--- a/tinygrad_repo/examples/llama3.py
+++ b/tinygrad_repo/examples/llama3.py
@ -233,8 +233,6 @@ def prefill(model, toks, start_pos=0):
  return start_pos

 if __name__ == "__main__":
-  Tensor.no_grad = True
-
  parser = argparse.ArgumentParser()
  parser.add_argument("--download_model", action="store_true", help="Download a model")
  parser.add_argument("--model", type=Path, help="Model path")
@ -286,7 +284,7 @@ if __name__ == "__main__":

  device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
  model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device)
-  param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model))
+  param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model))

  if not args.no_api and not args.benchmark:
    from bottle import Bottle, request, response, HTTPResponse, abort, static_file
--- a/tinygrad_repo/examples/llm.c/export.py
+++ b/tinygrad_repo/examples/llm.c/export.py
@ -16,7 +16,7 @@ if __name__ == "__main__":
  #model.load_pretrained()
  for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained

-  #early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)])
+  #early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)])
  #print(f"built model {len(early_sched)}")

  #B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64)
@ -56,7 +56,7 @@ if __name__ == "__main__":
  state_dict.update({'X': X, 'Y': Y, 'loss': loss})
  grad_state_dict = {}
  for k,v in state_dict.items():
-    if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}")
+    if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}")
    if v.grad is not None: grad_state_dict['grad_'+k] = v.grad
  state_dict.update(grad_state_dict)
  state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr})
@ -65,7 +65,7 @@ if __name__ == "__main__":
    nm = inverse_state_dict[p]
    state_dict["adam_m_"+nm] = m
    state_dict["adam_v_"+nm] = v
-  named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
+  named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}

  c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"]
  if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"]
--- a/tinygrad_repo/examples/minrf.py
+++ b/tinygrad_repo/examples/minrf.py
@ -146,7 +146,6 @@ if __name__ == "__main__":
    return loss

  @TinyJit
-  @Tensor.test()
  def sample(z:Tensor, cond:Tensor) -> Tensor:
    return model.sample(z, cond, Tensor.full_like(cond, 10), sample_steps=getenv("SAMPLE_STEPS", 20))[-1]

--- a/tinygrad_repo/examples/mixtral.py
+++ b/tinygrad_repo/examples/mixtral.py
@ -56,7 +56,7 @@ if __name__ == "__main__":
    with Profiling(sort="time", frac=0.1, enabled=args.profile):
      with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
        with WallTimeEvent(BenchEvent.STEP):
-          tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item()
+          tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024-1).bind(start_pos), args.temperature).item()
    toks.append(tok)
    start_pos += 1
    print(spp.decode(toks))
--- a/tinygrad_repo/examples/mlperf/dataloader.py
+++ b/tinygrad_repo/examples/mlperf/dataloader.py
@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed):
      #storage_tensor._copyin(img_tensor.numpy())

      # faster
-      X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+      X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()

      # ideal
      #X[idx].assign(img.tobytes())   # NOTE: this is slow!
@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
      x = random_brightness_augmentation(x)
      x = gaussian_noise(x)

-    X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
-    Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
+    X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
+    Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()

    queue_out.put(idx)
  queue_out.put(None)
@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue
      clipped_match_idxs = np.clip(match_idxs, 0, None)
      clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs]

-      boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
-      labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
-      matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
-      anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
+      boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
+      labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
+      matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
+      anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()

-    imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+    imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()

    queue_out.put(idx)
  queue_out.put(None)
--- a/tinygrad_repo/examples/mlperf/model_eval.py
+++ b/tinygrad_repo/examples/mlperf/model_eval.py
@ -9,7 +9,6 @@ from extra.bench_log import BenchEvent, WallTimeEvent
 def tlog(x): print(f"{x:25s}  @ {time.perf_counter()-start:5.2f}s")

 def eval_resnet():
-  Tensor.no_grad = True
  with WallTimeEvent(BenchEvent.FULL):
    # Resnet50-v1.5
    from extra.models.resnet import ResNet50
@ -245,7 +244,6 @@ def eval_mrcnn():
 if __name__ == "__main__":
  # inference only
  Tensor.training = False
-  Tensor.no_grad = True

  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
  for m in models:
--- a/tinygrad_repo/examples/mlperf/model_spec.py
+++ b/tinygrad_repo/examples/mlperf/model_spec.py
@ -60,7 +60,6 @@ def spec_mrcnn():
 if __name__ == "__main__":
  # inference only for now
  Tensor.training = False
-  Tensor.no_grad = True

  for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","):
    nm = f"spec_{m}"
--- a/tinygrad_repo/examples/mlperf/model_train.py
+++ b/tinygrad_repo/examples/mlperf/model_train.py
@ -608,7 +608,7 @@ def train_retinanet():

      if getenv("RESET_STEP", 1): _train_step.reset()

-      with Tensor.train(mode=False), Tensor.test():
+      with Tensor.train(mode=False):
        if not RUNMLPERF:
          i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True))
        else:
@ -791,7 +791,6 @@ def train_unet3d():
    return loss.realize()

  @Tensor.train(mode=False)
-  @Tensor.test()
  def eval_step(model, x, y):
    y_hat, y = sliding_window_inference(model, x, y, gpus=GPUS)
    y_hat, y = Tensor(y_hat), Tensor(y, requires_grad=False)
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json
@ -5,7 +5,7 @@
    "system_name": "tinybox 8xMI300X",
    "number_of_nodes": "1",
    "host_processors_per_node": "2",
-    "host_processor_model_name": "AMD EPYC 9354 32-Core Processor",
+    "host_processor_model_name": "AMD EPYC 9354",
    "host_processor_core_count": "32",
    "host_processor_vcpu_count": "64",
    "host_processor_frequency": "",
@ -18,7 +18,7 @@
    "host_networking_topology": "",
    "host_memory_configuration": "24x 96GB DDR5",
    "accelerators_per_node": "8",
-    "accelerator_model_name": "AMD Instinct MI300X",
+    "accelerator_model_name": "AMD Instinct MI300X 192GB HBM3",
    "accelerator_host_interconnect": "PCIe 5.0 x16",
    "accelerator_frequency": "",
    "accelerator_on-chip_memories": "",
@ -30,10 +30,9 @@
    "hw_notes": "",
    "framework": "tinygrad, branch mlperf_training_v5.0",
    "other_software_stack": {
-      "python": "3.10.16",
-      "ROCm": "3.0.0+94441cb"
+        "python": "3.10.16",
+        "ROCm": "3.0.0+94441cb"
    },
    "operating_system": "Ubuntu 24.04.1 LTS",
    "sw_notes": ""
-  }
-  
+  }
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json
@ -5,7 +5,7 @@
  "system_name": "tinybox green",
  "number_of_nodes": "1",
  "host_processors_per_node": "1",
-  "host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
+  "host_processor_model_name": "AMD EPYC 7532",
  "host_processor_core_count": "32",
  "host_processor_vcpu_count": "64",
  "host_processor_frequency": "",
@ -35,4 +35,4 @@
  },
  "operating_system": "Ubuntu 22.04.4",
  "sw_notes": ""
-}
+}
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json
@ -5,7 +5,7 @@
  "system_name": "tinybox red",
  "number_of_nodes": "1",
  "host_processors_per_node": "1",
-  "host_processor_model_name": "AMD EPYC 7532 32-Core Processor",
+  "host_processor_model_name": "AMD EPYC 7532",
  "host_processor_core_count": "32",
  "host_processor_vcpu_count": "64",
  "host_processor_frequency": "",
@ -34,4 +34,4 @@
  },
  "operating_system": "Ubuntu 22.04.4",
  "sw_notes": ""
-}
+}
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+export PYTHONPATH="." AMD=1
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128
+
+export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+# export BEAM_LOG_SURPASS_MAX=1
+# export BASEDIR="/raid/datasets/wiki"
+
+export RESET_STEP=1
+export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
+
+python3 examples/mlperf/model_train.py
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md
@ -0,0 +1,69 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
+```
+pip install gdown numpy tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per the [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a specific topic (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+```
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+export PYTHONPATH="." AMD=1
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
+
+export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
+export BASEDIR="/raid/datasets/wiki"
+
+export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
+
+python3 examples/mlperf/model_train.py
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+export PYTHONPATH="." AMD=1
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+
+# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
+export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
+export TRAIN_STEPS=3900
+
+export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
+export BASEDIR="/raid/datasets/wiki"
+
+export WANDB=1 PARALLEL=0
+
+RUNMLPERF=1 python3 examples/mlperf/model_train.py
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+set -e  # Exit on any error
+set -o pipefail  # Make pipeline fail if any command fails
+
+export PYTHONPATH="." AMD=1
+export MODEL="bert"
+export SUBMISSION_PLATFORM="tinybox_8xMI300X"
+export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+
+# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
+export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
+export TRAIN_STEPS=3900
+
+export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
+export BASEDIR="/raid/datasets/wiki"
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1
+
+export SEED=$RANDOM
+DATETIME=$(date "+%m%d%H%M")
+LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log"
+
+# init  # TODO: without DEBUG=2 it hangs
+BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
+
+# run
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md
@ -0,0 +1,69 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
+```
+pip install gdown numpy tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per the [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a specific topic (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+```
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+export PYTHONPATH="." NV=1
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+export BEAM_LOG_SURPASS_MAX=1
+export BASEDIR="/raid/datasets/wiki"
+
+export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
+
+python3 examples/mlperf/model_train.py
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+export PYTHONPATH="." NV=1  # PYTHONPATH is the current directory, so launch from the repo root; NV=1 presumably selects the NVIDIA backend (confirm)
+export MODEL="bert"  # exported for model_train.py, the only process this script starts
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96  # presumably half precision, 6 GPUs, batch size 96 for train and eval (meanings assumed from the names; confirm)
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5  # BEAM_* values match run_and_time.sh in this directory
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"  # same path the README's wikipedia download/preprocess commands use
+
+export WANDB=1 PARALLEL=0  # WANDB=1 presumably turns on Weights & Biases logging (confirm); PARALLEL=0 matches the run step of run_and_time.sh
+
+RUNMLPERF=1 python3 examples/mlperf/model_train.py  # RUNMLPERF applies to this one command only (prefix assignment, not exported); relative path needs the repo root as cwd
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+set -e  # Exit on any error
+set -o pipefail  # Make pipeline fail if any command fails
+
+export PYTHONPATH="." NV=1  # PYTHONPATH is the current directory, so launch from the repo root; NV=1 presumably selects the NVIDIA backend (confirm)
+export MODEL="bert"  # exported for model_train.py, which both steps below invoke
+export SUBMISSION_PLATFORM="tinybox_green"
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96  # presumably half precision, 6 GPUs, batch size 96 for train and eval (meanings assumed from the names; confirm)
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"  # same path the README's wikipedia download/preprocess commands use
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1  # presumably enables MLPerf logging, which is why the pip hint above is needed (confirm)
+
+export SEED=$RANDOM  # new bash $RANDOM value on every invocation; exported and also embedded in the log file name
+DATETIME=$(date "+%m%d%H%M")  # month, day, hour, minute, e.g. 06221051
+LOGFILE="bert_green_${DATETIME}_${SEED}.log"  # created in the current working directory
+
+# init
+BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE  # tee without -a creates/truncates the log; pipefail makes a python failure abort the script
+
+# run
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE  # tee -a appends, so init and run output share one log file
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md
@ -0,0 +1,69 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
+```
+pip install gdown numpy tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
+This is the default on production tinybox green.
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads (`NUM_WORKERS=16`, as in the command below) is recommended.
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a single topic instead of `all`, pass its index (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+```
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+export PYTHONPATH="." AMD=1  # PYTHONPATH is the current directory, so launch from the repo root; AMD=1 presumably selects the AMD backend (confirm)
+export MODEL="bert"  # exported for model_train.py, the only process this script starts
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96  # presumably half precision, 6 GPUs, batch size 96 for train and eval (meanings assumed from the names; confirm)
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5  # BEAM_* values match run_and_time.sh in this directory
+export IGNORE_JIT_FIRST_BEAM=1
+export BEAM_LOG_SURPASS_MAX=1  # set only in this dev_beam script, not in dev_run.sh / run_and_time.sh
+export BASEDIR="/raid/datasets/wiki"  # same path the README's wikipedia download/preprocess commands use
+
+export RESET_STEP=1  # NOTE(review): meaning not visible here; only this red dev_beam script sets it for BERT (confirm in model_train.py)
+export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2  # reduced dev configuration (2 BERT layers per the name); BENCHMARK/DEBUG semantics are not visible here (confirm)
+
+python3 examples/mlperf/model_train.py  # relative path: only resolves when the working directory is the repo root
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+export PYTHONPATH="." AMD=1  # PYTHONPATH is the current directory, so launch from the repo root; AMD=1 presumably selects the AMD backend (confirm)
+export MODEL="bert"  # exported for model_train.py, the only process this script starts
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96  # presumably half precision, 6 GPUs, batch size 96 for train and eval (meanings assumed from the names; confirm)
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5  # BEAM_* values match run_and_time.sh in this directory
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"  # same path the README's wikipedia download/preprocess commands use
+
+export WANDB=1 PARALLEL=0  # WANDB=1 presumably turns on Weights & Biases logging (confirm); PARALLEL=0 matches the run step of run_and_time.sh
+
+RUNMLPERF=1 python3 examples/mlperf/model_train.py  # RUNMLPERF applies to this one command only (prefix assignment, not exported); relative path needs the repo root as cwd
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+set -e  # Exit on any error
+set -o pipefail  # Make pipeline fail if any command fails
+
+export PYTHONPATH="." AMD=1  # PYTHONPATH is the current directory, so launch from the repo root; AMD=1 presumably selects the AMD backend (confirm)
+export MODEL="bert"  # exported for model_train.py, which both steps below invoke
+export SUBMISSION_PLATFORM="tinybox_red"
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96  # presumably half precision, 6 GPUs, batch size 96 for train and eval (meanings assumed from the names; confirm)
+
+export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
+
+export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+export BASEDIR="/raid/datasets/wiki"  # same path the README's wikipedia download/preprocess commands use
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=1  # presumably enables MLPerf logging, which is why the pip hint above is needed (confirm)
+
+export SEED=$RANDOM  # new bash $RANDOM value on every invocation; exported and also embedded in the log file name
+DATETIME=$(date "+%m%d%H%M")  # month, day, hour, minute, e.g. 06221051
+LOGFILE="bert_red_${DATETIME}_${SEED}.log"  # created in the current working directory
+
+export HCQDEV_WAIT_TIMEOUT_MS=100000  # prevents hang?
+
+# init
+sleep 5 && sudo rmmod amdgpu || true  # unload the amdgpu kernel module before the init pass; "|| true" stops set -e from aborting if sleep/rmmod fails (e.g. module not loaded)
+BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE  # tee without -a creates/truncates the log; pipefail makes a python failure abort the script
+
+# run
+# TODO: AM driver resulted in nan
+sudo modprobe amdgpu  # load amdgpu again for the real run (the TODO above is the stated reason); needs sudo, and a failure here aborts via set -e
+PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE  # tee -a appends, so init and run output share one log file
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md
@ -0,0 +1,50 @@
+# 1. Problem
+
+This problem uses the ResNet-50 CNN to do image classification.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from master.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
+This is the default on production tinybox green.
+
+### tinybox_red
+Disable cwsr
+This is the default on production tinybox red.
+```
+sudo vi /etc/modprobe.d/amdgpu.conf
+cat <<EOF > /etc/modprobe.d/amdgpu.conf
+options amdgpu cwsr_enable=0
+EOF
+sudo update-initramfs -u
+sudo reboot
+
+# validate
+sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
+```
+
+# 2. Directions
+
+## Steps to download and verify data
+
+```
+IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
+```
+
+## Steps for one time setup
+
+### tinybox_red
+```
+examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
+```
+
+## Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
+```
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+
+export PYTHONPATH="." NV=1  # PYTHONPATH is the current directory, so launch from the repo root; NV=1 presumably selects the NVIDIA backend (confirm)
+export MODEL="resnet"  # exported for model_train.py, the only process this script starts
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192  # presumably half precision, 6 GPUs, train batch 1536 and eval batch 192 (meanings assumed from the names; confirm)
+
+export RESET_STEP=0  # NOTE(review): meaning not visible here (confirm in model_train.py)
+
+export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0  # same values as run_and_time.sh in this directory
+
+export BENCHMARK=10 DEBUG=2  # dev-only settings; BENCHMARK/DEBUG semantics are not visible here (confirm)
+
+python3 examples/mlperf/model_train.py  # relative path: only resolves when the working directory is the repo root
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+export PYTHONPATH="." NV=1  # PYTHONPATH is the current directory, so launch from the repo root; NV=1 presumably selects the NVIDIA backend (confirm)
+export MODEL="resnet"  # exported for model_train.py, the only process this script starts
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192  # presumably half precision, 6 GPUs, train batch 1536 and eval batch 192 (meanings assumed from the names; confirm)
+
+export RESET_STEP=0  # NOTE(review): meaning not visible here (confirm in model_train.py)
+
+export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0  # same values as run_and_time.sh in this directory
+
+export EVAL_START_EPOCH=3 EVAL_FREQ=4  # same eval schedule values the run step of run_and_time.sh passes; presumably first eval at epoch 3, then every 4 epochs (confirm)
+
+export WANDB=1 PARALLEL=0  # WANDB=1 presumably turns on Weights & Biases logging (confirm); PARALLEL=0 matches the run step of run_and_time.sh
+
+python3 examples/mlperf/model_train.py  # NOTE(review): unlike run_and_time.sh, RUNMLPERF is not set here; relative path needs the repo root as cwd
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+set -e  # Exit on any error
+set -o pipefail  # Make pipeline fail if any command fails
+
+export PYTHONPATH="." NV=1  # PYTHONPATH is the current directory, so launch from the repo root; NV=1 presumably selects the NVIDIA backend (confirm)
+export MODEL="resnet"  # exported for model_train.py, which both steps below invoke
+export SUBMISSION_PLATFORM="tinybox_green"
+export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192  # presumably half precision, 6 GPUs, train batch 1536 and eval batch 192 (meanings assumed from the names; confirm)
+
+export RESET_STEP=0  # NOTE(review): meaning not visible here (confirm in model_train.py)
+
+export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
+
+# pip install -e ".[mlperf]"
+export LOGMLPERF=${LOGMLPERF:-1}  # keeps a value already set by the caller's environment, otherwise defaults to 1
+
+export SEED=$RANDOM  # new bash $RANDOM value on every invocation; exported and also embedded in the log file name
+DATETIME=$(date "+%m%d%H%M")  # month, day, hour, minute, e.g. 06221051
+LOGFILE="resnet_green_${DATETIME}_${SEED}.log"  # created in the current working directory
+
+# init
+BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE  # tee without -a creates/truncates the log; pipefail makes a python failure abort the script
+
+# run
+PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE  # tee -a appends, so init and run output share one log file
--- a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md
+++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md
@ -0,0 +1,50 @@
+# 1. Problem
+
+This problem uses the ResNet-50 CNN to do image classification.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from master.
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
+This is the default on production tinybox green.
+
+### tinybox_red
+Disable cwsr
+This is the default on production tinybox red.
+```
+sudo vi /etc/modprobe.d/amdgpu.conf
+cat <<EOF > /etc/modprobe.d/amdgpu.conf
+options amdgpu cwsr_enable=0
+EOF
+sudo update-initramfs -u
+sudo reboot
+
+# validate
+sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
+```
+
+# 2. Directions
+
+## Steps to download and verify data
+
+```
+IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
+```
+
+## Steps for one time setup
+
+### tinybox_red
+```
+examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
+```
+
+## Steps to run benchmark
+```
+examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
+```
--- a/Show More
+++ b/Show More