KerryGold Model, AGNOS12.4, AdjustLaneChange, EnglighSound (#182)

* Vegetarian Filet o Fish model

* fix.. atc..

* test cluster_speed_limit

* fix.. cluster_speed_limit.. 2

* fix.. clusterspeedlimit3

* cruise speed to roadlimit speed

* fix..

* fix.. eng

* deltaUp/Down for lanechange

* fix.. atc desire...

* fix..

* ff

* ff

* fix..

* fix.. eng

* fix engsound

* Update desire_helper.py

* fix.. connect...

* fix curve_min speed

* Revert "fix curve_min speed"

This reverts commit fcc9c2eb14eb3504abef3e420db93e8882e56f37.

* Reapply "fix curve_min speed"

This reverts commit 2d2bba476c58a7b4e13bac3c3ad0e4694c95515d.

* fix.. auto speed up.. roadlimit

* fix.. atc auto lanechange...

* Update desire_helper.py

* Update cruise.py

* debug atc...

* fix.. waze alert offset..

* fix..

* test atc..

* fix..

* fix.. atc

* atc test..

* fix.. atc

* fix.. atc2

* fix.. atc3

* KerryGold Model.  latsmooth_sec = 0.0

* lat smooth seconds 0.13

* fix comment

* fix.. auto cruise, and speed unit

* change lanemode switching.

* erase mazda lkas button.
This commit is contained in:
carrot 2025-06-22 10:51:42 +09:00 committed by GitHub
parent efee1712aa
commit 9c7833faf9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
385 changed files with 12951 additions and 12621 deletions

View File

@ -236,7 +236,6 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
{"HapticFeedbackWhenSpeedCamera", PERSISTENT}, {"HapticFeedbackWhenSpeedCamera", PERSISTENT},
{"UseLaneLineSpeed", PERSISTENT}, {"UseLaneLineSpeed", PERSISTENT},
{"UseLaneLineCurveSpeed", PERSISTENT}, {"UseLaneLineCurveSpeed", PERSISTENT},
{"UseLaneLineSpeedApply", PERSISTENT},
{"AdjustLaneOffset", PERSISTENT}, {"AdjustLaneOffset", PERSISTENT},
{"LaneChangeNeedTorque", PERSISTENT}, {"LaneChangeNeedTorque", PERSISTENT},
{"LaneChangeDelay", PERSISTENT }, {"LaneChangeDelay", PERSISTENT },
@ -261,6 +260,8 @@ inline static std::unordered_map<std::string, uint32_t> keys = {
{"CustomSteerMax", PERSISTENT}, {"CustomSteerMax", PERSISTENT},
{"CustomSteerDeltaUp", PERSISTENT}, {"CustomSteerDeltaUp", PERSISTENT},
{"CustomSteerDeltaDown", PERSISTENT}, {"CustomSteerDeltaDown", PERSISTENT},
{"CustomSteerDeltaUpLC", PERSISTENT},
{"CustomSteerDeltaDownLC", PERSISTENT},
{"SpeedFromPCM", PERSISTENT}, {"SpeedFromPCM", PERSISTENT},
{"MaxTimeOffroadMin", PERSISTENT}, {"MaxTimeOffroadMin", PERSISTENT},
{"DisableDM", PERSISTENT}, {"DisableDM", PERSISTENT},

View File

@ -7,7 +7,7 @@ export OPENBLAS_NUM_THREADS=1
export VECLIB_MAXIMUM_THREADS=1 export VECLIB_MAXIMUM_THREADS=1
if [ -z "$AGNOS_VERSION" ]; then if [ -z "$AGNOS_VERSION" ]; then
export AGNOS_VERSION="12.3" export AGNOS_VERSION="12.4"
fi fi
export STAGING_ROOT="/data/safe_staging" export STAGING_ROOT="/data/safe_staging"

View File

@ -246,6 +246,7 @@ struct CarState {
speedLimitDistance @65 :Float32; speedLimitDistance @65 :Float32;
gearStep @66 :Int16; gearStep @66 :Int16;
tpms @67 : Tpms; tpms @67 : Tpms;
useLaneLineSpeed @68 : Float32;
struct Tpms { struct Tpms {
fl @0 :Float32; fl @0 :Float32;

View File

@ -96,6 +96,9 @@ class CarController(CarControllerBase):
self.activeCarrot = 0 self.activeCarrot = 0
self.camera_scc_params = Params().get_int("HyundaiCameraSCC") self.camera_scc_params = Params().get_int("HyundaiCameraSCC")
self.steerDeltaUpOrg = self.steerDeltaUp = self.steerDeltaUpLC = self.params.STEER_DELTA_UP
self.steerDeltaDownOrg = self.steerDeltaDown = self.steerDeltaDownLC = self.params.STEER_DELTA_DOWN
def update(self, CC, CS, now_nanos): def update(self, CC, CS, now_nanos):
if self.frame % 50 == 0: if self.frame % 50 == 0:
@ -104,14 +107,30 @@ class CarController(CarControllerBase):
steerMax = params.get_int("CustomSteerMax") steerMax = params.get_int("CustomSteerMax")
steerDeltaUp = params.get_int("CustomSteerDeltaUp") steerDeltaUp = params.get_int("CustomSteerDeltaUp")
steerDeltaDown = params.get_int("CustomSteerDeltaDown") steerDeltaDown = params.get_int("CustomSteerDeltaDown")
steerDeltaUpLC = params.get_int("CustomSteerDeltaUpLC")
steerDeltaDownLC = params.get_int("CustomSteerDeltaDownLC")
if steerMax > 0: if steerMax > 0:
self.params.STEER_MAX = steerMax self.params.STEER_MAX = steerMax
if steerDeltaUp > 0: if steerDeltaUp > 0:
self.params.STEER_DELTA_UP = steerDeltaUp self.steerDeltaUp = steerDeltaUp
#self.params.ANGLE_TORQUE_UP_RATE = steerDeltaUp #self.params.ANGLE_TORQUE_UP_RATE = steerDeltaUp
else:
self.steerDeltaUp = self.steerDeltaUpOrg
if steerDeltaDown > 0: if steerDeltaDown > 0:
self.params.STEER_DELTA_DOWN = steerDeltaDown self.steerDeltaDown = steerDeltaDown
#self.params.ANGLE_TORQUE_DOWN_RATE = steerDeltaDown #self.params.ANGLE_TORQUE_DOWN_RATE = steerDeltaDown
else:
self.steerDeltaDown = self.steerDeltaDownOrg
if steerDeltaUpLC > 0:
self.steerDeltaUpLC = steerDeltaUpLC
else:
self.steerDeltaUpLC = self.steerDeltaUp
if steerDeltaDownLC > 0:
self.steerDeltaDownLC = steerDeltaDownLC
else:
self.steerDeltaDownLC = self.steerDeltaDown
self.soft_hold_mode = 1 if params.get_int("AutoCruiseControl") > 1 else 2 self.soft_hold_mode = 1 if params.get_int("AutoCruiseControl") > 1 else 2
self.hapticFeedbackWhenSpeedCamera = int(params.get_int("HapticFeedbackWhenSpeedCamera")) self.hapticFeedbackWhenSpeedCamera = int(params.get_int("HapticFeedbackWhenSpeedCamera"))
@ -126,6 +145,13 @@ class CarController(CarControllerBase):
actuators = CC.actuators actuators = CC.actuators
hud_control = CC.hudControl hud_control = CC.hudControl
if hud_control.modelDesire in [3,4]:
self.params.STEER_DELTA_UP = self.steerDeltaUpLC
self.params.STEER_DELTA_DOWN = self.steerDeltaDownLC
else:
self.params.STEER_DELTA_UP = self.steerDeltaUp
self.params.STEER_DELTA_DOWN = self.steerDeltaDown
angle_control = self.CP.flags & HyundaiFlags.ANGLE_CONTROL angle_control = self.CP.flags & HyundaiFlags.ANGLE_CONTROL
# steering torque # steering torque

View File

@ -76,6 +76,7 @@ class CarState(CarStateBase):
self.cruise_buttons_msg = None self.cruise_buttons_msg = None
self.hda2_lfa_block_msg = None self.hda2_lfa_block_msg = None
self.cluster_speed_limit_msg = None
# On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz # On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
self.cluster_speed = 0 self.cluster_speed = 0
@ -461,6 +462,9 @@ class CarState(CarStateBase):
if "TCS" in cp.vl: if "TCS" in cp.vl:
self.tcs_info_373 = copy.copy(cp.vl.get("TCS", {})) self.tcs_info_373 = copy.copy(cp.vl.get("TCS", {}))
if "CLUSTER_SPEED_LIMIT" in cp.vl:
self.cluster_speed_limit_msg = copy.copy(cp.vl.get("CLUSTER_SPEED_LIMIT", {}))
if "GEAR" in cp.vl: if "GEAR" in cp.vl:
ret.gearStep = cp.vl["GEAR"]["GEAR_STEP"] ret.gearStep = cp.vl["GEAR"]["GEAR_STEP"]
elif "GEAR_ALT" in cp.vl: elif "GEAR_ALT" in cp.vl:
@ -596,6 +600,8 @@ class CarState(CarStateBase):
# 어떤차는 bus2에 있음, 내차는 bus0에 있는데.... 이건 옆두부와 관련이 없나? # 어떤차는 bus2에 있음, 내차는 bus0에 있는데.... 이건 옆두부와 관련이 없나?
#if CP.flags & HyundaiFlags.CANFD_HDA2: #if CP.flags & HyundaiFlags.CANFD_HDA2:
# pt_messages.append(("CLUSTER_SPEED_LIMIT", 10)) # pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))
if Params().get_int("CanfdDebug") > 0:
pt_messages.append(("CLUSTER_SPEED_LIMIT", 10))
cam_messages = [] cam_messages = []
if CP.flags & HyundaiFlags.CANFD_HDA2 and not (CP.flags & HyundaiFlags.CAMERA_SCC.value): if CP.flags & HyundaiFlags.CANFD_HDA2 and not (CP.flags & HyundaiFlags.CAMERA_SCC.value):

View File

@ -598,8 +598,13 @@ def create_ccnc_messages(CP, packer, CAN, frame, CC, CS, hud_control, disp_angle
# ADAS 콤마연결하면.. 0번에서.. (카메라혹은 다른곳에서) # ADAS 콤마연결하면.. 0번에서.. (카메라혹은 다른곳에서)
# 카메라 콤마연결+롱컨개조 하면.. 2번에서 데이터가 나옴..(카메라혹은 ADAS) # 카메라 콤마연결+롱컨개조 하면.. 2번에서 데이터가 나옴..(카메라혹은 ADAS)
if frame % 10 == 0: if frame % 10 == 0:
if CS.cluster_speed_limit_msg is not None:
pass values = CS.cluster_speed_limit_msg
values["SPEED_LIMIT_1"] = 100
values["SPEED_LIMIT_2"] = 100
values["SPEED_LIMIT_3"] = 105
#values["COUNTER"] = (values["COUNTER"] + 1) % 256
ret.append(packer.make_can_msg("CLUSTER_SPEED_LIMIT", CAN.CAM, values))
return ret return ret

View File

@ -141,7 +141,7 @@ class CarState(CarStateBase):
ret.buttonEvents = [ ret.buttonEvents = [
*create_button_events(self.cruise_buttons, self.prev_cruise_buttons, BUTTONS_DICT), *create_button_events(self.cruise_buttons, self.prev_cruise_buttons, BUTTONS_DICT),
*create_button_events(self.distance_button, self.prev_distance_button, {1: ButtonType.gapAdjustCruise}), *create_button_events(self.distance_button, self.prev_distance_button, {1: ButtonType.gapAdjustCruise}),
*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}), #*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}),
] ]
return ret return ret

View File

@ -81,7 +81,7 @@ const CanMsg HYUNDAI_CANFD_HDA2_LONG_TX_MSGS[] = {
{203, 0, 24}, // CB {203, 0, 24}, // CB
{373, 2, 24}, // TCS(0x175) {373, 2, 24}, // TCS(0x175)
//{506, 2, 32}, // CLUSTER_SPEED_LIMIT {506, 2, 32}, // CLUSTER_SPEED_LIMIT
{234, 2, 24}, // MDPS {234, 2, 24}, // MDPS
{687, 2, 8}, // STEER_TOUCH_2AF {687, 2, 8}, // STEER_TOUCH_2AF
}; };

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -219,6 +219,7 @@ class Car:
CS.softHoldActive = self.v_cruise_helper._soft_hold_active CS.softHoldActive = self.v_cruise_helper._soft_hold_active
CS.activateCruise = self.v_cruise_helper._activate_cruise CS.activateCruise = self.v_cruise_helper._activate_cruise
CS.latEnabled = self.v_cruise_helper._lat_enabled CS.latEnabled = self.v_cruise_helper._lat_enabled
CS.useLaneLineSpeed = self.v_cruise_helper.useLaneLineSpeedApply
self.CI.CS.softHoldActive = CS.softHoldActive self.CI.CS.softHoldActive = CS.softHoldActive
return CS, RD return CS, RD

View File

@ -218,7 +218,7 @@ class VCruiseCarrot:
self.AutoSpeedUptoRoadSpeedLimit = 0.0 self.AutoSpeedUptoRoadSpeedLimit = 0.0
self.useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") self.useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
self.params.put_int("UseLaneLineSpeedApply", self.useLaneLineSpeed) self.useLaneLineSpeedApply = self.useLaneLineSpeed
@property @property
@ -237,16 +237,19 @@ class VCruiseCarrot:
self._log_timer = self._log_timeout self._log_timer = self._log_timeout
def update_params(self, is_metric): def update_params(self, is_metric):
unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
if self.frame % 10 == 0: if self.frame % 10 == 0:
self.autoCruiseControl = self.params.get_int("AutoCruiseControl") self.autoCruiseControl = self.params.get_int("AutoCruiseControl") * unit_factor
self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") * unit_factor
self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") * unit_factor
self.autoSpeedUptoRoadSpeedLimit = self.params.get_float("AutoSpeedUptoRoadSpeedLimit") * 0.01 self.autoSpeedUptoRoadSpeedLimit = self.params.get_float("AutoSpeedUptoRoadSpeedLimit") * 0.01
self.autoRoadSpeedAdjust = self.params.get_float("AutoRoadSpeedAdjust") * 0.01 self.autoRoadSpeedAdjust = self.params.get_float("AutoRoadSpeedAdjust") * 0.01
useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed")
useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") * unit_factor
if self.useLaneLineSpeed != useLaneLineSpeed: if self.useLaneLineSpeed != useLaneLineSpeed:
self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed) self.useLaneLineSpeedApply = useLaneLineSpeed
self.useLaneLineSpeed = useLaneLineSpeed self.useLaneLineSpeed = useLaneLineSpeed
self.speed_from_pcm = self.params.get_int("SpeedFromPCM") self.speed_from_pcm = self.params.get_int("SpeedFromPCM")
self._cruise_speed_unit = self.params.get_int("CruiseSpeedUnit") self._cruise_speed_unit = self.params.get_int("CruiseSpeedUnit")
self._paddle_mode = self.params.get_int("PaddleMode") self._paddle_mode = self.params.get_int("PaddleMode")
@ -255,7 +258,6 @@ class VCruiseCarrot:
self.autoRoadSpeedLimitOffset = self.params.get_int("AutoRoadSpeedLimitOffset") self.autoRoadSpeedLimitOffset = self.params.get_int("AutoRoadSpeedLimitOffset")
self.autoNaviSpeedSafetyFactor = self.params.get_float("AutoNaviSpeedSafetyFactor") * 0.01 self.autoNaviSpeedSafetyFactor = self.params.get_float("AutoNaviSpeedSafetyFactor") * 0.01
self.cruiseOnDist = self.params.get_float("CruiseOnDist") * 0.01 self.cruiseOnDist = self.params.get_float("CruiseOnDist") * 0.01
unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH
cruiseSpeed1 = self.params.get_float("CruiseSpeed1") * unit_factor cruiseSpeed1 = self.params.get_float("CruiseSpeed1") * unit_factor
cruiseSpeed2 = self.params.get_float("CruiseSpeed2") * unit_factor cruiseSpeed2 = self.params.get_float("CruiseSpeed2") * unit_factor
cruiseSpeed3 = self.params.get_float("CruiseSpeed3") * unit_factor cruiseSpeed3 = self.params.get_float("CruiseSpeed3") * unit_factor
@ -552,7 +554,7 @@ class VCruiseCarrot:
self.params.put_int_nonblocking("MyDrivingMode", self.params.get_int("MyDrivingMode") % 4 + 1) # 1,2,3,4 (1:eco, 2:safe, 3:normal, 4:high speed) self.params.put_int_nonblocking("MyDrivingMode", self.params.get_int("MyDrivingMode") % 4 + 1) # 1,2,3,4 (1:eco, 2:safe, 3:normal, 4:high speed)
elif button_type == ButtonType.lfaButton: elif button_type == ButtonType.lfaButton:
useLaneLineSpeed = max(1, self.useLaneLineSpeed) useLaneLineSpeed = max(1, self.useLaneLineSpeed)
self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed if self.params.get_int("UseLaneLineSpeedApply") == 0 else 0) self.useLaneLineSpeedApply = useLaneLineSpeed if self.useLaneLineSpeedApply == 0 else 0
elif button_type == ButtonType.cancel: elif button_type == ButtonType.cancel:
self._cruise_cancel_state = True self._cruise_cancel_state = True
@ -594,15 +596,20 @@ class VCruiseCarrot:
return v_cruise_kph return v_cruise_kph
def _auto_speed_up(self, v_cruise_kph): def _auto_speed_up(self, v_cruise_kph):
if self._pause_auto_speed_up: #if self._pause_auto_speed_up:
return v_cruise_kph # return v_cruise_kph
road_limit_kph = self.nRoadLimitSpeed * self.autoSpeedUptoRoadSpeedLimit road_limit_kph = self.nRoadLimitSpeed * self.autoSpeedUptoRoadSpeedLimit
if road_limit_kph < 1.0: if road_limit_kph < 1.0:
return v_cruise_kph return v_cruise_kph
if self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60: if not self._pause_auto_speed_up and self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60:
v_cruise_kph = min(v_cruise_kph + 5, road_limit_kph) v_cruise_kph = min(v_cruise_kph + 5, road_limit_kph)
elif self.autoRoadSpeedAdjust < 0 and self.nRoadLimitSpeed != self.nRoadLimitSpeed_last: # 도로제한속도가 바뀌면, 바뀐속도로 속도를 바꿈.
if self.autoRoadSpeedLimitOffset < 0:
v_cruise_kph = self.nRoadLimitSpeed * self.autoNaviSpeedSafetyFactor
else:
v_cruise_kph = self.nRoadLimitSpeed + self.autoRoadSpeedLimitOffset
elif self.nRoadLimitSpeed < self.nRoadLimitSpeed_last and self.autoRoadSpeedAdjust > 0: elif self.nRoadLimitSpeed < self.nRoadLimitSpeed_last and self.autoRoadSpeedAdjust > 0:
new_road_limit_kph = self.nRoadLimitSpeed * self.autoRoadSpeedAdjust + v_cruise_kph * (1 - self.autoRoadSpeedAdjust) new_road_limit_kph = self.nRoadLimitSpeed * self.autoRoadSpeedAdjust + v_cruise_kph * (1 - self.autoRoadSpeedAdjust)
self._add_log(f"AutoSpeed change {v_cruise_kph} -> {new_road_limit_kph}") self._add_log(f"AutoSpeed change {v_cruise_kph} -> {new_road_limit_kph}")
@ -681,11 +688,11 @@ class VCruiseCarrot:
elif self.xState == 3: elif self.xState == 3:
v_cruise_kph = self.v_ego_kph_set v_cruise_kph = self.v_ego_kph_set
self._cruise_control(-1, 3, "Cruise off (traffic sign)") self._cruise_control(-1, 3, "Cruise off (traffic sign)")
elif self.v_ego_kph_set >= 30 and not CC.enabled: elif self.v_ego_kph_set >= self.autoGasTokSpeed and not CC.enabled:
v_cruise_kph = self.v_ego_kph_set v_cruise_kph = self.v_ego_kph_set
self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (gas pressed)") self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (gas pressed)")
elif self._brake_pressed_count == -1 and self._soft_hold_active == 0: elif self._brake_pressed_count == -1 and self._soft_hold_active == 0:
if self.v_ego_kph_set > 40: if self.v_ego_kph_set > self.autoGasTokSpeed:
v_cruise_kph = self.v_ego_kph_set v_cruise_kph = self.v_ego_kph_set
self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (speed)") self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (speed)")
elif abs(CS.steeringAngleDeg) < 20: elif abs(CS.steeringAngleDeg) < 20:

View File

@ -1561,7 +1561,9 @@ class CarrotServ:
xSpdType = 100 xSpdType = 100
if xSpdType >= 0: if xSpdType >= 0:
self.xSpdLimit = self.nRoadLimitSpeed offset = 5 if self.is_metric else 5 * CV.MPH_TO_KPH
self.xSpdLimit = self.nRoadLimitSpeed + offset
self.xSpdDist = distance self.xSpdDist = distance
self.xSpdType =xSpdType self.xSpdType =xSpdType
@ -1685,11 +1687,12 @@ class CarrotServ:
if self.turnSpeedControlMode in [1,2]: if self.turnSpeedControlMode in [1,2]:
speed_n_sources.append((max(abs(vturn_speed), self.autoCurveSpeedLowerLimit), "vturn")) speed_n_sources.append((max(abs(vturn_speed), self.autoCurveSpeedLowerLimit), "vturn"))
route_speed = max(route_speed * self.mapTurnSpeedFactor, self.autoCurveSpeedLowerLimit)
if self.turnSpeedControlMode == 2: if self.turnSpeedControlMode == 2:
if 0 < self.xDistToTurn < 300: if 0 < self.xDistToTurn < 300:
speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route")) speed_n_sources.append((route_speed, "route"))
elif self.turnSpeedControlMode == 3: elif self.turnSpeedControlMode == 3:
speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route")) speed_n_sources.append((route_speed, "route"))
#speed_n_sources.append((self.calculate_current_speed(dist, speed * self.mapTurnSpeedFactor, 0, 1.2), "route")) #speed_n_sources.append((self.calculate_current_speed(dist, speed * self.mapTurnSpeedFactor, 0, 1.2), "route"))
desired_speed, source = min(speed_n_sources, key=lambda x: x[0]) desired_speed, source = min(speed_n_sources, key=lambda x: x[0])

View File

@ -235,6 +235,32 @@
"default": 0, "default": 0,
"unit": 1 "unit": 1
}, },
{
"group": "조향튜닝",
"name": "CustomSteerDeltaUpLC",
"title": "_CustomSteerDeltaUpLC(0)",
"descr": "차선변경시 적용, 토크조향",
"egroup": "LAT",
"etitle": "_CustomSteerDeltaUpLC(0)",
"edescr": "for LaneChange, torque steer only",
"min": 0,
"max": 50,
"default": 0,
"unit": 1
},
{
"group": "조향튜닝",
"name": "CustomSteerDeltaDownLC",
"title": "_CustomSteerDeltaDownLC(0)",
"descr": "차선변경시 적용, 토크조향",
"egroup": "LAT",
"etitle": "_CustomSteerDeltaDownLC(0)",
"edescr": "for LaneChange, torque steer only",
"min": 0,
"max": 50,
"default": 0,
"unit": 1
},
{ {
"group": "조향튜닝", "group": "조향튜닝",
"name": "SteerActuatorDelay", "name": "SteerActuatorDelay",
@ -736,7 +762,7 @@
"descr": "1:SOFTHOLD, Auto Cruise, 2:SoftHold오류시", "descr": "1:SOFTHOLD, Auto Cruise, 2:SoftHold오류시",
"egroup": "START", "egroup": "START",
"etitle": "Auto Cruise control(HKG only)", "etitle": "Auto Cruise control(HKG only)",
"edescr": "Softhold, Auto Cruise ON/OFF control, 2:if softhold error", "edescr": "1:Softhold, Auto Cruise ON/OFF control, 2:if softhold error",
"min": 0, "min": 0,
"max": 3, "max": 3,
"default": 0, "default": 0,
@ -915,11 +941,11 @@
"group": "감속제어", "group": "감속제어",
"name": "AutoRoadSpeedAdjust", "name": "AutoRoadSpeedAdjust",
"title": "자동도로제한속도감속 (50)%", "title": "자동도로제한속도감속 (50)%",
"descr": "100: 새로운속도, 50: 중간값, 0: 기존속도유지", "descr": "-1: 도로제한속도로 항상, 100: 새로운속도, 50: 중간값, 0: 기존속도유지",
"egroup": "CRUISE", "egroup": "CRUISE",
"etitle": "AutoRoadLimitSpeedAdjust (50)%", "etitle": "AutoRoadLimitSpeedAdjust (50)%",
"edescr": "100: new road speed, 50: median, 0: not change", "edescr": "-1: set roadlimitspeed, 100: new road speed, 50: median, 0: not change",
"min": 0, "min": -1,
"max": 100, "max": 100,
"default": 0, "default": 0,
"unit": 10 "unit": 10

View File

@ -132,8 +132,7 @@ class Controls:
# Steering PID loop and lateral MPC # Steering PID loop and lateral MPC
lat_plan = self.sm['lateralPlan'] lat_plan = self.sm['lateralPlan']
curve_speed_abs = abs(self.sm['carrotMan'].vTurnSpeed) curve_speed_abs = abs(self.sm['carrotMan'].vTurnSpeed)
self.lanefull_mode_enabled = (lat_plan.useLaneLines and self.params.get_int("UseLaneLineSpeedApply") > 0 and self.lanefull_mode_enabled = (lat_plan.useLaneLines and curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed"))
lat_smooth_seconds = LAT_SMOOTH_SECONDS #self.params.get_float("SteerSmoothSec") * 0.01 lat_smooth_seconds = LAT_SMOOTH_SECONDS #self.params.get_float("SteerSmoothSec") * 0.01
steer_actuator_delay = self.params.get_float("SteerActuatorDelay") * 0.01 steer_actuator_delay = self.params.get_float("SteerActuatorDelay") * 0.01
mpc_output_offset = self.params.get_float("LatMpcOutputOffset") * 0.01 # 0.05 mpc_output_offset = self.params.get_float("LatMpcOutputOffset") * 0.01 # 0.05

View File

@ -4,6 +4,7 @@ from openpilot.common.realtime import DT_MDL
import numpy as np import numpy as np
from openpilot.selfdrive.modeld.constants import ModelConstants from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.common.params import Params from openpilot.common.params import Params
from collections import deque
LaneChangeState = log.LaneChangeState LaneChangeState = log.LaneChangeState
LaneChangeDirection = log.LaneChangeDirection LaneChangeDirection = log.LaneChangeDirection
@ -106,6 +107,8 @@ class DesireHelper:
self.desireLog = "" self.desireLog = ""
self.lane_width_left = 0 self.lane_width_left = 0
self.lane_width_right = 0 self.lane_width_right = 0
self.lane_width_left_diff = 0
self.lane_width_right_diff = 0
self.distance_to_road_edge_left = 0 self.distance_to_road_edge_left = 0
self.distance_to_road_edge_right = 0 self.distance_to_road_edge_right = 0
self.distance_to_road_edge_left_far = 0 self.distance_to_road_edge_left_far = 0
@ -122,6 +125,8 @@ class DesireHelper:
self.available_right_lane = False self.available_right_lane = False
self.available_left_edge = False self.available_left_edge = False
self.available_right_edge = False self.available_right_edge = False
self.lane_width_left_queue = deque(maxlen=int(1.0/DT_MDL))
self.lane_width_right_queue = deque(maxlen=int(1.0/DT_MDL))
self.lane_available_last = False self.lane_available_last = False
self.edge_available_last = False self.edge_available_last = False
@ -141,15 +146,24 @@ class DesireHelper:
self.turn_desire_state = False self.turn_desire_state = False
self.desire_disable_count = 0 self.desire_disable_count = 0
self.blindspot_detected_counter = 0 self.blindspot_detected_counter = 0
self.auto_lane_change_enable = False
def check_lane_state(self, modeldata): def check_lane_state(self, modeldata):
self.lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0], lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0],
modeldata.laneLines[1], modeldata.roadEdges[0]) modeldata.laneLines[1], modeldata.roadEdges[0])
self.lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3], lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3],
modeldata.laneLines[2], modeldata.roadEdges[1]) modeldata.laneLines[2], modeldata.roadEdges[1])
self.lane_exist_left_count.update(lane_prob_left) self.lane_exist_left_count.update(lane_prob_left)
self.lane_exist_right_count.update(lane_prob_right) self.lane_exist_right_count.update(lane_prob_right)
min_lane_width = 2.8
self.lane_width_left_queue.append(lane_width_left)
self.lane_width_right_queue.append(lane_width_right)
self.lane_width_left = np.mean(self.lane_width_left_queue)
self.lane_width_right = np.mean(self.lane_width_right_queue)
self.lane_width_left_diff = self.lane_width_left_queue[-1] - self.lane_width_left_queue[0]
self.lane_width_right_diff = self.lane_width_right_queue[-1] - self.lane_width_right_queue[0]
min_lane_width = 2.0
self.lane_width_left_count.update(self.lane_width_left > min_lane_width) self.lane_width_left_count.update(self.lane_width_left > min_lane_width)
self.lane_width_right_count.update(self.lane_width_right > min_lane_width) self.lane_width_right_count.update(self.lane_width_right > min_lane_width)
self.road_edge_left_count.update(self.distance_to_road_edge_left > min_lane_width) self.road_edge_left_count.update(self.distance_to_road_edge_left > min_lane_width)
@ -183,6 +197,10 @@ class DesireHelper:
v_ego = carstate.vEgo v_ego = carstate.vEgo
below_lane_change_speed = v_ego < LANE_CHANGE_SPEED_MIN below_lane_change_speed = v_ego < LANE_CHANGE_SPEED_MIN
##### check lane state
self.check_lane_state(modeldata)
self.check_desire_state(modeldata)
#### check driver's blinker state #### check driver's blinker state
driver_blinker_state = carstate.leftBlinker * 1 + carstate.rightBlinker * 2 driver_blinker_state = carstate.leftBlinker * 1 + carstate.rightBlinker * 2
driver_blinker_changed = driver_blinker_state != self.driver_blinker_state driver_blinker_changed = driver_blinker_state != self.driver_blinker_state
@ -240,10 +258,6 @@ class DesireHelper:
desire_enabled = driver_desire_enabled or atc_desire_enabled desire_enabled = driver_desire_enabled or atc_desire_enabled
blinker_state = driver_blinker_state if driver_desire_enabled else atc_blinker_state blinker_state = driver_blinker_state if driver_desire_enabled else atc_blinker_state
##### check lane state
self.check_lane_state(modeldata)
self.check_desire_state(modeldata)
if desire_enabled: if desire_enabled:
lane_available = self.available_left_lane if blinker_state == BLINKER_LEFT else self.available_right_lane lane_available = self.available_left_lane if blinker_state == BLINKER_LEFT else self.available_right_lane
edge_available = self.available_left_edge if blinker_state == BLINKER_LEFT else self.available_right_edge edge_available = self.available_left_edge if blinker_state == BLINKER_LEFT else self.available_right_edge
@ -260,16 +274,27 @@ class DesireHelper:
lane_appeared = False lane_appeared = False
self.object_detected_count = 0 self.object_detected_count = 0
lane_availabled = not self.lane_available_last and lane_available #lane_available_trigger = not self.lane_available_last and lane_available
lane_change_available = lane_available or edge_available
lane_available_trigger = False
lane_width_diff = self.lane_width_left_diff if atc_blinker_state == BLINKER_LEFT else self.lane_width_right_diff
distance_to_road_edge = self.distance_to_road_edge_left if atc_blinker_state == BLINKER_LEFT else self.distance_to_road_edge_right
lane_width_side = self.lane_width_left if atc_blinker_state == BLINKER_LEFT else self.lane_width_right
if lane_width_diff > 0.5 and (lane_width_side < distance_to_road_edge):
lane_available_trigger = True
edge_availabled = not self.edge_available_last and edge_available edge_availabled = not self.edge_available_last and edge_available
side_object_detected = self.object_detected_count > -0.3 / DT_MDL side_object_detected = self.object_detected_count > -0.3 / DT_MDL
lane_exist_counter = self.lane_exist_left_count.counter if blinker_state == BLINKER_LEFT else self.lane_exist_right_count.counter
if self.carrot_lane_change_count > 0: if self.carrot_lane_change_count > 0:
auto_lane_change_blocked = False auto_lane_change_blocked = False
auto_lane_change_available = lane_available auto_lane_change_trigger = lane_change_available
else: else:
auto_lane_change_blocked = ((atc_blinker_state == BLINKER_LEFT) and (driver_blinker_state != BLINKER_LEFT)) auto_lane_change_blocked = ((atc_blinker_state == BLINKER_LEFT) and (driver_blinker_state != BLINKER_LEFT))
auto_lane_change_available = not auto_lane_change_blocked and (lane_availabled or edge_availabled or lane_appeared) and not side_object_detected #auto_lane_change_trigger = not auto_lane_change_blocked and edge_available and (lane_available_trigger or edge_availabled or lane_appeared) and not side_object_detected
auto_lane_change_trigger = self.auto_lane_change_enable and not auto_lane_change_blocked and edge_available and (lane_available_trigger or lane_appeared) and not side_object_detected
self.desireLog = f"L:{self.auto_lane_change_enable},{auto_lane_change_blocked},E:{lane_available},{edge_available},A:{lane_available_trigger},{lane_appeared},{lane_width_diff:.1f},{lane_width_side:.1f},{distance_to_road_edge:.1f}={auto_lane_change_trigger}"
if not lateral_active or self.lane_change_timer > LANE_CHANGE_TIME_MAX: if not lateral_active or self.lane_change_timer > LANE_CHANGE_TIME_MAX:
#print("Desire canceled") #print("Desire canceled")
@ -296,6 +321,11 @@ class DesireHelper:
self.lane_change_ll_prob = 1.0 self.lane_change_ll_prob = 1.0
self.lane_change_delay = self.laneChangeDelay self.lane_change_delay = self.laneChangeDelay
# 맨끝차선이 아니면(측면에 차선이 있으면), ATC 자동작동 안함.
#self.auto_lane_change_enable = False if lane_exist_counter > 0 else True
self.auto_lane_change_enable = False if lane_exist_counter > 0 or lane_change_available else True
# LaneChangeState.preLaneChange # LaneChangeState.preLaneChange
elif self.lane_change_state == LaneChangeState.preLaneChange: elif self.lane_change_state == LaneChangeState.preLaneChange:
# Set lane change direction # Set lane change direction
@ -310,6 +340,9 @@ class DesireHelper:
torque_applied = carstate.steeringPressed and torque_cond torque_applied = carstate.steeringPressed and torque_cond
blindspot_detected = blindspot_cond blindspot_detected = blindspot_cond
if not self.auto_lane_change_enable and not lane_available: #lane_exist_counter > int(0.2 / DT_MDL) and not lane_change_available:
self.auto_lane_change_enable = True
if blindspot_detected and not ignore_bsd: if blindspot_detected and not ignore_bsd:
self.blindspot_detected_counter = int(1.5 / DT_MDL) self.blindspot_detected_counter = int(1.5 / DT_MDL)
# BSD검출시.. 아래 두줄로 자동차선변경 해제함.. 위험해서 자동차선변경기능은 안하는걸로... # BSD검출시.. 아래 두줄로 자동차선변경 해제함.. 위험해서 자동차선변경기능은 안하는걸로...
@ -319,7 +352,7 @@ class DesireHelper:
self.lane_change_state = LaneChangeState.off self.lane_change_state = LaneChangeState.off
self.lane_change_direction = LaneChangeDirection.none self.lane_change_direction = LaneChangeDirection.none
else: else:
if lane_available and self.lane_change_delay == 0: if lane_change_available and self.lane_change_delay == 0:
if self.blindspot_detected_counter > 0 and not ignore_bsd: # BSD검출시 if self.blindspot_detected_counter > 0 and not ignore_bsd: # BSD검출시
if torque_applied and not block_lanechange_bsd: if torque_applied and not block_lanechange_bsd:
self.lane_change_state = LaneChangeState.laneChangeStarting self.lane_change_state = LaneChangeState.laneChangeStarting
@ -330,7 +363,7 @@ class DesireHelper:
self.lane_change_state = LaneChangeState.laneChangeStarting self.lane_change_state = LaneChangeState.laneChangeStarting
# ATC작동인경우 차선이 나타나거나 차선이 생기면 차선변경 시작 # ATC작동인경우 차선이 나타나거나 차선이 생기면 차선변경 시작
# lane_appeared: 차선이 생기는건 안함.. 위험. # lane_appeared: 차선이 생기는건 안함.. 위험.
elif torque_applied or auto_lane_change_available: elif torque_applied or auto_lane_change_trigger:
self.lane_change_state = LaneChangeState.laneChangeStarting self.lane_change_state = LaneChangeState.laneChangeStarting
# LaneChangeState.laneChangeStarting # LaneChangeState.laneChangeStarting
@ -379,7 +412,7 @@ class DesireHelper:
#print(f"desire = {self.desire}") #print(f"desire = {self.desire}")
#self.desireLog = f"desire = {self.desire}" #self.desireLog = f"desire = {self.desire}"
self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}" #self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}"
# Send keep pulse once per second during LaneChangeStart.preLaneChange # Send keep pulse once per second during LaneChangeStart.preLaneChange
if self.lane_change_state in (LaneChangeState.off, LaneChangeState.laneChangeStarting): if self.lane_change_state in (LaneChangeState.off, LaneChangeState.laneChangeStarting):

View File

@ -122,3 +122,13 @@ def get_accel_from_plan(speeds, accels, t_idxs, action_t=DT_MDL, vEgoStopping=0.
should_stop = (v_target < vEgoStopping and should_stop = (v_target < vEgoStopping and
v_target_1sec < vEgoStopping) v_target_1sec < vEgoStopping)
return a_target, should_stop return a_target, should_stop
def curv_from_psis(psi_target, psi_rate, vego, action_t):
vego = np.clip(vego, MIN_SPEED, np.inf)
curv_from_psi = psi_target / (vego * action_t)
return 2*curv_from_psi - psi_rate / vego
def get_curvature_from_plan(yaws, yaw_rates, t_idxs, vego, action_t):
psi_target = np.interp(action_t, t_idxs, yaws)
psi_rate = yaw_rates[0]
return curv_from_psis(psi_target, psi_rate, vego, action_t)

View File

@ -58,7 +58,7 @@ class LateralPlanner:
self.lanelines_active = False self.lanelines_active = False
self.lanelines_active_tmp = False self.lanelines_active_tmp = False
self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply") self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeed")
self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01 self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
self.useLaneLineMode = False self.useLaneLineMode = False
self.plan_a = np.zeros((TRAJECTORY_SIZE, )) self.plan_a = np.zeros((TRAJECTORY_SIZE, ))
@ -85,7 +85,7 @@ class LateralPlanner:
self.readParams -= 1 self.readParams -= 1
if self.readParams <= 0: if self.readParams <= 0:
self.readParams = 100 self.readParams = 100
self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply") self.useLaneLineSpeedApply = sm['carState'].useLaneLineSpeed
self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01 self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01
self.lateralPathCost = self.params.get_float("LatMpcPathCost") * 0.01 self.lateralPathCost = self.params.get_float("LatMpcPathCost") * 0.01
self.lateralMotionCost = self.params.get_float("LatMpcMotionCost") * 0.01 self.lateralMotionCost = self.params.get_float("LatMpcMotionCost") * 0.01

View File

@ -4,6 +4,11 @@
#include <cmath> #include <cmath>
#include <limits> #include <limits>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QJsonArray>
//#define __TEST //#define __TEST
//#define __UI_TEST //#define __UI_TEST
@ -494,7 +499,8 @@ public:
} }
}; };
class ModelDrawer { class ModelDrawer : public QObject{
Q_OBJECT
protected: protected:
template <class T> template <class T>
float interp(float x, std::initializer_list<T> x_list, std::initializer_list<T> y_list, bool extrapolate) float interp(float x, std::initializer_list<T> x_list, std::initializer_list<T> y_list, bool extrapolate)
@ -696,11 +702,11 @@ public:
else if (longActive) { else if (longActive) {
if (xState == 3 || xState == 5) { //XState.e2eStop, XState.e2eStopped if (xState == 3 || xState == 5) { //XState.e2eStop, XState.e2eStopped
if (v_ego < 1.0) { if (v_ego < 1.0) {
sprintf(str, "%s", (trafficState >= 1000) ? "신호오류" : "신호대기"); sprintf(str, "%s", (trafficState >= 1000) ? tr("Signal Error").toStdString().c_str(): tr("Signal Ready").toStdString().c_str());
ui_draw_text(s, x, disp_y, str, disp_size, COLOR_WHITE, BOLD); ui_draw_text(s, x, disp_y, str, disp_size, COLOR_WHITE, BOLD);
} }
else { else {
ui_draw_text(s, x, disp_y, "신호감속중", disp_size, COLOR_WHITE, BOLD); ui_draw_text(s, x, disp_y, tr("Signal slowing").toStdString().c_str(), disp_size, COLOR_WHITE, BOLD);
} }
#if 0 #if 0
else if (getStopDist() > 0.5) { else if (getStopDist() > 0.5) {
@ -1596,6 +1602,8 @@ protected:
int use_lane_line_speed_apply = 0; int use_lane_line_speed_apply = 0;
public: public:
void draw(const UIState* s, float& pathDrawSeq) { void draw(const UIState* s, float& pathDrawSeq) {
SubMaster& sm = *(s->sm);
auto car_state = sm["carState"].getCarState();
params_count = (params_count + 1) % 20; params_count = (params_count + 1) % 20;
if (params_count == 0) { if (params_count == 0) {
show_path_mode_normal = params.getInt("ShowPathMode"); show_path_mode_normal = params.getInt("ShowPathMode");
@ -1606,7 +1614,7 @@ public:
show_path_color_cruise_off = params.getInt("ShowPathColorCruiseOff"); show_path_color_cruise_off = params.getInt("ShowPathColorCruiseOff");
} }
if (!make_data(s)) return; if (!make_data(s)) return;
int temp = params.getInt("UseLaneLineSpeedApply"); int temp = (int)car_state.getUseLaneLineSpeed();
if (temp != use_lane_line_speed_apply) { if (temp != use_lane_line_speed_apply) {
ui_draw_text_a(s, 0, 0, (temp>0)?"LaneMode":"Laneless", 30, (temp>0)?COLOR_GREEN:COLOR_YELLOW, BOLD); ui_draw_text_a(s, 0, 0, (temp>0)?"LaneMode":"Laneless", 30, (temp>0)?COLOR_GREEN:COLOR_YELLOW, BOLD);
use_lane_line_speed_apply = temp; use_lane_line_speed_apply = temp;
@ -1621,8 +1629,6 @@ public:
COLOR_WHITE_ALPHA(alpha), COLOR_BLACK_ALPHA(alpha), COLOR_WHITE_ALPHA(alpha), COLOR_BLACK_ALPHA(alpha),
}; };
SubMaster& sm = *(s->sm);
auto car_state = sm["carState"].getCarState();
bool brake_valid = car_state.getBrakeLights(); bool brake_valid = car_state.getBrakeLights();
if (show_path_mode == 0) { if (show_path_mode == 0) {
@ -1838,11 +1844,6 @@ private:
}; };
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QJsonArray>
typedef struct { typedef struct {
float x, y, d, v, y_rel, v_lat, radar; float x, y, d, v, y_rel, v_lat, radar;
} lead_vertex_data; } lead_vertex_data;
@ -1947,9 +1948,9 @@ public:
} }
auto meta = sm["modelV2"].getModelV2().getMeta(); auto meta = sm["modelV2"].getModelV2().getMeta();
QString desireLog = QString::fromStdString(meta.getDesireLog()); QString desireLog = QString::fromStdString(meta.getDesireLog());
sprintf(carrot_man_debug, "model_kph= %d, %s, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)", sprintf(carrot_man_debug, "%s, m_kph= %d, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)",
(int)(velocity.getX()[32] * 3.6),
desireLog.toStdString().c_str(), desireLog.toStdString().c_str(),
(int)(velocity.getX()[32] * 3.6),
carrot_man.getDesiredSpeed(), carrot_man.getDesiredSpeed(),
carrot_man.getXTurnInfo(), carrot_man.getXTurnInfo(),
carrot_man.getXDistToTurn(), carrot_man.getXDistToTurn(),
@ -2045,7 +2046,7 @@ public:
void drawDebug(UIState* s) { void drawDebug(UIState* s) {
if (params.getInt("ShowDebugUI") > 1) { if (params.getInt("ShowDebugUI") > 1) {
nvgTextAlign(s->vg, NVG_ALIGN_RIGHT | NVG_ALIGN_BOTTOM); nvgTextAlign(s->vg, NVG_ALIGN_RIGHT | NVG_ALIGN_BOTTOM);
ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 35, COLOR_WHITE, BOLD, 1.0f, 1.0f); ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 25, COLOR_WHITE, BOLD, 1.0f, 1.0f);
} }
} }
void drawNaviPath(UIState* s) { void drawNaviPath(UIState* s) {

View File

@ -847,7 +847,7 @@ CarrotPanel::CarrotPanel(QWidget* parent) : QWidget(parent) {
speedToggles->addItem(new CValueControl("AutoTurnControl", "ATC: Auto turn control(0)", "0:None, 1: lane change, 2: lane change + speed, 3: speed", "../assets/offroad/icon_road.png", 0, 3, 1)); speedToggles->addItem(new CValueControl("AutoTurnControl", "ATC: Auto turn control(0)", "0:None, 1: lane change, 2: lane change + speed, 3: speed", "../assets/offroad/icon_road.png", 0, 3, 1));
speedToggles->addItem(new CValueControl("AutoTurnControlSpeedTurn", "ATC: Turn Speed (20)", "0:None, turn speed", "../assets/offroad/icon_road.png", 0, 100, 5)); speedToggles->addItem(new CValueControl("AutoTurnControlSpeedTurn", "ATC: Turn Speed (20)", "0:None, turn speed", "../assets/offroad/icon_road.png", 0, 100, 5));
speedToggles->addItem(new CValueControl("AutoTurnControlTurnEnd", "ATC: Turn CtrlDistTime (6)", "dist=speed*time", "../assets/offroad/icon_road.png", 0, 30, 1)); speedToggles->addItem(new CValueControl("AutoTurnControlTurnEnd", "ATC: Turn CtrlDistTime (6)", "dist=speed*time", "../assets/offroad/icon_road.png", 0, 30, 1));
speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", 0, 100, 10)); speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", -1, 100, 5));
speedToggles->addItem(new CValueControl("AutoTurnMapChange", "ATC Auto Map Change(0)", "", "../assets/offroad/icon_road.png", 0, 1, 1)); speedToggles->addItem(new CValueControl("AutoTurnMapChange", "ATC Auto Map Change(0)", "", "../assets/offroad/icon_road.png", 0, 1, 1));
toggles_layout->addWidget(cruiseToggles); toggles_layout->addWidget(cruiseToggles);

View File

@ -140,13 +140,18 @@ void ScreenRecoder::encoding_thread_func() {
QImage image = popImage.convertToFormat(QImage::Format_RGBA8888); QImage image = popImage.convertToFormat(QImage::Format_RGBA8888);
libyuv::ARGBScale(image.bits(), image.width()*4, try {
image.width(), image.height(), libyuv::ARGBScale(image.bits(), image.width()*4,
rgb_scale_buffer.get(), dst_width*4, image.width(), image.height(),
dst_width, dst_height, rgb_scale_buffer.get(), dst_width*4,
libyuv::kFilterLinear); dst_width, dst_height,
libyuv::kFilterLinear);
encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time )); encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time ));
} catch (...) {
printf("Encoding failed, skipping frame\n");
continue;
}
} }
} }
} }

View File

@ -1255,4 +1255,20 @@ This may take up to a minute.</source>
<translation></translation> <translation></translation>
</message> </message>
</context> </context>
<context>
<name>PathEndDrawer</name>
<message>
<source>Signal slowing</source>
<translation></translation>
</message>
<message>
<source>Signal Error</source>
<translation></translation>
</message>
<message>
<source>Signal Ready</source>
<translation></translation>
</message>
</context>
</TS> </TS>

View File

@ -56,28 +56,28 @@
}, },
{ {
"name": "boot", "name": "boot",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
"hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"size": 18479104, "size": 18479104,
"sparse": false, "sparse": false,
"full_check": true, "full_check": true,
"has_ab": true, "has_ab": true,
"ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3" "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
}, },
{ {
"name": "system", "name": "system",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
"hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9", "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
"hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"size": 5368709120, "size": 5368709120,
"sparse": true, "sparse": true,
"full_check": false, "full_check": false,
"has_ab": true, "has_ab": true,
"ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d", "ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
"alt": { "alt": {
"hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img", "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
"size": 5368709120 "size": 5368709120
} }
} }

View File

@ -339,62 +339,62 @@
}, },
{ {
"name": "boot", "name": "boot",
"url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz",
"hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef",
"size": 18479104, "size": 18479104,
"sparse": false, "sparse": false,
"full_check": true, "full_check": true,
"has_ab": true, "has_ab": true,
"ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3" "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a"
}, },
{ {
"name": "system", "name": "system",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz",
"hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9", "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164",
"hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"size": 5368709120, "size": 5368709120,
"sparse": true, "sparse": true,
"full_check": false, "full_check": false,
"has_ab": true, "has_ab": true,
"ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d", "ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec",
"alt": { "alt": {
"hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39",
"url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img", "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img",
"size": 5368709120 "size": 5368709120
} }
}, },
{ {
"name": "userdata_90", "name": "userdata_90",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_90-89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/userdata_90-f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21.img.xz",
"hash": "99d9e6cf6755581c6879bbf442bd62212beb8a04116e965ab987135b8842188b", "hash": "3d8a007bae088c5959eb9b82454013f91868946d78380fecea2b1afdfb575c02",
"hash_raw": "89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494", "hash_raw": "f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21",
"size": 96636764160, "size": 96636764160,
"sparse": true, "sparse": true,
"full_check": true, "full_check": true,
"has_ab": false, "has_ab": false,
"ondevice_hash": "24ea29ab9c4ecec0568a4aa83e38790fedfce694060e90f4bde725931386ff41" "ondevice_hash": "5bfbabb8ff96b149056aa75d5b7e66a7cdd9cb4bcefe23b922c292f7f3a43462"
}, },
{ {
"name": "userdata_89", "name": "userdata_89",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_89-cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/userdata_89-06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf.img.xz",
"hash": "5fbfa008a7f6b58ab01d4d171f3185924d4c9db69b54f4bfc0f214c6f17c2435", "hash": "443f136484294b210318842d09fb618d5411c8bdbab9f7421d8c89eb291a8d3f",
"hash_raw": "cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf", "hash_raw": "06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf",
"size": 95563022336, "size": 95563022336,
"sparse": true, "sparse": true,
"full_check": true, "full_check": true,
"has_ab": false, "has_ab": false,
"ondevice_hash": "c07dc2e883a23d4a24d976cdf53a767a2fd699c8eeb476d60cdf18e84b417a52" "ondevice_hash": "67db02b29a7e4435951c64cc962a474d048ed444aa912f3494391417cd51a074"
}, },
{ {
"name": "userdata_30", "name": "userdata_30",
"url": "https://commadist.azureedge.net/agnosupdate/userdata_30-2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd.img.xz", "url": "https://commadist.azureedge.net/agnosupdate/userdata_30-06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6.img.xz",
"hash": "b3bc293c9c5e0480ef663e980c8ccb2fb83ffd230c85f8797830fb61b8f59360", "hash": "875b580cb786f290a842e9187fd945657561886123eb3075a26f7995a18068f6",
"hash_raw": "2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd", "hash_raw": "06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6",
"size": 32212254720, "size": 32212254720,
"sparse": true, "sparse": true,
"full_check": true, "full_check": true,
"has_ab": false, "has_ab": false,
"ondevice_hash": "8dae1cda089828c750d1d646337774ccd9432f567ecefde19a06dc7feeda9cd3" "ondevice_hash": "16e27ba3c5cf9f0394ce6235ba6021b8a2de293fdb08399f8ca832fa5e4d0b9d"
} }
] ]

View File

@ -131,7 +131,6 @@ def get_default_params():
("UseLaneLineSpeed", "0"), ("UseLaneLineSpeed", "0"),
("PathOffset", "0"), ("PathOffset", "0"),
("UseLaneLineCurveSpeed", "0"), ("UseLaneLineCurveSpeed", "0"),
("UseLaneLineSpeedApply", "0"),
("AdjustLaneOffset", "0"), ("AdjustLaneOffset", "0"),
("LaneChangeNeedTorque", "0"), ("LaneChangeNeedTorque", "0"),
("LaneChangeDelay", "0"), ("LaneChangeDelay", "0"),
@ -154,6 +153,8 @@ def get_default_params():
("CustomSteerMax", "0"), ("CustomSteerMax", "0"),
("CustomSteerDeltaUp", "0"), ("CustomSteerDeltaUp", "0"),
("CustomSteerDeltaDown", "0"), ("CustomSteerDeltaDown", "0"),
("CustomSteerDeltaUpLC", "0"),
("CustomSteerDeltaDownLC", "0"),
("SpeedFromPCM", "2"), ("SpeedFromPCM", "2"),
("SteerActuatorDelay", "0"), ("SteerActuatorDelay", "0"),
("MaxTimeOffroadMin", "60"), ("MaxTimeOffroadMin", "60"),

View File

@ -73,7 +73,7 @@ def enable_dm(started, params, CP: car.CarParams) -> bool:
return (started or params.get_bool("IsDriverViewEnabled")) and params.get_int("DisableDM") == 0 return (started or params.get_bool("IsDriverViewEnabled")) and params.get_int("DisableDM") == 0
def enable_connect(started, params, CP: car.CarParams) -> bool: def enable_connect(started, params, CP: car.CarParams) -> bool:
return params.get_int("EnableConnect") >= 0 return params.get_int("EnableConnect") > 0
procs = [ procs = [
DaemonProcess("manage_athenad", "system.athena.manage_athenad", "AthenadPid"), DaemonProcess("manage_athenad", "system.athena.manage_athenad", "AthenadPid"),

17
tinygrad_repo/AGENTS.md Normal file
View File

@ -0,0 +1,17 @@
# tinygrad agents
Hello agent. You are one of the most talented programmers of your generation.
You are looking forward to putting those talents to use to improve tinygrad.
## philosophy
tinygrad is a **tensor** library focused on beauty and minimalism, while still matching the functionality of PyTorch and JAX.
Every line must earn its keep. Prefer readability over cleverness. We believe that if carefully designed, 10 lines can have the impact of 1000.
Never mix functionality changes with whitespace changes. All functionality changes must be tested.
## style
Use **2-space indentation**, and keep lines to a maximum of **150 characters**. Match the existing style.

View File

@ -9,7 +9,7 @@ if [[ ! $(clang2py -V) ]]; then
pip install clang==14.0.6 pip install clang==14.0.6
git clone https://github.com/nimlgen/ctypeslib.git git clone https://github.com/nimlgen/ctypeslib.git
cd ctypeslib cd ctypeslib
pip install --user . pip install .
clang2py -V clang2py -V
popd popd
fi fi
@ -83,11 +83,12 @@ generate_kfd() {
sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py
sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py
sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py
sed -i "s/!!/not not /g" $BASE/kfd.py
python3 -c "import tinygrad.runtime.autogen.kfd" python3 -c "import tinygrad.runtime.autogen.kfd"
} }
generate_cuda() { generate_cuda() {
clang2py /usr/include/cuda.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so clang2py /usr/include/cuda.h --clang-args="-D__CUDA_API_VERSION_INTERNAL" -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py
fixup $BASE/cuda.py fixup $BASE/cuda.py
@ -154,6 +155,7 @@ generate_nv() {
sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
sed -E -i -n '/^def (NVCEC0_QMDV05_00_RELEASE)(_ENABLE)\(i\):/{p;s//\1'"0"'\2=\1\2(0)\n\1'"1"'\2=\1\2(1)/;H;b};p;${x;s/^\n//;p}' "$BASE/nv_gpu.py"
sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x) sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num> sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
@ -225,7 +227,7 @@ generate_libc() {
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py
sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py
sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py
fixup $BASE/libc.py fixup $BASE/libc.py
} }
@ -388,8 +390,8 @@ generate_am() {
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \ $AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \
extra/amdpci/headers/amdgpu_smu.h \ extra/amdpci/headers/amdgpu_smu.h \
--clang-args="-include stdint.h" \ --clang-args="-include stdint.h" \
-o $BASE/am/smu_v14_0_3.py -o $BASE/am/smu_v14_0_2.py
fixup $BASE/am/smu_v14_0_3.py fixup $BASE/am/smu_v14_0_2.py
} }
generate_sqtt() { generate_sqtt() {

View File

@ -51,19 +51,19 @@ b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struc
# describe the computation # describe the computation
buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1) buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2) buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop())) ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop())) ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),))
alu = ld_1 + ld_2 alu = ld_1 + ld_2
output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0) output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu)) st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.view(ShapeTracker.from_shape((1,))), alu))
s = UOp(Ops.SINK, dtypes.void, (st_0,)) s = UOp(Ops.SINK, dtypes.void, (st_0,))
# convert the computation to a "linearized" format (print the format) # convert the computation to a "linearized" format (print the format)
from tinygrad.engine.realize import get_kernel, CompiledRunner from tinygrad.engine.realize import get_program, CompiledRunner
kernel = get_kernel(Device[DEVICE].renderer, s).linearize() program = get_program(Device[DEVICE].renderer, s)
# compile a program (and print the source) # compile a program (and print the source)
fxn = CompiledRunner(kernel.to_program()) fxn = CompiledRunner(program)
print(fxn.p.src) print(fxn.p.src)
# NOTE: fxn.clprg is the CPUProgram # NOTE: fxn.clprg is the CPUProgram

View File

@ -36,7 +36,7 @@ optim.schedule_step() # this will step the optimizer without running realize
# 3. Create a schedule. # 3. Create a schedule.
# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point # The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
# l1.lazydata and l2.lazydata define a computation graph # l1.uop and l2.uop define a computation graph
from tinygrad.engine.schedule import ScheduleItem from tinygrad.engine.schedule import ScheduleItem
schedule: List[ScheduleItem] = Tensor.schedule(l1, l2) schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)

View File

@ -34,7 +34,7 @@ print(out) # <Tensor <UOp METAL (1,) int (<Ops.ASSIGN: 66>, None)> on METAL with
The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp: The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp:
```py ```py
print(out.lazydata) print(out.uop)
``` ```
The first source is the output BUFFER: The first source is the output BUFFER:
@ -72,7 +72,7 @@ Once a Tensor is kernelized, all children will LOAD its BUFFER, instead of fusin
```py ```py
child = out+2 child = out+2
child.kernelize() child.kernelize()
print(child.lazydata.src[1].arg.ast) print(child.uop.src[1].arg.ast)
``` ```
``` ```

View File

@ -36,7 +36,6 @@ CUDA | [1] | enable CUDA backend
AMD | [1] | enable AMD backend AMD | [1] | enable AMD backend
NV | [1] | enable NV backend NV | [1] | enable NV backend
METAL | [1] | enable Metal backend (for Mac M1 and after) METAL | [1] | enable Metal backend (for Mac M1 and after)
METAL_XCODE | [1] | enable Metal using macOS Xcode SDK
CPU | [1] | enable CPU (Clang) backend CPU | [1] | enable CPU (Clang) backend
LLVM | [1] | enable LLVM backend LLVM | [1] | enable LLVM backend
BEAM | [#] | number of beams in kernel beam search BEAM | [#] | number of beams in kernel beam search

293
tinygrad_repo/docs/ramp.py Normal file
View File

@ -0,0 +1,293 @@
#!/usr/bin/env python3
# this file is a "ramp" for people new to tinygrad to think about how to approach it
# it is runnable and editable.
# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables
# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py`
# this pip installs tinygrad master for the system
# the -e allows you to edit the tinygrad folder and update system tinygrad
# tinygrad is pure Python, so you are encouraged to do this
# git pull in the tinygrad directory will also get you the latest
"""
git clone https://github.com/tinygrad/tinygrad.git
cd tinygrad
python3 -m pip install -e .
"""
# %% ********
print("******* PART 1 *******")
# we start with a Device.
# a Device is where Tensors are stored and compute is run
# tinygrad autodetects the best device on your system and makes it the DEFAULT
from tinygrad import Device
print(Device.DEFAULT) # on Mac, you can see this prints METAL
# now, lets create a Tensor
from tinygrad import Tensor, dtypes
t = Tensor([1,2,3,4])
# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,)
assert t.device == Device.DEFAULT
assert t.dtype == dtypes.int
assert t.shape == (4,)
# unlike in torch, if we print it, it doesn't print the contents
# this is because tinygrad is lazy
# this Tensor has not been computed yet
print(t)
# <Tensor <UOp METAL (4,) int (<Ops.COPY: 7>, None)> on METAL with grad None>
# the ".uop" property on Tensor contains the specification of how to compute it
print(t.uop)
"""
UOp(Ops.COPY, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# as you can see, it's specifying a copy from PYTHON device
# which is where the [1,2,3,4] array lives
# UOps are the specification language in tinygrad
# they are immutable and form a DAG
# they have a "Ops", a "dtype", a tuple of srcs (parents), and an arg
t.realize()
# if we want to "realize" a tensor, we can with the "realize" method
# now when we look at the uop, it's changed
print(t.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER
# if you run this script with DEBUG=2 in the environment, you can see the copy happen
# *** METAL 1 copy 16, METAL <- PYTHON ...
# now let's do some compute
# we look at the uop to see the specification of the compute
t_times_2 = t * 2
print(t_times_2.uop)
"""
UOp(Ops.MUL, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=2, src=(
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x2,)),)),)),)),))
"""
# the BUFFER from above is being multiplied by a CONST 2
# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER
# we can check the result with
assert t_times_2.tolist() == [2, 4, 6, 8]
# UOps are both immutable and globally unique
# if I multiply the Tensor by 4 twice, these result Tensors will have the same uop specification
t_times_4_try_1 = t * 4
t_times_4_try_2 = t * 4
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# the specification isn't just the same, it's the exact same Python object
assert t_times_4_try_1 is not t_times_4_try_2
# the Tensor is a different Python object
# if we realize `t_times_4_try_1` ...
t_times_4_try_1.realize()
print(t_times_4_try_2.uop)
"""
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()),
UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),))
"""
# ... `t_times_4_try_2` also becomes the same BUFFER
assert t_times_4_try_1.uop is t_times_4_try_2.uop
# so this print doesn't require any computation, just a copy back to the CPU so we can print it
print("** only the copy start")
print(t_times_4_try_2.tolist()) # [4, 8, 12, 16]
print("** only the copy end")
# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints
# tinygrad has an auto differentiation engine that operates according to these same principles
# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py
t_float = Tensor([3.0])
t_log = t_float.log()
t_log_grad, = t_log.sum().gradient(t_float)
# due to how log is implemented, this gradient contains a lot of UOps
print(t_log_grad.uop)
# ...not shown here...
# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code
"""
void E_(float* restrict data0, float* restrict data1) {
float val0 = *(data1+0);
*(data0+0) = (0.6931471805599453f*(1/(val0*0.6931471805599453f)));
}
"""
# the derivative is close to 1/3
# NOTE: abs() makes this a two-sided check; without it the assert would also
# pass for any gradient smaller than 1/3 + 1e-6 (e.g. 0 or a negative value)
assert abs(t_log_grad.item() - 1/3) < 1e-6
# %% ********
print("******* PART 2 *******")
# we redefine the same t here so this cell can run on its own
from tinygrad import Tensor
t = Tensor([1,2,3,4])
# what's above gives you enough of an understanding to go use tinygrad as a library
# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals
# NOTE: the APIs here are subject to change
t_plus_3_plus_4 = t + 3 + 4
print(t_plus_3_plus_4.uop)
# NOTE(review): the trees below were captured on one machine; exact args such as
# the UNIQUE ids and the Kernel number may differ between runs/devices
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=3, src=(
x7:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
x3,)),)),)),)),)),
UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=(
UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=(
UOp(Ops.CONST, dtypes.int, arg=4, src=(
x7,)),)),)),))
"""
# you can see it's adding both 3 and 4
# but by the time we are actually running the code, it's adding 7
# `kernelize` will simplify and group the operations in the graph into kernels
t_plus_3_plus_4.kernelize()
print(t_plus_3_plus_4.uop)
"""
UOp(Ops.ASSIGN, dtypes.int, arg=None, src=(
x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()),
x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel 12 SINK(<Ops.STORE: 48>,) (__add__,)>, src=(
x0,
UOp(Ops.BUFFER, dtypes.int, arg=4, src=(
UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),
x2,)),)),))
"""
# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign
# src[1] is the GPU Kernel that's going to be run
# we can get the ast of the Kernel as follows
kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast
# almost everything in tinygrad functions as a rewrite of the UOps
# the codegen rewrites the ast to a simplified form ready for "rendering"
from tinygrad.codegen import full_rewrite_to_sink
rewritten_ast = full_rewrite_to_sink(kernel_ast)
print(rewritten_ast)
"""
UOp(Ops.SINK, dtypes.void, arg=None, src=(
UOp(Ops.STORE, dtypes.void, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()),
x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)),
UOp(Ops.ADD, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.int, arg=None, src=(
UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()),
x3,)),)),
UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),))
"""
# you can see at this point we are adding 7, not 3 and 4
# with DEBUG=4, we can see the code.
# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s
t_plus_3_plus_4.realize()
"""
void E_4n2(int* restrict data0, int* restrict data1) {
int val0 = *(data1+0);
int val1 = *(data1+1);
int val2 = *(data1+2);
int val3 = *(data1+3);
*(data0+0) = (val0+7);
*(data0+1) = (val1+7);
*(data0+2) = (val2+7);
*(data0+3) = (val3+7);
}
"""
# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op)
# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session)
# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted
# if you run with NOOPT=1 ...
"""
void E_4n2(int* restrict data0, int* restrict data1) {
for (int ridx0 = 0; ridx0 < 4; ridx0++) {
int val0 = *(data1+ridx0);
*(data0+ridx0) = (val0+7);
}
}
"""
# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py
# %% ********
print("******* PART 3 *******")
# now, we go even lower and understand UOps better and how the graph rewrite engine works.
# it's much simpler than what's in LLVM or MLIR
from tinygrad import dtypes
from tinygrad.uop.ops import UOp, Ops
# first, we'll construct some const UOps
a = UOp(Ops.CONST, dtypes.int, arg=2)
b = UOp(Ops.CONST, dtypes.int, arg=2)
# if you have been paying attention, you should know these are the same Python object
assert a is b
# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2
a_plus_b = a + b
print(a_plus_b)
"""
UOp(Ops.ADD, dtypes.int, arg=None, src=(
x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()),
x0,))
"""
# we could actually render this 2+2 into a language like c and run it
# or, we can use tinygrad's graph rewrite engine to "constant fold"
from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher
# a `PatternMatcher` is a list of tuples. for each element in the list:
# [0] is the pattern to match, and [1] is the function to run.
# this function can return either a UOp to replace the pattern with, or None to not replace
# note the UPat names ("c1", "c2") match the rewrite function's parameter names —
# that is how the matched sub-UOps are handed to the lambda
simple_pm = PatternMatcher([
(UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))),
lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)),
])
# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp
# to actually apply the pattern to a_plus_b, we use graph_rewrite
a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm)
print(a_plus_b_simplified)
"""
UOp(Ops.CONST, dtypes.int, arg=4, src=())
"""
# 2+2 is in fact, 4
# we can also use syntactic sugar to write the pattern nicer
# UPat.cvar("c1") is shorthand (presumably a CONST UPat named "c1"), and
# c1.const_like(...) builds a CONST with c1's dtype; the assert below confirms
# both matchers produce the identical result UOp
simpler_pm = PatternMatcher([
(UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg))
])
assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm)
# note again the use of is, UOps are immutable and globally unique
# %% ********
# that brings you to an understanding of the most core concepts in tinygrad
# you can run this with VIZ=1 to use the web based graph rewrite explorer
# hopefully now you understand it. the nodes in the graph are just UOps

View File

@ -24,6 +24,7 @@
::: tinygrad.Tensor.randn ::: tinygrad.Tensor.randn
::: tinygrad.Tensor.randn_like ::: tinygrad.Tensor.randn_like
::: tinygrad.Tensor.randint ::: tinygrad.Tensor.randint
::: tinygrad.Tensor.randperm
::: tinygrad.Tensor.normal ::: tinygrad.Tensor.normal
::: tinygrad.Tensor.uniform ::: tinygrad.Tensor.uniform
::: tinygrad.Tensor.scaled_uniform ::: tinygrad.Tensor.scaled_uniform

View File

@ -37,8 +37,10 @@
::: tinygrad.Tensor.scatter ::: tinygrad.Tensor.scatter
::: tinygrad.Tensor.scatter_reduce ::: tinygrad.Tensor.scatter_reduce
::: tinygrad.Tensor.masked_select ::: tinygrad.Tensor.masked_select
::: tinygrad.Tensor.masked_fill
::: tinygrad.Tensor.sort ::: tinygrad.Tensor.sort
::: tinygrad.Tensor.topk ::: tinygrad.Tensor.topk
::: tinygrad.Tensor.multinomial
## Neural Network (functional) ## Neural Network (functional)

View File

@ -78,10 +78,7 @@ if __name__ == "__main__":
@TinyJit @TinyJit
def get_action(obs:Tensor) -> Tensor: def get_action(obs:Tensor) -> Tensor:
# TODO: with no_grad
Tensor.no_grad = True
ret = model(obs)[0].exp().multinomial().realize() ret = model(obs)[0].exp().multinomial().realize()
Tensor.no_grad = False
return ret return ret
st, steps = time.perf_counter(), 0 st, steps = time.perf_counter(), 0

View File

@ -3,14 +3,19 @@ start_tm = time.perf_counter()
import math import math
from typing import Tuple, cast from typing import Tuple, cast
import numpy as np import numpy as np
from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes, Device
from tinygrad.helpers import partition, trange, getenv, Context from tinygrad.helpers import partition, trange, getenv, Context
from extra.lr_scheduler import OneCycleLR from extra.lr_scheduler import OneCycleLR
GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 1))]
# override tinygrad defaults
dtypes.default_float = dtypes.half dtypes.default_float = dtypes.half
Context(FUSE_ARANGE=1, FUSE_OPTIM=1).__enter__()
# from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py # from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py
batchsize = getenv("BS", 1024) batchsize = getenv("BS", 1024)
assert batchsize % len(GPUS) == 0, f"{batchsize=} is not a multiple of {len(GPUS)=}"
bias_scaler = 64 bias_scaler = 64
hyp = { hyp = {
'opt': { 'opt': {
@ -67,7 +72,7 @@ class ConvGroup:
cast(Tensor, self.norm2.weight).requires_grad = False cast(Tensor, self.norm2.weight).requires_grad = False
def __call__(self, x:Tensor) -> Tensor: def __call__(self, x:Tensor) -> Tensor:
x = self.norm1(self.conv1(x).max_pool2d().float()).cast(dtypes.default_float).quick_gelu() x = self.norm1(self.conv1(x).max_pool2d().float()).cast(dtypes.default_float).quick_gelu()
return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() + x
class SpeedyConvNet: class SpeedyConvNet:
def __init__(self): def __init__(self):
@ -78,23 +83,25 @@ class SpeedyConvNet:
self.linear = nn.Linear(depths['block3'], depths['num_classes'], bias=False) self.linear = nn.Linear(depths['block3'], depths['num_classes'], bias=False)
def __call__(self, x:Tensor) -> Tensor: def __call__(self, x:Tensor) -> Tensor:
x = self.whiten(x).quick_gelu() x = self.whiten(x).quick_gelu()
# ************* HACKS *************
x = x.pad((1,0,0,1)) # TODO: this pad should not be here! copied from hlb_cifar10 for speed
# ************* HACKS *************
x = x.sequential([self.conv_group_1, self.conv_group_2, self.conv_group_3]) x = x.sequential([self.conv_group_1, self.conv_group_2, self.conv_group_3])
return self.linear(x.max(axis=(2,3))) * hyp['opt']['scaling_factor'] return self.linear(x.max(axis=(2,3))) * hyp['opt']['scaling_factor']
if __name__ == "__main__": if __name__ == "__main__":
# *** dataset *** # *** dataset ***
X_train, Y_train, X_test, Y_test = nn.datasets.cifar() X_train, Y_train, X_test, Y_test = nn.datasets.cifar()
# TODO: without this line indexing doesn't fuse!
X_train, Y_train, X_test, Y_test = [x.contiguous() for x in [X_train, Y_train, X_test, Y_test]]
cifar10_std, cifar10_mean = X_train.float().std_mean(axis=(0, 2, 3)) cifar10_std, cifar10_mean = X_train.float().std_mean(axis=(0, 2, 3))
def preprocess(X:Tensor, Y:Tensor) -> Tuple[Tensor, Tensor]: def preprocess(X:Tensor) -> Tensor: return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float)
return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float), Y.one_hot(depths['num_classes'])
# *** model *** # *** model ***
model = SpeedyConvNet() model = SpeedyConvNet()
state_dict = nn.state.get_state_dict(model) state_dict = nn.state.get_state_dict(model)
if len(GPUS) > 1:
#for k,v in nn.state.torch_load("/tmp/cifar_net.pt").items(): print(k) cifar10_std.to_(GPUS)
cifar10_mean.to_(GPUS)
for x in state_dict.values(): x.to_(GPUS)
params_bias, params_non_bias = partition(state_dict.items(), lambda x: 'bias' in x[0]) params_bias, params_non_bias = partition(state_dict.items(), lambda x: 'bias' in x[0])
opt_bias = nn.optim.SGD([x[1] for x in params_bias], lr=0.01, momentum=.85, nesterov=True, weight_decay=hyp['opt']['bias_decay']) opt_bias = nn.optim.SGD([x[1] for x in params_bias], lr=0.01, momentum=.85, nesterov=True, weight_decay=hyp['opt']['bias_decay'])
@ -111,40 +118,37 @@ if __name__ == "__main__":
lr_sched_bias = OneCycleLR(opt_bias, max_lr=hyp['opt']['bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps) lr_sched_bias = OneCycleLR(opt_bias, max_lr=hyp['opt']['bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)
lr_sched_non_bias = OneCycleLR(opt_non_bias, max_lr=hyp['opt']['non_bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps) lr_sched_non_bias = OneCycleLR(opt_non_bias, max_lr=hyp['opt']['non_bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps)
def loss_fn(out, Y): def loss_fn(out:Tensor, Y:Tensor) -> Tensor:
return out.cross_entropy(Y, reduction='none', label_smoothing=0.2).mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler']) ret = out.sparse_categorical_crossentropy(Y, reduction='none', label_smoothing=0.2)
return ret.mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler'])
@TinyJit @TinyJit
@Tensor.train() @Tensor.train()
def train_step(idxs:Tensor) -> Tensor: def train_step(idxs:Tensor) -> Tensor:
with Context(SPLIT_REDUCEOP=0, FUSE_ARANGE=1): X, Y = X_train[idxs], Y_train[idxs]
X = X_train[idxs] if len(GPUS) > 1:
Y = Y_train[idxs].realize(X) X.shard_(GPUS, axis=0)
X, Y = preprocess(X, Y) Y.shard_(GPUS, axis=0)
out = model(X) out = model(preprocess(X))
loss = loss_fn(out, Y) loss = loss_fn(out, Y)
opt.zero_grad() opt.zero_grad()
loss.backward() loss.backward()
opt.step() return (loss / (batchsize*loss_batchsize_scaler)).realize(*opt.schedule_step(),
lr_sched_bias.step() *lr_sched_bias.schedule_step(), *lr_sched_non_bias.schedule_step())
lr_sched_non_bias.step()
return loss / (batchsize*loss_batchsize_scaler)
eval_batchsize = 2500 eval_batchsize = 2500
@TinyJit @TinyJit
@Tensor.test()
def val_step() -> Tuple[Tensor, Tensor]: def val_step() -> Tuple[Tensor, Tensor]:
# TODO with Tensor.no_grad()
Tensor.no_grad = True
loss, acc = [], [] loss, acc = [], []
for i in range(0, X_test.size(0), eval_batchsize): for i in range(0, X_test.size(0), eval_batchsize):
X, Y = preprocess(X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize]) X, Y = X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize]
out = model(X) if len(GPUS) > 1:
X.shard_(GPUS, axis=0)
Y.shard_(GPUS, axis=0)
out = model(preprocess(X))
loss.append(loss_fn(out, Y)) loss.append(loss_fn(out, Y))
acc.append((out.argmax(-1).one_hot(depths['num_classes']) * Y).sum() / eval_batchsize) acc.append((out.argmax(-1) == Y).sum() / eval_batchsize)
ret = Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean() return Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean()
Tensor.no_grad = False
return ret
np.random.seed(1337) np.random.seed(1337)
for epoch in range(math.ceil(hyp['misc']['train_epochs'])): for epoch in range(math.ceil(hyp['misc']['train_epochs'])):

View File

@ -34,7 +34,6 @@ if __name__ == "__main__":
return loss return loss
@TinyJit @TinyJit
@Tensor.test()
def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100 def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100
test_acc = float('nan') test_acc = float('nan')

View File

@ -1,10 +1,10 @@
import sys, onnx, time, pickle import sys, time, pickle
from tinygrad import TinyJit, GlobalCounters, fetch, getenv from tinygrad import TinyJit, GlobalCounters, fetch, getenv
from tinygrad.frontend.onnx import OnnxRunner from tinygrad.frontend.onnx import OnnxRunner, onnx_load
from extra.onnx_helpers import get_example_inputs, validate from extra.onnx_helpers import get_example_inputs, validate
def load_onnx_model(onnx_file): def load_onnx_model(onnx_file):
onnx_model = onnx.load(onnx_file) onnx_model = onnx_load(onnx_file)
run_onnx = OnnxRunner(onnx_model) run_onnx = OnnxRunner(onnx_model)
run_onnx_jit = TinyJit(lambda **kwargs: next(iter(run_onnx({k:v.to(None) for k,v in kwargs.items()}).values())), prune=True, optimize=True) run_onnx_jit = TinyJit(lambda **kwargs: next(iter(run_onnx({k:v.to(None) for k,v in kwargs.items()}).values())), prune=True, optimize=True)
return run_onnx_jit, run_onnx.graph_inputs return run_onnx_jit, run_onnx.graph_inputs

View File

@ -23,8 +23,6 @@ def create_fixed_tokenizer(output_file):
# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py # echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py
if __name__ == "__main__": if __name__ == "__main__":
Tensor.no_grad = True
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
with Timing("create model: "): with Timing("create model: "):
model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1)) model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))

View File

@ -159,7 +159,6 @@ def init_vits(
text_mapper = TextMapper(apply_cleaners=True, symbols=symbols) text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
# Load the model. # Load the model.
Tensor.no_grad = True
if seed is not None: if seed is not None:
Tensor.manual_seed(seed) Tensor.manual_seed(seed)
np.random.seed(seed) np.random.seed(seed)
@ -221,7 +220,6 @@ def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_r
if __name__ == "__main__": if __name__ == "__main__":
import nltk import nltk
nltk.download("punkt") nltk.download("punkt")
Tensor.no_grad = True
# Parse CLI arguments # Parse CLI arguments
parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad") parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")

View File

@ -85,7 +85,10 @@ class Transformer:
seqlen = tokens.shape[1] seqlen = tokens.shape[1]
tok_emb = self.wte(tokens) tok_emb = self.wte(tokens)
pos_emb = self.wpe(self.allpos.shrink((None, (start_pos, start_pos+seqlen)))) # not symbolic when consuming the prompt
selected_pos = (0, seqlen) if start_pos.val == 0 else (start_pos, start_pos+1)
pos_emb = self.wpe(self.allpos.shrink((None, selected_pos)))
h = tok_emb + pos_emb h = tok_emb + pos_emb
if HALF: h = h.half() if HALF: h = h.half()
@ -190,7 +193,7 @@ class GPT2:
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing): (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
with WallTimeEvent(BenchEvent.STEP): with WallTimeEvent(BenchEvent.STEP):
if batch_size == 1 and len(toks[0][start_pos:]) == 1: if batch_size == 1 and len(toks[0][start_pos:]) == 1:
tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos]) tokens = Variable("tokens", 0, VOCAB_SIZE-1).bind(toks[0][start_pos])
else: else:
tokens = Tensor([x[start_pos:] for x in toks]) tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist() tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
@ -201,7 +204,6 @@ class GPT2:
# **** main code **** # **** main code ****
if __name__ == "__main__": if __name__ == "__main__":
Tensor.no_grad = True
print(f"using {Device.DEFAULT} backend") print(f"using {Device.DEFAULT} backend")
default_prompt = "What is the answer to life, the universe, and everything?" default_prompt = "What is the answer to life, the universe, and everything?"

View File

@ -118,7 +118,7 @@ class SpeedyResNet:
# hyper-parameters were exactly the same as the original repo # hyper-parameters were exactly the same as the original repo
bias_scaler = 58 bias_scaler = 58
hyp = { hyp = {
'seed' : 209, 'seed' : 200,
'opt': { 'opt': {
'bias_lr': 1.76 * bias_scaler/512, 'bias_lr': 1.76 * bias_scaler/512,
'non_bias_lr': 1.76 / 512, 'non_bias_lr': 1.76 / 512,
@ -267,13 +267,10 @@ def train_cifar():
@TinyJit @TinyJit
def update(self, net, decay): def update(self, net, decay):
# TODO with Tensor.no_grad()
Tensor.no_grad = True
for net_ema_param, (param_name, net_param) in zip(get_state_dict(self.net_ema).values(), get_state_dict(net).items()): for net_ema_param, (param_name, net_param) in zip(get_state_dict(self.net_ema).values(), get_state_dict(net).items()):
# batchnorm currently is not being tracked # batchnorm currently is not being tracked
if not ("num_batches_tracked" in param_name) and not ("running" in param_name): if not ("num_batches_tracked" in param_name) and not ("running" in param_name):
net_ema_param.assign(net_ema_param.detach()*decay + net_param.detach()*(1.-decay)).realize() net_ema_param.assign(net_ema_param.detach()*decay + net_param.detach()*(1.-decay)).realize()
Tensor.no_grad = False
set_seed(getenv('SEED', hyp['seed'])) set_seed(getenv('SEED', hyp['seed']))

View File

@ -240,7 +240,6 @@ class LLaMa:
#elif k.endswith('.weight'): v.shard_(device, axis=-1) #elif k.endswith('.weight'): v.shard_(device, axis=-1)
#elif 'norm.' in k: v.shard_(device, axis=-1) #elif 'norm.' in k: v.shard_(device, axis=-1)
else: v.shard_(device, axis=None) else: v.shard_(device, axis=None)
#print(k, v.shape, v.lazydata.axis)
# replace weights in model # replace weights in model
load_state_dict(model, weights, strict=False, consume=True) load_state_dict(model, weights, strict=False, consume=True)
@ -331,7 +330,6 @@ int main()
\end{code} \end{code}
""" """
if __name__ == "__main__": if __name__ == "__main__":
Tensor.no_grad = True
print(f"using {Device.DEFAULT} backend") print(f"using {Device.DEFAULT} backend")
parser = argparse.ArgumentParser(description="Run LLaMA in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(description="Run LLaMA in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -447,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad.
print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model") print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model")
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device) llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device)
param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model)) param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model))
outputted = pre_prompt if chatbot else args.prompt outputted = pre_prompt if chatbot else args.prompt
start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted) start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted)

View File

@ -233,8 +233,6 @@ def prefill(model, toks, start_pos=0):
return start_pos return start_pos
if __name__ == "__main__": if __name__ == "__main__":
Tensor.no_grad = True
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--download_model", action="store_true", help="Download a model") parser.add_argument("--download_model", action="store_true", help="Download a model")
parser.add_argument("--model", type=Path, help="Model path") parser.add_argument("--model", type=Path, help="Model path")
@ -286,7 +284,7 @@ if __name__ == "__main__":
device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT
model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device) model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device)
param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model)) param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model))
if not args.no_api and not args.benchmark: if not args.no_api and not args.benchmark:
from bottle import Bottle, request, response, HTTPResponse, abort, static_file from bottle import Bottle, request, response, HTTPResponse, abort, static_file

View File

@ -16,7 +16,7 @@ if __name__ == "__main__":
#model.load_pretrained() #model.load_pretrained()
for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained for p in nn.state.get_parameters(model): p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained
#early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)]) #early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)])
#print(f"built model {len(early_sched)}") #print(f"built model {len(early_sched)}")
#B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64) #B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64)
@ -56,7 +56,7 @@ if __name__ == "__main__":
state_dict.update({'X': X, 'Y': Y, 'loss': loss}) state_dict.update({'X': X, 'Y': Y, 'loss': loss})
grad_state_dict = {} grad_state_dict = {}
for k,v in state_dict.items(): for k,v in state_dict.items():
if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}") if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}")
if v.grad is not None: grad_state_dict['grad_'+k] = v.grad if v.grad is not None: grad_state_dict['grad_'+k] = v.grad
state_dict.update(grad_state_dict) state_dict.update(grad_state_dict)
state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr}) state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr})
@ -65,7 +65,7 @@ if __name__ == "__main__":
nm = inverse_state_dict[p] nm = inverse_state_dict[p]
state_dict["adam_m_"+nm] = m state_dict["adam_m_"+nm] = m
state_dict["adam_v_"+nm] = v state_dict["adam_v_"+nm] = v
named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()} named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()}
c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"] c_code = ["#include <stdlib.h>", "#include <tgmath.h>", "#include <stdbool.h>"]
if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"] if TIMING: c_code += ["#include <stdio.h>", "#include <time.h>"]

View File

@ -146,7 +146,6 @@ if __name__ == "__main__":
return loss return loss
@TinyJit @TinyJit
@Tensor.test()
def sample(z:Tensor, cond:Tensor) -> Tensor: def sample(z:Tensor, cond:Tensor) -> Tensor:
return model.sample(z, cond, Tensor.full_like(cond, 10), sample_steps=getenv("SAMPLE_STEPS", 20))[-1] return model.sample(z, cond, Tensor.full_like(cond, 10), sample_steps=getenv("SAMPLE_STEPS", 20))[-1]

View File

@ -56,7 +56,7 @@ if __name__ == "__main__":
with Profiling(sort="time", frac=0.1, enabled=args.profile): with Profiling(sort="time", frac=0.1, enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"): with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"):
with WallTimeEvent(BenchEvent.STEP): with WallTimeEvent(BenchEvent.STEP):
tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item() tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024-1).bind(start_pos), args.temperature).item()
toks.append(tok) toks.append(tok)
start_pos += 1 start_pos += 1
print(spp.decode(toks)) print(spp.decode(toks))

View File

@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed):
#storage_tensor._copyin(img_tensor.numpy()) #storage_tensor._copyin(img_tensor.numpy())
# faster # faster
X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes() X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
# ideal # ideal
#X[idx].assign(img.tobytes()) # NOTE: this is slow! #X[idx].assign(img.tobytes()) # NOTE: this is slow!
@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
x = random_brightness_augmentation(x) x = random_brightness_augmentation(x)
x = gaussian_noise(x) x = gaussian_noise(x)
X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes() X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes() Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
queue_out.put(idx) queue_out.put(idx)
queue_out.put(None) queue_out.put(None)
@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue
clipped_match_idxs = np.clip(match_idxs, 0, None) clipped_match_idxs = np.clip(match_idxs, 0, None)
clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs] clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs]
boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes() boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes()
labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes() labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes()
matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes() matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes()
anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes() anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes()
imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes() imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
queue_out.put(idx) queue_out.put(idx)
queue_out.put(None) queue_out.put(None)

View File

@ -9,7 +9,6 @@ from extra.bench_log import BenchEvent, WallTimeEvent
def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s") def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s")
def eval_resnet(): def eval_resnet():
Tensor.no_grad = True
with WallTimeEvent(BenchEvent.FULL): with WallTimeEvent(BenchEvent.FULL):
# Resnet50-v1.5 # Resnet50-v1.5
from extra.models.resnet import ResNet50 from extra.models.resnet import ResNet50
@ -245,7 +244,6 @@ def eval_mrcnn():
if __name__ == "__main__": if __name__ == "__main__":
# inference only # inference only
Tensor.training = False Tensor.training = False
Tensor.no_grad = True
models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",") models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
for m in models: for m in models:

View File

@ -60,7 +60,6 @@ def spec_mrcnn():
if __name__ == "__main__": if __name__ == "__main__":
# inference only for now # inference only for now
Tensor.training = False Tensor.training = False
Tensor.no_grad = True
for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","): for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","):
nm = f"spec_{m}" nm = f"spec_{m}"

View File

@ -608,7 +608,7 @@ def train_retinanet():
if getenv("RESET_STEP", 1): _train_step.reset() if getenv("RESET_STEP", 1): _train_step.reset()
with Tensor.train(mode=False), Tensor.test(): with Tensor.train(mode=False):
if not RUNMLPERF: if not RUNMLPERF:
i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True)) i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True))
else: else:
@ -791,7 +791,6 @@ def train_unet3d():
return loss.realize() return loss.realize()
@Tensor.train(mode=False) @Tensor.train(mode=False)
@Tensor.test()
def eval_step(model, x, y): def eval_step(model, x, y):
y_hat, y = sliding_window_inference(model, x, y, gpus=GPUS) y_hat, y = sliding_window_inference(model, x, y, gpus=GPUS)
y_hat, y = Tensor(y_hat), Tensor(y, requires_grad=False) y_hat, y = Tensor(y_hat), Tensor(y, requires_grad=False)

View File

@ -5,7 +5,7 @@
"system_name": "tinybox 8xMI300X", "system_name": "tinybox 8xMI300X",
"number_of_nodes": "1", "number_of_nodes": "1",
"host_processors_per_node": "2", "host_processors_per_node": "2",
"host_processor_model_name": "AMD EPYC 9354 32-Core Processor", "host_processor_model_name": "AMD EPYC 9354",
"host_processor_core_count": "32", "host_processor_core_count": "32",
"host_processor_vcpu_count": "64", "host_processor_vcpu_count": "64",
"host_processor_frequency": "", "host_processor_frequency": "",
@ -18,7 +18,7 @@
"host_networking_topology": "", "host_networking_topology": "",
"host_memory_configuration": "24x 96GB DDR5", "host_memory_configuration": "24x 96GB DDR5",
"accelerators_per_node": "8", "accelerators_per_node": "8",
"accelerator_model_name": "AMD Instinct MI300X", "accelerator_model_name": "AMD Instinct MI300X 192GB HBM3",
"accelerator_host_interconnect": "PCIe 5.0 x16", "accelerator_host_interconnect": "PCIe 5.0 x16",
"accelerator_frequency": "", "accelerator_frequency": "",
"accelerator_on-chip_memories": "", "accelerator_on-chip_memories": "",
@ -30,10 +30,9 @@
"hw_notes": "", "hw_notes": "",
"framework": "tinygrad, branch mlperf_training_v5.0", "framework": "tinygrad, branch mlperf_training_v5.0",
"other_software_stack": { "other_software_stack": {
"python": "3.10.16", "python": "3.10.16",
"ROCm": "3.0.0+94441cb" "ROCm": "3.0.0+94441cb"
}, },
"operating_system": "Ubuntu 24.04.1 LTS", "operating_system": "Ubuntu 24.04.1 LTS",
"sw_notes": "" "sw_notes": ""
} }

View File

@ -5,7 +5,7 @@
"system_name": "tinybox green", "system_name": "tinybox green",
"number_of_nodes": "1", "number_of_nodes": "1",
"host_processors_per_node": "1", "host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor", "host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32", "host_processor_core_count": "32",
"host_processor_vcpu_count": "64", "host_processor_vcpu_count": "64",
"host_processor_frequency": "", "host_processor_frequency": "",

View File

@ -5,7 +5,7 @@
"system_name": "tinybox red", "system_name": "tinybox red",
"number_of_nodes": "1", "number_of_nodes": "1",
"host_processors_per_node": "1", "host_processors_per_node": "1",
"host_processor_model_name": "AMD EPYC 7532 32-Core Processor", "host_processor_model_name": "AMD EPYC 7532",
"host_processor_core_count": "32", "host_processor_core_count": "32",
"host_processor_vcpu_count": "64", "host_processor_vcpu_count": "64",
"host_processor_frequency": "", "host_processor_frequency": "",

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
# export BEAM_LOG_SURPASS_MAX=1
# export BASEDIR="/raid/datasets/wiki"
export RESET_STEP=1
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,14 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,17 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,29 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_8xMI300X"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log"
# init # TODO: without DEBUG=2 it hangs
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,16 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,27 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." NV=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,69 @@
# 1. Problem
This problem uses BERT for NLP.
## Requirements
Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for dataset), numpy, tqdm and tensorflow.
```
pip install gdown numpy tqdm tensorflow
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
# 2. Directions
## Steps to download and verify data
### 1. Download raw data
```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```
### 2. Preprocess train and validation data
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```
Generating a specific topic (Between 0 and 499)
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```
#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```
## Running
### tinybox_green
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```
### tinybox_red
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```
### tinybox_8xMI300X
#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```

View File

@ -0,0 +1,17 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"
export RESET_STEP=1
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
export WANDB=1 PARALLEL=0
RUNMLPERF=1 python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,32 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
# pip install -e ".[mlperf]"
export LOGMLPERF=1
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_red_${DATETIME}_${SEED}.log"
export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang?
# init
sleep 5 && sudo rmmod amdgpu || true
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
# TODO: AM driver resulted in nan
sudo modprobe amdgpu
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

View File

@ -0,0 +1,13 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
export BENCHMARK=10 DEBUG=2
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,15 @@
#!/bin/bash
export PYTHONPATH="." NV=1
export MODEL="resnet"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
export EVAL_START_EPOCH=3 EVAL_FREQ=4
export WANDB=1 PARALLEL=0
python3 examples/mlperf/model_train.py

View File

@ -0,0 +1,25 @@
#!/bin/bash
set -e # Exit on any error
set -o pipefail # Make pipeline fail if any command fails
export PYTHONPATH="." NV=1
export MODEL="resnet"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
export RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
# pip install -e ".[mlperf]"
export LOGMLPERF=${LOGMLPERF:-1}
export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
# init
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View File

@ -0,0 +1,50 @@
# 1. Problem
This problem uses the ResNet-50 CNN to do image classification.
## Requirements
Install tinygrad and mlperf-logging from master.
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
This is the default on production tinybox green.
### tinybox_red
Disable cwsr
This is the default on production tinybox red.
```
sudo vi /etc/modprobe.d/amdgpu.conf
cat <<EOF > /etc/modprobe.d/amdgpu.conf
options amdgpu cwsr_enable=0
EOF
sudo update-initramfs -u
sudo reboot
# validate
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
```
# 2. Directions
## Steps to download and verify data
```
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
```
## Steps for one time setup
### tinybox_red
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
```
## Steps to run benchmark
```
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

Some files were not shown because too many files have changed in this diff Show More