diff --git a/common/params_keys.h b/common/params_keys.h index 339186f..1cc2c07 100644 --- a/common/params_keys.h +++ b/common/params_keys.h @@ -236,7 +236,6 @@ inline static std::unordered_map keys = { {"HapticFeedbackWhenSpeedCamera", PERSISTENT}, {"UseLaneLineSpeed", PERSISTENT}, {"UseLaneLineCurveSpeed", PERSISTENT}, - {"UseLaneLineSpeedApply", PERSISTENT}, {"AdjustLaneOffset", PERSISTENT}, {"LaneChangeNeedTorque", PERSISTENT}, {"LaneChangeDelay", PERSISTENT }, @@ -261,6 +260,8 @@ inline static std::unordered_map keys = { {"CustomSteerMax", PERSISTENT}, {"CustomSteerDeltaUp", PERSISTENT}, {"CustomSteerDeltaDown", PERSISTENT}, + {"CustomSteerDeltaUpLC", PERSISTENT}, + {"CustomSteerDeltaDownLC", PERSISTENT}, {"SpeedFromPCM", PERSISTENT}, {"MaxTimeOffroadMin", PERSISTENT}, {"DisableDM", PERSISTENT}, diff --git a/launch_env.sh b/launch_env.sh index 29e3e38..b0bff4e 100755 --- a/launch_env.sh +++ b/launch_env.sh @@ -7,7 +7,7 @@ export OPENBLAS_NUM_THREADS=1 export VECLIB_MAXIMUM_THREADS=1 if [ -z "$AGNOS_VERSION" ]; then - export AGNOS_VERSION="12.3" + export AGNOS_VERSION="12.4" fi export STAGING_ROOT="/data/safe_staging" diff --git a/opendbc_repo/opendbc/car/car.capnp b/opendbc_repo/opendbc/car/car.capnp index 83acdee..1aa88c6 100644 --- a/opendbc_repo/opendbc/car/car.capnp +++ b/opendbc_repo/opendbc/car/car.capnp @@ -246,6 +246,7 @@ struct CarState { speedLimitDistance @65 :Float32; gearStep @66 :Int16; tpms @67 : Tpms; + useLaneLineSpeed @68 : Float32; struct Tpms { fl @0 :Float32; diff --git a/opendbc_repo/opendbc/car/hyundai/carcontroller.py b/opendbc_repo/opendbc/car/hyundai/carcontroller.py index 9619501..ca53477 100644 --- a/opendbc_repo/opendbc/car/hyundai/carcontroller.py +++ b/opendbc_repo/opendbc/car/hyundai/carcontroller.py @@ -96,6 +96,9 @@ class CarController(CarControllerBase): self.activeCarrot = 0 self.camera_scc_params = Params().get_int("HyundaiCameraSCC") + self.steerDeltaUpOrg = self.steerDeltaUp = self.steerDeltaUpLC = 
self.params.STEER_DELTA_UP + self.steerDeltaDownOrg = self.steerDeltaDown = self.steerDeltaDownLC = self.params.STEER_DELTA_DOWN + def update(self, CC, CS, now_nanos): if self.frame % 50 == 0: @@ -104,14 +107,30 @@ class CarController(CarControllerBase): steerMax = params.get_int("CustomSteerMax") steerDeltaUp = params.get_int("CustomSteerDeltaUp") steerDeltaDown = params.get_int("CustomSteerDeltaDown") + steerDeltaUpLC = params.get_int("CustomSteerDeltaUpLC") + steerDeltaDownLC = params.get_int("CustomSteerDeltaDownLC") if steerMax > 0: self.params.STEER_MAX = steerMax if steerDeltaUp > 0: - self.params.STEER_DELTA_UP = steerDeltaUp + self.steerDeltaUp = steerDeltaUp #self.params.ANGLE_TORQUE_UP_RATE = steerDeltaUp + else: + self.steerDeltaUp = self.steerDeltaUpOrg if steerDeltaDown > 0: - self.params.STEER_DELTA_DOWN = steerDeltaDown + self.steerDeltaDown = steerDeltaDown #self.params.ANGLE_TORQUE_DOWN_RATE = steerDeltaDown + else: + self.steerDeltaDown = self.steerDeltaDownOrg + + if steerDeltaUpLC > 0: + self.steerDeltaUpLC = steerDeltaUpLC + else: + self.steerDeltaUpLC = self.steerDeltaUp + if steerDeltaDownLC > 0: + self.steerDeltaDownLC = steerDeltaDownLC + else: + self.steerDeltaDownLC = self.steerDeltaDown + self.soft_hold_mode = 1 if params.get_int("AutoCruiseControl") > 1 else 2 self.hapticFeedbackWhenSpeedCamera = int(params.get_int("HapticFeedbackWhenSpeedCamera")) @@ -125,6 +144,13 @@ class CarController(CarControllerBase): actuators = CC.actuators hud_control = CC.hudControl + + if hud_control.modelDesire in [3,4]: + self.params.STEER_DELTA_UP = self.steerDeltaUpLC + self.params.STEER_DELTA_DOWN = self.steerDeltaDownLC + else: + self.params.STEER_DELTA_UP = self.steerDeltaUp + self.params.STEER_DELTA_DOWN = self.steerDeltaDown angle_control = self.CP.flags & HyundaiFlags.ANGLE_CONTROL diff --git a/opendbc_repo/opendbc/car/hyundai/carstate.py b/opendbc_repo/opendbc/car/hyundai/carstate.py index 3206c7d..f841153 100644 --- 
a/opendbc_repo/opendbc/car/hyundai/carstate.py +++ b/opendbc_repo/opendbc/car/hyundai/carstate.py @@ -76,6 +76,7 @@ class CarState(CarStateBase): self.cruise_buttons_msg = None self.hda2_lfa_block_msg = None + self.cluster_speed_limit_msg = None # On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz self.cluster_speed = 0 @@ -461,6 +462,9 @@ class CarState(CarStateBase): if "TCS" in cp.vl: self.tcs_info_373 = copy.copy(cp.vl.get("TCS", {})) + if "CLUSTER_SPEED_LIMIT" in cp.vl: + self.cluster_speed_limit_msg = copy.copy(cp.vl.get("CLUSTER_SPEED_LIMIT", {})) + if "GEAR" in cp.vl: ret.gearStep = cp.vl["GEAR"]["GEAR_STEP"] elif "GEAR_ALT" in cp.vl: @@ -596,6 +600,8 @@ class CarState(CarStateBase): # 어떤차는 bus2에 있음, 내차는 bus0에 있는데.... 이건 옆두부와 관련이 없나? #if CP.flags & HyundaiFlags.CANFD_HDA2: # pt_messages.append(("CLUSTER_SPEED_LIMIT", 10)) + if Params().get_int("CanfdDebug") > 0: + pt_messages.append(("CLUSTER_SPEED_LIMIT", 10)) cam_messages = [] if CP.flags & HyundaiFlags.CANFD_HDA2 and not (CP.flags & HyundaiFlags.CAMERA_SCC.value): diff --git a/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py b/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py index 3fbcf82..94592e0 100644 --- a/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py +++ b/opendbc_repo/opendbc/car/hyundai/hyundaicanfd.py @@ -598,8 +598,13 @@ def create_ccnc_messages(CP, packer, CAN, frame, CC, CS, hud_control, disp_angle # ADAS 콤마연결하면.. 0번에서.. (카메라혹은 다른곳에서) # 카메라 콤마연결+롱컨개조 하면.. 
2번에서 데이터가 나옴..(카메라혹은 ADAS) if frame % 10 == 0: - - pass + if CS.cluster_speed_limit_msg is not None: + values = CS.cluster_speed_limit_msg + values["SPEED_LIMIT_1"] = 100 + values["SPEED_LIMIT_2"] = 100 + values["SPEED_LIMIT_3"] = 105 + #values["COUNTER"] = (values["COUNTER"] + 1) % 256 + ret.append(packer.make_can_msg("CLUSTER_SPEED_LIMIT", CAN.CAM, values)) return ret diff --git a/opendbc_repo/opendbc/car/mazda/carstate.py b/opendbc_repo/opendbc/car/mazda/carstate.py index 5bb6bcc..8499b8c 100644 --- a/opendbc_repo/opendbc/car/mazda/carstate.py +++ b/opendbc_repo/opendbc/car/mazda/carstate.py @@ -141,7 +141,7 @@ class CarState(CarStateBase): ret.buttonEvents = [ *create_button_events(self.cruise_buttons, self.prev_cruise_buttons, BUTTONS_DICT), *create_button_events(self.distance_button, self.prev_distance_button, {1: ButtonType.gapAdjustCruise}), - *create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}), + #*create_button_events(self.lkas_enabled, self.lkas_previously_enabled, {1: ButtonType.lfaButton}), ] return ret diff --git a/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h b/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h index 8abae81..957174d 100644 --- a/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h +++ b/opendbc_repo/opendbc/safety/safety/safety_hyundai_canfd.h @@ -81,7 +81,7 @@ const CanMsg HYUNDAI_CANFD_HDA2_LONG_TX_MSGS[] = { {203, 0, 24}, // CB {373, 2, 24}, // TCS(0x175) - //{506, 2, 32}, // CLUSTER_SPEED_LIMIT + {506, 2, 32}, // CLUSTER_SPEED_LIMIT {234, 2, 24}, // MDPS {687, 2, 8}, // STEER_TOUCH_2AF }; diff --git a/selfdrive/assets/sounds_eng/Wazealert.wav b/selfdrive/assets/sounds_eng/Wazealert.wav new file mode 100644 index 0000000..865e465 Binary files /dev/null and b/selfdrive/assets/sounds_eng/Wazealert.wav differ diff --git a/selfdrive/assets/sounds_eng/Wazealert2.wav b/selfdrive/assets/sounds_eng/Wazealert2.wav new file mode 100644 index 0000000..52e9b5b Binary files 
/dev/null and b/selfdrive/assets/sounds_eng/Wazealert2.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_1.wav b/selfdrive/assets/sounds_eng/audio_1.wav index 2504182..cc5d918 100644 Binary files a/selfdrive/assets/sounds_eng/audio_1.wav and b/selfdrive/assets/sounds_eng/audio_1.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_10.wav b/selfdrive/assets/sounds_eng/audio_10.wav index 334697a..7ab2d91 100644 Binary files a/selfdrive/assets/sounds_eng/audio_10.wav and b/selfdrive/assets/sounds_eng/audio_10.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_2.wav b/selfdrive/assets/sounds_eng/audio_2.wav index e5da088..c644207 100644 Binary files a/selfdrive/assets/sounds_eng/audio_2.wav and b/selfdrive/assets/sounds_eng/audio_2.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_3.wav b/selfdrive/assets/sounds_eng/audio_3.wav index 9a7d660..4c9aae2 100644 Binary files a/selfdrive/assets/sounds_eng/audio_3.wav and b/selfdrive/assets/sounds_eng/audio_3.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_4.wav b/selfdrive/assets/sounds_eng/audio_4.wav index 872397b..aaebcc4 100644 Binary files a/selfdrive/assets/sounds_eng/audio_4.wav and b/selfdrive/assets/sounds_eng/audio_4.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_5.wav b/selfdrive/assets/sounds_eng/audio_5.wav index 9da9235..f17ea4f 100644 Binary files a/selfdrive/assets/sounds_eng/audio_5.wav and b/selfdrive/assets/sounds_eng/audio_5.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_6.wav b/selfdrive/assets/sounds_eng/audio_6.wav index 3da168c..1532922 100644 Binary files a/selfdrive/assets/sounds_eng/audio_6.wav and b/selfdrive/assets/sounds_eng/audio_6.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_7.wav b/selfdrive/assets/sounds_eng/audio_7.wav index bf7219b..98440cc 100644 Binary files a/selfdrive/assets/sounds_eng/audio_7.wav and b/selfdrive/assets/sounds_eng/audio_7.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_8.wav 
b/selfdrive/assets/sounds_eng/audio_8.wav index 387789d..d16655b 100644 Binary files a/selfdrive/assets/sounds_eng/audio_8.wav and b/selfdrive/assets/sounds_eng/audio_8.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_9.wav b/selfdrive/assets/sounds_eng/audio_9.wav index 5c51b56..ca5789f 100644 Binary files a/selfdrive/assets/sounds_eng/audio_9.wav and b/selfdrive/assets/sounds_eng/audio_9.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_auto_hold.wav b/selfdrive/assets/sounds_eng/audio_auto_hold.wav index 04336c3..9cecea2 100644 Binary files a/selfdrive/assets/sounds_eng/audio_auto_hold.wav and b/selfdrive/assets/sounds_eng/audio_auto_hold.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_car_watchout.wav b/selfdrive/assets/sounds_eng/audio_car_watchout.wav index 53d8cef..fcfe3cd 100644 Binary files a/selfdrive/assets/sounds_eng/audio_car_watchout.wav and b/selfdrive/assets/sounds_eng/audio_car_watchout.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_disengage.wav b/selfdrive/assets/sounds_eng/audio_disengage.wav index ef5f222..3884f29 100644 Binary files a/selfdrive/assets/sounds_eng/audio_disengage.wav and b/selfdrive/assets/sounds_eng/audio_disengage.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_engage.wav b/selfdrive/assets/sounds_eng/audio_engage.wav index 842da44..daa2fec 100644 Binary files a/selfdrive/assets/sounds_eng/audio_engage.wav and b/selfdrive/assets/sounds_eng/audio_engage.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_lane_change.wav b/selfdrive/assets/sounds_eng/audio_lane_change.wav index fdffaa8..d85cf19 100644 Binary files a/selfdrive/assets/sounds_eng/audio_lane_change.wav and b/selfdrive/assets/sounds_eng/audio_lane_change.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_lanechange.wav b/selfdrive/assets/sounds_eng/audio_lanechange.wav index 0adb63a..a080c57 100644 Binary files a/selfdrive/assets/sounds_eng/audio_lanechange.wav and 
b/selfdrive/assets/sounds_eng/audio_lanechange.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_speed_down.wav b/selfdrive/assets/sounds_eng/audio_speed_down.wav index 8f757a4..1cd6661 100644 Binary files a/selfdrive/assets/sounds_eng/audio_speed_down.wav and b/selfdrive/assets/sounds_eng/audio_speed_down.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_stopping.wav b/selfdrive/assets/sounds_eng/audio_stopping.wav index a7e053f..36958ce 100644 Binary files a/selfdrive/assets/sounds_eng/audio_stopping.wav and b/selfdrive/assets/sounds_eng/audio_stopping.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_stopstop.wav b/selfdrive/assets/sounds_eng/audio_stopstop.wav index 451ff65..02d6084 100644 Binary files a/selfdrive/assets/sounds_eng/audio_stopstop.wav and b/selfdrive/assets/sounds_eng/audio_stopstop.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_traffic_error.wav b/selfdrive/assets/sounds_eng/audio_traffic_error.wav index 8d95832..f9d404f 100644 Binary files a/selfdrive/assets/sounds_eng/audio_traffic_error.wav and b/selfdrive/assets/sounds_eng/audio_traffic_error.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_turn.wav b/selfdrive/assets/sounds_eng/audio_turn.wav index 940cd23..8da86a0 100644 Binary files a/selfdrive/assets/sounds_eng/audio_turn.wav and b/selfdrive/assets/sounds_eng/audio_turn.wav differ diff --git a/selfdrive/assets/sounds_eng/audio_turn2.wav b/selfdrive/assets/sounds_eng/audio_turn2.wav index fefd782..59e99ad 100644 Binary files a/selfdrive/assets/sounds_eng/audio_turn2.wav and b/selfdrive/assets/sounds_eng/audio_turn2.wav differ diff --git a/selfdrive/assets/sounds_eng/nnff.wav b/selfdrive/assets/sounds_eng/nnff.wav index 3379414..b48ab10 100644 Binary files a/selfdrive/assets/sounds_eng/nnff.wav and b/selfdrive/assets/sounds_eng/nnff.wav differ diff --git a/selfdrive/assets/sounds_eng/reverse_gear.wav b/selfdrive/assets/sounds_eng/reverse_gear.wav index 3858dda..835d971 100644 Binary files 
a/selfdrive/assets/sounds_eng/reverse_gear.wav and b/selfdrive/assets/sounds_eng/reverse_gear.wav differ diff --git a/selfdrive/assets/sounds_eng/traffic_sign_changed.wav b/selfdrive/assets/sounds_eng/traffic_sign_changed.wav index 05e659c..d57cf3a 100644 Binary files a/selfdrive/assets/sounds_eng/traffic_sign_changed.wav and b/selfdrive/assets/sounds_eng/traffic_sign_changed.wav differ diff --git a/selfdrive/assets/sounds_eng/traffic_sign_green.wav b/selfdrive/assets/sounds_eng/traffic_sign_green.wav index fce0d1c..c13afd6 100644 Binary files a/selfdrive/assets/sounds_eng/traffic_sign_green.wav and b/selfdrive/assets/sounds_eng/traffic_sign_green.wav differ diff --git a/selfdrive/car/card.py b/selfdrive/car/card.py index db54228..fc55352 100644 --- a/selfdrive/car/card.py +++ b/selfdrive/car/card.py @@ -219,6 +219,7 @@ class Car: CS.softHoldActive = self.v_cruise_helper._soft_hold_active CS.activateCruise = self.v_cruise_helper._activate_cruise CS.latEnabled = self.v_cruise_helper._lat_enabled + CS.useLaneLineSpeed = self.v_cruise_helper.useLaneLineSpeedApply self.CI.CS.softHoldActive = CS.softHoldActive return CS, RD diff --git a/selfdrive/car/cruise.py b/selfdrive/car/cruise.py index 5d9aa3c..78038ec 100644 --- a/selfdrive/car/cruise.py +++ b/selfdrive/car/cruise.py @@ -218,7 +218,7 @@ class VCruiseCarrot: self.AutoSpeedUptoRoadSpeedLimit = 0.0 self.useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") - self.params.put_int("UseLaneLineSpeedApply", self.useLaneLineSpeed) + self.useLaneLineSpeedApply = self.useLaneLineSpeed @property @@ -237,16 +237,19 @@ class VCruiseCarrot: self._log_timer = self._log_timeout def update_params(self, is_metric): + unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH if self.frame % 10 == 0: - self.autoCruiseControl = self.params.get_int("AutoCruiseControl") - self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") - self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") + self.autoCruiseControl = 
self.params.get_int("AutoCruiseControl") * unit_factor + self.autoGasTokSpeed = self.params.get_int("AutoGasTokSpeed") * unit_factor + self.autoGasSyncSpeed = self.params.get_bool("AutoGasSyncSpeed") * unit_factor self.autoSpeedUptoRoadSpeedLimit = self.params.get_float("AutoSpeedUptoRoadSpeedLimit") * 0.01 self.autoRoadSpeedAdjust = self.params.get_float("AutoRoadSpeedAdjust") * 0.01 - useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") + + useLaneLineSpeed = self.params.get_int("UseLaneLineSpeed") * unit_factor if self.useLaneLineSpeed != useLaneLineSpeed: - self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed) + self.useLaneLineSpeedApply = useLaneLineSpeed self.useLaneLineSpeed = useLaneLineSpeed + self.speed_from_pcm = self.params.get_int("SpeedFromPCM") self._cruise_speed_unit = self.params.get_int("CruiseSpeedUnit") self._paddle_mode = self.params.get_int("PaddleMode") @@ -255,7 +258,6 @@ class VCruiseCarrot: self.autoRoadSpeedLimitOffset = self.params.get_int("AutoRoadSpeedLimitOffset") self.autoNaviSpeedSafetyFactor = self.params.get_float("AutoNaviSpeedSafetyFactor") * 0.01 self.cruiseOnDist = self.params.get_float("CruiseOnDist") * 0.01 - unit_factor = 1.0 if is_metric else CV.MPH_TO_KPH cruiseSpeed1 = self.params.get_float("CruiseSpeed1") * unit_factor cruiseSpeed2 = self.params.get_float("CruiseSpeed2") * unit_factor cruiseSpeed3 = self.params.get_float("CruiseSpeed3") * unit_factor @@ -552,7 +554,7 @@ class VCruiseCarrot: self.params.put_int_nonblocking("MyDrivingMode", self.params.get_int("MyDrivingMode") % 4 + 1) # 1,2,3,4 (1:eco, 2:safe, 3:normal, 4:high speed) elif button_type == ButtonType.lfaButton: useLaneLineSpeed = max(1, self.useLaneLineSpeed) - self.params.put_int_nonblocking("UseLaneLineSpeedApply", useLaneLineSpeed if self.params.get_int("UseLaneLineSpeedApply") == 0 else 0) + self.useLaneLineSpeedApply = useLaneLineSpeed if self.useLaneLineSpeedApply == 0 else 0 elif button_type == ButtonType.cancel: 
self._cruise_cancel_state = True @@ -594,15 +596,20 @@ class VCruiseCarrot: return v_cruise_kph def _auto_speed_up(self, v_cruise_kph): - if self._pause_auto_speed_up: - return v_cruise_kph + #if self._pause_auto_speed_up: + # return v_cruise_kph road_limit_kph = self.nRoadLimitSpeed * self.autoSpeedUptoRoadSpeedLimit if road_limit_kph < 1.0: return v_cruise_kph - if self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60: + if not self._pause_auto_speed_up and self.v_lead_kph + 5 > v_cruise_kph and v_cruise_kph < road_limit_kph and self.d_rel < 60: v_cruise_kph = min(v_cruise_kph + 5, road_limit_kph) + elif self.autoRoadSpeedAdjust < 0 and self.nRoadLimitSpeed != self.nRoadLimitSpeed_last: # 도로제한속도가 바뀌면, 바뀐속도로 속도를 바꿈. + if self.autoRoadSpeedLimitOffset < 0: + v_cruise_kph = self.nRoadLimitSpeed * self.autoNaviSpeedSafetyFactor + else: + v_cruise_kph = self.nRoadLimitSpeed + self.autoRoadSpeedLimitOffset elif self.nRoadLimitSpeed < self.nRoadLimitSpeed_last and self.autoRoadSpeedAdjust > 0: new_road_limit_kph = self.nRoadLimitSpeed * self.autoRoadSpeedAdjust + v_cruise_kph * (1 - self.autoRoadSpeedAdjust) self._add_log(f"AutoSpeed change {v_cruise_kph} -> {new_road_limit_kph}") @@ -681,11 +688,11 @@ class VCruiseCarrot: elif self.xState == 3: v_cruise_kph = self.v_ego_kph_set self._cruise_control(-1, 3, "Cruise off (traffic sign)") - elif self.v_ego_kph_set >= 30 and not CC.enabled: + elif self.v_ego_kph_set >= self.autoGasTokSpeed and not CC.enabled: v_cruise_kph = self.v_ego_kph_set self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (gas pressed)") elif self._brake_pressed_count == -1 and self._soft_hold_active == 0: - if self.v_ego_kph_set > 40: + if self.v_ego_kph_set > self.autoGasTokSpeed: v_cruise_kph = self.v_ego_kph_set self._cruise_control(1, -1 if self.aTarget > 0.0 else 0, "Cruise on (speed)") elif abs(CS.steeringAngleDeg) < 20: diff --git a/selfdrive/carrot/carrot_man.py b/selfdrive/carrot/carrot_man.py 
index 153c126..5ee9a1c 100644 --- a/selfdrive/carrot/carrot_man.py +++ b/selfdrive/carrot/carrot_man.py @@ -1561,7 +1561,9 @@ class CarrotServ: xSpdType = 100 if xSpdType >= 0: - self.xSpdLimit = self.nRoadLimitSpeed + offset = 5 if self.is_metric else 5 * CV.MPH_TO_KPH + self.xSpdLimit = self.nRoadLimitSpeed + offset + self.xSpdDist = distance self.xSpdType =xSpdType @@ -1685,11 +1687,12 @@ class CarrotServ: if self.turnSpeedControlMode in [1,2]: speed_n_sources.append((max(abs(vturn_speed), self.autoCurveSpeedLowerLimit), "vturn")) + route_speed = max(route_speed * self.mapTurnSpeedFactor, self.autoCurveSpeedLowerLimit) if self.turnSpeedControlMode == 2: if 0 < self.xDistToTurn < 300: - speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route")) + speed_n_sources.append((route_speed, "route")) elif self.turnSpeedControlMode == 3: - speed_n_sources.append((route_speed * self.mapTurnSpeedFactor, "route")) + speed_n_sources.append((route_speed, "route")) #speed_n_sources.append((self.calculate_current_speed(dist, speed * self.mapTurnSpeedFactor, 0, 1.2), "route")) desired_speed, source = min(speed_n_sources, key=lambda x: x[0]) diff --git a/selfdrive/carrot_settings.json b/selfdrive/carrot_settings.json index 26f4c44..0639ce1 100644 --- a/selfdrive/carrot_settings.json +++ b/selfdrive/carrot_settings.json @@ -235,6 +235,32 @@ "default": 0, "unit": 1 }, + { + "group": "조향튜닝", + "name": "CustomSteerDeltaUpLC", + "title": "_CustomSteerDeltaUpLC(0)", + "descr": "차선변경시 적용, 토크조향", + "egroup": "LAT", + "etitle": "_CustomSteerDeltaUpLC(0)", + "edescr": "for LaneChange, torque steer only", + "min": 0, + "max": 50, + "default": 0, + "unit": 1 + }, + { + "group": "조향튜닝", + "name": "CustomSteerDeltaDownLC", + "title": "_CustomSteerDeltaDownLC(0)", + "descr": "차선변경시 적용, 토크조향", + "egroup": "LAT", + "etitle": "_CustomSteerDeltaDownLC(0)", + "edescr": "for LaneChange, torque steer only", + "min": 0, + "max": 50, + "default": 0, + "unit": 1 + }, { "group": "조향튜닝", 
"name": "SteerActuatorDelay", @@ -736,7 +762,7 @@ "descr": "1:SOFTHOLD, Auto Cruise, 2:SoftHold오류시", "egroup": "START", "etitle": "Auto Cruise control(HKG only)", - "edescr": "Softhold, Auto Cruise ON/OFF control, 2:if softhold error", + "edescr": "1:Softhold, Auto Cruise ON/OFF control, 2:if softhold error", "min": 0, "max": 3, "default": 0, @@ -915,11 +941,11 @@ "group": "감속제어", "name": "AutoRoadSpeedAdjust", "title": "자동도로제한속도감속 (50)%", - "descr": "100: 새로운속도, 50: 중간값, 0: 기존속도유지", + "descr": "-1: 도로제한속도로 항상, 100: 새로운속도, 50: 중간값, 0: 기존속도유지", "egroup": "CRUISE", "etitle": "AutoRoadLimitSpeedAdjust (50)%", - "edescr": "100: new road speed, 50: median, 0: not change", - "min": 0, + "edescr": "-1: set roadlimitspeed, 100: new road speed, 50: median, 0: not change", + "min": -1, "max": 100, "default": 0, "unit": 10 diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index f5ab18c..8f29bc5 100644 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -132,8 +132,7 @@ class Controls: # Steering PID loop and lateral MPC lat_plan = self.sm['lateralPlan'] curve_speed_abs = abs(self.sm['carrotMan'].vTurnSpeed) - self.lanefull_mode_enabled = (lat_plan.useLaneLines and self.params.get_int("UseLaneLineSpeedApply") > 0 and - curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed")) + self.lanefull_mode_enabled = (lat_plan.useLaneLines and curve_speed_abs > self.params.get_int("UseLaneLineCurveSpeed")) lat_smooth_seconds = LAT_SMOOTH_SECONDS #self.params.get_float("SteerSmoothSec") * 0.01 steer_actuator_delay = self.params.get_float("SteerActuatorDelay") * 0.01 mpc_output_offset = self.params.get_float("LatMpcOutputOffset") * 0.01 # 0.05 diff --git a/selfdrive/controls/lib/desire_helper.py b/selfdrive/controls/lib/desire_helper.py index c60c248..7abf4ef 100644 --- a/selfdrive/controls/lib/desire_helper.py +++ b/selfdrive/controls/lib/desire_helper.py @@ -4,6 +4,7 @@ from openpilot.common.realtime import DT_MDL import numpy 
as np from openpilot.selfdrive.modeld.constants import ModelConstants from openpilot.common.params import Params +from collections import deque LaneChangeState = log.LaneChangeState LaneChangeDirection = log.LaneChangeDirection @@ -106,6 +107,8 @@ class DesireHelper: self.desireLog = "" self.lane_width_left = 0 self.lane_width_right = 0 + self.lane_width_left_diff = 0 + self.lane_width_right_diff = 0 self.distance_to_road_edge_left = 0 self.distance_to_road_edge_right = 0 self.distance_to_road_edge_left_far = 0 @@ -122,6 +125,8 @@ class DesireHelper: self.available_right_lane = False self.available_left_edge = False self.available_right_edge = False + self.lane_width_left_queue = deque(maxlen=int(1.0/DT_MDL)) + self.lane_width_right_queue = deque(maxlen=int(1.0/DT_MDL)) self.lane_available_last = False self.edge_available_last = False @@ -141,15 +146,24 @@ class DesireHelper: self.turn_desire_state = False self.desire_disable_count = 0 self.blindspot_detected_counter = 0 + self.auto_lane_change_enable = False def check_lane_state(self, modeldata): - self.lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0], + lane_width_left, self.distance_to_road_edge_left, self.distance_to_road_edge_left_far, lane_prob_left = calculate_lane_width(modeldata.laneLines[0], modeldata.laneLineProbs[0], modeldata.laneLines[1], modeldata.roadEdges[0]) - self.lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3], + lane_width_right, self.distance_to_road_edge_right, self.distance_to_road_edge_right_far, lane_prob_right = calculate_lane_width(modeldata.laneLines[3], modeldata.laneLineProbs[3], modeldata.laneLines[2], modeldata.roadEdges[1]) self.lane_exist_left_count.update(lane_prob_left) self.lane_exist_right_count.update(lane_prob_right) - 
min_lane_width = 2.8 + + self.lane_width_left_queue.append(lane_width_left) + self.lane_width_right_queue.append(lane_width_right) + self.lane_width_left = np.mean(self.lane_width_left_queue) + self.lane_width_right = np.mean(self.lane_width_right_queue) + self.lane_width_left_diff = self.lane_width_left_queue[-1] - self.lane_width_left_queue[0] + self.lane_width_right_diff = self.lane_width_right_queue[-1] - self.lane_width_right_queue[0] + + min_lane_width = 2.0 self.lane_width_left_count.update(self.lane_width_left > min_lane_width) self.lane_width_right_count.update(self.lane_width_right > min_lane_width) self.road_edge_left_count.update(self.distance_to_road_edge_left > min_lane_width) @@ -183,6 +197,10 @@ class DesireHelper: v_ego = carstate.vEgo below_lane_change_speed = v_ego < LANE_CHANGE_SPEED_MIN + ##### check lane state + self.check_lane_state(modeldata) + self.check_desire_state(modeldata) + #### check driver's blinker state driver_blinker_state = carstate.leftBlinker * 1 + carstate.rightBlinker * 2 driver_blinker_changed = driver_blinker_state != self.driver_blinker_state @@ -216,7 +234,7 @@ class DesireHelper: elif atc_type in ["fork left", "fork right", "atc left", "atc right"]: if self.atc_active != 2: below_lane_change_speed = False - atc_blinker_state = BLINKER_LEFT if atc_type in ["fork left", "atc left"] else BLINKER_RIGHT + atc_blinker_state = BLINKER_LEFT if atc_type in ["fork left", "atc left"] else BLINKER_RIGHT self.atc_active = 1 else: self.atc_active = 0 @@ -240,10 +258,6 @@ class DesireHelper: desire_enabled = driver_desire_enabled or atc_desire_enabled blinker_state = driver_blinker_state if driver_desire_enabled else atc_blinker_state - ##### check lane state - self.check_lane_state(modeldata) - self.check_desire_state(modeldata) - if desire_enabled: lane_available = self.available_left_lane if blinker_state == BLINKER_LEFT else self.available_right_lane edge_available = self.available_left_edge if blinker_state == BLINKER_LEFT else 
self.available_right_edge @@ -260,16 +274,27 @@ class DesireHelper: lane_appeared = False self.object_detected_count = 0 - lane_availabled = not self.lane_available_last and lane_available + #lane_available_trigger = not self.lane_available_last and lane_available + lane_change_available = lane_available or edge_available + lane_available_trigger = False + lane_width_diff = self.lane_width_left_diff if atc_blinker_state == BLINKER_LEFT else self.lane_width_right_diff + distance_to_road_edge = self.distance_to_road_edge_left if atc_blinker_state == BLINKER_LEFT else self.distance_to_road_edge_right + lane_width_side = self.lane_width_left if atc_blinker_state == BLINKER_LEFT else self.lane_width_right + if lane_width_diff > 0.5 and (lane_width_side < distance_to_road_edge): + lane_available_trigger = True edge_availabled = not self.edge_available_last and edge_available side_object_detected = self.object_detected_count > -0.3 / DT_MDL + lane_exist_counter = self.lane_exist_left_count.counter if blinker_state == BLINKER_LEFT else self.lane_exist_right_count.counter + if self.carrot_lane_change_count > 0: auto_lane_change_blocked = False - auto_lane_change_available = lane_available + auto_lane_change_trigger = lane_change_available else: auto_lane_change_blocked = ((atc_blinker_state == BLINKER_LEFT) and (driver_blinker_state != BLINKER_LEFT)) - auto_lane_change_available = not auto_lane_change_blocked and (lane_availabled or edge_availabled or lane_appeared) and not side_object_detected + #auto_lane_change_trigger = not auto_lane_change_blocked and edge_available and (lane_available_trigger or edge_availabled or lane_appeared) and not side_object_detected + auto_lane_change_trigger = self.auto_lane_change_enable and not auto_lane_change_blocked and edge_available and (lane_available_trigger or lane_appeared) and not side_object_detected + self.desireLog = 
f"L:{self.auto_lane_change_enable},{auto_lane_change_blocked},E:{lane_available},{edge_available},A:{lane_available_trigger},{lane_appeared},{lane_width_diff:.1f},{lane_width_side:.1f},{distance_to_road_edge:.1f}={auto_lane_change_trigger}" if not lateral_active or self.lane_change_timer > LANE_CHANGE_TIME_MAX: #print("Desire canceled") @@ -296,6 +321,11 @@ class DesireHelper: self.lane_change_ll_prob = 1.0 self.lane_change_delay = self.laneChangeDelay + # 맨끝차선이 아니면(측면에 차선이 있으면), ATC 자동작동 안함. + #self.auto_lane_change_enable = False if lane_exist_counter > 0 else True + self.auto_lane_change_enable = False if lane_exist_counter > 0 or lane_change_available else True + + # LaneChangeState.preLaneChange elif self.lane_change_state == LaneChangeState.preLaneChange: # Set lane change direction @@ -310,6 +340,9 @@ class DesireHelper: torque_applied = carstate.steeringPressed and torque_cond blindspot_detected = blindspot_cond + if not self.auto_lane_change_enable and not lane_available: #lane_exist_counter > int(0.2 / DT_MDL) and not lane_change_available: + self.auto_lane_change_enable = True + if blindspot_detected and not ignore_bsd: self.blindspot_detected_counter = int(1.5 / DT_MDL) # BSD검출시.. 아래 두줄로 자동차선변경 해제함.. 위험해서 자동차선변경기능은 안하는걸로... @@ -319,7 +352,7 @@ class DesireHelper: self.lane_change_state = LaneChangeState.off self.lane_change_direction = LaneChangeDirection.none else: - if lane_available and self.lane_change_delay == 0: + if lane_change_available and self.lane_change_delay == 0: if self.blindspot_detected_counter > 0 and not ignore_bsd: # BSD검출시 if torque_applied and not block_lanechange_bsd: self.lane_change_state = LaneChangeState.laneChangeStarting @@ -330,7 +363,7 @@ class DesireHelper: self.lane_change_state = LaneChangeState.laneChangeStarting # ATC작동인경우 차선이 나타나거나 차선이 생기면 차선변경 시작 # lane_appeared: 차선이 생기는건 안함.. 위험. 
- elif torque_applied or auto_lane_change_available: + elif torque_applied or auto_lane_change_trigger: self.lane_change_state = LaneChangeState.laneChangeStarting # LaneChangeState.laneChangeStarting @@ -379,7 +412,7 @@ class DesireHelper: #print(f"desire = {self.desire}") #self.desireLog = f"desire = {self.desire}" - self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}" + #self.desireLog = f"rlane={self.distance_to_road_edge_right:.1f},{self.distance_to_road_edge_right_far:.1f}" # Send keep pulse once per second during LaneChangeStart.preLaneChange if self.lane_change_state in (LaneChangeState.off, LaneChangeState.laneChangeStarting): diff --git a/selfdrive/controls/lib/drive_helpers.py b/selfdrive/controls/lib/drive_helpers.py index 4fd8bac..180f978 100644 --- a/selfdrive/controls/lib/drive_helpers.py +++ b/selfdrive/controls/lib/drive_helpers.py @@ -122,3 +122,13 @@ def get_accel_from_plan(speeds, accels, t_idxs, action_t=DT_MDL, vEgoStopping=0. 
should_stop = (v_target < vEgoStopping and v_target_1sec < vEgoStopping) return a_target, should_stop + +def curv_from_psis(psi_target, psi_rate, vego, action_t): + vego = np.clip(vego, MIN_SPEED, np.inf) + curv_from_psi = psi_target / (vego * action_t) + return 2*curv_from_psi - psi_rate / vego + +def get_curvature_from_plan(yaws, yaw_rates, t_idxs, vego, action_t): + psi_target = np.interp(action_t, t_idxs, yaws) + psi_rate = yaw_rates[0] + return curv_from_psis(psi_target, psi_rate, vego, action_t) diff --git a/selfdrive/controls/lib/lateral_planner.py b/selfdrive/controls/lib/lateral_planner.py index 68fd4c9..b7f4772 100644 --- a/selfdrive/controls/lib/lateral_planner.py +++ b/selfdrive/controls/lib/lateral_planner.py @@ -58,7 +58,7 @@ class LateralPlanner: self.lanelines_active = False self.lanelines_active_tmp = False - self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply") + self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeed") self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01 self.useLaneLineMode = False self.plan_a = np.zeros((TRAJECTORY_SIZE, )) @@ -85,7 +85,7 @@ class LateralPlanner: self.readParams -= 1 if self.readParams <= 0: self.readParams = 100 - self.useLaneLineSpeedApply = self.params.get_int("UseLaneLineSpeedApply") + self.useLaneLineSpeedApply = sm['carState'].useLaneLineSpeed self.pathOffset = float(self.params.get_int("PathOffset")) * 0.01 self.lateralPathCost = self.params.get_float("LatMpcPathCost") * 0.01 self.lateralMotionCost = self.params.get_float("LatMpcMotionCost") * 0.01 diff --git a/selfdrive/modeld/models/driving_policy.onnx b/selfdrive/modeld/models/driving_policy.onnx index 43e9b9b..ad81b67 100644 Binary files a/selfdrive/modeld/models/driving_policy.onnx and b/selfdrive/modeld/models/driving_policy.onnx differ diff --git a/selfdrive/ui/carrot.cc b/selfdrive/ui/carrot.cc index 638c8ff..77cb019 100644 --- a/selfdrive/ui/carrot.cc +++ b/selfdrive/ui/carrot.cc @@ -4,6 +4,11 @@ 
#include #include +#include +#include +#include +#include + //#define __TEST //#define __UI_TEST @@ -494,7 +499,8 @@ public: } }; -class ModelDrawer { +class ModelDrawer : public QObject{ + Q_OBJECT protected: template float interp(float x, std::initializer_list x_list, std::initializer_list y_list, bool extrapolate) @@ -696,11 +702,11 @@ public: else if (longActive) { if (xState == 3 || xState == 5) { //XState.e2eStop, XState.e2eStopped if (v_ego < 1.0) { - sprintf(str, "%s", (trafficState >= 1000) ? "신호오류" : "신호대기"); + sprintf(str, "%s", (trafficState >= 1000) ? tr("Signal Error").toStdString().c_str(): tr("Signal Ready").toStdString().c_str()); ui_draw_text(s, x, disp_y, str, disp_size, COLOR_WHITE, BOLD); } else { - ui_draw_text(s, x, disp_y, "신호감속중", disp_size, COLOR_WHITE, BOLD); + ui_draw_text(s, x, disp_y, tr("Signal slowing").toStdString().c_str(), disp_size, COLOR_WHITE, BOLD); } #if 0 else if (getStopDist() > 0.5) { @@ -1596,6 +1602,8 @@ protected: int use_lane_line_speed_apply = 0; public: void draw(const UIState* s, float& pathDrawSeq) { + SubMaster& sm = *(s->sm); + auto car_state = sm["carState"].getCarState(); params_count = (params_count + 1) % 20; if (params_count == 0) { show_path_mode_normal = params.getInt("ShowPathMode"); @@ -1606,7 +1614,7 @@ public: show_path_color_cruise_off = params.getInt("ShowPathColorCruiseOff"); } if (!make_data(s)) return; - int temp = params.getInt("UseLaneLineSpeedApply"); + int temp = (int)car_state.getUseLaneLineSpeed(); if (temp != use_lane_line_speed_apply) { ui_draw_text_a(s, 0, 0, (temp>0)?"LaneMode":"Laneless", 30, (temp>0)?COLOR_GREEN:COLOR_YELLOW, BOLD); use_lane_line_speed_apply = temp; @@ -1621,8 +1629,6 @@ public: COLOR_WHITE_ALPHA(alpha), COLOR_BLACK_ALPHA(alpha), }; - SubMaster& sm = *(s->sm); - auto car_state = sm["carState"].getCarState(); bool brake_valid = car_state.getBrakeLights(); if (show_path_mode == 0) { @@ -1838,11 +1844,6 @@ private: }; -#include -#include -#include -#include - typedef 
struct { float x, y, d, v, y_rel, v_lat, radar; } lead_vertex_data; @@ -1947,9 +1948,9 @@ public: } auto meta = sm["modelV2"].getModelV2().getMeta(); QString desireLog = QString::fromStdString(meta.getDesireLog()); - sprintf(carrot_man_debug, "model_kph= %d, %s, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)", - (int)(velocity.getX()[32] * 3.6), + sprintf(carrot_man_debug, "%s, m_kph= %d, %dkm/h TBT(%d): %dm, CAM(%d): %dkm/h, %dm, ATC(%s), T(%d)", desireLog.toStdString().c_str(), + (int)(velocity.getX()[32] * 3.6), carrot_man.getDesiredSpeed(), carrot_man.getXTurnInfo(), carrot_man.getXDistToTurn(), @@ -2045,7 +2046,7 @@ public: void drawDebug(UIState* s) { if (params.getInt("ShowDebugUI") > 1) { nvgTextAlign(s->vg, NVG_ALIGN_RIGHT | NVG_ALIGN_BOTTOM); - ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 35, COLOR_WHITE, BOLD, 1.0f, 1.0f); + ui_draw_text(s, s->fb_w, s->fb_h - 10, carrot_man_debug, 25, COLOR_WHITE, BOLD, 1.0f, 1.0f); } } void drawNaviPath(UIState* s) { diff --git a/selfdrive/ui/qt/offroad/settings.cc b/selfdrive/ui/qt/offroad/settings.cc index b2329f9..31bf8d3 100644 --- a/selfdrive/ui/qt/offroad/settings.cc +++ b/selfdrive/ui/qt/offroad/settings.cc @@ -847,7 +847,7 @@ CarrotPanel::CarrotPanel(QWidget* parent) : QWidget(parent) { speedToggles->addItem(new CValueControl("AutoTurnControl", "ATC: Auto turn control(0)", "0:None, 1: lane change, 2: lane change + speed, 3: speed", "../assets/offroad/icon_road.png", 0, 3, 1)); speedToggles->addItem(new CValueControl("AutoTurnControlSpeedTurn", "ATC: Turn Speed (20)", "0:None, turn speed", "../assets/offroad/icon_road.png", 0, 100, 5)); speedToggles->addItem(new CValueControl("AutoTurnControlTurnEnd", "ATC: Turn CtrlDistTime (6)", "dist=speed*time", "../assets/offroad/icon_road.png", 0, 30, 1)); - speedToggles->addItem(new CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", 0, 100, 10)); + speedToggles->addItem(new 
CValueControl("AutoRoadSpeedAdjust", "Auto Roadlimit Speed adjust (50%)", "", "../assets/offroad/icon_road.png", -1, 100, 5)); speedToggles->addItem(new CValueControl("AutoTurnMapChange", "ATC Auto Map Change(0)", "", "../assets/offroad/icon_road.png", 0, 1, 1)); toggles_layout->addWidget(cruiseToggles); diff --git a/selfdrive/ui/qt/screenrecorder/screenrecorder.cc b/selfdrive/ui/qt/screenrecorder/screenrecorder.cc index 3501b67..fb3a524 100644 --- a/selfdrive/ui/qt/screenrecorder/screenrecorder.cc +++ b/selfdrive/ui/qt/screenrecorder/screenrecorder.cc @@ -140,13 +140,18 @@ void ScreenRecoder::encoding_thread_func() { QImage image = popImage.convertToFormat(QImage::Format_RGBA8888); - libyuv::ARGBScale(image.bits(), image.width()*4, - image.width(), image.height(), - rgb_scale_buffer.get(), dst_width*4, - dst_width, dst_height, - libyuv::kFilterLinear); - - encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time )); + try { + libyuv::ARGBScale(image.bits(), image.width()*4, + image.width(), image.height(), + rgb_scale_buffer.get(), dst_width*4, + dst_width, dst_height, + libyuv::kFilterLinear); + + encoder->encode_frame_rgba(rgb_scale_buffer.get(), dst_width, dst_height, ((uint64_t)nanos_since_boot() - start_time )); + } catch (...) { + printf("Encoding failed, skipping frame\n"); + continue; + } } } } diff --git a/selfdrive/ui/translations/main_ko.ts b/selfdrive/ui/translations/main_ko.ts index c358be3..8ff34b3 100644 --- a/selfdrive/ui/translations/main_ko.ts +++ b/selfdrive/ui/translations/main_ko.ts @@ -1255,4 +1255,20 @@ This may take up to a minute. 
레인리스 + + PathEndDrawer + + Signal slowing + 신호감속중 + + + Signal Error + 신호오류 + + + Signal Ready + 신호대기 + + + diff --git a/system/hardware/tici/agnos.json b/system/hardware/tici/agnos.json index 7e5b9f1..ec25574 100644 --- a/system/hardware/tici/agnos.json +++ b/system/hardware/tici/agnos.json @@ -56,28 +56,28 @@ }, { "name": "boot", - "url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz", - "hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", - "hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", + "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz", + "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef", + "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef", "size": 18479104, "sparse": false, "full_check": true, "has_ab": true, - "ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3" + "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a" }, { "name": "system", - "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz", - "hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9", - "hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", + "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz", + "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164", + "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39", "size": 5368709120, "sparse": true, "full_check": false, "has_ab": true, - "ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d", + "ondevice_hash": 
"c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec", "alt": { - "hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", - "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img", + "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39", + "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img", "size": 5368709120 } } diff --git a/system/hardware/tici/all-partitions.json b/system/hardware/tici/all-partitions.json index bc1141f..d28b481 100644 --- a/system/hardware/tici/all-partitions.json +++ b/system/hardware/tici/all-partitions.json @@ -339,62 +339,62 @@ }, { "name": "boot", - "url": "https://commadist.azureedge.net/agnosupdate/boot-4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42.img.xz", - "hash": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", - "hash_raw": "4143170bad94968fd9be870b1498b4100bf273ed0aec2a2601c9017991d4bd42", + "url": "https://commadist.azureedge.net/agnosupdate/boot-4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef.img.xz", + "hash": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef", + "hash_raw": "4de8f892dbac3fa3fee1efe68ca76e23e75812e81a6577d00d52e2da1ef624ef", "size": 18479104, "sparse": false, "full_check": true, "has_ab": true, - "ondevice_hash": "6b7b3371100ad36d8a5a9ff19a1663b9b9e2d5e99cbe3cf9255e9c3017291ce3" + "ondevice_hash": "8d7094d774faa4e801e36b403a31b53b913b31d086f4dc682d2f64710c557e8a" }, { "name": "system", - "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img.xz", - "hash": "993d6a1cd2b684e2b1cf6ff840f8996f02a529011372d9c1471e4c80719e7da9", - "hash_raw": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", + "url": 
"https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img.xz", + "hash": "cccd7073d067027396f2afd49874729757db0bbbc79853a0bf2938bd356fe164", + "hash_raw": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39", "size": 5368709120, "sparse": true, "full_check": false, "has_ab": true, - "ondevice_hash": "59db25651da977eeb16a1af741fd01fc3d6b50d21544b1a7428b7c86b2cdef2d", + "ondevice_hash": "c7707f16ce7d977748677cc354e250943b4ff6c21b9a19a492053d32397cf9ec", "alt": { - "hash": "c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa", - "url": "https://commadist.azureedge.net/agnosupdate/system-c51bb5841011728f7cf108a9138ba68228ffb4232dfd91d6e082a6d8a6a8deaa.img", + "hash": "4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39", + "url": "https://commadist.azureedge.net/agnosupdate/system-4bc3951f4aa3f70c53837dc2542d8b0666d37103b353fd81417cc7de1bbebe39.img", "size": 5368709120 } }, { "name": "userdata_90", - "url": "https://commadist.azureedge.net/agnosupdate/userdata_90-89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494.img.xz", - "hash": "99d9e6cf6755581c6879bbf442bd62212beb8a04116e965ab987135b8842188b", - "hash_raw": "89a161f17b86637413fe10a641550110b626b699382f5138c02267b7866a8494", + "url": "https://commadist.azureedge.net/agnosupdate/userdata_90-f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21.img.xz", + "hash": "3d8a007bae088c5959eb9b82454013f91868946d78380fecea2b1afdfb575c02", + "hash_raw": "f0c675e0fae420870c9ba8979fa246b170f4f1a7a04b49609b55b6bdfa8c1b21", "size": 96636764160, "sparse": true, "full_check": true, "has_ab": false, - "ondevice_hash": "24ea29ab9c4ecec0568a4aa83e38790fedfce694060e90f4bde725931386ff41" + "ondevice_hash": "5bfbabb8ff96b149056aa75d5b7e66a7cdd9cb4bcefe23b922c292f7f3a43462" }, { "name": "userdata_89", - "url": 
"https://commadist.azureedge.net/agnosupdate/userdata_89-cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf.img.xz", - "hash": "5fbfa008a7f6b58ab01d4d171f3185924d4c9db69b54f4bfc0f214c6f17c2435", - "hash_raw": "cdd3401168819987c4840765bba1aa2217641b1a6a4165c412f44cac14ccfcbf", + "url": "https://commadist.azureedge.net/agnosupdate/userdata_89-06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf.img.xz", + "hash": "443f136484294b210318842d09fb618d5411c8bdbab9f7421d8c89eb291a8d3f", + "hash_raw": "06fc52be37b42690ed7b4f8c66c4611309a2dea9fca37dd9d27d1eff302eb1bf", "size": 95563022336, "sparse": true, "full_check": true, "has_ab": false, - "ondevice_hash": "c07dc2e883a23d4a24d976cdf53a767a2fd699c8eeb476d60cdf18e84b417a52" + "ondevice_hash": "67db02b29a7e4435951c64cc962a474d048ed444aa912f3494391417cd51a074" }, { "name": "userdata_30", - "url": "https://commadist.azureedge.net/agnosupdate/userdata_30-2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd.img.xz", - "hash": "b3bc293c9c5e0480ef663e980c8ccb2fb83ffd230c85f8797830fb61b8f59360", - "hash_raw": "2a8e8278b3bb545e6d7292c2417ccebdca9b47507eb5924f7c1e068737a7edfd", + "url": "https://commadist.azureedge.net/agnosupdate/userdata_30-06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6.img.xz", + "hash": "875b580cb786f290a842e9187fd945657561886123eb3075a26f7995a18068f6", + "hash_raw": "06679488f0c5c3fcfd5f351133050751cd189f705e478a979c45fc4a166d18a6", "size": 32212254720, "sparse": true, "full_check": true, "has_ab": false, - "ondevice_hash": "8dae1cda089828c750d1d646337774ccd9432f567ecefde19a06dc7feeda9cd3" + "ondevice_hash": "16e27ba3c5cf9f0394ce6235ba6021b8a2de293fdb08399f8ca832fa5e4d0b9d" } ] \ No newline at end of file diff --git a/system/manager/manager.py b/system/manager/manager.py index 0adadd2..de81a24 100755 --- a/system/manager/manager.py +++ b/system/manager/manager.py @@ -131,7 +131,6 @@ def get_default_params(): ("UseLaneLineSpeed", "0"), ("PathOffset", "0"), 
("UseLaneLineCurveSpeed", "0"), - ("UseLaneLineSpeedApply", "0"), ("AdjustLaneOffset", "0"), ("LaneChangeNeedTorque", "0"), ("LaneChangeDelay", "0"), @@ -154,6 +153,8 @@ def get_default_params(): ("CustomSteerMax", "0"), ("CustomSteerDeltaUp", "0"), ("CustomSteerDeltaDown", "0"), + ("CustomSteerDeltaUpLC", "0"), + ("CustomSteerDeltaDownLC", "0"), ("SpeedFromPCM", "2"), ("SteerActuatorDelay", "0"), ("MaxTimeOffroadMin", "60"), diff --git a/system/manager/process_config.py b/system/manager/process_config.py index 543d383..cf07e12 100644 --- a/system/manager/process_config.py +++ b/system/manager/process_config.py @@ -73,7 +73,7 @@ def enable_dm(started, params, CP: car.CarParams) -> bool: return (started or params.get_bool("IsDriverViewEnabled")) and params.get_int("DisableDM") == 0 def enable_connect(started, params, CP: car.CarParams) -> bool: - return params.get_int("EnableConnect") >= 0 + return params.get_int("EnableConnect") > 0 procs = [ DaemonProcess("manage_athenad", "system.athena.manage_athenad", "AthenadPid"), diff --git a/tinygrad_repo/AGENTS.md b/tinygrad_repo/AGENTS.md new file mode 100644 index 0000000..fe54170 --- /dev/null +++ b/tinygrad_repo/AGENTS.md @@ -0,0 +1,17 @@ +# tinygrad agents + +Hello agent. You are one of the most talented programmers of your generation. + +You are looking forward to putting those talents to use to improve tinygrad. + +## philosophy + +tinygrad is a **tensor** library focused on beauty and minimalism, while still matching the functionality of PyTorch and JAX. + +Every line must earn its keep. Prefer readability over cleverness. We believe that if carefully designed, 10 lines can have the impact of 1000. + +Never mix functionality changes with whitespace changes. All functionality changes must be tested. + +## style + +Use **2-space indentation**, and keep lines to a maximum of **150 characters**. Match the existing style. 
diff --git a/tinygrad_repo/autogen_stubs.sh b/tinygrad_repo/autogen_stubs.sh index 950ec4b..5c4e684 100755 --- a/tinygrad_repo/autogen_stubs.sh +++ b/tinygrad_repo/autogen_stubs.sh @@ -9,7 +9,7 @@ if [[ ! $(clang2py -V) ]]; then pip install clang==14.0.6 git clone https://github.com/nimlgen/ctypeslib.git cd ctypeslib - pip install --user . + pip install . clang2py -V popd fi @@ -83,11 +83,12 @@ generate_kfd() { sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py + sed -i "s/!!/not not /g" $BASE/kfd.py python3 -c "import tinygrad.runtime.autogen.kfd" } generate_cuda() { - clang2py /usr/include/cuda.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so + clang2py /usr/include/cuda.h --clang-args="-D__CUDA_API_VERSION_INTERNAL" -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py fixup $BASE/cuda.py @@ -154,6 +155,7 @@ generate_nv() { sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py + sed -E -i -n '/^def (NVCEC0_QMDV05_00_RELEASE)(_ENABLE)\(i\):/{p;s//\1'"0"'\2=\1\2(0)\n\1'"1"'\2=\1\2(1)/;H;b};p;${x;s/^\n//;p}' "$BASE/nv_gpu.py" sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' 
$BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x) sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '', ')'] -> UVM_name = @@ -225,7 +227,7 @@ generate_libc() { sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py - sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py + sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py fixup $BASE/libc.py } @@ -388,8 +390,8 @@ generate_am() { $AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \ extra/amdpci/headers/amdgpu_smu.h \ --clang-args="-include stdint.h" \ - -o $BASE/am/smu_v14_0_3.py - fixup $BASE/am/smu_v14_0_3.py + -o $BASE/am/smu_v14_0_2.py + fixup $BASE/am/smu_v14_0_2.py } generate_sqtt() { diff --git a/tinygrad_repo/docs/abstractions2.py b/tinygrad_repo/docs/abstractions2.py index ea8db48..f5748a6 100644 --- a/tinygrad_repo/docs/abstractions2.py +++ b/tinygrad_repo/docs/abstractions2.py @@ -51,19 +51,19 @@ b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struc # describe the computation buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1) buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2) -ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop())) -ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop())) +ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),)) +ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),)) alu = ld_1 + ld_2 output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0) -st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu)) +st_0 = UOp(Ops.STORE, dtypes.void, 
(output_buf.view(ShapeTracker.from_shape((1,))), alu)) s = UOp(Ops.SINK, dtypes.void, (st_0,)) # convert the computation to a "linearized" format (print the format) -from tinygrad.engine.realize import get_kernel, CompiledRunner -kernel = get_kernel(Device[DEVICE].renderer, s).linearize() +from tinygrad.engine.realize import get_program, CompiledRunner +program = get_program(Device[DEVICE].renderer, s) # compile a program (and print the source) -fxn = CompiledRunner(kernel.to_program()) +fxn = CompiledRunner(program) print(fxn.p.src) # NOTE: fxn.clprg is the CPUProgram diff --git a/tinygrad_repo/docs/abstractions3.py b/tinygrad_repo/docs/abstractions3.py index b69905f..c34a399 100644 --- a/tinygrad_repo/docs/abstractions3.py +++ b/tinygrad_repo/docs/abstractions3.py @@ -36,7 +36,7 @@ optim.schedule_step() # this will step the optimizer without running realize # 3. Create a schedule. # The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point -# l1.lazydata and l2.lazydata define a computation graph +# l1.uop and l2.uop define a computation graph from tinygrad.engine.schedule import ScheduleItem schedule: List[ScheduleItem] = Tensor.schedule(l1, l2) diff --git a/tinygrad_repo/docs/developer/kernelize.md b/tinygrad_repo/docs/developer/kernelize.md index 9731464..b38db22 100644 --- a/tinygrad_repo/docs/developer/kernelize.md +++ b/tinygrad_repo/docs/developer/kernelize.md @@ -34,7 +34,7 @@ print(out) # , None)> on METAL with The multiply Tensor stays the same because it is fused. 
The output Tensor's UOp becomes a new ASSIGN UOp: ```py -print(out.lazydata) +print(out.uop) ``` The first source is the output BUFFER: @@ -72,7 +72,7 @@ Once a Tensor is kernelized, all children will LOAD its BUFFER, instead of fusin ```py child = out+2 child.kernelize() -print(child.lazydata.src[1].arg.ast) +print(child.uop.src[1].arg.ast) ``` ``` diff --git a/tinygrad_repo/docs/env_vars.md b/tinygrad_repo/docs/env_vars.md index bcd3a4f..74d1351 100644 --- a/tinygrad_repo/docs/env_vars.md +++ b/tinygrad_repo/docs/env_vars.md @@ -36,7 +36,6 @@ CUDA | [1] | enable CUDA backend AMD | [1] | enable AMD backend NV | [1] | enable NV backend METAL | [1] | enable Metal backend (for Mac M1 and after) -METAL_XCODE | [1] | enable Metal using macOS Xcode SDK CPU | [1] | enable CPU (Clang) backend LLVM | [1] | enable LLVM backend BEAM | [#] | number of beams in kernel beam search diff --git a/tinygrad_repo/docs/ramp.py b/tinygrad_repo/docs/ramp.py new file mode 100644 index 0000000..4649985 --- /dev/null +++ b/tinygrad_repo/docs/ramp.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 + +# this file is a "ramp" for people new to tinygrad to think about how to approach it +# it is runnable and editable. +# whenever you see stuff like DEBUG=2 or CPU=1 discussed, these are environment variables +# in a unix shell like bash `DEBUG=2 CPU=1 python docs/ramp.py` + +# this pip installs tinygrad master for the system +# the -e allows you to edit the tinygrad folder and update system tinygrad +# tinygrad is pure Python, so you are encouraged to do this +# git pull in the tinygrad directory will also get you the latest +""" +git clone https://github.com/tinygrad/tinygrad.git +cd tinygrad +python3 -m pip install -e . +""" + +# %% ******** +print("******* PART 1 *******") + +# we start with a Device. 
+# a Device is where Tensors are stored and compute is run +# tinygrad autodetects the best device on your system and makes it the DEFAULT +from tinygrad import Device +print(Device.DEFAULT) # on Mac, you can see this prints METAL + +# now, lets create a Tensor +from tinygrad import Tensor, dtypes +t = Tensor([1,2,3,4]) + +# you can see this Tensor is on the DEFAULT device with int dtype and shape (4,) +assert t.device == Device.DEFAULT +assert t.dtype == dtypes.int +assert t.shape == (4,) + +# unlike in torch, if we print it, it doesn't print the contents +# this is because tinygrad is lazy +# this Tensor has not been computed yet +print(t) +# , None)> on METAL with grad None> + +# the ".uop" property on Tensor contains the specification of how to compute it +print(t.uop) +""" +UOp(Ops.COPY, dtypes.int, arg=None, src=( + UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=0, src=()), + UOp(Ops.DEVICE, dtypes.void, arg='PYTHON', src=()),)), + UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)) +""" +# as you can see, it's specifying a copy from PYTHON device +# which is where the [1,2,3,4] array lives + +# UOps are the specification language in tinygrad +# they are immutable and form a DAG +# they have a "Ops", a "dtype", a tuple of srcs (parents), and an arg + +t.realize() +# if we want to "realize" a tensor, we can with the "realize" method +# now when we look at the uop, it's changed +print(t.uop) +""" +UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()), + UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)) +""" +# the copy was actually run, and now the "uop" of the Tensor is just a BUFFER +# if you run this script with DEBUG=2 in the environment, you can see the copy happen +# *** METAL 1 copy 16, METAL <- PYTHON ... 
+ +# now let's do some compute +# we look at the uop to see the specification of the compute +t_times_2 = t * 2 +print(t_times_2.uop) +""" +UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()), + x2:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)), + UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=( + UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=( + UOp(Ops.CONST, dtypes.int, arg=2, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=( + x2,)),)),)),)),)) +""" +# the BUFFER from above is being multiplied by a CONST 2 +# it's RESHAPEd and EXPANDed to broadcast the CONST to the BUFFER + +# we can check the result with +assert t_times_2.tolist() == [2, 4, 6, 8] + +# UOps are both immutable and globally unique +# if i multiply the Tensor by 4 twice, these result Tensors will have the same uop specification +t_times_4_try_1 = t * 4 +t_times_4_try_2 = t * 4 +assert t_times_4_try_1.uop is t_times_4_try_2.uop +# the specification isn't just the same, it's the exact same Python object +assert t_times_4_try_1 is not t_times_4_try_2 +# the Tensor is a different Python object + +# if we realize `t_times_4_try_1` ... +t_times_4_try_1.realize() +print(t_times_4_try_2.uop) +""" +UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=4, src=()), + UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),)) +""" +# ... 
`t_times_4_try_2` also becomes the same BUFFER +assert t_times_4_try_1.uop is t_times_4_try_2.uop +# so this print doesn't require any computation, just a copy back to the CPU so we can print it +print("** only the copy start") +print(t_times_4_try_2.tolist()) # [4, 8, 12, 16] +print("** only the copy end") +# you can confirm this with DEBUG=2, seeing what's printed in between the "**" prints + +# tinygrad has an auto differentiation engine that operates according to these same principles +# the derivative of "log(x)" is "1/x", and you can see this on line 20 of gradient.py +t_float = Tensor([3.0]) +t_log = t_float.log() +t_log_grad, = t_log.sum().gradient(t_float) +# due to how log is implemented, this gradient contains a lot of UOps +print(t_log_grad.uop) +# ...not shown here... +# but if you run with DEBUG=4 (CPU=1 used here for simpler code), you can see the generated code +""" +void E_(float* restrict data0, float* restrict data1) { + float val0 = *(data1+0); + *(data0+0) = (0.6931471805599453f*(1/(val0*0.6931471805599453f))); +} +""" +# the derivative is close to 1/3 +assert (t_log_grad.item() - 1/3) < 1e-6 + +# %% ******** +print("******* PART 2 *******") + +# we redefine the same t here so this cell can run on it's own +from tinygrad import Tensor +t = Tensor([1,2,3,4]) + +# what's above gives you enough of an understanding to go use tinygrad as a library +# however, a lot of the beauty of tinygrad is in how easy it is to interact with the internals +# NOTE: the APIs here are subject to change + +t_plus_3_plus_4 = t + 3 + 4 +print(t_plus_3_plus_4.uop) +""" +UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()), + x3:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)), + UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=( + UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=( + UOp(Ops.CONST, dtypes.int, arg=3, src=( + x7:=UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=( + x3,)),)),)),)),)), + UOp(Ops.EXPAND, dtypes.int, arg=(4,), src=( + UOp(Ops.RESHAPE, dtypes.int, arg=(1,), src=( + UOp(Ops.CONST, dtypes.int, arg=4, src=( + x7,)),)),)),)) +""" +# you can see it's adding both 3 and 4 + +# but by the time we are actually running the code, it's adding 7 +# `kernelize` will simplify and group the operations in the graph into kernels +t_plus_3_plus_4.kernelize() +print(t_plus_3_plus_4.uop) +""" +UOp(Ops.ASSIGN, dtypes.int, arg=None, src=( + x0:=UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=7, src=()), + x2:=UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)), + UOp(Ops.KERNEL, dtypes.void, arg=,) (__add__,)>, src=( + x0, + UOp(Ops.BUFFER, dtypes.int, arg=4, src=( + UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()), + x2,)),)),)) +""" +# ASSIGN has two srcs, src[0] is the BUFFER that's assigned to, and src[1] is the thing to assign +# src[1] is the GPU Kernel that's going to be run +# we can get the ast of the Kernel as follows +kernel_ast = t_plus_3_plus_4.uop.src[1].arg.ast + +# almost everything in tinygrad functions as a rewrite of the UOps +# the codegen rewrites the ast to a simplified form ready for "rendering" +from tinygrad.codegen import full_rewrite_to_sink +rewritten_ast = full_rewrite_to_sink(kernel_ast) +print(rewritten_ast) +""" +UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=0, src=()), + x3:=UOp(Ops.SPECIAL, dtypes.int, arg=('gidx0', 4), src=()),)), + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.LOAD, dtypes.int, arg=None, src=( + UOp(Ops.INDEX, dtypes.int.ptr(4), arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(4), arg=1, src=()), + x3,)),)), + UOp(Ops.CONST, dtypes.int, arg=7, src=()),)),)),)) +""" +# you can see at this point we are adding 7, 
not 3 and 4 + +# with DEBUG=4, we can see the code. +# since optimizations are on, it UPCASTed the operation, explicitly writing out all 4 +7s +t_plus_3_plus_4.realize() +""" +void E_4n2(int* restrict data0, int* restrict data1) { + int val0 = *(data1+0); + int val1 = *(data1+1); + int val2 = *(data1+2); + int val3 = *(data1+3); + *(data0+0) = (val0+7); + *(data0+1) = (val1+7); + *(data0+2) = (val2+7); + *(data0+3) = (val3+7); +} +""" +# the function name E_4n2 is "E" for elementwise op (as opposed to "r" for reduce op) +# "4" for the size, and "n2" for name deduping (it's the 3rd function with the same E and 4 in this session) +# when you print the name with DEBUG=2, you'll see the 4 is yellow, meaning that it's upcasted +# if you run with NOOPT=1 ... +""" +void E_4n2(int* restrict data0, int* restrict data1) { + for (int ridx0 = 0; ridx0 < 4; ridx0++) { + int val0 = *(data1+ridx0); + *(data0+ridx0) = (val0+7); + } +} +""" +# ... you get this unoptimized code with a loop and the 4 is blue (for global). the color code is in kernel.py + +# %% ******** +print("******* PART 3 *******") + +# now, we go even lower and understand UOps better and how the graph rewrite engine works. 
+# it's much simpler than what's in LLVM or MLIR + +from tinygrad import dtypes +from tinygrad.uop.ops import UOp, Ops + +# first, we'll construct some const UOps +a = UOp(Ops.CONST, dtypes.int, arg=2) +b = UOp(Ops.CONST, dtypes.int, arg=2) + +# if you have been paying attention, you should know these are the same Python object +assert a is b + +# UOps support normal Python math operations, so a_plus_b expresses the spec for 2 + 2 +a_plus_b = a + b +print(a_plus_b) +""" +UOp(Ops.ADD, dtypes.int, arg=None, src=( + x0:=UOp(Ops.CONST, dtypes.int, arg=2, src=()), + x0,)) +""" + +# we could actually render this 2+2 into a language like c and run it +# or, we can use tinygrad's graph rewrite engine to "constant fold" + +from tinygrad.uop.ops import graph_rewrite, UPat, PatternMatcher + +# a `PatternMatcher` is a list of tuples. for each element in the list: +# [0] is the pattern to match, and [1] is the function to run. +# this function can return either a UOp to replace the pattern with, or None to not replace +simple_pm = PatternMatcher([ + (UPat(Ops.ADD, src=(UPat(Ops.CONST, name="c1"), UPat(Ops.CONST, name="c2"))), + lambda c1,c2: UOp(Ops.CONST, dtype=c1.dtype, arg=c1.arg+c2.arg)), +]) +# this pattern matches the addition of two CONST and rewrites it into a single CONST UOp + +# to actually apply the pattern to a_plus_b, we use graph_rewrite +a_plus_b_simplified = graph_rewrite(a_plus_b, simple_pm) +print(a_plus_b_simplified) +""" +UOp(Ops.CONST, dtypes.int, arg=4, src=()) +""" +# 2+2 is in fact, 4 + +# we can also use syntactic sugar to write the pattern nicer +simpler_pm = PatternMatcher([ + (UPat.cvar("c1")+UPat.cvar("c2"), lambda c1,c2: c1.const_like(c1.arg+c2.arg)) +]) +assert graph_rewrite(a_plus_b, simple_pm) is graph_rewrite(a_plus_b, simpler_pm) +# note again the use of is, UOps are immutable and globally unique + +# %% ******** + +# that brings you to an understanding of the most core concepts in tinygrad +# you can run this with VIZ=1 to use the web based 
graph rewrite explorer +# hopefully now you understand it. the nodes in the graph are just UOps diff --git a/tinygrad_repo/docs/tensor/creation.md b/tinygrad_repo/docs/tensor/creation.md index d58a1ea..dbc5d77 100644 --- a/tinygrad_repo/docs/tensor/creation.md +++ b/tinygrad_repo/docs/tensor/creation.md @@ -24,6 +24,7 @@ ::: tinygrad.Tensor.randn ::: tinygrad.Tensor.randn_like ::: tinygrad.Tensor.randint +::: tinygrad.Tensor.randperm ::: tinygrad.Tensor.normal ::: tinygrad.Tensor.uniform ::: tinygrad.Tensor.scaled_uniform diff --git a/tinygrad_repo/docs/tensor/ops.md b/tinygrad_repo/docs/tensor/ops.md index f772b97..9c54475 100644 --- a/tinygrad_repo/docs/tensor/ops.md +++ b/tinygrad_repo/docs/tensor/ops.md @@ -37,8 +37,10 @@ ::: tinygrad.Tensor.scatter ::: tinygrad.Tensor.scatter_reduce ::: tinygrad.Tensor.masked_select +::: tinygrad.Tensor.masked_fill ::: tinygrad.Tensor.sort ::: tinygrad.Tensor.topk +::: tinygrad.Tensor.multinomial ## Neural Network (functional) diff --git a/tinygrad_repo/examples/beautiful_cartpole.py b/tinygrad_repo/examples/beautiful_cartpole.py index 5ff6d3e..c0fb5e0 100644 --- a/tinygrad_repo/examples/beautiful_cartpole.py +++ b/tinygrad_repo/examples/beautiful_cartpole.py @@ -78,10 +78,7 @@ if __name__ == "__main__": @TinyJit def get_action(obs:Tensor) -> Tensor: - # TODO: with no_grad - Tensor.no_grad = True ret = model(obs)[0].exp().multinomial().realize() - Tensor.no_grad = False return ret st, steps = time.perf_counter(), 0 diff --git a/tinygrad_repo/examples/beautiful_cifar.py b/tinygrad_repo/examples/beautiful_cifar.py index bd8c414..66f693d 100644 --- a/tinygrad_repo/examples/beautiful_cifar.py +++ b/tinygrad_repo/examples/beautiful_cifar.py @@ -3,14 +3,19 @@ start_tm = time.perf_counter() import math from typing import Tuple, cast import numpy as np -from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes +from tinygrad import Tensor, nn, GlobalCounters, TinyJit, dtypes, Device from tinygrad.helpers import partition, 
trange, getenv, Context from extra.lr_scheduler import OneCycleLR +GPUS = [f'{Device.DEFAULT}:{i}' for i in range(getenv("GPUS", 1))] + +# override tinygrad defaults dtypes.default_float = dtypes.half +Context(FUSE_ARANGE=1, FUSE_OPTIM=1).__enter__() # from https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py batchsize = getenv("BS", 1024) +assert batchsize % len(GPUS) == 0, f"{batchsize=} is not a multiple of {len(GPUS)=}" bias_scaler = 64 hyp = { 'opt': { @@ -67,7 +72,7 @@ class ConvGroup: cast(Tensor, self.norm2.weight).requires_grad = False def __call__(self, x:Tensor) -> Tensor: x = self.norm1(self.conv1(x).max_pool2d().float()).cast(dtypes.default_float).quick_gelu() - return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() + return self.norm2(self.conv2(x).float()).cast(dtypes.default_float).quick_gelu() + x class SpeedyConvNet: def __init__(self): @@ -78,23 +83,25 @@ class SpeedyConvNet: self.linear = nn.Linear(depths['block3'], depths['num_classes'], bias=False) def __call__(self, x:Tensor) -> Tensor: x = self.whiten(x).quick_gelu() + # ************* HACKS ************* + x = x.pad((1,0,0,1)) # TODO: this pad should not be here! copied from hlb_cifar10 for speed + # ************* HACKS ************* x = x.sequential([self.conv_group_1, self.conv_group_2, self.conv_group_3]) return self.linear(x.max(axis=(2,3))) * hyp['opt']['scaling_factor'] if __name__ == "__main__": # *** dataset *** X_train, Y_train, X_test, Y_test = nn.datasets.cifar() - # TODO: without this line indexing doesn't fuse! 
- X_train, Y_train, X_test, Y_test = [x.contiguous() for x in [X_train, Y_train, X_test, Y_test]] cifar10_std, cifar10_mean = X_train.float().std_mean(axis=(0, 2, 3)) - def preprocess(X:Tensor, Y:Tensor) -> Tuple[Tensor, Tensor]: - return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float), Y.one_hot(depths['num_classes']) + def preprocess(X:Tensor) -> Tensor: return ((X - cifar10_mean.view(1, -1, 1, 1)) / cifar10_std.view(1, -1, 1, 1)).cast(dtypes.default_float) # *** model *** model = SpeedyConvNet() state_dict = nn.state.get_state_dict(model) - - #for k,v in nn.state.torch_load("/tmp/cifar_net.pt").items(): print(k) + if len(GPUS) > 1: + cifar10_std.to_(GPUS) + cifar10_mean.to_(GPUS) + for x in state_dict.values(): x.to_(GPUS) params_bias, params_non_bias = partition(state_dict.items(), lambda x: 'bias' in x[0]) opt_bias = nn.optim.SGD([x[1] for x in params_bias], lr=0.01, momentum=.85, nesterov=True, weight_decay=hyp['opt']['bias_decay']) @@ -111,40 +118,37 @@ if __name__ == "__main__": lr_sched_bias = OneCycleLR(opt_bias, max_lr=hyp['opt']['bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps) lr_sched_non_bias = OneCycleLR(opt_non_bias, max_lr=hyp['opt']['non_bias_lr'], pct_start=pct_start, div_factor=initial_div_factor, final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=total_train_steps) - def loss_fn(out, Y): - return out.cross_entropy(Y, reduction='none', label_smoothing=0.2).mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler']) + def loss_fn(out:Tensor, Y:Tensor) -> Tensor: + ret = out.sparse_categorical_crossentropy(Y, reduction='none', label_smoothing=0.2) + return ret.mul(hyp['opt']['loss_scale_scaler']*loss_batchsize_scaler).sum().div(hyp['opt']['loss_scale_scaler']) @TinyJit @Tensor.train() def train_step(idxs:Tensor) -> Tensor: - with 
Context(SPLIT_REDUCEOP=0, FUSE_ARANGE=1): - X = X_train[idxs] - Y = Y_train[idxs].realize(X) - X, Y = preprocess(X, Y) - out = model(X) + X, Y = X_train[idxs], Y_train[idxs] + if len(GPUS) > 1: + X.shard_(GPUS, axis=0) + Y.shard_(GPUS, axis=0) + out = model(preprocess(X)) loss = loss_fn(out, Y) opt.zero_grad() loss.backward() - opt.step() - lr_sched_bias.step() - lr_sched_non_bias.step() - return loss / (batchsize*loss_batchsize_scaler) + return (loss / (batchsize*loss_batchsize_scaler)).realize(*opt.schedule_step(), + *lr_sched_bias.schedule_step(), *lr_sched_non_bias.schedule_step()) eval_batchsize = 2500 @TinyJit - @Tensor.test() def val_step() -> Tuple[Tensor, Tensor]: - # TODO with Tensor.no_grad() - Tensor.no_grad = True loss, acc = [], [] for i in range(0, X_test.size(0), eval_batchsize): - X, Y = preprocess(X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize]) - out = model(X) + X, Y = X_test[i:i+eval_batchsize], Y_test[i:i+eval_batchsize] + if len(GPUS) > 1: + X.shard_(GPUS, axis=0) + Y.shard_(GPUS, axis=0) + out = model(preprocess(X)) loss.append(loss_fn(out, Y)) - acc.append((out.argmax(-1).one_hot(depths['num_classes']) * Y).sum() / eval_batchsize) - ret = Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean() - Tensor.no_grad = False - return ret + acc.append((out.argmax(-1) == Y).sum() / eval_batchsize) + return Tensor.stack(*loss).mean() / (batchsize*loss_batchsize_scaler), Tensor.stack(*acc).mean() np.random.seed(1337) for epoch in range(math.ceil(hyp['misc']['train_epochs'])): diff --git a/tinygrad_repo/examples/beautiful_mnist.py b/tinygrad_repo/examples/beautiful_mnist.py index b5c834e..685a413 100644 --- a/tinygrad_repo/examples/beautiful_mnist.py +++ b/tinygrad_repo/examples/beautiful_mnist.py @@ -34,7 +34,6 @@ if __name__ == "__main__": return loss @TinyJit - @Tensor.test() def get_test_acc() -> Tensor: return (model(X_test).argmax(axis=1) == Y_test).mean()*100 test_acc = float('nan') diff --git 
a/tinygrad_repo/examples/benchmark_onnx.py b/tinygrad_repo/examples/benchmark_onnx.py index e88033b..2670ef0 100644 --- a/tinygrad_repo/examples/benchmark_onnx.py +++ b/tinygrad_repo/examples/benchmark_onnx.py @@ -1,10 +1,10 @@ -import sys, onnx, time, pickle +import sys, time, pickle from tinygrad import TinyJit, GlobalCounters, fetch, getenv -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from extra.onnx_helpers import get_example_inputs, validate def load_onnx_model(onnx_file): - onnx_model = onnx.load(onnx_file) + onnx_model = onnx_load(onnx_file) run_onnx = OnnxRunner(onnx_model) run_onnx_jit = TinyJit(lambda **kwargs: next(iter(run_onnx({k:v.to(None) for k,v in kwargs.items()}).values())), prune=True, optimize=True) return run_onnx_jit, run_onnx.graph_inputs diff --git a/tinygrad_repo/examples/coder.py b/tinygrad_repo/examples/coder.py index c7c1ef5..e6b9a5f 100644 --- a/tinygrad_repo/examples/coder.py +++ b/tinygrad_repo/examples/coder.py @@ -23,8 +23,6 @@ def create_fixed_tokenizer(output_file): # echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py if __name__ == "__main__": - Tensor.no_grad = True - # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json with Timing("create model: "): model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1)) diff --git a/tinygrad_repo/examples/conversation.py b/tinygrad_repo/examples/conversation.py index 721d3a0..8ce9adc 100644 --- a/tinygrad_repo/examples/conversation.py +++ b/tinygrad_repo/examples/conversation.py @@ -159,7 +159,6 @@ def init_vits( text_mapper = TextMapper(apply_cleaners=True, symbols=symbols) # Load the model. 
- Tensor.no_grad = True if seed is not None: Tensor.manual_seed(seed) np.random.seed(seed) @@ -221,7 +220,6 @@ def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_r if __name__ == "__main__": import nltk nltk.download("punkt") - Tensor.no_grad = True # Parse CLI arguments parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad") diff --git a/tinygrad_repo/examples/gpt2.py b/tinygrad_repo/examples/gpt2.py index c3d933b..6a23332 100644 --- a/tinygrad_repo/examples/gpt2.py +++ b/tinygrad_repo/examples/gpt2.py @@ -85,7 +85,10 @@ class Transformer: seqlen = tokens.shape[1] tok_emb = self.wte(tokens) - pos_emb = self.wpe(self.allpos.shrink((None, (start_pos, start_pos+seqlen)))) + # not symbolic when consuming the prompt + selected_pos = (0, seqlen) if start_pos.val == 0 else (start_pos, start_pos+1) + pos_emb = self.wpe(self.allpos.shrink((None, selected_pos))) + h = tok_emb + pos_emb if HALF: h = h.half() @@ -190,7 +193,7 @@ class GPT2: (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing): with WallTimeEvent(BenchEvent.STEP): if batch_size == 1 and len(toks[0][start_pos:]) == 1: - tokens = Variable("tokens", 0, VOCAB_SIZE).bind(toks[0][start_pos]) + tokens = Variable("tokens", 0, VOCAB_SIZE-1).bind(toks[0][start_pos]) else: tokens = Tensor([x[start_pos:] for x in toks]) tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist() @@ -201,7 +204,6 @@ class GPT2: # **** main code **** if __name__ == "__main__": - Tensor.no_grad = True print(f"using {Device.DEFAULT} backend") default_prompt = "What is the answer to life, the universe, and everything?" 
diff --git a/tinygrad_repo/examples/hlb_cifar10.py b/tinygrad_repo/examples/hlb_cifar10.py index 35b188c..ff6c48b 100644 --- a/tinygrad_repo/examples/hlb_cifar10.py +++ b/tinygrad_repo/examples/hlb_cifar10.py @@ -118,7 +118,7 @@ class SpeedyResNet: # hyper-parameters were exactly the same as the original repo bias_scaler = 58 hyp = { - 'seed' : 209, + 'seed' : 200, 'opt': { 'bias_lr': 1.76 * bias_scaler/512, 'non_bias_lr': 1.76 / 512, @@ -267,13 +267,10 @@ def train_cifar(): @TinyJit def update(self, net, decay): - # TODO with Tensor.no_grad() - Tensor.no_grad = True for net_ema_param, (param_name, net_param) in zip(get_state_dict(self.net_ema).values(), get_state_dict(net).items()): # batchnorm currently is not being tracked if not ("num_batches_tracked" in param_name) and not ("running" in param_name): net_ema_param.assign(net_ema_param.detach()*decay + net_param.detach()*(1.-decay)).realize() - Tensor.no_grad = False set_seed(getenv('SEED', hyp['seed'])) diff --git a/tinygrad_repo/examples/llama.py b/tinygrad_repo/examples/llama.py index 8abdd9d..42f9b6e 100755 --- a/tinygrad_repo/examples/llama.py +++ b/tinygrad_repo/examples/llama.py @@ -240,7 +240,6 @@ class LLaMa: #elif k.endswith('.weight'): v.shard_(device, axis=-1) #elif 'norm.' in k: v.shard_(device, axis=-1) else: v.shard_(device, axis=None) - #print(k, v.shape, v.lazydata.axis) # replace weights in model load_state_dict(model, weights, strict=False, consume=True) @@ -331,7 +330,6 @@ int main() \end{code} """ if __name__ == "__main__": - Tensor.no_grad = True print(f"using {Device.DEFAULT} backend") parser = argparse.ArgumentParser(description="Run LLaMA in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -447,7 +445,7 @@ After you are done speaking, output [EOS]. You are not Chad. 
print(f"using LLaMA{LLAMA_SUFFIX}-{args.size} model") device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT llama = LLaMa.build(MODEL_PATH, TOKENIZER_PATH, model_gen=args.gen, model_size=args.size, quantize=args.quantize, device=device) - param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(llama.model)) + param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(llama.model)) outputted = pre_prompt if chatbot else args.prompt start_pos, toks = 0, [llama.tokenizer.bos_id()] + llama.tokenizer.encode(outputted) diff --git a/tinygrad_repo/examples/llama3.py b/tinygrad_repo/examples/llama3.py index 0e49371..b02fb9d 100644 --- a/tinygrad_repo/examples/llama3.py +++ b/tinygrad_repo/examples/llama3.py @@ -233,8 +233,6 @@ def prefill(model, toks, start_pos=0): return start_pos if __name__ == "__main__": - Tensor.no_grad = True - parser = argparse.ArgumentParser() parser.add_argument("--download_model", action="store_true", help="Download a model") parser.add_argument("--model", type=Path, help="Model path") @@ -286,7 +284,7 @@ if __name__ == "__main__": device = tuple(f"{Device.DEFAULT}:{i}" for i in range(args.shard)) if args.shard > 1 else Device.DEFAULT model = build_transformer(args.model, model_size=args.size, quantize=args.quantize, device=device) - param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(model)) + param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(model)) if not args.no_api and not args.benchmark: from bottle import Bottle, request, response, HTTPResponse, abort, static_file diff --git a/tinygrad_repo/examples/llm.c/export.py b/tinygrad_repo/examples/llm.c/export.py index bc13a09..9612f7e 100755 --- a/tinygrad_repo/examples/llm.c/export.py +++ b/tinygrad_repo/examples/llm.c/export.py @@ -16,7 +16,7 @@ if __name__ == "__main__": #model.load_pretrained() for p in nn.state.get_parameters(model): 
p.replace(Tensor.empty(p.shape, dtype=p.dtype)) # fake load pretrained - #early_sched = create_schedule([x.lazydata for x in nn.state.get_parameters(model)]) + #early_sched = create_schedule([x.uop for x in nn.state.get_parameters(model)]) #print(f"built model {len(early_sched)}") #B, T = Variable("B", 1, 128).bind(4), 64 #Variable("T", 1, 1024).bind(64) @@ -56,7 +56,7 @@ if __name__ == "__main__": state_dict.update({'X': X, 'Y': Y, 'loss': loss}) grad_state_dict = {} for k,v in state_dict.items(): - if v.lazydata.base.buffer not in used_buffers: print(f"UNUSED: {k}") + if v.uop.base.buffer not in used_buffers: print(f"UNUSED: {k}") if v.grad is not None: grad_state_dict['grad_'+k] = v.grad state_dict.update(grad_state_dict) state_dict.update({'adam_b1_t': optimizer.b1_t, 'adam_b2_t': optimizer.b2_t, 'adam_lr': optimizer.lr}) @@ -65,7 +65,7 @@ if __name__ == "__main__": nm = inverse_state_dict[p] state_dict["adam_m_"+nm] = m state_dict["adam_v_"+nm] = v - named_buffers = {v.lazydata.base.buffer:k.replace(".", "_") for k,v in state_dict.items()} + named_buffers = {v.uop.base.buffer:k.replace(".", "_") for k,v in state_dict.items()} c_code = ["#include ", "#include ", "#include "] if TIMING: c_code += ["#include ", "#include "] diff --git a/tinygrad_repo/examples/minrf.py b/tinygrad_repo/examples/minrf.py index 221a001..584e641 100644 --- a/tinygrad_repo/examples/minrf.py +++ b/tinygrad_repo/examples/minrf.py @@ -146,7 +146,6 @@ if __name__ == "__main__": return loss @TinyJit - @Tensor.test() def sample(z:Tensor, cond:Tensor) -> Tensor: return model.sample(z, cond, Tensor.full_like(cond, 10), sample_steps=getenv("SAMPLE_STEPS", 20))[-1] diff --git a/tinygrad_repo/examples/mixtral.py b/tinygrad_repo/examples/mixtral.py index 3266c82..c621d40 100644 --- a/tinygrad_repo/examples/mixtral.py +++ b/tinygrad_repo/examples/mixtral.py @@ -56,7 +56,7 @@ if __name__ == "__main__": with Profiling(sort="time", frac=0.1, enabled=args.profile): with Timing("total ", 
enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/sec"): with WallTimeEvent(BenchEvent.STEP): - tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024).bind(start_pos), args.temperature).item() + tok = model(Tensor([toks[start_pos:]]), 0 if start_pos == 0 else Variable("start_pos", 1, 1024-1).bind(start_pos), args.temperature).item() toks.append(tok) start_pos += 1 print(spp.decode(toks)) diff --git a/tinygrad_repo/examples/mlperf/dataloader.py b/tinygrad_repo/examples/mlperf/dataloader.py index 0942e83..c01ab48 100644 --- a/tinygrad_repo/examples/mlperf/dataloader.py +++ b/tinygrad_repo/examples/mlperf/dataloader.py @@ -71,7 +71,7 @@ def loader_process(q_in, q_out, X:Tensor, seed): #storage_tensor._copyin(img_tensor.numpy()) # faster - X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes() + X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes() # ideal #X[idx].assign(img.tobytes()) # NOTE: this is slow! 
@@ -262,8 +262,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens x = random_brightness_augmentation(x) x = gaussian_noise(x) - X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes() - Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes() + X[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes() + Y[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes() queue_out.put(idx) queue_out.put(None) @@ -377,12 +377,12 @@ def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue clipped_match_idxs = np.clip(match_idxs, 0, None) clipped_boxes, clipped_labels = tgt["boxes"][clipped_match_idxs], tgt["labels"][clipped_match_idxs] - boxes[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes() - labels[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes() - matches[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes() - anchors[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes() + boxes[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_boxes.tobytes() + labels[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = clipped_labels.tobytes() + matches[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = match_idxs.tobytes() + anchors[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = anchor.tobytes() - imgs[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes() + imgs[idx].contiguous().realize().uop.base.realized.as_buffer(force_zero_copy=True)[:] = 
img.tobytes() queue_out.put(idx) queue_out.put(None) diff --git a/tinygrad_repo/examples/mlperf/model_eval.py b/tinygrad_repo/examples/mlperf/model_eval.py index 35ad33e..fa3ca9d 100644 --- a/tinygrad_repo/examples/mlperf/model_eval.py +++ b/tinygrad_repo/examples/mlperf/model_eval.py @@ -9,7 +9,6 @@ from extra.bench_log import BenchEvent, WallTimeEvent def tlog(x): print(f"{x:25s} @ {time.perf_counter()-start:5.2f}s") def eval_resnet(): - Tensor.no_grad = True with WallTimeEvent(BenchEvent.FULL): # Resnet50-v1.5 from extra.models.resnet import ResNet50 @@ -245,7 +244,6 @@ def eval_mrcnn(): if __name__ == "__main__": # inference only Tensor.training = False - Tensor.no_grad = True models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",") for m in models: diff --git a/tinygrad_repo/examples/mlperf/model_spec.py b/tinygrad_repo/examples/mlperf/model_spec.py index c22bfd9..1c4411c 100644 --- a/tinygrad_repo/examples/mlperf/model_spec.py +++ b/tinygrad_repo/examples/mlperf/model_spec.py @@ -60,7 +60,6 @@ def spec_mrcnn(): if __name__ == "__main__": # inference only for now Tensor.training = False - Tensor.no_grad = True for m in getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(","): nm = f"spec_{m}" diff --git a/tinygrad_repo/examples/mlperf/model_train.py b/tinygrad_repo/examples/mlperf/model_train.py index 17859b7..8938840 100644 --- a/tinygrad_repo/examples/mlperf/model_train.py +++ b/tinygrad_repo/examples/mlperf/model_train.py @@ -608,7 +608,7 @@ def train_retinanet(): if getenv("RESET_STEP", 1): _train_step.reset() - with Tensor.train(mode=False), Tensor.test(): + with Tensor.train(mode=False): if not RUNMLPERF: i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True)) else: @@ -791,7 +791,6 @@ def train_unet3d(): return loss.realize() @Tensor.train(mode=False) - @Tensor.test() def eval_step(model, x, y): y_hat, y = sliding_window_inference(model, x, y, gpus=GPUS) y_hat, y = Tensor(y_hat), Tensor(y, requires_grad=False) diff --git 
a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json index 174b064..1e0f789 100644 --- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_8xMI300X.json @@ -5,7 +5,7 @@ "system_name": "tinybox 8xMI300X", "number_of_nodes": "1", "host_processors_per_node": "2", - "host_processor_model_name": "AMD EPYC 9354 32-Core Processor", + "host_processor_model_name": "AMD EPYC 9354", "host_processor_core_count": "32", "host_processor_vcpu_count": "64", "host_processor_frequency": "", @@ -18,7 +18,7 @@ "host_networking_topology": "", "host_memory_configuration": "24x 96GB DDR5", "accelerators_per_node": "8", - "accelerator_model_name": "AMD Instinct MI300X", + "accelerator_model_name": "AMD Instinct MI300X 192GB HBM3", "accelerator_host_interconnect": "PCIe 5.0 x16", "accelerator_frequency": "", "accelerator_on-chip_memories": "", @@ -30,10 +30,9 @@ "hw_notes": "", "framework": "tinygrad, branch mlperf_training_v5.0", "other_software_stack": { - "python": "3.10.16", - "ROCm": "3.0.0+94441cb" + "python": "3.10.16", + "ROCm": "3.0.0+94441cb" }, "operating_system": "Ubuntu 24.04.1 LTS", "sw_notes": "" - } - \ No newline at end of file + } \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json index eca528f..24cbce1 100644 --- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_green.json @@ -5,7 +5,7 @@ "system_name": "tinybox green", "number_of_nodes": "1", "host_processors_per_node": "1", - "host_processor_model_name": "AMD 
EPYC 7532 32-Core Processor", + "host_processor_model_name": "AMD EPYC 7532", "host_processor_core_count": "32", "host_processor_vcpu_count": "64", "host_processor_frequency": "", @@ -35,4 +35,4 @@ }, "operating_system": "Ubuntu 22.04.4", "sw_notes": "" -} +} \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json index 8031c6c..58b6efe 100644 --- a/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.0/tinycorp/systems/tinybox_red.json @@ -5,7 +5,7 @@ "system_name": "tinybox red", "number_of_nodes": "1", "host_processors_per_node": "1", - "host_processor_model_name": "AMD EPYC 7532 32-Core Processor", + "host_processor_model_name": "AMD EPYC 7532", "host_processor_core_count": "32", "host_processor_vcpu_count": "64", "host_processor_frequency": "", @@ -34,4 +34,4 @@ }, "operating_system": "Ubuntu 22.04.4", "sw_notes": "" -} +} \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh new file mode 100644 index 0000000..35080c3 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128 + +export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +# export BEAM_LOG_SURPASS_MAX=1 +# export BASEDIR="/raid/datasets/wiki" + +export RESET_STEP=1 +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md new file mode 100644 index 0000000..844b90f --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+ +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh new file mode 100644 index 0000000..dff326f --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh new file mode 100644 index 0000000..ee43e95 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 + +# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 +export TRAIN_STEPS=3900 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh new file mode 100644 index 0000000..ac81482 --- /dev/null +++ 
b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_8xMI300X" +export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 + +# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54 +export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 +export TRAIN_STEPS=3900 + +export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log" + +# init # TODO: without DEBUG=2 it hangs +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md new file mode 100644 index 0000000..844b90f --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. 
+``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. + +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh 
new file mode 100644 index 0000000..1205c21 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BEAM_LOG_SURPASS_MAX=1 +export BASEDIR="/raid/datasets/wiki" + +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh new file mode 100644 index 0000000..f71688a --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
NV=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh new file mode 100644 index 0000000..52d85ee --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." 
NV=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md new file mode 100644 index 0000000..844b90f --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md @@ -0,0 +1,69 @@ +# 1. Problem + +This problem uses BERT for NLP. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` +Also install gdown (for dataset), numpy, tqdm and tensorflow. +``` +pip install gdown numpy tqdm tensorflow +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. Directions + +## Steps to download and verify data + +### 1. Download raw data + +``` +BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py +``` + +### 2. 
Preprocess train and validation data + +Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. + +#### Training: +``` +BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all +``` + +Generating a specific topic (Between 0 and 499) +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42 +``` + +#### Validation: +``` +BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval +``` +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +``` + +### tinybox_red + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +``` +### tinybox_8xMI300X + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +``` \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh new file mode 100644 index 0000000..f99bf30 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BEAM_LOG_SURPASS_MAX=1 +export BASEDIR="/raid/datasets/wiki" + +export RESET_STEP=1 +export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh new file mode 100644 index 0000000..7f577c9 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="bert" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +export WANDB=1 PARALLEL=0 + +RUNMLPERF=1 python3 examples/mlperf/model_train.py \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh new file mode 100644 index 0000000..e3667e6 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." 
AMD=1 +export MODEL="bert" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/wiki" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="bert_red_${DATETIME}_${SEED}.log" + +export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang? + +# init +sleep 5 && sudo rmmod amdgpu || true +BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +# TODO: AM driver resulted in nan +sudo modprobe amdgpu +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md new file mode 100644 index 0000000..d380cec --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. 
+``` +sudo vi /etc/modprobe.d/amdgpu.conf +cat <<EOF | sudo tee /etc/modprobe.d/amdgpu.conf +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +``` diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh new file mode 100644 index 0000000..2319da3 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." 
NV=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export BENCHMARK=10 DEBUG=2 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh new file mode 100644 index 0000000..ebe927c --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh new file mode 100644 index 0000000..9c71932 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." 
NV=1 +export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=${LOGMLPERF:-1} + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md new file mode 100644 index 0000000..d380cec --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md @@ -0,0 +1,50 @@ +# 1. Problem + +This problem uses the ResNet-50 CNN to do image classification. + +## Requirements + +Install tinygrad and mlperf-logging from master. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +### tinybox_green
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +### tinybox_red +Disable cwsr +This is the default on production tinybox red. +``` +sudo vi /etc/modprobe.d/amdgpu.conf +cat <<EOF | sudo tee /etc/modprobe.d/amdgpu.conf +options amdgpu cwsr_enable=0 +EOF +sudo update-initramfs -u +sudo reboot + +# validate +sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0 +``` + +# 2. 
Directions + +## Steps to download and verify data + +``` +IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py +``` + +## Steps for one time setup + +### tinybox_red +``` +examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh +``` + +## Steps to run benchmark +``` +examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +``` diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh new file mode 100644 index 0000000..7bcbec2 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=10 DEBUG=${DEBUG:-2} + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh new file mode 100644 index 0000000..aad23e4 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="resnet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export EVAL_START_EPOCH=3 EVAL_FREQ=4 + +export WANDB=1 PARALLEL=0 + +python3 examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh new file mode 100644 index 0000000..7a93d43 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." AMD=1 +export MODEL="resnet" +export SUBMISSION_PLATFORM="tinybox_red" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192 + +export RESET_STEP=0 + +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +# pip install -e ".[mlperf]" +export LOGMLPERF=${LOGMLPERF:-1} + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="resnet_red_${DATETIME}_${SEED}.log" + +# init +sleep 5 && sudo rmmod amdgpu || true +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh new file mode 100644 index 0000000..a980616 --- /dev/null +++ 
b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +rocm-smi --setprofile compute +rocm-smi --setmclk 3 +rocm-smi --setperflevel high + +# power cap to 350W +echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md new file mode 100644 index 0000000..ce1ac9b --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md @@ -0,0 +1,38 @@ +# 1. Problem + +This problem uses RetinaNet for SSD. + +## Requirements + +Install tinygrad and mlperf-logging (uncomment mlperf from setup.py) from branch mlperf_training_v5.0. +``` +git clone https://github.com/tinygrad/tinygrad.git +python3 -m pip install -e ".[mlperf]" +``` + +Also install the following dependencies: +``` +pip install tqdm numpy pycocotools boto3 pandas torch torchvision +``` + +### tinybox_green +Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md) +This is the default on production tinybox green. + +# 2. 
Directions + +## Steps to download data + +Run the following: +``` +BASEDIR=/raid/datasets/openimages python3 extra/datasets/openimages.py +``` + +## Running + +### tinybox_green + +#### Steps to run benchmark +``` +examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh +``` diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh new file mode 100644 index 0000000..6e25bb9 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_beam.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." NV=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=5 DEBUG=2 + +python examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh new file mode 100644 index 0000000..7a3ee0d --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
NV=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export WANDB=1 PARALLEL=0 +export RUNMLPERF=1 + +python examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh new file mode 100644 index 0000000..74cdc87 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_green/run_and_time.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e # Exit on any error +set -o pipefail # Make pipeline fail if any command fails + +export PYTHONPATH="." NV=1 +export MODEL="retinanet" +export SUBMISSION_PLATFORM="tinybox_green" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 + +export TRAIN_BEAM=2 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 +export IGNORE_JIT_FIRST_BEAM=1 +export BASEDIR="/raid/datasets/openimages" + +# pip install -e ".[mlperf]" +export LOGMLPERF=1 + +export SEED=$RANDOM +DATETIME=$(date "+%m%d%H%M") +LOGFILE="retinanet_green_${DATETIME}_${SEED}.log" + +# init +BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE + +# run +PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh new file mode 100644 index 0000000..97aa515 --- /dev/null +++ 
b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export PYTHONPATH="." AMD=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export BENCHMARK=5 DEBUG=2 + +python examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh new file mode 100644 index 0000000..5fb4d10 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +export PYTHONPATH="." 
AMD=1 +export MODEL="retinanet" +export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96 +export BASEDIR="/raid/datasets/openimages" + +# export RESET_STEP=0 + +export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 + +export WANDB=1 PARALLEL=0 +export RUNMLPERF=1 + +python examples/mlperf/model_train.py diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_8xMI300X.json b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_8xMI300X.json new file mode 100644 index 0000000..1e0f789 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_8xMI300X.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox 8xMI300X", + "number_of_nodes": "1", + "host_processors_per_node": "2", + "host_processor_model_name": "AMD EPYC 9354", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "2304GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "3x 4TB raid array", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "24x 96GB DDR5", + "accelerators_per_node": "8", + "accelerator_model_name": "AMD Instinct MI300X 192GB HBM3", + "accelerator_host_interconnect": "PCIe 5.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "HBM3", + "accelerator_memory_capacity": "192GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.16", + "ROCm": "3.0.0+94441cb" + }, + "operating_system": "Ubuntu 24.04.1 LTS", + 
"sw_notes": "" + } \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_green.json b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_green.json new file mode 100644 index 0000000..24cbce1 --- /dev/null +++ b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_green.json @@ -0,0 +1,38 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox green", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "NVIDIA GeForce RTX 4090", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6X", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.12", + "CUDA": "12.4" + }, + "operating_system": "Ubuntu 22.04.4", + "sw_notes": "" +} \ No newline at end of file diff --git a/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_red.json b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_red.json new file mode 100644 index 0000000..58b6efe --- /dev/null +++ 
b/tinygrad_repo/examples/mlperf/training_submission_v5.1/tinycorp/systems/tinybox_red.json @@ -0,0 +1,37 @@ +{ + "submitter": "tinycorp", + "division": "closed", + "status": "Available on-premise", + "system_name": "tinybox red", + "number_of_nodes": "1", + "host_processors_per_node": "1", + "host_processor_model_name": "AMD EPYC 7532", + "host_processor_core_count": "32", + "host_processor_vcpu_count": "64", + "host_processor_frequency": "", + "host_processor_caches": "", + "host_processor_interconnect": "", + "host_memory_capacity": "128GB", + "host_storage_type": "NVMe SSD", + "host_storage_capacity": "4 TB raid array + 1 TB boot", + "host_networking": "", + "host_networking_topology": "", + "host_memory_configuration": "8x 16GB DDR4", + "accelerators_per_node": "6", + "accelerator_model_name": "AMD Radeon RX 7900 XTX", + "accelerator_host_interconnect": "PCIe 4.0 x16", + "accelerator_frequency": "", + "accelerator_on-chip_memories": "", + "accelerator_memory_configuration": "GDDR6", + "accelerator_memory_capacity": "24GB", + "accelerator_interconnect": "", + "accelerator_interconnect_topology": "", + "cooling": "air", + "hw_notes": "", + "framework": "tinygrad, branch mlperf_training_v5.0", + "other_software_stack": { + "python": "3.10.12" + }, + "operating_system": "Ubuntu 22.04.4", + "sw_notes": "" +} \ No newline at end of file diff --git a/tinygrad_repo/examples/openpilot/compile3.py b/tinygrad_repo/examples/openpilot/compile3.py index 476d407..9f284ed 100644 --- a/tinygrad_repo/examples/openpilot/compile3.py +++ b/tinygrad_repo/examples/openpilot/compile3.py @@ -12,13 +12,13 @@ from tinygrad.engine.realize import CompiledRunner import onnx from onnx.helper import tensor_dtype_to_np_dtype -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx" OUTPUT = sys.argv[2] 
if len(sys.argv) > 2 else "/tmp/openpilot.pkl" def compile(onnx_file): - onnx_model = onnx.load(onnx_file) + onnx_model = onnx_load(onnx_file) run_onnx = OnnxRunner(onnx_model) print("loaded model") diff --git a/tinygrad_repo/examples/openpilot/compile4.py b/tinygrad_repo/examples/openpilot/compile4.py index a2bac69..db8fc01 100644 --- a/tinygrad_repo/examples/openpilot/compile4.py +++ b/tinygrad_repo/examples/openpilot/compile4.py @@ -1,6 +1,6 @@ import sys, onnx from tinygrad import Tensor, fetch, GlobalCounters -from tinygrad.uop import UOp +from tinygrad.uop.ops import UOp from tinygrad.frontend.onnx import OnnxRunner from tinygrad.engine.grouper import get_kernelize_map from tinygrad.engine.schedule import create_schedule_with_vars @@ -19,8 +19,8 @@ if __name__ == "__main__": inputs = run_onnx.get_empty_input_data("npy") out: Tensor = next(iter(run_onnx({k:v.to(None) for k,v in inputs.items()}).values())).to('cpu') - root = out.lazydata - targets = [x.lazydata for x in inputs.values()] + root = out.uop + targets = [x.uop for x in inputs.values()] print(targets) # TODO: abstract this from gradient? 
@@ -37,12 +37,12 @@ if __name__ == "__main__": independent = UOp.sink(*independent_set.keys()) kernelized = get_kernelize_map(independent) independent = independent.substitute(kernelized) - schedule, var_vals, becomes_map = create_schedule_with_vars(independent) + schedule, var_vals = create_schedule_with_vars(independent) run_schedule(schedule) print("**** real ****") GlobalCounters.reset() - out.lazydata = root.substitute(kernelized).substitute(becomes_map) + out.uop = root.substitute(kernelized) out.kernelize() # realize diff --git a/tinygrad_repo/examples/qwq.py b/tinygrad_repo/examples/qwq.py index baa09c5..fad8769 100644 --- a/tinygrad_repo/examples/qwq.py +++ b/tinygrad_repo/examples/qwq.py @@ -52,8 +52,6 @@ def load_model(model_path:Path, model_params:Dict[str, Union[int, float]]) -> Tr if __name__ == "__main__": - Tensor.no_grad = True - parser = argparse.ArgumentParser(description="Run QwQ in tinygrad", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--size", choices=["32B"], default="32B", help="Model size") parser.add_argument("--count", type=int, default=30, help="Max number of tokens to generate") @@ -68,7 +66,7 @@ if __name__ == "__main__": model_path = Path(args.weights) if args.weights else download_weights(model_info["total_num_weights"]) transformer = load_model(model_path, model_info["model_params"]) tokenizer = AutoTokenizer.from_pretrained(model_info["tokenizer"]) - param_bytes = sum(x.lazydata.size * x.dtype.itemsize for x in get_parameters(transformer)) + param_bytes = sum(x.uop.size * x.dtype.itemsize for x in get_parameters(transformer)) outputted = args.prompt start_pos, toks = 0, tokenizer(outputted)["input_ids"] diff --git a/tinygrad_repo/examples/sdv2.py b/tinygrad_repo/examples/sdv2.py index 89af31a..29b1abb 100644 --- a/tinygrad_repo/examples/sdv2.py +++ b/tinygrad_repo/examples/sdv2.py @@ -107,7 +107,6 @@ if __name__ == "__main__": assert args.width % F == 0, f"img_width must be multiple of {F}, got 
{args.width}" assert args.height % F == 0, f"img_height must be multiple of {F}, got {args.height}" - Tensor.no_grad = True if args.seed is not None: Tensor.manual_seed(args.seed) diff --git a/tinygrad_repo/examples/sdxl.py b/tinygrad_repo/examples/sdxl.py index 0b7e13c..d449eb5 100644 --- a/tinygrad_repo/examples/sdxl.py +++ b/tinygrad_repo/examples/sdxl.py @@ -376,23 +376,24 @@ if __name__ == "__main__": parser.add_argument('--weights', type=str, help="Custom path to weights") parser.add_argument('--timing', action='store_true', help="Print timing per step") parser.add_argument('--noshow', action='store_true', help="Don't show the image") + parser.add_argument('--fakeweights', action='store_true', help="Load fake weights") args = parser.parse_args() - Tensor.no_grad = True if args.seed is not None: Tensor.manual_seed(args.seed) model = SDXL(configs["SDXL_Base"]) - default_weight_url = 'https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors' - weights = args.weights if args.weights else fetch(default_weight_url, 'sd_xl_base_1.0.safetensors') - loaded_weights = load_state_dict(model, safe_load(weights), strict=False, verbose=False, realize=False) + if not args.fakeweights: + default_weight_url = 'https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors' + weights = args.weights if args.weights else fetch(default_weight_url, 'sd_xl_base_1.0.safetensors') + loaded_weights = load_state_dict(model, safe_load(weights), strict=False, verbose=False, realize=False) - start_mem_used = GlobalCounters.mem_used - with Timing("loaded weights in ", lambda et_ns: f", {(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"): - with WallTimeEvent(BenchEvent.LOAD_WEIGHTS): - Tensor.realize(*loaded_weights) - del loaded_weights + start_mem_used = GlobalCounters.mem_used + with Timing("loaded weights in ", lambda et_ns: f", 
{(B:=(GlobalCounters.mem_used-start_mem_used))/1e9:.2f} GB loaded at {B/et_ns:.2f} GB/s"): + with WallTimeEvent(BenchEvent.LOAD_WEIGHTS): + Tensor.realize(*loaded_weights) + del loaded_weights N = 1 C = 4 diff --git a/tinygrad_repo/examples/sdxl_seed0.png b/tinygrad_repo/examples/sdxl_seed0.png index 26569f6..386f860 100644 Binary files a/tinygrad_repo/examples/sdxl_seed0.png and b/tinygrad_repo/examples/sdxl_seed0.png differ diff --git a/tinygrad_repo/examples/so_vits_svc.py b/tinygrad_repo/examples/so_vits_svc.py index 95e90fa..41b6d39 100644 --- a/tinygrad_repo/examples/so_vits_svc.py +++ b/tinygrad_repo/examples/so_vits_svc.py @@ -587,7 +587,7 @@ if __name__=="__main__": vits_model = args.model encoder_location, vits_location = ENCODER_MODELS[ENCODER_MODEL], VITS_MODELS[vits_model] - Tensor.no_grad, Tensor.training = True, False + Tensor.training = False # Get Synthesizer and ContentVec net_g, hps = Synthesizer.load_from_pretrained(vits_location[0], vits_location[2], vits_location[1], vits_location[3]) Encoder = get_encoder(hps.model.ssl_dim) diff --git a/tinygrad_repo/examples/stable_diffusion.py b/tinygrad_repo/examples/stable_diffusion.py index e47d6bf..44dca39 100644 --- a/tinygrad_repo/examples/stable_diffusion.py +++ b/tinygrad_repo/examples/stable_diffusion.py @@ -229,7 +229,6 @@ if __name__ == "__main__": parser.add_argument('--guidance', type=float, default=7.5, help="Prompt strength") args = parser.parse_args() - Tensor.no_grad = True model = StableDiffusion() # load in weights diff --git a/tinygrad_repo/examples/stunning_mnist.py b/tinygrad_repo/examples/stunning_mnist.py index 7314486..66c5aa8 100644 --- a/tinygrad_repo/examples/stunning_mnist.py +++ b/tinygrad_repo/examples/stunning_mnist.py @@ -45,8 +45,7 @@ if __name__ == "__main__": print("*** scheduled training") # evaluate the model - with Tensor.test(): - test_acc = ((model(X_test).argmax(axis=1) == Y_test).mean()*100) + test_acc = ((model(X_test).argmax(axis=1) == Y_test).mean()*100) 
print("*** scheduled eval") # NOTE: there's no kernels run in the scheduling phase diff --git a/tinygrad_repo/examples/tinychat/tinychat-browser/compile.py b/tinygrad_repo/examples/tinychat/tinychat-browser/compile.py index 8b898ec..d1a1e64 100644 --- a/tinygrad_repo/examples/tinychat/tinychat-browser/compile.py +++ b/tinygrad_repo/examples/tinychat/tinychat-browser/compile.py @@ -13,8 +13,8 @@ def prepare_browser_chunks(model): chunk_size = 16 * 1024 * 1024 # small chunks based on iphone browser constraints metadata = {} # We won't export cache_kv bytes (because we start inference on client at start_pos=0), but we will tell the client how big cache_kv needs to be - t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k] - empty_t_infos = [(v.lazydata.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k] + t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" not in k] + empty_t_infos = [(v.uop.base.realized.nbytes, k, v.dtype) for k,v in state_dict.items() if "cache_kv" in k] split_t_infos = [] for size, name, dtype in t_infos: @@ -48,7 +48,7 @@ def prepare_browser_chunks(model): weight_metadata = metadata.get(name, default) weight_metadata["parts"][part_num] = {"file": i, "file_start_pos": cursor, "size": size} metadata[name] = weight_metadata - data = bytes(state_dict[name].lazydata.base.realized.as_buffer()) + data = bytes(state_dict[name].uop.base.realized.as_buffer()) data = data if not offsets else data[offsets[0]:offsets[1]] writer.write(data) cursor += size @@ -109,7 +109,6 @@ if __name__=="__main__": tokenizer = Tokenizer(str(tokenizer_path)) model_path = fetch("https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-f16.gguf", "Llama-3.2-1B-Instruct-f16.gguf", subdir="llama3-1b-instruct") - Tensor.no_grad = True max_context=1024 tok = 128000 TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P = 
0.95, 0, 0.0, 0.0, 0.0 diff --git a/tinygrad_repo/examples/vits.py b/tinygrad_repo/examples/vits.py index 3a02ece..b315a52 100644 --- a/tinygrad_repo/examples/vits.py +++ b/tinygrad_repo/examples/vits.py @@ -651,7 +651,7 @@ class TextMapper: # Based on https://github.com/keithito/tacotron VITS_PATH = Path(__file__).parents[1] / "weights/VITS/" MODELS = { # config_url, weights_url "ljs": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/ljs_base.json", "https://drive.google.com/uc?export=download&id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT&confirm=t"), - "vctk": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/vctk_base.json", "https://drive.google.com/uc?export=download&id=11aHOlhnxzjpdWDpsz1vFDCzbeEfoIxru&confirm=t"), + "vctk": ("https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vctk_base.json", "https://huggingface.co/csukuangfj/vits-vctk/resolve/main/pretrained_vctk.pth"), "mmts-tts": ("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/config.json", "https://huggingface.co/facebook/mms-tts/resolve/main/full_models/eng/G_100000.pth"), "uma_trilingual": ("https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/raw/main/configs/uma_trilingual.json", "https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth"), "cjks": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/model.pth"), @@ -707,7 +707,6 @@ if __name__ == '__main__': text_mapper = TextMapper(apply_cleaners=True, symbols=symbols) # Load the model. 
- Tensor.no_grad = True if args.seed is not None: Tensor.manual_seed(args.seed) np.random.seed(args.seed) diff --git a/tinygrad_repo/examples/webgpu/stable_diffusion/compile.py b/tinygrad_repo/examples/webgpu/stable_diffusion/compile.py index a26be7c..6f47a5b 100644 --- a/tinygrad_repo/examples/webgpu/stable_diffusion/compile.py +++ b/tinygrad_repo/examples/webgpu/stable_diffusion/compile.py @@ -82,7 +82,6 @@ if __name__ == "__main__": args = parser.parse_args() Device.DEFAULT = "WEBGPU" - Tensor.no_grad = True model = StableDiffusion() # load in weights @@ -115,7 +114,7 @@ if __name__ == "__main__": run, special_names = jit_model(step, *step.input) functions, statements, bufs, _ = compile_net(run, special_names) state = get_state_dict(model) - weights = {id(x.lazydata.base.realized): name for name, x in state.items()} + weights = {id(x.uop.base.realized): name for name, x in state.items()} kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()]) kernel_names = ', '.join([name for (name, _, _, _) in statements]) input_names = [name for _,name in special_names.items() if "input" in name] diff --git a/tinygrad_repo/examples/whisper.py b/tinygrad_repo/examples/whisper.py index b44f764..dc2832a 100644 --- a/tinygrad_repo/examples/whisper.py +++ b/tinygrad_repo/examples/whisper.py @@ -94,7 +94,7 @@ class AudioEncoder: class TextDecoder: def __init__(self, n_vocab, n_text_ctx, n_text_state, n_text_head, n_text_layer, **_): self.max_tokens_to_sample = n_text_ctx // 2 - self.max_self_attn_cache_len = self.max_tokens_to_sample * 2 + 5 # roughly prompt + start toks + max_tokens_to_sample + self.max_self_attn_cache_len = n_text_ctx self.token_embedding = nn.Embedding(n_vocab, n_text_state) self.positional_embedding = Tensor.empty(n_text_ctx, n_text_state) @@ -104,7 +104,7 @@ class TextDecoder: self.getjitted = collections.defaultdict(lambda: TinyJit(self.forward)) def __call__(self, x: Tensor, pos: int, encoded_audio: 
Tensor): - pos = Variable("self_attn_cache_len", 1, self.max_self_attn_cache_len).bind(pos) if pos else 0 + pos = Variable("self_attn_cache_len", 1, self.max_self_attn_cache_len-1).bind(pos) if pos else 0 return self.getjitted[x.shape](x, pos, encoded_audio) def forward(self, x:Tensor, pos:Union[Variable, Literal[0]], encoded_audio:Tensor): diff --git a/tinygrad_repo/examples/yolov8-onnx.py b/tinygrad_repo/examples/yolov8-onnx.py index 9d14024..5f76ab7 100644 --- a/tinygrad_repo/examples/yolov8-onnx.py +++ b/tinygrad_repo/examples/yolov8-onnx.py @@ -1,9 +1,8 @@ #!/usr/bin/env python3 import os from ultralytics import YOLO -import onnx from pathlib import Path -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from extra.onnx_helpers import get_example_inputs from tinygrad.tensor import Tensor @@ -11,6 +10,6 @@ os.chdir("/tmp") if not Path("yolov8n-seg.onnx").is_file(): model = YOLO("yolov8n-seg.pt") model.export(format="onnx", imgsz=[480,640]) -onnx_model = onnx.load(open("yolov8n-seg.onnx", "rb")) +onnx_model = onnx_load(open("yolov8n-seg.onnx", "rb")) run_onnx = OnnxRunner(onnx_model) run_onnx(get_example_inputs(run_onnx.graph_inputs), debug=True) diff --git a/tinygrad_repo/extra/accel/MAPPING b/tinygrad_repo/extra/accel/MAPPING deleted file mode 100644 index 5e07826..0000000 --- a/tinygrad_repo/extra/accel/MAPPING +++ /dev/null @@ -1,52 +0,0 @@ -We have to figure out how to make the tinygrad ops match to hw. -Generic folded reduce may not work. - - -GPUs: - - -AMD: - -RDNA2: https://developer.amd.com/wp-content/resources/RDNA2_Shader_ISA_November2020.pdf - -We have RX6900XT with 80 CU, 40 WGP, and 1 "processor" -@ 1.825 GHz, there's 18,688 FP32 GFLOPS of compute. 10240 FLOPS/cycle, 128 per CU (32 FMAs per vALU, 2 per compute unit) - -286 GFLOP for ENET=2 BS=64. At theoretical max, (286/18688)*1000 = 15.3 ms. -We observe about 10x factor off with pytorch. 
- -We will focus on speed for AMD, since we have complete docs for that GPU. -Each "processor" has an "ultra threaded dispatch processor" - -Each SIMD unit has 256 vector registers (or 1024?), and operates on 32 at once. -Ahh, I think there's 1024 total, but only 256 per wavefront - - -M1: - -On M1 GPU, theoretical is 2.275 TFLOPS. https://www.notebookcheck.net/Apple-M1-GPU-Benchmarks-and-Specs.503610.0.html - -We observe 2000ms for BS=8 (37 GFLOP). 37/2275 = 11.9 ms. tinygrad is over a factor of 100x off (similar on AMD GPU) - -NOTE: the timer in the M1 OpenCL doesn't seem to be anywhere close to wall time. - - -Adreno: - -TBD, no comma three here. Image > Buffer because the L1 cache is used. Would UBWC help on weights? - -We have a good bit of work on this in hyperthneed. Let's get the disassembler out and make this fast. - - - -TPUs: - -These use really big systolic arrays and have a lot less flexibility. - -IIRC, their vector math unit is similar to the GPU. - - - - - - diff --git a/tinygrad_repo/extra/accel/README b/tinygrad_repo/extra/accel/README deleted file mode 100644 index ee32b24..0000000 --- a/tinygrad_repo/extra/accel/README +++ /dev/null @@ -1,5 +0,0 @@ -This is where we scope out adding accelerators to tinygrad - -ane -- Apple Neural Engine, in the M1 + newer iPhones -tpu -- Google's TPU, available for rent in Google Cloud - diff --git a/tinygrad_repo/extra/accel/ane/1_build/.gitignore b/tinygrad_repo/extra/accel/ane/1_build/.gitignore deleted file mode 100644 index f5bdd21..0000000 --- a/tinygrad_repo/extra/accel/ane/1_build/.gitignore +++ /dev/null @@ -1 +0,0 @@ -run diff --git a/tinygrad_repo/extra/accel/ane/1_build/coreml_ane.py b/tinygrad_repo/extra/accel/ane/1_build/coreml_ane.py deleted file mode 100755 index cefd143..0000000 --- a/tinygrad_repo/extra/accel/ane/1_build/coreml_ane.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -import coremltools as ct -from coremltools.models.neural_network import datatypes, 
NeuralNetworkBuilder - -# KxK GEMM with bias -K = 64 - -input_features = [('image', datatypes.Array(K))] -input_features2 = [('image2', datatypes.Array(K))] -output_features = [('probs', datatypes.Array(K))] - -weights = np.zeros((K, K)) + 3 -bias = np.ones(K) - -builder = NeuralNetworkBuilder(input_features+input_features2, output_features) - -#builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=K, output_channels=K, has_bias=False, input_name='image', output_name='med') -#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs') -builder.add_elementwise(name='element', input_names=['image', 'image2'], output_name='probs', mode='ADD') -#builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(K,)) -#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs') - -# compile the spec -mlmodel = ct.models.MLModel(builder.spec) - -# trigger the ANE! -out = mlmodel.predict({"image": np.zeros(K, dtype=np.float32)+1, "image2": np.zeros(K, dtype=np.float32)+2}) -print(out) -mlmodel.save('test.mlmodel') diff --git a/tinygrad_repo/extra/accel/ane/1_build/run.swift b/tinygrad_repo/extra/accel/ane/1_build/run.swift deleted file mode 100644 index 0aad7e7..0000000 --- a/tinygrad_repo/extra/accel/ane/1_build/run.swift +++ /dev/null @@ -1,36 +0,0 @@ -import CoreML - -// ANE? -let config = MLModelConfiguration() -config.computeUnits = .all - -// CPU? -let opts = MLPredictionOptions() -opts.usesCPUOnly = false - -class MNISTInput : MLFeatureProvider { - var featureNames: Set { - get { - return ["image", "image2"] - } - } - func featureValue(for featureName: String) -> MLFeatureValue? { - if (featureName == "image") { - let tokenIDMultiArray = try? 
MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32) - tokenIDMultiArray?[0] = NSNumber(value: 1337) - return MLFeatureValue(multiArray: tokenIDMultiArray!) - } - if (featureName == "image2") { - let tokenIDMultiArray = try? MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32) - tokenIDMultiArray?[0] = NSNumber(value: 1337) - return MLFeatureValue(multiArray: tokenIDMultiArray!) - } - return nil - } -} - -let compiledUrl = try MLModel.compileModel(at: URL(string: "test.mlmodel")!) -let model = try MLModel(contentsOf: compiledUrl, configuration: config) -let out = try model.prediction(from: MNISTInput(), options: opts) - -print(out.featureValue(for: "probs") as Any) diff --git a/tinygrad_repo/extra/accel/ane/1_build/test.mlmodel b/tinygrad_repo/extra/accel/ane/1_build/test.mlmodel deleted file mode 100644 index 4dbe43a..0000000 Binary files a/tinygrad_repo/extra/accel/ane/1_build/test.mlmodel and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/.gitignore b/tinygrad_repo/extra/accel/ane/2_compile/.gitignore deleted file mode 100644 index 9f9d150..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.hwx -anecompiler.swap.* -context_switch_log.txt -debug/ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/ane.py b/tinygrad_repo/extra/accel/ane/2_compile/ane.py deleted file mode 120000 index f35d850..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/ane.py +++ /dev/null @@ -1 +0,0 @@ -../lib/ane.py \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/2_compile/aneregs.json b/tinygrad_repo/extra/accel/ane/2_compile/aneregs.json deleted file mode 120000 index 2d09790..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/aneregs.json +++ /dev/null @@ -1 +0,0 @@ -../lib/aneregs.json \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/2_compile/compile.m b/tinygrad_repo/extra/accel/ane/2_compile/compile.m deleted file mode 100644 index 
79e21bd..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/compile.m +++ /dev/null @@ -1,57 +0,0 @@ -#import -#include -#include - -typedef unsigned int ANECStatus; - -int ANECCompile(NSDictionary* param_1, NSDictionary* param_2, - void (^param_3)(ANECStatus status, - NSDictionary* statusDictionary)); - -int main(int argc, char* argv[]) -{ - os_log(OS_LOG_DEFAULT, "start compiler"); - - NSDictionary* iDictionary = @ { - @"NetworkPlistName" : [NSString stringWithCString:argv[1] - encoding:NSUTF8StringEncoding], - @"NetworkPlistPath" : @"./", - }; - NSArray* plistArray = @[ iDictionary ]; - - NSMutableDictionary* optionsDictionary = - [NSMutableDictionary dictionaryWithCapacity:4]; - NSMutableDictionary* flagsDictionary = - [NSMutableDictionary dictionaryWithCapacity:4]; - optionsDictionary[@"InputNetworks"] = plistArray; - - optionsDictionary[@"OutputFilePath"] = @"./"; - - // h11 (or anything?) works here too, and creates different outputs that don't - // run - flagsDictionary[@"TargetArchitecture"] = @"h13"; - - if (argc > 2) { - optionsDictionary[@"OutputFileName"] = @"debug/model.hwx"; - - flagsDictionary[@"CompileANEProgramForDebugging"] = - [NSNumber numberWithBool:YES]; - int debug_mask = 0x7fffffff; - flagsDictionary[@"DebugMask"] = [NSNumber numberWithInt:debug_mask]; - } else { - optionsDictionary[@"OutputFileName"] = @"model.hwx"; - } - - void (^simpleBlock)(ANECStatus status, NSDictionary* statusDictionary) = ^(ANECStatus status, NSDictionary* statusDictionary) { - NSLog(@"status = %d\n", status); - // when status != 0 dump the dictionary - if (status) - NSLog(@"%@", statusDictionary); - }; - - printf("hello\n"); - int ret = ANECCompile(optionsDictionary, flagsDictionary, simpleBlock); - printf("compile: %d\n", ret); - - return ret; -} diff --git a/tinygrad_repo/extra/accel/ane/2_compile/compile.mm b/tinygrad_repo/extra/accel/ane/2_compile/compile.mm deleted file mode 100644 index 2ccdae8..0000000 --- 
a/tinygrad_repo/extra/accel/ane/2_compile/compile.mm +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#import -#include -#include - -extern "C" { - int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3); - std::string _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(int op); - std::string _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(int op); - std::string _ZN19ZinMirCacheHintUtil17CacheHintToStringE15ZinMirCacheHint(int op); - std::string _ZN30ZinMirKernelSizeSplitterEngine16ConvKindToStringENS_8ConvKindE(int op); - - /*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE( - unsigned long param_1, void *param_2,int param_3, std::basic_ostream *param_4); - -void debugregs(int a1, void *dat, int a2) { - _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &std::cout); -}*/ - -} - -int main(int argc, char* argv[]) { - os_log(OS_LOG_DEFAULT, "start compiler"); - - /*for (int i = 0; i < 60; i++) { - std::string tmp = _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(i); - //std::string tmp = _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(i); - printf("%2d: %s\n", i, tmp.c_str()); - }*/ - - CFTypeRef ikeys[2]; - ikeys[0] = CFSTR("NetworkPlistName"); - ikeys[1] = CFSTR("NetworkPlistPath"); - - CFTypeRef ivalues[2]; - ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, argv[1], kCFStringEncodingUTF8); - ivalues[1] = CFSTR("./"); - - CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks); - - CFMutableDictionaryRef optionsDictionary = 
CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - - CFDictionaryAddValue(optionsDictionary, CFSTR("InputNetworks"), array); - CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFilePath"), CFSTR("./")); - //CFDictionaryAddValue(optionsDictionary, CFSTR("OptionsFilePath"), CFSTR("good.options")); - - // h11 (or anything?) works here too, and creates different outputs that don't run - CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13")); - - if (argc > 2) { - CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("debug/model.hwx")); - //CFDictionaryAddValue(flagsDictionary, CFSTR("DebugDetailPrint"), kCFBooleanTrue); - CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue); - int debug_mask = 0x7fffffff; - CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, 3, &debug_mask)); - } else { - CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx")); - } - //CFDictionaryAddValue(flagsDictionary, CFSTR("DisableMergeScaleBias"), kCFBooleanTrue); - //CFDictionaryAddValue(flagsDictionary, CFSTR("Externs"), CFSTR("swag")); - - //CFShow(optionsDictionary); - //CFShow(flagsDictionary); - - printf("hello\n"); - int ret = ANECCompile(optionsDictionary, flagsDictionary, 0); - printf("compile: %d\n", ret); - - - return ret; -} diff --git a/tinygrad_repo/extra/accel/ane/2_compile/compile.sh b/tinygrad_repo/extra/accel/ane/2_compile/compile.sh deleted file mode 100755 index 8df875e..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/compile.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -e -g++ compile.mm -F /System/Library/PrivateFrameworks/ -framework ANECompiler -framework CoreFoundation -rm -f 
model.hwx -./a.out net.plist debug -rm -f context_switch_log.txt -log show --process a.out --last 1m --info --debug - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/dcompile.py b/tinygrad_repo/extra/accel/ane/2_compile/dcompile.py deleted file mode 100755 index 7afdbf5..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/dcompile.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import networkx as nx -import pylab as plt -from networkx.drawing.nx_pydot import read_dot - -ret = os.system("./a.out "+sys.argv[1]+" debug") -assert(ret == 0) - -df = "debug/model.hwx.zinir_graph_after_reg_spill.dot" - -#from graphviz import render -#render('dot', 'png', df) - -#plt = Image(pdot.create_png() -#display(plt) diff --git a/tinygrad_repo/extra/accel/ane/2_compile/debug/README b/tinygrad_repo/extra/accel/ane/2_compile/debug/README deleted file mode 100644 index 6613b95..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/debug/README +++ /dev/null @@ -1 +0,0 @@ -Run compiler with debug in argv[2] to generate these files diff --git a/tinygrad_repo/extra/accel/ane/2_compile/hwx_parse.py b/tinygrad_repo/extra/accel/ane/2_compile/hwx_parse.py deleted file mode 100755 index 88ba4ea..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/hwx_parse.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python3 -import sys -from hexdump import hexdump -from macholib import MachO -from tinygrad.helpers import getenv -def get_macho(fn): - # mod to make the header okay - # MH_CIGAM_64 is good - dat = open(fn, "rb").read() - dat = b"\xcf\xfa\xed\xfe"+dat[4:] - from tempfile import NamedTemporaryFile - with NamedTemporaryFile(delete=False) as f: - f.write(dat) - f.close() - return MachO.MachO(f.name) - -a = get_macho("model.hwx.golden") - -# load commands -for c in a.headers[0].commands: - print("command", c[0], c[1]) - if c[0].cmd == 4: - hexdump(c[2]) - pass - if c[0].cmd == 6: - print("name:", c[2].decode('utf-8')) - if c[0].cmd == 8: - 
print(c[2].decode('utf-8')) - if c[0].cmd == 25: - for section in c[2]: - print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff)) - #print(dir(section)) - if c[1].filesize > 0: - if len(section.section_data) < 0x100: - hexdump(section.section_data) - else: - print("in file, not dumping 0x%x" % len(section.section_data)) - -# this parser is wrong (fixed with 64-bit one) -from macholib import SymbolTable -sym = SymbolTable.SymbolTable(a) - -syms = {} -for l in sym.nlists: - print(l) - if l[0].n_value != 0: - syms[l[1]] = l[0].n_value - -for k,v in syms.items(): - print(k, hex(v)) - - -# **** document what we know *** -from ane import ANE_Struct, ANE -ane = ANE() - -aneb = set() -for typ, num, nam in ANE_Struct: - ltyp = {"u32": 4, "u16": 2, "u8": 1}[typ] - for l in range(num, num+ltyp): - aneb.add(l) - -# we understand these too -for l in range(0x34, 0xF4): - aneb.add(l) - -from termcolor import colored -def compare(x, y): - ss = [] - ln = [] - ln2 = [] - - ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10 - - highlight = False - next_highlight = 0x2b - for i in range(ll+1): - if i == next_highlight: - highlight = True - if i < len(y): - next_highlight += y[i]+8 - else: - next_highlight = None - else: - highlight = False - a = "%02X" % x[i] if i < len(x) else "--", \ - "%02X" % y[i] if i < len(y) else "--" - def fj(x): - ss = [] - for i in range(0, 0x10, 4): - ss.append(' '.join(x[i:i+4])) - return ' '.join(ss) - - if i!=0 and i%0x10 == 0: - ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n") - ln = [] - ln2 = [] - if a[0] != a[1] and a[0] != "--" and a[1] != "--": - ln.append(colored(a[0], 'green')) - ln2.append(colored(a[1], 'red')) - else: - if highlight: - ln.append(colored(a[0], 'yellow')) - ln2.append(colored(a[1], 'yellow')) - else: - if i in aneb: - ln.append(colored(a[0], 'white')) - ln2.append(colored(a[1], 'white')) - else: - ln.append(a[0]) - ln2.append(a[1]) - return ''.join(ss) - 
-import json -aneregs = dict(json.load(open("aneregs.json"))) -g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1]) -f1 = g.headers[0].commands[1][2][0].section_data -f2 = a.headers[0].commands[1][2][0].section_data -for i in range(0, len(f2), 0x300): - print("===== op %d =====" % (i//0x300)) - if len(f1) < 0x300: - c1, c2 = f1, f2[i:i+0x300] - else: - c1, c2 = f1[i:i+0x300], f2[i:i+0x300] - dbg1 = ane.debug(c1, 16) - dbg2 = ane.debug(c2, 16) - if getenv("PRINTALL"): - for k in dbg2: - if k in aneregs: - rr = aneregs[k] if k in aneregs else (-1,-1,-1) - print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k]) - else: - for k in dbg1: - if dbg1[k] != dbg2[k]: - rr = aneregs[k] if k in aneregs else (-1,-1,-1) - print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k]) - - print(compare(c1, c2)) -#open("/tmp/data.section", "wb").write(f2) -#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read())) diff --git a/tinygrad_repo/extra/accel/ane/2_compile/min.weights b/tinygrad_repo/extra/accel/ane/2_compile/min.weights deleted file mode 100644 index 83917d3..0000000 Binary files a/tinygrad_repo/extra/accel/ane/2_compile/min.weights and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/min_uint8.weights b/tinygrad_repo/extra/accel/ane/2_compile/min_uint8.weights deleted file mode 100644 index 7ee0f5c..0000000 Binary files a/tinygrad_repo/extra/accel/ane/2_compile/min_uint8.weights and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/model.espresso.weights b/tinygrad_repo/extra/accel/ane/2_compile/model.espresso.weights deleted file mode 100644 index f68a7ac..0000000 Binary files a/tinygrad_repo/extra/accel/ane/2_compile/model.espresso.weights and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/model.hwx.golden b/tinygrad_repo/extra/accel/ane/2_compile/model.hwx.golden deleted file mode 100644 index a199507..0000000 Binary files 
a/tinygrad_repo/extra/accel/ane/2_compile/model.hwx.golden and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/net.additional.weights b/tinygrad_repo/extra/accel/ane/2_compile/net.additional.weights deleted file mode 100644 index 31188c5..0000000 Binary files a/tinygrad_repo/extra/accel/ane/2_compile/net.additional.weights and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/2_compile/net.plist b/tinygrad_repo/extra/accel/ane/2_compile/net.plist deleted file mode 100644 index a18df35..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/net.plist +++ /dev/null @@ -1,127 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - image - - Outputs - - probs@output - - Units - - probs_tmp_0 - probs - - Weights - - model.espresso.weights - net.additional.weights - - image - - BatchSize - 1 - InputChannels - 3 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 1 - - probs - - Bottom - probs_tmp_0 - Name - probs - OutputType - Float16 - Params - - BiasScaleGroupData - - BiasCount - 2 - BiasIndex - 1 - BiasOffset - 0 - BiasType - Float16 - - - Type - GOC - - probs@output - - Bottom - probs - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - probs_tmp_0 - - Bottom - image - Name - probs_tmp_0 - OutputChannels - 2 - OutputType - Float16 - Params - - KernelGroupReuse - - KernelHeight - 1 - KernelIndex - 0 - KernelMode - Dense - KernelOffset - 192 - KernelType - Float32 - KernelWidth - 1 - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/broadcast.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/broadcast.plist deleted file mode 100644 index 16fc0b8..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/broadcast.plist +++ /dev/null @@ -1,196 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - A - - BatchSize - 7 - 
InputBatchStride - 64 - InputChannels - 1 - InputDepth - 1 - InputDepthStride - 448 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 1 - - A_broadcasted_output - - Bottom - A - Name - A_broadcasted_output - OutputType - Float16 - Params - - BroadcastInfo - - - Dimension - Width - Size - 2 - - - - Type - Broadcast - UnescapedBottom - A - UnescapedName - A_broadcasted_output - - B - - BatchSize - 1 - InputChannels - 1 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 2 - - B_broadcasted_output - - Bottom - B - Name - B_broadcasted_output - OutputType - Float16 - Params - - BroadcastInfo - - - Dimension - Depth - Size - 1 - - - Dimension - Batch - Size - 7 - - - Dimension - Channel - Size - 1 - - - Dimension - Height - Size - 1 - - - - Type - Broadcast - UnescapedBottom - B - UnescapedName - B_broadcasted_output - - Inputs - - B - A - - Outputs - - output@output - - Units - - A_broadcasted_output - B_broadcasted_output - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpy5yeqxdi.mlmodelc/model.espresso.weights - net.additional.weights - - output - - Bottom - - A_broadcasted_output - B_broadcasted_output - - Name - output - OutputType - Float16 - Params - - Scale - 15360 - Type - Min - - Type - ScaledElementWise - UnescapedBottom - - A_broadcasted_output - B_broadcasted_output - - UnescapedName - output - - output@output - - Bottom - output - OutputBatchStride - 64 - OutputDepthStride - 448 - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/concat.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/concat.plist deleted file mode 100644 index ea5cf15..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/concat.plist +++ /dev/null @@ -1,128 +0,0 @@ - - - - - Networks - - net 
- - Version - 1.0.9 - net - - Inputs - - input_1 - input_0 - - Outputs - - output@output - - Units - - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights - net.additional.weights - - input_0 - - BatchSize - 2 - InputBatchStride - 512 - InputChannels - 4 - InputDepth - 4 - InputDepthStride - 1024 - InputHeight - 2 - InputInterleave - 1 - InputPlaneStride - 128 - InputRowStride - 64 - InputType - Float16 - InputWidth - 3 - - input_1 - - BatchSize - 2 - InputBatchStride - 256 - InputChannels - 2 - InputDepth - 4 - InputDepthStride - 512 - InputHeight - 2 - InputInterleave - 1 - InputPlaneStride - 128 - InputRowStride - 64 - InputType - Float16 - InputWidth - 3 - - output - - Bottom - - input_0 - input_1 - - Name - output - OutputChannels - 6 - OutputType - Float16 - Params - - Dimension - Channel - - Type - Concat - UnescapedBottom - - input_0 - input_1 - - UnescapedName - output - - output@output - - Bottom - output - OutputBatchStride - 768 - OutputDepthStride - 1536 - OutputInterleave - 1 - OutputPlaneStride - 128 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/gemm.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/gemm.plist deleted file mode 100644 index dd7b301..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/gemm.plist +++ /dev/null @@ -1,135 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - image - - Outputs - - probs@output - - Units - - probs_tmp_0 - probs - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmph2sg50xi.mlmodelc/model.espresso.weights - net.additional.weights - - image - - BatchSize - 1 - InputChannels - 64 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 1 - - probs - - Bottom - probs_tmp_0 - Name - probs - OutputType - Float16 - Params - - BiasScaleGroupData - - 
BiasCount - 64 - BiasIndex - 1 - BiasOffset - 0 - BiasType - Float16 - - - Type - GOC - UnescapedBottom - probs_tmp_0 - UnescapedName - probs - - probs@output - - Bottom - probs - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - probs_tmp_0 - - Bottom - image - Name - probs_tmp_0 - OutputChannels - 64 - OutputType - Float16 - Params - - KernelGroupReuse - - KernelHeight - 1 - KernelIndex - 0 - KernelMode - Dense - KernelOffset - 384 - KernelType - Float32 - KernelWidth - 1 - Step - - 1 - 1 - - Type - Conv - - Type - Conv - UnescapedBottom - image - UnescapedName - probs_tmp_0 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/goc.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/goc.plist deleted file mode 100644 index 34f5c70..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/goc.plist +++ /dev/null @@ -1,86 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - data - - Outputs - - output@output - - Units - - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpm7rb6ba9.mlmodelc/model.espresso.weights - net.additional.weights - - data - - BatchSize - 1 - InputChannels - 1 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 6 - - output - - Bottom - data - Name - output - OutputType - Float16 - Params - - BiasScalar - 16354 - ScaleScalar - 20544 - - Type - GOC - UnescapedBottom - data - UnescapedName - output - - output@output - - Bottom - output - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/inputview.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/inputview.plist deleted file mode 100644 index 91d431d..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/inputview.plist +++ /dev/null @@ -1,166 +0,0 @@ - - - - - Networks - - net - - 
Version - 1.0.9 - net - - Inputs - - data - - Outputs - - out_2@output - out_1@output - out_0@output - - Units - - out_0 - out_1 - out_2 - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp_c4fweo3.mlmodelc/model.espresso.weights - net.additional.weights - - data - - BatchSize - 1 - InputChannels - 9 - InputHeight - 2 - InputInterleave - 1 - InputPlaneStride - 128 - InputRowStride - 64 - InputType - Float16 - InputWidth - 2 - - out_0 - - Bottom - data - Name - out_0 - OutputType - Float16 - Params - - Dimension - Channel - Offset - 0 - Size - 3 - - Type - InputView - UnescapedBottom - data - UnescapedName - out_0 - - out_0@output - - Bottom - out_0 - OutputInterleave - 1 - OutputPlaneStride - 128 - OutputRowStride - 64 - OutputType - Float16 - - out_1 - - Bottom - data - Name - out_1 - OutputType - Float16 - Params - - Dimension - Channel - Offset - 3 - Size - 3 - - Type - InputView - UnescapedBottom - data - UnescapedName - out_1 - - out_1@output - - Bottom - out_1 - OutputInterleave - 1 - OutputPlaneStride - 128 - OutputRowStride - 64 - OutputType - Float16 - - out_2 - - Bottom - data - Name - out_2 - OutputType - Float16 - Params - - Dimension - Channel - Offset - 6 - Size - 3 - - Type - InputView - UnescapedBottom - data - UnescapedName - out_2 - - out_2@output - - Bottom - out_2 - OutputInterleave - 1 - OutputPlaneStride - 128 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/neuron.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/neuron.plist deleted file mode 100644 index e3314d3..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/neuron.plist +++ /dev/null @@ -1,84 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - data - - Outputs - - output@output - - Units - - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpwvvanb0c.mlmodelc/model.espresso.weights - net.additional.weights - - data - - 
BatchSize - 7 - InputChannels - 7 - InputHeight - 7 - InputInterleave - 1 - InputPlaneStride - 448 - InputRowStride - 64 - InputType - Float16 - InputWidth - 7 - - output - - Bottom - data - Name - output - OutputType - Float16 - Params - - Type - Exp2 - - Type - Neuron - UnescapedBottom - data - UnescapedName - output - - output@output - - Bottom - output - OutputInterleave - 1 - OutputPlaneStride - 448 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/reshape.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/reshape.plist deleted file mode 100644 index 7cda3a5..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/reshape.plist +++ /dev/null @@ -1,92 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - data - - Outputs - - output@output - - Units - - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpcwj7kqrw.mlmodelc/model.espresso.weights - net.additional.weights - - data - - BatchSize - 1 - InputChannels - 1 - InputHeight - 2 - InputInterleave - 1 - InputPlaneStride - 128 - InputRowStride - 64 - InputType - Float16 - InputWidth - 5 - - output - - Bottom - data - Name - output - OutputType - Float16 - Params - - ReshapedBatch - 1 - ReshapedChannel - 10 - ReshapedDepth - 1 - ReshapedHeight - 1 - ReshapedWidth - 1 - - Type - Reshape - UnescapedBottom - data - UnescapedName - output - - output@output - - Bottom - output - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/scaled.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/scaled.plist deleted file mode 100644 index 33eda0d..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/scaled.plist +++ /dev/null @@ -1,140 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - A - - BatchSize - 1 - InputChannels - 1 - InputHeight - 5 - InputInterleave - 1 - 
InputPlaneStride - 320 - InputRowStride - 64 - InputType - Float16 - InputWidth - 7 - - B - - BatchSize - 1 - InputChannels - 1 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 7 - - B_broadcasted_output - - Bottom - B - Name - B_broadcasted_output - OutputType - Float16 - Params - - BroadcastInfo - - - Dimension - Height - Size - 5 - - - - Type - Broadcast - UnescapedBottom - B - UnescapedName - B_broadcasted_output - - Inputs - - B - A - - Outputs - - output@output - - Units - - B_broadcasted_output - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp40ksdbf5.mlmodelc/model.espresso.weights - net.additional.weights - - output - - Bottom - - A - B_broadcasted_output - - Name - output - OutputType - Float16 - Params - - Scale - 15360 - Type - Min - - Type - ScaledElementWise - UnescapedBottom - - A - B_broadcasted_output - - UnescapedName - output - - output@output - - Bottom - output - OutputInterleave - 1 - OutputPlaneStride - 320 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/plists/sum.plist b/tinygrad_repo/extra/accel/ane/2_compile/plists/sum.plist deleted file mode 100644 index cb4d508..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/plists/sum.plist +++ /dev/null @@ -1,112 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - image2 - image - - Outputs - - probs@output - - Units - - probs - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpkp9irqtj.mlmodelc/model.espresso.weights - net.additional.weights - - image - - BatchSize - 1 - InputChannels - 64 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth - 1 - - image2 - - BatchSize - 1 - InputChannels - 64 - InputHeight - 1 - InputInterleave - 1 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - InputWidth 
- 1 - - probs - - Bottom - - image - image2 - - Name - probs - OutputType - Float16 - Params - - Scale - 15360 - Type - Add - - Type - ScaledElementWise - UnescapedBottom - - image - image2 - - UnescapedName - probs - - probs@output - - Bottom - probs - OutputInterleave - 1 - OutputPlaneStride - 64 - OutputRowStride - 64 - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/concat.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/concat.plist deleted file mode 100644 index 89c76ff..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/concat.plist +++ /dev/null @@ -1,78 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - input_1 - input_0 - - Outputs - - output@output - - Units - - output - - Weights - - /private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights - net.additional.weights - - input_0 - - InputChannels - 16384 - InputHeight - 1 - InputWidth - 1 - - InputType - Float16 - - input_1 - - InputChannels - 16 - InputHeight - 1 - InputWidth - 1 - - InputType - Float16 - - output - - Bottom - - input_0 - input_1 - - Name - output - OutputType - Float16 - Type - Concat - - output@output - - Bottom - output - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/conv.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/conv.plist deleted file mode 100644 index eecb9a5..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/conv.plist +++ /dev/null @@ -1,94 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - - Outputs - - probs@output - - - - Weights - - ../twos.weights - - - image - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - 
KernelType - Float16 - - Step - - 1 - 1 - - - Type - Conv - - - Type - Conv - - - probs@output - - Bottom - my_layer - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/convneuron.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/convneuron.plist deleted file mode 100644 index 5e5f9d7..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/convneuron.plist +++ /dev/null @@ -1,130 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - - Outputs - - probs@output - - - - Weights - - ../min.weights - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputInterleave - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float16 - - Step - - 1 - 1 - - Type - Conv - - - Type - Conv - - - my_layer_2 - - Bottom - my_layer - Name - my_layer_2 - OutputType - Float16 - Params - - Type - Sign - - Type - Neuron - - - probs@output - - Bottom - my_layer_2 - - OutputInterleave - 1 - - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/convuint8.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/convuint8.plist deleted file mode 100644 index 2a24af4..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/convuint8.plist +++ /dev/null @@ -1,135 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - - Outputs - - probs@output - - - - Weights - - ../min_uint8.weights - - - image - - - BatchSize - 1 - - InputChannels - 1 - InputHeight - 1 - InputWidth - 3 - - InputDepth - 1 - - InputInterleave - 1 - - InputBatchStride - 256 - InputDepthStride - 256 - InputPlaneStride - 64 - InputRowStride - 64 - InputType - 
UInt8 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - UInt8 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelDepth - 1 - - PadTop - 0 - PadBot - 0 - PadLeft - 0 - PadRight - 0 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - UInt8 - - Step - - 1 - 1 - - Type - Conv - - - Type - Conv - - - probs@output - - Bottom - my_layer - - OutputInterleave - 1 - - OutputBatchStride - 256 - OutputDepthStride - 256 - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconv.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconv.plist deleted file mode 100644 index fd0846d..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconv.plist +++ /dev/null @@ -1,154 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - - Outputs - - probs@output - zalt@output - - - - Weights - - ../min.weights - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_2 - - Bottom - my_layer - - Name - my_layer_2 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - probs@output - - Bottom - my_layer - OutputPlaneStride - 64 - OutputRowStride - 64 - - - zalt@output - - Bottom - my_layer_2 - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvrev.plist 
b/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvrev.plist deleted file mode 100644 index 0ad32fa..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvrev.plist +++ /dev/null @@ -1,154 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - - Outputs - - probs@output - aalt@output - - - - Weights - - ../min.weights - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_2 - - Bottom - my_layer - - Name - my_layer_2 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - probs@output - - Bottom - my_layer - OutputPlaneStride - 64 - OutputRowStride - 64 - - - aalt@output - - Bottom - my_layer_2 - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvsout.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvsout.plist deleted file mode 100644 index 08f23fa..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleconvsout.plist +++ /dev/null @@ -1,143 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - - Outputs - - probs@output - - - - Weights - - ../min.weights - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels 
- 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_2 - - Bottom - my_layer - - Name - my_layer_2 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - probs@output - - Bottom - my_layer_2 - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleneuron.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleneuron.plist deleted file mode 100644 index 2ee0a39..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/doubleneuron.plist +++ /dev/null @@ -1,87 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - - Outputs - - probs@output - - - image - - - BatchSize - 1 - - InputChannels - 1 - InputHeight - 1 - InputWidth - 77 - - InputType - Float16 - - - my_layer - - Bottom - image - Name - my_layer - OutputType - Float16 - Params - - Type - Sigmoid - - Type - Neuron - - - my_layer_2 - - Bottom - my_layer - Name - my_layer_2 - OutputType - Float16 - Params - - Type - Sigmoid - - Type - Neuron - - - probs@output - - Bottom - my_layer_2 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/gemm.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/gemm.plist deleted file mode 100644 index fff1d3e..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/gemm.plist +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - Networks - - net - - Version - 1.0.9 - net - - Inputs - - image - - Outputs - - probs@output - - Units - - probs - - Weights - - /tmp/zero - - - image - - BatchSize - 512 - InputChannels - 512 - InputHeight - 1 - InputWidth - 1 - InputType - Float16 - - probs - - Bottom - image - Name - probs - 
OutputChannels - 512 - OutputType - Float16 - Params - - KernelHeight - 1 - KernelWidth - 1 - KernelType - Float16 - - KernelMode - Dense - KernelOffset - 0 - - Step - - 1 - 1 - - - Type - Conv - - Type - Conv - - probs@output - - Bottom - probs - OutputType - Float16 - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/goc.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/goc.plist deleted file mode 100644 index 0d8078f..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/goc.plist +++ /dev/null @@ -1,79 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - - Outputs - - probs@output - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - Name - my_layer - OutputType - Float16 - Params - - BiasScalar - 16354 - ScaleScalar - 20544 - - Type - GOC - - - probs@output - - Bottom - my_layer - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/neuron.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/neuron.plist deleted file mode 100644 index 48a9df0..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/neuron.plist +++ /dev/null @@ -1,69 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - - Outputs - - probs@output - - - image - - - BatchSize - 1 - - InputChannels - 1 - InputHeight - 1 - InputWidth - 77 - - InputType - Float16 - - - my_layer - - Bottom - image - Name - my_layer - OutputType - Float16 - Params - - Type - Sigmoid - - Type - Neuron - - - probs@output - - Bottom - my_layer - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/quadconv.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/quadconv.plist deleted file mode 100644 index 8564e5f..0000000 --- 
a/tinygrad_repo/extra/accel/ane/2_compile/simple/quadconv.plist +++ /dev/null @@ -1,221 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - my_layer - my_layer_2 - my_layer_3 - my_layer_4 - - Outputs - - probs@output - - - - Weights - - ../min.weights - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - - Name - my_layer - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_2 - - Bottom - my_layer - - Name - my_layer_2 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_3 - - Bottom - my_layer_2 - - Name - my_layer_3 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - my_layer_4 - - Bottom - my_layer_3 - - Name - my_layer_4 - OutputChannels - 3 - OutputType - Float16 - - Params - - KernelHeight - 1 - KernelWidth - 1 - - KernelIndex - 0 - KernelOffset - 0 - KernelType - Float32 - - Step - - 1 - 1 - - Type - Conv - - Type - Conv - - - probs@output - - Bottom - my_layer_4 - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/simple/reshape.plist b/tinygrad_repo/extra/accel/ane/2_compile/simple/reshape.plist deleted file mode 100644 index 50a008c..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/simple/reshape.plist +++ /dev/null @@ -1,85 +0,0 @@ - - - - - Networks - - net - - Version - 1.0.9 - net - - - Inputs - - image - - Units - - 
my_layer - - Outputs - - probs@output - - - image - - - BatchSize - 1 - - InputChannels - 3 - InputHeight - 1 - InputWidth - 1 - - InputPlaneStride - 64 - InputRowStride - 64 - InputType - Float16 - - - my_layer - - Bottom - image - Name - my_layer - OutputType - Float16 - Params - - ReshapedBatch - 1 - ReshapedChannel - 3 - ReshapedDepth - 1 - ReshapedHeight - 1 - ReshapedWidth - 1 - - Type - Reshape - - - probs@output - - Bottom - my_layer - OutputPlaneStride - 64 - OutputRowStride - 64 - - - - - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/struct_recover.py b/tinygrad_repo/extra/accel/ane/2_compile/struct_recover.py deleted file mode 100755 index 86e859b..0000000 --- a/tinygrad_repo/extra/accel/ane/2_compile/struct_recover.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -from ane import ANE -ane = ANE() - -lens = {} - -dat = b"\xff"*0x300 -ret = ane.debug(dat, 16) -for k,v in ret.items(): - found = None - for i in range(33): - #print(v, (1 << i) - 1) - if v == (1 << i) - 1: - found = i - break - #print(k, hex(v), found) - lens[k] = found - -pos = [] -dat = b"\x00"*0x300 -for i in range(0x300): - for j in range(8): - dat = b"\x00"*i - dat += bytes([1 << j]) - dat += b"\x00"*(0x300-len(dat)) - ret = ane.debug(dat, 16) - for k,v in ret.items(): - if v == 1: - print("0x%3x %d %2d" % (i, j, lens[k]), k) - pos.append((k, (i,j, lens[k]))) - -import json -jpos = json.dumps(pos, indent=2) -with open("aneregs.json", "w") as f: - f.write(jpos) - diff --git a/tinygrad_repo/extra/accel/ane/2_compile/twos.weights b/tinygrad_repo/extra/accel/ane/2_compile/twos.weights deleted file mode 100644 index be1b0e8..0000000 Binary files a/tinygrad_repo/extra/accel/ane/2_compile/twos.weights and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/3_run/build.sh b/tinygrad_repo/extra/accel/ane/3_run/build.sh deleted file mode 100755 index 896b620..0000000 --- a/tinygrad_repo/extra/accel/ane/3_run/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -e -clang++ 
test.mm -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit -codesign --entitlements entitlements.xml -s "Taylor Swift Child 2" a.out - diff --git a/tinygrad_repo/extra/accel/ane/3_run/entitlements.xml b/tinygrad_repo/extra/accel/ane/3_run/entitlements.xml deleted file mode 100644 index ffb6ff1..0000000 --- a/tinygrad_repo/extra/accel/ane/3_run/entitlements.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - com.apple.ane.iokit-user-access - - - diff --git a/tinygrad_repo/extra/accel/ane/3_run/h11ane.h b/tinygrad_repo/extra/accel/ane/3_run/h11ane.h deleted file mode 100644 index 3cc1e54..0000000 --- a/tinygrad_repo/extra/accel/ane/3_run/h11ane.h +++ /dev/null @@ -1,150 +0,0 @@ -enum ANEDeviceUsageType { - UsageNoProgram, - UsageWithProgram, // used in running process - UsageCompile // used in aned -}; - -struct H11ANEDeviceInfoStruct { - uint64_t program_handle; - uint64_t program_auth_code; - uint64_t sleep_timer; - uint64_t junk[0x100]; -}; - -struct H11ANEStatusStruct { - uint64_t junk[0x100]; -}; - -struct H11ANEProgramCreateArgsStruct { - void *program; - uint64_t program_length; - uint64_t empty[4]; - char has_signature; -}; - -struct H11ANEProgramCreateArgsStructOutput { - uint64_t program_handle; - int unknown[0x2000]; -}; - -struct H11ANEProgramPrepareArgsStruct { - uint64_t program_handle; - uint64_t flags; - uint64_t empty[0x100]; -}; - -struct H11ANEProgramRequestArgsStruct { - uint64_t args[0x1000]; -}; - -namespace H11ANE { - class H11ANEDevice; - - class H11ANEDeviceController { - public: - H11ANEDeviceController( - int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*), - void *arg); - int SetupDeviceController(); - private: // size is 0x50 - CFArrayRef array_ref; - mach_port_t *master_port; - IONotificationPortRef port_ref; - CFRunLoopSourceRef source_ref; - int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*); - void *callback_arg; - 
CFRunLoopRef run_loop_ref; - io_iterator_t io_iterator; - pthread_t thread_self; - uint64_t unused; - }; - - // we should switch to the IOKit kernel interface, it's likely a lot more stable - // actually this probably isn't true. ANEServices is normal dynamic links - // https://googleprojectzero.blogspot.com/2020/11/oops-i-missed-it-again.html - - // H11ANEInDirectPathClient - // _ANE_DeviceOpen - // _ANE_DeviceClose - // _ANE_ProgramSendRequest - - // * if they need kernel debugger attached - // H11ANEInUserClient - // _ANE_DeviceOpen - // _ANE_DeviceClose - // _ANE_ProgramSendRequest - // _ANE_ProgramCreate - // _ANE_ProgramPrepare - // _ANE_ProgramUnprepare - // _ANE_ProgramDestroy - // _ANE_GetStatus - // _ANE_PowerOn - // _ANE_PowerOff - // _ANE_IsPowered - // * _ANE_LoadFirmware - // * _ANE_ForgetFirmware - // * _ANE_SendCommand - // _ANE_SetPowerManagement - // _ANE_GetTime - // * _ANE_SetDriverLoggingFlags - // * _ANE_ShowSharedMemoryAllocations - // * _ANE_SetDARTCacheTTL - // * _ANE_SetFirmwareBootArg - // * _ANE_SetThrottlingPercentage - // * _ANE_AddPersistentClient - // * _ANE_RemovePersistentClient - // * _ANE_CreateClientLoggingSession - // * _ANE_TerminateClientLoggingSession - // _ANE_GetDriverLoggingFlags - // * _ANE_FlushInactiveDARTMappings - // _ANE_GetVersion - // _ANE_RegisterFirmwareWorkProcessor - // _ANE_UnregisterFirmwareWorkProcessor - // * _ANE_GetFirmwareWorkProcessorItem - // _ANE_CompleteFirmwareWorkProcessorItem - // _ANE_ReleaseFirmwareWorkProcessorBuffers - // * _ANE_ReadANERegister - // * _ANE_WriteANERegister - // _ANE_ProgramCreateInstance - - // note, this is not the raw IOKit class, it's in ANEServices.framework - class H11ANEDevice { - public: - H11ANEDevice(H11ANE::H11ANEDeviceController *param_1, unsigned int param_2); - - unsigned long H11ANEDeviceOpen( - int (*callback)(H11ANE::H11ANEDevice*, unsigned int, void*, void*), - void *param_2, ANEDeviceUsageType param_3, H11ANEDeviceInfoStruct *param_4); - - void 
EnableDeviceMessages(); - int ANE_AddPersistentClient(); - int ANE_GetStatus(H11ANEStatusStruct *param_1); - - // power management - int ANE_IsPowered(); - int ANE_PowerOn(); - int ANE_PowerOff(); - - // logging (e00002c7 error, needs PE_i_can_has_debugger) - int ANE_CreateClientLoggingSession(unsigned int log_iosurface); - int ANE_TerminateClientLoggingSession(unsigned int log_iosurface); - int ANE_GetDriverLoggingFlags(unsigned int *flags); - int ANE_SetDriverLoggingFlags(unsigned int flags); - - // program creation - int ANE_ProgramCreate(H11ANEProgramCreateArgsStruct*, - H11ANEProgramCreateArgsStructOutput*); - int ANE_ProgramPrepare(H11ANEProgramPrepareArgsStruct*); - int ANE_ProgramSendRequest(H11ANEProgramRequestArgsStruct*, mach_port_t); - - // need PE_i_can_has_debugger - int ANE_ReadANERegister(unsigned int param_1, unsigned int *param_2); - int ANE_ForgetFirmware(); - - - private: // size is 0x88 - unsigned char unknown[0x88]; - }; - -}; - diff --git a/tinygrad_repo/extra/accel/ane/3_run/test.mm b/tinygrad_repo/extra/accel/ane/3_run/test.mm deleted file mode 100644 index d76ac65..0000000 --- a/tinygrad_repo/extra/accel/ane/3_run/test.mm +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include - -#import - -#import -#import - -void hexdump(void *vdat, int l) { - unsigned char *dat = (unsigned char *)vdat; - for (int i = 0; i < l; i++) { - if (i!=0 && (i%0x10) == 0) printf("\n"); - printf("%02X ", dat[i]); - } - printf("\n"); -} - -#include "h11ane.h" - -using namespace H11ANE; - -H11ANEDevice *device = NULL; - -int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) { - printf("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3); - device = param_3; - return 0; -} - -int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) { - printf("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3); - 
return 0; -} - -int main() { - int ret; - printf("hello %d\n", getpid()); - - H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL); - dc.SetupDeviceController(); - assert(device != NULL); - H11ANEDevice *dev = device; - dev->EnableDeviceMessages(); - - char empty[0x90] = {0}; - H11ANEDeviceInfoStruct dis = {0}; - //dis.nothing = 0x87c15a20a; - //dis.sleep_timer = 5000; - ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis); - printf("open 0x%x %p\n", ret, dev); - - /*ret = dev->ANE_AddPersistentClient(); - printf("add persistent %x\n", ret);*/ - - H11ANEStatusStruct blah = {0}; - ret = dev->ANE_GetStatus(&blah); - printf("get status %x\n", ret); - - // this isn't callable anymore, it requires debugger - ret = dev->ANE_PowerOn(); - printf("power on: %x\n", ret); - - ret = dev->ANE_IsPowered(); - printf("powered? %d\n", ret); - - /*if (ret == 0) { - printf("POWER ON FAILED\n"); - return -1; - }*/ - - H11ANEProgramCreateArgsStruct mprog = {0}; - mprog.program_length = 0xc000; - char *prog = (char*)aligned_alloc(0x1000, mprog.program_length); - mprog.program = prog; - - FILE *f = fopen("../2_compile/model.hwx", "rb"); - assert(f); - int sz = fread(prog, 1, mprog.program_length, f); - printf("read %x %p\n", sz, prog); - fclose(f); - - H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput; - memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput)); - ret = dev->ANE_ProgramCreate(&mprog, out); - uint64_t program_handle = out->program_handle; - printf("program create: %lx %lx\n", ret, program_handle); - - H11ANEProgramPrepareArgsStruct pas = {0}; - pas.program_handle = program_handle; - pas.flags = 0x0000000100010001; - //pas.flags = 0x0000000102010001; - ret = dev->ANE_ProgramPrepare(&pas); - printf("program prepare: %lx\n", ret); - - // input buffer - NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys: - [NSNumber numberWithInt:16], kIOSurfaceWidth, - [NSNumber 
numberWithInt:16], kIOSurfaceHeight, - [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement, - [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow, - [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat, - nil]; - IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict); - int in_surf_id = IOSurfaceGetID(in_surf); - printf("we have surface %p with id 0x%x\n", in_surf, in_surf_id); - - // load inputs - IOSurfaceLock(in_surf, 0, nil); - unsigned char *inp = (unsigned char *)IOSurfaceGetBaseAddress(in_surf); - for (int i = 0; i < 16; i++) inp[i] = (i+1)*0x10; - /*inp[0] = 0x39; - inp[1] = 0x65;*/ - hexdump(inp, 0x20); - IOSurfaceUnlock(in_surf, 0, nil); - - // output buffer - NSDictionary* odict = [NSDictionary dictionaryWithObjectsAndKeys: - [NSNumber numberWithInt:16], kIOSurfaceWidth, - [NSNumber numberWithInt:16], kIOSurfaceHeight, - [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement, - [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow, - [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat, - nil]; - IOSurfaceRef out_surf = IOSurfaceCreate((CFDictionaryRef)odict); - int out_surf_id = IOSurfaceGetID(out_surf); - printf("we have surface %p with id 0x%x\n", out_surf, out_surf_id); - - H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct; - memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct)); - - // TODO: make real struct - pras->args[0] = program_handle; - pras->args[4] = 0x0000002100000003; - - // inputs - pras->args[0x28/8] = 1; - pras->args[0x128/8] = (long long)in_surf_id<<32LL; - - // outputs - pras->args[0x528/8] = 1; - // 0x628 = outputBufferSurfaceId - pras->args[0x628/8] = (long long)out_surf_id<<32LL; - - mach_port_t recvPort = 0; - IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort); - printf("recv port: 0x%x\n", recvPort); - - // *** reopen with other client *** - H11ANEDeviceController dc2(MyH11ANEDeviceControllerNotification, NULL); - dc2.SetupDeviceController(); - assert(device != NULL); - dev = 
device; - dev->EnableDeviceMessages(); - - char empty2[0x90] = {0}; - dis.program_handle = program_handle; - dis.program_auth_code = 0; - ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty2, UsageWithProgram, &dis); - printf("reopen 0x%x %p\n", ret, dev); - - // run program (i think we need the other client for this) - ret = dev->ANE_ProgramSendRequest(pras, recvPort); - printf("send 0x%x\n", ret); - - struct { - mach_msg_header_t header; - char data[256]; - } message; - - ret = mach_msg(&message.header, - MACH_RCV_MSG, - 0, sizeof(message), - recvPort, - MACH_MSG_TIMEOUT_NONE, - MACH_PORT_NULL); - printf("got message: %d sz %d\n", ret, message.header.msgh_size); - - unsigned char *dat = (unsigned char *)IOSurfaceGetBaseAddress(out_surf); - printf("%p\n", dat); - hexdump(dat, 0x100); -} - - diff --git a/tinygrad_repo/extra/accel/ane/README.md b/tinygrad_repo/extra/accel/ane/README.md deleted file mode 100644 index 289cff0..0000000 --- a/tinygrad_repo/extra/accel/ane/README.md +++ /dev/null @@ -1,98 +0,0 @@ -# The Apple Neural Engine - -The Apple Neural Engine is a fancy DMA Engine that is based around convolutions. We don't have all the details worked out yet, but we can do some things with it. At its core, it runs through 0x300 ops in an hwx file. See `aneregs` for the registers used in each op. - -It operates out of RAM or its 4MB L2 cache. The L2 "cache" appears to be manually managed, and only applies to the input and output, not the weights. The weights are usually included in the program, and it's unclear where they are copied to. - -The 16 cores likely refer to the 16 wide Kernel DMA engine. They claim 11 TOPS total, which would be 687.5 GOPS/core. Perhaps it's a 32x32 MAC running at 335 MHz. That clock speed matches the cycle count time ratio from the debug perf stats. - -It works with 5D Tensors, you specify the stride for the latter 4. 
All strides must be a multiple of 0x40 bytes -* Column (width) -- aneRegs.Common.InDim.Win / aneRegs.Common.OutDim.Wout -* Row (height) -- aneRegs.Common.InDim.Hin / aneRegs.Common.OutDim.Hout -* Plane (channels) -- aneRegs.Common.Cin.Cin / aneRegs.Common.Cout.Cout -* Depth -* Group (batch) -- aneRegs.Common.GroupConvCfg.NumGroups - -It works with 3 data types -* UInt8 -* Int8 -* Float16 - -The ops have several parts -* Header -- The base addresses for the DMA engines -* KernelDMASrc -- 16x wide DMA engine for the weights/bias/nonlinearity -* Common -- Specifies the parameters for the convolution -* TileDMASrc -- Input DMA engine -* L2 -- Use the L2 cache for Source/Result instead of RAM -* NE -- Configure Kernel/MAC/Post -* TileDMADst -- Output DMA engine - -It can work with 8 base addresses for the DMA streams per OP -* 2x Read, both used for things like sum -* 1x Write -* 1x T? -* 4x Kernel, though only the first one seems used - -## Normal Flow for ANE Usage - -* Keras/ONNX model -> coremltools -* CoreML model -> Espresso -* net.plist -> ANECompiler -* model.hwx -> ANEServices -* AppleH11ANEInterface, an IOKit interface to the kernel - -## hwx file? - -This is a Mach-O file. We haven't figured out all the details, but the ops are at 0x4000. See `hwx_parse.py` - -## amfid - -Sadly disabling amfi breaks things like vscode. You can runtime patch - -``` -# MacOS 12.4 - -smol :: ~/fun/tinygrad » sha1sum /usr/libexec/amfid -0f7e7f7e41408f83d7ebc7564a3828f41cb2ab58 /usr/libexec/amfid - -# with patching +0x8e38 - -(lldb) image list -[ 0] 04B6DF6C-6068-3F18-81A7-978985574387 0x0000000102ad0000 /usr/libexec/amfid -(lldb) p *(unsigned int *)0x102ad8e38=0xd2800000 -``` - -This disables the entitlement check, then you don't need a bootarg. I wish Apple made a better way to do this. 
- -## Extracting ANEServices.framework - -``` -# install xcode and -sudo xcode-select --switch /Applications/Xcode.app -# xcode also contains ANEServices.tbd -brew install keith/formulae/dyld-shared-cache-extractor -dyld-shared-cache-extractor /System/Library/dyld/dyld_shared_cache_arm64e /tmp/libraries -cp /tmp/libraries/System/Library/PrivateFrameworks/ANECompiler.framework/Versions/A/ANECompiler . -cp /tmp/libraries/System/Library/PrivateFrameworks/ANEServices.framework/Versions/A/ANEServices . -cp /tmp/libraries/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/Versions/A/AppleNeuralEngine . -``` - -## Other work - -``` -# sadly also relies on ZinIrRegBitPrintOutDebug -https://github.com/antgroup-arclab/ANETools.git - -# sadly looks like we do actually need a direct connection to run hwx files, aned is at the espresso level -* frame #0: 0x00000001c250fecc AppleNeuralEngine`-[_ANEDaemonConnection loadModel:sandboxExtension:options:qos:withReply:] -(lldb) po $x2 -_ANEModel: { modelURL=file:///var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/test_F48D9B88-A68D-476F-ADC8-32BDAF9A2498.mlmodelc/ : key={"isegment":0,"inputs":{"image":{"shape":[1,1,1,64,1]},"image2":{"shape":[1,1,1,64,1]}},"outputs":{"probs":{"shape":[1,1,1,64,1]}}} : string_id=0x00000000 : program=(null) : state=1 : programHandle=0 : intermediateBufferHandle=0 : queueDepth=0 : attr={ -} : perfStatsMask=0} -``` - -## Choices - -* Disable amfid (breaks vscode) -* Patch amfid to allow restricted entitlements -* Sign with a "provisioning profile" to allow the entitlement -* Patch the ANE kext to not require a special entitlement (this is ideal, as we don't need to resign python) diff --git a/tinygrad_repo/extra/accel/ane/README.old b/tinygrad_repo/extra/accel/ane/README.old deleted file mode 100644 index 83b97cb..0000000 --- a/tinygrad_repo/extra/accel/ane/README.old +++ /dev/null @@ -1,367 +0,0 @@ -kernel driver: AppleH11ANEInterface - requires entitlement: com.apple.ane.iokit-user-access - 
compiler is run in ANE_ProgramCreate_gated - -2 helper processes: - /usr/libexec/aned - ANECompilerService - -Espresso: - Contains ANECompilerEngine - -AppleNeuralEngine: Objective-C interface called by Espresso - ANEServices: communication with the device - ANECompiler: compile plist into hwx file - com.apple.ANECompilerService.allow in AppleNeuralEngine? - Called from ANECompilerService.xpc in AppleNeuralEngine.framework - -== Model Flow == - - Keras/ONNX model - | - | 1_build - | (coremltools, open source) - v - CoreML model - | - | TODO: automate this - | Grabbed plist from lldbing ANECompilerService during 1_build - | (Espresso) - v - net.plist - | - | 2_compile - | (AppleNeuralEngine, ANECompiler) - v - model.hwx - | - | 3_run - | (AppleNeuralEngine, ANEServices, AppleH11ANEInterface) - v - - -TODO: Write a nice plist grabber -DONE: Write a call to the compiler with plist+weights -DONE: Write an hwx runner - -== Tracing the Compiler == - -ANECCompileProcedure - ZinAneCreateIr - ZinParseNeuronUnit - ZinAneCoreCompile - ZinAneCodeGeneration - ZinIrCodegenHandleKernels - ZinIrTargetH13::CodegenTds - ZinIrCacheHintTable - ZinIrCodegenHandleTds_v7 - ZinIrCodegenHandleTdsMakeList<7u> - ZinAneInstruction - ZinAneTd<7u>::HandleEngineLayer - ZinAneInstruction::HandleTdHeader - HandleNELayer<7u> - ZinAneInstruction::HandleCommonConfig - ZinAneInstruction::HandleCommonConfigCommonOpcodes - ZinIrCodegenHandleTds<7u> - 0x1bb93ae00 <-- this is the store of the first byte in the hwx - CalculateSizeInBytesFromRegCount (x*4+4) - 0xf @ 0x128-0x168 (base 0x1003047b0) - 0x1b @ 0x16c-0x1dc - 0x11 @ 0x1e0-0x228 - 0x3 @ 0x22c-0x23c - 0x4 @ 0x240-0x254 - 0x6 @ 0x258-0x274(end) - AddReloc (this is gold! 
x4 goes in the hwx) - ZinAneTd<7u>::HandleEngineLayer - -rbreak ^ZinAneInstruction* - -weeee ZinIrRegBitPrintOutDebug_7u_ -print (void)debugregs(0, 0x0000000100211030+8, 3) - -== min.plist == - -Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat - - -ops have length 0x300, seems like one basic op repeated - -header 0x0-0x1c - -u32 0x1c = next op offset -u16 0x20 = output address? - -== section break 0x2c (weights) == -reloc 0x2c-0x74 = K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_0 - K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_1 - -16 output channel parallel: -u32[16] 0x34-0x74 = 0x80 | 1 if used -u32[16] 0x74-0xB4 = -u32[16] 0xB4-0xF4 = - -== section break 0x128 (conv) == -u16 0x128 = InputWidth -u16 0x12A = InputHeight -u16 0x12C = InputDepth - -u32 0x130 = (OutputType * 0x10) | InputType - -u32 0x134 = InputChannels -u32 0x138 = OutputChannels - -u16 0x13C = OutputWidth -u16 0x13E = OutputHeight -u16 0x140 = OutputDepth - -u16 0x144 = 0xa000 | (KernelHeight * 0x20) | KernelWidth -u16 0x146 = 0x5000 | (PadTop * 0x40) | (PadLeft * 2) - -u16 0x14C = BatchSize -u32 0x150 = OutputHeight? - -== section break 0x16c (input) == -reloc 0x16c-0x174 = image - -u32 0x178 = InputRowStride -u32 0x17C = InputPlaneStride -u32 0x180 = InputDepthStride -u32 0x184 = InputBatchStride - -u8 0x1A7 = InputInterleave - -== section break 0x1e0 == -u8 0x1E5 = InputInterleave - -u32 0x1F4 = InputChannels * 0x10 -u32 0x1F8 = InputDepth * InputChannels * 0x10 - -u8 0x211 = OutputInterleave - -u32 0x220 = OutputChannels * 0x10 -u32 0x224 = OutputDepth * OutputChannels * 0x10 - -== section break 0x22c (scaling) == -u16 0x230 = BiasScalar -u16 0x232 = ScaleScalar - -== section break 0x240 == -u8 0x240 = 0x80 | KernelType -u8 0x241 = 4 * hasbatch -u16 0x246 = 0x10 | 2 * neuron? 
- -== section break 0x258 (output) == -reloc 0x258-0x25c = probs@output/src - -u32 0x260 = OutputRowStride -u32 0x264 = OutputPlaneStride -u32 0x268 = OutputDepthStride -u32 0x26C = OutputBatchStride - -u8 0x273 = OutputInterleave - -== Zin Constants == - -kZinIrOpCodeConv = 0? -kZinIrOpCodePool = 1 -kZinIrOpCodeElementWiseOp = 6 -kZinIrOpCodeConcat = 8 -kZinIrOpCodeFlattenComposite -kZinIrOpCodeNEConvOp -kZinIrOpCodeTranspose - - 0: CONV - 1: POOL - 2: SCALE_BIAS - 3: TERNARY_DYNAMIC_GOC - 4: BINARY_DYNAMIC_GOC - 5: ACTIVATION - 6: EW - 7: SCALED_EW - 8: CONCAT - 9: SPLIT -10: COPY -11: FLATTEN -12: UNFLATTEN -13: CROSS_CORRELATION -14: KERNEL_RASTERIZER -15: ARG_MIN_MAX -16: MATRIX_MULT -17: BROADCAST -18: FLATTEN_COMPOSITE -19: UNFLATTEN_COMPOSITE -20: KERNEL_RASTERIZER_COMPOSITE -21: CROSS_CORRELATION_COMPOSITE -22: LIVE_IN -23: CONST_IN -24: LIVE_OUT -25: REDUCTION -26: ALIAS -27: Typecast -28: RESHAPE -29: VIEW -30: TRANSPOSE -31: SPACE_TO_BATCH -32: BATCH_TO_SPACE -33: SOFTMAX -34: INSTANCE_NORM -35: L2_NORM -36: MINMAX_NORM -37: LRN -38: COST_VOLUME -39: PIXEL_SHUFFLE -40: FPS -41: RS -42: PEFUSED_ELEMENTWISE -43: PEFUSED_POOL -44: PEFUSED_GOC -45: NEFUSED_CONV -46: NEFUSED_POOL -47: NEFUSED_EW -48: NEFUSED_BYPASS - -# guessing from the hwx -kZinTensorFormatUInt8 = 0 -kZinTensorFormatInt8 = 1 -kZinTensorFormatFloat16 = 2 -kZinTensorFormatInvalid - -Zin (plist format) ---(ZinAneCoreCompile)---> Mir (hwx format)? - ZinAneCodeGeneration? 
- -ZinIrStatus GetKernelFormat<6u>(ZinKernelFormat param_1,ane_ne_kernel_cfg_kernel_fmt *param_2) - List of allowed numbers - -NeuronTypes (changes the LUT): - Tanh - Log2 - Exp2 - Sign = ZinMirActivationV7::GetSignLut - ...many more in ANECompiler - -Investigate: - ZinMirActivationV7::PrintLut(ZinMirActivationV7 *this,ane_nonlinear_lut_v7up_t *param_1 - - 0: NONE - 1: RELU - 2: SIGMOID - 3: SIGMOID_HIGH_PRECISION - 4: TANH - 5: CLAMPED_RELU - 6: PRELU - 7: DIRAC - 8: INT - 9: FRAC -10: SQRT -11: RSQRT -12: INV -13: SQR -14: LOG2 -15: EXP2 -16: ELU -17: SIGN -18: EQUAL_ZERO -19: NON_ZERO -20: LESS_THAN_ZERO -21: LESS_EQUAL_ZERO -22: GREATER_EQUAL_ZERO -23: GREATER_THAN_ZERO -24: CUSTOM_LUT -25: C_DIM_CONCAT -26: C_DIM_STRIDED_CONCAT -27: H_DIM_CONCAT -28: W_DIM_CONCAT -29: D_DIM_CONCAT -30: N_DIM_CONCAT -31: H_DIM_STRIDED_CONCAT - -CacheHint -0: ALLOC -1: NOALLOC -2: DROP -3: DEPRI - -conv kinds -0: regular -1: channelwise -2: grouped - -== plist exploration == - -Float16 -> UInt8 for output works, Float32 doesn't -Same for input -All weights must be float - -Index 0: model.espresso.weights @ 192 is weights -Index 1: net.additional.weights @ 0 is bias - -Float16 -> Float32 for bias works - -It's possible the compiler is Float32 -> Float16 converting, and the engine only supports Float16 + UInt8 - -== call to the compiler (in dmesg!) 
== - -[54476.282258]: H11ANEIn: ANE_ProgramCreate_gated:, ZinComputeProgramMake, get Mcache size: 0x0 -[54476.282259]: H11ANEIn: ANE_ProgramCreate_gated:,Program Identifier:ANEC v1 -zin_ane_compiler v4.2.1 - -t h13 - --fdram-allocator=ffreuse - --fdram-tensor-priority=sizethenliverange - --fl2-allocator=ffreuse - --fl3-allocator=ffreuse - --fl2-cache-mode=resident - --fsignature=ident - --memcache-strategy= -[54476.282262]: --memcache-size=4194304 - --fspatial-split=disabled - --fkernel-rewind=enabled - --Wl-undefined=fvmlib - -i /Library/Caches/com.apple.aned/tmp/Python/DB7E897E7F4D5D27501A998428B6D3863AFD96CEA82DAF2207A75394E6BAC44C/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/net.plist - -o /Library/Caches/com.apple.aned/20A2411/Python/C9981871BC59572E74AFA3014B183EA37567EE9A2A08328446CE4A2B754E109D/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/model.hwx.tmp - -== ANECCompile (in ANECompiler framework) == - ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3) - -param_1: -{ - InputNetworks = ( - { - NetworkPlistName = "net.plist"; - NetworkPlistPath = "/Library/Caches/com.apple.aned/tmp/run/A2ACB9D5AA31B301563A4F62885BA379E62B0E1240E95C6902A93900FE0A9B54/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/"; - } - ); - OutputFileName = "model.hwx.tmp"; - OutputFilePath = "/Library/Caches/com.apple.aned/20A2411/run/E68910CD1994681121EEDAFAE1BC524AA8E84CF80C42AFC0C7DE2C082C58BDFD/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/"; -} - -param_2: -{ - TargetArchitecture = h13; -} - -== Backtrace of device open == - - * frame #0: 0x00000001a68fac54 ANEServices`H11ANEDeviceOpen - frame #1: 0x00000001a78405b8 AppleNeuralEngine`__29-[_ANEDeviceController start]_block_invoke + 436 - frame #2: 0x0000000193c84420 libdispatch.dylib`_dispatch_client_callout + 20 - frame #3: 0x0000000193c92a98 libdispatch.dylib`_dispatch_lane_barrier_sync_invoke_and_complete + 60 - frame #4: 
0x00000001a78403e8 AppleNeuralEngine`-[_ANEDeviceController start] + 136 - ... - frame #23: 0x00000001a64a4f38 Espresso`Espresso::ANERuntimeEngine::compiler::build_segment(std::__1::shared_ptr const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2080 - ... - frame #31: 0x000000019ab6099c CoreML`-[MLNeuralNetworkEngine rebuildPlan:] + 1640 - -== Backtrace of run? == - - * frame #0: 0x00000001a68f9108 ANEServices`H11ANEProgramProcessRequestDirect - frame #1: 0x00000001a7839694 AppleNeuralEngine`-[_ANEProgramForEvaluation processRequest:qos:qIndex:modelStringID:options:error:] + 1904 - frame #2: 0x00000001a7843ba4 AppleNeuralEngine`-[_ANEClient doEvaluateDirectWithModel:options:request:qos:error:] + 1236 - frame #3: 0x00000001a7842034 AppleNeuralEngine`-[_ANEClient evaluateWithModel:options:request:qos:error:] + 104 - frame #4: 0x00000001a64a2988 Espresso`Espresso::ANERuntimeEngine::compiler::__forward_segment(std::__1::shared_ptr const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2008 - frame #5: 0x00000001a6414548 Espresso`Espresso::net_compiler_segment_based::__forward(std::__1::shared_ptr const&) + 992 - frame #6: 0x00000001a67e2e3c Espresso`EspressoLight::espresso_plan::dispatch_task_on_compute_batch(std::__1::shared_ptr const&, std::__1::shared_ptr const&) + 612 - frame #7: 0x00000001a67ebab0 Espresso`EspressoLight::espresso_plan::execute_sync() + 356 - frame #8: 0x00000001a67f26fc Espresso`espresso_plan_execute_sync + 120 - frame #9: 0x000000019ab674b8 CoreML`-[MLNeuralNetworkEngine executePlan:error:] + 136 - frame #10: 0x000000019ab6799c CoreML`-[MLNeuralNetworkEngine evaluateInputs:bufferIndex:options:error:] + 368 - diff --git a/tinygrad_repo/extra/accel/ane/amfi/new_patch.py b/tinygrad_repo/extra/accel/ane/amfi/new_patch.py deleted file mode 100644 index 5fcd1d3..0000000 --- a/tinygrad_repo/extra/accel/ane/amfi/new_patch.py +++ /dev/null @@ -1,102 +0,0 @@ -import ctypes -from subprocess import check_output -from 
hexdump import hexdump - -def get_pid(name): - try: - output = check_output(["pgrep", name]) - return int(output) - except: - return None - -from ctypes.util import find_library -libc = ctypes.CDLL(find_library('c')) - -amfid_pid = get_pid("amfid") - -task = ctypes.c_uint32() -mytask = libc.mach_task_self() -ret = libc.task_for_pid(mytask, ctypes.c_int(amfid_pid), ctypes.pointer(task)) -print(amfid_pid, ret, task, mytask) - -#myport = libc.mach_task_self() - -class vm_region_submap_short_info_data_64(ctypes.Structure): - _pack_ = 1 - _fields_ = [ - ("protection", ctypes.c_uint32), - ("max_protection", ctypes.c_uint32), - ("inheritance", ctypes.c_uint32), - ("offset", ctypes.c_ulonglong), - ("user_tag", ctypes.c_uint32), - ("ref_count", ctypes.c_uint32), - ("shadow_depth", ctypes.c_uint16), - ("external_pager", ctypes.c_byte), - ("share_mode", ctypes.c_byte), - ("is_submap", ctypes.c_uint32), - ("behavior", ctypes.c_uint32), - ("object_id", ctypes.c_uint32), - ("user_wired_count", ctypes.c_uint32), - ] -submap_info_size = ctypes.sizeof(vm_region_submap_short_info_data_64) // 4 - -address = ctypes.c_ulong(0) -mapsize = ctypes.c_ulong(0) -count = ctypes.c_uint32(submap_info_size) -sub_info = vm_region_submap_short_info_data_64() -depth = 0 - -c_depth = ctypes.c_uint32(depth) -for i in range(1): - ret = libc.mach_vm_region_recurse(task, - ctypes.pointer(address), ctypes.pointer(mapsize), - ctypes.pointer(c_depth), ctypes.pointer(sub_info), - ctypes.pointer(count)) - print("aslr", hex(ret), hex(address.value), mapsize, count, sub_info.protection) - #address.value += mapsize.value -#exit(0) - -patch_address = address.value + 0x8e38 -patch = b"\x00\x00\x80\xd2" - -pdata = ctypes.c_void_p(0) -data_cnt = ctypes.c_uint32(0) - -ret = libc.mach_vm_read(task, ctypes.c_ulong(patch_address), 4, ctypes.pointer(pdata), ctypes.pointer(data_cnt)) -buf = ctypes.string_at(pdata.value, data_cnt.value) -hexdump(buf) - -#ret = libc.mach_vm_wire(mytask, task, patch_address, 4, 3) 
-#print(ret) -#exit(0) - -""" -ret = libc.mach_vm_read(task, address, mapsize, ctypes.pointer(pdata), ctypes.pointer(data_cnt)) -buf = ctypes.string_at(pdata.value, data_cnt.value) -hexdump(buf) - -ret = libc.mach_vm_deallocate(task, address, mapsize) -print("mach_vm_deallocate", ret) - -ret = libc.mach_vm_allocate(task, ctypes.pointer(address), mapsize, 0) -print("mach_vm_allocate", ret) -""" - -ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, True, 3) -print("protect", ret) - -longptr = ctypes.POINTER(ctypes.c_ulong) -#shellcodePtr = ctypes.cast(buf, longptr) -#ret = libc.mach_vm_write(task, address, shellcodePtr, len(buf)) -#print("write", ret) - -shellcodePtr = ctypes.cast(patch, longptr) -ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(buf)) -print("write", ret) - -#libc.mach_vm_write.argtypes = [ctypes.c_uint32, ctypes.c_ulong, longptr, ctypes.c_uint32] -#libc.mach_vm_write.restype = ctypes.c_uint32 -#ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(patch)) - -ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, False, 5) -print("protect", ret) \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/aneregs b/tinygrad_repo/extra/accel/ane/aneregs deleted file mode 100644 index fef7791..0000000 --- a/tinygrad_repo/extra/accel/ane/aneregs +++ /dev/null @@ -1,220 +0,0 @@ -// ZinIrRegBitPrintOutDebug_7u_ - -Task_ID: 0 - -header = 10*4 = 0x28 - -aneTD.Header[0].TID = 0 -aneTD.Header[0].NID = 0 -aneTD.Header[0].LNID = 1 -aneTD.Header[0].EON = 1 -aneTD.Header[1].ExeCycles = 0 -aneTD.Header[1].NextSize = 0 -aneTD.Header[2].LogEvents = 1058 -aneTD.Header[3].Exceptions = 0 -aneTD.Header[4].DebugLogEvents = 16775274 -aneTD.Header[5].DebugExceptions = 0 -aneTD.Header[6].DisallowAbort = 0 -aneTD.Header[6].TDSkip = 0 -aneTD.Header[6].KPC = 0 -aneTD.Header[6].SPL = 0 -aneTD.Header[6].TSR = 0 -aneTD.Header[6].SPC = 0 -aneTD.Header[6].DPC = 0 -aneTD.Header[6].TSE = 0 
-aneTD.Header[6].NextPriority = 0 -aneTD.Header[6].TDE = 0 -aneTD.Header[6].SrcLoc = 1 -aneTD.Header[6].DstLoc = 1 -aneTD.Header[6].TQDis = 0 -aneTD.Header[7].NextPointer = 0 -aneTD.Header[8].RBase0 = 5 -aneTD.Header[8].RBE0 = 1 -aneTD.Header[8].RBase1 = 0 -aneTD.Header[8].RBE1 = 0 -aneTD.Header[8].WBase = 4 -aneTD.Header[8].WBE = 1 -aneTD.Header[8].TBase = 0 -aneTD.Header[8].TBE = 0 -aneTD.Header[8].ENE = 1 -aneTD.Header[9].KBase0 = 1 -aneTD.Header[9].KBE0 = 1 -aneTD.Header[9].KBase1 = 0 -aneTD.Header[9].KBE1 = 0 -aneTD.Header[9].KBase2 = 0 -aneTD.Header[9].KBE2 = 0 -aneTD.Header[9].KBase3 = 0 -aneTD.Header[9].KBE3 = 0 - -0x28 = 00 F8 01 F4 = 0x1F800 -+0x30 -aneRegs.KernelDMASrc.CoeffBaseAddr[0].Addr = 0 -aneRegs.KernelDMASrc.CoeffBfrSize[0].MemBfrSize = 2 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint = 2 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].CrH = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].En = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn = 0 -aneRegs.KernelDMASrc.CoeffBaseAddr[1].Addr = 0 -aneRegs.KernelDMASrc.CoeffBfrSize[1].MemBfrSize = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint = 2 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].CrH = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].En = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn = 0 -aneRegs.KernelDMASrc.CoeffBaseAddr[2].Addr = 0 -aneRegs.KernelDMASrc.CoeffBfrSize[2].MemBfrSize = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint = 2 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].CrH = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].En = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn = 0 -# there's 13 more of these -aneRegs.KernelDMASrc.Spare0.Spare = 0 -aneRegs.KernelDMASrc.Spare1.Spare = 0 - -0x124 = 00 00 00 3C = 0 -+0x1d4 -aneRegs.Common.Cfg.AccDoubleBufEn = 1 -aneRegs.Common.Cfg.ActiveNE = 0 -aneRegs.Common.Cfg.ContextSwitchIn = 0 -aneRegs.Common.Cfg.ContextSwitchOut = 0 -aneRegs.Common.Cfg.ShMax = 1 -aneRegs.Common.Cfg.ShMin = 0 -aneRegs.Common.Cfg.ShPref = 1 
-aneRegs.Common.Cfg.SmallSourceMode = 0 -aneRegs.Common.ChCfg.InFmt = 2 -aneRegs.Common.ChCfg.OutFmt = 2 -aneRegs.Common.Cin.Cin = 1 -aneRegs.Common.ConvCfg.Kh = 1 -aneRegs.Common.ConvCfg.Kw = 1 -aneRegs.Common.ConvCfg.OCGSize = 0 -aneRegs.Common.ConvCfg.Ox = 1 -aneRegs.Common.ConvCfg.Oy = 1 -aneRegs.Common.ConvCfg.Px = 0 -aneRegs.Common.ConvCfg.Py = 0 -aneRegs.Common.ConvCfg.Sx = 1 -aneRegs.Common.ConvCfg.Sy = 1 -aneRegs.Common.Cout.Cout = 1 -aneRegs.Common.DPE.Category = 0 -aneRegs.Common.GroupConvCfg.ElemMultMode = 0 -aneRegs.Common.GroupConvCfg.NumGroups = 1 -aneRegs.Common.GroupConvCfg.UnicastCin = 1 -aneRegs.Common.GroupConvCfg.UnicastEn = 1 -aneRegs.Common.InDim.Hin = 1 -aneRegs.Common.InDim.Win = 77 -aneRegs.Common.OutDim.Hout = 1 -aneRegs.Common.OutDim.Wout = 77 -aneRegs.Common.Spare0.Spare = 0 -aneRegs.Common.Spare1.Spare = 0 -aneRegs.Common.TaskInfo.NID = 1 -aneRegs.Common.TaskInfo.TaskID = 0 -aneRegs.Common.TaskInfo.TaskQ = 0 -aneRegs.Common.TileCfg.TileHeight = 1 - -0x168 = 00 38 01 6C = 0x13800 -+0x220 -aneRegs.TileDMASrc.BaseAddr.Addr = 0 -aneRegs.TileDMASrc.DMAConfig.CacheHint = 2 -aneRegs.TileDMASrc.DMAConfig.CacheHintNoReuse = 12 -aneRegs.TileDMASrc.DMAConfig.CacheHintReuse = 14 -aneRegs.TileDMASrc.DMAConfig.CrH = 0 -aneRegs.TileDMASrc.DMAConfig.DependencyMode = 0 -aneRegs.TileDMASrc.DMAConfig.En = 1 -aneRegs.TileDMASrc.Fmt.CmpVec = 0 -aneRegs.TileDMASrc.DepthStride.Stride = 3 -aneRegs.TileDMASrc.Fmt.FmtMode = 1 -aneRegs.TileDMASrc.Fmt.Interleave = 1 -aneRegs.TileDMASrc.Fmt.MemFmt = 2 -aneRegs.TileDMASrc.Fmt.OffsetCh = 0 -aneRegs.TileDMASrc.Fmt.Shift = 0 -aneRegs.TileDMASrc.Fmt.Truncate = 3 -aneRegs.TileDMASrc.GroupStride.Stride = 0 -aneRegs.TileDMASrc.PixelOffset[0].Offset = 0 -aneRegs.TileDMASrc.PixelOffset[1].Offset = 0 -aneRegs.TileDMASrc.PixelOffset[2].Offset = 0 -aneRegs.TileDMASrc.PixelOffset[3].Offset = 0 -aneRegs.TileDMASrc.PlaneStride.PlaneStride = 3 -aneRegs.TileDMASrc.RowStride.Stride = 3 -aneRegs.TileDMASrc.Spare0.Spare = 0 
-aneRegs.TileDMASrc.Spare1.Spare = 0 - -0x1dc = 00 48 00 44 = 0x4800 -+0x29c -aneRegs.L2.ResultBase.Addr = 10 -aneRegs.L2.ResultCfg.AliasConvRslt = 0 -aneRegs.L2.ResultCfg.AliasConvSrc = 0 -aneRegs.L2.ResultCfg.AliasPlanarRslt = 0 -aneRegs.L2.ResultCfg.AliasPlanarSrc = 0 -aneRegs.L2.ResultCfg.ResultType = 2 -aneRegs.L2.ResultCfg.DMACmpVec = 0 -aneRegs.L2.ResultCfg.DMAFmt = 1 -aneRegs.L2.ResultCfg.DMAInterleave = 1 -aneRegs.L2.ResultCfg.DMAOffsetCh = 0 -aneRegs.L2.ResultCfg.L2BfrMode = 1 -aneRegs.L2.ConvResultChannelStride.Stride = 0 -aneRegs.L2.ConvResultRowStride.Stride = 0 -aneRegs.L2.L2Cfg.InputReLU = 0 -aneRegs.L2.L2Cfg.PaddingMode = 0 -aneRegs.L2.Spare0.Spare = 0 -aneRegs.L2.Spare1.Spare = 0 -aneRegs.L2.SourceBase.Addr = 0 -aneRegs.L2.SourceCfg.AliasConvRslt = 0 -aneRegs.L2.SourceCfg.AliasConvSrc = 0 -aneRegs.L2.SourceCfg.AliasPlanarRslt = 0 -aneRegs.L2.SourceCfg.AliasPlanarSrc = 0 -aneRegs.L2.SourceCfg.DMACmpVec = 0 -aneRegs.L2.SourceCfg.DMAFmt = 1 -aneRegs.L2.SourceCfg.DMAInterleave = 1 -aneRegs.L2.SourceCfg.DMAOffsetCh = 0 -aneRegs.L2.SourceCfg.Dependent = 0 -aneRegs.L2.SourceCfg.SourceType = 2 -aneRegs.L2.SourceChannelStride.Stride = 10 -aneRegs.L2.SourceRowStride.Stride = 10 - -0x228 = 00 88 00 0C = 0x8800 -+0x2f0 -0x23C = 00 C8 00 10 = 0xC800 -+0x30c -aneRegs.NE.AccBias.AccBias = 0 -aneRegs.NE.AccBias.AccBiasShift = 0 -aneRegs.NE.KernelCfg.GroupKernelReuse = 0 -aneRegs.NE.KernelCfg.KernelFmt = 0 -aneRegs.NE.KernelCfg.PalettizedBits = 8 -aneRegs.NE.KernelCfg.PalettizedEn = 0 -aneRegs.NE.KernelCfg.SparseFmt = 0 -aneRegs.NE.MACCfg.BiasMode = 0 -aneRegs.NE.MACCfg.BinaryPoint = 0 -aneRegs.NE.MACCfg.KernelMode = 1 -aneRegs.NE.MACCfg.MatrixBiasEn = 0 -aneRegs.NE.MACCfg.NonlinearMode = 2 -aneRegs.NE.MACCfg.OpMode = 4 -aneRegs.NE.MACCfg.PostScaleMode = 0 -aneRegs.NE.MatrixVectorBias.MatrixVectorBias = 0 -aneRegs.NE.PostScale.PostRightShift = 0 -aneRegs.NE.PostScale.PostScale = 15360 -aneRegs.NE.Spare0.Spare = 0 -aneRegs.NE.Spare1.Spare = 0 - -0x254 = 00 78 01 18 
= 0x17800 -+0x32c -aneRegs.TileDMADst.BaseAddr.Addr = 0 -aneRegs.TileDMADst.DepthStride.DepthStride = 3 -aneRegs.TileDMADst.DMAConfig.BypassEOW = 0 -aneRegs.TileDMADst.DMAConfig.CacheHint = 3 -aneRegs.TileDMADst.DMAConfig.CrH = 0 -aneRegs.TileDMADst.DMAConfig.En = 1 -aneRegs.TileDMADst.DMAConfig.L2BfrMode = 1 -aneRegs.TileDMADst.Fmt.CmpVec = 0 -aneRegs.TileDMADst.Fmt.CmpVecFill = 0 -aneRegs.TileDMADst.Fmt.FmtMode = 1 -aneRegs.TileDMADst.Fmt.Interleave = 1 -aneRegs.TileDMADst.Fmt.MemFmt = 2 -aneRegs.TileDMADst.Fmt.OffsetCh = 0 -aneRegs.TileDMADst.Fmt.Shift = 0 -aneRegs.TileDMADst.Fmt.Truncate = 3 -aneRegs.TileDMADst.Fmt.ZeroPadFirst = 1 -aneRegs.TileDMADst.Fmt.ZeroPadLast = 1 -aneRegs.TileDMADst.GroupStride.GroupStride = 0 -aneRegs.TileDMADst.PlaneStride.PlaneStride = 3 -aneRegs.TileDMADst.RowStride.RowStride = 3 -aneRegs.TileDMADst.Spare0.Spare = 0 -aneRegs.TileDMADst.Spare1.Spare = 0 - diff --git a/tinygrad_repo/extra/accel/ane/lib/.gitignore b/tinygrad_repo/extra/accel/ane/lib/.gitignore deleted file mode 100644 index 997aa25..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/.gitignore +++ /dev/null @@ -1 +0,0 @@ -libane.dylib diff --git a/tinygrad_repo/extra/accel/ane/lib/ane.mm b/tinygrad_repo/extra/accel/ane/lib/ane.mm deleted file mode 100644 index 309ef9d..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/ane.mm +++ /dev/null @@ -1,210 +0,0 @@ -#include -#include -#include -#include - -#import - -#import -#import - -#include "h11ane.h" -using namespace H11ANE; - -//#define DEBUG printf -#define DEBUG(x, ...) 
- -extern "C" { - -// global vars -H11ANEDevice *dev = NULL; - -int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) { - DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3); - dev = param_3; - return 0; -} - -int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) { - DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3); - return 0; -} - -int ANE_Open() { - int ret; - H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL); - dc.SetupDeviceController(); - assert(dev != NULL); - dev->EnableDeviceMessages(); - - char empty[0x90] = {0}; - H11ANEDeviceInfoStruct dis = {0}; - ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis); - DEBUG("open 0x%x %p\n", ret, dev); - - ret = dev->ANE_PowerOn(); - DEBUG("power on: %d\n", ret); - - ret = dev->ANE_IsPowered(); - DEBUG("powered? 
%d\n", ret); - - return 0; -} - -int stride_for_width(int width) { - int ret = width*2; - ret += (64-(ret % 64))%64; - return ret; -} - -void *ANE_TensorCreate(int width, int height) { - // all float16 - // input buffer - - NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys: - [NSNumber numberWithInt:width], kIOSurfaceWidth, - [NSNumber numberWithInt:height], kIOSurfaceHeight, - [NSNumber numberWithInt:2], kIOSurfaceBytesPerElement, - [NSNumber numberWithInt:stride_for_width(width)], kIOSurfaceBytesPerRow, - [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat, - nil]; - IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict); - IOSurfaceLock((IOSurfaceRef)in_surf, 0, nil); - - return (void *)in_surf; -} - -void* ANE_TensorData(void *out_surf) { - void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf); - //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil); - DEBUG("TensorData %p -> %p\n", out_surf, ret); - return ret; -} - -uint64_t ANE_Compile(char *iprog, int sz) { - int ret; - int cksum = 0; - for (int i = 0; i < sz; i++) cksum += iprog[i]; - DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz); - - char *prog = (char*)aligned_alloc(0x1000, sz); - memcpy(prog, iprog, sz); - - H11ANEProgramCreateArgsStruct mprog = {0}; - mprog.program = prog; - mprog.program_length = sz; - - H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput; - memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput)); - ret = dev->ANE_ProgramCreate(&mprog, out); - uint64_t program_handle = out->program_handle; - delete out; - DEBUG("program create: %lx %lx\n", ret, program_handle); - // early failure - if (ret != 0) return 0; - - H11ANEProgramPrepareArgsStruct pas = {0}; - pas.program_handle = program_handle; - pas.flags = 0x0000000100010001; - ret = dev->ANE_ProgramPrepare(&pas); - DEBUG("program prepare: %lx\n", ret); - - return program_handle; -} - -int ANE_Run(uint64_t program_handle, void *in_surf, void 
*out_surf, void *w_surf) { - int ret; - DEBUG("ANE_Run %p %p\n", in_surf, out_surf); - H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct; - memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct)); - - // TODO: make real struct - pras->args[0] = program_handle; - pras->args[4] = 0x0000002100000003; - - // inputs - int in_surf_id = IOSurfaceGetID((IOSurfaceRef)in_surf); - int out_surf_id = IOSurfaceGetID((IOSurfaceRef)out_surf); - - if (w_surf != NULL) { - pras->args[0x28/8] = 0x0000010000000002; - int w_surf_id = IOSurfaceGetID((IOSurfaceRef)w_surf); - pras->args[0x130/8] = (long long)w_surf_id; - } else { - pras->args[0x28/8] = 1; - } - pras->args[0x128/8] = (long long)in_surf_id<<32LL; - - // outputs - pras->args[0x528/8] = 1; - // 0x628 = outputBufferSurfaceId - pras->args[0x628/8] = (long long)out_surf_id<<32LL; - - mach_port_t recvPort = 0; - IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort); - DEBUG("recv port: 0x%x\n", recvPort); - - // run program - ret = dev->ANE_ProgramSendRequest(pras, recvPort); - DEBUG("send 0x%x\n", ret); - - struct { - mach_msg_header_t header; - char data[256]; - } message; - - ret = mach_msg(&message.header, - MACH_RCV_MSG, - 0, sizeof(message), - recvPort, - MACH_MSG_TIMEOUT_NONE, - MACH_PORT_NULL); - DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size); - delete pras; - - return 0; -} - -int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3); -int ANE_CompilePlist(char *path, bool debug=false) { - CFTypeRef ikeys[2]; - ikeys[0] = CFSTR("NetworkPlistName"); - ikeys[1] = CFSTR("NetworkPlistPath"); - - CFTypeRef ivalues[2]; - ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, path, kCFStringEncodingUTF8); - ivalues[1] = CFSTR("./"); - - CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, 
(const void**)&iDictionary, 1, &kCFTypeArrayCallBacks); - - CFMutableDictionaryRef optionsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); - - // h11 (or anything?) works here too, and creates different outputs that don't run - CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13")); - CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx")); - - if (debug) { - CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue); - int debug_mask = 0x7fffffff; - CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, 3, &debug_mask)); - } - - return ANECCompile(optionsDictionary, flagsDictionary, 0); -} - -/*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE( - unsigned long param_1, void *param_2,int param_3, std::basic_ostream *param_4); -char *ANE_RegDebug(int a1, void *dat, int a2) { - std::ostringstream ss; - _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &ss); - std::string cppstr = ss.str(); - const char *str = cppstr.c_str(); - char *ret = (char *)malloc(strlen(str)+1); - strcpy(ret, str); - return ret; -}*/ - -} - diff --git a/tinygrad_repo/extra/accel/ane/lib/ane.py b/tinygrad_repo/extra/accel/ane/lib/ane.py deleted file mode 100755 index 2e430c0..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/ane.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python3 -from pathlib import Path -from ctypes import * -import json -import collections -import numpy as np -import faulthandler -import struct -faulthandler.enable() - -basedir = 
Path(__file__).resolve().parent - -libane = None -aneregs = None -def init_libane(): - global libane, aneregs - libane = cdll.LoadLibrary((basedir / "libane.dylib").as_posix()) - - libane.ANE_Compile.argtypes = [c_char_p, c_int] - libane.ANE_Compile.restype = c_void_p - - libane.ANE_TensorCreate.restype = c_void_p - - libane.ANE_TensorData.argtypes = [c_void_p] - libane.ANE_TensorData.restype = POINTER(c_uint16) - - libane.ANE_Run.argtypes = [c_void_p]*4 - libane.ANE_Run.restype = c_int - - #libane.ANE_RegDebug.restype = c_char_p - - with open(basedir / "aneregs.json") as f: - aneregs = json.load(f) - -ANE_Struct = [ -# aneTD.Header - ("u32", 0x1C, "NextCommandOffset"), - -# KernelDMASrc @ section @ 0x2C len 0xF4 - # reloc 0x2c-0x34?? = weights - # u32[16] 0x34-0x74 = 0x80 | 1 if used - # u32[16] 0x74-0xB4 = - # u32[16] 0xB4-0xF4 = - -# Common @ section @ 0x128 len 0x3C (conv) - ("u16", 0x128, "InputWidth"), - ("u16", 0x12A, "InputHeight"), - ("u16", 0x12C, "InputDepth"), - - ("u32", 0x130, "InputOutputType"), # (OutputType * 0x10) | InputType - # UInt8 = 0, Int8 = 1, Float16 = 2 - - ("u32", 0x134, "InputChannels"), - ("u32", 0x138, "OutputChannels"), - - ("u16", 0x13C, "OutputWidth"), - ("u16", 0x13E, "OutputHeight"), - ("u16", 0x140, "OutputDepth"), - - ("u16", 0x144, "KernelSize"), # 0xa000 | (KernelHeight * 0x20) | KernelWidth - ("u16", 0x146, "Padding"), # 0x5000 | (PadTop * 0x40) | (PadLeft * 2) - - ("u16", 0x14C, "BatchSize"), - -# TileDMASrc @ section @ 0x16C len 0x6C (input) - # reloc 0x16c-0x174 = image - ("u32", 0x178, "InputRowStride"), - ("u32", 0x17C, "InputPlaneStride"), - ("u32", 0x180, "InputDepthStride"), - ("u32", 0x184, "InputBatchStride"), - - ("u8", 0x1A7, "InputInterleave"), - -# L2 @ section @ 0x1E0 len 0x44 - # [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214] = number of engines - # [0x1f0, 0x1f4, 0x1f8, 0x214] = engines for inconv? - # [0x21c, 0x220, 0x224] = engines for outconv? 
- -# NE @ section @ 0x22c len 0xC (scaling) - ("u16", 0x230, "BiasScalar"), - ("u16", 0x232, "ScaleScalar"), - -# section @ 0x240 len 0x10 - ("u16", 0x246, "NeuronType"), # 0x10 = copy, 0x11 = ReLU, 0x12 = custom - ("u32", 0x250, "PostScale"), - -# TileDMADst @ section @ 0x258 len 0x18 - -# HandleTileDmaDstConfig - # 0x258 -- *(uint *)(this + 0x334) = *(uint *)(this + 0x334) & 0xfffffc3f | 0xc0; - # (GetCacheHintRegisterValue & 0xf) << 6; - ("u32", 0x25C, "OutputOffset"), # offset into output buffer to write at? - - # 0x260 -- *(uint *)(this + 0x33c) = *(uint *)(this + 0x33c) & 0x3f | (int)uVar10 << 6; - ("u32", 0x260, "OutputRowStride"), - ("u32", 0x264, "OutputPlaneStride"), - ("u32", 0x268, "OutputDepthStride"), - ("u32", 0x26C, "OutputBatchStride"), - - # 0x270 -- *(uint *)(this + 0x34c) = *(uint *)(this + 0x34c) & 0xf0ffffff | 0x1000000; - # uVar6 = *(uint *)(this + 0x34c) & 0xffffcfcc | 0x2031; - # (ZinTensorDescriptorDmaInterleave & 0xf) << 0x18; - ("u8", 0x273, "OutputInterleave"), # i also have this at 0x211? 
-] - -ANE_Struct_Dict = {} -for typ, num, nam in ANE_Struct: - styp = {"u32": "I", "u16": "H", "u8": "B"}[typ] - ANE_Struct_Dict[nam] = (styp, num) - -class ANETensor: - def __init__(self, *shape): - self.shape = shape - self.dtype = np.float16 - self.sz = int(np.prod(shape)) - assert(self.sz <= 0x4000) - self.tt = libane.ANE_TensorCreate(self.sz, 1) - assert(self.tt is not None) - - def data(self): - data = libane.ANE_TensorData(self.tt) - assert(data is not None) - #print(hex(addressof(data.contents))) - buf = np.ctypeslib.as_array(data, shape=(self.sz,)) - ret = np.frombuffer(buf, dtype=self.dtype) - #print(ret.data) - return ret - -class ANE: - def __init__(self): - init_libane() - libane.ANE_Open() - - def compile(self, dat): - ret = libane.ANE_Compile(create_string_buffer(dat), len(dat)) - assert(ret is not None) - return ret - - def run(self, prog, tin, tout, tweights=None): - libane.ANE_Run(prog, tin.tt, tout.tt, tweights.tt if tweights is not None else 0) - - def tensor(self, shape): - return ANETensor(shape) - - def unpack(self, dat): - dat = struct.unpack("Q"*(len(dat)//8), dat) - ret = {} - for k,v in aneregs: - by,bi,sz = v - bi += (by%8)*8 - by //= 8 - rv = (dat[by] >> bi) & ((1 << sz)-1) - ret[k] = rv - return ret - - def pack(self, pk, dat): - dat = list(struct.unpack("Q"*(len(dat)//8), dat)) - for k,v in aneregs: - by,bi,sz = v - bi += (by%8)*8 - by //= 8 - dat[by] &= ~(((1 << sz)-1) << bi) - dat[by] |= pk[k] << bi - dat = struct.pack("Q"*len(dat), *dat) - return dat - - def debug(self, dat, mems=0): - add = [0x30, 0x1d4, 0x220, 0x29c, 0x2f0, 0x30c, 0x32c] - lens = [244, 60, 108, 68, 12, 16, 24] - ptr = 0x2b - ddat = dat[0:0x28] - for a, pm in zip(add, lens): - #assert pm == dat[ptr] - ddat += b"\x00" * (a-len(ddat)) - ddat += dat[ptr+1:ptr+1+pm+4] - ptr += pm+8 - ddat += b"\x00" * 0x100 - ret = collections.OrderedDict() - for ln in libane.ANE_RegDebug(0, create_string_buffer(ddat), mems).decode('utf-8').strip().split("\n"): - lnn = ln.split(" = ") 
- if len(lnn) == 2: - ret[lnn[0]] = int(lnn[1]) - return ret - - def filln(self, dat, nvdict, base=0x4000): - for n,v in nvdict.items(): - styp, num = ANE_Struct_Dict[n] - dat = self.fill(dat, [num], styp, v) - return dat - - def fill(self, dat, addrs, type, val, base=0x4000): - x = struct.pack(type, val) - for a in addrs: - dat[base+a:base+a+len(x)] = x - return dat - -if __name__ == "__main__": - ane = ANE() - - tin = ANETensor(16) - tout = ANETensor(16) - - tind = tin.data() - toutd = tout.data() - - tind[0:4] = [-1,1,-2,2] - print("** before **") - print(tind) - print(toutd) - - dat = open("../ops/relu.hwx", "rb").read() - md = dat[0x4000:0x4300] - dd = ane.unpack(md) - mdf = ane.pack(dd, md) - assert(md == mdf) - - comp = ane.compile(dat) - ret = ane.run(comp, tin, tout) - print("** after **") - print(tind) - print(toutd) - diff --git a/tinygrad_repo/extra/accel/ane/lib/aneregs.json b/tinygrad_repo/extra/accel/ane/lib/aneregs.json deleted file mode 100644 index 3862d69..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/aneregs.json +++ /dev/null @@ -1,2066 +0,0 @@ -[ - [ - "aneTD.Header[0].TID", - [ - 0, - 0, - 16 - ] - ], - [ - "aneTD.Header[0].NID", - [ - 2, - 0, - 8 - ] - ], - [ - "aneTD.Header[0].LNID", - [ - 3, - 0, - 1 - ] - ], - [ - "aneTD.Header[0].EON", - [ - 3, - 1, - 1 - ] - ], - [ - "aneTD.Header[1].ExeCycles", - [ - 4, - 0, - 16 - ] - ], - [ - "aneTD.Header[1].NextSize", - [ - 6, - 0, - 9 - ] - ], - [ - "aneTD.Header[2].LogEvents", - [ - 8, - 0, - 24 - ] - ], - [ - "aneTD.Header[3].Exceptions", - [ - 12, - 0, - 24 - ] - ], - [ - "aneTD.Header[4].DebugLogEvents", - [ - 16, - 0, - 24 - ] - ], - [ - "aneTD.Header[5].DebugExceptions", - [ - 20, - 0, - 24 - ] - ], - [ - "aneTD.Header[6].DisallowAbort", - [ - 25, - 0, - 1 - ] - ], - [ - "aneTD.Header[6].TDSkip", - [ - 25, - 1, - 1 - ] - ], - [ - "aneTD.Header[6].KPC", - [ - 25, - 2, - 1 - ] - ], - [ - "aneTD.Header[6].SPL", - [ - 25, - 3, - 1 - ] - ], - [ - "aneTD.Header[6].TSR", - [ - 25, - 4, - 1 - ] - 
], - [ - "aneTD.Header[6].SPC", - [ - 25, - 5, - 1 - ] - ], - [ - "aneTD.Header[6].DPC", - [ - 25, - 6, - 1 - ] - ], - [ - "aneTD.Header[6].TSE", - [ - 25, - 7, - 1 - ] - ], - [ - "aneTD.Header[6].NextPriority", - [ - 26, - 0, - 6 - ] - ], - [ - "aneTD.Header[6].TDE", - [ - 27, - 0, - 1 - ] - ], - [ - "aneTD.Header[6].SrcLoc", - [ - 27, - 4, - 1 - ] - ], - [ - "aneTD.Header[6].DstLoc", - [ - 27, - 5, - 1 - ] - ], - [ - "aneTD.Header[6].TQDis", - [ - 27, - 7, - 1 - ] - ], - [ - "aneTD.Header[7].NextPointer", - [ - 28, - 0, - 32 - ] - ], - [ - "aneTD.Header[8].RBase0", - [ - 32, - 0, - 5 - ] - ], - [ - "aneTD.Header[8].RBE0", - [ - 32, - 5, - 1 - ] - ], - [ - "aneTD.Header[8].RBase1", - [ - 32, - 6, - 5 - ] - ], - [ - "aneTD.Header[8].RBE1", - [ - 33, - 3, - 1 - ] - ], - [ - "aneTD.Header[8].WBase", - [ - 33, - 4, - 5 - ] - ], - [ - "aneTD.Header[8].WBE", - [ - 34, - 1, - 1 - ] - ], - [ - "aneTD.Header[8].TBase", - [ - 34, - 2, - 5 - ] - ], - [ - "aneTD.Header[8].TBE", - [ - 34, - 7, - 1 - ] - ], - [ - "aneTD.Header[8].ENE", - [ - 35, - 0, - 3 - ] - ], - [ - "aneTD.Header[9].KBase0", - [ - 36, - 0, - 5 - ] - ], - [ - "aneTD.Header[9].KBE0", - [ - 36, - 5, - 1 - ] - ], - [ - "aneTD.Header[9].KBase1", - [ - 36, - 6, - 5 - ] - ], - [ - "aneTD.Header[9].KBE1", - [ - 37, - 3, - 1 - ] - ], - [ - "aneTD.Header[9].KBase2", - [ - 37, - 4, - 5 - ] - ], - [ - "aneTD.Header[9].KBE2", - [ - 38, - 1, - 1 - ] - ], - [ - "aneTD.Header[9].KBase3", - [ - 38, - 2, - 5 - ] - ], - [ - "aneTD.Header[9].KBE3", - [ - 38, - 7, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[0].En", - [ - 52, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[0].CrH", - [ - 52, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint", - [ - 52, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn", - [ - 55, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[1].En", - [ - 56, - 0, - 1 - ] - ], - [ - 
"aneRegs.KernelDMASrc.CoeffDMAConfig[1].CrH", - [ - 56, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint", - [ - 56, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn", - [ - 59, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[2].En", - [ - 60, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[2].CrH", - [ - 60, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint", - [ - 60, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn", - [ - 63, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[3].En", - [ - 64, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[3].CrH", - [ - 64, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[3].CacheHint", - [ - 64, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[3].PrefetchParticipateEn", - [ - 67, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[4].En", - [ - 68, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[4].CrH", - [ - 68, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[4].CacheHint", - [ - 68, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[4].PrefetchParticipateEn", - [ - 71, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[5].En", - [ - 72, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[5].CrH", - [ - 72, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[5].CacheHint", - [ - 72, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[5].PrefetchParticipateEn", - [ - 75, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[6].En", - [ - 76, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[6].CrH", - [ - 76, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[6].CacheHint", - [ - 76, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[6].PrefetchParticipateEn", - [ - 79, - 4, - 1 - ] - ], - [ - 
"aneRegs.KernelDMASrc.CoeffDMAConfig[7].En", - [ - 80, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[7].CrH", - [ - 80, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[7].CacheHint", - [ - 80, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[7].PrefetchParticipateEn", - [ - 83, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[8].En", - [ - 84, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[8].CrH", - [ - 84, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[8].CacheHint", - [ - 84, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[8].PrefetchParticipateEn", - [ - 87, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[9].En", - [ - 88, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[9].CrH", - [ - 88, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[9].CacheHint", - [ - 88, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[9].PrefetchParticipateEn", - [ - 91, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[10].En", - [ - 92, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[10].CrH", - [ - 92, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[10].CacheHint", - [ - 92, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[10].PrefetchParticipateEn", - [ - 95, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[11].En", - [ - 96, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[11].CrH", - [ - 96, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[11].CacheHint", - [ - 96, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[11].PrefetchParticipateEn", - [ - 99, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[12].En", - [ - 100, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[12].CrH", - [ - 100, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[12].CacheHint", - [ - 100, - 6, - 4 - ] - ], - [ - 
"aneRegs.KernelDMASrc.CoeffDMAConfig[12].PrefetchParticipateEn", - [ - 103, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[13].En", - [ - 104, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[13].CrH", - [ - 104, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[13].CacheHint", - [ - 104, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[13].PrefetchParticipateEn", - [ - 107, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[14].En", - [ - 108, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[14].CrH", - [ - 108, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[14].CacheHint", - [ - 108, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[14].PrefetchParticipateEn", - [ - 111, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[15].En", - [ - 112, - 0, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[15].CrH", - [ - 112, - 4, - 2 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[15].CacheHint", - [ - 112, - 6, - 4 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffDMAConfig[15].PrefetchParticipateEn", - [ - 115, - 4, - 1 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[0].Addr", - [ - 116, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[1].Addr", - [ - 120, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[2].Addr", - [ - 124, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[3].Addr", - [ - 128, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[4].Addr", - [ - 132, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[5].Addr", - [ - 136, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[6].Addr", - [ - 140, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[7].Addr", - [ - 144, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[8].Addr", - [ - 148, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[9].Addr", - [ - 152, - 6, - 26 - ] - ], - [ - 
"aneRegs.KernelDMASrc.CoeffBaseAddr[10].Addr", - [ - 156, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[11].Addr", - [ - 160, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[12].Addr", - [ - 164, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[13].Addr", - [ - 168, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[14].Addr", - [ - 172, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBaseAddr[15].Addr", - [ - 176, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[0].MemBfrSize", - [ - 180, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[1].MemBfrSize", - [ - 184, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[2].MemBfrSize", - [ - 188, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[3].MemBfrSize", - [ - 192, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[4].MemBfrSize", - [ - 196, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[5].MemBfrSize", - [ - 200, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[6].MemBfrSize", - [ - 204, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[7].MemBfrSize", - [ - 208, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[8].MemBfrSize", - [ - 212, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[9].MemBfrSize", - [ - 216, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[10].MemBfrSize", - [ - 220, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[11].MemBfrSize", - [ - 224, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[12].MemBfrSize", - [ - 228, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[13].MemBfrSize", - [ - 232, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[14].MemBfrSize", - [ - 236, - 6, - 26 - ] - ], - [ - "aneRegs.KernelDMASrc.CoeffBfrSize[15].MemBfrSize", - [ - 240, - 6, - 26 - ] - ], - [ - "aneRegs.Common.InDim.Win", - [ - 296, - 0, - 15 - ] - ], - [ - "aneRegs.Common.InDim.Hin", - [ - 298, 
- 0, - 15 - ] - ], - [ - "aneRegs.Common.ChCfg.InFmt", - [ - 304, - 0, - 2 - ] - ], - [ - "aneRegs.Common.ChCfg.OutFmt", - [ - 304, - 4, - 2 - ] - ], - [ - "aneRegs.Common.Cin.Cin", - [ - 308, - 0, - 17 - ] - ], - [ - "aneRegs.Common.Cout.Cout", - [ - 312, - 0, - 17 - ] - ], - [ - "aneRegs.Common.OutDim.Wout", - [ - 316, - 0, - 15 - ] - ], - [ - "aneRegs.Common.OutDim.Hout", - [ - 318, - 0, - 15 - ] - ], - [ - "aneRegs.Common.ConvCfg.Kw", - [ - 324, - 0, - 5 - ] - ], - [ - "aneRegs.Common.ConvCfg.Kh", - [ - 324, - 5, - 5 - ] - ], - [ - "aneRegs.Common.ConvCfg.OCGSize", - [ - 325, - 2, - 3 - ] - ], - [ - "aneRegs.Common.ConvCfg.Sx", - [ - 325, - 5, - 2 - ] - ], - [ - "aneRegs.Common.ConvCfg.Sy", - [ - 325, - 7, - 2 - ] - ], - [ - "aneRegs.Common.ConvCfg.Px", - [ - 326, - 1, - 5 - ] - ], - [ - "aneRegs.Common.ConvCfg.Py", - [ - 326, - 6, - 5 - ] - ], - [ - "aneRegs.Common.ConvCfg.Ox", - [ - 327, - 4, - 2 - ] - ], - [ - "aneRegs.Common.ConvCfg.Oy", - [ - 327, - 6, - 2 - ] - ], - [ - "aneRegs.Common.GroupConvCfg.NumGroups", - [ - 332, - 0, - 13 - ] - ], - [ - "aneRegs.Common.GroupConvCfg.UnicastEn", - [ - 333, - 6, - 1 - ] - ], - [ - "aneRegs.Common.GroupConvCfg.ElemMultMode", - [ - 333, - 7, - 1 - ] - ], - [ - "aneRegs.Common.GroupConvCfg.UnicastCin", - [ - 334, - 0, - 16 - ] - ], - [ - "aneRegs.Common.TileCfg.TileHeight", - [ - 336, - 0, - 15 - ] - ], - [ - "aneRegs.Common.Cfg.SmallSourceMode", - [ - 348, - 2, - 1 - ] - ], - [ - "aneRegs.Common.Cfg.ShPref", - [ - 349, - 0, - 3 - ] - ], - [ - "aneRegs.Common.Cfg.ShMin", - [ - 349, - 4, - 3 - ] - ], - [ - "aneRegs.Common.Cfg.ShMax", - [ - 350, - 0, - 3 - ] - ], - [ - "aneRegs.Common.Cfg.ActiveNE", - [ - 350, - 3, - 3 - ] - ], - [ - "aneRegs.Common.Cfg.ContextSwitchIn", - [ - 350, - 6, - 1 - ] - ], - [ - "aneRegs.Common.Cfg.ContextSwitchOut", - [ - 351, - 0, - 1 - ] - ], - [ - "aneRegs.Common.Cfg.AccDoubleBufEn", - [ - 351, - 2, - 1 - ] - ], - [ - "aneRegs.Common.TaskInfo.TaskID", - [ - 352, - 0, - 16 - ] - ], - [ - 
"aneRegs.Common.TaskInfo.TaskQ", - [ - 354, - 0, - 4 - ] - ], - [ - "aneRegs.Common.TaskInfo.NID", - [ - 354, - 4, - 8 - ] - ], - [ - "aneRegs.Common.DPE.Category", - [ - 356, - 0, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.En", - [ - 364, - 0, - 1 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.CrH", - [ - 364, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.CacheHint", - [ - 364, - 6, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.CacheHintReuse", - [ - 365, - 2, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.CacheHintNoReuse", - [ - 365, - 6, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.DMAConfig.DependencyMode", - [ - 366, - 2, - 2 - ] - ], - [ - "aneRegs.TileDMASrc.BaseAddr.Addr", - [ - 372, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMASrc.RowStride.Stride", - [ - 376, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMASrc.PlaneStride.PlaneStride", - [ - 380, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMASrc.DepthStride.Stride", - [ - 384, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMASrc.GroupStride.Stride", - [ - 388, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.FmtMode", - [ - 420, - 0, - 2 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.Truncate", - [ - 420, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.Shift", - [ - 421, - 0, - 1 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.MemFmt", - [ - 421, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.OffsetCh", - [ - 422, - 0, - 3 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.Interleave", - [ - 423, - 0, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.Fmt.CmpVec", - [ - 423, - 4, - 4 - ] - ], - [ - "aneRegs.TileDMASrc.PixelOffset[0].Offset", - [ - 444, - 0, - 16 - ] - ], - [ - "aneRegs.TileDMASrc.PixelOffset[1].Offset", - [ - 448, - 0, - 16 - ] - ], - [ - "aneRegs.TileDMASrc.PixelOffset[2].Offset", - [ - 452, - 0, - 16 - ] - ], - [ - "aneRegs.TileDMASrc.PixelOffset[3].Offset", - [ - 456, - 0, - 16 - ] - ], - [ - "aneRegs.L2.L2Cfg.InputReLU", - [ - 480, - 0, - 1 - ] - ], - [ - "aneRegs.L2.L2Cfg.PaddingMode", - [ - 480, - 2, - 2 - ] - ], - [ - 
"aneRegs.L2.SourceCfg.SourceType", - [ - 484, - 0, - 2 - ] - ], - [ - "aneRegs.L2.SourceCfg.Dependent", - [ - 484, - 2, - 2 - ] - ], - [ - "aneRegs.L2.SourceCfg.AliasConvSrc", - [ - 484, - 4, - 1 - ] - ], - [ - "aneRegs.L2.SourceCfg.AliasConvRslt", - [ - 484, - 5, - 1 - ] - ], - [ - "aneRegs.L2.SourceCfg.DMAFmt", - [ - 484, - 6, - 2 - ] - ], - [ - "aneRegs.L2.SourceCfg.DMAInterleave", - [ - 485, - 0, - 4 - ] - ], - [ - "aneRegs.L2.SourceCfg.DMACmpVec", - [ - 485, - 4, - 4 - ] - ], - [ - "aneRegs.L2.SourceCfg.DMAOffsetCh", - [ - 486, - 0, - 3 - ] - ], - [ - "aneRegs.L2.SourceCfg.AliasPlanarSrc", - [ - 486, - 4, - 1 - ] - ], - [ - "aneRegs.L2.SourceCfg.AliasPlanarRslt", - [ - 486, - 6, - 1 - ] - ], - [ - "aneRegs.L2.SourceBase.Addr", - [ - 488, - 4, - 17 - ] - ], - [ - "aneRegs.L2.SourceChannelStride.Stride", - [ - 492, - 4, - 17 - ] - ], - [ - "aneRegs.L2.SourceRowStride.Stride", - [ - 496, - 4, - 17 - ] - ], - [ - "aneRegs.L2.ResultCfg.ResultType", - [ - 528, - 0, - 2 - ] - ], - [ - "aneRegs.L2.ResultCfg.L2BfrMode", - [ - 528, - 3, - 1 - ] - ], - [ - "aneRegs.L2.ResultCfg.AliasConvSrc", - [ - 528, - 4, - 1 - ] - ], - [ - "aneRegs.L2.ResultCfg.AliasConvRslt", - [ - 528, - 5, - 1 - ] - ], - [ - "aneRegs.L2.ResultCfg.DMAFmt", - [ - 528, - 6, - 2 - ] - ], - [ - "aneRegs.L2.ResultCfg.DMAInterleave", - [ - 529, - 0, - 4 - ] - ], - [ - "aneRegs.L2.ResultCfg.DMACmpVec", - [ - 529, - 4, - 4 - ] - ], - [ - "aneRegs.L2.ResultCfg.DMAOffsetCh", - [ - 530, - 0, - 3 - ] - ], - [ - "aneRegs.L2.ResultCfg.AliasPlanarSrc", - [ - 530, - 4, - 1 - ] - ], - [ - "aneRegs.L2.ResultCfg.AliasPlanarRslt", - [ - 530, - 6, - 1 - ] - ], - [ - "aneRegs.L2.ResultBase.Addr", - [ - 532, - 4, - 17 - ] - ], - [ - "aneRegs.L2.ConvResultChannelStride.Stride", - [ - 536, - 4, - 17 - ] - ], - [ - "aneRegs.L2.ConvResultRowStride.Stride", - [ - 540, - 4, - 17 - ] - ], - [ - "aneRegs.NE.KernelCfg.KernelFmt", - [ - 576, - 0, - 2 - ] - ], - [ - "aneRegs.NE.KernelCfg.PalettizedEn", - [ - 576, - 2, - 1 - ] - ], 
- [ - "aneRegs.NE.KernelCfg.PalettizedBits", - [ - 576, - 4, - 4 - ] - ], - [ - "aneRegs.NE.KernelCfg.SparseFmt", - [ - 577, - 0, - 1 - ] - ], - [ - "aneRegs.NE.KernelCfg.GroupKernelReuse", - [ - 577, - 2, - 1 - ] - ], - [ - "aneRegs.NE.MACCfg.OpMode", - [ - 580, - 0, - 3 - ] - ], - [ - "aneRegs.NE.MACCfg.KernelMode", - [ - 580, - 3, - 1 - ] - ], - [ - "aneRegs.NE.MACCfg.BiasMode", - [ - 580, - 4, - 1 - ] - ], - [ - "aneRegs.NE.MACCfg.MatrixBiasEn", - [ - 580, - 6, - 1 - ] - ], - [ - "aneRegs.NE.MACCfg.BinaryPoint", - [ - 581, - 0, - 5 - ] - ], - [ - "aneRegs.NE.MACCfg.PostScaleMode", - [ - 581, - 6, - 1 - ] - ], - [ - "aneRegs.NE.MACCfg.NonlinearMode", - [ - 582, - 0, - 2 - ] - ], - [ - "aneRegs.NE.MatrixVectorBias.MatrixVectorBias", - [ - 584, - 0, - 16 - ] - ], - [ - "aneRegs.NE.AccBias.AccBias", - [ - 588, - 0, - 16 - ] - ], - [ - "aneRegs.NE.AccBias.AccBiasShift", - [ - 590, - 0, - 5 - ] - ], - [ - "aneRegs.NE.PostScale.PostScale", - [ - 592, - 0, - 16 - ] - ], - [ - "aneRegs.NE.PostScale.PostRightShift", - [ - 594, - 0, - 5 - ] - ], - [ - "aneRegs.TileDMADst.DMAConfig.En", - [ - 600, - 0, - 1 - ] - ], - [ - "aneRegs.TileDMADst.DMAConfig.CrH", - [ - 600, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMADst.DMAConfig.CacheHint", - [ - 600, - 6, - 4 - ] - ], - [ - "aneRegs.TileDMADst.DMAConfig.L2BfrMode", - [ - 603, - 2, - 1 - ] - ], - [ - "aneRegs.TileDMADst.DMAConfig.BypassEOW", - [ - 603, - 3, - 1 - ] - ], - [ - "aneRegs.TileDMADst.BaseAddr.Addr", - [ - 604, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMADst.RowStride.RowStride", - [ - 608, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMADst.PlaneStride.PlaneStride", - [ - 612, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMADst.DepthStride.DepthStride", - [ - 616, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMADst.GroupStride.GroupStride", - [ - 620, - 6, - 26 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.FmtMode", - [ - 624, - 0, - 2 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.Truncate", - [ - 624, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.Shift", 
- [ - 625, - 0, - 1 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.MemFmt", - [ - 625, - 4, - 2 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.OffsetCh", - [ - 626, - 0, - 3 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.ZeroPadLast", - [ - 626, - 4, - 1 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.ZeroPadFirst", - [ - 626, - 5, - 1 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.CmpVecFill", - [ - 626, - 6, - 1 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.Interleave", - [ - 627, - 0, - 4 - ] - ], - [ - "aneRegs.TileDMADst.Fmt.CmpVec", - [ - 627, - 4, - 4 - ] - ] -] \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/lib/build.sh b/tinygrad_repo/extra/accel/ane/lib/build.sh deleted file mode 100755 index 0b652ce..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -clang++ ane.mm --shared -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit -framework ANECompiler -o libane.dylib - diff --git a/tinygrad_repo/extra/accel/ane/lib/entitlements.xml b/tinygrad_repo/extra/accel/ane/lib/entitlements.xml deleted file mode 120000 index 6cdb870..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/entitlements.xml +++ /dev/null @@ -1 +0,0 @@ -../3_run/entitlements.xml \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/lib/h11ane.h b/tinygrad_repo/extra/accel/ane/lib/h11ane.h deleted file mode 120000 index 53c61bf..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/h11ane.h +++ /dev/null @@ -1 +0,0 @@ -../3_run/h11ane.h \ No newline at end of file diff --git a/tinygrad_repo/extra/accel/ane/lib/sign_python.sh b/tinygrad_repo/extra/accel/ane/lib/sign_python.sh deleted file mode 100755 index 051d75c..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/sign_python.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -codesign --force --entitlements entitlements.xml -s "Taylor Swift Child" 
/opt/homebrew/Cellar/python@3.9/3.9.1_1/Frameworks/Python.framework/Versions/3.9/Resources/Python.app/Contents/MacOS/Python - diff --git a/tinygrad_repo/extra/accel/ane/lib/testconv.py b/tinygrad_repo/extra/accel/ane/lib/testconv.py deleted file mode 100755 index 3b8542d..0000000 --- a/tinygrad_repo/extra/accel/ane/lib/testconv.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -import time -from ane import ANE, ANETensor - -def benchmark(ane): - tin = ANETensor(512*0x20) - tout = ANETensor(512*0x20) - dat = open("../ops/gemm.hwx", "rb").read() - for k,v in ane.debug(dat[0x4000:0x4300], 16).items(): - print(k,v) - comp = ane.compile(dat) - - st = time.time() - for i in range(1000): - ret = ane.run(comp, tin, tout) - et = time.time() - ts = (et-st) - ops = 1000*512*512*2 - - print("%.2f ms, %.2f gigaops/sec" % (ts*1000, ops*1e-9/ts)) - - -if __name__ == "__main__": - ane = ANE() - - # 0x20 per row - tin = ANETensor(0x60) - tout = ANETensor(0x60) - tw = ANETensor(0x60) - - tind = tin.data() - toutd = tout.data() - twd = tw.data() - - #tind[0:4] = [-1,1,-2,2] - tind[0] = 1 - tind[0x20] = -2 - tind[0x40] = 3 - - # toutd[0] = \ - # tind[0] * twd[0] + \ - # tind[0x20] + twd[1] + \ - # tind[0x40] + twd[2] - - twd[0] = 4 - twd[1] = 0x100 - - twd[0x20] = 5 - twd[0x21] = 5 - twd[0x22] = 5 - - twd[0x40] = 12 - - print("** before **") - print(tind) - print(toutd) - - #benchmark(ane) - #exit(0) - - """ - dat = list(open("../ops/sum.hwx", "rb").read()) - dat = bytes(dat) - for k,v in ane.debug(dat[0x4000:0x4300], 16).items(): - print(k,v) - comp = ane.compile(dat) - ret = ane.run(comp, tin, tout, tw) - """ - - datb = open("../ops/sum.hwx", "rb").read() - dat = open("../ops/conv.hwx", "rb").read() - dd = ane.unpack(dat[0x4000:0x4300]) - # use the 3rd arg as the weights - dd["aneTD.Header[9].KBase0"] = 6 - dd["aneRegs.NE.PostScale.PostScale"] = 0x3c00 - #dd["aneRegs.L2.L2Cfg.InputReLU"] = 1 - #dd["aneRegs.NE.MACCfg.NonlinearMode"] = 1 - #dd["aneRegs.TileDMADst.Fmt.MemFmt"] 
= 0 - #dd["aneRegs.L2.ResultBase.Addr"] = 0 - #dd["aneRegs.Common.ChCfg.InFmt"] = 1 - #dd["aneRegs.TileDMADst.Fmt.ZeroPadFirst"] = 0 - #dd["aneRegs.TileDMADst.DMAConfig.En"] = 0 - for k,v in dd.items(): - print(k,v) - dat = datb[:0x4000] + ane.pack(dd, dat[0x4000:0x4300]) + datb[0x4300:] - comp = ane.compile(dat) - ret = ane.run(comp, tin, tout, tw) - - print("** after **") - print(tind) - print(toutd) diff --git a/tinygrad_repo/extra/accel/ane/ops/concat.hwx b/tinygrad_repo/extra/accel/ane/ops/concat.hwx deleted file mode 100644 index 0f98893..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/concat.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/ops/conv.hwx b/tinygrad_repo/extra/accel/ane/ops/conv.hwx deleted file mode 100644 index ea5905f..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/conv.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/ops/gemm.hwx b/tinygrad_repo/extra/accel/ane/ops/gemm.hwx deleted file mode 100644 index bc27a27..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/gemm.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/ops/relu.hwx b/tinygrad_repo/extra/accel/ane/ops/relu.hwx deleted file mode 100644 index dc54cc5..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/relu.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/ops/sigmoid.hwx b/tinygrad_repo/extra/accel/ane/ops/sigmoid.hwx deleted file mode 100644 index 7cc7934..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/sigmoid.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/ops/sum.hwx b/tinygrad_repo/extra/accel/ane/ops/sum.hwx deleted file mode 100644 index 1b1855b..0000000 Binary files a/tinygrad_repo/extra/accel/ane/ops/sum.hwx and /dev/null differ diff --git a/tinygrad_repo/extra/accel/ane/tinygrad/ops_ane.py b/tinygrad_repo/extra/accel/ane/tinygrad/ops_ane.py deleted file mode 100644 index b9b792d..0000000 --- 
a/tinygrad_repo/extra/accel/ane/tinygrad/ops_ane.py +++ /dev/null @@ -1,39 +0,0 @@ -from functools import lru_cache -from .tensor import Device, Function, register - -@lru_cache -def compile_wrapper(ane, dat): - return ane.compile(dat) - -def roundup(x, v): - return x + (v-x)%v - -@lru_cache -def compile_relu(ane, sz): - dat = list(open("accel/ane/ops/relu.hwx", "rb").read()) - # TODO: make this all nice and once - # number of engines? (max 0x100) - l2_stride = max(0x100, roundup(sz*2, 0x10)) - # 0x1ec = L2.SourceChannelStride.Stride, 0x1f0 = L2.SourceRowStride.Stride - # 0x1f4, 0x1f8? - # 0x214 = L2.ResultBase.Addr - dat = ane.fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214], "I", l2_stride) - stride = roundup(sz*2, 0x40) - dat = ane.filln(dat, { - "NeuronType": 0x11, # 0x10 makes this a copy, 0x11 = ReLU, 0x12 = crash - "InputWidth": sz, "OutputWidth": sz, - "InputRowStride": stride, "InputPlaneStride": stride, "InputDepthStride": stride, - "OutputRowStride": stride, "OutputPlaneStride": stride, "OutputDepthStride": stride, - }) - return compile_wrapper(ane, bytes(dat)) - -class ReLU(Function): - def forward(ctx, input): - ret = ctx.ane.tensor(input.shape) - ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret) - return ret - - def backward(ctx, grad_output): - return 0 - -register('relu', ReLU, device=Device.ANE) diff --git a/tinygrad_repo/extra/accel/intel/.gitignore b/tinygrad_repo/extra/accel/intel/.gitignore deleted file mode 100644 index cba7efc..0000000 --- a/tinygrad_repo/extra/accel/intel/.gitignore +++ /dev/null @@ -1 +0,0 @@ -a.out diff --git a/tinygrad_repo/extra/accel/intel/README b/tinygrad_repo/extra/accel/intel/README deleted file mode 100644 index 6a6ed7c..0000000 --- a/tinygrad_repo/extra/accel/intel/README +++ /dev/null @@ -1,2 +0,0 @@ -source /opt/intel/oneapi/compiler/latest/env/vars.sh -sycl-ls diff --git a/tinygrad_repo/extra/accel/intel/benchmark_matmul.py b/tinygrad_repo/extra/accel/intel/benchmark_matmul.py deleted file mode 100644 index 
5999039..0000000 --- a/tinygrad_repo/extra/accel/intel/benchmark_matmul.py +++ /dev/null @@ -1,57 +0,0 @@ -import time - -onnx_path = "/tmp/my.onnx" -N = 2048 -CNT = 400 - -""" -import torch -import torch.nn as nn -#dtype = torch.bfloat16 -dtype = torch.float32 -class MatMul(nn.Module): - def __init__(self): - super().__init__() - self.a = nn.Linear(N, N, bias=False) - def forward(self, x): - x = x.to(dtype) - for i in range(CNT): x = self.a(x).relu() - return x.to(torch.float32) - -torch_model = MatMul().to(dtype) -torch.onnx.export(torch_model, torch.randn(N, N), onnx_path) -""" - -""" -import onnx -from tinygrad.tensor import Tensor -from extra.onnx import get_run_onnx -out = get_run_onnx(onnx.load(onnx_path))({"onnx::MatMul_0": Tensor.zeros(N, N)}) -for x in out.values(): x.realize() -""" - -from openvino.runtime import Core -core = Core() -devices = core.available_devices -for device in devices: - device_name = core.get_property(device, "FULL_DEVICE_NAME") - print(f"{device}: {device_name}") -model = core.read_model(onnx_path) -compiled_model = core.compile_model(model, device_name='GPU.0') -print(compiled_model) -ireq = compiled_model.create_infer_request() -for model_input in compiled_model.inputs: - tensor = ireq.get_tensor(model_input) - tensor.data[:] = 2 - print(tensor) -print("request") -ireq.infer() -ireq.infer() -print("did one") - -REPS = 20 -st = time.perf_counter() -for i in range(REPS): ireq.infer() -et = time.perf_counter() - st -print(f"{et*1000:.2f} ms {(CNT*N*N*N*REPS*2/et)*1e-9:.2f} GFLOPS") - diff --git a/tinygrad_repo/extra/accel/intel/go.sh b/tinygrad_repo/extra/accel/intel/go.sh deleted file mode 100755 index 8c67088..0000000 --- a/tinygrad_repo/extra/accel/intel/go.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -e -/opt/intel/oneapi/compiler/latest/linux/bin-llvm/clang++ joint_matrix_bfloat16.cpp -fsycl -SYCL_PI_TRACE=1 ./a.out diff --git a/tinygrad_repo/extra/accel/intel/joint_matrix_bfloat16.cpp 
b/tinygrad_repo/extra/accel/intel/joint_matrix_bfloat16.cpp deleted file mode 100644 index b21d608..0000000 --- a/tinygrad_repo/extra/accel/intel/joint_matrix_bfloat16.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix - -// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out - -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::bfloat16; - -//#define SG_SZ 16 -#define SG_SZ 8 - -#define TM 8 -#define TN SG_SZ -//#define TK 16 -#define TK 16 - -#define BF16_EPSILON 0.00781250 - -template struct big_matrix { -private: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) {} -}; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - auto program = [&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto 
global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix sub_b; - joint_matrix sub_c; - joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); - - for (int k = 0; k < K / TK; k += 1) { // - joint_matrix_load(sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, K); - joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, N * 2); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); - }); // parallel for - }; - - queue q; - auto start = std::chrono::steady_clock::now(); - auto e = q.submit(program); - auto submit = std::chrono::steady_clock::now(); - e.wait(); - auto end = std::chrono::steady_clock::now(); - std::cout << "submit: " << std::chrono::duration_cast(submit - start).count() << " ms" << std::endl; - std::cout << "compute: " << std::chrono::duration_cast(end - submit).count() << " ms" << std::endl; - - // ahh, freeing is slow -} - -//#define SCALE 1024 -//#define SCALE 64 -#define SCALE 256 -static constexpr size_t MATRIX_M = TM * SCALE; -static constexpr size_t MATRIX_N = TN * SCALE; -static constexpr size_t MATRIX_K = TK * SCALE; -bfloat16 A[MATRIX_M][MATRIX_K]; -bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; -float C[MATRIX_M][MATRIX_N]; -float D[MATRIX_M][MATRIX_N]; - -float make_fp32(bfloat16 x) { - unsigned int y = *((int *)&x); - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; -} - -void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, - int K) { - for (int m = 0; m < M; m++) - for 
(int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - // Because B was assumed VNNIed - bfloat16 *va = (bfloat16 *)(A_mem + m * K + k); - bfloat16 *vb = (bfloat16 *)(B_mem + k * N + n); - float acc = *((float *)(C_mem + m * N + n)); - for (int i = 0; i < 2; i++) { - acc += (make_fp32(va[i]) * make_fp32(vb[i])); - } - *((float *)(C_mem + m * N + n)) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = bfloat16(1.0f * (i + j)); - } - } - for (int i = 0; i < MATRIX_K / 2; i++) { - for (int j = 0; j < MATRIX_N * 2; j++) { - B[i][j] = bfloat16(2.0f * i + 3.0f * j); - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1.0; - D[i][j] = 1.0; - } - } - - std::cout << "M" << MATRIX_M << "N" << MATRIX_N << "K" << MATRIX_K << std::endl; - - big_matrix MC((float *)&C); - big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); - - matrix_multiply(MC, MA, MB); - - /*start = std::chrono::steady_clock::now(); - matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, MATRIX_N, MATRIX_K / 2); - end = std::chrono::steady_clock::now(); - std::cout << "Elapsed time in milliseconds (reference): " << std::chrono::duration_cast(end - start).count() << " ms" << std::endl; - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON) - res = false; - } - } - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res;*/ - - return 0; -} - diff --git a/tinygrad_repo/extra/accel/tpu/README.md b/tinygrad_repo/extra/accel/tpu/README.md deleted file mode 100644 index 3d06e9f..0000000 --- a/tinygrad_repo/extra/accel/tpu/README.md +++ /dev/null @@ -1,127 +0,0 @@ -Google's TPU --------------------------------------------------------------------- - -We document the Google TPU v2/v3 in order to support it in tinygrad without the XLA compiler. - -## Creating a Google Cloud TPU VM - -This costs $4.50/hr for a TPUv2-8 machine, the cheapest VM. - -```bash -gcloud alpha compute tpus tpu-vm create test --zone=us-central1-b --accelerator-type=v2-8 --version=v2-alpha -gcloud alpha compute tpus tpu-vm ssh test --zone us-central1-b -# and for when you are done -gcloud alpha compute tpus tpu-vm delete test --zone us-central1-b -gcloud alpha compute tpus tpu-vm list --zone us-central1-b -``` - -Aside from the usual VM stuff, there's 4 accelerators on the PCI-E bus. (v2-8 is 4 chips with 2 cores each) - -``` -# lspci -00:04.0 Unassigned class [ff00]: Google, Inc. Device 0027 -00:05.0 Unassigned class [ff00]: Google, Inc. Device 0027 -00:06.0 Unassigned class [ff00]: Google, Inc. Device 0027 -00:07.0 Unassigned class [ff00]: Google, Inc. Device 0027 -``` - -They show up in `/sys/class/accel` (tons of files here) and the driver lives in `/lib/libtpu.so`. The devices are in `/dev/accel[0-3]`, and a bunch of stuff is mmaped. They are "ba16c7433" chips. - -We grab the minimal TPU [example from TensorFlow](https://github.com/tensorflow/tensorflow/blob/695b4c93d5da7277eb845937b79b66f9f363ed94/tensorflow/compiler/xla/python/tpu_driver/client/libtpu_client.c). 
When the compiler runs, it produces tons of great logs in `/tmp/tpu_logs` - -```bash -cd tfexample -gcc -o libtpu_client libtpu_client.c -ltpu -TPU_VLOG_LEVEL=99 ./libtpu_client -``` - -From these logs, we find the "LLO Instructions" - -## VLIW Instruction (322b VLIW bundle) - -``` - spare : 0 (0,1) - vex_mxu : 0 (1,1) -* 1 misc slot - msc_targ : 0 (2,3) - msc_opnd : 0 (5,3) - msc_op : 0 (8,5) - msc_pred : 31 (13,5) -* 2 matrix slots (push, pop) - vres_dest : 28 (18,2) - vres_op : 28 (20,2) - vres_pred : 31 (22,5) - vex_source : 28 (27,2) - vex_subop : 24 (29,3) - vex_op : 24 (32,3) - vex_pred : 31 (35,5) -* 4 vector slots (2 for load/store) - vld_ttu : 30 (40,1) - vld_stride : 24 (41,3) - vld_offset : 24 (44,2) - vld_base : 24 (46,2) - vld_submsk : 24 (48,3) - vld_dest : 0 (51,5) - vld_op : 0 (56,2) - vld_pred : 31 (58,5) - vst_ttu : 30 (63,1) - vst_iar : 30 (64,1) - vst_value_two : 24 (65,3) - vst_offset : 24 (68,2) - vst_base : 24 (70,2) - vst_value_one : 24 (72,3) - vst_source : 0 (75,5) - vst_op : 0 (80,5) - vst_pred : 31 (85,5) -* 4 vector slots (2 for ALU) - v1_dest : 0 (90,5) - v1_y_vreg : 0 (95,5) - v1_y_src : 0 (100,5) - v1_x : 0 (105,5) - v1_op : 0 (110,6) - v1_pred : 31 (116,5) - v0_dest : 0 (121,5) - v0_y_vreg : 0 (126,5) - v0_y_src : 0 (131,5) - v0_x : 0 (136,5) - v0_op : 0 (141,6) - v0_pred : 31 (147,5) -* 3 scalar registers copied in to the vector units? - vs2 : 0 (152,5) - vs1 : 0 (157,5) - vs0 : 0 (162,5) -* 6 immediates (16-bit each, two can be merged for 32) - imm_5 : 0 (167,16) - imm_4 : 0 (183,16) - imm_3 : 0 (199,16) - imm_2 : 0 (215,16) - imm_1 : 0 (231,16) - imm_0 : 0 (247,16) -* ttu? what's a ttu? 
- ttu_set_btr : 0 (263,1) - ttu_iterate : 0 (264,1) - ttu_row : 0 (265,3) -* 2 scalar slots - s1_dest : 0 (268,5) - s1_y : 0 (273,6) - s1_x : 0 (279,5) - s1_op : 0 (284,6) - s1_pred : 31 (290,5) - s0_dest : 0 (295,5) - s0_y : 0 (300,6) - s0_x : 0 (306,5) - s0_op : 0 (311,6) - s0_pred : 15 (317,5) -``` - -## Running a Program (WIP) - -Our goal is to run a program on TPU without the driver. - -``` -... -openat(AT_FDCWD, "/dev/accel3", O_RDWR) = 184 -mmap(NULL, 27799736, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_LOCKED, 184, 0) = 0x7f59a74b3000 -# size is 0x1a830b8, aka 28MB -``` - diff --git a/tinygrad_repo/extra/accel/tpu/logs/tpu_driver.t1v-n-852cd0d5-w-0.taylor.log.INFO.20210619-062914.26926.gz b/tinygrad_repo/extra/accel/tpu/logs/tpu_driver.t1v-n-852cd0d5-w-0.taylor.log.INFO.20210619-062914.26926.gz deleted file mode 100644 index dbb64ab..0000000 Binary files a/tinygrad_repo/extra/accel/tpu/logs/tpu_driver.t1v-n-852cd0d5-w-0.taylor.log.INFO.20210619-062914.26926.gz and /dev/null differ diff --git a/tinygrad_repo/extra/accel/tpu/tfexample/libtpu.h b/tinygrad_repo/extra/accel/tpu/tfexample/libtpu.h deleted file mode 100644 index 7a7c71f..0000000 --- a/tinygrad_repo/extra/accel/tpu/tfexample/libtpu.h +++ /dev/null @@ -1,303 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_ - -#include -#include - -#define TPUDRIVER_CAPI_EXPORT __attribute__((visibility("default"))) - -#ifdef __cplusplus -extern "C" { -#endif - -// ------------------- TPU Driver Support ----------------------- - -struct TpuDriverFn; - -typedef struct TpuDriver TpuDriver; - -typedef struct TpuEvent TpuEvent; - -typedef struct TpuBufferHandleInternal TpuBufferHandleInternal; - -typedef struct TpuCompiledProgramHandleInternal - TpuCompiledProgramHandleInternal; - -typedef struct TpuLoadedProgramHandleInternal TpuLoadedProgramHandleInternal; - -typedef struct TpuBufferHandle { - TpuBufferHandleInternal* internal_handle; - TpuEvent* event; - int64_t size_in_bytes; -} TpuBufferHandle; - -typedef struct TpuCompiledProgramHandle { - TpuCompiledProgramHandleInternal* internal_handle; - TpuEvent* event; -} TpuCompiledProgramHandle; - -typedef struct TpuLoadedProgramHandle { - TpuLoadedProgramHandleInternal* internal_handle; - TpuEvent* event; -} TpuLoadedProgramHandle; - -// HloProto is a serialized xla::HloProto buffer. -typedef struct HloProto { - void* buffer; - int32_t size; -} HloProto; - -// DeviceAssignment is a serialized xla::DeviceAssignmentProto buffer. 
-typedef struct DeviceAssignment { - void* bytes; - int32_t size; -} DeviceAssignment; - -typedef struct TpuStatus { - int32_t code; - char* msg; -} TpuStatus; - -typedef struct CompiledProgramShape { - struct TpuStatus* status; - void* bytes; - int32_t size; -} CompiledProgramShape; - -typedef struct TpuAllocationShape { - void* bytes; - int32_t size; -} TpuAllocationShape; - -typedef struct TpuSystemInfo { - void* bytes; - int32_t size; -} TpuSystemInfo; - -typedef void(PrototypeTpuDriver_Initialize)(struct TpuDriverFn* driver_fn, - bool initialize); -typedef struct TpuDriver*(PrototypeTpuDriver_Open)(const char* worker); -typedef void(PrototypeTpuDriver_Close)(struct TpuDriver* driver); -typedef struct TpuStatus*(PrototypeTpuDriver_Reset)(struct TpuDriver* driver); - -typedef struct TpuSystemInfo*(PrototypeTpuDriver_QuerySystemInfo)( - struct TpuDriver* driver); - -typedef void(PrototypeTpuDriver_FreeSystemInfo)(struct TpuSystemInfo* info); - -// TODO(frankchn): Make this not a hard-coded constant. 
-const int32_t MemoryRegion_HBM = 1; - -typedef int64_t(PrototypeTpuDriver_ComputeLinearizedBytesFromShape)( - struct TpuDriver* driver, const struct TpuAllocationShape shape); - -typedef struct TpuStatus*(PrototypeTpuDriver_LinearizeShape)( - struct TpuDriver* driver, void* dst, const void* src, - const struct TpuAllocationShape shape); - -typedef struct TpuStatus*(PrototypeTpuDriver_DelinearizeShape)( - struct TpuDriver* driver, void* dst, const void* src, - const struct TpuAllocationShape shape); - -typedef struct TpuCompiledProgramHandle*(PrototypeTpuDriver_CompileProgram)( - struct TpuDriver* driver, const struct HloProto hlo_proto, - int32_t num_replicas, int32_t eventc, struct TpuEvent** eventv); - -typedef struct TpuCompiledProgramHandle*( - PrototypeTpuDriver_CompileProgramFromText)(struct TpuDriver* driver, - const char* hlo_text, - int32_t num_replicas, - int32_t eventc, - struct TpuEvent** eventv); - -/* Note: We are not responsible for freeing the event within the - * TpuCompiledProgramHandle. You have to call FreeEvent separately to ensure - * that memory does not leak. - */ -typedef void(PrototypeTpuDriver_FreeCompiledProgramHandle)( - struct TpuCompiledProgramHandle* handle); - -typedef struct TpuLoadedProgramHandle*(PrototypeTpuDriver_LoadProgram)( - struct TpuDriver* driver, int32_t core_id, - const struct TpuCompiledProgramHandle* compiled_program_handle, - int32_t eventc, struct TpuEvent** eventv); - -/* Note: We are not responsible for freeing the event within the - * TpuLoadedProgramHandle. You have to call FreeEvent separately to ensure that - * memory does not leak. 
- */ -typedef struct TpuEvent*(PrototypeTpuDriver_UnloadProgram)( - struct TpuDriver* driver, - struct TpuLoadedProgramHandle* loaded_program_handle, int32_t eventc, - struct TpuEvent** eventv); - -typedef struct TpuEvent*(PrototypeTpuDriver_ExecuteProgram)( - struct TpuDriver* driver, struct TpuLoadedProgramHandle* handle, - int32_t inputc, struct TpuBufferHandle** input_buffer_handle, - int32_t outputc, struct TpuBufferHandle** output_buffer_handle, - struct DeviceAssignment device_assignment, int32_t eventc, - struct TpuEvent** eventv); - -typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateTuple)( - struct TpuDriver* driver, int32_t core_id, int32_t memory_region, - int32_t bufferc, struct TpuBufferHandle** buffer_handle, int32_t eventc, - struct TpuEvent** eventv); - -typedef struct TpuBufferHandle*(PrototypeTpuDriver_Allocate)( - struct TpuDriver* driver, int32_t core_id, int32_t memory_region, - int64_t num_bytes, int32_t eventc, struct TpuEvent** eventv); - -typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateShape)( - struct TpuDriver* driver, int32_t core_id, int32_t memory_region, - const struct TpuAllocationShape shape, int32_t eventc, - struct TpuEvent** eventv); - -/* Note: We are not responsible for freeing the event within the - * TpuBufferHandle. You have to call FreeEvent separately to ensure that memory - * does not leak. 
- */ -typedef struct TpuEvent*(PrototypeTpuDriver_Deallocate)( - struct TpuDriver* driver, struct TpuBufferHandle* buffer_handle, - int32_t eventc, struct TpuEvent** eventv); - -typedef struct TpuEvent*(PrototypeTpuDriver_TransferToDevice)( - struct TpuDriver* driver, const void* src, struct TpuBufferHandle* dst, - int32_t eventc, struct TpuEvent** eventv); - -typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDevice)( - struct TpuDriver* driver, struct TpuBufferHandle* src, void* dst, - int32_t eventc, struct TpuEvent** eventv); - -typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDeviceToDevice)( - struct TpuDriver* driver, struct TpuBufferHandle* src, - struct TpuBufferHandle* dst, int32_t eventc, struct TpuEvent** eventv); - -typedef struct CompiledProgramShape*( - PrototypeTpuDriver_GetCompiledProgramShape)( - struct TpuCompiledProgramHandle* handle); - -typedef void(PrototypeTpuDriver_FreeCompiledProgramShape)( - struct CompiledProgramShape* shape); - -typedef void(PrototypeTpuDriver_EventAddCallback)( - struct TpuEvent* event, - void (*callback_fn)(struct TpuStatus*, void* additional_info), - void* additional_info); - -typedef struct TpuStatus*(PrototypeTpuDriver_EventAwait)(struct TpuEvent* event, - int64_t timeout_in_us); - -typedef void(PrototypeTpuDriver_FreeEvent)(struct TpuEvent* event); - -typedef void(PrototypeTpuDriver_FreeStatus)(struct TpuStatus* status); - -typedef const char*(PrototypeTpuDriver_Version)(); - -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Initialize TpuDriver_Initialize; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Open TpuDriver_Open; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Close TpuDriver_Close; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Reset TpuDriver_Reset; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_QuerySystemInfo - TpuDriver_QuerySystemInfo; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeSystemInfo - TpuDriver_FreeSystemInfo; -TPUDRIVER_CAPI_EXPORT extern 
PrototypeTpuDriver_ComputeLinearizedBytesFromShape - TpuDriver_ComputeLinearizedBytesFromShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LinearizeShape - TpuDriver_LinearizeShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_DelinearizeShape - TpuDriver_DelinearizeShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgram - TpuDriver_CompileProgram; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgramFromText - TpuDriver_CompileProgramFromText; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramHandle - TpuDriver_FreeCompiledProgramHandle; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LoadProgram - TpuDriver_LoadProgram; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_UnloadProgram - TpuDriver_UnloadProgram; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_ExecuteProgram - TpuDriver_ExecuteProgram; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateTuple - TpuDriver_AllocateTuple; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Allocate TpuDriver_Allocate; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateShape - TpuDriver_AllocateShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Deallocate TpuDriver_Deallocate; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferToDevice - TpuDriver_TransferToDevice; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDevice - TpuDriver_TransferFromDevice; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDeviceToDevice - TpuDriver_TransferFromDeviceToDevice; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_GetCompiledProgramShape - TpuDriver_GetCompiledProgramShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramShape - TpuDriver_FreeCompiledProgramShape; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAddCallback - TpuDriver_EventAddCallback; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAwait TpuDriver_EventAwait; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeEvent TpuDriver_FreeEvent; 
-TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeStatus TpuDriver_FreeStatus; -TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Version TpuDriver_Version; - -#ifdef __cplusplus -} -#endif - -struct TpuDriverFn { - PrototypeTpuDriver_Open* TpuDriver_Open; // NOLINT - PrototypeTpuDriver_Close* TpuDriver_Close; // NOLINT - PrototypeTpuDriver_Reset* TpuDriver_Reset; // NOLINT - PrototypeTpuDriver_ComputeLinearizedBytesFromShape* - TpuDriver_ComputeLinearizedBytesFromShape; // NOLINT - PrototypeTpuDriver_QuerySystemInfo* TpuDriver_QuerySystemInfo; // NOLINT - PrototypeTpuDriver_FreeSystemInfo* TpuDriver_FreeSystemInfo; // NOLINT - PrototypeTpuDriver_LinearizeShape* TpuDriver_LinearizeShape; // NOLINT - PrototypeTpuDriver_DelinearizeShape* TpuDriver_DelinearizeShape; // NOLINT - PrototypeTpuDriver_CompileProgram* TpuDriver_CompileProgram; // NOLINT - PrototypeTpuDriver_CompileProgramFromText* - TpuDriver_CompileProgramFromText; // NOLINT - PrototypeTpuDriver_FreeCompiledProgramHandle* - TpuDriver_FreeCompiledProgramHandle; // NOLINT - PrototypeTpuDriver_LoadProgram* TpuDriver_LoadProgram; // NOLINT - PrototypeTpuDriver_UnloadProgram* TpuDriver_UnloadProgram; // NOLINT - PrototypeTpuDriver_ExecuteProgram* TpuDriver_ExecuteProgram; // NOLINT - PrototypeTpuDriver_AllocateTuple* TpuDriver_AllocateTuple; // NOLINT - PrototypeTpuDriver_Allocate* TpuDriver_Allocate; // NOLINT - PrototypeTpuDriver_AllocateShape* TpuDriver_AllocateShape; // NOLINT - PrototypeTpuDriver_Deallocate* TpuDriver_Deallocate; // NOLINT - PrototypeTpuDriver_TransferToDevice* TpuDriver_TransferToDevice; // NOLINT - PrototypeTpuDriver_TransferFromDevice* - TpuDriver_TransferFromDevice; // NOLINT - PrototypeTpuDriver_TransferFromDeviceToDevice* - TpuDriver_TransferFromDeviceToDevice; // NOLINT - PrototypeTpuDriver_GetCompiledProgramShape* - TpuDriver_GetCompiledProgramShape; // NOLINT - PrototypeTpuDriver_FreeCompiledProgramShape* - TpuDriver_FreeCompiledProgramShape; // NOLINT - 
PrototypeTpuDriver_EventAddCallback* TpuDriver_EventAddCallback; // NOLINT - PrototypeTpuDriver_EventAwait* TpuDriver_EventAwait; // NOLINT - PrototypeTpuDriver_FreeEvent* TpuDriver_FreeEvent; // NOLINT - PrototypeTpuDriver_FreeStatus* TpuDriver_FreeStatus; // NOLINT - - PrototypeTpuDriver_Version* TpuDriver_Version; // NOLINT -}; - -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_ diff --git a/tinygrad_repo/extra/accel/tpu/tfexample/libtpu_client.c b/tinygrad_repo/extra/accel/tpu/tfexample/libtpu_client.c deleted file mode 100644 index 88c64e4..0000000 --- a/tinygrad_repo/extra/accel/tpu/tfexample/libtpu_client.c +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Before you start, make sure libtpu.so, libtpu.h and libtpu_client.c are in -// the same working directory. 
-// -// To compile: gcc -o libtpu_client libtpu_client.c -ldl -// To run: sudo ./libtpu_client - -#include -#include -#include - -#include "libtpu.h" - -void hexdump(void *dat, int len) { - /*unsigned char *cdat = (unsigned char*)dat; - for (int i = 0; i < len; i++) { - if (i!=0 && i%0x10 == 0) printf("\n"); - printf("%2.2X ", cdat[i]); - } - printf("\n");*/ -} - - -int main(int argc, char** argv) { - struct TpuDriverFn driver_fn; - TpuDriver_Initialize(&driver_fn, true); - - fprintf(stdout, "------ Going to Query Version ------\n"); - fprintf(stdout, "TPU Driver Version: %s\n", driver_fn.TpuDriver_Version()); - - fprintf(stdout, "------ Going to Open a TPU Driver ------\n"); - struct TpuDriver* driver = driver_fn.TpuDriver_Open("local://"); - - fprintf(stdout, "------ Going to Query for System Information ------\n"); - struct TpuSystemInfo* info = driver_fn.TpuDriver_QuerySystemInfo(driver); - driver_fn.TpuDriver_FreeSystemInfo(info); - - // An example of simple program to sum two parameters. 
- const char* hlo_module_text = R"(HloModule add_vec_module - ENTRY %add_vec (a: s32[256], b: s32[256]) -> s32[256] { - %a = s32[256] parameter(0) - %b = s32[256] parameter(1) - ROOT %sum = s32[256] add(%a, %b) - } - )"; - - fprintf(stdout, "------ Going to Compile a TPU program ------\n"); - struct TpuCompiledProgramHandle* cph = - driver_fn.TpuDriver_CompileProgramFromText(driver, hlo_module_text, - /*num_replicas=*/1, /*eventc=*/0, /*eventv*/NULL); - - //hexdump(cph->internal_handle, 0x100); - - TpuEvent* compile_events[] = {cph->event}; - fprintf(stdout, "------ Going to Load a TPU program ------\n"); - struct TpuLoadedProgramHandle* lph = - driver_fn.TpuDriver_LoadProgram(driver, /*core_id=*/0, cph, - /*eventc=*/1, /*eventv=*/compile_events); - - const int size = 1024; - - fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n"); - struct TpuBufferHandle* buf_a_handle = - driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1, - /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL); - fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n"); - struct TpuBufferHandle* buf_b_handle = - driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1, - /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL); - fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n"); - struct TpuBufferHandle* buf_sum_handle = - driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1, - /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL); - - char a_src[size], b_src[size], sum_src[size]; - for (int i = 0; i < size; ++i) { - a_src[i] = 1; - b_src[i] = 2; - sum_src[i] = 0; - } - - TpuEvent* allocate_buf_a_events[] = {buf_a_handle->event}; - fprintf(stdout, "------ Going to Transfer To Device ------\n"); - struct TpuEvent* transfer_ev1 = - driver_fn.TpuDriver_TransferToDevice(driver, a_src, buf_a_handle, - /*eventc=*/1, /*eventv=*/allocate_buf_a_events); - TpuEvent* allocate_buf_b_events[] = {buf_a_handle->event}; - fprintf(stdout, 
"------ Going to Transfer To Device ------\n"); - struct TpuEvent* transfer_ev2 = - driver_fn.TpuDriver_TransferToDevice(driver, b_src, buf_b_handle, - /*eventc=*/1, /*eventv=*/allocate_buf_b_events); - - //getchar(); - - fprintf(stdout, "------ Going to Execute a TPU program ------\n"); - DeviceAssignment device_assignment = {NULL, 0}; - TpuBufferHandle* input_buffer_handle[] = {buf_a_handle, buf_b_handle}; - TpuBufferHandle* output_buffer_handle[] = {buf_sum_handle}; - TpuEvent* transfer_events[] = {transfer_ev1, transfer_ev2}; - struct TpuEvent* execute_event = - driver_fn.TpuDriver_ExecuteProgram(driver, lph, - /*inputc=*/2, /*input_buffer_handle=*/input_buffer_handle, - /*outputc=*/1, /*output_buffer_handle=*/output_buffer_handle, - device_assignment, - /*eventc=*/2, /*eventv*/transfer_events); - - fprintf(stdout, "------ Going to Transfer From Device ------\n"); - TpuEvent* execute_events[] = {execute_event}; - struct TpuEvent* transfer_sum_event = - driver_fn.TpuDriver_TransferFromDevice(driver, buf_sum_handle, sum_src, - /*eventc=*/1, /*eventv=*/execute_events); - - TpuStatus* status = driver_fn.TpuDriver_EventAwait(transfer_sum_event, - 10000000); - if (status->code != 0) { - fprintf(stdout, "Transfer Event Await: Code: %d, Message: %s\n", - status->code, status->msg); - } - - fprintf(stdout, "------ Going to Unload a TPU program ------\n"); - struct TpuEvent* unload_program_event = driver_fn.TpuDriver_UnloadProgram( - driver, lph, /*eventc=*/1, /*eventv=*/execute_events); - - fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n"); - struct TpuEvent* dealloc_ev1 = driver_fn.TpuDriver_Deallocate(driver, - buf_a_handle, /*eventc=*/0, /*eventv=*/NULL); - driver_fn.TpuDriver_FreeEvent(dealloc_ev1); - - fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n"); - struct TpuEvent* dealloc_ev2 = driver_fn.TpuDriver_Deallocate(driver, - buf_b_handle, /*eventc=*/0, /*eventv=*/NULL); - driver_fn.TpuDriver_FreeEvent(dealloc_ev2); - - 
fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n"); - struct TpuEvent* dealloc_ev3 = driver_fn.TpuDriver_Deallocate(driver, - buf_sum_handle, /*eventc=*/0, /*eventv=*/NULL); - driver_fn.TpuDriver_FreeEvent(dealloc_ev3); - - fprintf(stdout, "sum:\n"); - for (size_t i = 0; i < size; ++i) { - fprintf(stdout, "%d ", sum_src[i]); - } - - exit(EXIT_SUCCESS); -} diff --git a/tinygrad_repo/extra/amdpci/am_smi.py b/tinygrad_repo/extra/amdpci/am_smi.py index 6668310..31f29f2 100644 --- a/tinygrad_repo/extra/amdpci/am_smi.py +++ b/tinygrad_repo/extra/amdpci/am_smi.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 -import time, mmap, sys, shutil, os, glob, subprocess, argparse +import time, mmap, sys, shutil, os, glob, subprocess, argparse, collections from tinygrad.helpers import DEBUG, colored, ansilen from tinygrad.runtime.autogen import libc from tinygrad.runtime.autogen.am import am from tinygrad.runtime.support.hcq import MMIOInterface -from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager +from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager, AMPageTableEntry from tinygrad.runtime.support.am.ip import AM_SOC, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA -AM_VERSION = 0xA0000004 +AM_VERSION = 0xA0000005 def bold(s): return f"\033[1m{s}\033[0m" @@ -27,10 +27,14 @@ def color_temp(temp): def color_voltage(voltage): return colored(f"{voltage/1000:>5.3f}V", "cyan") -def draw_bar(percentage, width=40, fill='█', empty='░'): +def draw_bar(percentage, width=40, fill='|', empty=' ', opt_text='', color='cyan'): filled_width = int(width * percentage) + if not opt_text: opt_text = f'{percentage*100:.1f}%' + bar = fill * filled_width + empty * (width - filled_width) - return f'[{bar}] {percentage*100:5.1f}%' + bar = (bar[:-len(opt_text)] + opt_text) if opt_text else bar + bar = colored(bar[:filled_width], color) + bar[filled_width:] + return f'[{bar}]' def same_line(strs:list[list[str]|None], split=8) -> list[str]: strs = [s for s in strs if 
s is not None] @@ -175,9 +179,25 @@ class SMICtx: def get_power(self, dev, metrics): return metrics.SmuMetrics.AverageSocketPower, metrics.SmuMetrics.dGPU_W_MAX - def draw(self): + def get_mem_usage(self, dev): + usage = 0 + pt_stack = [dev.mm.root_page_table] + while len(pt_stack) > 0: + pt = pt_stack.pop() + for i in range(512): + entry = pt.entries[i] + + if (entry & am.AMDGPU_PTE_VALID) == 0: continue + if pt.lv!=am.AMDGPU_VM_PTB and not dev.gmc.is_pte_huge_page(entry): + pt_stack.append(AMPageTableEntry(dev, entry & 0x0000FFFFFFFFF000, lv=pt.lv+1)) + continue + if (entry & am.AMDGPU_PTE_SYSTEM) != 0: continue + usage += (1 << ((9 * (3-pt.lv)) + 12)) + return usage + + def draw(self, once): terminal_width, terminal_height = shutil.get_terminal_size() - if self.prev_terminal_width != terminal_width or self.prev_terminal_height != terminal_height: + if not once and (self.prev_terminal_width != terminal_width or self.prev_terminal_height != terminal_height): os.system('clear') self.prev_terminal_width, self.prev_terminal_height = terminal_width, terminal_height @@ -196,9 +216,14 @@ class SMICtx: [pad(f"PCI State: {dev.pci_state}", col_size)]) continue + mem_used = self.get_mem_usage(dev) + mem_total = dev.vram_size + mem_fmt = f"{mem_used/1024**3:.1f}/{mem_total/1024**3:.1f}G" + device_line = [f"{bold(dev.pcibus)} {trim(self.lspci[dev.pcibus[5:]], col_size - 20)}"] + [pad("", col_size)] activity_line = [f"GFX Activity {draw_bar(self.get_gfx_activity(dev, metrics) / 100, activity_line_width)}"] \ - + [f"MEM Activity {draw_bar(self.get_mem_activity(dev, metrics) / 100, activity_line_width)}"] + + [f"MEM Activity {draw_bar(self.get_mem_activity(dev, metrics) / 100, activity_line_width)}"] \ + + [f"MEM Usage {draw_bar((mem_used / mem_total) / 100, activity_line_width, opt_text=mem_fmt)}"] \ temps_data, temps_data_compact = self.get_temps(dev, metrics), self.get_temps(dev, metrics, compact=True) temps_table = ["=== Temps (°C) ==="] + [f"{name:<16}: {color_temp(val)}" 
for name, val in temps_data.items()] @@ -208,8 +233,8 @@ class SMICtx: power_table = ["=== Power ==="] + [f"Fan Speed: {fan_rpm} RPM"] + [f"Fan Power: {fan_pwm}%"] total_power, max_power = self.get_power(dev, metrics) - power_line = [f"Power: {total_power:>3}W " + draw_bar(total_power / max_power, 16)] - power_line_compact = [f"Power: {total_power:>3}W " + draw_bar(total_power / max_power, activity_line_width)] + power_line = [f"Power: " + draw_bar(total_power / max_power, 16, opt_text=f"{total_power}/{max_power}W")] + power_line_compact = [f"Power: " + draw_bar(total_power / max_power, activity_line_width, opt_text=f"{total_power}/{max_power}W")] voltage_data = self.get_voltage(dev, metrics) voltage_table = ["=== Voltages ==="] + [f"{name:<20}: {color_voltage(voltage)}" for name, voltage in voltage_data.items()] @@ -252,6 +277,7 @@ class SMICtx: if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--list", action="store_true", help="Run once and exit") parser.add_argument("--pids", action="store_true", help="Print pids for all AM devices") parser.add_argument("--kill", action="store_true", help="Kill all pids associated with AM devices. 
Valid only with --pids") parser.add_argument("--dev", type=str, default=None, help="PCI bus ID of the AM device to monitor (e.g., 0000:01:00.0)") @@ -265,9 +291,18 @@ if __name__ == "__main__": try: if args.kill: - pid = subprocess.check_output(['sudo', 'lsof', '-t', dev]).decode('utf-8').split('\n')[0] - os.system(f'sudo kill -9 {pid}') - print(f"{dev[8:-5]}: killed process {pid}") + stopped_pids = collections.defaultdict(int) + while True: + try: pid = subprocess.check_output(['sudo', 'lsof', '-t', dev]).decode('utf-8').split('\n')[0] + except subprocess.CalledProcessError: break + if stopped_pids[pid] > 0: time.sleep(0.5) + if stopped_pids[pid] == 10: + print(f"{dev[8:-5]}: can't stop process {pid}, exitting") + exit(1) + + print(f"{dev[8:-5]}: killing process {pid}") + os.system(f'sudo pkill -g -9 {pid}') + stopped_pids[pid] += 1 else: pid = subprocess.check_output(['sudo', 'lsof', dev]).decode('utf-8').strip().split('\n')[1].split()[1] print(f"{dev[8:-5]}: {pid}") @@ -276,10 +311,11 @@ if __name__ == "__main__": sys.exit(0) try: - os.system('clear') + if not args.list: os.system('clear') smi_ctx = SMICtx() while True: smi_ctx.rescan_devs() - smi_ctx.draw() + smi_ctx.draw(args.list) + if args.list: break time.sleep(1) except KeyboardInterrupt: print("Exiting...") diff --git a/tinygrad_repo/extra/datasets/sops.gz b/tinygrad_repo/extra/datasets/sops.gz index b476386..1362cbe 100644 Binary files a/tinygrad_repo/extra/datasets/sops.gz and b/tinygrad_repo/extra/datasets/sops.gz differ diff --git a/tinygrad_repo/extra/export_model.py b/tinygrad_repo/extra/export_model.py index 63b4098..2b2aa1d 100644 --- a/tinygrad_repo/extra/export_model.py +++ b/tinygrad_repo/extra/export_model.py @@ -48,13 +48,13 @@ def jit_model(model, *args) -> Tuple[TinyJit,Dict[int,str]]: # hack to put the inputs back for (j,i),idx in run.input_replace.items(): - realized_input = args[idx].lazydata.base.realized + realized_input = args[idx].uop.base.realized run.jit_cache[j].bufs[i] = 
realized_input special_names[id(realized_input)] = f'input{idx}' # TODO: fetch this from the jit in self.input_replace and self.ret (hint: use get_parameters on self.ret) for i, output in enumerate(the_output): - special_names[id(output.lazydata.base.realized)] = f'output{i}' + special_names[id(output.uop.base.realized)] = f'output{i}' return run, special_names def export_model_clang(functions:Dict[str,str], statements:Dict[str,Tuple[str,int,int]], bufs:Dict[str,Tuple[str,int,int]], @@ -242,7 +242,7 @@ def export_model(model, target:str, *inputs, model_name: Optional[str] = "model" with Context(JIT=2): run,special_names = jit_model(model, *inputs) functions, statements, bufs, bufs_to_save = compile_net(run, special_names) state = get_state_dict(model) - weight_names = {id(x.lazydata.base.realized): name for name, x in state.items()} + weight_names = {id(x.uop.base.realized): name for name, x in state.items()} input_names = [name for _,name in special_names.items() if "input" in name] output_names = [name for _,name in special_names.items() if "output" in name] diff --git a/tinygrad_repo/extra/gemm/metal_matmul.py b/tinygrad_repo/extra/gemm/metal_matmul.py index b4a9706..88bf384 100644 --- a/tinygrad_repo/extra/gemm/metal_matmul.py +++ b/tinygrad_repo/extra/gemm/metal_matmul.py @@ -26,7 +26,7 @@ metalalloc._copyin(c,nc.tobytes()) FLOPS = N*N*N*2 BW = N*N*3*4 -prog = MetalProgram(device, "test", MetalCompiler(device).compile(f""" +prog = MetalProgram(device, "test", MetalCompiler().compile(f""" #include #include // Available from Metal version 2.3 released with OS X 11.0+ using namespace metal; diff --git a/tinygrad_repo/extra/gemm/metal_matvec.py b/tinygrad_repo/extra/gemm/metal_matvec.py index abd1b72..8fc8c9d 100644 --- a/tinygrad_repo/extra/gemm/metal_matvec.py +++ b/tinygrad_repo/extra/gemm/metal_matvec.py @@ -30,7 +30,7 @@ WORKSIZE_ROW = 16 WORKSIZE_COL = 1 LOCAL_SIZE = [32, WORKSIZE_COL, WORKSIZE_ROW] GLOBAL_SIZE = [M//(LOCAL_SIZE[0]*LOCAL_SIZE[1]*4), 1, 1] 
-prog = MetalProgram(device, "test", MetalCompiler(device).compile(f""" +prog = MetalProgram(device, "test", MetalCompiler().compile(f""" #include using namespace metal; kernel void test(device float* data0, const device float* data1, const device float* data2, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) {{ diff --git a/tinygrad_repo/extra/hip_gpu_driver/test_pm4.py b/tinygrad_repo/extra/hip_gpu_driver/test_pm4.py index b5a4541..d0cf920 100644 --- a/tinygrad_repo/extra/hip_gpu_driver/test_pm4.py +++ b/tinygrad_repo/extra/hip_gpu_driver/test_pm4.py @@ -47,7 +47,7 @@ if __name__ == "__main__": a = Tensor([0.,1.,2.], device="KFD").realize() b = a + 7 - b.lazydata.buffer.allocate() + b.uop.buffer.allocate() si = b.schedule()[-1] runner = dev.get_runner(*si.ast) prg: AMDProgram = runner.clprg @@ -69,8 +69,8 @@ if __name__ == "__main__": #scratch = dev._gpu_alloc(0x10000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) ka = to_mv(dev.kernargs_ptr, 0x10).cast("Q") - ka[0] = b.lazydata.buffer._buf.va_addr - ka[1] = a.lazydata.buffer._buf.va_addr + ka[0] = b.uop.buffer._buf.va_addr + ka[1] = a.uop.buffer._buf.va_addr compute_read_pointer = to_mv(compute_queue.read_pointer_address, 8).cast("Q") compute_write_pointer = to_mv(compute_queue.write_pointer_address, 8).cast("Q") diff --git a/tinygrad_repo/extra/huggingface_onnx/run_models.py b/tinygrad_repo/extra/huggingface_onnx/run_models.py index c819522..af920f8 100644 --- a/tinygrad_repo/extra/huggingface_onnx/run_models.py +++ b/tinygrad_repo/extra/huggingface_onnx/run_models.py @@ -1,6 +1,6 @@ import onnx, yaml, tempfile, time, collections, pprint, argparse, json from pathlib import Path -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from extra.onnx import get_onnx_ops from extra.onnx_helpers import validate, get_example_inputs @@ -13,7 +13,7 @@ def get_config(root_path: Path): return ret def run_huggingface_validate(onnx_model_path, 
config, rtol, atol): - onnx_model = onnx.load(onnx_model_path) + onnx_model = onnx_load(onnx_model_path) onnx_runner = OnnxRunner(onnx_model) inputs = get_example_inputs(onnx_runner.graph_inputs, config) validate(onnx_model_path, inputs, rtol=rtol, atol=atol) @@ -116,7 +116,7 @@ if __name__ == "__main__": # repo id # validates all onnx models inside repo repo_id = "/".join(path) - root_path = Path(snapshot_download(repo_id=repo_id, allow_patterns=["*.onnx", ".onnx_data"], cache_dir=download_dir)) + root_path = Path(snapshot_download(repo_id=repo_id, allow_patterns=["*.onnx", "*.onnx_data"], cache_dir=download_dir)) snapshot_download(repo_id=repo_id, allow_patterns=["*config.json"], cache_dir=download_dir) config = get_config(root_path) for onnx_model in root_path.rglob("*.onnx"): diff --git a/tinygrad_repo/extra/lr_scheduler.py b/tinygrad_repo/extra/lr_scheduler.py index 9b2756e..87ff077 100644 --- a/tinygrad_repo/extra/lr_scheduler.py +++ b/tinygrad_repo/extra/lr_scheduler.py @@ -10,9 +10,8 @@ class LR_Scheduler: def get_lr(self): pass - def step(self) -> None: - self.epoch_counter.assign(self.epoch_counter + 1).realize() - self.optimizer.lr.assign(self.get_lr()).realize() + def schedule_step(self) -> list[Tensor]: return [self.epoch_counter.assign(self.epoch_counter + 1), self.optimizer.lr.assign(self.get_lr())] + def step(self) -> None: Tensor.realize(*self.schedule_step()) class LRSchedulerGroup: def __init__(self, *schedulers: LR_Scheduler): self.schedulers = schedulers diff --git a/tinygrad_repo/extra/models/convnext.py b/tinygrad_repo/extra/models/convnext.py index 591112a..7fb0da1 100644 --- a/tinygrad_repo/extra/models/convnext.py +++ b/tinygrad_repo/extra/models/convnext.py @@ -59,7 +59,6 @@ if __name__ == "__main__": img = Tensor(preprocess(chicken_img)) Tensor.training = False - Tensor.no_grad = True out = model(img).numpy() print(_LABELS[out.argmax()]) diff --git a/tinygrad_repo/extra/models/llama.py b/tinygrad_repo/extra/models/llama.py index 
56f6c01..002c0cc 100644 --- a/tinygrad_repo/extra/models/llama.py +++ b/tinygrad_repo/extra/models/llama.py @@ -191,7 +191,7 @@ class Transformer: def __call__(self, tokens:Tensor, start_pos:int, temperature:float=0.0, top_k:int=0, top_p:float=0.8, alpha_f:float=0.0, alpha_p:float=0.0): # TODO: better way to handle the first call v.s. the rest? if tokens.shape[0:2] == (1,1) and self.forward_jit is not None and start_pos != 0: - return self.forward_jit(tokens, Variable("start_pos", 1, self.max_context).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p) + return self.forward_jit(tokens, Variable("start_pos", 1, self.max_context-1).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p) return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p) # *** helpers *** diff --git a/tinygrad_repo/extra/multitensor.py b/tinygrad_repo/extra/multitensor.py index dadcf1e..af7ddc3 100644 --- a/tinygrad_repo/extra/multitensor.py +++ b/tinygrad_repo/extra/multitensor.py @@ -23,7 +23,7 @@ def explicit_shard_W_axis_1(X, W): x = x.reshape(N, 1, N).expand(N, N, N) w = w.T.reshape(1, N, N).expand(N, N, N) m = x*w - assert m.lazydata.st.views[0].mask is not None + assert m.uop.st.views[0].mask is not None ret = m.sum(2) return ret #Os = [lm(Xs[0], Ws[0]), lm(Xs[1], Ws[1])] diff --git a/tinygrad_repo/extra/nv_gpu_driver/clcec0qmd.h b/tinygrad_repo/extra/nv_gpu_driver/clcec0qmd.h index dd2759a..4ee8a12 100644 --- a/tinygrad_repo/extra/nv_gpu_driver/clcec0qmd.h +++ b/tinygrad_repo/extra/nv_gpu_driver/clcec0qmd.h @@ -1,65 +1,69 @@ +/******************************************************************************* + Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +*******************************************************************************/ + #ifndef __CLCEC0QMD_H__ #define __CLCEC0QMD_H__ -#define NVCEC0_QMDV05_00_CTA_RASTER_WIDTH MW(1279:1248) // aka GRID_WIDTH -#define NVCEC0_QMDV05_00_CTA_RASTER_HEIGHT MW(1311:1280) // aka GRID_HEIGHT -#define NVCEC0_QMDV05_00_CTA_RASTER_DEPTH MW(1343:1312) // aka GRID_DEPTH +/* +** Queue Meta Data, Version 05_00 + */ -#define NVCEC0_QMDV05_00_REGISTER_COUNT_V MW(1136:1128) -#define NVCEC0_QMDV05_00_BARRIER_COUNT MW(1137:1137) // ?? 
- -#define NVCEC0_QMDV05_00_QMD_MINOR_VERSION MW(467:464) -#define NVCEC0_QMDV05_00_QMD_MAJOR_VERSION MW(471:468) - -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_LOWER_SHIFTED6(i) MW((1375+(i)*64):(1344+(i)*64)) -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_UPPER_SHIFTED6(i) MW((1394+(i)*64):(1376+(i)*64)) -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1407+(i)*64):(1395+(i)*64)) - -#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION0 MW(1103:1088) -#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION1 MW(1119:1104) -#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION2 MW(1128:1120) - -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID(i) MW((1856+(i)*4):(1856+(i)*4)) -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH(i) MW((1858+(i)*4):(1857+(i)*4)) -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_NONE 0x00000000 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_PRE 0x00000001 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_POST 0x00000002 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1859+(i)*4):(1859+(i)*4)) -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 -#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 - -#define NVCEC0_QMDV05_00_DEPENDENCE_COUNTER MW(143:128) // ?? 
+#define NVCEC0_QMDV05_00_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(29:0) +#define NVCEC0_QMDV05_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(30:30) +#define NVCEC0_QMDV05_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(31:31) +#define NVCEC0_QMDV05_00_HW_ONLY_SKED_NEXT_QMD_POINTER MW(63:32) +#define NVCEC0_QMDV05_00_INNER_GET MW(94:64) +#define NVCEC0_QMDV05_00_INNER_OVERFLOW MW(95:95) +#define NVCEC0_QMDV05_00_INNER_PUT MW(126:96) +#define NVCEC0_QMDV05_00_INNER_STICKY_OVERFLOW MW(127:127) +#define NVCEC0_QMDV05_00_DEPENDENCE_COUNTER MW(143:128) #define NVCEC0_QMDV05_00_QMD_GROUP_ID MW(149:144) - -#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_LOWER MW(1055:1024) -#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_UPPER MW(1080:1056) - -#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_POINTER MW(415:384) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_POINTER MW(447:416) - -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(i) MW((336+(i)*5):(336+(i)*5)) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_FALSE 0x00000000 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_TRUE 0x00000001 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(i) MW((339+(i)*5):(337+(i)*5)) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(i) MW((340+(i)*5):(340+(i)*5)) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000 -#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001 - -#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(0) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(1) - -#define 
NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(0) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(1) - -#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(0) -#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(1) - +#define NVCEC0_QMDV05_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(150:150) +#define NVCEC0_QMDV05_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_QMD_TYPE MW(153:151) +#define NVCEC0_QMDV05_00_QMD_TYPE_QUEUE 0x00000000 +#define NVCEC0_QMDV05_00_QMD_TYPE_GRID_NULL 0x00000001 +#define NVCEC0_QMDV05_00_QMD_TYPE_GRID_CTA 0x00000002 +#define NVCEC0_QMDV05_00_QMD_TYPE_GRID_GPC_CGA 0x00000003 +#define NVCEC0_QMDV05_00_QMD_TYPE_GRID_GPU_CGA 0x00000004 +#define NVCEC0_QMDV05_00_QMD_TYPE_GRID_GPU_GPC_CGA 0x00000005 +#define NVCEC0_QMDV05_00_NUM_SUB_TASKS_PER_TASK MW(157:154) +#define NVCEC0_QMDV05_00_REQUIRE_SCHEDULING_PCAS MW(158:158) +#define NVCEC0_QMDV05_00_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_TPC_DISABLE_MASK_VALID MW(159:159) +#define NVCEC0_QMDV05_00_TPC_DISABLE_MASK_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_TPC_DISABLE_MASK_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CIRCULAR_QUEUE_SIZE MW(184:160) +#define NVCEC0_QMDV05_00_HW_ONLY_DEPENDENCE_COUNTER MW(207:192) +#define NVCEC0_QMDV05_00_RESUME_SUB_TASK_ID MW(210:208) +#define NVCEC0_QMDV05_00_COMPLETED_SUB_TASK_MASK MW(218:211) +#define NVCEC0_QMDV05_00_GRID_WIDTH_RESUME MW(255:224) +#define NVCEC0_QMDV05_00_GRID_HEIGHT_RESUME MW(271:256) +#define NVCEC0_QMDV05_00_GRID_DEPTH_RESUME MW(287:272) #define NVCEC0_QMDV05_00_RELEASE_ENABLE(i) MW((288+(i)*16):(288+(i)*16)) #define NVCEC0_QMDV05_00_RELEASE_ENABLE_FALSE 0x00000000 #define 
NVCEC0_QMDV05_00_RELEASE_ENABLE_TRUE 0x00000001 @@ -94,38 +98,59 @@ #define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_FALSE 0x00000000 #define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_TRUE 0x00000001 #define NVCEC0_QMDV05_00_RELEASE_RESERVED_INFO(i) MW((303+(i)*16):(301+(i)*16)) - -#define NVCEC0_QMDV05_00_RELEASE0_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(0) -#define NVCEC0_QMDV05_00_RELEASE1_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(1) - -#define NVCEC0_QMDV05_00_RELEASE0_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(0) -#define NVCEC0_QMDV05_00_RELEASE1_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(1) - -#define NVCEC0_QMDV05_00_RELEASE0_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(0) -#define NVCEC0_QMDV05_00_RELEASE1_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(1) - -#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(0) -#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(1) - -#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(0) -#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(1) - -#define NVCEC0_QMDV05_00_RELEASE0_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(0) -#define NVCEC0_QMDV05_00_RELEASE1_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(1) - -#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(0) -#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(1) - -#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_LOWER MW(511:480) -#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_UPPER MW(543:512) -#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_LOWER MW(575:544) -#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_UPPER MW(607:576) - -#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_LOWER MW(799:768) -#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_UPPER MW(831:800) -#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_LOWER MW(863:832) -#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_UPPER MW(895:864) - 
+#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE MW(336:336) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION MW(339:337) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH MW(340:340) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_SELF_COPY_ON_COMPLETION MW(341:341) +#define NVCEC0_QMDV05_00_SELF_COPY_ON_COMPLETION_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_SELF_COPY_ON_COMPLETION_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEMOTE_L2_EVICT_LAST MW(342:342) +#define NVCEC0_QMDV05_00_DEMOTE_L2_EVICT_LAST_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEMOTE_L2_EVICT_LAST_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DISABLE_AUTO_INVALIDATE MW(343:343) +#define NVCEC0_QMDV05_00_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE MW(344:344) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION MW(347:345) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH MW(348:348) +#define 
NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CORRELATION_ID_INTERNAL MW(349:349) +#define NVCEC0_QMDV05_00_CORRELATION_ID_INTERNAL_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CORRELATION_ID_INTERNAL_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TASK_CHASING_ENABLE MW(350:350) +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TASK_CHASING_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TASK_CHASING_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_SHARED_ALLOCATION_ENABLE MW(351:351) +#define NVCEC0_QMDV05_00_SHARED_ALLOCATION_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_SHARED_ALLOCATION_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CORRELATION_ID MW(383:352) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_POINTER MW(415:384) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_POINTER MW(447:416) +#define NVCEC0_QMDV05_00_SASS_VERSION MW(455:448) +#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT MW(456:456) +#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT__32 0x00000000 +#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 +#define NVCEC0_QMDV05_00_SAMPLER_INDEX MW(457:457) +#define NVCEC0_QMDV05_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 +#define NVCEC0_QMDV05_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 +#define NVCEC0_QMDV05_00_CONSTANT_BANK_PREFETCH_PRE_MAX_SIZE_SHIFTED8 MW(463:458) +#define NVCEC0_QMDV05_00_QMD_MINOR_VERSION MW(467:464) +#define NVCEC0_QMDV05_00_QMD_MAJOR_VERSION MW(471:468) #define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(472:472) #define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 #define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 @@ -144,29 +169,695 @@ #define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(477:477) #define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 #define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 - 
-#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(1919:1888) -#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1936:1920) -#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_SIZE MW(1945:1937) -#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE MW(1947:1946) +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE MW(478:478) +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE MW(479:479) +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE0_ADDR_LOWER MW(511:480) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE0_ADDR_UPPER MW(536:512) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE0_PAYLOAD_LOWER MW(575:544) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE0_PAYLOAD_UPPER MW(607:576) +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(615:608) +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_ID MW(621:616) +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_INCR_ENABLE MW(622:622) +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_DECR_ENABLE MW(623:623) +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TYPE MW(625:624) +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 +#define NVCEC0_QMDV05_00_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE MW(626:626) +#define 
NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CTA_LAUNCH_QUEUE MW(627:627) +#define NVCEC0_QMDV05_00_FREE_CTA_SLOTS_EMPTY_SM MW(635:628) +#define NVCEC0_QMDV05_00_SYNC_DOMAIN_ID MW(637:636) +#define NVCEC0_QMDV05_00_PRE_EXIT_AT_LAST_CTA_LAUNCH MW(638:638) +#define NVCEC0_QMDV05_00_PRE_EXIT_AT_LAST_CTA_LAUNCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_PRE_EXIT_AT_LAST_CTA_LAUNCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_ENABLE_PROGRAM_PRE_EXIT MW(639:639) +#define NVCEC0_QMDV05_00_ENABLE_PROGRAM_PRE_EXIT_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_ENABLE_PROGRAM_PRE_EXIT_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_ARRIVE_AT_LATCH_ID MW(671:640) +#define NVCEC0_QMDV05_00_WAIT_ON_LATCH_ID MW(703:672) +#define NVCEC0_QMDV05_00_OCCUPANCY_THRESHOLD_SHARED_MEM MW(721:714) +#define NVCEC0_QMDV05_00_OCCUPANCY_MAX_SHARED_MEM MW(729:722) +#define NVCEC0_QMDV05_00_ARRIVE_AT_LATCH_VALID MW(730:730) +#define NVCEC0_QMDV05_00_ARRIVE_AT_LATCH_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_ARRIVE_AT_LATCH_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_WAIT_ON_LATCH_VALID MW(731:731) +#define NVCEC0_QMDV05_00_WAIT_ON_LATCH_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_WAIT_ON_LATCH_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_LATCH_RELEASE_INVALIDATE_ENABLE MW(732:732) +#define NVCEC0_QMDV05_00_LATCH_RELEASE_INVALIDATE_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_LATCH_RELEASE_INVALIDATE_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE MW(733:733) +#define NVCEC0_QMDV05_00_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE MW(734:734) +#define NVCEC0_QMDV05_00_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE_FALSE 
0x00000000 +#define NVCEC0_QMDV05_00_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE MW(735:735) +#define NVCEC0_QMDV05_00_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_OCCUPANCY_THRESHOLD_WARP MW(743:736) +#define NVCEC0_QMDV05_00_OCCUPANCY_MAX_WARP MW(751:744) +#define NVCEC0_QMDV05_00_OCCUPANCY_THRESHOLD_REGISTER MW(759:752) +#define NVCEC0_QMDV05_00_OCCUPANCY_MAX_REGISTER MW(767:760) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE1_ADDR_LOWER MW(799:768) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE1_ADDR_UPPER MW(824:800) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE1_PAYLOAD_LOWER MW(863:832) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE1_PAYLOAD_UPPER MW(895:864) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE2_ADDR_LOWER MW(927:896) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE2_ADDR_UPPER MW(952:928) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE2_PAYLOAD_LOWER MW(991:960) +#define NVCEC0_QMDV05_00_RELEASE_SEMAPHORE2_PAYLOAD_UPPER MW(1023:992) +#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_LOWER_SHIFTED4 MW(1055:1024) +#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_UPPER_SHIFTED4 MW(1076:1056) +#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_SIZE MW(1085:1077) +#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE MW(1087:1086) #define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000 #define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001 - -#define NVCEC0_QMDV05_00_SHARED_MEMORY_SIZE MW(1162:1145) +#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION0 MW(1103:1088) +#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION1 MW(1119:1104) +#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION2 MW(1127:1120) +#define NVCEC0_QMDV05_00_REGISTER_COUNT MW(1136:1128) +#define NVCEC0_QMDV05_00_BARRIER_COUNT MW(1141:1137) +#define NVCEC0_QMDV05_00_ICC_PREFETCH_SIZE MW(1147:1142) +#define 
NVCEC0_QMDV05_00_SHARED_MEMORY_SIZE_SHIFTED7 MW(1162:1152) #define NVCEC0_QMDV05_00_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(1168:1163) #define NVCEC0_QMDV05_00_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(1174:1169) #define NVCEC0_QMDV05_00_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(1180:1175) +#define NVCEC0_QMDV05_00_SHARED_MEM_BARRIER_INIT_ENABLE MW(1181:1181) +#define NVCEC0_QMDV05_00_SHADER_LOCAL_MEMORY_LOW_SIZE_SHIFTED4 MW(1199:1184) +#define NVCEC0_QMDV05_00_SHADER_LOCAL_MEMORY_HIGH_SIZE_SHIFTED4 MW(1215:1200) +#define NVCEC0_QMDV05_00_VIRTUAL_RESOURCE_COUNT MW(1223:1216) +#define NVCEC0_QMDV05_00_GRID_WIDTH MW(1279:1248) +#define NVCEC0_QMDV05_00_GRID_HEIGHT MW(1295:1280) +#define NVCEC0_QMDV05_00_GRID_DEPTH MW(1327:1312) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_LOWER_SHIFTED6(i) MW((1375+(i)*64):(1344+(i)*64)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_UPPER_SHIFTED6(i) MW((1394+(i)*64):(1376+(i)*64)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1407+(i)*64):(1395+(i)*64)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID(i) MW((1856+(i)*4):(1856+(i)*4)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH(i) MW((1858+(i)*4):(1857+(i)*4)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_NONE 0x00000000 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_PRE 0x00000001 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_POST 0x00000002 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1859+(i)*4):(1859+(i)*4)) +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(1919:1888) +#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1936:1920) +#define NVCEC0_QMDV05_00_GPC_CGA_WIDTH MW(2053:2048) +#define NVCEC0_QMDV05_00_GPC_CGA_HEIGHT 
MW(2061:2056) +#define NVCEC0_QMDV05_00_GPC_CGA_DEPTH MW(2069:2064) +#define NVCEC0_QMDV05_00_LARGE_GPC_CGA_WIDTH_MINUS_ONE MW(2075:2072) +#define NVCEC0_QMDV05_00_LARGE_GPC_CGA_HEIGHT_MINUS_ONE MW(2079:2076) +#define NVCEC0_QMDV05_00_CGA_CTA_DISTRIBUTION_MODE MW(2111:2111) +#define NVCEC0_QMDV05_00_CGA_CTA_DISTRIBUTION_MODE_LOAD_BALANCING 0x00000000 +#define NVCEC0_QMDV05_00_CGA_CTA_DISTRIBUTION_MODE_MULTI_CAST 0x00000001 +#define NVCEC0_QMDV05_00_GPU_CGA_WIDTH MW(2127:2112) +#define NVCEC0_QMDV05_00_GPU_CGA_HEIGHT MW(2143:2128) +#define NVCEC0_QMDV05_00_GPU_CGA_DEPTH MW(2159:2144) +#define NVCEC0_QMDV05_00_DEBUG_ID_LOWER MW(2207:2176) +#define NVCEC0_QMDV05_00_DEBUG_ID_UPPER MW(2239:2208) +#define NVCEC0_QMDV05_00_TPC_DISABLE_MASK(i) MW((2271+(i)*32):(2240+(i)*32)) +#define NVCEC0_QMDV05_00_INCOMPLETE_BOX_BASE_WIDTH_RESUME MW(2527:2496) +#define NVCEC0_QMDV05_00_INCOMPLETE_BOX_BASE_HEIGHT_RESUME MW(2543:2528) +#define NVCEC0_QMDV05_00_INCOMPLETE_BOX_BASE_DEPTH_RESUME MW(2559:2544) +#define NVCEC0_QMDV05_00_INCOMPLETE_BOX_OFFSET_WIDTH_RESUME MW(2563:2560) +#define NVCEC0_QMDV05_00_INCOMPLETE_BOX_OFFSET_HEIGHT_RESUME MW(2567:2564) +#define NVCEC0_QMDV05_00_QUEUE_ENTRIES_PER_CTA_LOG2 MW(2596:2592) +#define NVCEC0_QMDV05_00_HW_ONLY_INNER_GET MW(2654:2624) +#define NVCEC0_QMDV05_00_HW_ONLY_INNER_PUT MW(2686:2656) +#define NVCEC0_QMDV05_00_OUTER_PUT MW(3038:3008) +#define NVCEC0_QMDV05_00_OUTER_OVERFLOW MW(3039:3039) +#define NVCEC0_QMDV05_00_OUTER_GET MW(3070:3040) +#define NVCEC0_QMDV05_00_OUTER_STICKY_OVERFLOW MW(3071:3071) -// ?? 
-#define NVCEC0_QMDV05_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1213:1196) -#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT MW(456:456) -#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT__32 0x00000000 -#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 -#define NVCEC0_QMDV05_00_SAMPLER_INDEX MW(457:457) -#define NVCEC0_QMDV05_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 -#define NVCEC0_QMDV05_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 -#define NVCEC0_QMDV05_00_UNKNOWN_13 MW(159:152) // A4 -#define NVCEC0_QMDV05_00_SASS_VERSION MW(455:448) +/* +** Queue Meta Data, Version 05_00 (inferred arrays) + */ -#endif // #ifndef __CLCEC0QMD_H__ \ No newline at end of file +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(i) MW((336+(i)*8):(336+(i)*8)) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(i) MW((339+(i)*8):(337+(i)*8)) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(i) MW((340+(i)*8):(340+(i)*8)) +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_00_DEPENDENT_QMD_POINTER(i) MW((415+(i)*32):(384+(i)*32)) + + +/* +** Queue Meta Data, Version 05_01 + */ + +#define NVCEC0_QMDV05_01_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(29:0) +#define NVCEC0_QMDV05_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(30:30) +#define NVCEC0_QMDV05_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(31:31) +#define 
NVCEC0_QMDV05_01_HW_ONLY_SKED_NEXT_QMD_POINTER MW(63:32) +#define NVCEC0_QMDV05_01_INNER_GET MW(94:64) +#define NVCEC0_QMDV05_01_INNER_OVERFLOW MW(95:95) +#define NVCEC0_QMDV05_01_INNER_PUT MW(126:96) +#define NVCEC0_QMDV05_01_INNER_STICKY_OVERFLOW MW(127:127) +#define NVCEC0_QMDV05_01_DEPENDENCE_COUNTER MW(143:128) +#define NVCEC0_QMDV05_01_QMD_GROUP_ID MW(149:144) +#define NVCEC0_QMDV05_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(150:150) +#define NVCEC0_QMDV05_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_QMD_TYPE MW(153:151) +#define NVCEC0_QMDV05_01_QMD_TYPE_QUEUE 0x00000000 +#define NVCEC0_QMDV05_01_QMD_TYPE_GRID_NULL 0x00000001 +#define NVCEC0_QMDV05_01_QMD_TYPE_GRID_CTA 0x00000002 +#define NVCEC0_QMDV05_01_QMD_TYPE_GRID_GPC_CGA 0x00000003 +#define NVCEC0_QMDV05_01_QMD_TYPE_GRID_GPU_CGA 0x00000004 +#define NVCEC0_QMDV05_01_QMD_TYPE_GRID_GPU_GPC_CGA 0x00000005 +#define NVCEC0_QMDV05_01_NUM_SUB_TASKS_PER_TASK MW(157:154) +#define NVCEC0_QMDV05_01_REQUIRE_SCHEDULING_PCAS MW(158:158) +#define NVCEC0_QMDV05_01_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_TPC_DISABLE_MASK_VALID MW(159:159) +#define NVCEC0_QMDV05_01_TPC_DISABLE_MASK_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_TPC_DISABLE_MASK_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CIRCULAR_QUEUE_SIZE MW(184:160) +#define NVCEC0_QMDV05_01_HW_ONLY_DEPENDENCE_COUNTER MW(207:192) +#define NVCEC0_QMDV05_01_RESUME_SUB_TASK_ID MW(210:208) +#define NVCEC0_QMDV05_01_COMPLETED_SUB_TASK_MASK MW(218:211) +#define NVCEC0_QMDV05_01_GRID_WIDTH_RESUME MW(255:224) +#define NVCEC0_QMDV05_01_GRID_HEIGHT_RESUME MW(271:256) +#define NVCEC0_QMDV05_01_GRID_DEPTH_RESUME MW(287:272) +#define NVCEC0_QMDV05_01_RELEASE_ENABLE(i) MW((288+(i)*16):(288+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_ENABLE_FALSE 0x00000000 +#define 
NVCEC0_QMDV05_01_RELEASE_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_STRUCTURE_SIZE(i) MW((290+(i)*16):(289+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002 +#define NVCEC0_QMDV05_01_RELEASE_MEMBAR_TYPE(i) MW((291+(i)*16):(291+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_ENABLE(i) MW((292+(i)*16):(292+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP(i) MW((295+(i)*16):(293+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_ADD 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_MIN 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_MAX 0x00000002 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_INC 0x00000003 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_DEC 0x00000004 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_AND 0x00000005 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_OR 0x00000006 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_OP_RED_XOR 0x00000007 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_FORMAT(i) MW((297+(i)*16):(296+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_FORMAT_UNSIGNED 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_REDUCTION_FORMAT_SIGNED 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_TRAP_TYPE(i) MW((299+(i)*16):(298+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_TRAP_TYPE_TRAP_NONE 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_TRAP_TYPE_TRAP_UNCONDITIONAL 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL 0x00000002 +#define NVCEC0_QMDV05_01_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL_EXT 0x00000003 
+#define NVCEC0_QMDV05_01_RELEASE_PAYLOAD64B(i) MW((300+(i)*16):(300+(i)*16)) +#define NVCEC0_QMDV05_01_RELEASE_PAYLOAD64B_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_RELEASE_PAYLOAD64B_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_RESERVED_INFO(i) MW((303+(i)*16):(301+(i)*16)) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ENABLE MW(336:336) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ACTION MW(339:337) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_PREFETCH MW(340:340) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SELF_COPY_ON_COMPLETION MW(341:341) +#define NVCEC0_QMDV05_01_SELF_COPY_ON_COMPLETION_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SELF_COPY_ON_COMPLETION_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEMOTE_L2_EVICT_LAST MW(342:342) +#define NVCEC0_QMDV05_01_DEMOTE_L2_EVICT_LAST_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEMOTE_L2_EVICT_LAST_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DISABLE_AUTO_INVALIDATE MW(343:343) +#define NVCEC0_QMDV05_01_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ENABLE MW(344:344) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ACTION MW(347:345) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define 
NVCEC0_QMDV05_01_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_PREFETCH MW(348:348) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CORRELATION_ID_INTERNAL MW(349:349) +#define NVCEC0_QMDV05_01_CORRELATION_ID_INTERNAL_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_CORRELATION_ID_INTERNAL_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TASK_CHASING_ENABLE MW(350:350) +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TASK_CHASING_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TASK_CHASING_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SHARED_ALLOCATION_ENABLE MW(351:351) +#define NVCEC0_QMDV05_01_SHARED_ALLOCATION_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SHARED_ALLOCATION_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CORRELATION_ID MW(383:352) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD0_POINTER MW(415:384) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD1_POINTER MW(447:416) +#define NVCEC0_QMDV05_01_SASS_VERSION MW(455:448) +#define NVCEC0_QMDV05_01_API_VISIBLE_CALL_LIMIT MW(456:456) +#define NVCEC0_QMDV05_01_API_VISIBLE_CALL_LIMIT__32 0x00000000 +#define NVCEC0_QMDV05_01_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 +#define NVCEC0_QMDV05_01_SAMPLER_INDEX MW(457:457) +#define NVCEC0_QMDV05_01_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 +#define NVCEC0_QMDV05_01_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 +#define NVCEC0_QMDV05_01_CONSTANT_BANK_PREFETCH_PRE_MAX_SIZE_SHIFTED8 MW(463:458) +#define NVCEC0_QMDV05_01_QMD_MINOR_VERSION MW(467:464) +#define NVCEC0_QMDV05_01_QMD_MAJOR_VERSION MW(471:468) +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_HEADER_CACHE MW(472:472) +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 +#define 
NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(473:473) +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_DATA_CACHE MW(474:474) +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_DATA_CACHE MW(475:475) +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_INVALIDATE_INSTRUCTION_CACHE MW(476:476) +#define NVCEC0_QMDV05_01_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_CONSTANT_CACHE MW(477:477) +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE MW(478:478) +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE MW(479:479) +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_RELEASE_SEMAPHORE0_ADDR_LOWER MW(511:480) +#define NVCEC0_QMDV05_01_RELEASE_SEMAPHORE0_ADDR_UPPER MW(536:512) +#define NVCEC0_QMDV05_01_RELEASE_SEMAPHORE0_PAYLOAD_LOWER MW(575:544) +#define NVCEC0_QMDV05_01_RELEASE_SEMAPHORE0_PAYLOAD_UPPER MW(607:576) +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE 
MW(615:608) +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_ID MW(621:616) +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_INCR_ENABLE MW(622:622) +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_DECR_ENABLE MW(623:623) +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TYPE MW(625:624) +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 +#define NVCEC0_QMDV05_01_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE MW(626:626) +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_LATCH_ACQUIRE_INVALIDATE_CONSTANT_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_CTA_LAUNCH_QUEUE MW(627:627) +#define NVCEC0_QMDV05_01_FREE_CTA_SLOTS_EMPTY_SM MW(635:628) +#define NVCEC0_QMDV05_01_SYNC_DOMAIN_ID MW(637:636) +#define NVCEC0_QMDV05_01_PRE_EXIT_AT_LAST_CTA_LAUNCH MW(638:638) +#define NVCEC0_QMDV05_01_PRE_EXIT_AT_LAST_CTA_LAUNCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_PRE_EXIT_AT_LAST_CTA_LAUNCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_ENABLE_PROGRAM_PRE_EXIT MW(639:639) +#define NVCEC0_QMDV05_01_ENABLE_PROGRAM_PRE_EXIT_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_ENABLE_PROGRAM_PRE_EXIT_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_ARRIVE_AT_LATCH_ID MW(671:640) +#define NVCEC0_QMDV05_01_WAIT_ON_LATCH_ID MW(703:672) +#define NVCEC0_QMDV05_01_OCCUPANCY_THRESHOLD_SHARED_MEM MW(721:714) +#define NVCEC0_QMDV05_01_OCCUPANCY_MAX_SHARED_MEM MW(729:722) +#define NVCEC0_QMDV05_01_ARRIVE_AT_LATCH_VALID MW(730:730) +#define NVCEC0_QMDV05_01_ARRIVE_AT_LATCH_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_ARRIVE_AT_LATCH_VALID_TRUE 
0x00000001 +#define NVCEC0_QMDV05_01_WAIT_ON_LATCH_VALID MW(731:731) +#define NVCEC0_QMDV05_01_WAIT_ON_LATCH_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_WAIT_ON_LATCH_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_LATCH_RELEASE_INVALIDATE_ENABLE MW(732:732) +#define NVCEC0_QMDV05_01_LATCH_RELEASE_INVALIDATE_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_LATCH_RELEASE_INVALIDATE_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE MW(733:733) +#define NVCEC0_QMDV05_01_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_HOLD_CTA_LAUNCH_UNTIL_PARENT_LATCH_ACQUIRE_AND_CTA_COMPLETE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE MW(734:734) +#define NVCEC0_QMDV05_01_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_HOLD_MEMBAR_UNTIL_LATCH_ACQUIRE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE MW(735:735) +#define NVCEC0_QMDV05_01_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_PRIORITY_DEMOTE_UNTIL_LATCH_ACQUIRE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_OCCUPANCY_THRESHOLD_WARP MW(743:736) +#define NVCEC0_QMDV05_01_OCCUPANCY_MAX_WARP MW(751:744) +#define NVCEC0_QMDV05_01_OCCUPANCY_THRESHOLD_REGISTER MW(759:752) +#define NVCEC0_QMDV05_01_OCCUPANCY_MAX_REGISTER MW(767:760) +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_ADDRESS_LOWER_SHIFTED4(i) MW((799+(i)*416):(768+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_ADDRESS_UPPER_SHIFTED4(i) MW((820+(i)*416):(800+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_PREFETCH_SIZE(i) MW((829+(i)*416):(821+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_PREFETCH_TYPE(i) MW((831+(i)*416):(830+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001 +#define 
NVCEC0_QMDV05_01_SUB_TASK_CTA_THREAD_DIMENSION0(i) MW((847+(i)*416):(832+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CTA_THREAD_DIMENSION1(i) MW((863+(i)*416):(848+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CTA_THREAD_DIMENSION2(i) MW((871+(i)*416):(864+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_REGISTER_COUNT(i) MW((880+(i)*416):(872+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_BARRIER_COUNT(i) MW((885+(i)*416):(881+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_VALID(i) MW((886+(i)*416):(886+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_PREFETCH(i) MW((888+(i)*416):(887+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_PREFETCH_PREFETCH_NONE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_PREFETCH_PREFETCH_PRE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_PREFETCH_PREFETCH_POST 0x00000002 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_INVALIDATE(i) MW((889+(i)*416):(889+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_ICC_PREFETCH_SIZE(i) MW((895+(i)*416):(890+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_SHARED_MEMORY_SIZE_SHIFTED7(i) MW((906+(i)*416):(896+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_MIN_SM_CONFIG_SHARED_MEM_SIZE(i) MW((912+(i)*416):(907+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_MAX_SM_CONFIG_SHARED_MEM_SIZE(i) MW((918+(i)*416):(913+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_TARGET_SM_CONFIG_SHARED_MEM_SIZE(i) MW((924+(i)*416):(919+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_SHARED_MEM_BARRIER_INIT_ENABLE(i) MW((925+(i)*416):(925+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_SHADER_LOCAL_MEMORY_LOW_SIZE_SHIFTED4(i) MW((943+(i)*416):(928+(i)*416)) +#define 
NVCEC0_QMDV05_01_SUB_TASK_SHADER_LOCAL_MEMORY_HIGH_SIZE_SHIFTED4(i) MW((959+(i)*416):(944+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_ADDR_LOWER_SHIFTED6(i) MW((991+(i)*416):(960+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_ADDR_UPPER_SHIFTED6(i) MW((1010+(i)*416):(992+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK0_SIZE_SHIFTED4(i) MW((1023+(i)*416):(1011+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_ADDR_LOWER_SHIFTED6(i) MW((1055+(i)*416):(1024+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_ADDR_UPPER_SHIFTED6(i) MW((1074+(i)*416):(1056+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_SIZE_SHIFTED4(i) MW((1087+(i)*416):(1075+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_VALID(i) MW((1088+(i)*416):(1088+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_PREFETCH(i) MW((1090+(i)*416):(1089+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_PREFETCH_PREFETCH_NONE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_PREFETCH_PREFETCH_PRE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_PREFETCH_PREFETCH_POST 0x00000002 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_INVALIDATE(i) MW((1091+(i)*416):(1091+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK1_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_VIRTUAL_RESOURCE_COUNT(i) MW((1099+(i)*416):(1092+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_GRID_WIDTH(i) MW((1151+(i)*416):(1120+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_GRID_HEIGHT(i) MW((1167+(i)*416):(1152+(i)*416)) +#define NVCEC0_QMDV05_01_SUB_TASK_GRID_DEPTH(i) MW((1183+(i)*416):(1168+(i)*416)) + + +/* +** Queue Meta Data, Version 05_01 (inferred arrays) + */ + +#define 
NVCEC0_QMDV05_01_DEPENDENT_QMD_ENABLE(i) MW((336+(i)*8):(336+(i)*8)) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ACTION(i) MW((339+(i)*8):(337+(i)*8)) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_PREFETCH(i) MW((340+(i)*8):(340+(i)*8)) +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_DEPENDENT_QMD_POINTER(i) MW((415+(i)*32):(384+(i)*32)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_VALID(i,j) MW((886+(i)*416+(j)*202):(886+(i)*416+(j)*202)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_PREFETCH(i,j) MW((888+(i)*416+(j)*202):(887+(i)*416+(j)*202)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_PREFETCH_PREFETCH_NONE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_PREFETCH_PREFETCH_PRE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_PREFETCH_PREFETCH_POST 0x00000002 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_INVALIDATE(i,j) MW((889+(i)*416+(j)*202):(889+(i)*416+(j)*202)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_ADDR_LOWER_SHIFTED6(i,j) MW((991+(i)*416+(j)*64):(960+(i)*416+(j)*64)) +#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_ADDR_UPPER_SHIFTED6(i,j) MW((1010+(i)*416+(j)*64):(992+(i)*416+(j)*64)) 
+#define NVCEC0_QMDV05_01_SUB_TASK_CONSTANT_BANK_SIZE_SHIFTED4(i,j) MW((1023+(i)*416+(j)*64):(1011+(i)*416+(j)*64)) + + +/* +** Queue Meta Data, Version 04_01 + */ + +#define NVCEC0_QMDV04_01_DEPENDENCE_COUNTER MW(15:0) +#define NVCEC0_QMDV04_01_QMD_GROUP_ID MW(21:16) +#define NVCEC0_QMDV04_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(22:22) +#define NVCEC0_QMDV04_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_QMD_TYPE MW(25:23) +#define NVCEC0_QMDV04_01_QMD_TYPE_QUEUE 0x00000000 +#define NVCEC0_QMDV04_01_QMD_TYPE_GRID_NULL 0x00000001 +#define NVCEC0_QMDV04_01_QMD_TYPE_GRID_CTA 0x00000002 +#define NVCEC0_QMDV04_01_QMD_TYPE_GRID_GPC_CGA 0x00000003 +#define NVCEC0_QMDV04_01_QMD_TYPE_GRID_GPU_CGA 0x00000004 +#define NVCEC0_QMDV04_01_QMD_TYPE_GRID_GPU_GPC_CGA 0x00000005 +#define NVCEC0_QMDV04_01_ARRIVE_AT_LATCH_VALID MW(28:28) +#define NVCEC0_QMDV04_01_WAIT_ON_LATCH_VALID MW(29:29) +#define NVCEC0_QMDV04_01_REQUIRE_SCHEDULING_PCAS MW(30:30) +#define NVCEC0_QMDV04_01_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_TPC_DISABLE_MASK_VALID MW(31:31) +#define NVCEC0_QMDV04_01_TPC_DISABLE_MASK_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_TPC_DISABLE_MASK_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CIRCULAR_QUEUE_SIZE MW(56:32) +#define NVCEC0_QMDV04_01_INNER_GET MW(94:64) +#define NVCEC0_QMDV04_01_INNER_OVERFLOW MW(95:95) +#define NVCEC0_QMDV04_01_INNER_PUT MW(126:96) +#define NVCEC0_QMDV04_01_INNER_STICKY_OVERFLOW MW(127:127) +#define NVCEC0_QMDV04_01_HW_ONLY_INNER_GET MW(190:160) +#define NVCEC0_QMDV04_01_HW_ONLY_INNER_PUT MW(222:192) +#define NVCEC0_QMDV04_01_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(253:224) +#define NVCEC0_QMDV04_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(254:254) +#define NVCEC0_QMDV04_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000 +#define 
NVCEC0_QMDV04_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_HW_ONLY_SKED_NEXT_QMD_POINTER MW(287:256) +#define NVCEC0_QMDV04_01_HW_ONLY_DEPENDENCE_COUNTER MW(303:288) +#define NVCEC0_QMDV04_01_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(304:304) +#define NVCEC0_QMDV04_01_RELEASE_ENABLE(i) MW((320+(i)*16):(320+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_STRUCTURE_SIZE(i) MW((322+(i)*16):(321+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002 +#define NVCEC0_QMDV04_01_RELEASE_MEMBAR_TYPE(i) MW((323+(i)*16):(323+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_ENABLE(i) MW((324+(i)*16):(324+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP(i) MW((327+(i)*16):(325+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_ADD 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_MIN 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_MAX 0x00000002 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_INC 0x00000003 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_DEC 0x00000004 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_AND 0x00000005 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_OR 0x00000006 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_OP_RED_XOR 0x00000007 +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_FORMAT(i) MW((329+(i)*16):(328+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_REDUCTION_FORMAT_UNSIGNED 0x00000000 +#define 
NVCEC0_QMDV04_01_RELEASE_REDUCTION_FORMAT_SIGNED 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_TRAP_TYPE(i) MW((331+(i)*16):(330+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_TRAP_TYPE_TRAP_NONE 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_TRAP_TYPE_TRAP_UNCONDITIONAL 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL 0x00000002 +#define NVCEC0_QMDV04_01_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL_EXT 0x00000003 +#define NVCEC0_QMDV04_01_RELEASE_PAYLOAD64B(i) MW((332+(i)*16):(332+(i)*16)) +#define NVCEC0_QMDV04_01_RELEASE_PAYLOAD64B_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_RELEASE_PAYLOAD64B_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_RESERVED_INFO(i) MW((335+(i)*16):(333+(i)*16)) +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ENABLE(i) MW((368+(i)*5):(368+(i)*5)) +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ACTION(i) MW((371+(i)*5):(369+(i)*5)) +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_PREFETCH(i) MW((372+(i)*5):(372+(i)*5)) +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_SELF_COPY_ON_COMPLETION MW(378:378) +#define NVCEC0_QMDV04_01_SELF_COPY_ON_COMPLETION_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_SELF_COPY_ON_COMPLETION_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_DEMOTE_L2_EVICT_LAST MW(379:379) +#define NVCEC0_QMDV04_01_DEMOTE_L2_EVICT_LAST_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_DEMOTE_L2_EVICT_LAST_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_DISABLE_AUTO_INVALIDATE MW(380:380) +#define 
NVCEC0_QMDV04_01_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CORRELATION_ID_INTERNAL MW(381:381) +#define NVCEC0_QMDV04_01_CORRELATION_ID_INTERNAL_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CORRELATION_ID_INTERNAL_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TASK_CHASING_ENABLE MW(382:382) +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TASK_CHASING_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TASK_CHASING_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CORRELATION_ID MW(415:384) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_VALID(i) MW((416+(i)*4):(416+(i)*4)) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_VALID_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_VALID_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_PREFETCH(i) MW((418+(i)*4):(417+(i)*4)) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_PREFETCH_PREFETCH_NONE 0x00000000 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_PREFETCH_PREFETCH_PRE 0x00000001 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_PREFETCH_PREFETCH_POST 0x00000002 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_INVALIDATE(i) MW((419+(i)*4):(419+(i)*4)) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_DEPENDENT_QMD0_POINTER MW(479:448) +#define NVCEC0_QMDV04_01_DEPENDENT_QMD1_POINTER MW(511:480) +#define NVCEC0_QMDV04_01_SHADER_LOCAL_MEMORY_LOW_SIZE MW(535:512) +#define NVCEC0_QMDV04_01_SASS_VERSION MW(543:536) +#define NVCEC0_QMDV04_01_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(567:544) +#define NVCEC0_QMDV04_01_API_VISIBLE_CALL_LIMIT MW(568:568) +#define NVCEC0_QMDV04_01_API_VISIBLE_CALL_LIMIT__32 0x00000000 +#define NVCEC0_QMDV04_01_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001 +#define NVCEC0_QMDV04_01_SAMPLER_INDEX MW(569:569) +#define NVCEC0_QMDV04_01_SAMPLER_INDEX_INDEPENDENTLY 0x00000000 +#define 
NVCEC0_QMDV04_01_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001 +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_PREFETCH_PRE_MAX_SIZE_SHIFTED8 MW(575:570) +#define NVCEC0_QMDV04_01_QMD_MINOR_VERSION MW(579:576) +#define NVCEC0_QMDV04_01_QMD_MAJOR_VERSION MW(583:580) +#define NVCEC0_QMDV04_01_SHARED_MEMORY_SIZE MW(601:584) +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_HEADER_CACHE MW(602:602) +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(603:603) +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_DATA_CACHE MW(604:604) +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_DATA_CACHE MW(605:605) +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_INVALIDATE_INSTRUCTION_CACHE MW(606:606) +#define NVCEC0_QMDV04_01_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_CONSTANT_CACHE MW(607:607) +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(613:608) +#define NVCEC0_QMDV04_01_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(619:614) +#define NVCEC0_QMDV04_01_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(625:620) +#define NVCEC0_QMDV04_01_SHARED_ALLOCATION_ENABLE MW(626:626) +#define NVCEC0_QMDV04_01_SHARED_ALLOCATION_ENABLE_FALSE 0x00000000 +#define 
NVCEC0_QMDV04_01_SHARED_ALLOCATION_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE0_ADDR_LOWER MW(671:640) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE0_ADDR_UPPER MW(696:672) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE0_PAYLOAD_LOWER MW(735:704) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE0_PAYLOAD_UPPER MW(767:736) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE1_ADDR_LOWER MW(799:768) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE1_ADDR_UPPER MW(824:800) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE1_PAYLOAD_LOWER MW(863:832) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE1_PAYLOAD_UPPER MW(895:864) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE2_ADDR_LOWER MW(927:896) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE2_ADDR_UPPER MW(952:928) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE2_PAYLOAD_LOWER MW(991:960) +#define NVCEC0_QMDV04_01_RELEASE_SEMAPHORE2_PAYLOAD_UPPER MW(1023:992) +#define NVCEC0_QMDV04_01_GRID_WIDTH MW(1055:1024) +#define NVCEC0_QMDV04_01_GRID_HEIGHT MW(1071:1056) +#define NVCEC0_QMDV04_01_GRID_DEPTH MW(1103:1088) +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(1127:1120) +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_ID MW(1133:1128) +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_INCR_ENABLE MW(1134:1134) +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_DECR_ENABLE MW(1135:1135) +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TYPE MW(1137:1136) +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TYPE_L1_NONE 0x00000000 +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001 +#define NVCEC0_QMDV04_01_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003 +#define NVCEC0_QMDV04_01_SEQUENTIALLY_RUN_CTAS MW(1138:1138) +#define NVCEC0_QMDV04_01_SEQUENTIALLY_RUN_CTAS_FALSE 
0x00000000 +#define NVCEC0_QMDV04_01_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CTA_LAUNCH_QUEUE MW(1139:1139) +#define NVCEC0_QMDV04_01_FREE_CTA_SLOTS_EMPTY_SM MW(1147:1140) +#define NVCEC0_QMDV04_01_SYNC_DOMAIN_ID MW(1149:1148) +#define NVCEC0_QMDV04_01_PRE_EXIT_AT_LAST_CTA_LAUNCH MW(1150:1150) +#define NVCEC0_QMDV04_01_PRE_EXIT_AT_LAST_CTA_LAUNCH_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_PRE_EXIT_AT_LAST_CTA_LAUNCH_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_ENABLE_PROGRAM_PRE_EXIT MW(1151:1151) +#define NVCEC0_QMDV04_01_ENABLE_PROGRAM_PRE_EXIT_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_ENABLE_PROGRAM_PRE_EXIT_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_CTA_THREAD_DIMENSION0 MW(1167:1152) +#define NVCEC0_QMDV04_01_CTA_THREAD_DIMENSION1 MW(1183:1168) +#define NVCEC0_QMDV04_01_CTA_THREAD_DIMENSION2 MW(1191:1184) +#define NVCEC0_QMDV04_01_VIRTUAL_RESOURCE_COUNT MW(1199:1192) +#define NVCEC0_QMDV04_01_REGISTER_COUNT MW(1208:1200) +#define NVCEC0_QMDV04_01_SHARED_MEM_BARRIER_INIT_ENABLE MW(1210:1210) +#define NVCEC0_QMDV04_01_BARRIER_COUNT MW(1215:1211) +#define NVCEC0_QMDV04_01_PROGRAM_ADDRESS_LOWER MW(1247:1216) +#define NVCEC0_QMDV04_01_PROGRAM_ADDRESS_UPPER MW(1272:1248) +#define NVCEC0_QMDV04_01_OCCUPANCY_THRESHOLD_WARP MW(1287:1280) +#define NVCEC0_QMDV04_01_OCCUPANCY_MAX_WARP MW(1295:1288) +#define NVCEC0_QMDV04_01_OCCUPANCY_THRESHOLD_REGISTER MW(1303:1296) +#define NVCEC0_QMDV04_01_OCCUPANCY_MAX_REGISTER MW(1311:1304) +#define NVCEC0_QMDV04_01_OCCUPANCY_THRESHOLD_SHARED_MEM MW(1319:1312) +#define NVCEC0_QMDV04_01_OCCUPANCY_MAX_SHARED_MEM MW(1327:1320) +#define NVCEC0_QMDV04_01_ICC_PREFETCH_SIZE MW(1333:1328) +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(1375:1344) +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1392:1376) +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_SIZE MW(1401:1393) +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_TYPE MW(1403:1402) +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 
0x00000000 +#define NVCEC0_QMDV04_01_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001 +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE MW(1406:1406) +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE MW(1407:1407) +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000 +#define NVCEC0_QMDV04_01_LATCH_ACQUIRE_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001 +#define NVCEC0_QMDV04_01_GRID_WIDTH_RESUME MW(1439:1408) +#define NVCEC0_QMDV04_01_GRID_HEIGHT_RESUME MW(1455:1440) +#define NVCEC0_QMDV04_01_GRID_DEPTH_RESUME MW(1471:1456) +#define NVCEC0_QMDV04_01_ARRIVE_AT_LATCH_ID MW(1503:1472) +#define NVCEC0_QMDV04_01_WAIT_ON_LATCH_ID MW(1535:1504) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_ADDR_LOWER_SHIFTED6(i) MW((1567+(i)*64):(1536+(i)*64)) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_ADDR_UPPER_SHIFTED6(i) MW((1586+(i)*64):(1568+(i)*64)) +#define NVCEC0_QMDV04_01_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1599+(i)*64):(1587+(i)*64)) +#define NVCEC0_QMDV04_01_COALESCE_WAITING_PERIOD MW(2135:2128) +#define NVCEC0_QMDV04_01_QUEUE_ENTRIES_PER_CTA_LOG2 MW(2140:2136) +#define NVCEC0_QMDV04_01_GPC_CGA_WIDTH MW(2149:2144) +#define NVCEC0_QMDV04_01_GPC_CGA_HEIGHT MW(2157:2152) +#define NVCEC0_QMDV04_01_GPC_CGA_DEPTH MW(2165:2160) +#define NVCEC0_QMDV04_01_LARGE_GPC_CGA_WIDTH_MINUS_ONE MW(2171:2168) +#define NVCEC0_QMDV04_01_LARGE_GPC_CGA_HEIGHT_MINUS_ONE MW(2175:2172) +#define NVCEC0_QMDV04_01_CGA_CTA_DISTRIBUTION_MODE MW(2207:2207) +#define NVCEC0_QMDV04_01_CGA_CTA_DISTRIBUTION_MODE_LOAD_BALANCING 0x00000000 +#define NVCEC0_QMDV04_01_CGA_CTA_DISTRIBUTION_MODE_MULTI_CAST 0x00000001 +#define NVCEC0_QMDV04_01_GPU_CGA_WIDTH MW(2223:2208) +#define NVCEC0_QMDV04_01_GPU_CGA_HEIGHT MW(2239:2224) +#define NVCEC0_QMDV04_01_GPU_CGA_DEPTH MW(2255:2240) +#define 
NVCEC0_QMDV04_01_DEBUG_ID_LOWER MW(2399:2368) +#define NVCEC0_QMDV04_01_DEBUG_ID_UPPER MW(2431:2400) +#define NVCEC0_QMDV04_01_TPC_DISABLE_MASK(i) MW((2463+(i)*32):(2432+(i)*32)) +#define NVCEC0_QMDV04_01_INCOMPLETE_BOX_BASE_WIDTH_RESUME MW(2591:2560) +#define NVCEC0_QMDV04_01_INCOMPLETE_BOX_BASE_HEIGHT_RESUME MW(2607:2592) +#define NVCEC0_QMDV04_01_INCOMPLETE_BOX_BASE_DEPTH_RESUME MW(2623:2608) +#define NVCEC0_QMDV04_01_INCOMPLETE_BOX_OFFSET_WIDTH_RESUME MW(2627:2624) +#define NVCEC0_QMDV04_01_INCOMPLETE_BOX_OFFSET_HEIGHT_RESUME MW(2631:2628) +#define NVCEC0_QMDV04_01_TPC_DISABLE_MASK_UPPER(i) MW((2719+(i)*32):(2688+(i)*32)) +#define NVCEC0_QMDV04_01_OUTER_PUT MW(3038:3008) +#define NVCEC0_QMDV04_01_OUTER_OVERFLOW MW(3039:3039) +#define NVCEC0_QMDV04_01_OUTER_GET MW(3070:3040) +#define NVCEC0_QMDV04_01_OUTER_STICKY_OVERFLOW MW(3071:3071) + + + +#endif // #ifndef __CLCEC0QMD_H__ diff --git a/tinygrad_repo/extra/onnx.py b/tinygrad_repo/extra/onnx.py index 04cad32..4cde0ec 100644 --- a/tinygrad_repo/extra/onnx.py +++ b/tinygrad_repo/extra/onnx.py @@ -1,15 +1,20 @@ +from types import SimpleNamespace from typing import Any, Sequence, cast, Literal, Callable -import dataclasses, functools, io, math, types +import dataclasses, functools, io, math, types, warnings from tinygrad.tensor import Tensor, _broadcast_shape, ReductionStr from tinygrad.helpers import getenv, DEBUG, all_same, prod, flatten, make_tuple, argsort from tinygrad.dtype import DType, ConstType, dtypes, ImageDType -from tinygrad.device import is_dtype_supported +from tinygrad.device import is_dtype_supported, Device # ***** protobuf parsing ****** from onnx import AttributeProto, ModelProto, TensorProto, TypeProto, helper import numpy as np -def dtype_parse(onnx_dtype: int) -> DType: +def has_field(onnx_type: TypeProto|SimpleNamespace, field): + if isinstance(onnx_type, TypeProto): return onnx_type.HasField(field) + return hasattr(onnx_type, field) + +def dtype_parse(onnx_dtype: int, fallback_context: str 
| None = None) -> DType: supported: dict[int, DType] = { TensorProto.FLOAT:dtypes.float32, TensorProto.UINT8:dtypes.uint8, TensorProto.INT8:dtypes.int8, TensorProto.UINT16:dtypes.uint16, TensorProto.INT16:dtypes.int16, TensorProto.INT32:dtypes.int32, TensorProto.INT64:dtypes.int64, @@ -21,14 +26,21 @@ def dtype_parse(onnx_dtype: int) -> DType: TensorProto.FLOAT8E5M2, TensorProto.FLOAT8E5M2FNUZ, TensorProto.UINT4, TensorProto.INT4 } if onnx_dtype in unsupported: raise NotImplementedError(f"onnx dtype {TensorProto.DataType.Name(onnx_dtype)} is not supported") - return supported[onnx_dtype] if is_dtype_supported(supported[onnx_dtype]) else dtypes.float + if is_dtype_supported(dtype := supported[onnx_dtype]): return dtype + # if fallback_context is provided, we can fall back to a default dtype + if fallback_context is not None: + default_dtype = dtypes.float + warnings.warn(f"dtype {dtype} on {Device.DEFAULT} from {fallback_context} is not supported, falling back to {default_dtype}") + return default_dtype + raise RuntimeError(f"dtype {dtype} on device {Device.DEFAULT} is not supported") def attribute_parse(onnx_attribute: AttributeProto): supported: dict[AttributeProto.AttributeType, Callable[[AttributeProto], Any]] = { AttributeProto.FLOAT: lambda a: float(a.f), AttributeProto.INT: lambda a: int(a.i), - AttributeProto.STRING: lambda a: a.s.decode("utf-8"), AttributeProto.TENSOR: lambda a: buffer_parse(a.t), + AttributeProto.STRING: lambda a: a.s.data().tobytes().decode("utf8") if isinstance(a.s, Tensor) else a.s.decode("utf8"), + AttributeProto.TENSOR: lambda a: buffer_parse(a.t), AttributeProto.FLOATS: lambda a: tuple(float(x) for x in a.floats), AttributeProto.INTS: lambda a: tuple(int(x) for x in a.ints), - AttributeProto.STRINGS: lambda a: tuple(x.decode("utf-8") for x in a.strings) + AttributeProto.STRINGS: lambda a: tuple(x.data().tobytes().decode("utf8") for x in a.strings) } unsupported = { AttributeProto.UNDEFINED, AttributeProto.GRAPH, 
AttributeProto.SPARSE_TENSOR, AttributeProto.TYPE_PROTO, AttributeProto.TENSORS, @@ -40,26 +52,39 @@ def attribute_parse(onnx_attribute: AttributeProto): def buffer_parse(onnx_tensor: TensorProto) -> Tensor: if onnx_tensor.string_data: raise NotImplementedError("Parsing for buffer with string data is not implemented.") - dtype, shape = dtype_parse(onnx_tensor.data_type), tuple(onnx_tensor.dims) - if data := list(onnx_tensor.float_data) or list(onnx_tensor.int32_data) or list(onnx_tensor.int64_data) or list(onnx_tensor.double_data) or \ - list(onnx_tensor.uint64_data): - if len(data) == 1: return Tensor(data[0], dtype=dtype).reshape(shape) - return Tensor(data, dtype=dtype).reshape(shape).realize() - if onnx_tensor.HasField("raw_data"): - np_buffer = np.frombuffer(onnx_tensor.raw_data, dtype=helper.tensor_dtype_to_np_dtype(onnx_tensor.data_type)).copy().reshape(shape) - if np_buffer.size == 1: return Tensor(np_buffer.item(), dtype=dtype).reshape(shape) - return Tensor(np_buffer, dtype=dtype) + dtype, shape = dtype_parse(onnx_tensor.data_type, "buffer parse"), tuple(onnx_tensor.dims) + data = None + if len(onnx_tensor.float_data): data = onnx_tensor.float_data + elif len(onnx_tensor.int32_data): data = onnx_tensor.int32_data + elif len(onnx_tensor.int64_data): data = onnx_tensor.int64_data + elif len(onnx_tensor.double_data): data = onnx_tensor.double_data + elif len(onnx_tensor.uint64_data): data = onnx_tensor.uint64_data + if isinstance(data, Tensor): + if len(data) == 1: return Tensor(data.tolist()[0], dtype=dtype).reshape(shape) + return data.cast(dtype).reshape(shape).to(Device.DEFAULT) + if has_field(onnx_tensor, "raw_data"): + raw_data = onnx_tensor.raw_data + if not isinstance(raw_data, Tensor): raw_data = Tensor(raw_data) + if onnx_tensor.data_type == TensorProto.FLOAT16: + np_buffer = np.frombuffer(raw_data.data().tobytes(), + dtype=helper.tensor_dtype_to_np_dtype(onnx_tensor.data_type)).copy().reshape(shape) + if np_buffer.size == 1: return 
Tensor(np_buffer.item(), dtype=dtype).reshape(shape) + return Tensor(np_buffer, dtype=dtype) + ret = raw_data.bitcast(dtype).reshape(shape).to(Device.DEFAULT) + if shape == (): ret = Tensor(ret.item(), dtype=dtype).reshape(shape) + return ret return Tensor(None) def type_parse(onnx_type: TypeProto): elem_type = onnx_type - if elem_type.HasField("map_type") or elem_type.HasField("sparse_tensor_type") or elem_type.HasField("opaque_type"): + if has_field(elem_type, "map_type") or has_field(elem_type, "sparse_tensor_type") or has_field(elem_type, "opaque_type"): raise NotImplementedError("parsing for map_type, sparse_tensor_type and opaque_type are not implemented") - if is_optional := elem_type.HasField("optional_type"): elem_type = elem_type.optional_type.elem_type - if is_sequence := elem_type.HasField("sequence_type"): elem_type = elem_type.sequence_type.elem_type - if elem_type.HasField("tensor_type"): - shape = tuple(d.dim_param or d.dim_value for d in elem_type.tensor_type.shape.dim) - dtype = dtype_parse(elem_type.tensor_type.elem_type) + if is_optional := has_field(elem_type, "optional_type"): elem_type = elem_type.optional_type.elem_type + if is_sequence := has_field(elem_type, "sequence_type"): elem_type = elem_type.sequence_type.elem_type + if has_field(elem_type, "tensor_type"): + shape = tuple(getattr(d, "dim_param", None) or getattr(d, "dim_value") for d in elem_type.tensor_type.shape.dim) \ + if has_field(elem_type.tensor_type, "shape") else None # test_identity_sequence_cpu + dtype = dtype_parse(elem_type.tensor_type.elem_type, "input type spec parse") return OnnxValue(shape, dtype, is_optional, is_sequence) raise RuntimeError(f"TypeProto was not parsed properly: {onnx_type=}") @@ -109,12 +134,11 @@ def to_python_const(t:Any, op:str, idx:int) -> list[ConstType]|ConstType|bytes: debug = int(getenv("DEBUGONNX", "0")) limit = int(getenv("ONNXLIMIT", "-1")) class OnnxRunner: - def __init__(self, model: ModelProto): + def __init__(self, model: 
ModelProto|SimpleNamespace): # parse model protobuf self.is_training = any(n.domain in {"ai.onnx.training", "ai.onnx.preview.training"} for n in model.graph.node) - self.old_training, self.old_no_grad = Tensor.training, Tensor.no_grad + self.old_training = Tensor.training Tensor.training = True if self.is_training else False - Tensor.no_grad = False if self.is_training else True self.graph_values = {"": None, **{x.name:buffer_parse(x) for x in model.graph.initializer}} self.graph_inputs = {x.name:type_parse(x.type) for x in model.graph.input if x.name not in self.graph_values} self.graph_outputs = tuple(x.name for x in model.graph.output) @@ -129,15 +153,15 @@ class OnnxRunner: if spec.is_optional and value is None: return None # TODO: need true float16 for dtype checking if spec.is_sequence: - if not isinstance(value, Sequence): raise RuntimeError(f"{name} received {value}, expected a sequence type") + if not isinstance(value, Sequence): raise RuntimeError(f"input {name} received {value}, expected a sequence type") sequence = [Tensor(v, dtype=spec.dtype, requires_grad=self.is_training) if not isinstance(v, Tensor) else v for v in value] - if not all_same(tuple(t.shape for t in sequence)): raise RuntimeError(f"Shapes for {name} sequence must be homogeneous") + if not all_same(tuple(t.shape for t in sequence)): raise RuntimeError(f"Shapes for input {name} sequence must be homogeneous") return sequence tensor = Tensor(value, dtype=spec.dtype, requires_grad=self.is_training) if not isinstance(value, Tensor) else value for dim, (onnx_dim, user_dim_input) in enumerate(zip(spec.shape, tensor.shape, strict=True)): if isinstance(onnx_dim, str): onnx_dim = self.variable_dims[onnx_dim] if onnx_dim in self.variable_dims else self.variable_dims.setdefault(onnx_dim, int(user_dim_input)) - if user_dim_input != onnx_dim: raise RuntimeError(f"{name} has mismatch on {dim=}. 
Expected {onnx_dim}, received {user_dim_input}.") + if user_dim_input != onnx_dim: raise RuntimeError(f"input {name} has mismatch on {dim=}. Expected {onnx_dim}, received {user_dim_input}.") return tensor def _dispatch_op(self, op, inps, opts): @@ -176,9 +200,9 @@ class OnnxRunner: self.graph_values.update(dict(zip(node.outputs, ret[:len(node.outputs)], strict=True))) if node.num == limit: - Tensor.training, Tensor.no_grad = self.old_training, self.old_no_grad + Tensor.training = self.old_training return {name:self.graph_values[name] for name in node.outputs} - Tensor.training, Tensor.no_grad = self.old_training, self.old_no_grad + Tensor.training = self.old_training return {name:self.graph_values[name] for name in self.graph_outputs} #################### @@ -268,7 +292,7 @@ def get_onnx_ops(): raise ValueError(f"pixel_format={pixel_format!r} is not supported.") def EyeLike(x:Tensor, dtype:int|None=None, k:int=0): - ret = Tensor.eye(cast(int, min(x.shape)), dtype=dtype_parse(dtype) if dtype is not None else x.dtype) + ret = Tensor.eye(cast(int, min(x.shape)), dtype=dtype_parse(dtype, "EyeLike op") if dtype is not None else x.dtype) return ret if x.size(0) == x.size(1) else ret.pad(tuple(None if d == ret.size(0) else (k, d-ret.shape[0]-k) for d in x.shape)) def OptionalHasElement(x:Tensor|None=None): return Tensor(x is not None and x.numel() > 0) @@ -322,7 +346,7 @@ def get_onnx_ops(): # ***** Casting Ops ***** # TODO: saturate - def Cast(x:Tensor, to:int, saturate:int=1): return x.cast(dtype_parse(to)) + def Cast(x:Tensor, to:int, saturate:int=1): return x.cast(dtype_parse(to, "Cast op")) def CastLike(x:Tensor, target_type:Tensor, saturate:int=1): return x.cast(target_type.dtype) # ***** Reduce Ops ***** @@ -715,7 +739,9 @@ def get_onnx_ops(): # ***** Quantization Ops ***** def QuantizeLinear(x:Tensor, y_scale:Tensor, y_zero_point:Tensor|int=0, axis:int=1, block_size:int=0, output_dtype:int=0, saturate=1): - out_dtype = y_zero_point.dtype if 
isinstance(y_zero_point, Tensor) else dtype_parse(output_dtype) if output_dtype else dtypes.uint8 + if isinstance(y_zero_point, Tensor): out_dtype = y_zero_point.dtype + elif output_dtype != 0: out_dtype = dtype_parse(output_dtype, "QuantizeLinear op") + else: out_dtype = dtypes.uint8 y_scale, y_zero_point = _prepare_quantize(x, y_scale, y_zero_point, axis, block_size) if out_dtype == dtypes.uchar: # this appears to work in practice, at least for uchar out_dtype. it folds with the quantize stuff diff --git a/tinygrad_repo/extra/onnx_helpers.py b/tinygrad_repo/extra/onnx_helpers.py index ab020fd..d2a8bb9 100644 --- a/tinygrad_repo/extra/onnx_helpers.py +++ b/tinygrad_repo/extra/onnx_helpers.py @@ -1,8 +1,7 @@ from tinygrad import Tensor from tinygrad.tensor import _to_np_dtype -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from extra.onnx import OnnxValue -import onnx import numpy as np import onnxruntime as ort @@ -47,7 +46,7 @@ def get_example_inputs(graph_inputs:dict[str, OnnxValue], config={}): return ret def validate(onnx_file, inputs, rtol=1e-5, atol=1e-5): - run_onnx = OnnxRunner(onnx.load(onnx_file)) + run_onnx = OnnxRunner(onnx_load(onnx_file)) ort_options = ort.SessionOptions() ort_options.log_severity_level = 3 diff --git a/tinygrad_repo/extra/onnx_parser.py b/tinygrad_repo/extra/onnx_parser.py new file mode 100644 index 0000000..7cb0795 --- /dev/null +++ b/tinygrad_repo/extra/onnx_parser.py @@ -0,0 +1,204 @@ +# https://github.com/onnx/onnx/blob/main/onnx/onnx.proto3 + +import os, pathlib, struct +from io import BufferedReader +from typing import Tuple, Union +from types import SimpleNamespace +from tinygrad.nn.state import TensorIO +from tinygrad.tensor import Tensor, dtypes + +# Protobuf Wire Types +WIRETYPE_VARINT = 0; WIRETYPE_FIXED64 = 1; WIRETYPE_LENGTH_DELIMITED = 2; WIRETYPE_START_GROUP = 3; WIRETYPE_END_GROUP = 4; WIRETYPE_FIXED32 = 5 # noqa: E702 + +# TensorProto.DataType +class 
TensorDataType: + UNDEFINED = 0; FLOAT = 1; UINT8 = 2; INT8 = 3; UINT16 = 4; INT16 = 5; INT32 = 6; INT64 = 7 # noqa: E702 + STRING = 8; BOOL = 9; FLOAT16 = 10; DOUBLE = 11; UINT32 = 12; UINT64 = 13; COMPLEX64 = 14; COMPLEX128 = 15; BFLOAT16 = 16 # noqa: E702 + +# AttributeProto.AttributeType +class AttributeType: + UNDEFINED = 0; FLOAT = 1; INT = 2; STRING = 3; TENSOR = 4; GRAPH = 5; SPARSE_TENSOR = 11; TYPE_PROTO = 13; FLOATS = 6; INTS = 7 # noqa: E702 + STRINGS = 8; TENSORS = 9; GRAPHS = 10; SPARSE_TENSORS = 12; TYPE_PROTOS = 14 # noqa: E702 + +class PBType: FLOAT = 1; INT = 2; STRING = 3; FLOATS = 4; INTS = 5; STRINGS = 6; BYTES = 7; SUB = 8 # noqa: E702 + +PB_INFOS = { + "OperatorSetIdProto": {1: ("domain", PBType.STRING), 2: ("version", PBType.INT)}, + "StringStringEntryProto": {1: ("key", PBType.STRING), 2: ("value", PBType.STRING)}, + "TensorProto": {1: ("dims", PBType.INT, True), 2: ("data_type", PBType.INT), 4: ("float_data", PBType.FLOATS), + 13: ("external_data", PBType.SUB, True, "StringStringEntryProto"), 14: ("data_location", PBType.INT), + 5: ("int32_data", PBType.INTS), 7: ("int64_data", PBType.INTS), 8: ("name", PBType.STRING), 9: ("raw_data", PBType.BYTES)}, + "TensorShapeProtoDimension": {1: ("dim_value", PBType.INT), 2: ("dim_param", PBType.STRING)}, + "TensorShapeProto": {1: ("dim", PBType.SUB, True, "TensorShapeProtoDimension")}, + "ModelProto": {1: ("ir_version", PBType.INT), 5: ("model_version", PBType.INT), + 2: ("producer_name", PBType.STRING), 3: ("producer_version", PBType.STRING), 4: ("domain", PBType.STRING), 6: ("doc_string", PBType.STRING), + 7: ("graph", PBType.SUB, False, ("GraphProto", lambda: {"node": [], "initializer": [], "input": [], "output": [], "value_info": []})), + 8: ("opset_import",PBType.SUB, True, "OperatorSetIdProto")}, + "GraphProto": {2: ("name", PBType.STRING), 10: ("doc_string", PBType.STRING), + 1: ("node", PBType.SUB, True, ("NodeProto", lambda: {"input": [], "output": [], "attribute": [], "domain": None})), + 
5: ("initializer", PBType.SUB, True, ("TensorProto", lambda: {"dims": [], "float_data": [], "int32_data": [], "string_data": [], + "int64_data": [], "double_data": [], "uint64_data": []})), + 11: ("input", PBType.SUB, True, "ValueInfoProto"), 12: ("output", PBType.SUB, True, "ValueInfoProto")}, + "NodeProto": { 1: ("input", PBType.STRING, True), 2: ("output", PBType.STRING, True), 3: ("name", PBType.STRING), + 4: ("op_type", PBType.STRING), 6: ("doc_string", PBType.STRING), 7: ("domain", PBType.STRING), + 5: ("attribute", PBType.SUB, True, ("AttributeProto", lambda: {"floats": [], "ints": [], "strings": []}))}, + "AttributeProto": {1: ("name", PBType.STRING), 20: ("type", PBType.INT), 3: ("i", PBType.INT), 8: ("ints", PBType.INT, True), + 2: ("f", PBType.FLOAT), 7: ("floats", PBType.FLOAT, True), 4: ("s", PBType.BYTES), 9: ("strings", PBType.BYTES, True), + 5:("t", PBType.SUB, False, ("TensorProto", lambda: {"dims": [], "float_data": [], "int32_data": [], "string_data": [], "int64_data": [], + "double_data": [], "uint64_data": []}))}, + "ValueInfoProto": {1: ("name", PBType.STRING), 2: ("type", PBType.SUB, False, "TypeProto"), 3: ("doc_string", PBType.STRING)}, + "TypeProto": {1: ("tensor_type", PBType.SUB, False, "TypeProtoTensor"), 4: ("sequence_type", PBType.SUB, False, "TypeProtoSequence"), + 9: ("optional_type", PBType.SUB, False, "TypeProtoOptional"), 6: ("denotation", PBType.STRING)}, + "TypeProtoSequence": {1: ("elem_type", PBType.SUB, False, "TypeProto")}, + "TypeProtoOptional": {1: ("elem_type", PBType.SUB, False, "TypeProto")}, + "TypeProtoTensor": {1: ("elem_type", PBType.INT), 2: ("shape", PBType.SUB, False, ("TensorShapeProto", lambda: {"dim": []}))}, +} + +def onnx_load(fn: Union[Tensor, str, pathlib.Path], load_external_data: bool=True): + parser = OnnxParser(fn, load_external_data) + onnx_model = parser.parse() + model = dict_to_namespace(onnx_model) + return model + +def gen_result(obj: dict, key_name, val, repeated: bool): + if repeated: 
obj.setdefault(key_name, []).append(val) + else: obj[key_name] = val + +def dict_to_namespace(d): + if isinstance(d, dict): return SimpleNamespace(**{k: dict_to_namespace(v) for k, v in d.items()}) + elif isinstance(d, list): return [dict_to_namespace(i) for i in d] + return d + +class OnnxParser: + def __init__(self, inp: Union[Tensor, str, pathlib.Path], load_external_data: bool=True): + self.file_path: Union[pathlib.Path, None] = None + self.load_external_data = load_external_data + if not isinstance(inp, Tensor): + self.file_path = pathlib.Path(inp) + self.tensor = Tensor(self.file_path) + else: self.tensor = inp + self.attr_func_dict = { PBType.BYTES: self._handle_bytes, PBType.SUB: self._handle_sub_message, PBType.FLOATS: self._handle_packed_floats, + PBType.INT: self._handle_int64, PBType.INTS: self._handle_packed_int64s, PBType.STRING: self._handle_string, PBType.FLOAT: self._handle_float} + self.registered_handles = {} + for pb_name in PB_INFOS: + res = {} + for fid, config in PB_INFOS[pb_name].items(): + parser_fn, repeated = None, False + if len(config) == 2: name, attr = config + elif len(config) == 3: name, attr, repeated = config + elif len(config) == 4: name, attr, repeated, parser_fn = config + handler_fn = self.attr_func_dict[attr] + def _wrapper_handler(obj, reader, wt, h=handler_fn, n=name, p=parser_fn, r=repeated): return h(obj, n, reader, wt, parser_func=p, repeated=r) + _wrapper_handler._debug_info = f"{fid}, {name} => {handler_fn}" + res[fid] = _wrapper_handler + self.registered_handles[pb_name] = res + + def parse(self): + reader = BufferedReader(TensorIO(self.tensor)) + return self._parse_message(reader, "ModelProto", lambda: {"opset_import": [], "domain": None, "graph": None}) + + def decode_varint(self, reader: BufferedReader) -> int: + result = 0 + shift = 0 + while True: + data = reader.read(1) + if data == b"": raise EOFError("decode_varint EOF") + result |= (data[0] & 0x7F) << shift + if not (data[0] & 0x80): return result + shift += 
7 + if shift >= 70: raise ValueError("Varint too long") + + def skip_field_value(self, reader: BufferedReader, wire_type): + if wire_type == WIRETYPE_VARINT: self.decode_varint(reader) + elif wire_type == WIRETYPE_FIXED64: reader.seek(8, os.SEEK_CUR) + elif wire_type == WIRETYPE_FIXED32: reader.seek(4, os.SEEK_CUR) + elif wire_type == WIRETYPE_LENGTH_DELIMITED: reader.seek(self.decode_varint(reader), os.SEEK_CUR) + else: raise ValueError(f"Unknown wire type: {wire_type}") + + def _parse_message(self, reader, message_field_handlers_name, initial_obj_factory=lambda: {}): + message_field_handlers = self.registered_handles[message_field_handlers_name] + obj = initial_obj_factory() + while True: + try: + tag_val = self.decode_varint(reader) + field_number = tag_val >> 3 + wire_type = tag_val & 0x07 + if handler := message_field_handlers.get(field_number): + handler(obj, reader, wire_type) + else: self.skip_field_value(reader, wire_type) + except EOFError: break + if message_field_handlers_name == "TensorProto" and self.load_external_data and obj.get("data_location", 0) == 1: self._parse_external_data(obj) + return obj + + def _handle_delimited(self, reader:BufferedReader, use_tensor=False) -> Tuple[bytes, Tensor]: + str_len = self.decode_varint(reader) + if not use_tensor: return reader.read(str_len) + res = reader.raw._tensor[reader.tell():(reader.tell()+str_len)] + reader.seek(str_len, os.SEEK_CUR) + return res + + def _handle_string(self, obj, key_name, reader, wire_type, parser_func=None, repeated=False): + if wire_type != WIRETYPE_LENGTH_DELIMITED: raise ValueError(f"Expected length-delimited for string field '{key_name}'") + value = self._handle_delimited(reader) + gen_result(obj, key_name, value.decode("utf-8"), repeated) + + def _handle_bytes(self, obj, key_name, reader, wire_type, parser_func=None, repeated=False): + if wire_type != WIRETYPE_LENGTH_DELIMITED: raise ValueError(f"Expected length-delimited for bytes field '{key_name}'") + value = 
self._handle_delimited(reader, use_tensor=True) + gen_result(obj, key_name, value, repeated) + + def _handle_int64(self, obj, key_name, reader, wire_type, parser_func=None, repeated=False): + if wire_type != WIRETYPE_VARINT: raise ValueError(f"Expected varint for int64 field '{key_name}'") + val = self.decode_varint(reader) + gen_result(obj, key_name, val - 2**64 if val & (1 << 63) else val, repeated) + + def _handle_float(self, obj, key_name, reader, wire_type, parser_func=None, repeated=False): + if wire_type != WIRETYPE_FIXED32: raise ValueError(f"Expected fixed32 for float field '{key_name}'") + val, = struct.unpack("= BS: - Tensor.no_grad, Tensor.training = False, True + Tensor.training = True x = Tensor(all_feats[:BS]) mask = np.zeros((BS, len(actions)+1), dtype=np.float32) mask[range(BS), all_acts[:BS]] = all_rews[:BS] diff --git a/tinygrad_repo/extra/remu/test/hwtest.py b/tinygrad_repo/extra/remu/test/hwtest.py index 0cc2ba0..76bd2f6 100644 --- a/tinygrad_repo/extra/remu/test/hwtest.py +++ b/tinygrad_repo/extra/remu/test/hwtest.py @@ -89,7 +89,7 @@ def get_output(s:str, n_threads:int=1): "s_waitcnt 0", "global_store_b32 v0, v1, s[0:1]", "s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"]) - test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().lazydata.buffer + test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().uop.buffer prg = get_prg(code, 32, 32) prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True) return test.numpy() diff --git a/tinygrad_repo/extra/resnet18/resnet_tinygrad.py b/tinygrad_repo/extra/resnet18/resnet_tinygrad.py index 2539fc2..a34a27b 100644 --- a/tinygrad_repo/extra/resnet18/resnet_tinygrad.py +++ b/tinygrad_repo/extra/resnet18/resnet_tinygrad.py @@ -79,7 +79,6 @@ if __name__ == "__main__": resnet18 = load() - @Tensor.test() def _forward(im): return resnet18(im) forward = TinyJit(_forward, prune=True) diff --git 
a/tinygrad_repo/extra/torch_backend/backend.py b/tinygrad_repo/extra/torch_backend/backend.py index a86b3d5..17c8083 100644 --- a/tinygrad_repo/extra/torch_backend/backend.py +++ b/tinygrad_repo/extra/torch_backend/backend.py @@ -65,12 +65,12 @@ for k,v in view_ops.items(): torch.library.impl(k.replace("aten.", "aten::"), "p # in place operations with views def realize_with_views(self: Tensor, views: Tensor): - if not self.lazydata.st.contiguous: self.replace(self.contiguous()) + if not self.uop.st.contiguous: self.replace(self.contiguous()) self.replace(self.clone().realize()) for v in views: - if v.lazydata.base.op is Ops.BUFFER_VIEW: continue # skip subbuffer, we just use the real buffer view + if v.uop.base.op is Ops.BUFFER_VIEW: continue # skip subbuffer, we just use the real buffer view ret = self - st = ShapeTracker(self.lazydata.st.views + v.lazydata.st.views) # TODO: is this right? + st = ShapeTracker(self.uop.st.views + v.uop.st.views) # TODO: is this right? for mo in cached_to_movement_ops(self.shape, st): ret = apply_mop(ret, mo) v.replace(ret) def maybe_realize_storage(self: Tensor) -> bool: @@ -121,6 +121,12 @@ def cummax(self, dim): # TODO: move to tinygrad def nonzero(self): return aten.nonzero(self.cpu()).tiny() +@torch.library.impl("aten::_linalg_eigh", "privateuseone") +# TODO: move to tinygrad +def _linalg_eigh(self, UPLO: str = 'U'): + w, v = torch.linalg.eigh(self.cpu(), UPLO=UPLO) + return w.tiny(), v.tiny() + def upsample_backward(grad_out, output_size, input_size, *args, f=None): return f(grad_out.cpu(), output_size, input_size, *args).tiny() for i in [ @@ -172,7 +178,7 @@ def as_strided(tensor:torch.Tensor, size, stride, storage_offset=None): # multiple as_strided do not compound base = canonical_base(tensor) # TODO: this is heavyweight - st = ShapeTracker(base.lazydata.st.views + (View.create(tuple(size), tuple(stride), storage_offset),)) + st = ShapeTracker(base.uop.st.views + (View.create(tuple(size), tuple(stride), storage_offset),)) 
ret = base if TORCH_DEBUG >= 1: print("**** as_strided", tensor.shape, size, stride, st) if prod(size) == 1: return ret.flatten()[storage_offset].reshape(size) @@ -309,7 +315,7 @@ def _copy_from(src: torch.Tensor, dest, non_blocking=False): to_device = _from_torch_device(dest.device) src,dest = unwrap(src),unwrap(dest) # TODO we need to properly match dest shape and strides, not blindly assign - if dest.lazydata.st.contiguous or dest.lazydata.is_realized: src = src.contiguous() # this only solves some cases + if dest.uop.st.contiguous or dest.uop.is_realized: src = src.contiguous() # this only solves some cases dest.assign(src.cast(cast_dtype).to(to_device)) if realize: Tensor.realize(dest) elif src.is_tiny and dest.is_cpu: @@ -487,7 +493,7 @@ def wrap_out(f): assert out.shape == assigned.shape, f"shape mismatch: {assigned.shape} -> {out.shape}" assert out.device == assigned.device, f"device mismatch: {assigned.device} -> {out.device}" assert out.dtype == assigned.dtype, f"dtype mismatch: {assigned.dtype} -> {out.dtype}" - if out.lazydata.is_realized: assigned = assigned.contiguous() # TODO: how does this map to torch's semantics + if out.uop.is_realized: assigned = assigned.contiguous() # TODO: how does this map to torch's semantics return out.assign(assigned) return _wrap_out diff --git a/tinygrad_repo/extra/torch_backend/test.py b/tinygrad_repo/extra/torch_backend/test.py index 113c013..5d84226 100644 --- a/tinygrad_repo/extra/torch_backend/test.py +++ b/tinygrad_repo/extra/torch_backend/test.py @@ -170,6 +170,13 @@ class TestTorchBackend(unittest.TestCase): assert torch.equal(tensor_a, tensor_b) assert not torch.equal(tensor_a, tensor_c) + def test_linalg_eigh(self): + a = torch.tensor([[1, 2], [2, 1]], dtype=torch.float32, device=device) + w, v = torch.linalg.eigh(a) + np.testing.assert_equal(w.cpu().numpy(), [-1, 3]) + recon = (v @ torch.diag(w) @ v.T).cpu().numpy() + np.testing.assert_allclose(recon, a.cpu().numpy(), atol=1e-6) + def 
test_scalar_assign(self): a = torch.tensor([1, 2, 3], device=device) a[1] = 4 diff --git a/tinygrad_repo/extra/torch_backend/wrapped_tensor.cpp b/tinygrad_repo/extra/torch_backend/wrapped_tensor.cpp index 0df27d0..3e5f19f 100644 --- a/tinygrad_repo/extra/torch_backend/wrapped_tensor.cpp +++ b/tinygrad_repo/extra/torch_backend/wrapped_tensor.cpp @@ -116,7 +116,7 @@ at::Tensor wrap_tensor(py::object &py_obj, c10::ScalarType dtype, c10::DeviceInd // TODO: we have to get the dtype and the shape from the tinygrad Tensor std::vector sizes = py_obj.attr("shape").cast>(); - py::list views = py_obj.attr("lazydata").attr("st").attr("views"); + py::list views = py_obj.attr("uop").attr("st").attr("views"); std::vector strides = views[views.size() - 1].attr("strides").cast>(); int64_t storage_offset = 0; for (auto& v: views) { diff --git a/tinygrad_repo/extra/torch_hook/hook_torch.py b/tinygrad_repo/extra/torch_hook/hook_torch.py index f3b8113..8f71807 100644 --- a/tinygrad_repo/extra/torch_hook/hook_torch.py +++ b/tinygrad_repo/extra/torch_hook/hook_torch.py @@ -113,7 +113,7 @@ class DispatchLog(TorchDispatchMode): _ = tiny_x.cpu().numpy() if torch.is_tensor(tiny_x) and tiny_x.device.type == "tiny": tt = tiny_torch.unwrap(tiny_x) - try: out_addr = tt.lazydata.buffer._buf.value + try: out_addr = tt.uop.buffer._buf.value except Exception: pass tiny_events = hook_cuda.collect_events(clear=True) print_events(tiny_events, colored("tiny", "magenta"), out_addr) diff --git a/tinygrad_repo/ruff.toml b/tinygrad_repo/ruff.toml index 7912b6f..ac3ee49 100644 --- a/tinygrad_repo/ruff.toml +++ b/tinygrad_repo/ruff.toml @@ -28,7 +28,6 @@ lint.select = [ "RET506", # superfluous-else-raise "RET507", # superfluous-else-continue "A", # builtin-variable-shadowing, builtin-argument-shadowing, builtin-attribute-shadowing - "SIM105", # suppressible-exception "FURB110",# if-exp-instead-of-or-operator "RUF018", # assignment-in-assert ] diff --git a/tinygrad_repo/setup.py b/tinygrad_repo/setup.py index 
a5fe80b..9765ed3 100644 --- a/tinygrad_repo/setup.py +++ b/tinygrad_repo/setup.py @@ -27,7 +27,7 @@ setup(name='tinygrad', packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.runtime.autogen.am', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz', 'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape', 'tinygrad.uop'], - package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*', 'lib/**/*']}, + package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*', 'js/*']}, classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License" @@ -85,9 +85,5 @@ setup(name='tinygrad', "black", "numpy", ], - 'testing_tf': [ - "tensorflow==2.15.1", - "tensorflow_addons", - ], }, include_package_data=True) diff --git a/tinygrad_repo/test/external/external_becnhmark_am.py b/tinygrad_repo/test/external/external_benchmark_am.py similarity index 100% rename from tinygrad_repo/test/external/external_becnhmark_am.py rename to tinygrad_repo/test/external/external_benchmark_am.py diff --git a/tinygrad_repo/test/external/external_benchmark_bert_matmuls.py b/tinygrad_repo/test/external/external_benchmark_bert_matmuls.py index b83e6d4..4f64629 100644 --- a/tinygrad_repo/test/external/external_benchmark_bert_matmuls.py +++ b/tinygrad_repo/test/external/external_benchmark_bert_matmuls.py @@ -13,6 +13,6 @@ if __name__ == "__main__": (Tensor.empty(BS, 16, 512, 512), Tensor.empty(BS, 512, 16, 64).permute(0,2,1,3)), # qk@v ] for t0, t1 in tensors: - print(f"{t0.shape=}, {t0.lazydata.st.real_strides()=}, {t1.shape=}, {t1.lazydata.st.real_strides()=}") + print(f"{t0.shape=}, {t0.uop.st.real_strides()=}, {t1.shape=}, {t1.uop.st.real_strides()=}") for _ in range(5): t0.dot(t1, dtype=acc_dtype).realize() diff --git a/tinygrad_repo/test/external/external_benchmark_keccak.py 
b/tinygrad_repo/test/external/external_benchmark_keccak.py new file mode 100644 index 0000000..1365ca9 --- /dev/null +++ b/tinygrad_repo/test/external/external_benchmark_keccak.py @@ -0,0 +1,20 @@ +from tinygrad import Tensor, dtypes +from tinygrad.engine.jit import TinyJit +from tinygrad.helpers import Timing, getenv + +if __name__ == "__main__": + BS = getenv("BS", 2**14) + BLOCKSIZE = getenv("BLOCKSIZE", 4096) + HASHFN = getenv("HASHFN", "shake_128") + NRUNS = getenv("NRUNS", 5) + + @TinyJit + def hasher(data: Tensor): return data.keccak(HASHFN) + + t = Tensor.randn(BS, BLOCKSIZE, dtype=dtypes.uint8).realize() + ds_mib = t.nbytes() / 1024**2 + + print(f"--- benchmarking (hash: {HASHFN}, data size: {ds_mib} MiB, block size: {BLOCKSIZE} B, batch size: {BS})") + for i in range(NRUNS): + with Timing(f"run: {i+1}, elapsed time: ", (lambda et: f", throughput: {ds_mib / (et*1e-9):.2f} MiB/s")): + hasher(t).realize() diff --git a/tinygrad_repo/test/external/external_benchmark_openpilot.py b/tinygrad_repo/test/external/external_benchmark_openpilot.py index 7316a11..de856b2 100644 --- a/tinygrad_repo/test/external/external_benchmark_openpilot.py +++ b/tinygrad_repo/test/external/external_benchmark_openpilot.py @@ -1,8 +1,7 @@ import time, sys, hashlib from pathlib import Path -import onnx from onnx.helper import tensor_dtype_to_np_dtype -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from tinygrad import Tensor, dtypes, TinyJit from tinygrad.helpers import IMAGE, GlobalCounters, fetch, colored, getenv, trange from tinygrad.tensor import _from_np_dtype @@ -12,7 +11,7 @@ from extra.bench_log import BenchEvent, WallTimeEvent OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx" if __name__ == "__main__": - onnx_model = onnx.load(onnx_path := fetch(OPENPILOT_MODEL)) + onnx_model = onnx_load(onnx_path := fetch(OPENPILOT_MODEL)) 
run_onnx = OnnxRunner(onnx_model) Tensor.manual_seed(100) diff --git a/tinygrad_repo/test/external/external_benchmark_schedule.py b/tinygrad_repo/test/external/external_benchmark_schedule.py index 6b606ee..89628ee 100644 --- a/tinygrad_repo/test/external/external_benchmark_schedule.py +++ b/tinygrad_repo/test/external/external_benchmark_schedule.py @@ -5,7 +5,7 @@ from tinygrad.helpers import Profiling, Timing, getenv, BEAM, NOOPT, DEBUG, Cont from tinygrad.uop.ops import Ops from tinygrad.codegen.kernel import Kernel from tinygrad.codegen.heuristic import hand_coded_optimizations -from tinygrad.codegen import get_rewrites_for_renderer, apply_rewrites +from tinygrad.codegen import get_rewrites_for_renderer, apply_rewrites, rewrites_for_linearizer from tinygrad.engine.search import beam_search, bufs_from_lin if __name__ == "__main__": @@ -24,7 +24,8 @@ if __name__ == "__main__": if not FORWARD_ONLY: with Timing("***** model schedule in "): - sched = out.schedule() + with Profiling(PROFILE >= 3): + sched = out.schedule() if not SCHEDULE_ONLY: asts = list({x.ast.key:x.ast for x in sched if x.ast.op is Ops.SINK}.values()) @@ -41,7 +42,7 @@ if __name__ == "__main__": kernels.append(k) with Timing("***** model prep in "): - kernels = [(k, k.get_optimized_ast(), get_rewrites_for_renderer(k.opts, linearizer=LINEARIZE)) for k in kernels] + kernels = [(k, k.get_optimized_ast(), get_rewrites_for_renderer(k.opts, linearizer=False)) for k in kernels] with Profiling(PROFILE, fn="/tmp/rewrite.prof"): with Timing("***** model rewrite in "): @@ -49,5 +50,10 @@ if __name__ == "__main__": for i,(k,u,rewrites) in enumerate(kernels): with Timing(f"rewrite {i:2d} {k.name}{' '*(50-ansilen(k.name))}", enabled=getenv("VERBOSE", 0)): rewritten_uops.append(apply_rewrites(u, rewrites)) - uops = rewritten_uops - if LINEARIZE: print(sum(len(u.arg.lst) for u in uops)) + + if LINEARIZE: + with Timing("***** model linearize in "): + uops_line = [] + for u in rewritten_uops: + 
uops_line.append(apply_rewrites(u, rewrites_for_linearizer)) + print(sum(len(u.arg.lst) for u in uops_line)) diff --git a/tinygrad_repo/test/external/external_llama_eval.py b/tinygrad_repo/test/external/external_llama_eval.py index cf08b38..e707825 100644 --- a/tinygrad_repo/test/external/external_llama_eval.py +++ b/tinygrad_repo/test/external/external_llama_eval.py @@ -69,7 +69,6 @@ class LLaMaAdaptor(BaseLM): return self.llama.tokenizer.decode(tokens) def _model_call(self, inps): - Tensor.no_grad = True return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy()) def greedy_until(self, requests): diff --git a/tinygrad_repo/test/external/external_model_benchmark.py b/tinygrad_repo/test/external/external_model_benchmark.py index a0a3359..838089e 100644 --- a/tinygrad_repo/test/external/external_model_benchmark.py +++ b/tinygrad_repo/test/external/external_model_benchmark.py @@ -1,15 +1,13 @@ -import csv, pathlib, time, numpy as np -from os import getenv +import csv, pathlib, time +import numpy as np import torch torch.set_num_threads(1) -import onnx from onnx.helper import tensor_dtype_to_np_dtype import onnxruntime as ort from onnx2torch import convert -from tinygrad.frontend.onnx import OnnxRunner -from tinygrad.helpers import OSX, DEBUG, fetch +from tinygrad.frontend.onnx import OnnxRunner, onnx_load +from tinygrad.helpers import OSX, DEBUG, fetch, getenv from tinygrad import Tensor, Device -from tinygrad.device import CompileError MODELS = { "resnet50": "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx", @@ -51,10 +49,10 @@ def benchmark_model(m, devices, validate_outs=False): CSV = {"model": m} fn = fetch(MODELS[m]) - onnx_model = onnx.load(fn) + onnx_model = onnx_load(fn) output_names = [out.name for out in onnx_model.graph.output] excluded = {inp.name for inp in onnx_model.graph.initializer} - input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in 
inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded} # noqa: E501 + input_shapes = {inp.name:tuple(x.dim_value if hasattr(x, "dim_value") and x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded} # noqa: E501 input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded} #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()} @@ -63,27 +61,20 @@ def benchmark_model(m, devices, validate_outs=False): # print input names if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded]) for device in devices: - try: - Device.DEFAULT = device - inputs = {k:Tensor(inp) for k,inp in np_inputs.items()} - tinygrad_model = OnnxRunner(onnx_model) - benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()}) + Device.DEFAULT = device + inputs = {k:Tensor(inp) for k,inp in np_inputs.items()} + tinygrad_model = OnnxRunner(onnx_model) + benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()}) - from tinygrad.engine.jit import TinyJit - tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()}) - for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()} - benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821 - del inputs, tinygrad_model, tinygrad_jitted_model - except CompileError as e: - # TODO: we don't run the dm model on METAL for now - if Device.DEFAULT == "METAL": - assert "no 'buffer' resource location available" in str(e) - return - else: raise e + from 
tinygrad.engine.jit import TinyJit + tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()}) + for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()} + benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821 + del inputs, tinygrad_model, tinygrad_jitted_model # convert model to torch try: - torch_model = convert(onnx_model) + torch_model = convert(fn) except Exception as e: # model conversion failed print(f"{m:16s}onnx2torch {type(e).__name__:>25}") @@ -131,15 +122,14 @@ def benchmark_model(m, devices, validate_outs=False): open_csv.writeheader() open_csv.writerow(CSV) -def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5): - assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys() +def assert_allclose(tiny_out:dict, onnx_out:dict, rtol, atol): + assert tiny_out.keys() == onnx_out.keys() for k in tiny_out.keys(): tiny_v, onnx_v = tiny_out[k], onnx_out[k] - if tiny_v is None: assert tiny_v == onnx_v - else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}") + np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}") if __name__ == "__main__": devices = [Device.DEFAULT] if getenv("NOCLANG") else [Device.DEFAULT, "CPU"] - if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), devices, True) + if (model:=getenv("MODEL", "")) != "": benchmark_model(model, devices, validate_outs=True) else: - for m in MODELS: benchmark_model(m, devices, True) + for m in MODELS: benchmark_model(m, devices, validate_outs=True) diff --git a/tinygrad_repo/test/external/external_multi_gpu.py b/tinygrad_repo/test/external/external_multi_gpu.py index e5dd836..32d107d 100644 --- a/tinygrad_repo/test/external/external_multi_gpu.py +++ 
b/tinygrad_repo/test/external/external_multi_gpu.py @@ -21,8 +21,8 @@ if __name__ == "__main__": with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"): c0 = (Tensor.ones(sz, device="CPU")/2).realize() c1 = (Tensor.ones(sz, device="CPU")/4).realize() - print(c0.lazydata.base.realized) - print(c1.lazydata.base.realized) + print(c0.uop.base.realized) + print(c1.uop.base.realized) with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"): a0 = c0.to(d0).realize() diff --git a/tinygrad_repo/test/external/external_test_am.py b/tinygrad_repo/test/external/external_test_am.py index 4601435..dfe241f 100644 --- a/tinygrad_repo/test/external/external_test_am.py +++ b/tinygrad_repo/test/external/external_test_am.py @@ -91,6 +91,17 @@ class TestAMPageTable(unittest.TestCase): assert pte['paddr'] == 0 assert pte['valid'] == 0 + def test_map_notaligned(self): + mm0 = self.d[0].mm + + for (va1,sz1),(va2,sz2) in [((0x10000, (0x1000)), (0x11000, (2 << 20)))]: + exteranl_va1 = va1 + AMMemoryManager.va_allocator.base + exteranl_va2 = va2 + AMMemoryManager.va_allocator.base + mm0.map_range(vaddr=exteranl_va1, size=sz1, paddrs=[(va1, sz1)]) + mm0.map_range(vaddr=exteranl_va2, size=sz2, paddrs=[(va2, sz2)]) + mm0.unmap_range(va2, sz2) + mm0.unmap_range(va1, sz1) + def test_double_map(self): mm0 = self.d[0].mm diff --git a/tinygrad_repo/test/external/external_test_amd.py b/tinygrad_repo/test/external/external_test_amd.py index aabff98..dd13353 100644 --- a/tinygrad_repo/test/external/external_test_amd.py +++ b/tinygrad_repo/test/external/external_test_amd.py @@ -9,18 +9,18 @@ class TestAMD(unittest.TestCase): TestAMD.d0: AMDDevice = Device["AMD"] TestAMD.a = Tensor([0.,1.], device="AMD").realize() TestAMD.b = self.a + 1 - si = create_schedule([self.b.lazydata])[-1] + si = create_schedule([self.b.uop])[-1] TestAMD.d0_runner = TestAMD.d0.get_runner(*si.ast) - TestAMD.b.lazydata.buffer.allocate() + TestAMD.b.uop.buffer.allocate() def 
test_amd_ring_64bit_doorbell(self): TestAMD.d0.pm4_write_pointer[0] = TestAMD.d0.pm4_write_pointer[0] + (2 << 32) - TestAMD.d0.pm4_ring.size // 4 for _ in range(2000): - TestAMD.d0_runner.clprg(TestAMD.b.lazydata.buffer._buf, TestAMD.a.lazydata.buffer._buf, + TestAMD.d0_runner.clprg(TestAMD.b.uop.buffer._buf, TestAMD.a.uop.buffer._buf, global_size=TestAMD.d0_runner.global_size, local_size=TestAMD.d0_runner.local_size) - TestAMD.d0_runner.clprg(TestAMD.a.lazydata.buffer._buf, TestAMD.b.lazydata.buffer._buf, + TestAMD.d0_runner.clprg(TestAMD.a.uop.buffer._buf, TestAMD.b.uop.buffer._buf, global_size=TestAMD.d0_runner.global_size, local_size=TestAMD.d0_runner.local_size) - val = TestAMD.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestAMD.a.uop.buffer.as_buffer().cast("f")[0] assert val == 4000.0, f"got val {val}" if __name__ == "__main__": diff --git a/tinygrad_repo/test/external/external_test_datasets.py b/tinygrad_repo/test/external/external_test_datasets.py index 8fd3c77..19e58d8 100644 --- a/tinygrad_repo/test/external/external_test_datasets.py +++ b/tinygrad_repo/test/external/external_test_datasets.py @@ -152,7 +152,7 @@ class TestOpenImagesDataset(ExternalTestDatasets): ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0)) ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"] - np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy()) + np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6) np.testing.assert_equal(tinygrad_boxes[0].numpy(), ref_boxes.numpy()) np.testing.assert_equal(tinygrad_labels[0].numpy(), ref_labels.numpy()) @@ -165,7 +165,7 @@ class TestOpenImagesDataset(ExternalTestDatasets): for ((tinygrad_img, _, _, _), (ref_img, _)) in zip(tinygrad_dataloader, ref_dataloader): ref_img, _ = transform(ref_img.unsqueeze(0)) - np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), 
ref_img.tensors.transpose(1, 3).numpy()) + np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6) if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/external/external_test_hcq.py b/tinygrad_repo/test/external/external_test_hcq.py index 0303948..2ae0371 100644 --- a/tinygrad_repo/test/external/external_test_hcq.py +++ b/tinygrad_repo/test/external/external_test_hcq.py @@ -22,10 +22,10 @@ class TestHCQ(unittest.TestCase): TestHCQ.b = self.a + 1 si = self.b.schedule()[-1] TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast) - TestHCQ.b.lazydata.buffer.allocate() + TestHCQ.b.uop.buffer.allocate() # wow that's a lot of abstraction layers - TestHCQ.addr = struct.pack("QQ", TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr) - TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr) + TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr) + TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr) TestHCQ.kernargs_off = TestHCQ.runner._prg.kernargs_offset TestHCQ.kernargs_size = TestHCQ.runner._prg.kernargs_alloc_size ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_off, TestHCQ.addr, len(TestHCQ.addr)) @@ -45,8 +45,8 @@ class TestHCQ(unittest.TestCase): def setUp(self): TestHCQ.d0.synchronize() - TestHCQ.a.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) - TestHCQ.b.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0)))) + TestHCQ.a.uop.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) + TestHCQ.b.uop.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0)))) TestHCQ.d0.synchronize() # wait for copyins to complete def test_run_1000_times_one_submit(self): @@ -65,7 +65,7 @@ class TestHCQ(unittest.TestCase): q.submit(TestHCQ.d0) 
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.a.uop.buffer.as_buffer().cast("f")[0] assert val == 2000.0, f"got val {val}" def test_run_1000_times(self): @@ -81,7 +81,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.compute_queue().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.a.uop.buffer.as_buffer().cast("f")[0] assert val == 2000.0, f"got val {val}" def test_run_to_3(self): @@ -95,7 +95,7 @@ class TestHCQ(unittest.TestCase): q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 3.0, f"got val {val}" def test_update_exec(self): @@ -106,9 +106,9 @@ class TestHCQ(unittest.TestCase): q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 0.0, f"got val {val}, should not be updated" @unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind") @@ -126,7 +126,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.compute_queue().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) 
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.a.uop.buffer.as_buffer().cast("f")[0] assert val == 2000.0, f"got val {val}" @unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind") @@ -141,9 +141,9 @@ class TestHCQ(unittest.TestCase): q.submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 0.0, f"got val {val}, should not be updated" @unittest.skipIf(CI, "Can't handle async update on CPU") @@ -174,7 +174,7 @@ class TestHCQ(unittest.TestCase): q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" def test_submit_empty_queues(self): @@ -206,13 +206,13 @@ class TestHCQ(unittest.TestCase): q.submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" def test_copy_1000_times(self): q = TestHCQ.copy_queue() - q.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) - q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) + q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8) + 
q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) for _ in range(1000): q.submit(TestHCQ.d0) TestHCQ.copy_queue().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) @@ -221,24 +221,24 @@ class TestHCQ(unittest.TestCase): # confirm the signal didn't exceed the put value with self.assertRaises(RuntimeError): TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value + 1, timeout=50) - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 0.0, f"got val {val}" def test_copy(self): q = TestHCQ.copy_queue() - q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) + q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) q.submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 1.0, f"got val {val}" @unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind") def test_bind_copy(self): q = TestHCQ.copy_queue() - q.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) - q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) + q.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8) + q.copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) q.bind(TestHCQ.d0) for _ in range(1000): q.submit(TestHCQ.d0) @@ -248,7 +248,7 @@ class TestHCQ(unittest.TestCase): # confirm the signal didn't exceed the put value with self.assertRaises(RuntimeError): TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value + 1, timeout=50) - val = 
TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 0.0, f"got val {val}" def test_copy_bandwidth(self): @@ -281,14 +281,14 @@ class TestHCQ(unittest.TestCase): q.exec(TestHCQ.runner._prg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) # b = [1, 2] q.signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=1) qc.wait(sig, value=1) - qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) + qc.copy(TestHCQ.a.uop.buffer._buf.va_addr, TestHCQ.b.uop.buffer._buf.va_addr, 8) qc.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) qc.submit(TestHCQ.d0) time.sleep(0.02) # give it time for the wait to fail q.submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.a.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" def test_cross_device_signal(self): @@ -319,7 +319,7 @@ class TestHCQ(unittest.TestCase): q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" if __name__ == "__main__": diff --git a/tinygrad_repo/test/external/external_test_hip_compile.py b/tinygrad_repo/test/external/external_test_hip_compile.py index 8f8292d..0edef87 100644 --- a/tinygrad_repo/test/external/external_test_hip_compile.py +++ b/tinygrad_repo/test/external/external_test_hip_compile.py @@ -10,7 +10,7 @@ class TestHIPCompileSpeed(unittest.TestCase): def test_hip_compile(self): a, b = Tensor([1,2,3,4,5]), Tensor([1,2,3,4,5]) out = a + b - lin = Kernel(create_schedule([out.lazydata])[-1].ast[0]) + lin = 
Kernel(create_schedule([out.uop])[-1].ast[0]) lin.linearize() reference = """ diff --git a/tinygrad_repo/test/external/external_test_image.py b/tinygrad_repo/test/external/external_test_image.py index 3e246ee..1c2cc39 100644 --- a/tinygrad_repo/test/external/external_test_image.py +++ b/tinygrad_repo/test/external/external_test_image.py @@ -8,7 +8,6 @@ os.environ['GPU'] = '1' os.environ['OPT'] = '2' from tinygrad.tensor import Tensor from tinygrad.nn import Conv2d -Tensor.no_grad = True class TestImage(unittest.TestCase): def test_create_image(self): diff --git a/tinygrad_repo/test/external/external_test_keccak.py b/tinygrad_repo/test/external/external_test_keccak.py new file mode 100644 index 0000000..bccffe8 --- /dev/null +++ b/tinygrad_repo/test/external/external_test_keccak.py @@ -0,0 +1,31 @@ +import unittest, zipfile, re +from tinygrad import Tensor +from tinygrad.helpers import fetch, tqdm + +SHA3_URL = "https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/sha3/sha-3bytetestvectors.zip" +SHAKE_URL = "https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/sha3/shakebytetestvectors.zip" + +class TestExternalKeccak(unittest.TestCase): + def test_sha3_224(self): self.check_nist_vectors(SHA3_URL, ["SHA3_224LongMsg.rsp", "SHA3_224ShortMsg.rsp"], "sha3_224") + def test_sha3_256(self): self.check_nist_vectors(SHA3_URL, ["SHA3_256LongMsg.rsp", "SHA3_256ShortMsg.rsp"], "sha3_256") + def test_shake_128(self): self.check_nist_vectors(SHAKE_URL, ["SHAKE128LongMsg.rsp", "SHAKE128ShortMsg.rsp"], "shake_128") + + def check_nist_vectors(self, url: str, filenames: list[str], preset: str): + pattern = r"Len\s*=\s*(?P\d+)\s+Msg\s*=\s*(?P[0-9a-fA-F\s]+)\s+(MD|Output)\s*=\s*(?P[0-9a-fA-F]+)" + vecs_zip = fetch(url) + + for filename in filenames: + vecs = zipfile.ZipFile(vecs_zip).open(filename).read().decode() + + vectors = [ (l, bytes.fromhex(match["Msg"].lower()), 
bytes.fromhex(match["Output"].lower())) + for match in re.finditer(pattern, vecs) if (l:=int(match["Len"])) < 8192 ] + + self.assertTrue(len(vectors) > 0) + + print("file", filename) + for data_len, data, output in tqdm(vectors): + tinyout = bytes(Tensor(data[:data_len//8]).keccak(preset).data()) + self.assertEqual(tinyout, output) + +if __name__ == '__main__': + unittest.main() diff --git a/tinygrad_repo/test/external/external_test_nv.py b/tinygrad_repo/test/external/external_test_nv.py index b488e86..f061975 100644 --- a/tinygrad_repo/test/external/external_test_nv.py +++ b/tinygrad_repo/test/external/external_test_nv.py @@ -21,8 +21,8 @@ class TestNV(unittest.TestCase): TestNV.b = self.a + 1 si = self.b.schedule()[-1] TestNV.d0_runner = get_runner(TestNV.d0.device, si.ast) - TestNV.b.lazydata.buffer.allocate() - TestNV.addr = struct.pack("QQ", TestNV.b.lazydata.buffer._buf.va_addr, TestNV.a.lazydata.buffer._buf.va_addr) + TestNV.b.uop.buffer.allocate() + TestNV.addr = struct.pack("QQ", TestNV.b.uop.buffer._buf.va_addr, TestNV.a.uop.buffer._buf.va_addr) def test_oor_kernels(self): ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=Ops.MUL, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 256, 1, 512, 4, 16, 4, 16), strides=(0, 100352, 0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 512), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(2097152, 0, 0, 128, 2, 4096, 1088, 17), offset=0, mask=None, contiguous=False))))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(25088, 0, 49, 7, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(dtypes.float, False)),), arg=((0, 3, 4), dtypes.float)),), arg=(dtypes.half, False)),), 
arg=MemBuffer(idx=0, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 1, 512, 1, 1, 512, 3, 3), strides=(0, 0, 4608, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)))) # noqa: E501 @@ -44,8 +44,8 @@ class TestNV(unittest.TestCase): TestNV.along = Tensor([105615], device="NV").realize() ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=Ops.SIN, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.ulong, st=ShapeTracker(views=(View(shape=(3,), strides=(1,), offset=0, mask=None, contiguous=True),)))),), arg=dtypes.float),), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(3,), strides=(1,), offset=0, mask=None, contiguous=True),)))) # noqa: E501 temp_runner = get_runner(TestNV.d0.device, (ast,)) - temp_runner([TestNV.b.lazydata.buffer, TestNV.along.lazydata.buffer], var_vals={}) - val = TestNV.b.lazydata.buffer.as_buffer().cast("f")[0] + temp_runner([TestNV.b.uop.buffer, TestNV.along.uop.buffer], var_vals={}) + val = TestNV.b.uop.buffer.as_buffer().cast("f")[0] assert abs(val - 0.80647) < 0.001, f"got val {val}" def test_kernargs_no_oob_access(self): @@ -59,7 +59,7 @@ class TestNV(unittest.TestCase): q.signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value).submit(TestNV.d0) TestNV.d0._wait_signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value) TestNV.d0.timeline_value += 1 - val = TestNV.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestNV.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" if __name__ == "__main__": diff --git a/tinygrad_repo/test/external/external_test_onnx_backend.py b/tinygrad_repo/test/external/external_test_onnx_backend.py index b6ccd00..2769663 100644 --- a/tinygrad_repo/test/external/external_test_onnx_backend.py +++ b/tinygrad_repo/test/external/external_test_onnx_backend.py @@ -1,4 +1,4 @@ -import unittest +import tempfile, unittest from typing import Any, Tuple from onnx.backend.base import Backend, 
BackendRep import onnx.backend.test @@ -10,7 +10,7 @@ from tinygrad.device import is_dtype_supported # pip3 install tabulate pytest_plugins = 'onnx.backend.test.report', -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load class TinygradModel(BackendRep): def __init__(self, run_onnx, input_names): @@ -25,12 +25,16 @@ class TinygradModel(BackendRep): class TinygradBackend(Backend): @classmethod - def prepare(cls, model, device): + def prepare(cls, model: onnx.ModelProto, device): input_all = [x.name for x in model.graph.input] input_initializer = [x.name for x in model.graph.initializer] net_feed_input = [x for x in input_all if x not in input_initializer] print("prepare", cls, device, net_feed_input) - run_onnx = OnnxRunner(model) + with tempfile.NamedTemporaryFile(suffix='.onnx') as f: + onnx.save(model, f.name) + f.flush() + new_model = onnx_load(f.name) + run_onnx = OnnxRunner(new_model) return TinygradModel(run_onnx, net_feed_input) @classmethod diff --git a/tinygrad_repo/test/external/external_uop_gc.py b/tinygrad_repo/test/external/external_uop_gc.py index 03f6169..f27f33b 100644 --- a/tinygrad_repo/test/external/external_uop_gc.py +++ b/tinygrad_repo/test/external/external_uop_gc.py @@ -1,7 +1,7 @@ import gc from tinygrad import Tensor, UOp, Device from tinygrad.shape.shapetracker import views_to_indexed_uops -from tinygrad.engine.realize import method_cache, get_kernel +from tinygrad.engine.realize import method_cache, get_program def uops_allocated(): return sum([isinstance(x, UOp) for x in gc.get_objects()]) def print_uops(): @@ -14,12 +14,10 @@ def two_plus_two(): Tensor([2])+Tensor([2]) def two_plus_two_schedule(): (Tensor([2])+Tensor([2])).schedule() def two_plus_two_kernel(): si = (Tensor([2])+Tensor([2])).schedule()[-1] - get_kernel(Device.default.renderer, si.ast) + get_program(Device.default.renderer, si.ast) def two_plus_two_linearize(): si = (Tensor([2])+Tensor([2])).schedule()[-1] - k = 
get_kernel(Device.default.renderer, si.ast) - k.get_optimized_ast() - #k.linearize() + get_program(Device.default.renderer, si.ast) def two_plus_two_realize(): (Tensor([2])+Tensor([2])).realize() def two_plus_two_item(): (Tensor([2])+Tensor([2])).item() def gradient_test(): @@ -36,7 +34,7 @@ def kernel_matmul(): y = Tensor([[2.0,0,-2.0]], requires_grad=True) z = y.matmul(x) si = z.schedule()[-1] - get_kernel(Device.default.renderer, si.ast) + get_program(Device.default.renderer, si.ast) def realized_matmul(): x = Tensor.eye(3, requires_grad=True) y = Tensor([[2.0,0,-2.0]], requires_grad=True) diff --git a/tinygrad_repo/test/external/fuzz_graph.py b/tinygrad_repo/test/external/fuzz_graph.py index 7992e21..4e1d492 100644 --- a/tinygrad_repo/test/external/fuzz_graph.py +++ b/tinygrad_repo/test/external/fuzz_graph.py @@ -28,7 +28,7 @@ def alloc_rawbuffer(device, fill=False): if fill: with Context(DEBUG=0): data = np.random.randint(-10000, 10000, size=rawbuf.size, dtype=_to_np_dtype(rawbuf.dtype)) - rawbuf.copyin(Tensor(data).realize().lazydata.base.realized.as_buffer()) + rawbuf.copyin(Tensor(data).realize().uop.base.realized.as_buffer()) return rawbuf def gen_kernel_ji(device, deps): diff --git a/tinygrad_repo/test/external/fuzz_linearizer.py b/tinygrad_repo/test/external/fuzz_linearizer.py index 8a03176..1bf269c 100644 --- a/tinygrad_repo/test/external/fuzz_linearizer.py +++ b/tinygrad_repo/test/external/fuzz_linearizer.py @@ -73,7 +73,7 @@ def get_fuzz_rawbufs(lin): data = np.random.uniform(-1, 1, size=rawbuf.size).astype(dtype=_to_np_dtype(rawbuf.dtype)) else: data = np.random.uniform(-10, 10, size=rawbuf.size).astype(dtype=_to_np_dtype(rawbuf.dtype)) - rawbuf.copyin(Tensor(data, device=lin.opts.device).realize().lazydata.base.realized.as_buffer()) + rawbuf.copyin(Tensor(data, device=lin.opts.device).realize().uop.base.realized.as_buffer()) return rawbufs def get_fuzz_rawbuf_like(old_rawbuf, zero=False, copy=False, size=None, force_device=None): diff --git 
a/tinygrad_repo/test/test_fuzz_shape_ops.py b/tinygrad_repo/test/external/fuzz_shape_ops.py similarity index 96% rename from tinygrad_repo/test/test_fuzz_shape_ops.py rename to tinygrad_repo/test/external/fuzz_shape_ops.py index b90d98c..f60e58e 100644 --- a/tinygrad_repo/test/test_fuzz_shape_ops.py +++ b/tinygrad_repo/test/external/fuzz_shape_ops.py @@ -7,7 +7,7 @@ from hypothesis.extra import numpy as stn import numpy as np import torch -from tinygrad import Tensor, Device +from tinygrad import Tensor from tinygrad.helpers import CI, getenv @@ -38,7 +38,6 @@ def apply(tor, ten, tor_fn, ten_fn=None): except: ten, ok = None, not ok # noqa: E722 return tor, ten, ok -@unittest.skipIf(CI and Device.DEFAULT in ("CPU", "NV"), "slow") class TestShapeOps(unittest.TestCase): @settings.get_profile(__file__) @given(st_shape(), st_int32, st.one_of(st_int32, st.lists(st_int32))) diff --git a/tinygrad_repo/test/external/fuzz_symbolic.py b/tinygrad_repo/test/external/fuzz_symbolic.py index 8d2242a..41970a9 100644 --- a/tinygrad_repo/test/external/fuzz_symbolic.py +++ b/tinygrad_repo/test/external/fuzz_symbolic.py @@ -1,78 +1,92 @@ -import itertools -import random +import random, operator +import z3 from tinygrad import Variable, dtypes -from tinygrad.uop.ops import UOp -from tinygrad.helpers import DEBUG -random.seed(42) +from tinygrad.uop.ops import UOp, graph_rewrite +from tinygrad.uop.spec import z3_renderer +from tinygrad.helpers import DEBUG, Context -def add_v(expr, rng=None): - if rng is None: rng = random.randint(0,2) - return expr + v[rng], rng +seed = random.randint(0, 100) +print(f"Seed: {seed}") +random.seed(seed) -def div(expr, rng=None): - if rng is None: rng = random.randint(1,9) - return expr // rng, rng +unary_ops = [lambda a:a+random.randint(-4, 4), lambda a: a*random.randint(-4, 4), + lambda a: a//random.randint(1, 9), lambda a: a%random.randint(1, 9), + lambda a:a.maximum(random.randint(-10, 10)), lambda a:a.minimum(random.randint(-10, 10))] +binary_ops = 
[lambda a,b: a+b, lambda a,b: a*b, lambda a,b:a.maximum(b), lambda a,b:a.minimum(b)] +comp_ops = [operator.lt, operator.le, operator.gt, operator.ge] -def mul(expr, rng=None): - if rng is None: rng = random.randint(-4,4) - return expr * rng, rng +def random_or_sub_expression_int(depth, expr): + sub_expr = random.choice([e for e in expr.toposort() if e.dtype is not dtypes.bool]) + return random.choice([random_int_expr(depth-1), sub_expr]) -def mod(expr, rng=None): - if rng is None: rng = random.randint(1,9) - return expr % rng, rng +def random_int_expr(depth=10): + if depth <= 0: return random.choice(v) + expr1 = random_int_expr(depth-1) -def add_num(expr, rng=None): - if rng is None: rng = random.randint(-4,4) - return expr + rng, rng + # we give more weight to arithmatic ops than to minimum and maximum + ops = [ + lambda: random.choices(unary_ops, weights=[4, 4, 4, 4, 1, 1])[0](expr1), + # for the second operand its either another random exprssion or some subexpression of the first operand + lambda: random.choices(binary_ops, [8, 1, 1, 1])[0](expr1, random_or_sub_expression_int(depth-1, expr1)), + lambda: random_bool_expr(3, random_or_sub_expression_int(depth-1, expr1)).where(expr1, random_or_sub_expression_int(depth-1, expr1)), + ] + # we give weight proportional to the amount of ops in each branch + return random.choices(ops, weights=[6, 4, 1])[0]() -def lt(expr, rng=None): - if rng is None: rng = random.randint(-4,4) - return expr < rng, rng +def random_bool_expr(depth=10, expr1=None): + if depth == 0: return True + if expr1 is None: expr1 = random_int_expr(depth-1) + expr2 = random.choice([random_or_sub_expression_int(depth-1, expr1), UOp.const(dtypes.int, random.randint(-10, 10))]) + return random.choice(comp_ops)(expr1, expr2) -def ge(expr, rng=None): - if rng is None: rng = random.randint(-4,4) - return expr >= rng, rng - -def le(expr, rng=None): - if rng is None: rng = random.randint(-4,4) - return expr <= rng, rng - -def gt(expr, rng=None): - if rng is 
None: rng = random.randint(-4,4) - return expr > rng, rng - -# NOTE: you have to replace these for this test to pass -from tinygrad.uop.ops import python_alu, Ops -python_alu[Ops.MOD] = lambda x,y: x%y -python_alu[Ops.IDIV] = lambda x,y: x//y if __name__ == "__main__": - ops = [add_v, div, mul, add_num, mod] - for _ in range(1000): + skipped = 0 + for i in range(10000): + if i % 1000 == 0: + print(f"Running test {i}") upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256] u1 = Variable("v1", 0, random.choice(upper_bounds)) u2 = Variable("v2", 0, random.choice(upper_bounds)) u3 = Variable("v3", 0, random.choice(upper_bounds)) v = [u1,u2,u3] - tape = [random.choice(ops) for _ in range(random.randint(2, 30))] - # 10% of the time, add one of lt, le, gt, ge - if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge])) - expr = UOp.const(dtypes.int, 0) - rngs = [] - for t in tape: - expr, rng = t(expr) - if DEBUG >= 1: print(t.__name__, rng) - rngs.append(rng) - if DEBUG >=1: print(expr) - space = list(itertools.product(range(u1.vmin, u1.vmax+1), range(u2.vmin, u2.vmax+1), range(u3.vmin, u3.vmax+1))) - volume = len(space) - for (v1, v2, v3) in random.sample(space, min(100, volume)): - v = [v1,v2,v3] - rn = 0 - for t,r in zip(tape, rngs): rn, _ = t(rn, r) - num = eval(expr.render(simplify=False)) - if num != rn: - unsimplified_num = eval(expr.render(simplify=False)) - assert unsimplified_num == rn, "UNSIMPLIFIED MISMATCH!" 
- assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}\n{expr.render(simplify=False)}" - if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}") + expr = random_int_expr(6) + + with Context(CORRECT_DIVMOD_FOLDING=1): + simplified_expr = expr.simplify() + + solver = z3.Solver() + solver.set(timeout=5000) # some expressions take very long verify, but its very unlikely they actually return sat + z3_sink = graph_rewrite(expr.sink(simplified_expr, u1, u2, u3), z3_renderer, ctx=(solver, {})) + z3_expr, z3_simplified_expr = z3_sink.src[0].arg, z3_sink.src[1].arg + check = solver.check(z3_simplified_expr != z3_expr) + if check == z3.unknown and DEBUG>=1: + skipped += 1 + print("Skipped due to timeout or interrupt:\n" + + f"v1=Variable(\"{u1.arg[0]}\", {u1.arg[1]}, {u1.arg[2]})\n" + + f"v2=Variable(\"{u2.arg[0]}\", {u2.arg[1]}, {u2.arg[2]})\n" + + f"v3=Variable(\"{u3.arg[0]}\", {u3.arg[1]}, {u3.arg[2]})\n" + + f"expr = {expr.render(simplify=False)}\n") + elif check == z3.sat: + m = solver.model() + v1, v2, v3 = z3_sink.src[2].arg, z3_sink.src[3].arg, z3_sink.src[4].arg + n1, n2, n3 = m[v1], m[v2], m[v3] + u1_val, u2_val, u3_val = u1.const_like(n1.as_long()), u2.const_like(n2.as_long()), u3.const_like(n3.as_long()) + with Context(CORRECT_DIVMOD_FOLDING=1): + num = expr.simplify().substitute({u1:u1_val, u2:u2_val, u3:u3_val}).ssimplify() + rn = expr.substitute({u1:u1_val, u2:u2_val, u3:u3_val}).ssimplify() + if num==rn: print("z3 found a mismatch but the expressions are equal!!") + assert False, f"mismatched {expr.render()} at v1={m[v1]}; v2={m[v2]}; v3={m[v3]} = {num} != {rn}\n" +\ + "Reproduce with:\n" +\ + f"v1=Variable(\"{u1.arg[0]}\", {u1.arg[1]}, {u1.arg[2]})\n" +\ + f"v2=Variable(\"{u2.arg[0]}\", {u2.arg[1]}, {u2.arg[2]})\n" +\ + f"v3=Variable(\"{u3.arg[0]}\", {u3.arg[1]}, {u3.arg[2]})\n" +\ + f"expr = {expr}\n" +\ + f"v1_val, v2_val, v3_val = UOp.const(dtypes.int, {n1.as_long()}), UOp.const(dtypes.int, 
{n2.as_long()})," +\ + f"UOp.const(dtypes.int, {n3.as_long()})\n" +\ + "num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify()\n" +\ + "rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify()\n" +\ + "assert num==rn, f\"{num} != {rn}\"\n" + + if DEBUG >= 2: print(f"validated {expr.render()}") + print(f"Skipped {skipped} expressions due to timeout") diff --git a/tinygrad_repo/test/external/openpilot/b1ab7897cbfa35981e1636fe551e4ce5.npy b/tinygrad_repo/test/external/openpilot/b1ab7897cbfa35981e1636fe551e4ce5.npy index a10a135..2c87c44 100644 Binary files a/tinygrad_repo/test/external/openpilot/b1ab7897cbfa35981e1636fe551e4ce5.npy and b/tinygrad_repo/test/external/openpilot/b1ab7897cbfa35981e1636fe551e4ce5.npy differ diff --git a/tinygrad_repo/test/external/process_replay/local.sh b/tinygrad_repo/test/external/process_replay/local.sh index c13e220..0d79973 100644 --- a/tinygrad_repo/test/external/process_replay/local.sh +++ b/tinygrad_repo/test/external/process_replay/local.sh @@ -1,9 +1,10 @@ #!/bin/bash +set -e HEAD=$(git rev-parse --abbrev-ref HEAD) python test/external/process_replay/reset.py CAPTURE_PROCESS_REPLAY=1 python test/test_ops.py TestOps.test_add git checkout master git checkout $HEAD -- test/external/process_replay/process_replay.py -ASSERT_PROCESS_REPLAY=1 python test/external/process_replay/process_replay.py +ASSERT_PROCESS_REPLAY=${ASSERT_PROCESS_REPLAY:-1} python test/external/process_replay/process_replay.py git checkout $HEAD diff --git a/tinygrad_repo/test/external/process_replay/process_replay.py b/tinygrad_repo/test/external/process_replay/process_replay.py index 872af70..1ff2873 100755 --- a/tinygrad_repo/test/external/process_replay/process_replay.py +++ b/tinygrad_repo/test/external/process_replay/process_replay.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 # compare kernels created by HEAD against master -import os, multiprocessing, logging, pickle, sqlite3, difflib, functools, warnings, itertools -from 
typing import Callable, cast -from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm +import os, multiprocessing, logging, pickle, sqlite3, difflib, warnings, itertools +from typing import Callable, Any +from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm, to_function_name from tinygrad.engine.grouper import get_kernelize_map -from tinygrad.codegen.kernel import Kernel, Opt -from tinygrad.renderer import Renderer +from tinygrad.codegen.kernel import Kernel from tinygrad.uop.ops import UOp, Ops # *** process replay settings @@ -30,81 +29,60 @@ SKIP_PROCESS_REPLAY = (k:="[skip_process_replay]") in os.getenv("COMMIT_MESSAGE" if REF == "master": SKIP_PROCESS_REPLAY = True class ProcessReplayWarning(Warning): pass -# *** recreators +# *** replay the function and convert return values to string -def recreate_sched(big_sink:UOp) -> list[UOp]: +def replay_kernelize(ret:dict[UOp, UOp], big_sink:UOp) -> tuple[str, str, tuple[Any, ...]]: UOp.unique_num = itertools.count(max([u.arg for u in big_sink.toposort() if u.op is Ops.UNIQUE], default=0)+1) - becomes_map = get_kernelize_map(big_sink) - sched_sink = big_sink.substitute(becomes_map) - return [u.arg.ast for u in sched_sink.toposort() if u.op is Ops.KERNEL] + new_sink = big_sink.substitute(get_kernelize_map(big_sink)) + def to_str(ret:UOp) -> str: + asts = [repr(u.arg.ast) for u in ret.toposort() if u.op is Ops.KERNEL] + return "\n".join([f"{len(asts)} kernels", *asts]) + return to_str(new_sink), to_str(ret[big_sink]), (big_sink,) -def recreate_kernel(ast:UOp, opts:Renderer, applied_opts:list[Opt], name:str, _) -> str: - k = Kernel(ast, opts=opts) - for opt in applied_opts: k.apply_opt(opt) - # NOTE: replay with the captured renderer, not the one in master - return k.opts.render(cast(list,k.to_program(name).uops)) +def replay_linearize(k:Kernel, _:Kernel, name_override=None, ast_transform=None) -> tuple[str, str, tuple[Any, ...]]: + # 
create a copy because the Kernel class contains optimization parameters (other than applied_opts) in its state + # this should be made fully functional. It's fine for process replay since copy returns a fresh instance + k2 = k.copy() + k2.linearize(name_override=name_override or to_function_name(k.name), ast_transform=ast_transform) + def to_str(ret:Kernel) -> str: + try: return ret.opts.render(ret.uops) + except NotImplementedError: return "" # NULL backend doesn't have a renderer, this is okay + return to_str(k2), to_str(k), (k.ast, k.opts, k.applied_opts) -# *** diff a "good" recreation against the generated version +replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {"get_kernelize_map":replay_kernelize, "linearize":replay_linearize} -def diff(offset:int, name:str, fxn:Callable) -> None: +# *** run replayers on captured rows and print diffs + +def diff(offset:int) -> None: if ASSERT_DIFF: warnings.filterwarnings("error", category=ProcessReplayWarning) if early_stop.is_set(): return None conn = db_connection() cur = conn.cursor() - cur.execute(f"SELECT val FROM '{name}_{TABLE_NAME}' LIMIT ? OFFSET ?", (PAGE_SIZE, offset)) + cur.execute(f"SELECT val FROM '{TABLE_NAME}' LIMIT ? OFFSET ?", (PAGE_SIZE, offset)) changed = 0 for row in cur.fetchall(): if changed > MAX_DIFF_PCT: - warnings.warn(f"detected changes in over {MAX_DIFF_PCT}% of {name}s. skipping further diff generation.") + warnings.warn(f"detected changes in over {MAX_DIFF_PCT}%. 
skipping further diff generation.", ProcessReplayWarning) early_stop.set() break - # try unpickle - try: args = pickle.loads(row[0]) - except Exception as e: - changed += 1 - warnings.warn(f"FAILED TO UNPICKLE OBJECTS {e}", ProcessReplayWarning) - continue - # try recreate try: - ctx_vars = {k:v.value for k,v in args[-2].items() if k != "DEBUG" and (var:=ContextVar._cache.get(k)) is not None and var.value != v.value} - with Context(**ctx_vars): good = fxn(*args[:-2]) - if good is None: continue + name, args, kwargs, ctx_vals, loc, ret = pickle.loads(row[0]) + ctx_vars = {k:v.value for k,v in ctx_vals.items() if k != "DEBUG" and (var:=ContextVar._cache.get(k)) is not None and var.value != v.value} + if (replayer:=replayers.get(name)) is None: continue + with Context(**ctx_vars): good, compare, metadata = replayer(ret, *args, **kwargs) + if good != compare: + for m in metadata: trunc_log(m) + logging.info(loc) + for line in difflib.unified_diff(good.splitlines(), compare.splitlines()): + logging.info(colored(line, "red" if line.startswith("-") else "green" if line.startswith("+") else None)) + if ctx_vars: logging.info(ctx_vars) + warnings.warn("PROCESS REPLAY DETECTED CHANGE", ProcessReplayWarning) except Exception as e: changed += 1 - if ctx_vars: logging.info(ctx_vars) - for x in args[:-2]: trunc_log(x) - warnings.warn(f"FAILED TO RECREATE KERNEL {e}", ProcessReplayWarning) - continue - # diff kernels - try: assert str(args[-1]) == str(good) - except AssertionError: - changed += 1 - if ctx_vars: logging.info(ctx_vars) - for x in args[:-2]: trunc_log(x) - changes = list(difflib.unified_diff(str(good).splitlines(), str(args[-1]).splitlines())) - logging.info("\n".join(colored(line, "red" if line.startswith("-") else "green" if line.startswith("+") else None) for line in changes)) - warnings.warn("PROCESS REPLAY DETECTED CHANGE", ProcessReplayWarning) + warnings.warn(e, ProcessReplayWarning) conn.commit() cur.close() -# *** generic runner for executing fxn across all 
rows of a table in parallel - -def _pmap(name:str, fxn:Callable, maxtasksperchild:int=16) -> None: - conn = db_connection() - cur = conn.cursor() - try: row_count = cur.execute(f"select count(*) from '{name}_{TABLE_NAME}'").fetchone()[0] - except sqlite3.OperationalError: - warnings.warn(f"{name}_{TABLE_NAME} isn't accessible in master, did DB_VERSION change?", ProcessReplayWarning) - return None - conn.commit() - cur.close() - with multiprocessing.get_context("spawn").Pool(multiprocessing.cpu_count(), maxtasksperchild=maxtasksperchild) as pool: - inputs = list(range(0, row_count, PAGE_SIZE)) - list(tqdm(pool.imap_unordered(functools.partial(diff, name=name, fxn=fxn), inputs), total=len(inputs))) - pool.close() - pool.join() - pool.terminate() - # *** main loop if __name__ == "__main__": @@ -112,11 +90,20 @@ if __name__ == "__main__": logging.info("skipping process replay.") exit(0) - print(f"running process replay with {ASSERT_DIFF=}") - for name,fxn in [("schedule", recreate_sched), ("kernel", recreate_kernel)]: - logging.info(f"***** {name} diff") - try: _pmap(name, fxn) - except ProcessReplayWarning: exit(1) - except Exception as e: - if ASSERT_DIFF: raise e - logging.error(f"{name} diff err {e}") + conn = db_connection() + cur = conn.cursor() + try: row_count = cur.execute(f"select count(*) from '{TABLE_NAME}'").fetchone()[0] + except sqlite3.OperationalError: + warnings.warn(f"{TABLE_NAME} isn't accessible in master, did DB_VERSION change?", ProcessReplayWarning) + exit(int(ASSERT_DIFF)) + finally: + conn.commit() + cur.close() + + logging.info(f"running process replay with {ASSERT_DIFF=}") + with multiprocessing.get_context("spawn").Pool(multiprocessing.cpu_count()) as pool: + inputs = list(range(0, row_count, PAGE_SIZE)) + list(tqdm(pool.imap_unordered(diff, inputs), total=len(inputs))) + pool.close() + pool.join() + pool.terminate() diff --git a/tinygrad_repo/test/external/process_replay/reset.py b/tinygrad_repo/test/external/process_replay/reset.py index 
381a357..c40b6d0 100755 --- a/tinygrad_repo/test/external/process_replay/reset.py +++ b/tinygrad_repo/test/external/process_replay/reset.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 from tinygrad.helpers import db_connection, VERSION cur = db_connection() -cur.execute(f"drop table if exists kernel_process_replay_{VERSION}") -cur.execute(f"drop table if exists schedule_process_replay_{VERSION}") +cur.execute(f"drop table if exists process_replay_{VERSION}") diff --git a/tinygrad_repo/test/external/speed_v_theoretical.py b/tinygrad_repo/test/external/speed_v_theoretical.py index d02f87f..de59e8f 100644 --- a/tinygrad_repo/test/external/speed_v_theoretical.py +++ b/tinygrad_repo/test/external/speed_v_theoretical.py @@ -87,7 +87,7 @@ class TestKernelSpeed(unittest.TestCase): # NOTE: tiny7 was slower than tiny12 # TODO: why are convs so slow?!? - def test_conv_3x3_256_32_32_256_256(self): self._test_conv_3x3(256, 32, 32, 256, 256, nv_tflops=27, amd_tflops=20) + def test_conv_3x3_256_32_32_256_256(self): self._test_conv_3x3(256, 32, 32, 256, 256, nv_tflops=27, amd_tflops=14) # theoretical is nv_tflops=165, amd_tflops=123 def test_gemm_4096(self): self._test_matmul(4096, nv_tflops=115, amd_tflops=65) diff --git a/tinygrad_repo/test/helpers.py b/tinygrad_repo/test/helpers.py index 9a34198..07fe61b 100644 --- a/tinygrad_repo/test/helpers.py +++ b/tinygrad_repo/test/helpers.py @@ -2,10 +2,11 @@ import time, struct from typing import Any, Callable, Optional import numpy as np from tinygrad import Tensor, dtypes, Device -from tinygrad.uop.ops import UOp, Ops, sint +from tinygrad.uop.ops import UOp, Ops, sint, graph_rewrite from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.tensor import _to_np_dtype from tinygrad.engine.realize import Runner +from tinygrad.engine.grouper import view_left from tinygrad.dtype import ConstType, DType from tinygrad.nn.state import get_parameters from tinygrad.helpers import T, unwrap, CI @@ -44,7 +45,7 @@ def ast_const(dtype:DType, 
val:ConstType, shape:tuple[sint, ...]=(), st:Optional st_src = (st.to_uop() if st is not None else ShapeTracker.from_shape(()).reshape((1,)*len(shape)).expand(shape).to_uop(),) st = unwrap(st_src[0].st) if all(v.mask is None for v in st.views): return UOp.const(dtype, val).replace(src=(st.to_uop(),)) - return UOp.const(dtype, val).valid(st) + return graph_rewrite(UOp.const(dtype, val).view(st).valid(), view_left) def timeit(fxn:Callable[..., T], *args, **kwargs) -> tuple[T, float]: st = time.perf_counter_ns() diff --git a/tinygrad_repo/test/imported/__init__.py b/tinygrad_repo/test/imported/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tinygrad_repo/test/models/test_onnx.py b/tinygrad_repo/test/models/test_onnx.py index d1b7def..7bc9fb3 100644 --- a/tinygrad_repo/test/models/test_onnx.py +++ b/tinygrad_repo/test/models/test_onnx.py @@ -7,7 +7,7 @@ try: import onnx except ModuleNotFoundError: raise unittest.SkipTest("onnx not installed, skipping onnx test") -from tinygrad.frontend.onnx import OnnxRunner +from tinygrad.frontend.onnx import OnnxRunner, onnx_load from tinygrad.tensor import Tensor from tinygrad.helpers import CI, fetch, temp @@ -25,7 +25,7 @@ np.random.seed(1337) class TestOnnxModel(unittest.TestCase): def test_benchmark_openpilot_model(self): - onnx_model = onnx.load(fetch(OPENPILOT_MODEL)) + onnx_model = onnx_load(fetch(OPENPILOT_MODEL)) run_onnx = OnnxRunner(onnx_model) def get_inputs(): np_inputs = { @@ -69,7 +69,7 @@ class TestOnnxModel(unittest.TestCase): ps.print_stats(30) def test_openpilot_model(self): - onnx_model = onnx.load(fetch(OPENPILOT_MODEL)) + onnx_model = onnx_load(fetch(OPENPILOT_MODEL)) run_onnx = OnnxRunner(onnx_model) print("got run_onnx") inputs = { @@ -93,9 +93,8 @@ class TestOnnxModel(unittest.TestCase): et = time.monotonic() print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue") - Tensor.no_grad = True + 
onnx_model = onnx.load(fetch(OPENPILOT_MODEL)) torch_out = run_onnx_torch(onnx_model, inputs).numpy() - Tensor.no_grad = False print(tinygrad_out, torch_out) np.testing.assert_allclose(tinygrad_out, torch_out, atol=1e-4, rtol=1e-2) @@ -121,7 +120,7 @@ class TestOnnxModel(unittest.TestCase): input_name, input_new) def _test_model(self, fn, input_name, input_new, debug=False): - onnx_model = onnx.load(fn) + onnx_model = onnx_load(fn) print("onnx loaded") from test.models.test_efficientnet import chicken_img, car_img, preprocess, _LABELS run_onnx = OnnxRunner(onnx_model) diff --git a/tinygrad_repo/test/models/test_real_world.py b/tinygrad_repo/test/models/test_real_world.py index fd19e72..e6facd4 100644 --- a/tinygrad_repo/test/models/test_real_world.py +++ b/tinygrad_repo/test/models/test_real_world.py @@ -61,7 +61,7 @@ class TestRealWorld(unittest.TestCase): derandomize_model(model) @TinyJit def test(t, t2): return model(t, Tensor([801]), t2).realize() - helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515) + helper_test("test_sd", lambda: (Tensor.randn(1, 4, 32, 32),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515) def test_unet_resblock(self): model = [ResBlock(16, 24, 16) for _ in range(4)] diff --git a/tinygrad_repo/test/models/test_whisper.py b/tinygrad_repo/test/models/test_whisper.py index 7e4b95b..f1696fd 100644 --- a/tinygrad_repo/test/models/test_whisper.py +++ b/tinygrad_repo/test/models/test_whisper.py @@ -1,7 +1,7 @@ import unittest import pathlib from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform -from tinygrad.helpers import CI, fetch, Context +from tinygrad.helpers import CI, fetch from tinygrad import Device, dtypes from tinygrad.device import is_dtype_supported @@ -24,25 +24,15 @@ class TestWhisper(unittest.TestCase): model, enc = init_whisper("tiny.en", batch_size=2) cls.model = model cls.enc = enc - # TODO: whisper has out of 
bounds access somewhere - cls.context = Context(IGNORE_OOB=1) - cls.context.__enter__() @classmethod def tearDownClass(cls): - cls.context.__exit__(None, None, None) del cls.model del cls.enc def test_transcribe_file1(self): self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1) - @unittest.expectedFailure # Test for out of bounds access - @unittest.skip("TODO: flaky") - def test_transcribe_file1_OOB(self): - with Context(IGNORE_OOB=0): - self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1) - @unittest.skipIf(CI or Device.DEFAULT == "LLVM", "too many tests for CI") def test_transcribe_file2(self): self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_2), TRANSCRIPTION_2) diff --git a/tinygrad_repo/test/test_arange.py b/tinygrad_repo/test/test_arange.py index 5599996..e9bc998 100644 --- a/tinygrad_repo/test/test_arange.py +++ b/tinygrad_repo/test/test_arange.py @@ -20,7 +20,7 @@ class TestArange(unittest.TestCase): p = k.to_program() print(p.name) #print(p.src) - ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run() + ExecItem(CompiledRunner(p), [tt.uop.buffer]).run() np.testing.assert_equal(tt.numpy(), np.arange(N)) return p.estimates.ops @@ -189,8 +189,6 @@ class TestIndexing(unittest.TestCase): np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x) np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y) - # TODO: fix these on WEBGPU, it looks like it has to do with packed stuff - @unittest.skipIf(getenv("WEBGPU"), "broken on webgpu for some reason") def test_index_mnist_opt(self): self.test_index_mnist(0) def test_index_mnist_split(self): self.test_index_mnist(1, split_reduceop=1) def test_index_mnist_opt_split(self): self.test_index_mnist(0, split_reduceop=1) diff --git a/tinygrad_repo/test/test_assign.py b/tinygrad_repo/test/test_assign.py index 6f8a7b8..837e414 100644 --- a/tinygrad_repo/test/test_assign.py +++ b/tinygrad_repo/test/test_assign.py @@ -13,11 +13,11 @@ 
class TestAssign(unittest.TestCase): b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N) a.realize() b.realize() - ba1 = a.lazydata.base.realized - bb1 = b.lazydata.base.realized + ba1 = a.uop.base.realized + bb1 = b.uop.base.realized a += b a.realize() - ba2 = a.lazydata.base.realized + ba2 = a.uop.base.realized assert ba1 == ba2 and ba1 != bb1 np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N))) @@ -259,13 +259,13 @@ class TestAssign(unittest.TestCase): b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N) a.realize() b.realize() - ba1 = a.lazydata.base.realized - bb1 = b.lazydata.base.realized + ba1 = a.uop.base.realized + bb1 = b.uop.base.realized with self.assertRaises((RuntimeError, AssertionError)): a = a.permute(1,0) a += b a.realize() - ba2 = a.lazydata.base.realized + ba2 = a.uop.base.realized assert ba1 != ba2 and ba1 != bb1 np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0)) @@ -275,12 +275,12 @@ class TestAssign(unittest.TestCase): a.realize() b.realize() #GlobalCounters.cache = [] - ba1 = a.lazydata.base.realized # noqa: F841 - bb1 = b.lazydata.base.realized # noqa: F841 + ba1 = a.uop.base.realized # noqa: F841 + bb1 = b.uop.base.realized # noqa: F841 with self.assertRaisesRegex(RuntimeError, "contiguous"): a.assign(a.permute(1,0) + b) # this should not work! 
a.realize() - ba2 = a.lazydata.base.realized # noqa: F841 + ba2 = a.uop.base.realized # noqa: F841 # NOTE: don't test that it's assigned #assert ba1 == ba2 and ba1 != bb1 np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0)) @@ -383,10 +383,10 @@ class TestAssign(unittest.TestCase): def test_cast_assignment(self): a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N) a.realize() - oba1 = a.lazydata.base.output_buffer + oba1 = a.uop.base.output_buffer a.assign(a.cast(dtypes.int32).realize()) a.realize() - oba2 = a.lazydata.base.output_buffer + oba2 = a.uop.base.output_buffer assert oba1 is None and oba2 is None np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N))) diff --git a/tinygrad_repo/test/test_const_folding.py b/tinygrad_repo/test/test_const_folding.py index a117baa..187a4b5 100644 --- a/tinygrad_repo/test/test_const_folding.py +++ b/tinygrad_repo/test/test_const_folding.py @@ -174,9 +174,9 @@ class TestMovedConstFolding(unittest.TestCase): if is_dtype_supported(dtypes.uint16): _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16)) np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0]) - # not folded + # folded if is_dtype_supported(dtypes.int64): - _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64)) + _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64)) np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0]) class TestReduceOpsConstFolding(unittest.TestCase): @@ -221,7 +221,7 @@ class TestReduceOpsConstFolding(unittest.TestCase): # contiguous folded const can still schedule a = Tensor.empty(1, 0).sum().contiguous() _check_ast_count(2, a+2) - self.assertIs(a.lazydata.base.op, Ops.BUFFER) + self.assertIs(a.uop.base.op, Ops.BUFFER) np.testing.assert_equal((Tensor.empty(1, 
0).sum().contiguous()+2).numpy(), 2) # otherwise we just fuse it _check_ast_count(1, (Tensor.empty(1, 0).sum()+2).contiguous()) diff --git a/tinygrad_repo/test/test_dtype.py b/tinygrad_repo/test/test_dtype.py index b41a966..cfa95e0 100644 --- a/tinygrad_repo/test/test_dtype.py +++ b/tinygrad_repo/test/test_dtype.py @@ -1,15 +1,15 @@ -import unittest, operator, subprocess, math +import unittest, math import numpy as np import torch from typing import Any, List from tinygrad.device import is_dtype_supported from tinygrad.helpers import getenv, DEBUG, CI -from tinygrad.dtype import DType, DTYPES_DICT, ImageDType, PtrDType, least_upper_float, least_upper_dtype, truncate_fp16, truncate_bf16, to_dtype -from tinygrad.dtype import truncate, fp8_to_float, float_to_fp8 +from tinygrad.dtype import DType, DTYPES_DICT, ImageDType, PtrDType, least_upper_dtype, to_dtype, fp8_to_float, float_to_fp8 from tinygrad import Device, Tensor, dtypes from tinygrad.tensor import _to_np_dtype from hypothesis import assume, given, settings, strategies as strat from test.helpers import rand_for_dtype +from test.unit.test_dtype_spec import _assert_eq, core_dtypes, dtype_ints, dtype_floats, FP8E4M3_MAX, FP8E5M2_MAX import ml_dtypes import pytest pytestmark = pytest.mark.filterwarnings("ignore") @@ -17,12 +17,7 @@ pytestmark = pytest.mark.filterwarnings("ignore") settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False)) settings.load_profile("my_profile") -core_dtypes = list(DTYPES_DICT.values()) if Device.DEFAULT == "CPU": core_dtypes.remove(dtypes.bfloat16) # NOTE: this is for teenygrad, don't remove -dtype_ints = [dt for dt in core_dtypes if dtypes.is_int(dt) and is_dtype_supported(dt)] -dtype_floats = [dt for dt in core_dtypes if dtypes.is_float(dt) and is_dtype_supported(dt)] -FP8E4M3_MAX = 448.0 -FP8E5M2_MAX = 57344.0 def get_available_cast_dtypes(dtype: DType) -> List[DType]: if not is_dtype_supported(dtype): return [] @@ -32,21 
+27,13 @@ def get_available_cast_dtypes(dtype: DType) -> List[DType]: def _test_to_np(a:Tensor, np_dtype, target): if DEBUG >= 2: print(a) na = a.numpy() - if DEBUG >= 2: print(na, na.dtype, a.lazydata.base.realized) + if DEBUG >= 2: print(na, na.dtype, a.uop.base.realized) try: assert na.dtype == np_dtype np.testing.assert_allclose(na, target) except AssertionError as e: raise AssertionError(f"\ntensor {a.numpy()} does not match target {target} with np_dtype {np_dtype}") from e -def _assert_eq(tensor:Tensor, target_dtype:DType, target, tol_target_dtype:float=1e-7): - if DEBUG >= 2: print(tensor.numpy()) - try: - assert tensor.dtype == target_dtype - np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2}.get(target_dtype, tol_target_dtype)) - except AssertionError as e: - raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e - def _test_op(fxn, target_dtype:DType, target): _assert_eq(fxn(), target_dtype, target) def _test_cast(a:Tensor, target_dtype:DType): @@ -413,523 +400,6 @@ class TestEqStrDType(unittest.TestCase): self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))") self.assertEqual(str(dtypes.float32.ptr(16)), "dtypes.float.ptr(16)") -class TestHelpers(unittest.TestCase): - signed_ints = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64) - uints = (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64) - floats = (dtypes.float16, dtypes.float32, dtypes.float64) - - @given(strat.sampled_from(signed_ints+uints), strat.integers(min_value=1, max_value=8)) - def test_is_int(self, dtype, amt): - assert dtypes.is_int(dtype.vec(amt) if amt > 1 else dtype) - assert not dtypes.is_float(dtype.vec(amt) if amt > 1 else dtype) - - @given(strat.sampled_from(uints), strat.integers(min_value=1, max_value=8)) - def test_is_unsigned_uints(self, dtype, amt): - assert dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) - 
- @given(strat.sampled_from(signed_ints), strat.integers(min_value=1, max_value=8)) - def test_is_unsigned_signed_ints(self, dtype, amt): - assert not dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) - - @given(strat.sampled_from(floats), strat.integers(min_value=1, max_value=8)) - def test_is_float(self, dtype, amt): - assert dtypes.is_float(dtype.vec(amt) if amt > 1 else dtype) - assert not dtypes.is_int(dtype.vec(amt) if amt > 1 else dtype) - assert not dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) - - def test_bf16_is_float(self): - assert dtypes.is_float(dtypes.bfloat16) - - def test_fp8s_are_float(self): - assert dtypes.is_float(dtypes.fp8e4m3) - assert dtypes.is_float(dtypes.fp8e5m2) - - @given(strat.sampled_from([d for d in DTYPES_DICT.values() if dtypes.is_float(d) or dtypes.is_int(d)]), strat.integers(min_value=2, max_value=8)) - def test_scalar(self, dtype, amt): - assert dtype.vec(amt).scalar() == dtype - - def test_from_py(self): - assert dtypes.from_py(True) == dtypes.bool - assert dtypes.from_py(2) == dtypes.default_int - assert dtypes.from_py(3.0) == dtypes.default_float - assert dtypes.from_py([]) == dtypes.default_float - assert dtypes.from_py(()) == dtypes.default_float - assert dtypes.from_py([True]) == dtypes.bool - assert dtypes.from_py([True, 2]) == dtypes.default_int - assert dtypes.from_py([True, 3.0]) == dtypes.default_float - assert dtypes.from_py([2, 3.0]) == dtypes.default_float - assert dtypes.from_py([True, 2, 3.0]) == dtypes.default_float - with self.assertRaises(RuntimeError): dtypes.from_py(None) - with self.assertRaises(RuntimeError): dtypes.from_py([None]) - with self.assertRaises(RuntimeError): dtypes.from_py({}) - with self.assertRaises(RuntimeError): dtypes.from_py(set()) - - def test_dtype_range(self): - for dt in core_dtypes: - if dtypes.is_float(dt): - np.testing.assert_equal(dtypes.min(dt), -math.inf) - np.testing.assert_equal(dtypes.max(dt), math.inf) - elif dtypes.is_int(dt): - info = 
np.iinfo(_to_np_dtype(dt)) - np.testing.assert_equal(dtypes.min(dt), info.min) - np.testing.assert_equal(dtypes.max(dt), info.max) - else: - assert dt == dtypes.bool, dt - np.testing.assert_equal(dtypes.min(dt), False) - np.testing.assert_equal(dtypes.max(dt), True) - - def test_truncate_fp16(self): - self.assertEqual(truncate_fp16(1), 1) - self.assertEqual(truncate_fp16(65504), 65504) - self.assertEqual(truncate_fp16(65519.999), 65504) - self.assertEqual(truncate_fp16(65520), math.inf) - - def test_truncate_bf16(self): - self.assertEqual(truncate_bf16(1), 1) - self.assertAlmostEqual(truncate_bf16(1.1), 1.09375, places=7) - for a in [1234, 23456, -777.777]: - self.assertEqual(truncate_bf16(a), torch.tensor([a], dtype=torch.bfloat16).item()) - # TODO: torch bfloat 1.1 gives 1.1015625 instead of 1.09375 - max_bf16 = torch.finfo(torch.bfloat16).max - self.assertEqual(truncate_bf16(max_bf16), max_bf16) - self.assertEqual(truncate_bf16(min_bf16:=-max_bf16), min_bf16) - self.assertEqual(truncate_bf16(max_bf16 * 1.00001), math.inf) - self.assertEqual(truncate_bf16(min_bf16 * 1.00001), -math.inf) - - @given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True)) - def test_truncate_fp8e4m3(self, x): - if x > FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), FP8E4M3_MAX) - elif x < -FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), -FP8E4M3_MAX) - else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), ml_dtypes.float8_e4m3fn(x)) - - @given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True)) - def test_truncate_fp8e5m2(self, x): - if x > FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), FP8E5M2_MAX) - elif x < -FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), -FP8E5M2_MAX) - else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), ml_dtypes.float8_e5m2(x)) - -class TestTypeSpec(unittest.TestCase): - def setUp(self): - self.old_default_int, 
self.old_default_float = dtypes.default_int, dtypes.default_float - def tearDown(self): - dtypes.default_int, dtypes.default_float = self.old_default_int, self.old_default_float - - def test_set_dtype_default(self): - for default_int in [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64]: - dtypes.default_int = default_int - assert dtypes.default_int == default_int - - for default_float in [*dtypes.fp8s, dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: - dtypes.default_float = default_float - assert dtypes.default_float == default_float - - def test_env_set_default_float(self): - # check default - subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'], - shell=True, check=True) - # check change - subprocess.run(['DEFAULT_FLOAT=HALF python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.half"'], - shell=True, check=True) - # check invalid - with self.assertRaises(subprocess.CalledProcessError): - subprocess.run(['DEFAULT_FLOAT=INT32 python3 -c "from tinygrad import dtypes"'], - shell=True, check=True) - - with self.assertRaises(subprocess.CalledProcessError): - subprocess.run(['DEFAULT_FLOAT=TYPO python3 -c "from tinygrad import dtypes"'], - shell=True, check=True) - - @unittest.skipUnless(is_dtype_supported(dtypes.int8), f"no int8 on {Device.DEFAULT}") - def test_dtype_str_arg(self): - n = np.random.normal(0, 1, (10, 10)).astype(np.float32) - tested = 0 - for dtype_str, dtype in [ - ("bool", dtypes.bool), ("int8", dtypes.int8), ("int", dtypes.int), ("uint32", dtypes.uint32), ("float32", dtypes.float32)]: - np.testing.assert_equal(Tensor(n, dtype=dtype_str).numpy(), Tensor(n, dtype=dtype).numpy()) - np.testing.assert_equal(Tensor(n).cast(dtype_str).numpy(), Tensor(n).cast(dtype).numpy()) - if dtype.itemsize == 4: - np.testing.assert_equal(Tensor(n).bitcast(dtype_str).numpy(), Tensor(n).bitcast(dtype).numpy()) - tested += 1 - assert tested == 3 - - with 
self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="nonexistdtype") - with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="") - - np.testing.assert_equal(Tensor(n).sum(dtype="int16").numpy(), Tensor(n).sum(dtype=dtypes.int16).numpy()) - - @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_creation(self, default_int, default_float): - dtypes.default_int, dtypes.default_float = default_int, default_float - _assert_eq(Tensor(True), dtypes.bool, True) - _assert_eq(Tensor(None), dtypes.default_float, []) - _assert_eq(Tensor(2), dtypes.default_int, 2) - _assert_eq(Tensor(2.34), dtypes.default_float, 2.34) - _assert_eq(Tensor([]), dtypes.default_float, []) - _assert_eq(Tensor([1]), dtypes.default_int, [1]) - _assert_eq(Tensor([1.1]), dtypes.default_float, [1.1]) - - _assert_eq(Tensor.eye(0), dtypes.default_float, np.eye(0)) - _assert_eq(Tensor.eye(3), dtypes.default_float, np.eye(3)) - if is_dtype_supported(dtypes.int64): - _assert_eq(Tensor.eye(3, dtype=dtypes.int64), dtypes.int64, np.eye(3)) - if is_dtype_supported(dtypes.float16): - _assert_eq(Tensor.eye(3, dtype=dtypes.float16), dtypes.float16, np.eye(3)) - - @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_full(self, default_int, default_float): - dtypes.default_int, dtypes.default_float = default_int, default_float - - _assert_eq(Tensor.zeros((2, 3)), dtypes.default_float, np.zeros((2, 3))) - if is_dtype_supported(dtypes.int64): - _assert_eq(Tensor.zeros((2, 3), dtype=dtypes.int64), dtypes.int64, np.zeros((2, 3))) - if is_dtype_supported(dtypes.float16): - _assert_eq(Tensor.zeros((2, 3), dtype=dtypes.float16), dtypes.float16, np.zeros((2, 3))) - - _assert_eq(Tensor.ones((2, 3)), dtypes.default_float, np.ones((2, 3))) - if is_dtype_supported(dtypes.int64): - _assert_eq(Tensor.ones((2, 3), dtype=dtypes.int64), dtypes.int64, np.ones((2, 3))) - if is_dtype_supported(dtypes.float16): - _assert_eq(Tensor.ones((2, 3), 
dtype=dtypes.float16), dtypes.float16, np.ones((2, 3))) - - _assert_eq(Tensor.full((2, 3), 3.0), dtypes.default_float, np.full((2, 3), 3.0)) - _assert_eq(Tensor.full((2, 3), 3), dtypes.default_int, np.full((2, 3), 3)) - _assert_eq(Tensor.full((2, 3), True), dtypes.bool, np.full((2, 3), True)) - if is_dtype_supported(dtypes.int64): - _assert_eq(Tensor.full((2, 3), 3, dtype=dtypes.int64), dtypes.int64, np.full((2, 3), 3)) - _assert_eq(Tensor.full((2, 3), 3.0, dtype=dtypes.int64), dtypes.int64, np.full((2, 3), 3)) - if is_dtype_supported(dtypes.float16): - _assert_eq(Tensor.full((2, 3), 3, dtype=dtypes.float16), dtypes.float16, np.full((2, 3), 3)) - _assert_eq(Tensor.full((2, 3), 3.0, dtype=dtypes.float16), dtypes.float16, np.full((2, 3), 3)) - - @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_reduce_0d_default(self, default_int, default_float): - dtypes.default_int, dtypes.default_float = default_int, default_float - _assert_eq(Tensor.ones((2,3,0)).sum(2), dtypes.default_float, np.zeros((2, 3))) - # TODO: what should this one be? 
- # _assert_eq(Tensor.ones((2,3,0), dtype=dtypes.default_int).sum(2), dtypes.default_int, np.zeros((2, 3))) - _assert_eq(Tensor.ones((2,3,0), dtype=dtypes.int32).sum(2), dtypes.int32, np.zeros((2, 3))) - - @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_arange(self, default_int, default_float): - dtypes.default_int, dtypes.default_float = default_int, default_float - - _assert_eq(Tensor.arange(5), dtypes.default_int, np.arange(5)) - _assert_eq(Tensor.arange(120), dtypes.default_int, np.arange(120)) - _assert_eq(Tensor.arange(5.0), dtypes.default_float, np.arange(5)) - if is_dtype_supported(dtypes.int16): - _assert_eq(Tensor.arange(5, dtype=dtypes.int16), dtypes.int16, np.arange(5)) - if is_dtype_supported(dtypes.int64): - _assert_eq(Tensor.arange(5, dtype=dtypes.int64), dtypes.int64, np.arange(5)) - if is_dtype_supported(dtypes.float16): - _assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5)) - _assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7), 1e-6 if Device.DEFAULT == "WEBGPU" else 1e-7) - _assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3)) - # stop-start and step have different signs - _assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2)) - _assert_eq(Tensor.arange(5.0, 3.0), dtypes.default_float, np.arange(5.0, 3.0)) - - @given(strat.sampled_from(core_dtypes), strat.sampled_from([operator.gt, operator.ge, operator.le, operator.lt, operator.eq, operator.ne])) - def test_bool_ops(self, dtype, op): - assert op(Tensor.ones(4, 4, dtype=dtype), Tensor.ones(4, 4, dtype=dtype)).dtype == dtypes.bool - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_functions_return_index(self, dtype, default_int, default_float): - dtypes.default_int, dtypes.default_float = default_int, default_float - assert Tensor([0, 1], dtype=dtype).argmax().dtype == dtypes.int32 - assert 
Tensor([0, 1], dtype=dtype).argmin().dtype == dtypes.int32 - assert Tensor([0, 1], dtype=dtype).multinomial().dtype == dtypes.int32 - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints)) - def test_tensor_indexing_returns_same_dtype(self, data_dtype, indices_dtype): - X_data = Tensor.ones(60000, 1, 28, 28, dtype=data_dtype) - indices = Tensor.randint(512, high=X_data.shape[0]).cast(indices_dtype) - assert X_data[indices].dtype == X_data.dtype - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints)) - def test_gather_returns_same_dtype(self, data_dtype, indices_dtype): - X_data = Tensor([[1, 0], [0, 1]], dtype=data_dtype) - indices = Tensor([[0, 0], [1, 0]], dtype=indices_dtype) - assert X_data.gather(0, indices).dtype == X_data.dtype - assert X_data.gather(1, indices).dtype == X_data.dtype - - @given(strat.sampled_from(dtype_floats), strat.sampled_from(dtype_floats)) - def test_attention_returns_same_dtype(self, data_dtype, default_float): - dtypes.default_float = default_float - query = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) - key = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) - value = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) - mask = (Tensor.rand(32, 8, 128, 128) < 0.5) - assert query.scaled_dot_product_attention(key, value, is_causal=True).dtype == data_dtype - assert query.scaled_dot_product_attention(key, value, is_causal=True, dropout_p=0.3).dtype == data_dtype - assert query.scaled_dot_product_attention(key, value, is_causal=False).dtype == data_dtype - assert query.scaled_dot_product_attention(key, value, attn_mask=mask).dtype == data_dtype - -class TestTypePromotion(unittest.TestCase): - @given(strat.sampled_from(core_dtypes)) - def test_self_promo_to_self(self, dtype): - assert least_upper_dtype(dtype) == dtype - assert least_upper_dtype(dtype, dtype) == dtype - assert least_upper_dtype(dtype, dtype, dtype) == dtype - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) - def 
test_promo_resulted_higher_than_inputs(self, dtype1, dtype2): - result = least_upper_dtype(dtype1, dtype2) - assert not (result < dtype1) and not (result < dtype2) - - def test_dtype_promo(self): - assert least_upper_dtype(dtypes.bool, dtypes.int8) == dtypes.int8 - assert least_upper_dtype(dtypes.int8, dtypes.uint8) == dtypes.int16 - assert least_upper_dtype(dtypes.uint8, dtypes.int16) == dtypes.int16 - assert least_upper_dtype(dtypes.int16, dtypes.uint16) == dtypes.int32 - assert least_upper_dtype(dtypes.uint16, dtypes.int32) == dtypes.int32 - assert least_upper_dtype(dtypes.int32, dtypes.uint32) == dtypes.int64 - assert least_upper_dtype(dtypes.uint32, dtypes.int64) == dtypes.int64 - # similar to jax but we don't use weak type - assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.float16 - assert least_upper_dtype(dtypes.float16, dtypes.float32) == dtypes.float32 - assert least_upper_dtype(dtypes.float32, dtypes.float64) == dtypes.float64 - - assert least_upper_dtype(dtypes.bool, dtypes.float32) == dtypes.float32 - assert least_upper_dtype(dtypes.bool, dtypes.float64) == dtypes.float64 - assert least_upper_dtype(dtypes.float16, dtypes.int64) == dtypes.float16 - assert least_upper_dtype(dtypes.float16, dtypes.uint64) == dtypes.float16 - assert least_upper_dtype(dtypes.fp8e4m3, dtypes.fp8e5m2) == dtypes.half - -class TestAutoCastType(unittest.TestCase): - def setUp(self): - self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float - def tearDown(self): - dtypes.default_int, dtypes.default_float = self.old_default_int, self.old_default_float - - @given(strat.sampled_from(dtype_floats), strat.sampled_from(dtype_floats)) - def test_least_upper_float_input_is_float(self, input_dtype, default_float): - dtypes.default_float = default_float - self.assertEqual(least_upper_float(input_dtype), input_dtype) - - @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) - def test_least_upper_float_input_is_int(self, 
input_dtype, default_float): - dtypes.default_float = default_float - self.assertEqual(least_upper_float(input_dtype), default_float) - - @given(strat.sampled_from([d for d in core_dtypes if dtypes.is_int(d) and is_dtype_supported(d)])) - def test_int_to_float_unary_func(self, dtype): - for func in [ - lambda t: t.exp(), - lambda t: t.exp2(), - lambda t: t.log(), - lambda t: t.log2(), - lambda t: t.sqrt(), - lambda t: t.rsqrt(), - lambda t: t.sin(), - lambda t: t.cos(), - lambda t: t.tan(), - lambda t: t.sigmoid(), - ]: - a = [2, 3, 4] - # float16 can have larger precision errors - np.testing.assert_allclose(func(Tensor(a, dtype=dtype)).numpy(), func(torch.tensor(a)), rtol=1e-3, atol=1e-3) - - @given(strat.sampled_from(core_dtypes)) - def test_broadcast_scalar(self, dt): - assert (Tensor.ones(4, 4, dtype=dt) + 2.3).dtype == (dt if dtypes.is_float(dt) else dtypes.default_float) - assert (Tensor.ones(4, 4, dtype=dt) + 2).dtype == (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int) - assert (Tensor.ones(4, 4, dtype=dt) + True).dtype == dt - - @given(strat.sampled_from(dtype_floats)) - def test_int_div_int(self, default_float): - dtypes.default_float = default_float - self.assertEqual(Tensor([1]).div(Tensor([2])).dtype, default_float) - - def test_sum(self): - assert (Tensor([0, 1], dtype=dtypes.bool)).sum().dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int8)).sum().dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int16)).sum().dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int32)).sum().dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int64)).sum().dtype == dtypes.int64 - assert (Tensor([0, 1], dtype=dtypes.uint8)).sum().dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint16)).sum().dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint32)).sum().dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint64)).sum().dtype == dtypes.uint64 - assert (Tensor([0, 1], 
dtype=dtypes.float16)).sum().dtype == dtypes.float16 - #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).sum().dtype == dtypes.bfloat16 - assert (Tensor([0, 1], dtype=dtypes.float32)).sum().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.float64)).sum().dtype == dtypes.float64 - - @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16") - def test_sum_dtype_arg(self): - t = Tensor([40000, 40000], dtype=dtypes.float16) - # default float16 sum returns in float16, overflowed in this case - assert t.sum().dtype == dtypes.float16 - assert math.isinf(t.sum().numpy().item()) - # specifiying dtype and it's not downcasted - assert t.sum(dtype=dtypes.float32).dtype == dtypes.float32 - np.testing.assert_allclose(t.sum(dtype=dtypes.float32).numpy(), 80000) - - def test_prod_dtype_arg(self): - t = Tensor([100, 200], dtype=dtypes.int32) - assert t.prod().dtype == dtypes.int32 - np.testing.assert_allclose(t.prod().numpy(), 20000) - assert t.prod(dtype=dtypes.float32).dtype == dtypes.float32 - np.testing.assert_allclose(t.prod(dtype=dtypes.float32).numpy(), 20000) - - def test_mean(self): - assert (Tensor([0, 1], dtype=dtypes.bool)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.int8)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.int16)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.int32)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.int64)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.uint8)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.uint16)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.uint32)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.uint64)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.float16)).mean().dtype == dtypes.float16 - #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).mean().dtype == dtypes.bfloat16 - assert (Tensor([0, 1], 
dtype=dtypes.float32)).mean().dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.float64)).mean().dtype == dtypes.float64 - - def test_cumsum(self): - assert (Tensor([0, 1], dtype=dtypes.bool)).cumsum(0).dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int8)).cumsum(0).dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int16)).cumsum(0).dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int32)).cumsum(0).dtype == dtypes.int32 - assert (Tensor([0, 1], dtype=dtypes.int64)).cumsum(0).dtype == dtypes.int64 - assert (Tensor([0, 1], dtype=dtypes.uint8)).cumsum(0).dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint16)).cumsum(0).dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint32)).cumsum(0).dtype == dtypes.uint32 - assert (Tensor([0, 1], dtype=dtypes.uint64)).cumsum(0).dtype == dtypes.uint64 - assert (Tensor([0, 1], dtype=dtypes.float16)).cumsum(0).dtype == dtypes.float16 - #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).cumsum(0).dtype == dtypes.bfloat16 - assert (Tensor([0, 1], dtype=dtypes.float32)).cumsum(0).dtype == dtypes.float32 - assert (Tensor([0, 1], dtype=dtypes.float64)).cumsum(0).dtype == dtypes.float64 - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) - def test_matmul(self, dt1, dt2, acc_dt): - t1 = Tensor([0, 1], dtype=dt1) - t2 = Tensor([0, 1], dtype=dt2) - self.assertEqual(t1.matmul(t2).dtype, least_upper_dtype(t1.dtype, t2.dtype)) - # if dtype is specified, return in dtype - self.assertEqual(t1.matmul(t2, dtype=acc_dt).dtype, acc_dt) - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) - def test_linear(self, dt1, dt2, dt3, acc_dt): - x = Tensor([0, 1], dtype=dt1) - w = Tensor([0, 1], dtype=dt2) - b = Tensor([0, 1], dtype=dt3) - self.assertEqual(x.linear(w).dtype, least_upper_dtype(x.dtype, w.dtype)) - self.assertEqual(x.linear(w, 
b).dtype, least_upper_dtype(least_upper_dtype(x.dtype, w.dtype), b.dtype)) - # if dtype is specified, return in dtype - self.assertEqual(x.linear(w, dtype=acc_dt).dtype, acc_dt) - self.assertEqual(x.linear(w, b, dtype=acc_dt).dtype, acc_dt) - - @staticmethod - def check_where_alternate_input_other(input_, other, data_type): - assert (Tensor([True, False]).where(input_, other)).dtype == data_type - assert (Tensor([True, False]).where(other, input_)).dtype == data_type - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) - def test_where_no_scalar(self, dt1, dt2): - self.check_where_alternate_input_other(Tensor(2, dtype=dt1), Tensor(3, dtype=dt2), least_upper_dtype(dt1, dt2)) - - @given(strat.sampled_from(core_dtypes)) - def test_where_one_scalar(self, dt): - t = Tensor(2, dtype=dt) - self.check_where_alternate_input_other(t, 3.2, (dt if dtypes.is_float(dt) else dtypes.default_float)) - self.check_where_alternate_input_other(t, 3, (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int)) - self.check_where_alternate_input_other(t, True, dt) - - def test_where_two_scalars(self): - self.check_where_alternate_input_other(3.1, 3.2, dtypes.default_float) - self.check_where_alternate_input_other(3.1, 3, dtypes.default_float) - self.check_where_alternate_input_other(3.1, True, dtypes.default_float) - self.check_where_alternate_input_other(3, 2, dtypes.default_int) - self.check_where_alternate_input_other(3, True, dtypes.default_int) - self.check_where_alternate_input_other(False, True, dtypes.bool) - - @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) - def test_maximum(self, dt1, dt2): - assert Tensor([0, 1, 2], dtype=dt1).maximum(Tensor([2, 0, 5], dtype=dt2)).dtype == least_upper_dtype(dt1, dt2) - - @given(strat.sampled_from(core_dtypes)) - def test_maximum_const(self, dt): - assert Tensor([1, 2], dtype=dt).maximum(3.1).dtype == (dt if dtypes.is_float(dt) else dtypes.default_float) - assert Tensor([1, 2], 
dtype=dt).maximum(3).dtype == (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int) - assert Tensor([1, 2], dtype=dt).maximum(True).dtype == dt - - def test_div(self): - assert (Tensor([1, 2], dtype=dtypes.int32) / Tensor([2, 2], dtype=dtypes.int32)).dtype == dtypes.default_float - assert (Tensor([1, 2], dtype=dtypes.int16) / Tensor([2, 2], dtype=dtypes.int32)).dtype == dtypes.default_float - assert (Tensor([1, 2], dtype=dtypes.float32) / Tensor([2, 2], dtype=dtypes.float16)).dtype == dtypes.float32 - assert (Tensor([1, 2], dtype=dtypes.int32) / Tensor([2, 2], dtype=dtypes.float16)).dtype == dtypes.float16 - - def test_div_const(self): - assert (Tensor([1, 2], dtype=dtypes.int32) / 2).dtype == dtypes.default_float - assert (Tensor([1, 2], dtype=dtypes.int32) / 2.0).dtype == dtypes.default_float - assert (Tensor([1, 2], dtype=dtypes.float16) / 2).dtype == dtypes.float16 - assert (Tensor([1, 2], dtype=dtypes.float16) / 2.0).dtype == dtypes.float16 - - def test_gradient_dtype(self): - old_default_float = dtypes.default_float - - for default_dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: - if not is_dtype_supported(default_dtype): continue - dtypes.default_float = default_dtype - for dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: - if not is_dtype_supported(dtype): continue - if DEBUG >= 2: - print(f"testing {default_dtype=}, {dtype=}") - a = Tensor([1, 2, 3], dtype=dtype, requires_grad=True) - b = (a * 5).sum() - b.backward() # if there is dtype mismatch, lazy should assert - assert a.grad.dtype == a.dtype - np.testing.assert_allclose(a.grad.numpy(), [5, 5, 5]) - - dtypes.default_float = old_default_float - - @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") - def test_backward_sum_acc_dtype(self): - # test acc of sum in the backward is upcasted to float - t = Tensor([5, -5], dtype=dtypes.half, requires_grad=True) - t.reshape(2, 1).expand(2, 10001).max().backward() - 
np.testing.assert_allclose(t.grad.numpy(), [1, 0]) - - @unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow") - @unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow") - @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size") - @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") - def test_mean_half_precision_underflow(self): - N = 10000 - x = 0.001 - t = Tensor([[x]], dtype=dtypes.half, requires_grad=True).expand(N, N).contiguous() - np.testing.assert_allclose(t.mean(axis=1).numpy(), np.array([x] * N, dtype=np.float16), rtol=1e-3) - - @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") - def test_mean_half_precision_overflow(self): - N = 256 - t = Tensor([60000] * N*N, dtype=dtypes.half, requires_grad=True).reshape(N, N) - np.testing.assert_allclose(t.mean().numpy(), 60000) - t.square().mean().backward() - np.testing.assert_allclose(t.grad.numpy().flatten(), [60000 * 2 / (N*N)] * N*N) - - @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Precision error") - @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") - def test_softmax_dtype(self): - data = [1, 2, 3] - t = Tensor(data, dtype=dtypes.half) - tt = torch.tensor(data, dtype=torch.half) - - out = t.softmax(0) - self.assertEqual(out.dtype, dtypes.half) - np.testing.assert_allclose(out.numpy(), tt.softmax(0).numpy(), rtol=1e-3) - out = t.softmax(0, dtype=dtypes.float) - self.assertEqual(out.dtype, dtypes.float) - np.testing.assert_allclose(out.numpy(), tt.softmax(0, dtype=torch.float).numpy(), rtol=1e-3) - out = t.log_softmax(0) - self.assertEqual(out.dtype, dtypes.half) - np.testing.assert_allclose(out.numpy(), tt.log_softmax(0).numpy(), rtol=1e-3) - out = t.log_softmax(0, dtype=dtypes.float) - self.assertEqual(out.dtype, dtypes.float) - np.testing.assert_allclose(out.numpy(), tt.log_softmax(0, dtype=torch.float).numpy(), rtol=1e-3) - class TestImplicitFunctionTypeChange(unittest.TestCase): def 
test_functions(self): result = [] diff --git a/tinygrad_repo/test/test_dtype_alu.py b/tinygrad_repo/test/test_dtype_alu.py index ceae502..1b0fa94 100644 --- a/tinygrad_repo/test/test_dtype_alu.py +++ b/tinygrad_repo/test/test_dtype_alu.py @@ -157,6 +157,7 @@ class TestDTypeALU(unittest.TestCase): @given(ht.bool, ht.bool, strat.sampled_from(((operator.add, operator.add), (operator.mul, operator.mul)))) def test_bool(self, a, b, op): universal_test(a, b, dtypes.bool, op) + @unittest.skipIf(not CI and Device.DEFAULT == "METAL", "broken on local M3") @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations)) def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32) @@ -176,6 +177,7 @@ class TestDTypeALU(unittest.TestCase): @given(ht.int32, strat.sampled_from(dtypes_float+dtypes_int+dtypes_bool)) def test_int32_cast(self, a, dtype): universal_test_cast(a, dtypes.int32, dtype) + @settings(suppress_health_check=[HealthCheck.filter_too_much]) @given(strat.data(), strat.sampled_from(dtypes_float), strat.sampled_from((dtypes.uint8, dtypes.uint16))) def test_float_cast_to_unsigned(self, a, float_dtype, unsigned_dtype): if not is_dtype_supported(float_dtype, Device.DEFAULT): float_dtype = dtypes.float32 diff --git a/tinygrad_repo/test/test_gc.py b/tinygrad_repo/test/test_gc.py index e32053c..78827a0 100644 --- a/tinygrad_repo/test/test_gc.py +++ b/tinygrad_repo/test/test_gc.py @@ -73,7 +73,7 @@ class TestGC(unittest.TestCase): x = Tensor.ones(4,4).contiguous().realize()+1 self.assertEqual(bufs_allocated()-init, 1) # try commenting this part out, it's green! 
- x.lazydata.toposort() + x.uop.toposort() del x if bufs_allocated()-init != 0: print(inspect.getclosurevars(UOp.toposort().fget)) @@ -84,11 +84,11 @@ class TestGC(unittest.TestCase): a = Tensor.empty(10) self.assertEqual(bufs_allocated()-init, 0) a.realize() - real_buf = a.lazydata.buffer + real_buf = a.uop.buffer # after the Tensor UOp is deleted there shouldn't be any references on the Buffer self.assertEqual(real_buf.lb_refcount, 1) self.assertEqual(bufs_allocated()-init, 1) - del a.lazydata + del a.uop self.assertEqual(real_buf.lb_refcount, 0) self.assertEqual(bufs_allocated()-init, 1) # keep the buffer alive del real_buf @@ -98,10 +98,10 @@ class TestGC(unittest.TestCase): init = bufs_allocated() a = Tensor.full((4,), 1.).contiguous() a.realize() - real_buf = a.lazydata.buffer + real_buf = a.uop.buffer self.assertEqual(real_buf.lb_refcount, 1) a.assign(Tensor.full((4,), 2.)) - self.assertIs(a.lazydata.src[0].buffer, real_buf) + self.assertIs(a.uop.src[0].buffer, real_buf) # NOTE: this is still 1, we don't count the ASSIGN self.assertEqual(real_buf.lb_refcount, 1) a.realize() diff --git a/tinygrad_repo/test/test_graph.py b/tinygrad_repo/test/test_graph.py index a47756a..19dd6f9 100644 --- a/tinygrad_repo/test/test_graph.py +++ b/tinygrad_repo/test/test_graph.py @@ -36,7 +36,7 @@ def helper_alloc_rawbuffer(device, fill=False): if fill: with Context(DEBUG=0): data = np.random.randint(-10000, 10000, size=rawbuf.size, dtype=_to_np_dtype(rawbuf.dtype)) - rawbuf.copyin(Tensor(data).realize().lazydata.base.realized.as_buffer()) + rawbuf.copyin(Tensor(data).realize().uop.base.realized.as_buffer()) return rawbuf def helper_create_offset_rawbuffer(base, offset=0): diff --git a/tinygrad_repo/test/test_hcq.py b/tinygrad_repo/test/test_hcq.py index b2ae098..884c5fc 100644 --- a/tinygrad_repo/test/test_hcq.py +++ b/tinygrad_repo/test/test_hcq.py @@ -20,15 +20,15 @@ class TestHCQ(unittest.TestCase): si = self.b.schedule()[-1] TestHCQ.runner = get_runner(TestHCQ.d0.device, 
si.ast) - TestHCQ.b.lazydata.buffer.allocate() + TestHCQ.b.uop.buffer.allocate() - TestHCQ.kernargs_ba_ptr = TestHCQ.runner._prg.fill_kernargs([TestHCQ.b.lazydata.buffer._buf, TestHCQ.a.lazydata.buffer._buf]) - TestHCQ.kernargs_ab_ptr = TestHCQ.runner._prg.fill_kernargs([TestHCQ.a.lazydata.buffer._buf, TestHCQ.b.lazydata.buffer._buf]) + TestHCQ.kernargs_ba_ptr = TestHCQ.runner._prg.fill_kernargs([TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf]) + TestHCQ.kernargs_ab_ptr = TestHCQ.runner._prg.fill_kernargs([TestHCQ.a.uop.buffer._buf, TestHCQ.b.uop.buffer._buf]) def setUp(self): TestHCQ.d0.synchronize() - TestHCQ.a.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) - TestHCQ.b.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0)))) + TestHCQ.a.uop.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) + TestHCQ.b.uop.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0)))) TestHCQ.d0.synchronize() # wait for copyins to complete # Test signals @@ -117,7 +117,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" def test_exec_2_kernels_100_times(self): @@ -133,7 +133,7 @@ class TestHCQ(unittest.TestCase): q.submit(TestHCQ.d0, {virt_val: TestHCQ.d0.timeline_value}) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.a.uop.buffer.as_buffer().cast("f")[0] assert val == 200.0, f"got val {val}" def test_exec_update(self): @@ -148,9 +148,9 @@ class TestHCQ(unittest.TestCase): TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[0] assert val == 1.0, f"got val {val}" - val = 
TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 0.0, f"got val {val}, should not be updated" def test_exec_update_fuzz(self): @@ -192,13 +192,13 @@ class TestHCQ(unittest.TestCase): if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue") TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \ - .copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) \ + .copy(TestHCQ.b.uop.buffer._buf.va_addr, TestHCQ.a.uop.buffer._buf.va_addr, 8) \ .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 1.0, f"got val {val}" def test_copy_long(self): @@ -252,12 +252,12 @@ class TestHCQ(unittest.TestCase): .copy(virt_dest_addr, virt_src_addr, 8) \ .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) - q.submit(TestHCQ.d0, {virt_src_addr: TestHCQ.a.lazydata.buffer._buf.va_addr, virt_dest_addr: TestHCQ.b.lazydata.buffer._buf.va_addr}) + q.submit(TestHCQ.d0, {virt_src_addr: TestHCQ.a.uop.buffer._buf.va_addr, virt_dest_addr: TestHCQ.b.uop.buffer._buf.va_addr}) TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value) TestHCQ.d0.timeline_value += 1 - val = TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1] + val = TestHCQ.b.uop.buffer.as_buffer().cast("f")[1] assert val == 1.0, f"got val {val}" def test_update_copy_long(self): diff --git a/tinygrad_repo/test/test_image_dtype.py b/tinygrad_repo/test/test_image_dtype.py index 41b32de..08d2c04 100644 --- a/tinygrad_repo/test/test_image_dtype.py +++ b/tinygrad_repo/test/test_image_dtype.py @@ -13,7 +13,7 @@ IMAGE_SUPPORTED_DEVICES = ("QCOM", "GPU") class TestImageCopy(unittest.TestCase): def 
test_image_copyout_1x1(self, img_type=dtypes.imagef): it = Tensor.arange(4).cast(img_type((1,1,4))).realize() - buf = it.lazydata.buffer + buf = it.uop.buffer out = buf.as_buffer() np.testing.assert_equal(out.cast(it.dtype.fmt).tolist(), np.arange(4)) @@ -27,18 +27,18 @@ class TestImageCopy(unittest.TestCase): def test_image_copyout_2x3(self): it = Tensor.arange(2*3*4).cast(dtypes.imagef((2,3,4))).realize() - buf = it.lazydata.buffer + buf = it.uop.buffer out = buf.as_buffer() np.testing.assert_equal(out.cast('f').tolist(), np.arange(2*3*4)) def test_image_roundtrip(self): sz = (4,2,4) it = Tensor.rand(prod(sz)).cast(dtypes.imagef(sz)).realize() - buf = it.lazydata.buffer + buf = it.uop.buffer out = buf.as_buffer() it2 = Tensor.rand(prod(sz)).cast(dtypes.imagef(sz)).realize() - buf2 = it2.lazydata.buffer + buf2 = it2.uop.buffer buf2.copyin(out) assert (it == it2).sum().item() == prod(sz) @@ -49,7 +49,7 @@ class TestImageDType(unittest.TestCase): data = Tensor.randn(9*27*4).realize() tst = data.numpy() it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize() - assert isinstance(it.lazydata.base.realized.dtype, ImageDType) + assert isinstance(it.uop.base.realized.dtype, ImageDType) np.testing.assert_equal(tst, it.numpy()) @unittest.expectedFailure # this isn't supported anymore, CAST to ImageDType stays ImageDType @@ -58,14 +58,14 @@ class TestImageDType(unittest.TestCase): tst = data.numpy() it = data.cast(dtypes.imagef((9,27,4))).realize() # the underlying UOp is identical - self.assertIs(it.lazydata.base.realized, data.lazydata.base.realized) + self.assertIs(it.uop.base.realized, data.uop.base.realized) np.testing.assert_equal(tst, it.numpy()) def test_image_and_back_wrong_shape(self): data = Tensor.randn(9*27*4).realize() tst = data.numpy() it = data.cast(dtypes.imagef((9,12,4))).realize() - assert not isinstance(it.lazydata.base.realized.dtype, ImageDType) + assert not isinstance(it.uop.base.realized.dtype, ImageDType) np.testing.assert_equal(tst, 
it.numpy()) def test_shrink_load_float(self): @@ -77,7 +77,7 @@ class TestImageDType(unittest.TestCase): # NOTE: contiguous is needed otherwise this folds it = Tensor.randn(4).cast(dtypes.imagef((1,1,4))).contiguous().realize() out = (it*2).realize() - assert isinstance(out.lazydata.base.realized.dtype, ImageDType) + assert isinstance(out.uop.base.realized.dtype, ImageDType) def test_sum(self): it = Tensor.rand(8).cast(dtypes.imagef((1,2,4))).realize() @@ -98,26 +98,26 @@ class TestImageDType(unittest.TestCase): def test_lru_alloc(self): data = Tensor.randn(9*27*4).realize() it = data.cast(dtypes.imagef((9,27,4))).realize() - b1 = it.lazydata.base.realized._buf + b1 = it.uop.base.realized._buf del it it = data.cast(dtypes.imagef((9,27,4))).realize() - assert it.lazydata.base.realized._buf == b1 + assert it.uop.base.realized._buf == b1 def test_no_lru_alloc(self): data = Tensor.randn(9*27*4).realize() it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize() - b1 = it.lazydata.base.realized._buf + b1 = it.uop.base.realized._buf del it it = data.cast(dtypes.imagef((10,27,4))).contiguous().realize() - assert it.lazydata.base.realized._buf != b1 + assert it.uop.base.realized._buf != b1 def test_no_lru_alloc_dtype(self): data = Tensor.randn(9*27*4).realize() it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize() - b1 = it.lazydata.base.realized._buf + b1 = it.uop.base.realized._buf del it it = data.cast(dtypes.imageh((9,27,4))).realize() - assert it.lazydata.base.realized._buf != b1 + assert it.uop.base.realized._buf != b1 # issue caused by: don't realize image to image casts. 
this is part of a larger problem #@unittest.expectedFailure @@ -137,8 +137,8 @@ class TestImageDType(unittest.TestCase): print(lst) assert not np.any(np.isnan(lst)) # NOTE: the w1 grad must realize to a seperate kernel - assert w1.grad.lazydata.is_realized, f"never realized {w1.grad}" - self.assertEqual(w1.grad.lazydata.base.buffer.dtype, dtypes.float32) + assert w1.grad.uop.is_realized, f"never realized {w1.grad}" + self.assertEqual(w1.grad.uop.base.buffer.dtype, dtypes.float32) self.assertEqual(len(sched), 10) @unittest.skipUnless(REAL_DEV in IMAGE_SUPPORTED_DEVICES, "Images not supported") diff --git a/tinygrad_repo/test/test_jit_cases.py b/tinygrad_repo/test/test_jit_cases.py new file mode 100644 index 0000000..dd474e3 --- /dev/null +++ b/tinygrad_repo/test/test_jit_cases.py @@ -0,0 +1,78 @@ +import unittest +from tinygrad import TinyJit, Tensor + +# The JIT functions as a "capturing" JIT. +# Whatever kernels ran in the JIT the second run through the function will be the kernels that will run from then on. +# Explicit inputs to the function are updated in the JIT graph to the new inputs. + +# JITs have four tensor types +# 1. Tensors that are explicit in the input, aka what's passed in. TODO: support lists/dicts/classes, anything get_state works on +# 2. Tensors that are explicit in the output, aka what's returned. TODO: same as above +# 3. Tensors that are implicit in the input as a closure. +# 4. Tensors that are implicit in the output because they were assigned to and realized. + +# explicit inputs and outputs are realized on their way in and out of the JIT +# there's a whole bunch of edge cases and weirdness here that needs to be tested and clarified. 
+ +class TestJitCases(unittest.TestCase): + def test_explicit(self): + # this function has an explicit input and an explicit output + @TinyJit + def f(x:Tensor): + ret:Tensor = x*2 + return ret + + for i in range(5): + out = f(Tensor([i])) + self.assertEqual(out.item(), i*2) + + def test_implicit_input(self): + # x is the implicit input (like a weight) + x = Tensor([0]) + + # this function has an implicit input and an explicit output + @TinyJit + def f(): + ret:Tensor = x*2 + return ret + + for i in range(5): + # NOTE: this must be realized here, otherwise the update doesn't happen + # if we were explicitly tracking the implicit input Tensors, we might not need this realize + x.assign(Tensor([i])).realize() + out = f() + self.assertEqual(out.item(), i*2) + + def test_implicit_output(self): + # out is the implicit output (it's assigned to) + out = Tensor([0]) + + # this function has an explicit input and an implicit output + @TinyJit + def f(x:Tensor): + # NOTE: this must be realized here + # if we were explicitly tracking the implicit output Tensors, we might not need this realize + out.assign(x*2).realize() + + for i in range(5): + f(Tensor([i])) + self.assertEqual(out.item(), i*2) + + def test_implicit_io(self): + # x is the implicit input (like a weight) + # out is the implicit output (it's assigned to) + x = Tensor([0]) + out = Tensor([0]) + + # this function has an implicit input and an implicit output + @TinyJit + def f(): + out.assign(x*2).realize() # NOTE: this must be realized here + + for i in range(5): + x.assign(Tensor([i])).realize() + f() + self.assertEqual(out.item(), i*2) + +if __name__ == '__main__': + unittest.main() diff --git a/tinygrad_repo/test/test_linearizer.py b/tinygrad_repo/test/test_linearizer.py index 8cc4dfb..6a9dfe2 100644 --- a/tinygrad_repo/test/test_linearizer.py +++ b/tinygrad_repo/test/test_linearizer.py @@ -75,7 +75,7 @@ class TestLinearizer(unittest.TestCase): lowered = [x[1] for x in lower_schedule(c.schedule())] for ei in 
lowered: ei.run() rawbufs = lowered[-1].bufs - assert len(rawbufs) == 3 and set(rawbufs[1:]) == {a.lazydata.base.realized, b.lazydata.base.realized} + assert len(rawbufs) == 3 and set(rawbufs[1:]) == {a.uop.base.realized, b.uop.base.realized} np_c = (np_a[:2] - np_a[2:]) - (np_b[:2] - np_b[2:]) np.testing.assert_allclose(np_c, c.numpy(), atol=1e-4, rtol=1e-4) @@ -90,10 +90,10 @@ class TestLinearizer(unittest.TestCase): def test_multioutput(self): dtype, st = dtypes.int, ShapeTracker.from_shape((8,)) g0, g1, g2, g3 = [UOp(Ops.DEFINE_GLOBAL, dtype.ptr(), arg=i) for i in range(4)] - a = UOp(Ops.LOAD, dtype, (g2, st.to_uop())) - b = UOp(Ops.LOAD, dtype, (g3, st.to_uop())) - out0 = UOp(Ops.STORE, dtypes.void, (g0, st.to_uop(), a + b)) - out1 = UOp(Ops.STORE, dtypes.void, (g1, st.to_uop(), a * b)) + a = UOp(Ops.LOAD, dtype, src=(g2.view(st),)) + b = UOp(Ops.LOAD, dtype, src=(g3.view(st),)) + out0 = UOp(Ops.STORE, dtypes.void, src=(g0.view(st), a + b)) + out1 = UOp(Ops.STORE, dtypes.void, src=(g1.view(st), a * b)) sink = UOp(Ops.SINK, src=(out0, out1)) a_t = Tensor.full(st.shape, 2).contiguous().realize() @@ -140,14 +140,14 @@ class TestLinearizer(unittest.TestCase): def test_multireduce(self): Tensor.manual_seed(0) x = Tensor.randn(32, dtype=dtypes.float).realize() - st_x = x.lazydata.st + st_x = x.uop.st g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, st_x.reshape((1, 32)).expand((32, 32)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((1, 32)).expand((32, 32))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (1,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, st_x.reshape((32, 1)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((32, 1))),)) diff = second_x + first_reduce*ast_const(dtypes.float, -1, (32, 1)) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (0,))) - store = UOp(Ops.STORE, 
dtypes.void, (g0, ShapeTracker.from_shape((1, 1)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, dtypes.void, (g0.view(ShapeTracker.from_shape((1, 1))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [ [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], # grouping @@ -172,14 +172,14 @@ class TestLinearizer(unittest.TestCase): def test_mid_dim_multireduce(self): Tensor.manual_seed(0) x = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() - st_x = x.lazydata.st + st_x = x.uop.st g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, st_x.reshape((27, 1, 32, 5)).expand((27, 32, 32, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((27, 1, 32, 5)).expand((27, 32, 32, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, st_x.reshape((27, 32, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((27, 32, 1, 5))),)) diff = second_x + first_reduce*ast_const(dtypes.float, -1, (27, 32, 1, 5)) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [ # locals @@ -232,15 +232,15 @@ class TestLinearizer(unittest.TestCase): x1 = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() x2 = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() g0, g1, g2, g3 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(4)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x0.lazydata.st.reshape((27, 1, 1, 32, 5)).expand((27, 32, 32, 32, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x0.uop.st.reshape((27, 1, 1, 32, 5)).expand((27, 32, 32, 32, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, 
dtypes.float, (first_x,), (Ops.ADD, (3,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g2, x1.lazydata.st.reshape((27, 1, 32, 1, 5)).expand((27, 32, 32, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x1.uop.st.reshape((27, 1, 32, 1, 5)).expand((27, 32, 32, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 32, 32, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (2,))) - third_x = UOp(Ops.LOAD, dtypes.float, (g3, x2.lazydata.st.reshape((27, 32, 1, 1, 5)).to_uop())) + third_x = UOp(Ops.LOAD, dtypes.float, (g3.view(x2.uop.st.reshape((27, 32, 1, 1, 5))),)) mul = (third_x*second_reduce) third_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (mul,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 1, 5)).to_uop(), third_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 1, 5))), third_reduce)) sink = UOp(Ops.SINK, src=(store,)) wanna_output = (x2.numpy()*(x1.numpy()-x0.numpy().sum(axis=1, keepdims=True)).sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,1,5) lins = helper_linearizer_ast(sink, [x0,x1,x2], wanna_output=[wanna_output]) @@ -253,7 +253,7 @@ class TestLinearizer(unittest.TestCase): def test_double_reduce_multireduce(self): Tensor.manual_seed(0) x = Tensor.randn(8, 32, 8, 16, dtype=dtypes.float).realize() - st = x.lazydata.st + st = x.uop.st g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] first_x = UOp(Ops.LOAD, dtypes.float, (g1, st.reshape((8, 1, 32, 8, 1, 16)).expand((8, 32, 32, 8, 16, 16)).to_uop())) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2, 5))) @@ -302,12 +302,12 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 15, 5, dtype=dtypes.float).softmax(1).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 1, 15, 
5)).expand((27, 15, 15, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 15, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((27, 15, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [ [Opt(OptOps.GROUPTOP, 0, 3)], # grouping @@ -329,14 +329,14 @@ class TestLinearizer(unittest.TestCase): x = Tensor.randn(4, 32, dtype=dtypes.float).realize() x_p = Tensor.randn(4, 32, dtype=dtypes.float).realize() g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((4, 1, 32)).expand((4, 32, 32)).to_uop())) - first_x_p = UOp(Ops.LOAD, dtypes.float, (g2, x_p.lazydata.st.reshape((4, 1, 32)).expand((4, 32, 32)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((4, 1, 32)).expand((4, 32, 32))),)) + first_x_p = UOp(Ops.LOAD, dtypes.float, (g2.view(x_p.uop.st.reshape((4, 1, 32)).expand((4, 32, 32))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) first_reduce_p = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x_p.alu(Ops.EXP2),), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((4, 32, 1)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((4, 32, 1))),)) diff = (second_x+(first_reduce + first_reduce_p)*ast_const(dtypes.float, -1, (4, 32, 1))) second_reduce = 
UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((4, 1, 1)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((4, 1, 1))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [ # [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], # grouping @@ -361,14 +361,14 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 15, 5, dtype=dtypes.float).realize() g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((27, 15, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x.uop.st.reshape((27, 15, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store0 = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) + store0 = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) second_out = second_reduce * ast_const(dtypes.float, 1/15, (27, 1, 1, 5)) - store1 = UOp(Ops.STORE, src=(g1, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_out)) + store1 = UOp(Ops.STORE, src=(g1.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_out)) sink = UOp(Ops.SINK, src=(store0, store1)) wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) @@ -383,13 +383,13 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 15, 5, dtype=dtypes.float).realize() g0, g1, g2 = 
[UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g2.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((27, 15, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g2.view(x.uop.st.reshape((27, 15, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store0 = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) - store1 = UOp(Ops.STORE, src=(g1, ShapeTracker(views=(View(shape=(27,15,1,5), strides=(5,0,1,1), offset=0, mask=None, contiguous=False),)).to_uop(), first_reduce)) # noqa: E501 + store0 = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) + store1 = UOp(Ops.STORE, src=(g1.view(ShapeTracker(views=(View(shape=(27,15,1,5), strides=(5,0,1,1), offset=0, mask=None, contiguous=False),))), first_reduce)) # noqa: E501 wanna_output0 = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) wanna_output1 = x.numpy().sum(axis=1).reshape(27,1,1,5) @@ -402,12 +402,12 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 3, 5, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, 
x.lazydata.st.reshape((27, 3, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 3, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 3, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [[Opt(OptOps.UNROLL, 0, 3), Opt(OptOps.UNROLL, 0, 3)]] wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) @@ -418,12 +418,12 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 3, 5, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 3, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 3, 1, 5))),)) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 3, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) sink = UOp(Ops.SINK, src=(store,)) opts = [[Opt(OptOps.UPCAST, 0, 3)]] wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) @@ -437,9 +437,9 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(27, 
12, 5, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 1, 12, 5)).expand((27, 12, 12, 5)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.uop.st.reshape((27, 1, 12, 5)).expand((27, 12, 12, 5)).to_uop())) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((27, 12, 1, 5)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.uop.st.reshape((27, 12, 1, 5)).to_uop())) diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 12, 1, 5))) second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) @@ -453,15 +453,15 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) neg_mean = first_reduce * ast_const(dtypes.float, -1/35, (15, 25, 35, 1)) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((15, 25, 35, 1)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 35, 1))),)) squares = (second_x+neg_mean)*(second_x+neg_mean) squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) variance = squares_sum * ast_const(dtypes.float, 1/35, (15, 25, 1, 1)) std = variance.alu(Ops.SQRT) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((15, 25, 1, 1)).to_uop(), std)) + store = 
UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((15, 25, 1, 1))), std)) sink = UOp(Ops.SINK, src=(store,)) wanna_output = x.numpy().std(axis=2, ddof=0).reshape((15,25,1,1)) helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) @@ -471,15 +471,15 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((15, 1, 25, 35)).expand((15, 25, 25, 35)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 1, 25, 35)).expand((15, 25, 25, 35))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) neg_mean = first_reduce * ast_const(dtypes.float, -0.04, (15, 25, 1, 35)) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((15, 25, 1, 35)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 1, 35))),)) squares = (second_x+neg_mean)*(second_x+neg_mean) squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (1,))) variance = squares_sum * ast_const(dtypes.float, 0.04, (15, 1, 1, 35)) std = variance.alu(Ops.SQRT) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((15, 1, 1, 35)).to_uop(), std)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((15, 1, 1, 35))), std)) sink = UOp(Ops.SINK, src=(store,)) wanna_output = x.numpy().std(axis=1, ddof=0).reshape((15,1,1,35)) helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) @@ -491,10 +491,10 @@ class TestLinearizer(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, (g2, 
x.uop.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35)).to_uop())) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) neg_mean = first_reduce * ast_const(dtypes.float, -1/35, (15, 25, 35, 1)) - second_x = UOp(Ops.LOAD, dtypes.float, (g2, x.lazydata.st.reshape((15, 25, 35, 1)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, (g2, x.uop.st.reshape((15, 25, 35, 1)).to_uop())) squares = (second_x+neg_mean)*(second_x+neg_mean) squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) variance = squares_sum * ast_const(dtypes.float, 1/35, (15, 25, 1, 1)) @@ -514,16 +514,16 @@ class TestLinearizer(unittest.TestCase): x = Tensor.randn(3, 27, 32, dtype=dtypes.float).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] # push reduce (3, 27, 32) -> (3, 27, 1) -> (3, 27, 32) expand to LOAD - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((3, 27, 1, 32)).expand((3, 27, 32, 32)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((3, 27, 1, 32)).expand((3, 27, 32, 32))),)) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) neg_mean = first_reduce * ast_const(dtypes.float, -0.03125, (3, 27, 32, 1)) # store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((3, 27, 32, 1)).to_uop(), mean)) # verify_lazyop(store) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((3, 27, 32, 1)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((3, 27, 32, 1))),)) squares = (second_x+neg_mean)*(second_x+neg_mean) squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) variance = squares_sum * ast_const(dtypes.float, 0.03125, (3, 27, 1, 1)) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((3, 27, 1, 1)).to_uop(), variance)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((3, 27, 1, 1))), variance)) sink = UOp(Ops.SINK, src=(store,)) wanna_output 
= x.numpy().var(axis=2, ddof=0).reshape((3,27,1,1)) helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) @@ -535,63 +535,25 @@ class TestLinearizer(unittest.TestCase): def test_softmax_multireduce(self): x = Tensor.rand(4, 32).realize() g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((4, 1, 32,)).expand((4, 32, 32)).to_uop())) + first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((4, 1, 32,)).expand((4, 32, 32))),)) max_x = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.MAX, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((4, 32, 1,)).to_uop())) + second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((4, 32, 1,))),)) centered_x = second_x+max_x*ast_const(dtypes.float, -1, (4, 32, 1)) exp_x = centered_x.alu(Ops.EXP2) sum_exp_x = UOp(Ops.REDUCE_AXIS, dtypes.float, (exp_x,), (Ops.ADD, (1,))) # y = exp_x * sum_exp_x.alu(Ops.RECIP) # kernels cannot do a return to full shape recip_sum_exp_x = sum_exp_x.alu(Ops.RECIP) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((4,1,1)).to_uop(), recip_sum_exp_x)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((4,1,1))), recip_sum_exp_x)) sink = UOp(Ops.SINK, src=(store,)) expected = 1/np.exp2(x.numpy() - x.numpy().max(axis=-1, keepdims=True)).sum(axis=-1, keepdims=True).reshape(4,1,1) helper_linearizer_ast(sink, [x], wanna_output=[expected]) - # *** buildup to fused indexing - @unittest.skipIf(CI, "very slow because of recomputing") - def test_arange_expanded(self): - # Tensor.arange(16384) expanded such that output shape is (4, 16384, 256, 1) - # basically it's pushing the expand through this reduce: - tiny = Tensor.arange(16384).reshape(16384, 1).expand(4, 16384, 256).reshape(4, 16384, 256, 1) - real_arange = np.broadcast_to(np.arange(16384).reshape(16384, 1), (4, 16384, 256)).reshape(4, 16384, 256, 1) - # NOTE: this is stupidly recomputing 
because it's not fused, but it proves a point. - arange_input_st = ShapeTracker(views=(View(shape=(16385, 32767), strides=(0, 0), offset=0, mask=((0, 16385), (16383, 32767)), contiguous=False), \ - View(shape=(16384, 16384), strides=(1, 32768), offset=0, mask=None, contiguous=False))) - arange_input_st = arange_input_st.reshape((1, 16384, 1, 16384)).expand((4, 16384, 256, 16384)) - arange_axis = (3,) - arange = UOp(Ops.REDUCE_AXIS, dtypes.int, (ast_const(dtypes.int, 1, st=arange_input_st),), (Ops.ADD, arange_axis)) - output_shape = tuple(1 if i in arange_axis else s for i,s in enumerate(arange_input_st.shape)) - out = arange+ast_const(dtypes.int, -1, output_shape) - store = UOp(Ops.STORE, src=(UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0), ShapeTracker.from_shape(output_shape).to_uop(), out)) - sink = UOp(Ops.SINK, src=(store,)) - helper_linearizer_ast(sink, [], wanna_output=[real_arange]) - with Context(DEBUG=0, NOOPT=0): np.testing.assert_equal(tiny.numpy(), real_arange) - @unittest.skipIf(CI and Device.DEFAULT in {"PTX", "AMD", "NV"}, "very slow") def test_indexing_multireduce(self): - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - g2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2) - arange_input_st = ShapeTracker(views=(View(shape=(16385, 32767), strides=(0, 0), offset=0, mask=((0, 16385), (16383, 32767)), contiguous=False), \ - View(shape=(16384, 16384), strides=(1, 32768), offset=0, mask=None, contiguous=False))) - # TODO: do this arange broadcast in the scheduler - arange_input_st = arange_input_st.reshape((1, 16384, 1, 16384)).expand((4, 16384, 256, 16384)) - arange_axis = (3,) - arange = UOp(Ops.REDUCE_AXIS, dtypes.int, (ast_const(dtypes.int, 1, st=arange_input_st),), (Ops.ADD, arange_axis)) - arange_out_shape = tuple(1 if i in arange_axis else s for i,s in enumerate(arange_input_st.shape)) - arange = arange+ast_const(dtypes.int, -1, arange_out_shape) - # p2: the indexing dataset = Tensor.rand(16384, 256).realize() - 
data1 = (g1, ShapeTracker.from_shape(dataset.shape).reshape((1, 16384, 256, 1)).expand(arange_out_shape).to_uop()) idxs = Tensor([0,3,5,6]).realize() - data2 = (g2, ShapeTracker.from_shape((4,)+(1,)*(len(arange_out_shape)-1)).expand(arange_out_shape).to_uop()) - arange_eq = arange.alu(Ops.CMPNE, UOp(Ops.LOAD, dtypes.int, data2)).alu(Ops.CMPNE, ast_const(dtypes.bool, True, arange_out_shape)) - reduce_input = UOp(Ops.LOAD, dataset.dtype, data1)*UOp(Ops.CAST, dataset.dtype.scalar(), src=(arange_eq,)) - out_axis = (1,) - out = UOp(Ops.REDUCE_AXIS, reduce_input.dtype, (reduce_input,), (Ops.ADD, out_axis)) - output_shape = tuple(1 if i in out_axis else s for i,s in enumerate(arange_out_shape)) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape(output_shape).to_uop(), out)) - sink = UOp(Ops.SINK, src=(store,)) + with Context(FUSE_ARANGE=1): + sink = dataset[idxs].contiguous().kernelize().uop.base.src[1].arg.ast real_index = dataset.numpy()[idxs.numpy()].reshape(4, 1, 256, 1) helper_linearizer_ast(sink, [dataset, idxs], wanna_output=[real_index]) @@ -602,62 +564,82 @@ class TestLinearizer(unittest.TestCase): real_argmax = np.argmax(t.numpy(), axis=0, keepdims=False).reshape(1, 20, 1) ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), # noqa E501 + UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(-1), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.ADD, dtypes.int, arg=None, src=( - ast_const(dtypes.int, st=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), val=10), + UOp(Ops.CONST, dtypes.int, arg=10, 
src=( + x6:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 UOp(Ops.MUL, dtypes.int, arg=None, src=( - ast_const(dtypes.int, -1, (1, 20, 1)), + x8:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x6,)), UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.MAX, (0,)), src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CAST, dtypes.int, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(20, 1, 0), offset=0, mask=None, contiguous=True),)), src=()),)), # noqa E501 + UOp(Ops.VIEW, dtypes.float.ptr(200), arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(20, 1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), # noqa E501 - ast_const(dtypes.bool, True, st=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),))),)),)), # noqa E501 + UOp(Ops.VIEW, dtypes.float.ptr(20), arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), # noqa: E501 UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, 
dtypes.int, arg=(Ops.ADD, (2,)), src=( - ast_const(dtypes.int, -1, st=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 20, 10), strides=(1, 0, 20), offset=0, mask=None, contiguous=False)))),)), # noqa E501 - ast_const(dtypes.int, 10, (10, 20, 1)))),)),)),)),)), - ast_const(dtypes.int, -1, (1, 20, 1)),)),)),)) + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 20, 10), strides=(1, 0, 20), offset=0, mask=None, contiguous=False))), src=()),)), # noqa: E501 + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x28:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 10), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x28,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=10, src=( + x21,)),)),)),)),)),)), + x8,)),)),)) helper_linearizer_ast(ast, [t, t_max], wanna_output=[real_argmax]) def test_argmax_multireduce_flat(self): t = Tensor.randn(10, 20).realize() t_max = t.max().realize() real_argmax = np.argmax(t.numpy()) - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=()), # noqa: E501 + UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(-1), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.ADD, dtypes.int, arg=None, src=( - ast_const(dtypes.int, 200, (1, 1)), + UOp(Ops.CONST, 
dtypes.int, arg=200, src=( + x6:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=()),)), # noqa: E501 UOp(Ops.MUL, dtypes.int, arg=None, src=( - ast_const(dtypes.int, -1, (1, 1)), + x8:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x6,)), UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.MAX, (0,)), src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CAST, dtypes.int, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(200), arg=ShapeTracker(views=(View(shape=(200, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), # noqa: E501 - ast_const(dtypes.bool, True, (200, 1)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(200, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), # noqa: E501 UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, -1, st=ShapeTracker(views=(View(shape=(201, 399), strides=(0, 0), 
offset=0, mask=((0, 201), (199, 399)), contiguous=False), View(shape=(200, 200), strides=(1, 400), offset=0, mask=None, contiguous=False)))),)), # noqa: E501 - ast_const(dtypes.int, 200, (200, 1)),)),)),)),)),)), - ast_const(dtypes.int, -1, (1, 1)),)),)),)) + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(201, 399), strides=(0, 0), offset=0, mask=((0, 201), (199, 399)), contiguous=False), View(shape=(200, 200), strides=(1, 400), offset=0, mask=None, contiguous=False))), src=()),)), # noqa: E501 + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x28:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 200), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x28,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=200, src=( + x21,)),)),)),)),)),)), + x8,)),)),)) helper_linearizer_ast(ast, [t, t_max], wanna_output=[real_argmax]) @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") @@ -674,19 +656,19 @@ class TestLinearizer(unittest.TestCase): ] g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - x_ld0 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((1, N, N)).expand((N,N,N)).to_uop())) - x_ld1 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, 1, N)).to_uop())) + x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((1, N, N)).expand((N,N,N))),)) + x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N))),)) r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.ADD, (1,))) r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, 1, N)),),(Ops.ADD, (0,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((1,1,N)).to_uop(), r1)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,1,N))), r1)) sink = UOp(Ops.SINK, 
src=(store,)) helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().sum(axis=0, keepdims=True)).sum(axis=0).reshape(1,1,N)], opts=opts) - x_ld0 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, 1, N)).expand((N,N,N)).to_uop())) - x_ld1 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, N, 1)).to_uop())) + x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N)).expand((N,N,N))),)) + x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, N, 1))),)) r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.ADD, (2,))) r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, N, 1)),), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((N,1,1)).to_uop(), r1)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((N,1,1))), r1)) sink = UOp(Ops.SINK, src=(store,)) helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(N,1,1)], opts=opts) @@ -701,19 +683,19 @@ class TestLinearizer(unittest.TestCase): ] g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - x_ld0 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((1, N, N)).expand((N,N,N)).to_uop())) - x_ld1 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, 1, N)).to_uop())) + x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((1, N, N)).expand((N,N,N))),)) + x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N))),)) r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.MAX, (1,))) r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, 1, N)),), (Ops.MAX, (0,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((1,1,N)).to_uop(), r1)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,1,N))), r1)) sink = UOp(Ops.SINK, src=(store,)) helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().max(axis=0, 
keepdims=True)).max(axis=0).reshape(1,1,N)], opts=opts) - x_ld0 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, 1, N)).expand((N,N,N)).to_uop())) - x_ld1 = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((N, N, 1)).to_uop())) + x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N)).expand((N,N,N))),)) + x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, N, 1))),)) r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.MAX, (2,))) r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, N, 1)),), (Ops.MAX, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((N,1,1)).to_uop(), r1)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((N,1,1))), r1)) sink = UOp(Ops.SINK, src=(store,)) helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().max(axis=1, keepdims=True)).max(axis=1).reshape(N,1,1)], opts=opts) @@ -728,104 +710,100 @@ class TestLinearizer(unittest.TestCase): b = Tensor.rand(1, 1).realize() opts = [[Opt(OptOps.PADTO, 0, 32)],[Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],] - # TODO: these large ASTs are suboptimal but we need this until the scheduler can fuse these wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(axis=1,keepdims=True), a.numpy(), b.numpy())).sum(axis=1),0.0,1.0).reshape((N,1,1)) # noqa: E501 - ld0 = x.lazydata.st.reshape((N, 1, N)).expand((N,N,N)) - ld1 = x.lazydata.st.reshape((N, N, 1)) + ld0 = x.uop.st.reshape((N, 1, N)).expand((N,N,N)) + ld1 = x.uop.st.reshape((N, N, 1)) ast = UOp(Ops.SINK, src=( UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),))), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, 
dtypes.float.ptr(), arg=0),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.5*N, (N, 1, 1)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - ld1.to_uop(),)), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld1, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.75*N, (N, N, 1)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - ld0.to_uop(),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld0, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),))),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),))),)),)),)),)),)), # noqa: E501 - + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3),)),)),)),)),)),)), ast_const(dtypes.float, 0.0, (N, 1, 1)), ast_const(dtypes.float, 1.0, (N, 1, 1)),)),)),)) helper_linearizer_ast(ast, [x,a,b], opts=opts, wanna_output=[wanna_output]) 
- ld0 = x.lazydata.st.reshape((1, N, N)).expand((N,N,N)) - ld1 = x.lazydata.st.reshape((N, 1, N)) + ld0 = x.uop.st.reshape((1, N, N)).expand((N,N,N)) + ld1 = x.uop.st.reshape((N, 1, N)) wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(axis=0,keepdims=True), a.numpy(), b.numpy())).sum(axis=0),0.0,1.0).reshape(1,1,N) # noqa: E501 ast = UOp(Ops.SINK, src=( UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, N), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(1, 1, N), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.5*N, (1, 1, N)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - ld1.to_uop(),)), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld1, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()),)),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.75*N, (N, 1, N)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - ld0.to_uop(),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld0, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 + UOp(Ops.VIEW, 
dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), # noqa: E501 - + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()),)),)),)),)),)),)), ast_const(dtypes.float, 0.0, (1, 1, N)), ast_const(dtypes.float, 1.0, (1, 1, N)),)),)),)) helper_linearizer_ast(ast, [x,a,b], opts=opts, wanna_output=[wanna_output]) - # pad reduce axis helper_linearizer_ast(ast, [x,a,b], opts=[[Opt(OptOps.PADTO, 1, 32)],], wanna_output=[wanna_output]) - ld0 = x.lazydata.st.reshape((1,1,N,N)).expand((N,N,N,N)) - ld1 = x.lazydata.st.reshape((N,N,1,1)) + ld0 = x.uop.st.reshape((1,1,N,N)).expand((N,N,N,N)) + ld1 = x.uop.st.reshape((N,N,1,1)) wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(keepdims=True), a.numpy(), b.numpy())).sum(keepdims=True),0.0,1.0).reshape((1,1,1,1))# noqa: E501 ast = UOp(Ops.SINK, src=( UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=True),))), + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.5*N, (1, 1, 1, 1)), UOp(Ops.REDUCE_AXIS, 
dtypes.float, arg=(Ops.ADD, (0, 1)), src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(N, 1, 0, 0), offset=0, mask=None, contiguous=True),))),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(N, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( ast_const(dtypes.float, 0.75*N, (N, N, 1, 1)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 3)), src=( UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, N, N), strides=(0, 0, N, 1), offset=0, mask=None, contiguous=False),))),)),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, N, N), strides=(0, 0, N, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),))),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2),)),)), UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),))),)),)),)),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(), 
arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3),)),)),)),)),)),)), ast_const(dtypes.float, 0.0, (1, 1, 1, 1)), ast_const(dtypes.float, 1.0, (1, 1, 1, 1)),)),)),)) helper_linearizer_ast(ast, [x,a,b], opts=[[Opt(OptOps.PADTO, 0, 32)],], wanna_output=[wanna_output]) @@ -834,9 +812,9 @@ class TestLinearizer(unittest.TestCase): @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") def test_end_local(self): g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=i) for i in range(2)] - load = UOp(Ops.LOAD, dtypes.int, (g1, ShapeTracker.from_shape((32,)).to_uop())) + load = UOp(Ops.LOAD, dtypes.int, (g1.view(ShapeTracker.from_shape((32,))),)) reduce = UOp(Ops.REDUCE_AXIS, dtypes.int, (load,), (Ops.ADD, (0,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((1,)).to_uop(), reduce)) + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,))), reduce)) sink = UOp(Ops.SINK, src=(store,)) load_t = Tensor.full(load.st_arg.shape, 1).contiguous().realize() k = helper_linearizer_ast(sink, [load_t], wanna_output=[load_t.numpy().sum()])[1] @@ -941,11 +919,11 @@ class TestLinearizer(unittest.TestCase): g0, g1 = [UOp(Ops.DEFINE_GLOBAL, DT.ptr(), arg=i) for i in range(2)] # data1[0] + VAL - a = UOp(Ops.LOAD, DT, (g1, ST)) + VAL + a = UOp(Ops.LOAD, DT, (g1.view(ST.arg),)) + VAL # (literal const 1) + VAL b = ast_const(DT, 1, ST.arg.shape) + VAL - store = UOp(Ops.STORE, src=(g0, ST, (a+b))) + store = UOp(Ops.STORE, src=(g0.view(ST.arg), (a+b))) sink = UOp(Ops.SINK, src=(store,)) lin = Kernel(sink) lin.linearize() @@ -1425,13 +1403,13 @@ class TestLinearizer(unittest.TestCase): @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4") def test_skip_unmatching_upcasts(self): Tensor.manual_seed(0) - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - 
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),))), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),))),)),)),)) # noqa: E501 + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=0, src=()),)), + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=1, src=()),)),)),)),)) opt = [ Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2) @@ -1444,13 +1422,13 @@ class TestLinearizer(unittest.TestCase): @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4") def test_skip_unmatching_upcasts_with_gep(self): Tensor.manual_seed(0) - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),))), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),))),)),)),)) # noqa: E501 + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + 
UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=0, src=()),)), + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=1, src=()),)),)),)),)) opt = [Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=2), Opt(op=OptOps.LOCAL, axis=1, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)] @@ -1657,19 +1635,19 @@ class TestFloat4(unittest.TestCase): def test_half4_load_unrolled(self): # from llama 7B shard 4 gpus - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1), strides=(0, 32000, 1, 0), offset=0, mask=None, contiguous=True),))), # noqa: E501 + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(96000), arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1), strides=(0, 32000, 1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96000), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.CAST, dtypes.float, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1024), strides=(0, 4096, 0, 1), offset=0, 
mask=None, contiguous=False),))),)), # noqa: E501 - UOp(Ops.LOAD, dtypes.half, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1024), strides=(0, 0, 1024, 1), offset=0, mask=None, contiguous=False),))),)),)),)),)),)),)) # noqa: E501 + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(9216), arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1024), strides=(0, 4096, 0, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(9216), arg=1, src=()),)),)), + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(32768000), arg=ShapeTracker(views=(View(shape=(1, 3, 32000, 1024), strides=(0, 0, 1024, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(32768000), arg=2, src=()),)),)),)),)),)),)),)) # TODO: fix this, expected might change but should be positive for expected, opts in [ @@ -1686,22 +1664,22 @@ class TestFloat4(unittest.TestCase): @unittest.skip("this doesn't happen anymore") def test_float4_acc(self): # from float32 stable diffusion red tinybox - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 262144, 512, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),))), # noqa: E501 + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(33554432), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 262144, 512, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( UOp(Ops.MUL, dtypes.float, 
arg=None, src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 256, 4, 514, 4, 514), strides=(0, 0, 0, 262144, 0, 512, 0, 1), offset=-513, mask=((0, 1), (0, 1), (0, 1), (0, 256), (0, 4), (1, 513), (0, 4), (1, 513)), contiguous=False), View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 0, 2056, 1, 4227136, 1058840, 515), offset=0, mask=None, contiguous=False)))),)), # noqa: E501 - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),))),)),)),)), # noqa: E501 - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),))),)),)),)),)) # noqa: E501 + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(67108864), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 256, 4, 514, 4, 514), strides=(0, 0, 0, 262144, 0, 512, 0, 1), offset=-513, mask=((0, 1), (0, 1), (0, 1), (0, 256), (0, 4), (1, 513), (0, 4), (1, 513)), contiguous=False), View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 0, 2056, 1, 4227136, 1058840, 515), offset=0, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(67108864), arg=1, src=()),)),)), + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=2, src=()),)),)),)),)), + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, 
dtypes.float.ptr(128), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(128), arg=3, src=()),)),)),)),)),)) for expected, opts in [ (1, [Opt(op=OptOps.UPCAST, axis=2, arg=4)]), @@ -1716,16 +1694,16 @@ class TestFloat4(unittest.TestCase): @unittest.skip("this doesn't happen anymore") def test_float2_acc(self): # from resnet - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 1, 114, 1, 114), strides=(0, 831744, 0, 12996, 0, 114, 0, 1), offset=0, mask=None, contiguous=True),))), # noqa: E501 - UOp(Ops.CAST, dtypes.half, src=( + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(212926464), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 1, 114, 1, 114), strides=(0, 831744, 0, 12996, 0, 114, 0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(212926464), arg=0, src=()),)), + UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4, 6)), src=( - UOp(Ops.CAST, dtypes.float, src=( - UOp(Ops.LOAD, dtypes.half, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(256, 64, 3, 56, 2, 3, 56, 2), strides=(1806336, 28224, 3, 504, 0, 1, 9, 0), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 56), (0, 1), (0, 3), (0, 56), (0, 1)), contiguous=False), View(shape=(256, 64, 3, 115, 3, 115), strides=(7225344, 112896, 37632, 336, 112, 1), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 112), (0, 3), (0, 112)), contiguous=False), View(shape=(256, 64, 456, 456), strides=(7617600, 119025, 345, 1), offset=0, mask=((0, 256), (0, 64), (0, 345), (0, 345)), contiguous=False), View(shape=(1, 256, 1, 
64, 4, 114, 4, 114), strides=(0, 13307904, 0, 207936, 51984, 456, 114, 1), offset=0, mask=None, contiguous=True)))),)),)),)),)),)),)) # noqa: E501 + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(462422016), arg=ShapeTracker(views=(View(shape=(256, 64, 3, 56, 2, 3, 56, 2), strides=(1806336, 28224, 3, 504, 0, 1, 9, 0), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 56), (0, 1), (0, 3), (0, 56), (0, 1)), contiguous=False), View(shape=(256, 64, 3, 115, 3, 115), strides=(7225344, 112896, 37632, 336, 112, 1), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 112), (0, 3), (0, 112)), contiguous=False), View(shape=(256, 64, 456, 456), strides=(7617600, 119025, 345, 1), offset=0, mask=((0, 256), (0, 64), (0, 345), (0, 345)), contiguous=False), View(shape=(1, 256, 1, 64, 4, 114, 4, 114), strides=(0, 13307904, 0, 207936, 51984, 456, 114, 1), offset=0, mask=None, contiguous=True))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(462422016), arg=1, src=()),)),)),)),)),)),)),)) for expected, opts in [ (16, [Opt(op=OptOps.LOCAL, axis=1, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=2), Opt(op=OptOps.LOCAL, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=3, arg=4)]), # noqa: E501 (4, [Opt(op=OptOps.LOCAL, axis=1, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=2)]), @@ -1816,8 +1794,8 @@ class TestHandCodedOpts(unittest.TestCase): def helper_linearizer_ast(ast:UOp, inputs:list[Tensor], *args, **kwargs): assert isinstance(ast, UOp), "ast must be UOp" - inbufs = [x.lazydata.base.buffer for x in inputs] - outbufs = [Buffer(inbufs[-1].device if inbufs else Device.DEFAULT, out.st_arg.size, out.src[2].dtype).allocate() \ + inbufs = [x.uop.base.buffer for x in inputs] + outbufs = [Buffer(inbufs[-1].device if inbufs else Device.DEFAULT, out.st_arg.size, out.src[1].dtype).allocate() \ for out in ast.src] return _helper_linearizer_opt_ast(ast, 
outbufs+inbufs, *args, **kwargs) @@ -1838,7 +1816,7 @@ def reset_bufs(bufs:list[Buffer]): def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[], apply_tc=False, atol=1e-4, rtol=1e-4, color_sizes=[], wanna_output=[]) -> list[Kernel]: lins: list[Kernel] = [] - outbufs = [real_bufs[x.src[0].arg] for x in realized_ast.src] + outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src] device = real_bufs[0].device def get_prg(k:Kernel): return CompiledRunner(replace(k.to_program(), device=device)) @@ -2005,23 +1983,23 @@ class TestKernelOpts(unittest.TestCase): @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") def test_buf_index_not_found_tensor_core(self): - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1, 256), strides=(0, 1), offset=0, mask=None, contiguous=True),))), + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(1, 256), strides=(0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=1), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(0, 1), offset=0, mask=None, contiguous=False),))),)), # noqa: E501 - UOp(Ops.LOAD, dtypes.int, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))),)),)),)), # noqa: E501 - 
UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))),)),)),)),)),)) # noqa: E501 + UOp(Ops.LOAD, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.int.ptr(256), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(256), arg=1, src=()),)),)), + UOp(Ops.LOAD, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.int.ptr(1243), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1243), arg=2, src=()),)),)),)),)), + UOp(Ops.LOAD, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(1243), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1243), arg=3, src=()),)),)),)),)),)),)) k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) with self.assertRaises(KernelOptError): k.apply_opt(Opt(OptOps.TC, 0, (-1, 1, 1))) @@ -2199,9 +2177,9 @@ class TestKernelOpts(unittest.TestCase): def test_padto_group(self): Tensor.manual_seed(0) g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - ld0 = UOp(Ops.LOAD, dtypes.float, (g1, ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)).to_uop())) # noqa: E501 - ld1 = UOp(Ops.LOAD, dtypes.float, (g2, ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)).to_uop())) # noqa: E501 - store = UOp(Ops.STORE, src=(g0, ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, 
mask=None, contiguous=True),)).to_uop(), UOp(Ops.REDUCE_AXIS, dtypes.float, (ld0*ld1,), (Ops.ADD, (0, 2, 4, 6)),))) # noqa: E501 + ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),))),)) # noqa: E501 + ld1 = UOp(Ops.LOAD, dtypes.float, src=(g2.view(ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),))),)) # noqa: E501 + store = UOp(Ops.STORE, src=(g0.view(ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),))), UOp(Ops.REDUCE_AXIS, dtypes.float, (ld0*ld1,), (Ops.ADD, (0, 2, 4, 6)),))) # noqa: E501 sink = UOp(Ops.SINK, src=(store,)) data1 = Tensor.randn(2, 1, 4, 1, 3, 4, 2, 6, 1, 3).realize() data2 = Tensor.randn(2, 1, 4, 1, 3, 4, 2, 6, 1, 3).realize() diff --git a/tinygrad_repo/test/test_linearizer_dumb.py b/tinygrad_repo/test/test_linearizer_dumb.py index 0cbaea3..0c56dc9 100644 --- a/tinygrad_repo/test/test_linearizer_dumb.py +++ b/tinygrad_repo/test/test_linearizer_dumb.py @@ -3,7 +3,6 @@ # like test_linearizer_failures, but they don't have to fail import unittest -from test.helpers import ast_const from tinygrad import Device, dtypes from tinygrad.device import is_dtype_supported from tinygrad.uop.ops import UOp, Ops @@ -17,8 +16,8 @@ class TestLinearizerDumb(unittest.TestCase): def test_unmerged_ifs(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(25088, 0, 49, 7, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(1605632), arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), 
strides=(25088, 0, 49, 7, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1605632), arg=0, src=()),)), UOp(Ops.MAX, dtypes.half, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.CAST, dtypes.half, arg=None, src=( @@ -26,15 +25,15 @@ class TestLinearizerDumb(unittest.TestCase): UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 1, 512, 4, 9, 4, 9), strides=(0, 25088, 0, 49, 0, 7, 0, 1), offset=-8, mask=((0, 1), (0, 64), (0, 1), (0, 512), (0, 4), (1, 8), (0, 4), (1, 8)), contiguous=False), View(shape=(64, 1, 512, 7, 7, 512, 3, 3), strides=(663552, 0, 0, 36, 1, 1296, 360, 10), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(1605632), arg=ShapeTracker(views=(View(shape=(1, 64, 1, 512, 4, 9, 4, 9), strides=(0, 25088, 0, 49, 0, 7, 0, 1), offset=-8, mask=((0, 1), (0, 64), (0, 1), (0, 512), (0, 4), (1, 8), (0, 4), (1, 8)), contiguous=False), View(shape=(64, 1, 512, 7, 7, 512, 3, 3), strides=(663552, 0, 0, 36, 1, 1296, 360, 10), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1605632), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 512, 3, 3), strides=(0, 0, 4608, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), - ast_const(dtypes.half, 0.9999950000374996, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.half, 0.0, st_src=( - UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(2359296), arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 512, 3, 3), strides=(0, 0, 4608, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(2359296), arg=2, src=()),)),)),)),)),)),)), + UOp(Ops.CONST, dtypes.half, arg=0.9999950000374996, src=( + x16:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 512, 7, 7, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.CONST, dtypes.half, arg=0.0, src=( + x16,)),)),)),)) opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UNROLL, axis=1, arg=0)] k = Kernel(ast, opts=Device["METAL"].renderer) k.apply_opts(opts) @@ -49,26 +48,31 @@ class TestLinearizerDumb(unittest.TestCase): def test_max_simplify_and_cancel(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(1000), arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1000), arg=0, src=()),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CAST, dtypes.int, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, 
dtypes.float.ptr(1000), arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1000), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.bool, True, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x14:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, -1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1001, 1999), strides=(0, 0), offset=0, mask=((0, 1001), (999, 1999)), contiguous=False), View(shape=(1000, 1000), strides=(1, 2000), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, 1000, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1001, 1999), strides=(0, 0), offset=0, mask=((0, 1001), (999, 1999)), contiguous=False), View(shape=(1000, 1000), strides=(1, 2000), offset=0, mask=None, contiguous=False))), 
src=()),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x21,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=1000, src=( + x14,)),)),)),)),)) opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) @@ -80,12 +84,12 @@ class TestLinearizerDumb(unittest.TestCase): def test_expander_new_srcs(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)] k = 
Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) @@ -101,8 +105,8 @@ class TestLinearizerDumb(unittest.TestCase): def test_llama_embedding(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(4096), arg=ShapeTracker(views=(View(shape=(4096, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(4096), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( @@ -112,77 +116,50 @@ class TestLinearizerDumb(unittest.TestCase): UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32001, 63999), strides=(0, 0), offset=0, mask=((0, 32001), (31999, 63999)), contiguous=False), View(shape=(4096, 32000, 32000), strides=(0, 1, 64000), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, -1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32001, 63999), strides=(0, 0), offset=0, mask=((0, 32001), (31999, 63999)), contiguous=False), View(shape=(4096, 32000, 32000), strides=(0, 1, 64000), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x16:=UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(4096, 32000, 32000), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x16,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x19:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.bool, True, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=1, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x19,)),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(1, 4096, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(131072000), arg=ShapeTracker(views=(View(shape=(4096, 32000, 1), strides=(1, 4096, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(131072000), arg=2, src=()),)),)),)),)),)),)),)),)) k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) prg = k.to_program() print(prg.src) - # from process replay https://github.com/tinygrad/tinygrad/actions/runs/10389229290/job/28766762085#step:18:6490 - @unittest.expectedFailure - def test_unaligns_idxs(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( 
- UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.long, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 1, 5), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CAST, dtypes.long, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 1, 5), strides=(0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - ast_const(dtypes.bool, True, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 1, 5), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 1, 5), strides=(0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=3)] - k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) - k.apply_opts(opts) - prg = k.to_program() - print(prg.src) - load_idxs = [x.src[1] for x in k.uops if x.op is Ops.LOAD and x.src[0].arg == 3] - assert load_idxs[0] < load_idxs[1], f"first loaded idx {load_idxs[0].arg} then {load_idxs[1].arg}!" 
- @unittest.expectedFailure @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need float4") def test_unrolled_float4_align(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 1)), src=( UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.long, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(6, 1), offset=0, mask=None, contiguous=True),)), src=()),)), - ast_const(dtypes.long, -1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.bool, True, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.long.ptr(18), arg=ShapeTracker(views=(View(shape=(3, 6), strides=(6, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(18), arg=1, src=()),)),)), + UOp(Ops.CONST, dtypes.long, arg=-1, src=( + x11:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(0, 0), 
offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x11,)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0, src=( + x11,)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 6), strides=(6, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(18), arg=ShapeTracker(views=(View(shape=(3, 6), strides=(6, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(18), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UNROLL, axis=0, arg=0)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) @@ -197,16 +174,16 @@ class TestLinearizerDumb(unittest.TestCase): def test_upcasted_stores_out_of_order(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 1, 1, 4, 3, 3), strides=(2340, 468, 36, 0, 0, 0, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(9360), arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 1, 1, 4, 3, 3), strides=(2340, 468, 36, 0, 0, 0, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9360), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 4, 1, 4, 3, 3), strides=(0, 0, 0, 0, 0, 0, 1, 0, 4, 48, 16), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(144), 
arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 4, 1, 4, 3, 3), strides=(0, 0, 0, 0, 0, 0, 1, 0, 4, 48, 16), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(144), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 4, 1, 4, 3, 3), strides=(260, 13, 1, 0, 0, 0, 65, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(1040), arg=ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 4, 1, 4, 3, 3), strides=(260, 13, 1, 0, 0, 0, 65, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1040), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=0)] k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) k.apply_opts(opts) diff --git a/tinygrad_repo/test/test_linearizer_failures.py b/tinygrad_repo/test/test_linearizer_failures.py index 931b738..ec4f2ae 100644 --- a/tinygrad_repo/test/test_linearizer_failures.py +++ b/tinygrad_repo/test/test_linearizer_failures.py @@ -8,7 +8,6 @@ from tinygrad.engine.search import Opt, OptOps from tinygrad import Device, dtypes, Tensor from tinygrad.helpers import CI, Context from test.external.fuzz_linearizer import compare_linearizer -from test.helpers import ast_const from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View @@ -42,43 +41,43 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_1(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, 
dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=0, src=()),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + x8:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=1, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + x8,)),)),)),)),)) helper_test_lin(Kernel(ast), [], failed_platforms=[]) def test_failure_2(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, 
src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 2, 37, 9, 1, 1), strides=(666, 333, 9, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(21312), arg=ShapeTracker(views=(View(shape=(32, 2, 37, 9, 1, 1), strides=(666, 333, 9, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(21312), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (4, 5)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 2, 111, 27), strides=(6160, 3080, 28, 1), offset=0, mask=((0, 32), (0, 2), (0, 110), (0, 27)), contiguous=False), View(shape=(32, 2, 37, 9, 2, 2), strides=(5994, 2997, 81, 3, 27, 1), offset=0, mask=None, contiguous=False))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(197119), arg=ShapeTracker(views=(View(shape=(32, 2, 111, 27), strides=(6160, 3080, 28, 1), offset=0, mask=((0, 32), (0, 2), (0, 110), (0, 27)), contiguous=False), View(shape=(32, 2, 37, 9, 2, 2), strides=(5994, 2997, 81, 3, 27, 1), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(197119), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) def test_failure_3(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 8, 16, 1), strides=(128, 16, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(32, 8, 16, 1), strides=(128, 16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, 
dtypes.float, arg=(Ops.ADD, (3,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 8, 16, 16), strides=(2048, 256, 16, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(65536), arg=ShapeTracker(views=(View(shape=(32, 8, 16, 16), strides=(2048, 256, 16, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(65536), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.UNROLL, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=32)] # METAL: AssertionError: Error Domain=AGXMetalG13X Code=3 "Threadgroup memory size (65536) exceeds the maximum threadgroup memory allowed (32768)" UserInfo={NSLocalizedDescription=Threadgroup memory size (65536) exceeds the maximum threadgroup memory allowed (32768)} helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -86,19 +85,19 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_5(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( UOp(Ops.ADD, 
dtypes.float, arg=None, src=( x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( - ast_const(dtypes.float, 0.1464405059814453, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.1464405059814453, src=( + x8:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( + x8,)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=1, src=()),)),)),)), x5,)),)),)),)) opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)] # EXEC_ERROR, it has no global_size @@ -107,13 +106,18 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_6(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(10), 
arg=ShapeTracker(views=(View(shape=(10, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(10), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, -1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 10), strides=(1, 20), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, 10, st_src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 10), strides=(1, 20), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x9,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=10, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0)] # COMPILE FAILED, KeyError: Ops.CONST @@ -122,12 +126,12 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_7(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 32, 1, 34, 1, 34), strides=(36992, 1156, 0, 34, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(18939904), arg=ShapeTracker(views=(View(shape=(512, 32, 1, 34, 
1, 34), strides=(36992, 1156, 0, 34, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(18939904), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 4)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 32, 6, 8, 4, 6, 8, 4), strides=(2048, 64, 6291456, 8, 0, 1048576, 1, 0), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 8), (0, 1), (0, 6), (0, 8), (0, 1)), contiguous=False), View(shape=(512, 32, 6, 35, 6, 35), strides=(1179648, 36864, 6144, 192, 32, 1), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 32), (0, 6), (0, 32)), contiguous=False), View(shape=(512, 32, 238, 238), strides=(1411200, 44100, 210, 1), offset=0, mask=((0, 512), (0, 32), (0, 210), (0, 210)), contiguous=False), View(shape=(512, 32, 7, 34, 7, 34), strides=(1812608, 56644, 8092, 238, 34, 1), offset=0, mask=None, contiguous=True))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(37748736), arg=ShapeTracker(views=(View(shape=(512, 32, 6, 8, 4, 6, 8, 4), strides=(2048, 64, 6291456, 8, 0, 1048576, 1, 0), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 8), (0, 1), (0, 6), (0, 8), (0, 1)), contiguous=False), View(shape=(512, 32, 6, 35, 6, 35), strides=(1179648, 36864, 6144, 192, 32, 1), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 32), (0, 6), (0, 32)), contiguous=False), View(shape=(512, 32, 238, 238), strides=(1411200, 44100, 210, 1), offset=0, mask=((0, 512), (0, 32), (0, 210), (0, 210)), contiguous=False), View(shape=(512, 32, 7, 34, 7, 34), strides=(1812608, 56644, 8092, 238, 34, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(37748736), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=4)] # test/test_linearizer_failures.py Fatal Python error: Segmentation fault helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -135,8 
+139,8 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_8(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), UOp(Ops.SQRT, dtypes.float, arg=None, src=( UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( @@ -145,16 +149,16 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MUL, dtypes.float, arg=None, src=( x9:=UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=2, src=()),)),)),)), x9,)),)), - ast_const(dtypes.float, 0.000244140625, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1), 
strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)), - ast_const(dtypes.float, 1e-06, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)),)),)) + UOp(Ops.CONST, dtypes.float, arg=0.000244140625, src=( + x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1e-06, src=( + x17,)),)),)),)),)),)) opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4)] # fatal error: bracket nesting level exceeded maximum of 256 # note: use -fbracket-depth=N to increase maximum nesting level @@ -163,43 +167,43 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_9(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 0, 0, 4500, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(13500), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 0, 0, 4500, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13500), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, 
contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 4500, 0, 0, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(9000), arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 4500, 0, 0, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9000), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) def test_failure_10(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=0, src=()),)), UOp(Ops.ADD, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(50257), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(50257), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 1, 1024), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.half.ptr(51463168), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 1, 1024), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51463168), arg=2, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=3, src=()),)),)),)),)),)) helper_test_lin(Kernel(ast), [], failed_platforms=[]) def test_failure_11(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 64, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(1, 64, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), 
arg=0, src=()),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( @@ -208,22 +212,22 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(1179648), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True),)), src=( + x12:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1179648), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + x15:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0, src=( + x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, 
contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + x20:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=3, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( + x17,)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( - ast_const(dtypes.float, 1.0, st_src=( + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( @@ -234,127 +238,102 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(1179648), 
arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + x12,)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), - x42:=ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + 
x15,)),)),)), + x39:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( + x40:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), - ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 
64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + x20,)),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( + x40,)),)), UOp(Ops.SQRT, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64,), strides=(1,), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)), - ast_const(dtypes.float, 5.425347222222222e-05, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64,), strides=(0,), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), - ast_const(dtypes.float, 1e-05, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64,), strides=(0,), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 
3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)),)),)),)),)), - x42,)), + UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(64,), strides=(1,), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=4, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=5.425347222222222e-05, src=( + x53:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64,), strides=(0,), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( + x53,)),)),)),)),)),)), + x39,)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), 
src=()),)),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=5, src=()),)),)),)),)),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=6, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=7, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(294912), 
arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=7, src=()),)),)),)),)),)),)),)),)) helper_test_lin(Kernel(ast), [], failed_platforms=[]) def test_failure_12(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=1, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=2, src=()),)),)),)), x5,)),)),)),)) opts = [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.GROUP, axis=0, arg=4)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - @unittest.skip("found implicit expand") - def test_failure_12_multireduce(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 8)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - x5:=UOp(Ops.ADD, dtypes.float, arg=None, src=( - x6:=UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 
1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - x6,)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 8)), src=( - x5,)),)),)),)),)) - opts = [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.GROUP, axis=0, arg=4)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - # both kernels are correct from a code standpoint, but generate different results due to precision errors (switching to float results in output matches) def test_failure_13(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(768), arg=0, src=()),)), UOp(Ops.ADD, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(51864, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(103728), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(51864, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, 
dtypes.half.ptr(103728), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(0, 0, 1, 384), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.half.ptr(19915776), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(0, 0, 1, 384), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(19915776), arg=2, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=19584, mask=None, contiguous=False),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(19968), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=19584, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(19968), arg=3, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=4)] helper_test_lin(Kernel(ast), opts, failed_platforms=["METAL", "GPU", "CUDA"]) def test_failure_14(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( 
UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=1, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=2, src=()),)),)),)), x5,)),)),)),)) opts = [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)] # COMPILE_ERROR on METAL in fuzz_linearizer: unused variables and undeclared variables @@ -363,8 +342,8 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_15(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 
14, 1, 1, 1), strides=(0, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(21952), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(21952), arg=0, src=()),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( @@ -372,28 +351,28 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 0, 14, 1, 196, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(94080), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 0, 14, 1, 196, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(94080), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 480, 0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(53760), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 480, 0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(53760), arg=2, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), 
strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=3, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=4, src=()),)),)),)), UOp(Ops.SQRT, dtypes.float, arg=None, src=( UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1e-05, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=5, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=6, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.LOCAL, axis=1, arg=16)] # COMPILE_ERROR on METAL in fuzz_linearizer ast 115: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -401,14 +380,14 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_16(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 13, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(13), arg=ShapeTracker(views=(View(shape=(1, 13, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 13, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)), - 
ast_const(dtypes.float, 0.0009765625, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(13312), arg=ShapeTracker(views=(View(shape=(1, 13, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13312), arg=1, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0009765625, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 13, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=1, arg=4)] # COMPILE_ERROR on METAL/GPU (probably HIP/CUDA too) in fuzz_linearizer ast 154: bracket nesting level exceeded maximum of 256 @@ -417,16 +396,16 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_17(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 40, 1, 28, 28, 1, 1), strides=(31360, 0, 784, 0, 28, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(62720), arg=ShapeTracker(views=(View(shape=(2, 1, 40, 1, 28, 28, 1, 1), strides=(31360, 0, 784, 0, 28, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(62720), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(0, 0, 1, 40, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(9600), 
arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(0, 0, 1, 40, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(188160, 0, 0, 784, 28, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(376320), arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(188160, 0, 0, 784, 28, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(376320), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.GROUPTOP, axis=0, arg=16), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=1, arg=4)] # COMPILE_ERROR on METAL in fuzz_linearizer ast 178: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -434,24 +413,24 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_18(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, 
dtypes.float.ptr(768), arg=0, src=()),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(768), arg=1, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(1536, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(3072), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(1536, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(3072), arg=2, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(0, 0, 1536, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(589824), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(0, 0, 1536, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(589824), arg=3, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=0, mask=None, 
contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(384), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(384), arg=4, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUPTOP, axis=0, arg=256), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=3)] # COMPILE_ERROR on METAL in fuzz_linearizer ast 239: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -459,16 +438,16 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_19(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 9, 7, 3, 3), strides=(2268, 0, 567, 0, 63, 9, 3, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(4536), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 9, 7, 3, 3), strides=(2268, 0, 567, 0, 63, 9, 3, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4536), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(0, 0, 36, 9, 0, 0, -3, -1), offset=8, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(144), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(0, 0, 36, 9, 0, 0, -3, -1), offset=8, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(144), arg=1, 
src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(252, 0, 0, 63, 7, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(504), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(252, 0, 0, 63, 7, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(504), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=7), Opt(op=OptOps.UPCAST, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=3)] # COMPILE_ERROR on METAL in fuzz_linearizer ast 379: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -476,13 +455,13 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_20(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 4), strides=(4, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(4, 4), strides=(4, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 4), strides=(0, 1), offset=0, mask=None, contiguous=False),)), 
src=()),)), - ast_const(dtypes.float, 1.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(4), arg=ShapeTracker(views=(View(shape=(4, 4), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4), arg=1, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 4), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=0)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -490,9 +469,9 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_21(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(45, 65), strides=(65, 1), offset=0, mask=None, contiguous=True),)), src=()), - ast_const(dtypes.float, 1.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(2925), arg=ShapeTracker(views=(View(shape=(45, 65), strides=(65, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(2925), arg=0, src=()),)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(45, 65), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)) opts = [Opt(op=OptOps.PADTO, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -502,11 +481,11 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_22(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(96), 
arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( - x4:=ast_const(dtypes.float, 0.000244140625, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + x4:=UOp(Ops.CONST, dtypes.float, arg=0.000244140625, src=( + x5:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( @@ -519,100 +498,100 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(393216), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(393216), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=2, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, 
src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=3, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=4, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=5, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=6, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, 
src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=7, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=7, src=()),)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=8, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=8, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=9, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, 
mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=9, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=10, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), 
src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=10, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=11, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), 
contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=11, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=12, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), 
View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 
1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=12, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=13, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), 
offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=13, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=14, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 
32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), 
strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=14, src=()),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=15, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), 
View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), 
strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=15, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=16, src=()), - 
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 17280, 180, 18, 1), offset=19, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, 
mask=None, contiguous=True))), src=()),)),)),)),)),)),)),)),)),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(544301), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 17280, 180, 18, 1), offset=19, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, 
mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(544301), arg=16, src=()),)),)),)),)),)),)),)),)),)),)),)),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=17, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)), - ast_const(dtypes.float, 2.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)), - x80:=UOp(Ops.RECIP, dtypes.float, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=17, src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=2.0, src=( + x5,)),)),)),)), + x79:=UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=18, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=18, src=()),)),)), x4,)), - ast_const(dtypes.float, 1e-05, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)), - x80,)),)),)),)) + 
UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( + x5,)),)),)),)), + x79,)),)),)),)) opts = [] helper_test_lin(Kernel(ast), opts, failed_platforms=["METAL", "CUDA"]) def test_failure_23(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=0, src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=1, src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) def test_failure_24(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), 
arg=0, src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=1, src=()),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=2), Opt(op=OptOps.LOCAL, axis=1, arg=8), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -620,13 +599,18 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_25(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1025, 2047), strides=(0, 0), offset=0, mask=((0, 1025), (1023, 2047)), contiguous=False), View(shape=(1024, 1024), strides=(1, 2048), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, -1, st_src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, 
arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1025, 2047), strides=(0, 0), offset=0, mask=((0, 1025), (1023, 2047)), contiguous=False), View(shape=(1024, 1024), strides=(1, 2048), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1024), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x9,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=16), Opt(op=OptOps.UNROLL, axis=0, arg=4)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) @@ -635,13 +619,18 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_26(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(128), arg=ShapeTracker(views=(View(shape=(128, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(128), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(129, 255), strides=(0, 0), offset=0, mask=((0, 129), (127, 255)), contiguous=False), View(shape=(128, 128), strides=(1, 256), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, -1, st_src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(129, 255), strides=(0, 0), offset=0, mask=((0, 129), (127, 255)), contiguous=False), View(shape=(128, 128), strides=(1, 256), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 128), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x9,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) all_failing_opts = [ [Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.GROUPTOP, axis=0, arg=32), Opt(op=OptOps.UNROLL, axis=0, arg=0)], @@ -675,12 +664,12 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_27(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(208), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.MAX, (3,)), src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(2704), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(2704), arg=1, 
src=()),)),)),)),)),)) all_failing_opts = [ [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=7), Opt(op=OptOps.UPCAST, axis=0, arg=0)], ] @@ -690,73 +679,73 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_28(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bfloat16.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.bfloat16.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.bfloat16.ptr(1), arg=0, src=()),)), UOp(Ops.WHERE, dtypes.bfloat16, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( x5:=UOp(Ops.CAST, dtypes.bfloat16, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - x9:=ast_const(dtypes.bfloat16, 230.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=1, src=()),)),)),)), + x9:=UOp(Ops.CONST, dtypes.bfloat16, arg=230.0, src=( + x10:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( x5, - ast_const(dtypes.bfloat16, 0.004347826086956522, st_src=( - 
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - ast_const(dtypes.bfloat16, 0.199374800625, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - ast_const(dtypes.bfloat16, 1.99375e-07, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), + UOp(Ops.CONST, dtypes.bfloat16, arg=0.004347826086956522, src=( + x10,)),)), + UOp(Ops.CONST, dtypes.bfloat16, arg=0.199374800625, src=( + x10,)),)), + UOp(Ops.CONST, dtypes.bfloat16, arg=1.99375e-07, src=( + x10,)),)), UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( x5, x9,)), - ast_const(dtypes.bfloat16, 0.0012987012987012987, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - ast_const(dtypes.bfloat16, -0.19439062499999998, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - ast_const(dtypes.bfloat16, 0.199375, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)),)) + UOp(Ops.CONST, dtypes.bfloat16, arg=0.0012987012987012987, src=( + x10,)),)), + UOp(Ops.CONST, dtypes.bfloat16, arg=-0.19439062499999998, src=( + x10,)),)), + UOp(Ops.CONST, dtypes.bfloat16, arg=0.199375, src=( + x10,)),)),)),)),)) helper_test_lin(Kernel(ast), opts=[], failed_platforms=[]) def test_failure_29(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), 
arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 128, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 128), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(1, 128, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 128), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(36864), 
arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(36864), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 1, 1)), Opt(op=OptOps.PADTO, axis=2, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[], atol=1.0) def test_failure_30(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 1, 1, 1), strides=(11532, 0, 961, 31, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(2952192), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 1, 1, 1), strides=(11532, 0, 961, 31, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(2952192), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(3072, 0, 0, 32, 1, 1024, 32, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(786432), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(3072, 0, 0, 32, 1, 1024, 32, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(786432), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(0, 
0, 12, 0, 0, 4, 2, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(144), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(0, 0, 12, 0, 0, 4, 2, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(144), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.PADTO, axis=3, arg=32), Opt(op=OptOps.LOCAL, axis=3, arg=32), Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -764,19 +753,19 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_31(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(208), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( UOp(Ops.EXP2, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(2704), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(2704), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, 
dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 1.4426950408889634, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(208), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -787,18 +776,18 @@ class TestLinearizerFailures(unittest.TestCase): # Memory access fault on tinybox red ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 1, 1, 1), strides=(50176, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(12845056), arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 1, 1, 1), strides=(50176, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12845056), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 256, 4, 16, 4, 16), strides=(0, 50176, 
0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 256), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(1048576, 0, 0, 64, 1, 4096, 1088, 17), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(12845056), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 256, 4, 16, 4, 16), strides=(0, 50176, 0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 256), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(1048576, 0, 0, 64, 1, 4096, 1088, 17), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12845056), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(589824), arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(589824), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=7), Opt(op=OptOps.UNROLL, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=1, arg=16)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[], atol=0.1, rtol=0.05) @@ -806,39 +795,50 @@ class TestLinearizerFailures(unittest.TestCase): # Ops.UNMUL left after linearize ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()), + 
UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( x5:=UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=0, mask=((0, 26040),), contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(26040), arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=0, mask=((0, 26040),), contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(26040), arg=1, src=()),)),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( x5, - x10:=ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=None, contiguous=False),)), src=()),)),)), + x10:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( + x11:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.WHERE, dtypes.float, arg=None, src=( UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( - ast_const(dtypes.float, 0.06788442333021306, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((0, 26040),), contiguous=False),)), src=()),)), + UOp(Ops.WHERE, dtypes.float, arg=None, src=( + x18:=UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((0, 26040),), contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.float, 
arg=0.06788442333021306, src=( + x11,)), + x10,)), x5,)), - ast_const(dtypes.float, -0.03394221166510653, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((0, 26040),), contiguous=False),)), src=()),)),)), + UOp(Ops.WHERE, dtypes.float, arg=None, src=( + x18, + UOp(Ops.CONST, dtypes.float, arg=-0.03394221166510653, src=( + x11,)), + x10,)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=-26040, mask=((26040, 32640),), contiguous=False),)), src=()),)), - ast_const(dtypes.float, -0.18257418583505536, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((26040, 32640),), contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(6600), arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=-26040, mask=((26040, 32640),), contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6600), arg=2, src=()),)),)), + UOp(Ops.WHERE, dtypes.float, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((26040, 32640),), contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.float, arg=-0.18257418583505536, src=( + x11,)), + x10,)),)),)), x10,)), - ast_const(dtypes.float, -1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=None, contiguous=False),)), src=()),)), - ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( + x11,)), + UOp(Ops.CONST, dtypes.float, arg=1.0, src=( + 
x11,)),)), x10,)),)),)),)),)) opts = [Opt(op=OptOps.GROUPTOP, axis=0, arg=16)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -847,18 +847,18 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_34(self, unroll=False): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 1, 1), strides=(180, 0, 30, 3, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(720), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 1, 1), strides=(180, 0, 30, 3, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(720), arg=0, src=()),)), UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(77, 0, 0, 7, 1, 0, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(308), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(77, 0, 0, 7, 1, 0, 7, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(308), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(0, 0, 10, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - ast_const(dtypes.float, 0.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(0, 0, 10, 0, 0, 0, 5, 1), 
offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=2, src=()),)),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UNROLL, axis=0, arg=0)] if unroll else [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -870,15 +870,20 @@ class TestLinearizerFailures(unittest.TestCase): # Ops.UNMUL left after linearize ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(5, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.uchar.ptr(5), arg=ShapeTracker(views=(View(shape=(5, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(5), arg=0, src=()),)), UOp(Ops.CAST, dtypes.uchar, arg=None, src=( UOp(Ops.ADD, dtypes.uint, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.uint, arg=(Ops.ADD, (1,)), src=( UOp(Ops.CAST, dtypes.uint, arg=None, src=( - ast_const(dtypes.uchar, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(6, 9), strides=(0, 0), offset=0, mask=((0, 6), (4, 9)), contiguous=False), View(shape=(5, 5), strides=(1, 10), offset=0, mask=None, contiguous=False))), src=()),)),)),)), - ast_const(dtypes.uint, -1, st_src=( + UOp(Ops.WHERE, dtypes.uchar, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(6, 9), strides=(0, 0), offset=0, mask=((0, 6), (4, 9)), contiguous=False), View(shape=(5, 5), strides=(1, 10), offset=0, mask=None, contiguous=False))), src=()),)), + 
UOp(Ops.CONST, dtypes.uchar, arg=1, src=( + x11:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(5, 5), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.uchar, arg=0, src=( + x11,)),)),)),)), + UOp(Ops.CONST, dtypes.uint, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(5, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -890,23 +895,23 @@ class TestLinearizerFailures(unittest.TestCase): # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=28 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(9437184), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9437184), arg=0, src=()),)), UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.uchar.ptr(401408), 
arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(401408), arg=1, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(800), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(800), arg=2, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 0.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(32), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(32), arg=3, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) for axis in [0,1,2,3,4,5]: opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] @@ -917,17 +922,17 @@ class TestLinearizerFailures(unittest.TestCase): # fuzz: PYTHONPATH=. 
METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=87 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 32, 1, 1, 1, 5, 5, 256), strides=(0, 0, 6400, 0, 0, 0, 1280, 256, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(1, 1, 32, 1, 1, 1, 5, 5, 256), strides=(0, 0, 6400, 0, 0, 0, 1280, 256, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 3, 4)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(784, 0, 0, 28, 1, 0, 28, 1, 1568), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.uchar.ptr(401408), arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(784, 0, 0, 28, 1, 0, 28, 1, 1568), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(401408), arg=1, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(18432, 0, 576, 24, 1, 0, 0, 0, 36864), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(9437184), arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(18432, 0, 576, 24, 1, 0, 0, 0, 36864), offset=0, mask=None, 
contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9437184), arg=2, src=()),)),)),)),)),)),)) for axis in [0,1,3,4]: opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -938,23 +943,23 @@ class TestLinearizerFailures(unittest.TestCase): # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=127 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(184320000), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(184320000), arg=0, src=()),)), UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.uchar.ptr(7840000), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(7840000), arg=1, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, 
dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(800), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(800), arg=2, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.float, 0.0, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(32), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(32), arg=3, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.0, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) for axis in [0,1,2,3,4,5]: opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] @@ -965,13 +970,18 @@ class TestLinearizerFailures(unittest.TestCase): # fuzz: PYTHONPATH=. 
METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=2 DEBUG=2 FUZZ_NTH=3 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(60000), arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(60000), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, -1, st_src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 60000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x9,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) for amt in [16,32]: opts = [Opt(op=OptOps.GROUPTOP, axis=0, arg=amt), Opt(op=OptOps.UNROLL, axis=0, 
arg=0)] @@ -983,18 +993,18 @@ class TestLinearizerFailures(unittest.TestCase): # One more resnet crash with a page fault on AMD. Checked on rocm6.1.3, -O1 works, -O2 fails ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 1, 1, 1), strides=(100352, 0, 784, 28, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 1, 1, 1), strides=(100352, 0, 784, 28, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 128, 4, 58, 4, 58), strides=(0, 401408, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 128), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(6889472, 0, 0, 464, 2, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(102760448), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 128, 4, 58, 4, 58), strides=(0, 401408, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 128), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(6889472, 0, 0, 464, 2, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(102760448), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, 
src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(147456), arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(147456), arg=2, src=()),)),)),)),)),)),)),)),)) opts=[Opt(op=OptOps.TC, axis=5, arg=(-1, 2, 1)), Opt(op=OptOps.UNROLL, axis=0, arg=0)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=["AMD", "HIP"], atol=0.02) @@ -1004,12 +1014,12 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_42(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( + 
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.PADTO, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -1018,12 +1028,12 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_43(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -1032,12 +1042,12 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_44(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, 
src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=()),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)] k = helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) assert k is not None @@ -1049,39 +1059,47 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_45(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 1, 1, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 1, 1, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, 
contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 3)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 3, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 3, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=1, src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=2, src=()),)),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (4,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 3), strides=(0, 0), offset=0, mask=((0, 3), (1, 3)), contiguous=False), View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 1, 0, 4), offset=0, mask=((0, 2), (0, 3), (0, 2), (0, 3), (0, 2)), contiguous=False))), src=()),)),)), - x19:=ast_const(dtypes.int, -1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - 
x21:=ast_const(dtypes.bool, True, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 3), strides=(0, 0), offset=0, mask=((0, 3), (1, 3)), contiguous=False), View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 1, 0, 4), offset=0, mask=((0, 2), (0, 3), (0, 2), (0, 3), (0, 2)), contiguous=False))), src=()),)), + x20:=UOp(Ops.CONST, dtypes.int, arg=1, src=( + x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + x22:=UOp(Ops.CONST, dtypes.int, arg=0, src=( + x21,)),)),)), + x23:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x24:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + x25:=UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x24,)),)), UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.int.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(6), arg=3, src=()),)),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (4,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5), strides=(0, 0), offset=0, mask=((0, 4), (2, 5)), contiguous=False), 
View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 0, 1, 6), offset=0, mask=None, contiguous=False))), src=()),)),)), - x19,)),)), - x21,)),)),)),)),)),)),)) + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5), strides=(0, 0), offset=0, mask=((0, 4), (2, 5)), contiguous=False), View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 0, 1, 6), offset=0, mask=None, contiguous=False))), src=()),)), + x20, + x22,)),)), + x23,)),)), + x25,)),)),)),)),)),)),)) # ValueError: size mismatched, can't reshape self.shape=(6, 2, 3, 3) -> new_shape=(6, 2, 3, 1, 2) opts = [Opt(op=OptOps.UNROLL, axis=2, arg=0)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -1089,8 +1107,8 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_46(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( @@ -1099,23 +1117,23 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.int.ptr(10), arg=ShapeTracker(views=(View(shape=(512, 10), 
strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(10), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.bool, True, st_src=( + UOp(Ops.VIEW, dtypes.int.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(512), arg=2, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.bool, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.bool.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(512), arg=3, src=()),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=4, src=()),)),)),)),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=5, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=2)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -1123,13 +1141,18 @@ class TestLinearizerFailures(unittest.TestCase): # upcast an arange, failed with UOP_IS_SYMBOLIC=1 (fixed!) ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(60000), arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(60000), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - ast_const(dtypes.int, 1, st_src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)),)), - ast_const(dtypes.int, -1, st_src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x9:=UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(60000, 60000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x9,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=3)] helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) @@ -1139,17 +1162,17 @@ class TestLinearizerFailures(unittest.TestCase): # with UOP_IS_SYMBOLIC=1, generates the wrong IDIV (fixed!) ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 64, 1, 1, 256, 1, 1, 256), strides=(0, 0, 65536, 0, 0, 256, 0, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(4194304), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 1, 1, 256, 1, 1, 256), strides=(0, 0, 65536, 0, 0, 256, 0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3, 4)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 0, 56, 1, 3136, 0, 0, 802816), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(205520896), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 0, 56, 1, 3136, 0, 0, 802816), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(205520896), arg=1, src=()),)),)), UOp(Ops.LOAD, 
dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 3136, 56, 1, 0, 0, 0, 200704), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 3136, 56, 1, 0, 0, 0, 200704), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=2, src=()),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 0, 1)), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) @@ -1157,16 +1180,16 @@ class TestLinearizerFailures(unittest.TestCase): # with UOP_IS_SYMBOLIC=1, on METAL it breaks store fusion and has A+B and B+A being two different UOp ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 6, 1), strides=(6, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(10, 6, 1), strides=(6, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 6, 10), strides=(10, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(100), arg=ShapeTracker(views=(View(shape=(10, 
6, 10), strides=(10, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(100), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 6, 10), strides=(0, 1, 6), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(10, 6, 10), strides=(0, 1, 6), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=2, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=0, arg=2)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) @@ -1174,25 +1197,25 @@ class TestLinearizerFailures(unittest.TestCase): # from BEAM_COMPARE=2 running tinyphysics.onnx model ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 1, 20), strides=(0, 0, 20, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.bool.ptr(400), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 1, 20), strides=(0, 0, 20, 0, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(400), arg=0, src=()),)), UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.bool, arg=(Ops.ADD, (3,)), src=( UOp(Ops.MUL, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.bool, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 20, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.bool.ptr(400), arg=ShapeTracker(views=(View(shape=(1, 
1, 20, 20, 20), strides=(0, 0, 0, 20, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(400), arg=1, src=()),)),)), UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(20), arg=2, src=()),)),)), UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - ast_const(dtypes.bool, True, st_src=( + UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(20), arg=3, src=()),)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)), - ast_const(dtypes.bool, True, st_src=( + UOp(Ops.CONST, dtypes.bool, arg=True, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 1, 20), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=2)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) @@ -1201,8 +1224,8 @@ class TestLinearizerFailures(unittest.TestCase): # regression test for 
#7019, training bert on tinybox red ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(1024, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(12288), arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(1024, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12288), arg=0, src=()),)), UOp(Ops.RECIP, dtypes.half, arg=None, src=( UOp(Ops.ADD, dtypes.half, arg=None, src=( UOp(Ops.CONST, dtypes.half, arg=1.0, src=( @@ -1218,19 +1241,20 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(524288, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(5768192), arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(524288, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(5768192), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), + UOp(Ops.VIEW, dtypes.half.ptr(1048576), arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1048576), arg=2, src=()),)),)),)),)),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=3, src=()), - 
UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=3, src=()),)),)),)),)), UOp(Ops.CONST, dtypes.half, arg=-1.4426950408889634, src=( x6,)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) + @unittest.skip("allocating over 200MB buffer") @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI") def test_failure_52(self): # resnet beam. @@ -1238,18 +1262,18 @@ class TestLinearizerFailures(unittest.TestCase): # CUDA Error 700, an illegal memory access was encountered ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(205520896), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(205520896), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, 
mask=((0, 1), (0, 256), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.VIEW, dtypes.half.ptr(38535168), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 256), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(38535168), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(9408), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(9408), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) @@ -1257,19 +1281,19 @@ class TestLinearizerFailures(unittest.TestCase): # COMPILE_ERROR, val scope issue ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, 
dtypes.uchar.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(1024), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.uchar, arg=(Ops.ADD, (1,)), src=( UOp(Ops.MUL, dtypes.uchar, arg=None, src=( UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.uchar.ptr(50000), arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(50000), arg=1, src=()),)),)), UOp(Ops.CAST, dtypes.uchar, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=2, src=()),)),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,)), src=( UOp(Ops.WHERE, dtypes.int, arg=None, src=( @@ -1292,18 +1316,18 @@ class TestLinearizerFailures(unittest.TestCase): # HIP: Memory access fault by GPU node-1 (Agent handle: 0x56c21f1d1480) on address 0x730cc242e000. Reason: Page not present or supervisor privilege. 
ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=0, src=()),)), UOp(Ops.CAST, dtypes.half, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.half, arg=None, src=( UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, 
contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=1, src=()),)),)), + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(36864), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(36864), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=7), Opt(op=OptOps.UPCAST, axis=1, arg=2)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=["HIP", "AMD"]) @@ -1311,28 +1335,27 @@ class TestLinearizerFailures(unittest.TestCase): def test_failure_55(self): W = 2 ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, W, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, W), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), - View(shape=(W, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 
64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(W * 200704), arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 200704), arg=0, src=()),)), + UOp(Ops.CAST, dtypes.half, arg=None, src=( + UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(W * 200704), arg=ShapeTracker(views=(View(shape=(1, W, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, W), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(W, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 200704), arg=1, src=()),)),)), + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(W * 18432), arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 18432), arg=2, src=()),)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.SWAP, axis=1, arg=2)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) def test_failure_56(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, 
dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( @@ -1345,35 +1368,35 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(247808), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(247808), arg=1, src=()),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - x20:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( x8,)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=3, src=()),)),)),)), 
UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=4, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=5, src=()),)),)),)), x7,)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=6, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=2, arg=32)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) def test_failure_57(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 
0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( @@ -1386,27 +1409,27 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(247808), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(247808), arg=1, src=()),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - x20:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( x8,)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + 
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=3, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=4, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - x20,)),)), + UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=5, src=()),)),)),)), x7,)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=()),)),)),)),)),)) + UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=6, src=()),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32)] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) @@ -1414,8 +1437,8 @@ class TestLinearizerFailures(unittest.TestCase): # OUT OF BOUNDS ACCESS in INDEX 0 - 50271 not in 0 - 50257. 
idx.src[1].render()='gidx0' ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(50257), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(50257, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.int.ptr(50257), arg=ShapeTracker(views=(View(shape=(50257, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(50257), arg=0, src=()),)), UOp(Ops.ADD, dtypes.int, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( UOp(Ops.WHERE, dtypes.int, arg=None, src=( @@ -1424,7 +1447,7 @@ class TestLinearizerFailures(unittest.TestCase): UOp(Ops.CONST, dtypes.int, arg=1, src=( x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(50257, 50257), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), + x9,)),)),)), UOp(Ops.CONST, dtypes.int, arg=-1, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(50257, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.GROUPTOP, axis=0, arg=29), Opt(op=OptOps.PADTO, axis=0, arg=32)] @@ -1436,41 +1459,41 @@ class TestLinearizerFailures(unittest.TestCase): # stable diffusion with SINGLE_KERNEL_SOFTMAX=1 ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=0, src=()), - x2:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(134217728, 16777216, 4096, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(134217728, 16777216, 4096, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + 
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.EXP2, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x8:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=1, src=()), - x2,)), + UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(134217728, 16777216, 4096, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + x9:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=1, src=()),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x8, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + x9,)),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x14:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + x15:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x14,)),)),)), + x15,)),)),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=( UOp(Ops.EXP2, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, 
arg=None, src=( - x8, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(134217728, 16777216, 4096, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(134217728, 16777216, 4096, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + x9,)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x8, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + x9,)),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x28:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + x29:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x28,)),)),)),)),)),)),)),)) + x29,)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UNROLL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.LOCAL, axis=1, arg=16)] # NOTE: this is slow to run, just confirm it can generate the program without Exception Kernel(ast, opts=Device[Device.DEFAULT].renderer).apply_opts(opts).to_program() @@ -1481,144 +1504,224 @@ class TestLinearizerFailures(unittest.TestCase): # TestSymbolicOps.test_attention ast = 
UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=0, src=()), - x2:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x2:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x2,)), + x2,)), UOp(Ops.CONST, dtypes.int, arg=4, src=()),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x1,)), 0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + x1,)), 0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=0, src=()),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.EXP2, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=1, src=()), - x2,)), + UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.MUL, dtypes.int, arg=None, src=( + x2:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), + UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), + x2,)), + UOp(Ops.CONST, dtypes.int, arg=4, src=()),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.MUL, 
dtypes.int, arg=None, src=( + x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), + UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), + x1,)), 0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=1, src=()),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x12:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=4, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, + x1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( + x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=0, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)), + x3,)), 0, 1), offset=0, 
mask=None, contiguous=False),)), src=( + x14:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=2, src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x15:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + x16:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x15,)),)),)), + x16,)),)),)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=( UOp(Ops.EXP2, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x12, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=4, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, + x1, 
UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( + x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=0, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + x3,)), 1, 0), offset=0, mask=None, contiguous=False),)), src=( + x14,)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - x12, - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=4, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( x0:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0, + x0, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=0, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, 
dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops. - MUL, dtypes.int, arg=None, src=( + UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=4, src=()), x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, + x1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( + x2,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=0, src=()), x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( x0:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False))), src=()),)),)), + x0, + UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False))), src=( + x14,)),)),)), UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x29:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, 
UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int - , arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( + x30:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=4, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( + x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, + x1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( + x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.MUL, dtypes.int, arg=None, src=( UOp(Ops.CONST, dtypes.int, arg=0, src=()), UOp(Ops.MUL, dtypes.int, arg=None, src=( x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 1, 0), offset=0, mask=None, contiguous=False))), src=()),)),)),)), + x3,)), 1, 0), offset=0, mask=None, contiguous=False))), src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x29,)),)),)),)),)),)),)),)) + x30,)),)),)),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=4)] # NOTE: this is slow to run, just confirm it can 
generate the program without Exception Kernel(ast, opts=Device[Device.DEFAULT].renderer).apply_opts(opts).to_program() + def test_failure_61(self): + # WINO=1 JITBEAM=4 python3 examples/beautiful_cifar.py + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=0, src=()),)), + UOp(Ops.CAST, dtypes.half, arg=None, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + x7:=UOp(Ops.CONST, dtypes.half, arg=0.6931471805599453, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CAST, dtypes.half, arg=None, src=( + UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.CONST, dtypes.half, arg=-1.0, src=( + x14:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 10, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.ADD, dtypes.half, arg=None, src=( + UOp(Ops.CONST, dtypes.half, arg=-0.010000000000000002, src=( + x14,)), + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.CAST, dtypes.half, arg=None, src=( + UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( + UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( + UOp(Ops.LOAD, dtypes.int, arg=None, src=( + UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 10, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=1, src=()),)),)), + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,), True), src=( 
+ UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.VALID, dtypes.bool, arg=None, src=( + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(1024, 10, 10), strides=(0, 1, 20), offset=0, mask=None, contiguous=False))), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=1, src=( + x30:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 10, 10), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x30,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x14,)),)),)), + UOp(Ops.CONST, dtypes.bool, arg=True, src=( + x14,)),)),)), + UOp(Ops.CONST, dtypes.half, arg=-0.4, src=( + x14,)),)),)),)),)),)),)),)), + UOp(Ops.RECIP, dtypes.half, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=2, src=()),)),)), + x7,)),)),)),)),)),)),)) + opts = [Opt(op=OptOps.LOCAL, axis=0, arg=32), Opt(op=OptOps.GROUP, axis=1, arg=0)] + helper_test_lin(Kernel(ast), opts, failed_platforms=["AMD", "METAL", "CUDA", "NV"]) + + def test_failure_62(self): + # WINO=1 DEFAULT_FLOAT=HALF FUSE_ARANGE=1 JITBEAM=4 BS=1024 STEPS=500 python examples/hlb_cifar10.py + # RuntimeError: UOp verification failed at 4 on Ops.LOAD dtypes.half 2 [, ] None + ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(11808768), arg=0, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 1, 1, 1), strides=(11532, 0, 961, 31, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.CAST, dtypes.half, arg=None, src=( + UOp(Ops.REDUCE_AXIS, 
dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.half, arg=None, src=( + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(3145728), arg=1, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 3, 2, 2), strides=(3072, 0, 0, 32, 1, 1024, 32, 1), offset=0, mask=None, contiguous=False),)), src=()),)), + UOp(Ops.LOAD, dtypes.half, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(144), arg=2, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 3, 2, 2), strides=(0, 0, 12, 0, 0, 4, 2, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) + opts = [Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=1, arg=3), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.PADTO, axis=2, arg=32), Opt(op=OptOps.UPCAST, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=0)] + helper_test_lin(Kernel(ast), opts, failed_platforms=["AMD", "HIP", "NV", "CUDA"]) + if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/test_linearizer_overflows.py b/tinygrad_repo/test/test_linearizer_overflows.py index 502d544..0bd8ef0 100644 --- a/tinygrad_repo/test/test_linearizer_overflows.py +++ b/tinygrad_repo/test/test_linearizer_overflows.py @@ -1,6 +1,5 @@ # ruff: noqa: E501 import unittest -from test.helpers import ast_const from tinygrad import dtypes, Device from tinygrad.helpers import CI from tinygrad.codegen.kernel import Kernel @@ -26,7 +25,7 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_1(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(51380224), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), 
strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.MAX, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( @@ -36,29 +35,29 @@ class TestLinearizerOverflow(unittest.TestCase): UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9633792), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 64, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 64), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(64, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9408), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - x16:=ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + x16:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( + x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, 
mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=3, src=()), + x20:=UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.SQRT, dtypes.float, arg=None, src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( - x23:=ast_const(dtypes.float, 1.0, st_src=( - UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), + x23:=UOp(Ops.CONST, dtypes.float, arg=1.0, src=( + x17,)), UOp(Ops.RECIP, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( x23, - ast_const(dtypes.float, 1e-05, st_src=( - UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( + x17,)),)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - x16,)),)),)) + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=4, src=()), + x20,)),)), + x16,)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0)] _test_overflow(ast, opts) @@ -66,15 +65,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_2(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()), 
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(512, 1, 64, 32, 32, 1, 1, 1), strides=(65536, 0, 1024, 32, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16777216), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 512, 1, 32, 4, 34, 4, 34), strides=(0, 32768, 0, 1024, 0, 32, 0, 1), offset=-33, mask=((0, 1), (0, 512), (0, 1), (0, 32), (0, 4), (1, 33), (0, 4), (1, 33)), contiguous=False), View(shape=(512, 1, 64, 32, 32, 32, 3, 3), strides=(591872, 0, 0, 136, 1, 18496, 4760, 35), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(18432), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(512, 1, 64, 32, 32, 32, 3, 3), strides=(0, 0, 288, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)] _test_overflow(ast, opts) @@ -83,15 +82,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_3(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(16, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, 
arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 16, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 16), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(16, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(16, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=2)] _test_overflow(ast, opts) @@ -100,15 +99,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_4(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(8388608), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(4, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + 
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(8388608), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 4, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 4), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(4, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(4, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=4)] _test_overflow(ast, opts) @@ -117,15 +116,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_5(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(2, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 2, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), 
offset=-129, mask=((0, 1), (0, 2), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(2, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(2, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.LOCAL, axis=2, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=2)] _test_overflow(ast, opts) @@ -134,15 +133,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_6(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 3, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 3), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 
131), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)] _test_overflow(ast, opts) @@ -151,15 +150,15 @@ class TestLinearizerOverflow(unittest.TestCase): def test_overflow_7(self): ast = UOp(Ops.SINK, None, arg=None, src=( UOp(Ops.STORE, None, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=0, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=1, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 3, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 3), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), + UOp(Ops.DEFINE_GLOBAL, 
dtypes.float.ptr(147456), arg=2, src=()), UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=4)] _test_overflow(ast, opts) @@ -174,8 +173,8 @@ class TestLinearizerOverflowAlt(unittest.TestCase): View(shape=(BS, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))).to_uop() in_st_2 = ShapeTracker(views=(View(shape=(BS, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)).to_uop() ot_st = ShapeTracker(views=(View(shape=(BS, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)).to_uop() - prod = UOp(Ops.LOAD, dtypes.float, (g1, in_st_1)) * UOp(Ops.LOAD, dtypes.float, (g2, in_st_2)) - store = UOp(Ops.STORE, src=(g0, ot_st, UOp(Ops.REDUCE_AXIS, dtypes.float, (prod,), (Ops.ADD, (7, 6, 5))))) + prod = UOp(Ops.LOAD, dtypes.float, (g1.view(in_st_1.arg),)) * UOp(Ops.LOAD, dtypes.float, (g2.view(in_st_2.arg),)) + store = UOp(Ops.STORE, src=(g0.view(ot_st.arg), UOp(Ops.REDUCE_AXIS, dtypes.float, (prod,), (Ops.ADD, (7, 6, 5))))) ast = UOp(Ops.SINK, src=(store,)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=2)] _test_overflow(ast, opts) @@ -186,8 +185,8 @@ class TestLinearizerOverflowAlt(unittest.TestCase): View(shape=(BS, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))).to_uop() in_st_2 = ShapeTracker(views=(View(shape=(BS, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), 
offset=0, mask=None, contiguous=False),)).to_uop() ot_st = ShapeTracker(views=(View(shape=(BS, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)).to_uop() - prod = UOp(Ops.LOAD, dtypes.float, (g1, in_st_1)) * UOp(Ops.LOAD, dtypes.float, (g2, in_st_2)) - store = UOp(Ops.STORE, src=(g0, ot_st, UOp(Ops.REDUCE_AXIS, dtypes.float, (prod,), (Ops.ADD, (7, 6, 5))))) + prod = UOp(Ops.LOAD, dtypes.float, (g1.view(in_st_1.arg),)) * UOp(Ops.LOAD, dtypes.float, (g2.view(in_st_2.arg),)) + store = UOp(Ops.STORE, src=(g0.view(ot_st.arg), UOp(Ops.REDUCE_AXIS, dtypes.float, (prod,), (Ops.ADD, (7, 6, 5))))) ast = UOp(Ops.SINK, src=(store,)) opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=16), Opt(op=OptOps.UPCAST, axis=4, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=5, arg=2)] _test_overflow(ast, opts) diff --git a/tinygrad_repo/test/test_multitensor.py b/tinygrad_repo/test/test_multitensor.py index 87ed97a..0f6f16b 100644 --- a/tinygrad_repo/test/test_multitensor.py +++ b/tinygrad_repo/test/test_multitensor.py @@ -1,13 +1,12 @@ -import unittest, functools, random -from typing import List +import unittest, functools, random, os from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable +from tinygrad.device import is_dtype_supported from tinygrad.uop.ops import Ops, UOp from tinygrad.helpers import CI, getenv, prod, Context, OSX from tinygrad.nn.state import get_parameters, get_state_dict from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule import numpy as np from hypothesis import given, strategies as strat, settings -from tinygrad.device import is_dtype_supported from test.helpers import REAL_DEV, not_support_multi_device settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False)) @@ -30,7 +29,7 @@ N = 128 
def _test_allreduce(t:Tensor): aa = (t[0:64] + t[64:128] + t[128:192] + t[192:256]).repeat([4,1]).realize() ts = t.shard(devices_4, 0).realize() - b = Tensor(UOp.allreduce(ts.lazydata, Ops.ADD, ts.device)) + b = Tensor(UOp.allreduce(ts.uop, Ops.ADD, ts.device)) b.realize() return aa, b @@ -51,7 +50,7 @@ class TestMultiTensor(unittest.TestCase): def test_shard(self): X = Tensor.ones(256).contiguous().realize() X.shard_(devices_2, 0) - for lb in X.lazydata.src: + for lb in X.uop.src: assert lb.shape == (128,) (X + X).realize() @@ -62,18 +61,20 @@ class TestMultiTensor(unittest.TestCase): def test_tensor_from_multi(self): X = Tensor([1, 2], dtype=dtypes.int).shard_(devices_2, 0) - Y = Tensor(X.lazydata) + Y = Tensor(X.uop) self.assertEqual(Y.device, Device.DEFAULT) np.testing.assert_equal(X.numpy(), Y.numpy()) with self.assertRaises(AssertionError): - _ = Tensor(X.lazydata, dtype=dtypes.float) + _ = Tensor(X.uop, dtype=dtypes.float) def test_sharded_arange(self): sharded_arange = Tensor.arange(1000).shard(devices_2, 0) sharded_arange.realize() np.testing.assert_equal(sharded_arange.numpy(), np.arange(1000)) + # TODO: fix this to not copy on the src device + @unittest.expectedFailure def test_shard_no_recompile(self): X = Tensor.ones(256).contiguous().realize() X.shard_(devices_2, 0) @@ -83,7 +84,7 @@ class TestMultiTensor(unittest.TestCase): for si, ei in lower_schedule(sched): if isinstance(ei.prg, CompiledRunner): names.append(ei.prg.p.name) ei.run() - self.assertEqual(len(set(names)), 1), "function was relinearized" + self.assertEqual(len(set(names)), 1, "function was relinearized") @unittest.skip("this doesn't fold because shard_ calls contiguous on all lbs") def test_sharded_memory(self): @@ -171,9 +172,9 @@ class TestMultiTensor(unittest.TestCase): for i in range(2): xt = X[i*2:i*2+2].contiguous() sched = xt.schedule() - kernels = [s for s in sched if s.ast.op is Ops.SINK] - self.assertEqual(len(kernels), 1) - self.assertEqual(kernels[0].bufs[0].device, 
devices_2[i]) + #kernels = [s for s in sched if s.ast.op is Ops.SINK] + #self.assertEqual(len(kernels), 1) + #self.assertEqual(kernels[0].bufs[0].device, devices_2[i]) run_schedule(sched) np.testing.assert_equal(xt.numpy(), X_np[i*2:i*2+2]) @@ -246,9 +247,9 @@ class TestMultiTensor(unittest.TestCase): shape = tuple([(n if i == 0 else 1) * random.randint(1, 10) for i in range(random.randint(1, 4))]) t = Tensor.rand(shape).shard_(tuple([d0, d1, d2, d3][:n]), 0) with Context(RING=0): - a = Tensor(UOp.allreduce(t.lazydata, Ops.ADD, t.device)) + a = Tensor(UOp.allreduce(t.uop, Ops.ADD, t.device)) with Context(RING=2): - b = Tensor(UOp.allreduce(t.lazydata, Ops.ADD, t.device)) + b = Tensor(UOp.allreduce(t.uop, Ops.ADD, t.device)) diff = a - b mean_err = diff.reshape((prod(diff.shape),)).abs().mean().numpy() max_err = diff.reshape((prod(diff.shape),)).abs().max().numpy() @@ -372,7 +373,7 @@ class TestMultiTensor(unittest.TestCase): np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6) # NOTE: this is failing on LLVM CI, no idea why. Works locally. 
- @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM"), "slow") + @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU") @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers") def test_data_parallel_resnet(self): from extra.models.resnet import ResNet18 @@ -575,12 +576,9 @@ class TestMultiTensor(unittest.TestCase): scheds = [sched for sched in out.schedule() if sched.bufs[0].device in devices_4 and sched.ast.op is not Ops.COPY] assert set(sched.bufs[0].device for sched in scheds) == set(devices_4), "should have ast on each shard device" asts = [sched.ast for sched in scheds] - assert len(asts) - # test case to show that ast can be different on devices - # TODO: make ast identical on devices - #assert len(set(asts)) == 4, len(asts) - # for i, ast in enumerate(asts): - # print(f"{i} {ast}") + self.assertEqual(len(asts), 4) + # ast are the same on devices + self.assertEqual(len(set(asts)), 1) def test_reshape_on_axis(self): t0 = Tensor.rand((26, 15, 7)).shard(devices_3, axis=1) @@ -592,21 +590,21 @@ class TestMultiTensor(unittest.TestCase): t4 = t2.reshape((26, 105,)) for t in [t0, t1, t2, t3, t4]: - assert t.lazydata.axis == 1 + assert t.uop.axis == 1 np.testing.assert_allclose(t.numpy().flatten(), t0.numpy().flatten()) # test shape-one axis t5 = t4.reshape((26, 1, 105)) - assert t5.lazydata.axis == 2 + assert t5.uop.axis == 2 np.testing.assert_allclose(t.numpy().flatten(), t5.numpy().flatten()) # test split and rejoin to the right and reshape to the left t5 = t0.reshape((2, 13, 3, 5, 7)) t6 = t0.reshape((13, 2, 3, 7, 5)) t7 = t0.reshape((1, 13, 2, 3, 1, 7, 5)) - assert t5.lazydata.axis == 2 - assert t6.lazydata.axis == 2 - assert t7.lazydata.axis == 3 + assert t5.uop.axis == 2 + assert t6.uop.axis == 2 + assert t7.uop.axis == 3 np.testing.assert_allclose(t5.numpy().flatten(), t0.numpy().flatten()) np.testing.assert_allclose(t6.numpy().flatten(), 
t0.numpy().flatten()) np.testing.assert_allclose(t7.numpy().flatten(), t0.numpy().flatten()) @@ -618,7 +616,7 @@ class TestMultiTensor(unittest.TestCase): @unittest.skip("no longer supports uneven shard") def test_reshape_on_axis_uneven(self): def reshape_helper(t0, t, t_axis): - assert t.lazydata.axis == t_axis + assert t.uop.axis == t_axis np.testing.assert_allclose(t0.reshape(t.shape).numpy(), t.numpy()) t0 = Tensor.rand((4, 42, 15)).shard(devices_3, axis=1, splits=[14, 7, 21]) @@ -689,24 +687,24 @@ class TestMultiTensor(unittest.TestCase): self.assertEqual(t.shape, t2.shape) self.assertEqual(t.device, t2.device) self.assertEqual(t.dtype, t2.dtype) - self.assertEqual(t.lazydata.axis, t2.lazydata.axis) + self.assertEqual(t.uop.axis, t2.uop.axis) def test_rand_like_from_alu(self): a = Tensor.ones(4, 4).shard(devices_4, axis=0) aa = a + a self.assertEqual(aa.device, devices_4) - self.assertEqual(aa.lazydata.axis, 0) + self.assertEqual(aa.uop.axis, 0) raa = aa.rand_like() self.assertEqual(raa.device, devices_4) - self.assertEqual(raa.lazydata.axis, 0) + self.assertEqual(raa.uop.axis, 0) b = Tensor.empty(4, 4).shard(devices_4, axis=None) ab = a + b self.assertEqual(ab.device, devices_4) - self.assertEqual(ab.lazydata.axis, 0) + self.assertEqual(ab.uop.axis, 0) rab = ab.rand_like() self.assertEqual(rab.device, devices_4) - self.assertEqual(rab.lazydata.axis, 0) + self.assertEqual(rab.uop.axis, 0) @unittest.skip("no longer supports uneven shard") def test_rand_like_uneven_shard(self): @@ -715,8 +713,8 @@ class TestMultiTensor(unittest.TestCase): self.assertEqual(t.shape, t2.shape) self.assertEqual(t.device, t2.device) self.assertEqual(t.dtype, t2.dtype) - self.assertEqual(t.lazydata.axis, t2.lazydata.axis) - assert all(tlb.shape == t2lb.shape for tlb, t2lb in zip(t.lazydata.src, t2.lazydata.src)) + self.assertEqual(t.uop.axis, t2.uop.axis) + assert all(tlb.shape == t2lb.shape for tlb, t2lb in zip(t.uop.src, t2.uop.src)) def test_rand_like_none_shard(self): t = 
Tensor.empty((16, 16)).shard(devices_2) @@ -724,7 +722,7 @@ class TestMultiTensor(unittest.TestCase): self.assertEqual(t.shape, t2.shape) self.assertEqual(t.device, t2.device) self.assertEqual(t.dtype, t2.dtype) - self.assertEqual(t.lazydata.axis, t2.lazydata.axis) + self.assertEqual(t.uop.axis, t2.uop.axis) def test_rand_like_arg_dtype(self): t = Tensor.empty((16, 16), dtype=dtypes.int32).shard(devices_2, axis=1) @@ -768,38 +766,12 @@ class TestMultiTensor(unittest.TestCase): assert set(unique) == {0, 2}, unique assert 100 < counts[0] < 156, counts[0] - @unittest.skip("test depends on UOp order. TODO: fix it") - def test_broadcast_const(self): - for axis in (None, 0, 1): - t = Tensor.zeros(16, 16).contiguous().shard(devices_4, axis).realize() - t = t + 1 - for si in t.schedule(): - ast = si.ast.src[0] - assert ast.op is Ops.STORE - assert ast.src[2].op is Ops.ADD - assert ast.src[2].src[0].op is Ops.LOAD - assert ast.src[2].src[1].src[1].op is Ops.CONST and ast.src[2].src[1].src[1].arg == 1 - t = 2 * t - for si in t.schedule(): - ast = si.ast.src[0] - assert ast.op is Ops.STORE - assert ast.src[2].op is Ops.MUL - assert ast.src[2].src[0].src[1].op is Ops.CONST and ast.src[2].src[0].src[1].arg == 2 - assert ast.src[2].src[1].op is Ops.LOAD - t = t + t.full_like(3) - for si in t.schedule(): - ast = si.ast.src[0] - assert ast.op is Ops.STORE - assert ast.src[2].op is Ops.ADD - assert ast.src[2].src[0].op is Ops.LOAD - assert ast.src[2].src[1].src[1].op is Ops.CONST and ast.src[2].src[1].src[1].arg == 3 - @unittest.skip("TODO: this requires forced_realize to be deleted.") def test_shard_memory(self): devices = (d0, d1, d2, d3) t = Tensor.zeros(16, 16).contiguous() t.shard_(devices, axis=0).realize() - assert all([lb is lb.base and lb.realized.base.size == 4 * 16 for lb in t.lazydata.src]) + assert all([lb is lb.base and lb.realized.base.size == 4 * 16 for lb in t.uop.src]) @unittest.skip("this is unreliable on OSX") def test_clone(self): @@ -809,28 +781,15 @@ class 
TestMultiTensor(unittest.TestCase): t = Tensor.rand(16, 16).shard(devices_2, axis=0) np.testing.assert_allclose(t.numpy(), t.clone().numpy()) - @unittest.skip("this test looks wrong, times 0 is 0") def test_multi_const_folding(self): with Context(TRACK_MATCH_STATS=0): a = Tensor.arange(3).realize() zeros = Tensor.zeros(3).realize() b = a.to(devices_2)*zeros.to(devices_2) sched = b.schedule() - self.assertEqual(len(sched), 6) - # notably, only two copies (for the arange) - vs 4 copies if we didn't fold the const copy - self.assertEqual(len([x for x in sched if any(u.op is Ops.COPY for u in x.ast.toposort())]), 2) - run_schedule(sched) + self.assertEqual(len(sched), 0) self.assertListEqual(b.tolist(), [0, 0, 0]) - @unittest.skip("not sure what this tests") - def test_dont_realize_intermediate_expand(self): - a = Tensor.empty(16, 1).shard_(devices_2, axis=0) - b = Tensor.empty(16, 16).to_(devices_2) - c = Tensor.empty(16, 16).shard_(devices_2, axis=1) - d = a+b - (d*c).realize() - assert not d.lazydata.is_realized - @unittest.skipIf(not_support_multi_device(), "no multi") class TestHandleData(unittest.TestCase): def test_copied_to_device(self): @@ -875,19 +834,6 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): a.schedule() assert a.shape == (2, 8) - # real is no longer used, so these are on None and we can pad them however - """ - with self.assertRaises(AssertionError): - # cannot pad sharded and non-sharded axis at the same time - p = a.pad(((0, 6), (0, 1))) - p.schedule() - - with self.assertRaises(AssertionError): - # can only pad to whole axis - p = a.pad(((1, 5), (0, 0))) - p.schedule() - """ - p = a.pad(((0, 6), (0, 0))) p.schedule() assert p.shape == (8, 8) @@ -956,7 +902,7 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): np.testing.assert_equal((a+a).numpy(), na+na) np.testing.assert_equal((b+b).numpy(), nb+nb) - @unittest.skip("why didn't this work?") + # @unittest.skip("why didn't this work?") def test_add_two_partitions(self): t = 
Tensor.arange(64).reshape(8, 8).contiguous().realize() t.shard_([f"{Device.DEFAULT}:{i}" for i in range(4)], axis=0) @@ -967,16 +913,9 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): nb = t.numpy()[6:8] np.testing.assert_equal(a.numpy(), na) np.testing.assert_equal(b.numpy(), nb) - self.assertEqual(a.lazydata.real, (False, True, False, False)) - self.assertEqual(b.lazydata.real, (False, False, False, True)) - with self.assertRaises(AssertionError): - # cannot add directly - c = a + b - c.schedule() - + np.testing.assert_equal((a+b).numpy(), na+nb) c = a.pad(((2, 4), None)) + b.pad(((6, 0), None)) c.realize() - self.assertEqual(c.lazydata.real, (True, True, True, True)) expected = np.concatenate([np.zeros_like(t.numpy()[0:2]), na, np.zeros_like(t.numpy()[4:6]), nb]) np.testing.assert_equal(c.numpy(), expected) @@ -988,8 +927,8 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase): for i in range(len(devices)): to_add.append((Tensor.ones(2, 8) * i).shard(devices)) - added:List[Tensor] = [] - for bound, a in zip(x.lazydata.bounds, to_add): + added:list[Tensor] = [] + for bound, a in zip(x.uop.bounds, to_add): added.append(x[bound[0]:bound[1]] + a) output = added[0].cat(*added[1:]) @@ -1032,7 +971,7 @@ class TestBatchNorm(unittest.TestCase): class BatchNorm: def __init__(self, num_features): - self.bns:List[nn.BatchNorm2d] = [] + self.bns:list[nn.BatchNorm2d] = [] for _ in GPUS: bn = nn.BatchNorm2d(num_features, track_running_stats=False, eps=1e-12, momentum=0.85, affine=True) self.bns.append(bn) @@ -1104,7 +1043,7 @@ class TestBatchNorm(unittest.TestCase): bns.append(bn) bn_ts = [] - for bound, bn in zip(x.lazydata.bounds, bns): + for bound, bn in zip(x.uop.bounds, bns): bni = bn(x[bound[0]:bound[1]]) bn_ts.append(bni) @@ -1186,14 +1125,144 @@ class TestMultiRamUsage(unittest.TestCase): # NOTE: the first one on the DEFAULT device should be freed self.assertUsed(self.N*self.N*4*2) - @unittest.skip("TODO: this is broken") - def 
test_zeros_shard(self): - _ = Tensor.zeros(self.N, self.N).contiguous().shard(devices_2, axis=0).realize() + def test_zeros_shard(self, devices=(d1, d2)): + _ = Tensor.zeros(self.N, self.N).contiguous().shard(devices, axis=0).realize() + assert int(os.getenv("VIZ", "0")) == 0 self.assertUsed(self.N*self.N*4) # sharding should not increase total ram usage + def test_zeros_shard_self(self): self.test_zeros_shard((d0, d1)) def test_zeros_contiguous_shard(self): _ = Tensor.zeros(self.N, self.N).contiguous().shard(devices_2, axis=0).contiguous().realize() + assert int(os.getenv("VIZ", "0")) == 0 self.assertUsed(self.N*self.N*4) # sharding should not increase total ram usage +@unittest.skipIf(not_support_multi_device(), "need multi") +class TestMultiFromUnrenderable(unittest.TestCase): + def test_from_npy(self): + t = Tensor(np.arange(100, dtype=np.uint32)) + ll = t.shard((d0, d1), axis=0) + 1 + np.testing.assert_equal(ll.numpy(), np.arange(100)+1) + +@unittest.skipIf(not_support_multi_device(), "need multi") +class TestMultiAssign(unittest.TestCase): + device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2)) + + def test_multi_assign_realized(self): + out = Tensor.zeros(4).shard(self.device, 0).contiguous().realize() + ones = Tensor.ones(4).shard(self.device, 0).contiguous().realize() + out.assign(ones).realize() + self.assertListEqual(out.tolist(), [1,1,1,1]) + + def test_multi_assign_unrealized(self): + out = Tensor.zeros(4).contiguous().realize().shard(self.device, 0) + ones = Tensor.ones(4).shard(self.device, 0).contiguous().realize() + out.assign(ones).realize() + self.assertListEqual(out.tolist(), [1,1,1,1]) + + def test_multi_assign_both_unrealized(self): + out = Tensor.zeros(4).contiguous().realize().shard(self.device, 0) + ones = Tensor.ones(4).contiguous().realize().shard(self.device, 0) + out.assign(ones).realize() + self.assertListEqual(out.tolist(), [1,1,1,1]) + + def test_multi_assign_piece(self): + out = Tensor.zeros(4,4).shard(self.device, 
0).contiguous().realize() + ones = Tensor.ones(4,1).shard(self.device, 0).contiguous().realize() + out[:, 2:3].assign(ones).realize() + self.assertListEqual(out.tolist(), [[0,0,1,0], [0,0,1,0], [0,0,1,0], [0,0,1,0]]) + + def test_multi_assign_piece_noncontig(self): + out = Tensor.zeros(4,4).contiguous().realize().shard(self.device, 0).realize() + ones = Tensor.ones(4,1).shard(self.device, 0).contiguous().realize() + out[:, 2:3].assign(ones).realize() + self.assertListEqual(out.tolist(), [[0,0,1,0], [0,0,1,0], [0,0,1,0], [0,0,1,0]]) + + @unittest.expectedFailure + def test_multi_assign_piece_unrealized(self): + out = Tensor.zeros(4,4).contiguous().realize().shard(self.device, 0) + ones = Tensor.ones(4,1).shard(self.device, 0).contiguous().realize() + out[:, 2:3].assign(ones).realize() + self.assertListEqual(out.tolist(), [[0,0,1,0], [0,0,1,0], [0,0,1,0], [0,0,1,0]]) + + def test_multi_assign_var_offset(self): + out = Tensor.zeros(4,4).contiguous().realize().shard(self.device, 0).realize() + ones = Tensor.ones(4,1).shard(self.device, 0).contiguous().realize() + vi = Variable("i", 0, 3).bind(2) + out[:, vi:vi+1].assign(ones).realize() + self.assertListEqual(out.tolist(), [[0,0,1,0], [0,0,1,0], [0,0,1,0], [0,0,1,0]]) + + def test_multi_assign_var_offset_jit_none(self): self.test_multi_assign_var_offset_jit(None) + def test_multi_assign_var_offset_jit(self, shard_axis=0): + out = Tensor.zeros(4,6).contiguous().realize().shard(self.device, shard_axis).realize() + ones = Tensor.ones(4,1).shard(self.device, shard_axis).contiguous().realize() + + @TinyJit + def f(out:Tensor, vi): + out[:, vi:vi+1].assign(ones).realize() + ones.assign(ones+1).realize() + + vi = Variable("i", 0, 5) + for i in range(1,5): + GlobalCounters.reset() + f(out, vi.bind(i)) + self.assertListEqual(out.tolist(), [[0,1,2,3,4,0]]*4) + +@unittest.skipIf(not_support_multi_device(), "need multi") +class TestMultiTransformer(unittest.TestCase): + def test_transformer(self): + device = 
tuple(f"{Device.DEFAULT}:{i}" for i in range(2)) + + from extra.models.llama import Transformer + args = {"dim": 32, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024, + "hidden_dim": 32, "max_context": 12} + real_model = Transformer(**args) + shard_model = Transformer(**args) + + # copy state + nn.state.load_state_dict(shard_model, nn.state.get_state_dict(real_model)) + + # shard + for k,v in nn.state.get_state_dict(shard_model).items(): + if 'scale' in k: v.shard_(device, axis=None) # from quantized + elif '.attention.' in k: v.shard_(device, axis=-1) + elif '.feed_forward.w1.' in k: v.shard_(device, axis=0) + elif '.feed_forward.w3.' in k: v.shard_(device, axis=0) + elif '.feed_forward.' in k: v.shard_(device, axis=-1) + elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0) + elif 'output.weight' in k: v.shard_(device, axis=0) + else: v.shard_(device, axis=None) + + last_tok = 0 + for i in range(10): + real_tok = real_model(Tensor([[last_tok]], device=Device.DEFAULT), i).item() + shard_tok = shard_model(Tensor([[last_tok]], device=device), i).item() + + # test kv cache + kv1 = real_model.layers[0].attention.cache_kv.numpy() + kv2 = shard_model.layers[0].attention.cache_kv.numpy() + #print(np.concatenate([kv1[:, :, :, :, 0:1], kv2[:, :, :, :, 0:1]], axis=4)) + np.testing.assert_allclose(kv1, kv2, atol=1e-5, rtol=1e-5, err_msg=f"issue at token {i}") + + # test token + self.assertEqual(real_tok, shard_tok, f"issue at token {i}") + last_tok = real_tok + + @unittest.skip("super slow") + def test_llama1b_full(self): + from tinygrad.helpers import fetch + fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir="llama3-1b-instruct") + model = fetch("https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q6_K.gguf", + "Llama-3.2-1B-Instruct-Q6_K.gguf", subdir="llama3-1b-instruct") + + device = 
tuple(f"{Device.DEFAULT}:{i}" for i in range(2)) + from examples.llama3 import build_transformer + real_model = build_transformer(model, model_size="1B", device=Device.DEFAULT) + shard_model = build_transformer(model, model_size="1B", device=device) + + last_tok = 0 + real_tok = real_model(Tensor([[last_tok]], device=Device.DEFAULT), 0) + shard_tok = shard_model(Tensor([[last_tok]], device=device), 0) + self.assertEqual(real_tok.item(), shard_tok.item()) + if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/test_nn.py b/tinygrad_repo/test/test_nn.py index f99b2d2..000e459 100755 --- a/tinygrad_repo/test/test_nn.py +++ b/tinygrad_repo/test/test_nn.py @@ -596,9 +596,9 @@ class TestNN(unittest.TestCase): # sharded model shards the state_dict self.assertEqual(layer.weight.device, devices) - self.assertEqual(layer.weight.lazydata.axis, 3) + self.assertEqual(layer.weight.uop.axis, 3) self.assertEqual(layer.bias.device, devices) - self.assertEqual(layer.bias.lazydata.axis, None) + self.assertEqual(layer.bias.uop.axis, None) np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy()) np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy()) @@ -634,9 +634,9 @@ class TestNN(unittest.TestCase): load_state_dict(layer, state_dict) self.assertEqual(layer.weight.device, devices) - self.assertEqual(layer.weight.lazydata.axis, 3) + self.assertEqual(layer.weight.uop.axis, 3) self.assertEqual(layer.bias.device, devices) - self.assertEqual(layer.bias.lazydata.axis, None) + self.assertEqual(layer.bias.uop.axis, None) np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy()) np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy()) @@ -658,9 +658,9 @@ class TestNN(unittest.TestCase): # NOTE: model and state_dict shard differently, use the state_dict sharding # TODO: revisit this? 
self.assertEqual(layer.weight.device, devices) - self.assertEqual(layer.weight.lazydata.axis, None) + self.assertEqual(layer.weight.uop.axis, None) self.assertEqual(layer.bias.device, devices5) - self.assertEqual(layer.bias.lazydata.axis, 0) + self.assertEqual(layer.bias.uop.axis, 0) np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy()) np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy()) diff --git a/tinygrad_repo/test/test_ops.py b/tinygrad_repo/test/test_ops.py index f69f022..f3a887c 100644 --- a/tinygrad_repo/test/test_ops.py +++ b/tinygrad_repo/test/test_ops.py @@ -671,7 +671,10 @@ class TestOps(unittest.TestCase): helper_test_op([()], lambda x: x**2.0) helper_test_op([()], lambda x: 2.0**x) helper_test_op(None, lambda x: 0**x, vals=[[-2.,-1,0,1,2,3]]) + helper_test_op(None, lambda x: 0.7**x, vals=[[-2.,-1,0,1,2,3]]) helper_test_op(None, lambda x: (-2)**x, vals=[[-2.,-1,0,1,2,3]]) + # float to power of int + helper_test_op(None, lambda x: 0.7**x, vals=[[-2,-1,0,1,2,3]], forward_only=True) def test_pow_const_direct(self): # x ** c @@ -1090,8 +1093,8 @@ class TestOps(unittest.TestCase): def test_sort(self): for dim in [-1, 0, 1]: for descending in [True, False]: - helper_test_op([(8,45,65)], lambda x: x.sort(dim, descending).values, lambda x: x.sort(dim, descending)[0], forward_only=True) - helper_test_op([(8,45,65)], lambda x: x.sort(dim, descending).indices.type(torch.int32), lambda x: x.sort(dim, descending)[1], + helper_test_op([(8,8,6)], lambda x: x.sort(dim, descending).values, lambda x: x.sort(dim, descending)[0], forward_only=True) + helper_test_op([(8,8,6)], lambda x: x.sort(dim, descending).indices.type(torch.int32), lambda x: x.sort(dim, descending)[1], forward_only=True) # repeated values helper_test_op(None, lambda x: x.sort(stable=True).values, lambda x: x.sort()[0], forward_only=True, vals=[[0, 1] * 9]) @@ -1107,12 +1110,12 @@ class TestOps(unittest.TestCase): for dim in [0, 1, -1]: for largest in 
[True, False]: for sorted_ in [True]: # TODO support False - helper_test_op([(10,20,30)], - lambda x: x.topk(5, dim, largest, sorted_).values, - lambda x: x.topk(5, dim, largest, sorted_)[0], forward_only=True) - helper_test_op([(10,20,30)], - lambda x: x.topk(5, dim, largest, sorted_).indices.type(torch.int32), - lambda x: x.topk(5, dim, largest, sorted_)[1], forward_only=True) + helper_test_op([(6,5,4)], + lambda x: x.topk(4, dim, largest, sorted_).values, + lambda x: x.topk(4, dim, largest, sorted_)[0], forward_only=True) + helper_test_op([(5,5,4)], + lambda x: x.topk(4, dim, largest, sorted_).indices.type(torch.int32), + lambda x: x.topk(4, dim, largest, sorted_)[1], forward_only=True) # repeated values value, indices = Tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]).topk(3) np.testing.assert_equal(value.numpy(), [1, 1, 1]) @@ -1857,6 +1860,11 @@ class TestOps(unittest.TestCase): def test_view(self): helper_test_op([(4,3,6,6)], lambda x: x.view((12,6,6))) helper_test_op([(4,3,6,6)], lambda x: x.view((-1,3,6,6))) + helper_test_op([(6,)], lambda x: x.view(2, 3)) + helper_test_op([(6,1)], lambda x: x.view([2, 3])) + helper_test_op([(1,6)], lambda x: x.view((3, 2))) + helper_test_op([(3,2)], lambda x: x.view((2, 3))) + helper_test_op([(3,2)], lambda x: x.view(6)) def test_flip(self): helper_test_op([(4,3,6,6)], lambda x: x.flip((0,))) @@ -1973,106 +1981,106 @@ class TestOps(unittest.TestCase): def test_simple_conv2d(self): helper_test_op([(1,4,9,9), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv2d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w), + lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5) def test_simple_conv2d_bias(self): helper_test_op([(1,4,9,9), (4,4,3,3), (4,)], - lambda x,w,b: torch.nn.functional.conv2d(x,w,b).relu(), - lambda x,w,b: Tensor.conv2d(x,w,b).relu(), grad_rtol=1e-5) + lambda x,w,b: torch.nn.functional.conv2d(x,w,b), + lambda x,w,b: Tensor.conv2d(x,w,b), 
grad_rtol=1e-5) @unittest.skipIf(IMAGE>0, "no conv3d on images") def test_simple_conv3d(self): helper_test_op([(1,4,9,9,9), (4,4,3,3,3)], - lambda x,w: torch.nn.functional.conv3d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv3d(x,w), + lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5) @unittest.skipIf(IMAGE>0, "no conv3d on images") def test_padded_conv3d(self): helper_test_op([(1,4,5,5,5), (4,4,3,3,3)], - lambda x,w: torch.nn.functional.conv3d(x,w,padding=1).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=[1,1,1,1,1,1]).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv3d(x,w,padding=1), + lambda x,w: Tensor.conv2d(x,w,padding=[1,1,1,1,1,1]), grad_rtol=1e-5) def test_simple_conv2d_m4(self): helper_test_op([(1,16,18,18), (16,16,3,3)], - lambda x,w: torch.nn.functional.conv2d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w), + lambda x,w: Tensor.conv2d(x,w), atol=1e-05, grad_rtol=1e-5) def test_simple_conv2d_1x1(self): helper_test_op([(1,4,9,9), (4,4,1,1)], - lambda x,w: torch.nn.functional.conv2d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w), + lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5) def test_simple_conv2d_1x1_m4(self): helper_test_op([(1,16,32,32), (16,16,1,1)], - lambda x,w: torch.nn.functional.conv2d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w), + lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5) def test_nested_conv2d(self): helper_test_op([(1,32,9,9), (32,32,3,3), (32,32,3,3)], - lambda x,w1,w2: torch.nn.functional.conv2d(torch.nn.functional.conv2d(x,w1).relu(), w2).relu(), - lambda x,w1,w2: x.conv2d(w1).relu().conv2d(w2).relu()) + lambda x,w1,w2: torch.nn.functional.conv2d(torch.nn.functional.conv2d(x,w1).relu(), w2), + lambda x,w1,w2: x.conv2d(w1).relu().conv2d(w2)) # expect 
reduce nodes == 3 def test_simple_conv2d_nhwc(self): # weights (from tf): filter_height x filter_width x in_channels x out_channels helper_test_op([(2,9,9,10), (3,3,10,20)], - lambda x,w: torch.nn.functional.conv2d(x.permute(0,3,1,2),w.permute(3,2,0,1)).relu(), - lambda x,w: Tensor.conv2d(x.permute(0,3,1,2),w.permute(3,2,0,1)).relu(), atol=1e-5, grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x.permute(0,3,1,2),w.permute(3,2,0,1)), + lambda x,w: Tensor.conv2d(x.permute(0,3,1,2),w.permute(3,2,0,1)), atol=1e-5, grad_rtol=1e-5) def test_simple_conv2d_batched(self): helper_test_op([(2,4,9,9), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv2d(x,w).relu(), - lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w), + lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5) # conv transpose def test_simple_conv_transpose2d(self): helper_test_op([(2,4,9,9), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv_transpose2d(x,w).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose2d(x,w), + lambda x,w: Tensor.conv_transpose2d(x,w), grad_rtol=1e-5) def test_bias_conv_transpose2d(self): helper_test_op([(2,4,9,9), (4,4,3,3), (4,)], - lambda x,w,b: torch.nn.functional.conv_transpose2d(x,w,b).relu(), - lambda x,w,b: Tensor.conv_transpose2d(x,w,b).relu(), grad_rtol=1e-5) + lambda x,w,b: torch.nn.functional.conv_transpose2d(x,w,b), + lambda x,w,b: Tensor.conv_transpose2d(x,w,b), grad_rtol=1e-5) def test_grouped_conv_transpose2d(self): helper_test_op([(2,4,9,9), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv_transpose2d(x,w,groups=2).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w,groups=2).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose2d(x,w,groups=2), + lambda x,w: Tensor.conv_transpose2d(x,w,groups=2), grad_rtol=1e-5) def test_padded_conv_transpose2d(self): for padding in [(1,2), (2,1), 2, 1, 0]: helper_test_op([(2,4,9,9), (4,4,3,3)], 
- lambda x,w: torch.nn.functional.conv_transpose2d(x,w,padding=padding).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w,padding=padding).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose2d(x,w,padding=padding), + lambda x,w: Tensor.conv_transpose2d(x,w,padding=padding), grad_rtol=1e-5) self.helper_test_exception([(2,16,2,2), (32,16,3,3)], lambda x,w: torch.nn.functional.conv_transpose2d(x,w,padding=(1,1,1)), lambda x,w: Tensor.conv_transpose2d(x,w,padding=(1,1,1)), expected=(RuntimeError, ValueError)) def test_dilated_conv_transpose2d(self): for dilation in [(1,2), (2,1), 2, 1]: helper_test_op([(2,4,9,9), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv_transpose2d(x,w,dilation=dilation).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w,dilation=dilation).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose2d(x,w,dilation=dilation), + lambda x,w: Tensor.conv_transpose2d(x,w,dilation=dilation), grad_rtol=1e-5) def test_strided_conv_transpose2d(self): for stride in [(2,1), (1,2), 1]: helper_test_op([(2,4,4,5), (4,4,3,3)], - lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride).relu(), atol=1e-5, grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride), + lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5) def test_output_padded_conv_transpose2d(self): for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]: helper_test_op([(2,4,6,5), (4,4,3,3),(4,)], - lambda x,w,b: torch.nn.functional.conv_transpose2d(x,w,b,output_padding=output_padding,stride=stride).relu(), - lambda x,w,b: Tensor.conv_transpose2d(x,w,b,output_padding=output_padding,stride=stride).relu(), grad_rtol=1e-5) + lambda x,w,b: torch.nn.functional.conv_transpose2d(x,w,b,output_padding=output_padding,stride=stride), + lambda x,w,b: Tensor.conv_transpose2d(x,w,b,output_padding=output_padding,stride=stride), 
grad_rtol=1e-5) @unittest.skipIf(IMAGE>0, "no conv3d on images") def test_simple_conv_transpose3d(self): helper_test_op([(2,4,9,9,9), (4,4,3,3,3)], - lambda x,w: torch.nn.functional.conv_transpose3d(x,w).relu(), - lambda x,w: Tensor.conv_transpose2d(x,w).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv_transpose3d(x,w), + lambda x,w: Tensor.conv_transpose2d(x,w), grad_rtol=1e-5) @unittest.skipIf((IMAGE>0), "no conv1d on images") def test_conv1d(self): @@ -2082,8 +2090,8 @@ class TestOps(unittest.TestCase): for groups in [1,3] if cin == 3 and H == 5 else [1]: with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H): helper_test_op([(bs,cin,11), (6,cin//groups,H)], - lambda x,w: torch.nn.functional.conv1d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv1d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) @unittest.skipIf(IMAGE>0, "no conv1d on images") def test_simple_padding_conv1d(self): @@ -2093,15 +2101,15 @@ class TestOps(unittest.TestCase): H = 5 p = (1,1) helper_test_op([(bs,cin,11), (6,cin//groups,H)], - lambda x,w: torch.nn.functional.conv1d(torch.nn.functional.pad(x, p),w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=p).relu()) + lambda x,w: torch.nn.functional.conv1d(torch.nn.functional.pad(x, p),w), + lambda x,w: Tensor.conv2d(x,w,padding=p)) @unittest.skipIf(IMAGE>0, "no conv1d on images") def test_strided_conv1d_simple(self): bs, H = 2, 3 helper_test_op([(bs,1,5), (1,1,H)], - lambda x,w: torch.nn.functional.conv1d(x,w,stride=2).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=2).relu()) + lambda x,w: torch.nn.functional.conv1d(x,w,stride=2), + lambda x,w: Tensor.conv2d(x,w,stride=2)) @unittest.skipIf(IMAGE>0, "no conv1d on images") def test_asymmetric_padding_conv1d(self): @@ -2110,17 +2118,17 @@ class TestOps(unittest.TestCase): for n in [3,4]: for k in [2]: helper_test_op([(1,1,n), (1,1,k)], - lambda 
x,w: torch.nn.functional.conv1d(torch.nn.functional.pad(x, p),w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=p).relu()) + lambda x,w: torch.nn.functional.conv1d(torch.nn.functional.pad(x, p),w), + lambda x,w: Tensor.conv2d(x,w,padding=p)) def _test_conv2d(self, bs=1, cin=1, cout=6): - for H in [1,2,3]: - for W in [1,2,3,5]: + for H in [2,3]: + for W in [1,3,5]: for groups in [1,3] if cin == 3 and cout == 6 and H == 3 and W == 3 else [1]: with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W): helper_test_op([(bs,cin,5,7), (cout,cin//groups,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) def test_conv2d(self): self._test_conv2d(bs=1, cin=3) def test_conv2d_bs_4_cin_3(self): self._test_conv2d(bs=4, cin=3, cout=2) def test_conv2d_bs_1_cin_1(self): self._test_conv2d(bs=1, cin=1) @@ -2144,9 +2152,9 @@ class TestOps(unittest.TestCase): H = 5 W = 2 helper_test_op([(bs,cin,64,64), (6,cin//groups,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - # needed to relax tolerance on NVIDIA - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_atol=1e-4, grad_rtol=1e-4) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + # needed to relax tolerance for larger input + lambda x,w: Tensor.conv2d(x,w,groups=groups), atol=1e-4, grad_atol=3e-4, grad_rtol=1e-4) def test_simple_grouped_conv2d(self): bs = 1 @@ -2154,8 +2162,8 @@ class TestOps(unittest.TestCase): rcout = 1 cin = 2 helper_test_op([(bs,groups*cin,1,1), (groups*rcout,cin,1,1)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), 
grad_rtol=1e-5) def test_medium_grouped_conv2d(self): bs = 1 @@ -2163,8 +2171,8 @@ class TestOps(unittest.TestCase): rcout = 2 cin = 2 helper_test_op([(bs,groups*cin,1,1), (groups*rcout,cin,1,1)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) def test_depthwise_conv2d(self): bs = 1 @@ -2172,8 +2180,8 @@ class TestOps(unittest.TestCase): rcout = 1 cin = 1 helper_test_op([(bs,groups*cin,32,32), (groups*rcout,cin,1,1)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) def test_grouped_conv2d(self): bs = 4 @@ -2181,8 +2189,8 @@ class TestOps(unittest.TestCase): rcout = 7 cin = 3 helper_test_op([(bs,groups*cin,5,5), (groups*rcout,cin,3,3)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) def test_fancy_conv2d(self): bs = 2 @@ -2191,14 +2199,14 @@ class TestOps(unittest.TestCase): groups = 3 H,W = 3,3 helper_test_op([(bs,cin,11,28), (groups*cout,cin//groups,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), grad_rtol=1e-5) + lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups), + lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5) def test_strided_conv2d_simple(self): bs,H,W = 2,3,1 helper_test_op([(bs,1,5,1), (1,1,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=2).relu()) + lambda 
x,w: torch.nn.functional.conv2d(x,w,stride=2), + lambda x,w: Tensor.conv2d(x,w,stride=2)) @unittest.skipIf(Device.DEFAULT != "LLVM", "DEVECTORIZE=0 only for LLVM") def test_strided_conv2d_simple_vec(self): @@ -2210,27 +2218,27 @@ class TestOps(unittest.TestCase): H,W = 3,3 with self.subTest(stride := 2): helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=stride).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,stride=2), + lambda x,w: Tensor.conv2d(x,w,stride=stride)) with self.subTest(stride := (2,1)): helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride), + lambda x,w: Tensor.conv2d(x,w,stride=(2,1))) def test_negative_padding_conv2d(self): n,k = 10, 3 helper_test_op([(1,1,n,n), (1,1,k,k)], - lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:-1, 1:-1],w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=-1).relu()) + lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:-1, 1:-1],w), + lambda x,w: Tensor.conv2d(x,w,padding=-1)) helper_test_op([(1,1,n,n), (1,1,k,k)], - lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:, 1:],w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=(-1,0,-1,0)).relu()) + lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:, 1:],w), + lambda x,w: Tensor.conv2d(x,w,padding=(-1,0,-1,0))) def test_simple_padding_conv2d(self): p = (1,1,1,1) helper_test_op(None, - lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=p).relu(), vals=[[[[[2.,3.]]]], [[[[1.]]]]]) + lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w), + lambda x,w: Tensor.conv2d(x,w,padding=p), vals=[[[[[2.,3.]]]], [[[[1.]]]]]) def test_asymmetric_padding_conv2d(self): for p in [(0,1,0,1), (2,1,2,1), (2,0,2,1)]: @@ -2238,35 +2246,35 @@ 
class TestOps(unittest.TestCase): for n in [3,4]: for k in [2]: helper_test_op([(1,1,n,n), (1,1,k,k)], - lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=p).relu()) + lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w), + lambda x,w: Tensor.conv2d(x,w,padding=p)) helper_test_op([(1,1,n,n), (1,1,k,k)], - lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=p).relu()) + lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w), + lambda x,w: Tensor.conv2d(x,w,padding=p)) def test_padded_conv2d_p21(self): bs,cin,H,W,padding = 4, 3, 3, 3, (2,1) helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=padding).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding), + lambda x,w: Tensor.conv2d(x,w,padding=padding)) def test_padded_conv2d_p22(self): bs,cin,H,W,padding = 4, 3, 3, 3, (2,2) helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=padding).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding), + lambda x,w: Tensor.conv2d(x,w,padding=padding)) def test_padded_conv2d_1x1(self): bs,cin,H,W,padding = 4, 3, 1, 1, 2 helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=padding).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding), + lambda x,w: Tensor.conv2d(x,w,padding=padding)) def test_padded_conv2d_bs1(self): bs,cin,H,W,padding = 1, 3, 3, 3, 1 helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding).relu(), - lambda x,w: Tensor.conv2d(x,w,padding=padding).relu()) + lambda x,w: 
torch.nn.functional.conv2d(x,w,padding=padding), + lambda x,w: Tensor.conv2d(x,w,padding=padding)) def test_padding_add(self): helper_test_op([(64,64), (60,60)], @@ -2280,8 +2288,8 @@ class TestOps(unittest.TestCase): for d in [2, (2,1)]: with self.subTest(dilation := d): helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,dilation=dilation).relu(), - lambda x,w: Tensor.conv2d(x,w,dilation=dilation).relu()) + lambda x,w: torch.nn.functional.conv2d(x,w,dilation=dilation), + lambda x,w: Tensor.conv2d(x,w,dilation=dilation)) def test_max_pool2d_simple(self): ksz = (2,2) @@ -2292,7 +2300,7 @@ class TestOps(unittest.TestCase): def test_max_pool2d(self): for ksz in [(2,2), (3,3), 2, 3, (3,2), (5,5), (5,1)]: with self.subTest(kernel_size=ksz): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz), lambda x: Tensor.max_pool2d(x, kernel_size=ksz)) @@ -2300,7 +2308,7 @@ class TestOps(unittest.TestCase): for ksz in [(2,2), (3,3), 2, 3, (3,2)]: for p in [1, (1,0), (0,1)]: with self.subTest(kernel_size=ksz, padding=p): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz, padding=p), lambda x: Tensor.max_pool2d(x, kernel_size=ksz, padding=p)) self.helper_test_exception([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), padding=(1,1,1)), @@ -2316,40 +2324,40 @@ class TestOps(unittest.TestCase): def test_max_pool2d_padding_int(self): ksz = (2,2) - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x.int(), kernel_size=ksz, padding=1), lambda x: Tensor.max_pool2d(x.int(), kernel_size=ksz, padding=1), forward_only=True) def test_max_pool2d_bigger_stride(self): for stride in [(2,3), (3,2), 2, 3]: with self.subTest(stride=stride): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: 
torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride), lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride)) def test_max_pool2d_bigger_stride_dilation(self): for stride, dilation in zip([(2,3), (3,2), 2, 3, 4], [(3,2), (2,3), 2, 3, 6]): with self.subTest(stride=stride): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation), lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation)) @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this") def test_max_pool2d_unit_stride(self): - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=1)) def test_max_pool2d_smaller_stride(self): for stride in [(2,3), (3,2), 2, 3]: with self.subTest(stride=stride): - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=stride), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=stride)) def test_max_pool2d_dilation(self): for dilation in [(2, 3), (3, 2), 2, 3]: - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), dilation=dilation), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), dilation=dilation)) @@ -2533,13 +2541,13 @@ class TestOps(unittest.TestCase): def test_interpolate_nearest_exact(self): self.test_interpolate_nearest("nearest-exact") def test_interpolate_bilinear(self): - for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]: + for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]: helper_test_op([(2,3)+in_sz], lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear"), lambda x: Tensor.interpolate(x, size=out_sz, 
mode="linear"), atol=1e-4) def test_interpolate_bilinear_corners_aligned(self): - for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]: + for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]: helper_test_op([(2,3)+in_sz], lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear", align_corners=True), lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4) @@ -2830,7 +2838,7 @@ class TestOps(unittest.TestCase): b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False) a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False) for reduce in ("sum", "prod", "mean", "amin", "amax"): - for dim in (0,1,2,-1,-2,-3): + for dim in (-1,1,-3): helper_test_op([(4,5,6), (4,5,6)], lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce), lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True) diff --git a/tinygrad_repo/test/test_pickle.py b/tinygrad_repo/test/test_pickle.py index e9d88e1..93758ee 100644 --- a/tinygrad_repo/test/test_pickle.py +++ b/tinygrad_repo/test/test_pickle.py @@ -53,7 +53,7 @@ class TestPickle(unittest.TestCase): def test_pickle_realized_tensor_alt2(self): print("** init") t = Tensor.rand(10, 10).to("CPU").realize() - tensor_uop = t.lazydata + tensor_uop = t.uop assert tensor_uop.is_realized, f"expected {tensor_uop} to be realized" t_values = t.numpy() # pickle @@ -63,13 +63,13 @@ class TestPickle(unittest.TestCase): del tensor_uop print("** post pickle") t2:Tensor = pickle.loads(st) - assert t2.lazydata.is_realized, f"expected {t2.lazydata} to be realized" + assert t2.uop.is_realized, f"expected {t2.uop} to be realized" np.testing.assert_equal(t_values, t2.numpy()) # NOTE: currently Buffer exists on the uop, not tensor def test_pickle_buffer_uop(self): t = Tensor.arange(4).realize() - a = t.lazydata + a = t.uop assert a.op is Ops.BUFFER 
self.assertIsNotNone(buffer:=a.realized) s = pickle.dumps(a) @@ -98,12 +98,12 @@ class TestPickle(unittest.TestCase): def test_pickle_buffer_view(self): t = Tensor.arange(10, device="CPU").contiguous().realize() vt = t[3:5].contiguous().realize() - assert hasattr(vt.lazydata.buffer, 'base') + assert hasattr(vt.uop.buffer, 'base') ref_value = vt.tolist() st = pickle.dumps(vt) del t, vt vt2 = pickle.loads(st) - assert hasattr(vt2.lazydata.buffer, 'base') + assert hasattr(vt2.uop.buffer, 'base') assert ref_value == vt2.tolist() def test_pickle_numpy(self): diff --git a/tinygrad_repo/test/test_profiler.py b/tinygrad_repo/test/test_profiler.py index 989f79f..c64c271 100644 --- a/tinygrad_repo/test/test_profiler.py +++ b/tinygrad_repo/test/test_profiler.py @@ -36,12 +36,12 @@ class TestProfiler(unittest.TestCase): si = self.b.schedule()[-1] TestProfiler.runner = get_runner(TestProfiler.d0.device, si.ast) - TestProfiler.b.lazydata.buffer.allocate() + TestProfiler.b.uop.buffer.allocate() def test_profile_kernel_run(self): runner_name = TestProfiler.runner._prg.name with helper_collect_profile(TestProfiler.d0) as profile: - TestProfiler.runner([TestProfiler.b.lazydata.buffer, TestProfiler.a.lazydata.buffer], var_vals={}) + TestProfiler.runner([TestProfiler.b.uop.buffer, TestProfiler.a.uop.buffer], var_vals={}) profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)] @@ -66,7 +66,7 @@ class TestProfiler(unittest.TestCase): with helper_collect_profile(TestProfiler.d0) as profile: buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) - TestProfiler.runner([buf1, TestProfiler.a.lazydata.buffer], var_vals={}) + TestProfiler.runner([buf1, TestProfiler.a.uop.buffer], var_vals={}) buf1.copyout(memoryview(bytearray(buf1.nbytes))) profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) diff --git a/tinygrad_repo/test/test_quantize_onnx.py 
b/tinygrad_repo/test/test_quantize_onnx.py index c8e62a9..7626aad 100644 --- a/tinygrad_repo/test/test_quantize_onnx.py +++ b/tinygrad_repo/test/test_quantize_onnx.py @@ -67,12 +67,12 @@ def get_quantized_model(sz): class TestQuantizeOnnxCPU(unittest.TestCase): def test_quant_128(self, sz=128): try: - import onnx + import onnx # noqa: F401 # pylint: disable=unused-import except ImportError: raise unittest.SkipTest() - from tinygrad.frontend.onnx import OnnxRunner + from tinygrad.frontend.onnx import OnnxRunner, onnx_load out_file = get_quantized_model(sz) - onnx_model = onnx.load(out_file) + onnx_model = onnx_load(out_file) run_onnx = OnnxRunner(onnx_model) inp = Tensor(np.random.uniform(size=(sz, sz)).astype(np.float32)) with Context(DONT_REALIZE_EXPAND=1, QUANTIZE=1): @@ -243,8 +243,8 @@ class TestDSPCache(unittest.TestCase): # string becuase this breaks Python language server for syntax highlight for some reason ast = eval("""UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(25088), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 896, 32, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.VIEW, dtypes.uchar.ptr(25088), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 896, 32, 1, 0), offset=0, mask=None, contiguous=True),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(25088), arg=0, src=()),)), UOp(Ops.CAST, dtypes.uchar, arg=None, src=( UOp(Ops.XOR, dtypes.int, arg=None, src=( UOp(Ops.MAX, dtypes.int, arg=None, src=( @@ -261,23 +261,23 @@ class TestDSPCache(unittest.TestCase): UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.int, arg=None, src=( UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(150528), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 5376, 192, 0, 1), 
offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.uchar.ptr(150528), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 5376, 192, 0, 1), offset=0, mask=None, contiguous=False),)), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(150528), arg=1, src=()),)),)),)),)), UOp(Ops.CONST, dtypes.float, arg=0.012368360534310341, src=( x22:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.int, arg=None, src=( UOp(Ops.LOAD, dtypes.char, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.char.ptr(6144), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 48, 4), strides=(4, 128, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 192, 1), offset=0, mask=None, contiguous=False))), src=()),)),)),)), + UOp(Ops.VIEW, dtypes.char.ptr(6144), arg=ShapeTracker(views=(View(shape=(32, 48, 4), strides=(4, 128, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 192, 1), offset=0, mask=None, contiguous=False))), src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.char.ptr(6144), arg=2, src=()),)),)),)),)), UOp(Ops.CONST, dtypes.float, arg=0.007441135589033365, src=( x22,)),)),)),)), UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(32), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.VIEW, dtypes.int.ptr(32), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( + 
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(32), arg=3, src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=9.203465015161783e-05, src=( x36:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), UOp(Ops.CONST, dtypes.float, arg=33.812857328652136, src=( diff --git a/tinygrad_repo/test/test_randomness.py b/tinygrad_repo/test/test_randomness.py index 05012ec..435778e 100644 --- a/tinygrad_repo/test/test_randomness.py +++ b/tinygrad_repo/test/test_randomness.py @@ -136,9 +136,7 @@ class TestRandomness(unittest.TestCase): jr = np.array([0.9614430665969849, 0.059279561042785645, 0.01909029483795166, 0.47882091999053955, 0.9677121639251709, 0.36863112449645996, 0.3102607727050781, 0.06608951091766357, 0.35329878330230713, 0.26518797874450684], dtype=np.float32) r = Tensor.rand(10).numpy() - # TODO: this failed because increment happened before _threefry_random_bits - with self.assertRaises(AssertionError): - np.testing.assert_allclose(r, jr, atol=1e-5, rtol=1e-5) + np.testing.assert_allclose(r, jr, atol=1e-5, rtol=1e-5) @unittest.skipIf(not_support_multi_device(), "no multi") def test_threefry_tensors_cnt(self): @@ -260,7 +258,7 @@ class TestRandomness(unittest.TestCase): old_default_float = dtypes.default_float # low precision can result in inf from randn dtypes.default_float = default_float - t = Tensor.randn(1024, 1024) + t = Tensor.randn(256, 256) mx = t.max().numpy().item() mn = t.min().numpy().item() print(f"testing with {default_float=}") @@ -303,11 +301,11 @@ class TestRandomness(unittest.TestCase): lambda x: np.random.uniform(-1, 1, size=x) * math.sqrt(6 / (x[0] + math.prod(x[1:]))))) def test_kaiming_uniform(self): - for shape in [(256, 128, 3, 3), (80, 44), (3, 55, 35)]: + for shape in [(32, 128, 3, 3), (80, 44), (3, 55, 35)]: self.assertTrue(equal_distribution(Tensor.kaiming_uniform, lambda x: torch.nn.init.kaiming_uniform_(torch.empty(x)), 
shape=shape)) def test_kaiming_normal(self): - for shape in [(256, 128, 3, 3), (80, 44), (3, 55, 35)]: + for shape in [(32, 128, 3, 3), (80, 44), (3, 55, 35)]: self.assertTrue(equal_distribution(Tensor.kaiming_normal, lambda x: torch.nn.init.kaiming_normal_(torch.empty(x)), shape=shape)) def test_multinomial(self): diff --git a/tinygrad_repo/test/test_renderer_failures.py b/tinygrad_repo/test/test_renderer_failures.py index 329975b..3dd795e 100644 --- a/tinygrad_repo/test/test_renderer_failures.py +++ b/tinygrad_repo/test/test_renderer_failures.py @@ -9,10 +9,11 @@ from tinygrad.renderer.cstyle import CStyleLanguage from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.wgsl import WGSLRenderer from tinygrad.runtime.ops_python import PythonRenderer -from tinygrad.uop.ops import UOp, Ops +from tinygrad.uop.ops import UOp, Ops, python_alu from tinygrad.renderer import ProgramSpec from tinygrad.tensor import Tensor, _to_np_dtype from tinygrad.codegen import full_rewrite +from tinygrad.engine.realize import lower_schedule_item def _test_uop_result(inputs:List[Tensor], stores:List[UOp], local_size=None): for x in inputs: x.realize() @@ -22,7 +23,7 @@ def _test_uop_result(inputs:List[Tensor], stores:List[UOp], local_size=None): uops = dedup(flatten(_recursive_add(st) for st in stores)) outbufs = [Buffer(Device.DEFAULT, sz:=(1 if local_size is None else prod(local_size)), (dtype:=u.src[1].dtype), \ initial_value=np.zeros(sz, dtype=_to_np_dtype(dtype)).data) for u in uops if u.op is Ops.STORE] - inbufs = [cast(UOp,x.lazydata).base.buffer for x in inputs] + inbufs = [cast(UOp,x.uop).base.buffer for x in inputs] src = Device[Device.DEFAULT].renderer.render(uops) ei = CompiledRunner(ProgramSpec("test", src, Device.DEFAULT, uops[-1], uops=uops, local_size=local_size)) ei.exec(outbufs+inbufs) @@ -69,6 +70,23 @@ class TestCStyleFailures(unittest.TestCase): ret = _setup_and_test_alu(Ops.MAX, 1, UOp.const(dtypes.int, dtypes.min(dtypes.int)+1)) 
self.assertEqual(ret[0], 1) + def _test_src_strip_paren(self, op: Ops, should_strip_paren:bool=True): + dtype = "bool" if op in (Ops.OR, Ops.XOR, Ops.AND) else None + ret = Tensor.empty(1, dtype=dtype) + for _ in range(5): ret = python_alu[op](ret, Tensor.empty(1, dtype=dtype)) + schedule = ret.schedule() + assert len(schedule) == 1 + ei = lower_schedule_item(schedule[0]) + src = ei.prg.p.src + self.assertEqual("("*5 not in src, should_strip_paren) + + def test_repeat_add(self): self._test_src_strip_paren(Ops.ADD) + def test_repeat_mul(self): self._test_src_strip_paren(Ops.MUL) + def test_repeat_xor(self): self._test_src_strip_paren(Ops.XOR) + def test_repeat_or(self): self._test_src_strip_paren(Ops.OR) + def test_repeat_and(self): self._test_src_strip_paren(Ops.AND) + def test_repeat_sub(self): self._test_src_strip_paren(Ops.SUB, should_strip_paren=False) + @unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, WGSLRenderer), "tests for wgsl renderer") class TestWGSLFailures(unittest.TestCase): def test_multiply_infinity(self): diff --git a/tinygrad_repo/test/test_schedule.py b/tinygrad_repo/test/test_schedule.py index 02c0346..7a8a28f 100644 --- a/tinygrad_repo/test/test_schedule.py +++ b/tinygrad_repo/test/test_schedule.py @@ -6,13 +6,14 @@ import unittest import numpy as np import functools from typing import List, Optional, Union, cast +from hypothesis import assume, given, strategies as strat from tinygrad import nn, dtypes, Device, Tensor from tinygrad.device import is_dtype_supported from tinygrad.dtype import DType, ImageDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.uop.ops import PatternMatcher, UOp, Ops, GroupOp, UPat, graph_rewrite, track_rewrites -from tinygrad.codegen.symbolic import symbolic_simple +from tinygrad.uop.symbolic import symbolic_simple from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp from tinygrad.engine.grouper import view_left, 
view_right, sym, get_kernelize_map, Kernel, create_ast, merge_views, create_kernels from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars @@ -69,6 +70,46 @@ def _test_conv2d(allowed:int, dtype:DType=dtypes.float, **kwargs): def schedule_graph_rewrite(big_sink:UOp): return graph_rewrite(big_sink, merge_views+sym, {}) class TestSchedule(unittest.TestCase): + def test_arange_avgpool2d(self, kcount=2): + x = Tensor.arange(25).reshape(1,1,5,5).cast(dtypes.float32) + t = x.avg_pool2d(padding=1) + sched = t.schedule() + self.assertEqual(len(sched), kcount) + run_schedule(sched) + import torch + torch_out = torch.nn.functional.avg_pool2d(torch.arange(25).reshape(1,1,5,5).float(), kernel_size=(2,2), padding=1).numpy() + np.testing.assert_allclose(t.numpy(), torch_out) + + def test_arange_avgpool2d_fused_noopt(self): + with Context(FUSE_ARANGE=1, NOOPT=1): self.test_arange_avgpool2d(kcount=1) + + # linearizer error + @unittest.skip("recursion error no longer raised") + @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "needs supports_float4 to fail") + def test_arange_avgpool2d_fused(self): + with self.assertRaises(RecursionError): + with Context(FUSE_ARANGE=1, NOOPT=0): self.test_arange_avgpool2d(kcount=1) + + # when we're fusing a reduce, all ReduceOps must have the same N in the dimensions + # all permutes, reshapes, expands and shrinks push through the reduce + def test_arange_sum(self): + a = Tensor.arange(6).reshape(3, 2).sum(axis=1) + with Context(FUSE_ARANGE=1): + run_schedule(check_schedule(a, 1)) + self.assertListEqual(a.tolist(), [1, 5, 9]) + + def test_arange_sum_alt(self): + a = (Tensor.arange(5).reshape(1,5).expand(6,5)*Tensor(2)).reshape(1,6,5).sum(axis=2) + with Context(FUSE_ARANGE=1): + run_schedule(check_schedule(a, 1)) + np.testing.assert_equal(a.numpy(), 20) + + def test_permute_arange(self): + a = Tensor.arange(6).reshape(6, 1, 1).permute(2, 0, 1).sum(axis=1) + with Context(FUSE_ARANGE=1): + 
run_schedule(check_schedule(a, 1)) + self.assertListEqual(a.tolist(), [[15]]) + @unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch") def test_error_on_device_mismatch(self): a = Tensor.empty(10) @@ -109,13 +150,14 @@ class TestSchedule(unittest.TestCase): root = root + functools.reduce(lambda a,b:a+b, bufs[i:i+X]) self.assertEqual(root.item(), sum(range(N))) - @unittest.expectedFailure # TODO: failing because of can_chase - def test_indexing_scalars_multiple_dims(self): - X = Tensor.randn(2, 3).realize() - xt = X[Tensor(0)][Tensor(1)] + @given(strat.sampled_from(range(2,4)), strat.sampled_from(range(2,4)), strat.sampled_from(range(0,4)), strat.sampled_from(range(0,4))) + def test_indexing_scalars(self, x, y, a, b): + assume(a UOp: return graph_rewrite(graph_rewrite(u, view_left), view_right) -def swizzle_cnt(u:UOp) -> int: return len([x for x in u.toposort() if x.op is Ops.VIEW and len(x.src) != 0 and x.src[0].op is not Ops.BUFFER]) +def swizzle_cnt(u:UOp) -> int: + return len([x for x in u.toposort() if x.op is Ops.VIEW and len(x.src) != 0 and x.src[0].op not in {Ops.BUFFER, Ops.DEFINE_GLOBAL}]) class TestSwizzle(unittest.TestCase): def test_swizzle_simple(self): @@ -2096,7 +2126,7 @@ class TestSwizzle(unittest.TestCase): run_schedule(check_schedule(t, 3)) np.testing.assert_equal(t.numpy(), [[0.5, 0.5], [0.5, 0.5], [0., 0.]]) -def store_val(si:ScheduleItem): return si.ast.src[0].src[2] +def store_val(si:ScheduleItem): return si.ast.src[0].src[1] zero_pm = UPat(Ops.CONST, arg=0) class TestView(unittest.TestCase): def test_all_masked_out(self): @@ -2132,7 +2162,7 @@ class TestView(unittest.TestCase): assert b.shape == (10, 10) sched = check_schedule(b.contiguous(), 1) self.assertEqual(store_val(sched[-1]).op, Ops.LOAD) - self.assertEqual(store_val(sched[-1]).st_arg, b.lazydata.st) + self.assertEqual(store_val(sched[-1]).st_arg, b.uop.st) run_schedule(sched) np.testing.assert_allclose(b.numpy(), np.pad(a.numpy(), ((0, 5), (0, 0)))[5:]) @@ -2146,9 
+2176,9 @@ class TestView(unittest.TestCase): late_mul = a*bv check_schedule(late_mul, 0) # the arange doesn't realize - self.assertIsNone(b.lazydata.base.realized) + self.assertIsNone(b.uop.base.realized) # mul doesn't realize - self.assertIsNone(late_mul.lazydata.base.realized) + self.assertIsNone(late_mul.uop.base.realized) self.assertEqual(late_mul.tolist(), [0, 0]) # SINK has two branches: @@ -2163,13 +2193,13 @@ class TestView(unittest.TestCase): other_child = b+2 s = check_schedule([late_mul, other_child], 2) # the arange becomes a BUFFER - self.assertIs(b.lazydata.base.op, Ops.BUFFER) + self.assertIs(b.uop.base.op, Ops.BUFFER) # mul still collapses - self.assertIs(late_mul.lazydata.base.op, Ops.CONST) + self.assertIs(late_mul.uop.base.op, Ops.CONST) run_schedule(s) self.assertEqual(other_child.tolist(), [2, 3, 4]) -def tensor_rewrite(t) -> UOp: return graph_rewrite(t.lazydata.base, merge_views+symbolic_simple) +def tensor_rewrite(t) -> UOp: return graph_rewrite(t.uop.base, merge_views+symbolic_simple) class TestSimplifier(unittest.TestCase): def test_sink_childless_const(self): x = Tensor(0) @@ -2193,8 +2223,8 @@ class TestSimplifier(unittest.TestCase): a = Tensor.empty(4, 4, dtype=dtypes.int) sink = tensor_rewrite(a*0) assert UPat(Ops.CONST, arg=0).match(sink, {}) - self.assertIs(tensor_rewrite(a*1).base, a.lazydata.base) - self.assertIs(tensor_rewrite(a+0).base, a.lazydata.base) + self.assertIs(tensor_rewrite(a*1).base, a.uop.base) + self.assertIs(tensor_rewrite(a+0).base, a.uop.base) def test_cast_folding(self): a = Tensor(1.0).cast(dtypes.int) @@ -2228,14 +2258,14 @@ class TestConst(unittest.TestCase): def test_tensor_const(self): a = Tensor(1) - print(a.lazydata) - self.assertTrue(tensor_const_pm.rewrite(a.lazydata)) + print(a.uop) + self.assertTrue(tensor_const_pm.rewrite(a.uop)) def test_tensor_variable(self): vv = UOp.variable("a", 0, 10).bind(1) a = Tensor(vv) - print(a.lazydata) - self.assertTrue(tensor_const_pm.rewrite(a.lazydata)) + print(a.uop) 
+ self.assertTrue(tensor_const_pm.rewrite(a.uop)) def test_const_schedule(self): a = Tensor.ones((4, 4)) @@ -2252,7 +2282,7 @@ class TestConst(unittest.TestCase): a = Tensor.ones((4,)).pad((1, 1)).contiguous() sched = a.schedule() print(sched[0].ast) - const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat(), UPat.where(UPat(Ops.VALID), UPat.cvar("x"), UPat(Ops.CONST, arg=0))),)) + const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat.where(UPat(Ops.VALID), UPat.cvar("x"), UPat(Ops.CONST, arg=0))),)) self.assertEqual(len(const_ast_pattern.match(sched[0].ast, {})), 1) run_schedule(sched) self.assertListEqual(a.tolist(), [0, 1, 1, 1, 1, 0]) @@ -2261,7 +2291,7 @@ class TestConst(unittest.TestCase): a = Tensor.ones((4,)).contiguous() sched = a.schedule() print(sched[0].ast) - const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat(), UPat(Ops.CONST)),)) + const_ast_pattern = UPat(Ops.SINK, src=(UPat.store(UPat(), UPat(Ops.CONST)),)) self.assertEqual(len(const_ast_pattern.match(sched[0].ast, {})), 1) run_schedule(sched) self.assertListEqual(a.tolist(), [1, 1, 1, 1]) @@ -2282,7 +2312,7 @@ class TestConst(unittest.TestCase): sched = add.schedule() self.assertEqual(len(sched), 0) # b+0 and b share the same underlying device memory - self.assertIs(add.lazydata.buffer, b.lazydata.buffer) + self.assertIs(add.uop.buffer, b.uop.buffer) self.assertListEqual(add.tolist(), [2, 2, 2, 2]) def test_src_masked_const_folding(self): @@ -2295,7 +2325,7 @@ class TestConst(unittest.TestCase): self.assertEqual(len(sched), 1) run_schedule(sched) # add gets assigned to a new buffer - self.assertIsNot(add.lazydata.base.realized, b.lazydata.base.realized) + self.assertIsNot(add.uop.base.realized, b.uop.base.realized) self.assertListEqual(add.tolist(), [4, 2, 2, 2, 2, 4]) # ** part 3: Tensor variable bindings @@ -2330,15 +2360,15 @@ class TestCopyFolding(unittest.TestCase): self.assertListEqual(b.tolist(), [0, 0, 0]) def test_alu_after_copy(self): - a = 
Tensor.ones((4,)).to("CPU").lazydata - b = Tensor.empty(4, device="CPU").lazydata + a = Tensor.ones((4,)).to("CPU").uop + b = Tensor.empty(4, device="CPU").uop add = a+b add = schedule_graph_rewrite(add) assert all_same([x.device for x in add.src]), f"ALU has different devices! {[x.device for x in add.src]}" @unittest.skip("this is just clone now") def test_copy_to_same_device(self): - a = Tensor.empty(4).lazydata + a = Tensor.empty(4).uop b = a.copy_to_device(a.device) check_schedule(b, 0, filter_sink=False) b = schedule_graph_rewrite(b) @@ -2348,7 +2378,7 @@ class TestCopyFolding(unittest.TestCase): @unittest.skip("this is just clone now") def test_copy_to_same_device_alt(self): - a = Tensor.empty(4, 4).lazydata + a = Tensor.empty(4, 4).uop b = a.copy_to_device(a.device) check_schedule(b, 0, filter_sink=False) b = schedule_graph_rewrite(b) @@ -2365,8 +2395,8 @@ class TestCopyFolding(unittest.TestCase): b = view.clone() # NOTE: this was sort of a bug making this 2 run_schedule(check_schedule(b, 2, filter_sink=False)) - self.assertEqual(b.lazydata.base.buffer.size, 2) - self.assertEqual(b.lazydata.size, 2) + self.assertEqual(b.uop.base.buffer.size, 2) + self.assertEqual(b.uop.size, 2) self.assertListEqual(b.tolist(), [0, 1]) def test_expanded_copy(self): @@ -2374,8 +2404,8 @@ class TestCopyFolding(unittest.TestCase): view = a.reshape(2, 1).expand(2, 2) b = view.clone() run_schedule(check_schedule(b, 2, filter_sink=False)) - self.assertEqual(b.lazydata.base.buffer.size, 4) - self.assertEqual(b.lazydata.size, 4) + self.assertEqual(b.uop.base.buffer.size, 4) + self.assertEqual(b.uop.size, 4) self.assertListEqual(b.tolist(), [[0, 0], [1, 1]]) def test_permuted_copy(self): @@ -2385,7 +2415,7 @@ class TestCopyFolding(unittest.TestCase): self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) def test_permute_on_disk(self): - with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().lazydata.base.buffer.as_buffer()) + with 
open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().uop.base.buffer.as_buffer()) a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}") b = a.reshape(2, 2).permute(1, 0).to("CPU") b.realize() @@ -2401,7 +2431,7 @@ class TestCopyFolding(unittest.TestCase): # TODO: this is wrong because of the permute @unittest.expectedFailure def test_permute_after_shrink_on_disk(self): - with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().lazydata.base.buffer.as_buffer()) + with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().uop.base.buffer.as_buffer()) a = Tensor.empty(5, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_5_permute')}") b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CPU") b.realize() @@ -2413,13 +2443,13 @@ class TestTensorUOpSpec(unittest.TestCase): unsafe_push_views = PatternMatcher([ (UPat.cvar("root").view(name="view"), lambda root,view: root.replace(src=tuple(x.view(view.st) for x in root.src))), ]) - a.lazydata = graph_rewrite(a.lazydata.sink(), merge_views+merge_views+unsafe_push_views) + a.uop = graph_rewrite(a.uop.sink(), merge_views+merge_views+unsafe_push_views) with self.assertRaisesRegex(RuntimeError, "UOp verification failed"): a.schedule() def test_expanded_const_ok(self): a = Tensor.ones((4, 4)) - t = graph_rewrite(a.lazydata.sink(), merge_views+merge_views) + t = graph_rewrite(a.uop.sink(), merge_views+merge_views) create_schedule_with_vars(t) # NOTE: changing symbolic CONST VIEWs is not allowed @@ -2427,69 +2457,69 @@ class TestTensorUOpSpec(unittest.TestCase): def test_symbolic_shape_ok(self): a = Tensor.ones(4) vi = UOp.variable("i", 1, 10).bind(4) - a.lazydata = graph_rewrite(a.reshape(vi).sum().lazydata, merge_views+merge_views) + a.uop = graph_rewrite(a.reshape(vi).sum().uop, merge_views+merge_views) a.schedule() class TestBufferUOp(unittest.TestCase): # BUFFER has a ShapeTracker of shape=(n,) and 
stride=(1,) def test_buffer_has_buffer(self): buf = Tensor.empty(10) - self.assertIsNotNone(buf.lazydata.buffer) - self.assertEqual(buf.lazydata.st, ShapeTracker.from_shape((10,))) + self.assertIsNotNone(buf.uop.buffer) + self.assertEqual(buf.uop.st, ShapeTracker.from_shape((10,))) # the device Buffer remains unallocated until it's we run the schedule - self.assertFalse(buf.lazydata.buffer.is_allocated()) + self.assertFalse(buf.uop.buffer.is_allocated()) add = buf+1 sched = add.schedule() - self.assertFalse(buf.lazydata.buffer.is_allocated()) + self.assertFalse(buf.uop.buffer.is_allocated()) run_schedule(sched) - self.assertTrue(buf.lazydata.buffer.is_allocated()) + self.assertTrue(buf.uop.buffer.is_allocated()) def test_buffer_has_unique_buffer(self): buf = Tensor.empty(10) - buf1 = buf.lazydata.buffer - buf2 = buf.lazydata.buffer + buf1 = buf.uop.buffer + buf2 = buf.uop.buffer self.assertIs(buf1, buf2) # we also allow VIEW(BUFFER) to access the underlying device Buffer, as long as it's contiguous def test_buffer_view_allowed(self): add = Tensor.empty(1, 1)+Tensor.empty(1, 1) add.realize() - self.assertIsNotNone(add.lazydata.buffer) - self.assertEqual(add.lazydata.shape, (1, 1)) + self.assertIsNotNone(add.uop.buffer) + self.assertEqual(add.uop.shape, (1, 1)) def test_buffer_view_not_allowed(self): permuted_view = Tensor.empty(1, 2, 3).permute(0, 2, 1) - merged = graph_rewrite(permuted_view.lazydata, merge_views) + merged = graph_rewrite(permuted_view.uop, merge_views) with self.assertRaisesRegex(AssertionError, "VIEW only works here if it's contiguous"): merged.buffer # cannot access Buffer of a non contiguous VIEW def test_buffer_only_after_realize(self): a = Tensor([1])+Tensor([2]) # accessing realized will return None - self.assertIsNone(a.lazydata.realized) + self.assertIsNone(a.uop.realized) # accessing Buffer will assert with self.assertRaisesRegex(AssertionError, "must be BUFFER"): - a.lazydata.buffer # there is no BUFFER on an unrealized ADD + a.uop.buffer 
# there is no BUFFER on an unrealized ADD # Buffer only exists once we realize it a.realize() - self.assertIsNotNone(a.lazydata.buffer) + self.assertIsNotNone(a.uop.buffer) def test_const_does_not_realize(self): a = Tensor(1)+Tensor(2) run_schedule(check_schedule(a, 0)) - self.assertIsNone(a.lazydata.base.realized) + self.assertIsNone(a.uop.base.realized) def test_var_does_not_realize(self): a = Tensor(UOp.variable("a", 0, 10).bind(1)) run_schedule(check_schedule(a, 0)) - self.assertIsNone(a.lazydata.base.realized) + self.assertIsNone(a.uop.base.realized) def test_view_does_not_realize(self): a = Tensor.randn(1, 4).expand(4, 4) a.realize() - self.assertEqual(a.lazydata.base.realized.size, 4) + self.assertEqual(a.uop.base.realized.size, 4) a2 = a.contiguous().realize() - self.assertEqual(a2.lazydata.base.realized.size, 16) + self.assertEqual(a2.uop.base.realized.size, 16) class TestContiguous(unittest.TestCase): def test_contiguous_buffer(self): @@ -2521,13 +2551,13 @@ class TestContiguous(unittest.TestCase): a = Tensor.empty(4) b = a.expand((4, 4)) check_schedule(b, 0) - self.assertEqual(b.lazydata.base.buffer.size, 4) + self.assertEqual(b.uop.base.buffer.size, 4) def test_contiguous_view_realizes(self): a = Tensor.empty(4) b = a.expand((4, 4)).contiguous() check_schedule(b, 1) - self.assertEqual(b.lazydata.base.buffer.size, 16) + self.assertEqual(b.uop.base.buffer.size, 16) class TestUOpBecome(unittest.TestCase): # the simplest case, if we create a new BUFFER for this tensor UOp @@ -2537,21 +2567,21 @@ class TestUOpBecome(unittest.TestCase): add = a+b check_schedule(add, 1) # NOTE: realized base is always a flat buffer - assert UPat(Ops.BUFFER).match(add.lazydata.base, {}) + assert UPat(Ops.BUFFER).match(add.uop.base, {}) # the Tensor UOp can optionally stack a VIEW on top of the BUFFER, in this case to preserve the (4, 4) shape of the tensor - assert add.lazydata is not add.lazydata.base - self.assertEqual(add.lazydata.size, 16) - 
self.assertEqual(add.lazydata.shape, (4, 4)) + assert add.uop is not add.uop.base + self.assertEqual(add.uop.size, 16) + self.assertEqual(add.uop.shape, (4, 4)) def test_new_buffer_view(self): a = Tensor.empty(4, 4) b = Tensor.empty(4, 4) add = (a+b).reshape(8, 2) check_schedule(add, 1) - assert UPat(Ops.BUFFER).match(add.lazydata.base, {}) + assert UPat(Ops.BUFFER).match(add.uop.base, {}) # the shape is preserverd in the becomes_map. - self.assertEqual(add.lazydata.shape, (8, 2)) - assert add.lazydata is not add.lazydata.base + self.assertEqual(add.uop.shape, (8, 2)) + assert add.uop is not add.uop.base def test_new_flat_buffer(self): a = Tensor.empty(4,) @@ -2559,7 +2589,7 @@ class TestUOpBecome(unittest.TestCase): add = a+b check_schedule(add, 1) # BUFFER already has a shape (4,), this tensor just becomes a contiguous BUFFER - assert UPat(Ops.BUFFER).match(add.lazydata, {}) + assert UPat(Ops.BUFFER).match(add.uop, {}) # sometimes we prefer to perform an op before movement ops, in this case we should stack the mops on top of the new buffer @@ -2568,8 +2598,8 @@ class TestUOpBecome(unittest.TestCase): a = Tensor.empty(4, 1) b = a.expand(4, 4).reciprocal() check_schedule(b, 1) - self.assertEqual(b.lazydata.base.buffer.size, 16) - self.assertEqual(b.lazydata.st, ShapeTracker.from_shape((4, 4))) + self.assertEqual(b.uop.base.buffer.size, 16) + self.assertEqual(b.uop.st, ShapeTracker.from_shape((4, 4))) def test_reorder_expand_alt(self): x = Tensor.empty(4, 1) @@ -2581,95 +2611,95 @@ class TestUOpBecome(unittest.TestCase): def test_become_existing_buffer(self): a = Tensor.empty(4, 4) b = a*1 - assert UPat(Ops.MUL).match(b.lazydata, {}) # before scheduling it's a mul + assert UPat(Ops.MUL).match(b.uop, {}) # before scheduling it's a mul check_schedule(b, 0) - assert UPat(Ops.VIEW, src=(UPat(Ops.BUFFER))).match(b.lazydata, {}) # scheduling merges all MovementOps into a single VIEW - self.assertIs(a.lazydata.base.buffer, b.lazydata.base.buffer) + assert UPat(Ops.VIEW, 
src=(UPat(Ops.BUFFER))).match(b.uop, {}) # scheduling merges all MovementOps into a single VIEW + self.assertIs(a.uop.base.buffer, b.uop.base.buffer) def test_become_buf_with_mops(self): a = Tensor.empty(2, 4, 2) noop = a.shrink(((1, 2), (0, 4), (0, 2))).reshape(4, 2)*1+0 # before realizing, this tensor is base - assert noop.lazydata is noop.lazydata.base + assert noop.uop is noop.uop.base noop.realize() # it becomes a realized view after realize - assert noop.lazydata is not noop.lazydata.base - assert noop.lazydata.base.op is Ops.BUFFER + assert noop.uop is not noop.uop.base + assert noop.uop.base.op is Ops.BUFFER late_add = noop+2 late_add.realize() def test_become_const_in_base(self): a = Tensor.empty(4) b = a*0 - assert UPat(Ops.MUL).match(b.lazydata, {}) # before scheduling it's a mul + assert UPat(Ops.MUL).match(b.uop, {}) # before scheduling it's a mul check_schedule(b, 0) - assert UPat(Ops.CONST, arg=0).match(b.lazydata.base, {}) # scheduling replaces the tensor lazydata with a VIEW(BUFFER) + assert UPat(Ops.CONST, arg=0).match(b.uop.base, {}) # scheduling replaces the tensor uop with a VIEW(BUFFER) def test_become_const_in_view(self): # if we shrink the base down to a size 0, only the VIEW becomes CONST, base is unchanged. add = Tensor.empty(2, 2)+Tensor.empty(2, 2) b = add.shrink(((0, 1), (0, 0))) check_schedule(b, 0) - assert UPat(Ops.CONST, arg=0).match(b.lazydata, {}) + assert UPat(Ops.CONST, arg=0).match(b.uop, {}) self.assertEqual(b.shape, (1, 0)) # the base is untouched. 
- assert UPat(Ops.ADD).match(add.lazydata, {}) + assert UPat(Ops.ADD).match(add.uop, {}) def test_become_const_from_const(self): const_add = Tensor(1)+Tensor(2) - assert UPat(Ops.ADD).match(const_add.lazydata, {}) + assert UPat(Ops.ADD).match(const_add.uop, {}) check_schedule(const_add, 0) - assert UPat(Ops.CONST, arg=3).match(const_add.lazydata.base, {}) + assert UPat(Ops.CONST, arg=3).match(const_add.uop.base, {}) # tensors can become another realized tensor source def test_become_existing_buf_simple(self): a = Tensor.empty(4, 4) b = a+0 check_schedule(b, 0) - assert b.lazydata.base.op is Ops.BUFFER - self.assertIs(a.lazydata, b.lazydata) + assert b.uop.base.op is Ops.BUFFER + self.assertIs(a.uop, b.uop) # they can also chain other movement ops on top of the tensor source def test_become_existing_buf_view(self): a = Tensor.empty(4, 4) b = a.permute((1, 0))+0 check_schedule(b, 0) - self.assertEqual(b.lazydata.st, a.lazydata.permute((1, 0)).st) + self.assertEqual(b.uop.st, a.uop.permute((1, 0)).st) def test_become_existing_buf_view_alt(self): a = Tensor.empty(4, 4) b = a.permute((1, 0)).reshape((8, 2))+0 check_schedule(b, 0) - self.assertEqual(b.lazydata.st, a.lazydata.permute((1, 0)).reshape((8, 2)).st) + self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st) # they can also have other base parents that simplified, in that case we just backtrack to the chained mops def test_become_existing_buf_complex(self): a = Tensor.empty(4, 4) b = (a.permute((1, 0))+0).reshape((8, 2))+0 check_schedule(b, 0) - self.assertEqual(b.lazydata.st, a.lazydata.permute((1, 0)).reshape((8, 2)).st) - assert b.lazydata.base.op is Ops.BUFFER + self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st) + assert b.uop.base.op is Ops.BUFFER def test_become_multiple_choices(self): a = Tensor.empty(16) b = (a.reshape(1, 1, 4, 1, 4)+0).reshape(1, 1, 4, 4).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0 c = (a.reshape(1, 1, 4, 4)+0).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0 
check_schedule([b, c], 0) - assert all_same([x.lazydata.base.realized for x in [a,b,c]]) + assert all_same([x.uop.base.realized for x in [a,b,c]]) # these movement ops result in the same ShapeTracker - assert b.lazydata.st == c.lazydata.st - assert b.lazydata is c.lazydata - assert UPat(Ops.VIEW, src=(UPat(Ops.BUFFER),)).match(c.lazydata, {}) + assert b.uop.st == c.uop.st + assert b.uop is c.uop + assert UPat(Ops.VIEW, src=(UPat(Ops.BUFFER),)).match(c.uop, {}) def test_setitem_becomes_subbuffer(self): a = Tensor.full((4,), 2.).contiguous().realize() b = a.shrink(((0, 2),)).assign(Tensor.full((2,), 1.0)) b.realize() - assert a.lazydata.is_realized - assert a.lazydata.buffer._base is None + assert a.uop.is_realized + assert a.uop.buffer._base is None # b is a subbuffer of a - assert b.lazydata.op is Ops.BUFFER_VIEW - assert b.lazydata.src[0] is a.lazydata + assert b.uop.op is Ops.BUFFER_VIEW + assert b.uop.src[0] is a.uop def test_setitem_offset(self): a = Tensor.full((16,), 0.).contiguous().realize() diff --git a/tinygrad_repo/test/test_search.py b/tinygrad_repo/test/test_search.py index 19923a6..a3f459e 100644 --- a/tinygrad_repo/test/test_search.py +++ b/tinygrad_repo/test/test_search.py @@ -1,11 +1,9 @@ import unittest -from test.helpers import ast_const -from tinygrad.codegen.kernel import Opt, OptOps -from tinygrad.codegen.kernel import Kernel +from tinygrad.codegen.kernel import Opt, OptOps, Kernel from tinygrad.uop.ops import UOp, Ops from tinygrad.engine.search import bufs_from_lin, actions, beam_search -from tinygrad.device import Device, Buffer +from tinygrad.device import Device from tinygrad.tensor import Tensor from tinygrad.dtype import dtypes from tinygrad.helpers import Context, GlobalCounters @@ -14,53 +12,6 @@ from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View from extra.optimization.helpers import time_linearizer -class TestTimeLinearizer(unittest.TestCase): - @unittest.skipIf(Device.DEFAULT == "WEBGPU", 
"WebGPU timestamps are low precision, tm is 0") - def test_reasonable_time(self): - a = Tensor([1,2,3,4]).realize() - si = (a+1).schedule()[0] - # create fresh empty buffers - rawbufs = [Buffer(b.device, b.size, b.dtype).allocate() for b in si.bufs] - tm = time_linearizer(Kernel(si.ast), rawbufs, allow_test_size=False, cnt=10, disable_cache=True) - assert tm > 0 and tm != float('inf') - - def test_bufs_from_lin(self): - a = Tensor([1,2,3,4]).realize() - si = (a+1).schedule()[0] - rawbufs = bufs_from_lin(lin:=Kernel(si.ast)) - assert len(rawbufs) == len(lin.membufs) == 2 - assert all(r is not None for r in rawbufs) - assert all(isinstance(r, Buffer) for r in rawbufs) - assert all(r.size > 0 for r in rawbufs) - - def test_bufs_from_lin_alt(self): - a = Tensor.randn(4, 4).realize() - b = a+a[0] - si = b.schedule()[0] - rawbufs = bufs_from_lin(k:=Kernel(si.ast)) - assert len(rawbufs) == len(k.membufs) == 2 - assert all(r is not None for r in rawbufs) - assert all(isinstance(r, Buffer) for r in rawbufs) - assert all(r.size > 0 for r in rawbufs) - - def test_kernel_count(self): - """ - Ensure that the kernel count is not incremented by time_linearizer when clearing l2 - """ - # ast of Tensor.zeros(16).contiguous().realize() - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(16,), strides=(1,), offset=0, mask=None, contiguous=True),))), - ast_const(dtypes.float, 0.0, st_src=( - UOp(Ops.VIEW, arg=ShapeTracker(views=(View(shape=(16,), strides=(0,), offset=0, mask=None, contiguous=False),))),)),)),)) - lin = Kernel(ast) - bufs = bufs_from_lin(lin) - - kernel_count = GlobalCounters.kernel_count - time_linearizer(lin, bufs, allow_test_size=False, cnt=2, disable_cache=True, clear_l2=True) - assert GlobalCounters.kernel_count == kernel_count, "kernel count was incremented by time_linearizer" - class TestBEAM(unittest.TestCase): def test_dynamic_beam(self): # TODO: 
make this infra globally usable @@ -136,8 +87,8 @@ class TestBEAM(unittest.TestCase): # taken from https://github.com/tinygrad/tinygrad/issues/4612 ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 256), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=()), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(1, 1, 256), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=0, src=()),)), UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (1,)), src=( UOp(Ops.MUL, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( @@ -146,24 +97,24 @@ class TestBEAM(unittest.TestCase): UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=0, mask=((0, 64128),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=()),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=0, mask=((0, 64128),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=1, src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-64128, mask=((64128, 128256),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, 
mask=None, contiguous=False))), src=()),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-64128, mask=((64128, 128256),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=2, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-128256, mask=((128256, 192384),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=()),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-128256, mask=((128256, 192384),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=3, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-192384, mask=((192384, 256512),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=()),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-192384, mask=((192384, 256512),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=4, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=5, src=()), - UOp(Ops.VIEW, dtypes.void, 
arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-256512, mask=((256512, 320640),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=()),)),)), # noqa: E501 + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-256512, mask=((256512, 320640),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=5, src=()),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-320640, mask=((320640, 384768),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=()),)),)), # noqa: E501 - ast_const(dtypes.float, 1.4285714285714286, st_src=( + UOp(Ops.VIEW, dtypes.float.ptr(64128), arg=ShapeTracker(views=(View(shape=(384768,), strides=(1,), offset=-320640, mask=((320640, 384768),), contiguous=False), View(shape=(1, 501, 256), strides=(0, 1, 501), offset=256512, mask=None, contiguous=False))), src=( # noqa: E501 + UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64128), arg=6, src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=1.4285714285714286, src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 501, 256), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) # noqa: E501 lin = Kernel(ast) diff --git a/tinygrad_repo/test/test_setitem.py b/tinygrad_repo/test/test_setitem.py index d3f066b..d6de6e9 100644 --- a/tinygrad_repo/test/test_setitem.py +++ b/tinygrad_repo/test/test_setitem.py @@ -50,7 +50,7 @@ class TestSetitem(unittest.TestCase): def test_setitem_into_noncontiguous(self): t = Tensor.ones(4) - self.assertFalse(t.lazydata.st.contiguous) + 
self.assertFalse(t.uop.st.contiguous) with self.assertRaises(RuntimeError): t[1] = 5 @unittest.skip("TODO: flaky") diff --git a/tinygrad_repo/test/test_softmax_fusion.py b/tinygrad_repo/test/test_softmax_fusion.py index fd0dedc..23e1a4b 100644 --- a/tinygrad_repo/test/test_softmax_fusion.py +++ b/tinygrad_repo/test/test_softmax_fusion.py @@ -153,6 +153,7 @@ class TestSoftmaxFusion(unittest.TestCase): np.testing.assert_allclose(sout.numpy(), out.numpy()) + @unittest.skip("recursion error no longer raised") def test_softmax_bw(self): print("*** softmax bw ***") self.test.requires_grad_() diff --git a/tinygrad_repo/test/test_symbolic_jit.py b/tinygrad_repo/test/test_symbolic_jit.py index 1529e3b..881ce33 100644 --- a/tinygrad_repo/test/test_symbolic_jit.py +++ b/tinygrad_repo/test/test_symbolic_jit.py @@ -197,6 +197,18 @@ class TestSymbolicJit(unittest.TestCase): np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) assert_jit_cache_len(jf, 1) + def test_slice_var_shape(self): + def f(a): return (a+1).realize() + jf = TinyJit(f) + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor.ones(vi, 11).contiguous() + symbolic = a[:, 1:2] + symbolic = jf(symbolic).reshape(i, 1).numpy() + expected = f(a.reshape(i, 11)[:, 1:2]).numpy() + np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + assert_jit_cache_len(jf, 1) + def test_ones_sum(self): def f(a): return a.sum().realize() jf = TinyJit(f) diff --git a/tinygrad_repo/test/test_symbolic_ops.py b/tinygrad_repo/test/test_symbolic_ops.py index 76bb3e8..082d046 100644 --- a/tinygrad_repo/test/test_symbolic_ops.py +++ b/tinygrad_repo/test/test_symbolic_ops.py @@ -3,6 +3,8 @@ from tinygrad import Tensor, Variable from tinygrad.shape.shapetracker import View from tinygrad.helpers import Context, GlobalCounters from tinygrad.uop.ops import sym_infer +from tinygrad.dtype import dtypes +from tinygrad.device import Device from examples.gpt2 import Attention import numpy as np @@ -173,6 
+175,31 @@ class TestSymbolicOps(unittest.TestCase): expected = a[3:5, i:i+2].numpy() np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + def test_slice_no_start(self): + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor.rand(7, 11) + symbolic = a[3:5, :vi:1].reshape(2,i) + symbolic = symbolic.numpy() + expected = a[3:5, :i:1].numpy() + np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + + def test_expand_padded(self): + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor(1).unsqueeze(0).pad((0, 1)).unsqueeze(0) + symbolic = a.expand(vi, 2).reshape(i, 2).numpy() + expected = a.expand(i, 2).numpy() + np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + + def test_slice_var_shape(self): + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor.ones(vi, 11).contiguous() + symbolic = a[:, 1:2].reshape(i, 1).numpy() + expected = a.reshape(i, 11)[:, 1:2].numpy() + np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + def test_ones_sum(self): for i in range(1, 5): vi = Variable("i", 1, 10).bind(i) @@ -221,6 +248,23 @@ class TestSymbolicOps(unittest.TestCase): symbolic = a.reshape(vi, vj).var(axis).reshape(expected.shape).numpy() np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6) + def test_bitcast_down(self): + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor.rand(i, 3) + expected = a.bitcast(dtypes.uint8).numpy() + symbolic = a.reshape(vi, 3).bitcast(dtypes.uint8).reshape(expected.shape).numpy() + np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=0) + + @unittest.skipIf(Device.DEFAULT == "WEBGPU", "no uint64") + def test_bitcast_up(self): + for i in range(1, 5): + vi = Variable("i", 1, 10).bind(i) + a = Tensor.rand(i, 4) + expected = a.bitcast(dtypes.uint64).numpy() + symbolic = a.reshape(vi, 4).bitcast(dtypes.uint64).reshape(expected.shape).numpy() + np.testing.assert_allclose(symbolic, 
expected, atol=1e-6, rtol=0) + @unittest.expectedFailure def test_conv2d_ceildiv_edge_case(self): v = Variable('v', 11, 50_000) diff --git a/tinygrad_repo/test/test_tensor.py b/tinygrad_repo/test/test_tensor.py index ea29c4e..704f697 100644 --- a/tinygrad_repo/test/test_tensor.py +++ b/tinygrad_repo/test/test_tensor.py @@ -518,6 +518,10 @@ class TestTinygrad(unittest.TestCase): except ValueError: Tensor.zeros(2, 2).realize() + def test_shrink(self): + t = Tensor.arange(32).contiguous().realize() + self.assertListEqual(t[16:20].tolist(), [16,17,18,19]) + @unittest.skip("this test is just flaky, sync issue") class TestMoveTensor(unittest.TestCase): d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1" @@ -561,17 +565,17 @@ class TestZeroShapeTensor(unittest.TestCase): t = Tensor.empty(3, 2, 0) assert t.shape == (3, 2, 0) # numpy has stride 0, 0, 0; torch has stride 2, 1, 1 - assert t.lazydata.st.real_strides() == (0, 0, 0) + assert t.uop.st.real_strides() == (0, 0, 0) t = Tensor.empty(3, 0, 2) assert t.shape == (3, 0, 2) # numpy has stride 0, 0, 0; torch has stride 2, 2, 1 - assert t.lazydata.st.real_strides() == (0, 0, 0) + assert t.uop.st.real_strides() == (0, 0, 0) t = Tensor.empty(0, 0, 0) assert t.shape == (0, 0, 0) # numpy has stride 0, 0, 0; torch has stride 1, 1, 1 - assert t.lazydata.st.real_strides() == (0, 0, 0) + assert t.uop.st.real_strides() == (0, 0, 0) def test_rand(self): t = Tensor.rand(3, 2, 0) @@ -686,24 +690,24 @@ class TestZeroShapeTensor(unittest.TestCase): a = Tensor.rand(16, 16).realize() b = a.clone() np.testing.assert_allclose(a.numpy(), b.numpy()) - self.assertIsNot(a.lazydata.base.buffer, b.lazydata.base.buffer) + self.assertIsNot(a.uop.base.buffer, b.uop.base.buffer) a = Tensor.rand(16, 16).mul(5.0).add(5.0).realize() b = a.clone() np.testing.assert_allclose(a.numpy(), b.numpy()) - self.assertIsNot(a.lazydata.base.buffer, b.lazydata.base.buffer) + self.assertIsNot(a.uop.base.buffer, b.uop.base.buffer) def test_clone_with_shrink(self): a 
= Tensor.rand(16, 16) b = a.shrink(((2, 10), None)).clone() b.realize() - self.assertIsNot(a.lazydata.base.buffer, b.lazydata.base.buffer) + self.assertIsNot(a.uop.base.buffer, b.uop.base.buffer) def test_clone_with_shrink_realized(self): a = Tensor.rand(16, 16).realize() b = a.shrink(((2, 10), None)).clone() b.realize() - self.assertIsNot(a.lazydata.base.buffer, b.lazydata.base.buffer) + self.assertIsNot(a.uop.base.buffer, b.uop.base.buffer) def test_clone_with_grad(self): a = Tensor.rand(16, 16, requires_grad=True) @@ -740,12 +744,11 @@ class TestInferenceMode(unittest.TestCase): x = Tensor(x_init, requires_grad=True) m = Tensor(m_init, requires_grad=True) W = Tensor(W_init, requires_grad=True) - with Tensor.test(): - tmp = x.mul(m) - mm = tmp.matmul(W) - out = mm.relu() - out = out.sum() - out.backward() + tmp = x.mul(m) + mm = tmp.matmul(W) + out = mm.relu() + out = out.sum() + #out.backward() assert x.grad is None assert m.grad is None assert tmp.grad is None @@ -757,13 +760,12 @@ class TestInferenceMode(unittest.TestCase): x = Tensor(x_init, requires_grad=True) m = Tensor(m_init, requires_grad=True) W = Tensor(W_init, requires_grad=True) - @Tensor.test() def f(x, m, W): tmp = x.mul(m) mm = tmp.matmul(W) out = mm.relu() out = out.sum() - out.backward() + #out.backward() assert x.grad is None assert m.grad is None assert tmp.grad is None @@ -778,7 +780,7 @@ class TestTensorMetadata(unittest.TestCase): @unittest.skip("why would this be true?") def test_exclude_noop_metadata(self): a = Tensor.rand(4, 4)*1 - self.assertEqual(a.lazydata.metadata[0].name, "__mul__") + self.assertEqual(a.uop.metadata[0].name, "__mul__") k = a.schedule()[-1] self.assertEqual([m.name for m in k.metadata], ["rand"]) @@ -795,7 +797,7 @@ class TestTensorMetadata(unittest.TestCase): x = Tensor.rand(3, requires_grad=True) W = Tensor.rand(3, 3, requires_grad=True) out = x.matmul(W) - self.assertEqual(out.lazydata.metadata[0].name, "matmul") + self.assertEqual(out.uop.metadata[0].name, 
"matmul") si = out.schedule()[-1] self.assertEqual(len(si.metadata), 1) self.assertEqual(si.metadata[0].name, "matmul") @@ -803,7 +805,7 @@ class TestTensorMetadata(unittest.TestCase): def test_relu(self): x = Tensor.rand(3, requires_grad=True) out = x.relu() - self.assertEqual(out.lazydata.metadata[0].name, "relu") + self.assertEqual(out.uop.metadata[0].name, "relu") si = out.schedule()[-1] self.assertEqual(len(si.metadata), 1) self.assertEqual(si.metadata[0].name, "relu") @@ -812,9 +814,9 @@ class TestTensorMetadata(unittest.TestCase): x = Tensor.rand(3, requires_grad=True) y = Tensor.rand(3, requires_grad=True) out = x.relu() * y.sigmoid() - self.assertEqual(out.lazydata.metadata[0].name, "__mul__") - self.assertEqual(out.lazydata.src[0].metadata[0].name, "relu") - self.assertEqual(out.lazydata.src[1].metadata[0].name, "sigmoid") + self.assertEqual(out.uop.metadata[0].name, "__mul__") + self.assertEqual(out.uop.src[0].metadata[0].name, "relu") + self.assertEqual(out.uop.src[1].metadata[0].name, "sigmoid") si = out.schedule()[-1] self.assertEqual(len(si.metadata), 3) self.assertEqual(set(m.name for m in si.metadata), {"relu", "sigmoid", "__mul__"}) @@ -823,12 +825,12 @@ class TestTensorMetadata(unittest.TestCase): x = Tensor.rand(3, requires_grad=True).realize() y = Tensor.rand(3, requires_grad=True).realize() out = (x.relu() * y.sigmoid()).sum() - self.assertEqual(out.lazydata.metadata[0].name, "sum") + self.assertEqual(out.uop.metadata[0].name, "sum") out.backward() - self.assertEqual(x.grad.lazydata.metadata[0].name, "relu") - self.assertTrue(x.grad.lazydata.metadata[0].backward) - self.assertEqual(y.grad.lazydata.metadata[0].name, "sigmoid") - self.assertTrue(y.grad.lazydata.metadata[0].backward) + self.assertEqual(x.grad.uop.metadata[0].name, "relu") + self.assertTrue(x.grad.uop.metadata[0].backward) + self.assertEqual(y.grad.uop.metadata[0].name, "sigmoid") + self.assertTrue(y.grad.uop.metadata[0].backward) si = Tensor.schedule(out, x.grad, y.grad)[-1] 
self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}") self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"}) diff --git a/tinygrad_repo/test/test_tensor_uop.py b/tinygrad_repo/test/test_tensor_uop.py index d6fc085..19541ea 100644 --- a/tinygrad_repo/test/test_tensor_uop.py +++ b/tinygrad_repo/test/test_tensor_uop.py @@ -9,7 +9,7 @@ class TestTensorUOp(unittest.TestCase): def test_fromcpu_shape_tracker(self): def helper(a: np.ndarray): print(a.shape, a.strides, a.flags.c_contiguous) - b = Tensor(a).lazydata + b = Tensor(a).uop #assert b.st.contiguous == a.flags.c_contiguous assert b.st.shape == a.shape np.testing.assert_equal(a, Tensor(b).numpy()) @@ -60,11 +60,11 @@ class TestTensorUOp(unittest.TestCase): np.testing.assert_allclose(c.numpy(), np.concatenate((a.numpy(), b.numpy()), axis=1)) def test_const_dtype(self): - lb: UOp = Tensor([1], dtype=dtypes.int).lazydata + lb: UOp = Tensor([1], dtype=dtypes.int).uop assert lb.const_like(1).base.arg == 1 assert type(lb.const_like(1).base.arg) is int - lb: UOp = Tensor([1], dtype=dtypes.float).lazydata + lb: UOp = Tensor([1], dtype=dtypes.float).uop assert lb.const_like(1).base.arg == 1.0 assert type(lb.const_like(1).base.arg) is float @@ -92,7 +92,7 @@ class TestTensorUOp(unittest.TestCase): out.realize() self.assertEqual(out.tolist(), Tensor.zeros(4, 8).tolist()) -reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS))))) +reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(Ops.REDUCE_AXIS))))) class TestReduceOp(unittest.TestCase): def test_no_split_reduce_kernel(self): a = Tensor.rand(4, 4).realize() diff --git a/tinygrad_repo/test/test_uop_graph.py b/tinygrad_repo/test/test_uop_graph.py index 0ad6f07..d9089cb 100644 --- a/tinygrad_repo/test/test_uop_graph.py +++ b/tinygrad_repo/test/test_uop_graph.py @@ -3,7 +3,7 @@ import unittest, pytest from tinygrad import dtypes, Variable from tinygrad.helpers import DEBUG, 
Context from tinygrad.uop.ops import Ops, UOp, UPat, PatternMatcher, track_rewrites, graph_rewrite, GroupOp -from tinygrad.codegen.symbolic import sym +from tinygrad.uop.symbolic import sym from tinygrad.codegen import full_rewrite, full_rewrite_to_sink from tinygrad.codegen.expander import expander diff --git a/tinygrad_repo/test/test_uops.py b/tinygrad_repo/test/test_uops.py index ebee1f6..2f5ca3e 100644 --- a/tinygrad_repo/test/test_uops.py +++ b/tinygrad_repo/test/test_uops.py @@ -4,16 +4,16 @@ import numpy as np from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View # noqa F401 from tinygrad.tensor import Tensor, _to_np_dtype -from tinygrad.helpers import CI, DEBUG, getenv, Context, Timing +from tinygrad.helpers import CI, DEBUG, getenv, Timing from tinygrad.dtype import dtypes, DType from tinygrad.device import Buffer, Device from tinygrad.uop.ops import Ops, UOp, UPat, KernelInfo, exec_alu # noqa F401 from tinygrad.uop.spec import spec from tinygrad.renderer import ProgramSpec from tinygrad.engine.grouper import fix_kernel_ops -from tinygrad.engine.realize import CompiledRunner, get_kernel +from tinygrad.engine.realize import CompiledRunner from tinygrad.codegen import full_rewrite -from tinygrad.codegen.symbolic import sym +from tinygrad.uop.symbolic import sym from tinygrad.device import is_dtype_supported from tinygrad.codegen.kernel import Kernel, Opt, OptOps @@ -310,7 +310,7 @@ class TestLocalAccess(unittest.TestCase): sres = uop(uops, Ops.LOAD, dtypes.float32, (smem.index(uop(uops, Ops.CONST, dtypes.int32, (), 0)), barr)) self.assertEqual(_test_uops_result(dtypes.float32, uops, sres), 42) - # NOTE: webgpu specific, since only webgpu performs bitpacking for uchar + # NOTE: webgpu specific, since only webgpu performs bitpacking @unittest.skipUnless(Device.DEFAULT == "WEBGPU", "Test local access with packed data type") def test_local_packed(self): uops = [] @@ -320,6 +320,19 @@ class TestLocalAccess(unittest.TestCase): 
sres = uop(uops, Ops.LOAD, dtypes.uint8, (smem.index(uop(uops, Ops.CONST, dtypes.int32, (), 0)), barr)) self.assertEqual(_test_uops_result(dtypes.uint8, uops, sres), 42) + # NOTE: webgpu specific, since only webgpu performs bitpacking + @unittest.skipUnless(Device.DEFAULT == "WEBGPU", "Test local memory size for packed data types") + def test_packed_smem_size(self): + _dtypes = [dtypes.char, dtypes.uchar, dtypes.short, dtypes.ushort, dtypes.half] + size = 16 + for dtype in _dtypes: + temp = UOp(Ops.DEFINE_LOCAL, dtype.ptr(size=size, local=True), (), 'smem') + uops = to_uops_list([temp], opts=Device[Device.DEFAULT].renderer) + out = Device[Device.DEFAULT].renderer.render(uops) + # half is supported in wgsl, so it doesn't have to be packed + corrected_size = size//(4//dtype.itemsize) if dtype != dtypes.half else size + self.assertIn(f"temp0: array<{Device[Device.DEFAULT].renderer.buf_map(dtype)},{corrected_size}>;", out) + @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory") @unittest.skip("tinygrad doesn't support this behavior") def test_local_indirect(self): @@ -348,15 +361,16 @@ class TestAssembly(unittest.TestCase): self.assertIn(Ops.MUL, ops) def test_division_power_of_two(self): - g = UOp(Ops.DEFINE_GLOBAL, dtypes.uint32.ptr(), (), 0) - c = UOp(Ops.CONST, dtypes.uint, (), 2) - l = UOp(Ops.LOAD, dtypes.uint, (g.index(c),)) - a = UOp(Ops.IDIV, dtypes.uint, (l, c)) - uops = to_uops_list([a], opts=Device[Device.DEFAULT].renderer) - Device[Device.DEFAULT].renderer.render(uops) - ops = [x.op for x in uops] - self.assertIn(Ops.SHR, ops) - self.assertNotIn(Ops.IDIV, ops) + for dt in (dtypes.int32, dtypes.uint32): + g = UOp(Ops.DEFINE_GLOBAL, dt.ptr(), (), 0) + c = UOp(Ops.CONST, dt, (), 2) + l = UOp(Ops.LOAD, dt, (g.index(c),)) + a = UOp(Ops.IDIV, dt, (l, c)) + uops = to_uops_list([a], opts=Device[Device.DEFAULT].renderer) + Device[Device.DEFAULT].renderer.render(uops) + ops = [x.op for x in uops] + self.assertIn(Ops.SHR, 
ops, f"For dtype={dt} divison by power of two did not simplify to shift") + self.assertNotIn(Ops.IDIV, ops, f"For dtype={dt} divison by power of two did not simplify to shift") def test_fast_idiv_and_mod(self): g = UOp(Ops.DEFINE_GLOBAL, dtypes.uint32.ptr(), (), 0) @@ -447,13 +461,6 @@ class TestUOpStr(unittest.TestCase): assert len(str(a)) < 10_000, "exponential string growth" assert str(eval(str(a))) == str(a) - t = Tensor.arange(10) - t = t + t * Tensor.rand(10) - # nice big complicated uop - with Context(NOOPT=1): - sink = UOp(Ops.SINK, dtypes.void, (get_kernel(Device[Device.DEFAULT].renderer, t.schedule()[-1].ast).linearize().uops[-1],)) - self.assertEqual(sink, eval(str(sink))) - def test_vectorized_str(self): vec = UOp(Ops.VECTORIZE, dtypes.int.vec(4), tuple(UOp.const(dtypes.int, x) for x in range(4))) assert str(eval(str(vec))) == str(vec) @@ -463,7 +470,7 @@ class TestUOpStr(unittest.TestCase): assert str(eval(str(device))) == str(device) def test_reduceop_arg(self): - sum_uop = Tensor.empty(32, 32).sum().lazydata + sum_uop = Tensor.empty(32, 32).sum().uop assert str(eval(str(sum_uop))) == str(sum_uop) @unittest.skip("uop no longer has order like this") @@ -536,9 +543,9 @@ class TestShapeSpec(unittest.TestCase): # ** CONST is CONST(VIEW(DEVICE)) -> RESHPAE -> EXPAND def test_expanded_const(self): - a = Tensor(1).lazydata + a = Tensor(1).uop self.assertEqual(a.st, ShapeTracker.from_shape(())) - a = Tensor.ones((4, 4)).lazydata + a = Tensor.ones((4, 4)).uop self.assertEqual(a.st, ShapeTracker.from_shape(()).reshape((1,1)).expand((4,4))) def test_padded_const(self): @@ -556,12 +563,12 @@ class TestShapeSpec(unittest.TestCase): # NOTE: CONST ShapeTracker comes from its source def test_scalar_const(self): - a = Tensor(0).lazydata + a = Tensor(0).uop self.assertEqual(a.st, ShapeTracker.from_shape(())) def test_scalar_var(self): vv = UOp.variable("a", 1, 4).bind(2) - t = Tensor(vv).lazydata + t = Tensor(vv).uop self.assertEqual(t.st, ShapeTracker.from_shape(())) 
# ** ASSIGN is ASSIGN(VIEW(BUFFER), new_val) @@ -570,7 +577,7 @@ class TestShapeSpec(unittest.TestCase): buffer = Tensor.arange(4).realize() a = buffer.assign(Tensor.zeros((4,), dtype=dtypes.int)) assign_pattern = UPat(Ops.ASSIGN, src=(UPat(Ops.BUFFER), UPat())) - assert assign_pattern.match(a.lazydata, {}) + assert assign_pattern.match(a.uop, {}) a.realize() self.assertEqual(buffer.tolist(), [0, 0, 0, 0]) @@ -584,7 +591,7 @@ class TestShapeSpec(unittest.TestCase): buffer = Tensor.ones((4,)).contiguous().realize() a = buffer.reshape((2, 2)).assign(Tensor.zeros((2, 2))) assign_pattern = UPat(Ops.ASSIGN, src=(UPat(Ops.RESHAPE, src=(UPat(Ops.BUFFER))), UPat())) - assert assign_pattern.match(a.lazydata, {}) + assert assign_pattern.match(a.uop, {}) a.realize() self.assertEqual(buffer.tolist(), [0, 0, 0, 0]) @@ -593,13 +600,13 @@ class TestShapeSpec(unittest.TestCase): a = Tensor.ones((4,)).contiguous().realize() assign = a.shrink(((1, 2),)).assign(Tensor.zeros((1,))) # the ASSIGN UOp has size=1 - self.assertEqual(assign.lazydata.size, 1) + self.assertEqual(assign.uop.size, 1) # the ASSIGN views the buffer with a shrunk st - self.assertEqual(assign.lazydata.src[0].st, ShapeTracker.from_shape((4,)).shrink(((1, 2),))) + self.assertEqual(assign.uop.src[0].st, ShapeTracker.from_shape((4,)).shrink(((1, 2),))) # the underlying BUFFER has a size=4 - self.assertEqual(assign.lazydata.buf_uop.size, 4) + self.assertEqual(assign.uop.buf_uop.size, 4) # NOTE: output shape is different from the BUFFER shape - self.assertNotEqual(assign.lazydata.shape, a.lazydata.shape) + self.assertNotEqual(assign.uop.shape, a.uop.shape) assign.realize() self.assertEqual(a.tolist(), [1, 0, 1, 1]) @@ -609,13 +616,13 @@ class TestShapeSpec(unittest.TestCase): def test_ops_st(self): # view / mop - a = Tensor.empty(4, 2, 1).permute((1, 2, 0)).lazydata + a = Tensor.empty(4, 2, 1).permute((1, 2, 0)).uop self.assertEqual(a.st, ShapeTracker.from_shape((4, 2, 1)).permute((1, 2, 0))) # alu / reduce alu = a*2 
self.assertEqual(alu.st, ShapeTracker.from_shape((2, 1, 4))) r = Tensor.empty(4, 4).sum(axis=1) - self.assertEqual(r.lazydata.st, ShapeTracker.from_shape((4,))) + self.assertEqual(r.uop.st, ShapeTracker.from_shape((4,))) def test_st_wmma_none(self): A = UOp(Ops.DEFINE_VAR, dtypes.float.vec(16), arg=('a', UOp.const(dtypes.float, 0), UOp.const(dtypes.float, 1))) diff --git a/tinygrad_repo/test/test_winograd.py b/tinygrad_repo/test/test_winograd.py index 975fe88..e4ef3ca 100644 --- a/tinygrad_repo/test/test_winograd.py +++ b/tinygrad_repo/test/test_winograd.py @@ -1,10 +1,26 @@ import unittest -from tinygrad import Tensor, GlobalCounters, dtypes +import numpy as np +from tinygrad import Tensor, GlobalCounters, dtypes, Context, nn from tinygrad.uop.ops import Ops from tinygrad.helpers import Timing, CI, Profiling, WINO, DEBUG, getenv from tinygrad.codegen.kernel import Kernel from tinygrad.codegen.heuristic import hand_coded_optimizations +class TestWinogradClose(unittest.TestCase): + def test_close(self): + inp = Tensor.rand(1, 16, 16, 16) + conv = nn.Conv2d(16, 16, 3) + conv(inp).realize() # warmup + GlobalCounters.reset() + print("non winograd") + with Context(WINO=0): + cmp = conv(inp).realize() # warmup + GlobalCounters.reset() + print("winograd") + with Context(WINO=1): + test = conv(inp).realize() + np.testing.assert_allclose(cmp.numpy(), test.numpy(), atol=1e-5) + class TestWinograd(unittest.TestCase): def setUp(self): self.old = WINO.value diff --git a/tinygrad_repo/test/test_zero_copy.py b/tinygrad_repo/test/test_zero_copy.py index 6f2b2cd..c862578 100644 --- a/tinygrad_repo/test/test_zero_copy.py +++ b/tinygrad_repo/test/test_zero_copy.py @@ -6,7 +6,7 @@ def time_tensor_numpy(out:Tensor): times = [] for _ in range(5): st = time.perf_counter() - out.lazydata.base.realized.as_buffer(allow_zero_copy=True) + out.uop.base.realized.as_buffer(allow_zero_copy=True) et = time.perf_counter() - st times.append(et) return min(times) diff --git 
a/tinygrad_repo/test/unit/test_allreduce.py b/tinygrad_repo/test/unit/test_allreduce.py index 7eb54bd..a303096 100644 --- a/tinygrad_repo/test/unit/test_allreduce.py +++ b/tinygrad_repo/test/unit/test_allreduce.py @@ -4,7 +4,6 @@ from tinygrad.helpers import Context from tinygrad.uop.ops import Ops class TestRingAllReduce(unittest.TestCase): - @unittest.skip("still broken") def test_schedule_ring(self): with Context(RING=2): N = 4 diff --git a/tinygrad_repo/test/test_conv.py b/tinygrad_repo/test/unit/test_conv.py similarity index 80% rename from tinygrad_repo/test/test_conv.py rename to tinygrad_repo/test/unit/test_conv.py index 1ae5d30..6091a9c 100644 --- a/tinygrad_repo/test/test_conv.py +++ b/tinygrad_repo/test/unit/test_conv.py @@ -5,7 +5,7 @@ from tinygrad.helpers import Context class TestConv(unittest.TestCase): def test_simple(self): - x = Tensor.ones(1,12,128,256).contiguous().realize() + x = Tensor.ones(1,12,16,32).contiguous().realize() w = Tensor.ones(32,12,3,3).contiguous().realize() ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy() # it's not 108 around the padding @@ -14,7 +14,7 @@ class TestConv(unittest.TestCase): assert ret[0,0,0,1] == 72 def test_simple_rand(self): - x = Tensor.rand(1,12,128,256) + x = Tensor.rand(1,12,16,32) w = Tensor.rand(32,12,3,3) x.conv2d(w, stride=(2,2), padding=(1,1)).numpy() @@ -26,12 +26,10 @@ class TestConv(unittest.TestCase): print(ret) def test_lazycache(self): - Tensor.no_grad = True x = Tensor.rand(1, 32) y = Tensor.rand(32) out = x + y.reshape((1,32,1)).reshape((1,32)) + y.reshape((1,32,1)).reshape((1,32)) out.numpy() - Tensor.no_grad = False def test_simple_biased(self): C = 8 @@ -43,35 +41,28 @@ class TestConv(unittest.TestCase): print(ret.numpy()) def test_two_binops_no_rerun_small(self): - Tensor.no_grad = True x = Tensor.rand(1,1,32,32) w = Tensor.rand(1,1,3,3) out = x.conv2d(w, padding=(1,1)) np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0)) - Tensor.no_grad = False def 
test_two_binops_no_rerun(self): - Tensor.no_grad = True - x = Tensor.randn(1,12,128,256) + x = Tensor.randn(1,12,16,32) w = Tensor.randn(32,12,3,3) out = x.conv2d(w, stride=(2,2), padding=(1,1)) r1, r2 = out.relu(), (out-1) np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0)) np.testing.assert_allclose(r2.numpy(), out.numpy() - 1) - Tensor.no_grad = False def test_two_overlapping_binops_no_rerun(self): - Tensor.no_grad = True - x = Tensor.randn(1,12,128,256) + x = Tensor.randn(1,12,16,32) w = Tensor.randn(32,12,3,3) out = x.conv2d(w, stride=(2,2), padding=(1,1)) r1, r2 = out.relu(), out.elu() np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0)) np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5) - Tensor.no_grad = False def test_two_overlapping_binops_no_rerun_wino(self): - Tensor.no_grad = True with Context(WINO=1): x = Tensor.randn(1,4,16,16) w = Tensor.randn(6,4,3,3) @@ -79,11 +70,9 @@ class TestConv(unittest.TestCase): r1, r2 = out.relu(), out.elu() np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0)) np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5) - Tensor.no_grad = False def test_first_three(self): - Tensor.no_grad = True - x = Tensor.rand(1,12,128,256) + x = Tensor.rand(1,12,16,32) w = Tensor.rand(32,12,3,3) x = x.conv2d(w, stride=(2,2), padding=(1,1)).elu() @@ -96,11 +85,9 @@ class TestConv(unittest.TestCase): x = x.numpy() print(x.shape) - Tensor.no_grad = False def test_elu(self): - Tensor.no_grad = True - x = Tensor.rand(1,12,128,256) + x = Tensor.rand(1,12,16,32) w = Tensor.rand(32,12,3,3) x = x.conv2d(w, stride=(2,2), padding=(1,1)) @@ -110,25 +97,20 @@ class TestConv(unittest.TestCase): w = Tensor.rand(32,1,3,3) x = x.conv2d(w, padding=(1,1), groups=32) x.numpy() - Tensor.no_grad = False def test_reduce_relu(self): - Tensor.no_grad = True - x = Tensor.rand(1,12,128,256) + x = 
Tensor.rand(1,12,16,32) x = x.sum(keepdim=True).relu() x.numpy() - Tensor.no_grad = False def test_bias(self): - Tensor.no_grad = True from tinygrad.nn import Conv2d - x = Tensor.rand(1,12,128,256) + x = Tensor.rand(1,12,16,32) c = Conv2d(12, 32, 3) x = c(x).relu() w = Tensor.uniform(32, 1, 3, 3) x = x.conv2d(w, groups=32) x.numpy() - Tensor.no_grad = False def test_multiadd(self): w = Tensor.rand(32) @@ -136,14 +118,14 @@ class TestConv(unittest.TestCase): (w+x).numpy() def test_reorder(self): - x = Tensor.rand(1,12,128,256) + x = Tensor.rand(1,12,16,32) w = Tensor.rand(12,12,3,3) x = x.conv2d(w, padding=(1,1)) print(x.shape) - x = x.reshape((1, 12, 256, 128)) + x = x.reshape((1, 12, 32, 16)) x += 1 x += 1 - x = x.reshape((1, 12, 128, 256)) + x = x.reshape((1, 12, 16, 32)) x.numpy() if __name__ == '__main__': diff --git a/tinygrad_repo/test/test_conv_shapetracker.py b/tinygrad_repo/test/unit/test_conv_shapetracker.py similarity index 100% rename from tinygrad_repo/test/test_conv_shapetracker.py rename to tinygrad_repo/test/unit/test_conv_shapetracker.py diff --git a/tinygrad_repo/test/unit/test_disk_tensor.py b/tinygrad_repo/test/unit/test_disk_tensor.py index 1e11a85..74a8fee 100644 --- a/tinygrad_repo/test/unit/test_disk_tensor.py +++ b/tinygrad_repo/test/unit/test_disk_tensor.py @@ -117,6 +117,7 @@ class TestSafetensors(unittest.TestCase): for k in f.keys(): np.testing.assert_array_equal(f.get_tensor(k).numpy(), state_dict[k].numpy()) + @unittest.skip("this test takes 7 seconds. 
TODO: make disk assign lazy") def test_efficientnet_safetensors(self): from extra.models.efficientnet import EfficientNet model = EfficientNet(0) @@ -351,6 +352,7 @@ class TestDiskTensor(unittest.TestCase): on_dev = t.to(Device.DEFAULT).realize() np.testing.assert_equal(on_dev.numpy(), t.numpy()) + @unittest.skip("this allocates a lot of RAM") @unittest.skipUnless(OSX, "seems to only be an issue on macOS with file size >2 GiB") def test_copy_to_cpu_not_truncated(self): with open((fn:=temp("dt_copy_to_cpu_not_truncated")), "wb") as f: f.write(b'\x01' * (size := int(2 * 1024**3)) + (test := b"test")) diff --git a/tinygrad_repo/test/unit/test_dtype_spec.py b/tinygrad_repo/test/unit/test_dtype_spec.py new file mode 100644 index 0000000..0e78af1 --- /dev/null +++ b/tinygrad_repo/test/unit/test_dtype_spec.py @@ -0,0 +1,552 @@ +import unittest, math, operator, subprocess +from tinygrad.tensor import Tensor, dtypes, Device +from tinygrad.dtype import DType, DTYPES_DICT, truncate, truncate_fp16, truncate_bf16, _to_np_dtype, least_upper_dtype, least_upper_float +from tinygrad.device import is_dtype_supported +from tinygrad.helpers import getenv, CI, DEBUG +from hypothesis import given, settings, strategies as strat +import numpy as np +import torch +import ml_dtypes + +settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False)) +settings.load_profile("my_profile") + +core_dtypes = list(DTYPES_DICT.values()) +dtype_ints = [dt for dt in core_dtypes if dtypes.is_int(dt) and is_dtype_supported(dt)] +dtype_floats = [dt for dt in core_dtypes if dtypes.is_float(dt) and is_dtype_supported(dt)] + +FP8E4M3_MAX = 448.0 +FP8E5M2_MAX = 57344.0 + +def _assert_eq(tensor:Tensor, target_dtype:DType, target, tol_target_dtype:float=1e-7): + if DEBUG >= 2: print(tensor.numpy()) + try: + assert tensor.dtype == target_dtype + np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, 
dtypes.bfloat16:1e-2}.get(target_dtype, tol_target_dtype)) + except AssertionError as e: + raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e + +class TestHelpers(unittest.TestCase): + signed_ints = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64) + uints = (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64) + floats = (dtypes.float16, dtypes.float32, dtypes.float64) + + @given(strat.sampled_from(signed_ints+uints), strat.integers(min_value=1, max_value=8)) + def test_is_int(self, dtype, amt): + assert dtypes.is_int(dtype.vec(amt) if amt > 1 else dtype) + assert not dtypes.is_float(dtype.vec(amt) if amt > 1 else dtype) + + @given(strat.sampled_from(uints), strat.integers(min_value=1, max_value=8)) + def test_is_unsigned_uints(self, dtype, amt): + assert dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) + + @given(strat.sampled_from(signed_ints), strat.integers(min_value=1, max_value=8)) + def test_is_unsigned_signed_ints(self, dtype, amt): + assert not dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) + + @given(strat.sampled_from(floats), strat.integers(min_value=1, max_value=8)) + def test_is_float(self, dtype, amt): + assert dtypes.is_float(dtype.vec(amt) if amt > 1 else dtype) + assert not dtypes.is_int(dtype.vec(amt) if amt > 1 else dtype) + assert not dtypes.is_unsigned(dtype.vec(amt) if amt > 1 else dtype) + + def test_bf16_is_float(self): + assert dtypes.is_float(dtypes.bfloat16) + + def test_fp8s_are_float(self): + assert dtypes.is_float(dtypes.fp8e4m3) + assert dtypes.is_float(dtypes.fp8e5m2) + + @given(strat.sampled_from([d for d in DTYPES_DICT.values() if dtypes.is_float(d) or dtypes.is_int(d)]), strat.integers(min_value=2, max_value=8)) + def test_scalar(self, dtype, amt): + assert dtype.vec(amt).scalar() == dtype + + def test_from_py(self): + assert dtypes.from_py(True) == dtypes.bool + assert dtypes.from_py(2) == dtypes.default_int + 
assert dtypes.from_py(3.0) == dtypes.default_float + assert dtypes.from_py([]) == dtypes.default_float + assert dtypes.from_py(()) == dtypes.default_float + assert dtypes.from_py([True]) == dtypes.bool + assert dtypes.from_py([True, 2]) == dtypes.default_int + assert dtypes.from_py([True, 3.0]) == dtypes.default_float + assert dtypes.from_py([2, 3.0]) == dtypes.default_float + assert dtypes.from_py([True, 2, 3.0]) == dtypes.default_float + with self.assertRaises(RuntimeError): dtypes.from_py(None) + with self.assertRaises(RuntimeError): dtypes.from_py([None]) + with self.assertRaises(RuntimeError): dtypes.from_py({}) + with self.assertRaises(RuntimeError): dtypes.from_py(set()) + + def test_dtype_range(self): + for dt in core_dtypes: + if dtypes.is_float(dt): + np.testing.assert_equal(dtypes.min(dt), -math.inf) + np.testing.assert_equal(dtypes.max(dt), math.inf) + np.testing.assert_equal(dt.min, -math.inf) + np.testing.assert_equal(dt.max, math.inf) + elif dtypes.is_int(dt): + info = np.iinfo(_to_np_dtype(dt)) + np.testing.assert_equal(dtypes.min(dt), info.min) + np.testing.assert_equal(dtypes.max(dt), info.max) + np.testing.assert_equal(dt.min, info.min) + np.testing.assert_equal(dt.max, info.max) + else: + assert dt == dtypes.bool, dt + np.testing.assert_equal(dtypes.min(dt), False) + np.testing.assert_equal(dtypes.max(dt), True) + np.testing.assert_equal(dt.min, False) + np.testing.assert_equal(dt.max, True) + + def test_truncate_fp16(self): + self.assertEqual(truncate_fp16(1), 1) + self.assertEqual(truncate_fp16(65504), 65504) + self.assertEqual(truncate_fp16(65519.999), 65504) + self.assertEqual(truncate_fp16(65520), math.inf) + + def test_truncate_bf16(self): + self.assertEqual(truncate_bf16(1), 1) + self.assertAlmostEqual(truncate_bf16(1.1), 1.09375, places=7) + for a in [1234, 23456, -777.777]: + self.assertEqual(truncate_bf16(a), torch.tensor([a], dtype=torch.bfloat16).item()) + # TODO: torch bfloat 1.1 gives 1.1015625 instead of 1.09375 + max_bf16 = 
torch.finfo(torch.bfloat16).max + self.assertEqual(truncate_bf16(max_bf16), max_bf16) + self.assertEqual(truncate_bf16(min_bf16:=-max_bf16), min_bf16) + self.assertEqual(truncate_bf16(max_bf16 * 1.00001), math.inf) + self.assertEqual(truncate_bf16(min_bf16 * 1.00001), -math.inf) + + @given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True)) + def test_truncate_fp8e4m3(self, x): + if x > FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), FP8E4M3_MAX) + elif x < -FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), -FP8E4M3_MAX) + else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), ml_dtypes.float8_e4m3fn(x)) + + @given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True)) + def test_truncate_fp8e5m2(self, x): + if x > FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), FP8E5M2_MAX) + elif x < -FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), -FP8E5M2_MAX) + else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), ml_dtypes.float8_e5m2(x)) + +class TestTypeSpec(unittest.TestCase): + def setUp(self): + self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float + def tearDown(self): + dtypes.default_int, dtypes.default_float = self.old_default_int, self.old_default_float + + def test_set_dtype_default(self): + for default_int in [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64]: + dtypes.default_int = default_int + assert dtypes.default_int == default_int + + for default_float in [*dtypes.fp8s, dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: + dtypes.default_float = default_float + assert dtypes.default_float == default_float + + @unittest.skip("this test is slow and spawning whole pythons") + def test_env_set_default_float(self): + # check default + subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'], + shell=True, check=True) + # check 
change + subprocess.run(['DEFAULT_FLOAT=HALF python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.half"'], + shell=True, check=True) + # check invalid + with self.assertRaises(subprocess.CalledProcessError): + subprocess.run(['DEFAULT_FLOAT=INT32 python3 -c "from tinygrad import dtypes"'], + shell=True, check=True) + + with self.assertRaises(subprocess.CalledProcessError): + subprocess.run(['DEFAULT_FLOAT=TYPO python3 -c "from tinygrad import dtypes"'], + shell=True, check=True) + + @unittest.skipUnless(is_dtype_supported(dtypes.int8), f"no int8 on {Device.DEFAULT}") + def test_dtype_str_arg(self): + n = np.random.normal(0, 1, (10, 10)).astype(np.float32) + tested = 0 + for dtype_str, dtype in [ + ("bool", dtypes.bool), ("int8", dtypes.int8), ("int", dtypes.int), ("uint32", dtypes.uint32), ("float32", dtypes.float32)]: + np.testing.assert_equal(Tensor(n, dtype=dtype_str).numpy(), Tensor(n, dtype=dtype).numpy()) + np.testing.assert_equal(Tensor(n).cast(dtype_str).numpy(), Tensor(n).cast(dtype).numpy()) + if dtype.itemsize == 4: + np.testing.assert_equal(Tensor(n).bitcast(dtype_str).numpy(), Tensor(n).bitcast(dtype).numpy()) + tested += 1 + assert tested == 3 + + with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="nonexistdtype") + with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="") + + np.testing.assert_equal(Tensor(n).sum(dtype="int16").numpy(), Tensor(n).sum(dtype=dtypes.int16).numpy()) + + @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_creation(self, default_int, default_float): + dtypes.default_int, dtypes.default_float = default_int, default_float + _assert_eq(Tensor(True), dtypes.bool, True) + _assert_eq(Tensor(None), dtypes.default_float, []) + _assert_eq(Tensor(2), dtypes.default_int, 2) + _assert_eq(Tensor(2.34), dtypes.default_float, 2.34) + _assert_eq(Tensor([]), dtypes.default_float, []) + _assert_eq(Tensor([1]), dtypes.default_int, [1]) + 
_assert_eq(Tensor([1.1]), dtypes.default_float, [1.1]) + + _assert_eq(Tensor.eye(0), dtypes.default_float, np.eye(0)) + _assert_eq(Tensor.eye(3), dtypes.default_float, np.eye(3)) + if is_dtype_supported(dtypes.int64): + _assert_eq(Tensor.eye(3, dtype=dtypes.int64), dtypes.int64, np.eye(3)) + if is_dtype_supported(dtypes.float16): + _assert_eq(Tensor.eye(3, dtype=dtypes.float16), dtypes.float16, np.eye(3)) + + @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_full(self, default_int, default_float): + dtypes.default_int, dtypes.default_float = default_int, default_float + + _assert_eq(Tensor.zeros((2, 3)), dtypes.default_float, np.zeros((2, 3))) + if is_dtype_supported(dtypes.int64): + _assert_eq(Tensor.zeros((2, 3), dtype=dtypes.int64), dtypes.int64, np.zeros((2, 3))) + if is_dtype_supported(dtypes.float16): + _assert_eq(Tensor.zeros((2, 3), dtype=dtypes.float16), dtypes.float16, np.zeros((2, 3))) + + _assert_eq(Tensor.ones((2, 3)), dtypes.default_float, np.ones((2, 3))) + if is_dtype_supported(dtypes.int64): + _assert_eq(Tensor.ones((2, 3), dtype=dtypes.int64), dtypes.int64, np.ones((2, 3))) + if is_dtype_supported(dtypes.float16): + _assert_eq(Tensor.ones((2, 3), dtype=dtypes.float16), dtypes.float16, np.ones((2, 3))) + + _assert_eq(Tensor.full((2, 3), 3.0), dtypes.default_float, np.full((2, 3), 3.0)) + _assert_eq(Tensor.full((2, 3), 3), dtypes.default_int, np.full((2, 3), 3)) + _assert_eq(Tensor.full((2, 3), True), dtypes.bool, np.full((2, 3), True)) + if is_dtype_supported(dtypes.int64): + _assert_eq(Tensor.full((2, 3), 3, dtype=dtypes.int64), dtypes.int64, np.full((2, 3), 3)) + _assert_eq(Tensor.full((2, 3), 3.0, dtype=dtypes.int64), dtypes.int64, np.full((2, 3), 3)) + if is_dtype_supported(dtypes.float16): + _assert_eq(Tensor.full((2, 3), 3, dtype=dtypes.float16), dtypes.float16, np.full((2, 3), 3)) + _assert_eq(Tensor.full((2, 3), 3.0, dtype=dtypes.float16), dtypes.float16, np.full((2, 3), 3)) + + 
@given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_reduce_0d_default(self, default_int, default_float): + dtypes.default_int, dtypes.default_float = default_int, default_float + _assert_eq(Tensor.ones((2,3,0)).sum(2), dtypes.default_float, np.zeros((2, 3))) + # TODO: what should this one be? + # _assert_eq(Tensor.ones((2,3,0), dtype=dtypes.default_int).sum(2), dtypes.default_int, np.zeros((2, 3))) + _assert_eq(Tensor.ones((2,3,0), dtype=dtypes.int32).sum(2), dtypes.int32, np.zeros((2, 3))) + + @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_arange(self, default_int, default_float): + dtypes.default_int, dtypes.default_float = default_int, default_float + + _assert_eq(Tensor.arange(5), dtypes.default_int, np.arange(5)) + _assert_eq(Tensor.arange(120), dtypes.default_int, np.arange(120)) + _assert_eq(Tensor.arange(5.0), dtypes.default_float, np.arange(5)) + if is_dtype_supported(dtypes.int16): + _assert_eq(Tensor.arange(5, dtype=dtypes.int16), dtypes.int16, np.arange(5)) + if is_dtype_supported(dtypes.int64): + _assert_eq(Tensor.arange(5, dtype=dtypes.int64), dtypes.int64, np.arange(5)) + if is_dtype_supported(dtypes.float16): + _assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5)) + _assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7), 1e-6 if Device.DEFAULT == "WEBGPU" else 1e-7) + _assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3)) + # stop-start and step have different signs + _assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2)) + _assert_eq(Tensor.arange(5.0, 3.0), dtypes.default_float, np.arange(5.0, 3.0)) + + @given(strat.sampled_from(core_dtypes), strat.sampled_from([operator.gt, operator.ge, operator.le, operator.lt, operator.eq, operator.ne])) + def test_bool_ops(self, dtype, op): + assert op(Tensor.ones(4, 4, dtype=dtype), Tensor.ones(4, 4, dtype=dtype)).dtype == dtypes.bool + + 
@given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_functions_return_index(self, dtype, default_int, default_float): + dtypes.default_int, dtypes.default_float = default_int, default_float + assert Tensor([0, 1], dtype=dtype).argmax().dtype == dtypes.int32 + assert Tensor([0, 1], dtype=dtype).argmin().dtype == dtypes.int32 + assert Tensor([0, 1], dtype=dtype).multinomial().dtype == dtypes.int32 + + @given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints)) + def test_tensor_indexing_returns_same_dtype(self, data_dtype, indices_dtype): + X_data = Tensor.ones(60000, 1, 28, 28, dtype=data_dtype) + indices = Tensor.randint(512, high=X_data.shape[0]).cast(indices_dtype) + assert X_data[indices].dtype == X_data.dtype + + @given(strat.sampled_from(core_dtypes), strat.sampled_from(dtype_ints)) + def test_gather_returns_same_dtype(self, data_dtype, indices_dtype): + X_data = Tensor([[1, 0], [0, 1]], dtype=data_dtype) + indices = Tensor([[0, 0], [1, 0]], dtype=indices_dtype) + assert X_data.gather(0, indices).dtype == X_data.dtype + assert X_data.gather(1, indices).dtype == X_data.dtype + + @given(strat.sampled_from(dtype_floats), strat.sampled_from(dtype_floats)) + def test_attention_returns_same_dtype(self, data_dtype, default_float): + dtypes.default_float = default_float + query = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) + key = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) + value = Tensor.rand(32, 8, 128, 64, dtype=data_dtype) + mask = (Tensor.rand(32, 8, 128, 128) < 0.5) + assert query.scaled_dot_product_attention(key, value, is_causal=True).dtype == data_dtype + assert query.scaled_dot_product_attention(key, value, is_causal=True, dropout_p=0.3).dtype == data_dtype + assert query.scaled_dot_product_attention(key, value, is_causal=False).dtype == data_dtype + assert query.scaled_dot_product_attention(key, value, attn_mask=mask).dtype == data_dtype + +class 
TestTypePromotion(unittest.TestCase): + @given(strat.sampled_from(core_dtypes)) + def test_self_promo_to_self(self, dtype): + assert least_upper_dtype(dtype) == dtype + assert least_upper_dtype(dtype, dtype) == dtype + assert least_upper_dtype(dtype, dtype, dtype) == dtype + + @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) + def test_promo_resulted_higher_than_inputs(self, dtype1, dtype2): + result = least_upper_dtype(dtype1, dtype2) + assert not (result < dtype1) and not (result < dtype2) + + def test_dtype_promo(self): + assert least_upper_dtype(dtypes.bool, dtypes.int8) == dtypes.int8 + assert least_upper_dtype(dtypes.int8, dtypes.uint8) == dtypes.int16 + assert least_upper_dtype(dtypes.uint8, dtypes.int16) == dtypes.int16 + assert least_upper_dtype(dtypes.int16, dtypes.uint16) == dtypes.int32 + assert least_upper_dtype(dtypes.uint16, dtypes.int32) == dtypes.int32 + assert least_upper_dtype(dtypes.int32, dtypes.uint32) == dtypes.int64 + assert least_upper_dtype(dtypes.uint32, dtypes.int64) == dtypes.int64 + # similar to jax but we don't use weak type + assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.float16 + assert least_upper_dtype(dtypes.float16, dtypes.float32) == dtypes.float32 + assert least_upper_dtype(dtypes.float32, dtypes.float64) == dtypes.float64 + + assert least_upper_dtype(dtypes.bool, dtypes.float32) == dtypes.float32 + assert least_upper_dtype(dtypes.bool, dtypes.float64) == dtypes.float64 + assert least_upper_dtype(dtypes.float16, dtypes.int64) == dtypes.float16 + assert least_upper_dtype(dtypes.float16, dtypes.uint64) == dtypes.float16 + assert least_upper_dtype(dtypes.fp8e4m3, dtypes.fp8e5m2) == dtypes.half + +class TestAutoCastType(unittest.TestCase): + def setUp(self): + self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float + def tearDown(self): + dtypes.default_int, dtypes.default_float = self.old_default_int, self.old_default_float + + 
@given(strat.sampled_from(dtype_floats), strat.sampled_from(dtype_floats)) + def test_least_upper_float_input_is_float(self, input_dtype, default_float): + dtypes.default_float = default_float + self.assertEqual(least_upper_float(input_dtype), input_dtype) + + @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats)) + def test_least_upper_float_input_is_int(self, input_dtype, default_float): + dtypes.default_float = default_float + self.assertEqual(least_upper_float(input_dtype), default_float) + + @given(strat.sampled_from([d for d in core_dtypes if dtypes.is_int(d) and is_dtype_supported(d)])) + def test_int_to_float_unary_func(self, dtype): + for func in [ + lambda t: t.exp(), + lambda t: t.exp2(), + lambda t: t.log(), + lambda t: t.log2(), + lambda t: t.sqrt(), + lambda t: t.rsqrt(), + lambda t: t.sin(), + lambda t: t.cos(), + lambda t: t.tan(), + lambda t: t.sigmoid(), + ]: + a = [2, 3, 4] + # float16 can have larger precision errors + np.testing.assert_allclose(func(Tensor(a, dtype=dtype)).numpy(), func(torch.tensor(a)), rtol=1e-3, atol=1e-3) + + @given(strat.sampled_from(core_dtypes)) + def test_broadcast_scalar(self, dt): + assert (Tensor.ones(4, 4, dtype=dt) + 2.3).dtype == (dt if dtypes.is_float(dt) else dtypes.default_float) + assert (Tensor.ones(4, 4, dtype=dt) + 2).dtype == (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int) + assert (Tensor.ones(4, 4, dtype=dt) + True).dtype == dt + + @given(strat.sampled_from(dtype_floats)) + def test_int_div_int(self, default_float): + dtypes.default_float = default_float + self.assertEqual(Tensor([1]).div(Tensor([2])).dtype, default_float) + + def test_sum(self): + assert (Tensor([0, 1], dtype=dtypes.bool)).sum().dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int8)).sum().dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int16)).sum().dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int32)).sum().dtype == dtypes.int32 + assert (Tensor([0, 1], 
dtype=dtypes.int64)).sum().dtype == dtypes.int64 + assert (Tensor([0, 1], dtype=dtypes.uint8)).sum().dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint16)).sum().dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint32)).sum().dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint64)).sum().dtype == dtypes.uint64 + assert (Tensor([0, 1], dtype=dtypes.float16)).sum().dtype == dtypes.float16 + #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).sum().dtype == dtypes.bfloat16 + assert (Tensor([0, 1], dtype=dtypes.float32)).sum().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.float64)).sum().dtype == dtypes.float64 + + @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16") + def test_sum_dtype_arg(self): + t = Tensor([40000, 40000], dtype=dtypes.float16) + # default float16 sum returns in float16, overflowed in this case + assert t.sum().dtype == dtypes.float16 + assert math.isinf(t.sum().numpy().item()) + # specifiying dtype and it's not downcasted + assert t.sum(dtype=dtypes.float32).dtype == dtypes.float32 + np.testing.assert_allclose(t.sum(dtype=dtypes.float32).numpy(), 80000) + + def test_prod_dtype_arg(self): + t = Tensor([100, 200], dtype=dtypes.int32) + assert t.prod().dtype == dtypes.int32 + np.testing.assert_allclose(t.prod().numpy(), 20000) + assert t.prod(dtype=dtypes.float32).dtype == dtypes.float32 + np.testing.assert_allclose(t.prod(dtype=dtypes.float32).numpy(), 20000) + + def test_mean(self): + assert (Tensor([0, 1], dtype=dtypes.bool)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.int8)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.int16)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.int32)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.int64)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.uint8)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], 
dtype=dtypes.uint16)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.uint32)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.uint64)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.float16)).mean().dtype == dtypes.float16 + #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).mean().dtype == dtypes.bfloat16 + assert (Tensor([0, 1], dtype=dtypes.float32)).mean().dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.float64)).mean().dtype == dtypes.float64 + + def test_cumsum(self): + assert (Tensor([0, 1], dtype=dtypes.bool)).cumsum(0).dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int8)).cumsum(0).dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int16)).cumsum(0).dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int32)).cumsum(0).dtype == dtypes.int32 + assert (Tensor([0, 1], dtype=dtypes.int64)).cumsum(0).dtype == dtypes.int64 + assert (Tensor([0, 1], dtype=dtypes.uint8)).cumsum(0).dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint16)).cumsum(0).dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint32)).cumsum(0).dtype == dtypes.uint32 + assert (Tensor([0, 1], dtype=dtypes.uint64)).cumsum(0).dtype == dtypes.uint64 + assert (Tensor([0, 1], dtype=dtypes.float16)).cumsum(0).dtype == dtypes.float16 + #assert (Tensor([0, 1], dtype=dtypes.bfloat16)).cumsum(0).dtype == dtypes.bfloat16 + assert (Tensor([0, 1], dtype=dtypes.float32)).cumsum(0).dtype == dtypes.float32 + assert (Tensor([0, 1], dtype=dtypes.float64)).cumsum(0).dtype == dtypes.float64 + + @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) + def test_matmul(self, dt1, dt2, acc_dt): + t1 = Tensor([0, 1], dtype=dt1) + t2 = Tensor([0, 1], dtype=dt2) + self.assertEqual(t1.matmul(t2).dtype, least_upper_dtype(t1.dtype, t2.dtype)) + # if dtype is specified, return in dtype + self.assertEqual(t1.matmul(t2, dtype=acc_dt).dtype, acc_dt) + + 
@given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) + def test_linear(self, dt1, dt2, dt3, acc_dt): + x = Tensor([0, 1], dtype=dt1) + w = Tensor([0, 1], dtype=dt2) + b = Tensor([0, 1], dtype=dt3) + self.assertEqual(x.linear(w).dtype, least_upper_dtype(x.dtype, w.dtype)) + self.assertEqual(x.linear(w, b).dtype, least_upper_dtype(least_upper_dtype(x.dtype, w.dtype), b.dtype)) + # if dtype is specified, return in dtype + self.assertEqual(x.linear(w, dtype=acc_dt).dtype, acc_dt) + self.assertEqual(x.linear(w, b, dtype=acc_dt).dtype, acc_dt) + + @staticmethod + def check_where_alternate_input_other(input_, other, data_type): + assert (Tensor([True, False]).where(input_, other)).dtype == data_type + assert (Tensor([True, False]).where(other, input_)).dtype == data_type + + @given(strat.sampled_from(core_dtypes), strat.sampled_from(core_dtypes)) + def test_where_no_scalar(self, dt1, dt2): + self.check_where_alternate_input_other(Tensor(2, dtype=dt1), Tensor(3, dtype=dt2), least_upper_dtype(dt1, dt2)) + + @given(strat.sampled_from(core_dtypes)) + def test_where_one_scalar(self, dt): + t = Tensor(2, dtype=dt) + self.check_where_alternate_input_other(t, 3.2, (dt if dtypes.is_float(dt) else dtypes.default_float)) + self.check_where_alternate_input_other(t, 3, (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int)) + self.check_where_alternate_input_other(t, True, dt) + + def test_where_two_scalars(self): + self.check_where_alternate_input_other(3.1, 3.2, dtypes.default_float) + self.check_where_alternate_input_other(3.1, 3, dtypes.default_float) + self.check_where_alternate_input_other(3.1, True, dtypes.default_float) + self.check_where_alternate_input_other(3, 2, dtypes.default_int) + self.check_where_alternate_input_other(3, True, dtypes.default_int) + self.check_where_alternate_input_other(False, True, dtypes.bool) + + @given(strat.sampled_from(core_dtypes), 
strat.sampled_from(core_dtypes)) + def test_maximum(self, dt1, dt2): + assert Tensor([0, 1, 2], dtype=dt1).maximum(Tensor([2, 0, 5], dtype=dt2)).dtype == least_upper_dtype(dt1, dt2) + + @given(strat.sampled_from(core_dtypes)) + def test_maximum_const(self, dt): + assert Tensor([1, 2], dtype=dt).maximum(3.1).dtype == (dt if dtypes.is_float(dt) else dtypes.default_float) + assert Tensor([1, 2], dtype=dt).maximum(3).dtype == (dt if dtypes.is_float(dt) or dtypes.is_int(dt) else dtypes.default_int) + assert Tensor([1, 2], dtype=dt).maximum(True).dtype == dt + + def test_div(self): + assert (Tensor([1, 2], dtype=dtypes.int32) / Tensor([2, 2], dtype=dtypes.int32)).dtype == dtypes.default_float + assert (Tensor([1, 2], dtype=dtypes.int16) / Tensor([2, 2], dtype=dtypes.int32)).dtype == dtypes.default_float + assert (Tensor([1, 2], dtype=dtypes.float32) / Tensor([2, 2], dtype=dtypes.float16)).dtype == dtypes.float32 + assert (Tensor([1, 2], dtype=dtypes.int32) / Tensor([2, 2], dtype=dtypes.float16)).dtype == dtypes.float16 + + def test_div_const(self): + assert (Tensor([1, 2], dtype=dtypes.int32) / 2).dtype == dtypes.default_float + assert (Tensor([1, 2], dtype=dtypes.int32) / 2.0).dtype == dtypes.default_float + assert (Tensor([1, 2], dtype=dtypes.float16) / 2).dtype == dtypes.float16 + assert (Tensor([1, 2], dtype=dtypes.float16) / 2.0).dtype == dtypes.float16 + + def test_gradient_dtype(self): + old_default_float = dtypes.default_float + + for default_dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: + if not is_dtype_supported(default_dtype): continue + dtypes.default_float = default_dtype + for dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: + if not is_dtype_supported(dtype): continue + if DEBUG >= 2: + print(f"testing {default_dtype=}, {dtype=}") + a = Tensor([1, 2, 3], dtype=dtype, requires_grad=True) + b = (a * 5).sum() + b.backward() # if there is dtype mismatch, lazy should assert + assert a.grad.dtype == 
a.dtype + np.testing.assert_allclose(a.grad.numpy(), [5, 5, 5]) + + dtypes.default_float = old_default_float + + @unittest.skipIf(CI, "TODO: broken RuntimeError: Attempting to relocate against an undefined symbol 'fmaxf'") + @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") + def test_backward_sum_acc_dtype(self): + # test acc of sum in the backward is upcasted to float + t = Tensor([5, -5], dtype=dtypes.half, requires_grad=True) + t.reshape(2, 1).expand(2, 10001).max().backward() + np.testing.assert_allclose(t.grad.numpy(), [1, 0]) + + @unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow") + @unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow") + @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size") + @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") + def test_mean_half_precision_underflow(self): + N = 10000 + x = 0.001 + t = Tensor([[x]], dtype=dtypes.half, requires_grad=True).expand(N, N).contiguous() + np.testing.assert_allclose(t.mean(axis=1).numpy(), np.array([x] * N, dtype=np.float16), rtol=1e-3) + + @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") + def test_mean_half_precision_overflow(self): + N = 256 + t = Tensor([60000] * N*N, dtype=dtypes.half, requires_grad=True).reshape(N, N) + np.testing.assert_allclose(t.mean().numpy(), 60000) + t.square().mean().backward() + np.testing.assert_allclose(t.grad.numpy().flatten(), [60000 * 2 / (N*N)] * N*N) + + @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Precision error") + @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half") + def test_softmax_dtype(self): + data = [1, 2, 3] + t = Tensor(data, dtype=dtypes.half) + tt = torch.tensor(data, dtype=torch.half) + + out = t.softmax(0) + self.assertEqual(out.dtype, dtypes.half) + np.testing.assert_allclose(out.numpy(), tt.softmax(0).numpy(), rtol=1e-3) + out = t.softmax(0, dtype=dtypes.float) + self.assertEqual(out.dtype, dtypes.float) 
+ np.testing.assert_allclose(out.numpy(), tt.softmax(0, dtype=torch.float).numpy(), rtol=1e-3) + out = t.log_softmax(0) + self.assertEqual(out.dtype, dtypes.half) + np.testing.assert_allclose(out.numpy(), tt.log_softmax(0).numpy(), rtol=1e-3) + out = t.log_softmax(0, dtype=dtypes.float) + self.assertEqual(out.dtype, dtypes.float) + np.testing.assert_allclose(out.numpy(), tt.log_softmax(0, dtype=torch.float).numpy(), rtol=1e-3) \ No newline at end of file diff --git a/tinygrad_repo/test/unit/test_gradient.py b/tinygrad_repo/test/unit/test_gradient.py index 20980e3..2739fe9 100644 --- a/tinygrad_repo/test/unit/test_gradient.py +++ b/tinygrad_repo/test/unit/test_gradient.py @@ -1,6 +1,7 @@ from typing import Callable import unittest, math import torch +import numpy as np from tinygrad import Tensor from tinygrad.dtype import dtypes from tinygrad.uop.ops import UOp @@ -112,16 +113,16 @@ class TestTensorGradient(unittest.TestCase): class TestRealizeMeansRealize(unittest.TestCase): def test_randn_realizes(self): x = Tensor.randn(2, 3, 64, 64, requires_grad=True).realize() - assert x.lazydata is not x.lazydata.base - assert x.lazydata.is_realized + assert x.uop is not x.uop.base + assert x.uop.is_realized #@unittest.expectedFailure # update: passing after delete_forced_realize def test_uniform_realizes(self): x = Tensor.uniform(16, 3, 3, 3, requires_grad=True).realize() - print(x.lazydata) - assert x.lazydata is not x.lazydata.base - assert x.lazydata.is_realized + print(x.uop) + assert x.uop is not x.uop.base + assert x.uop.is_realized # NOTE: even though it doesn't realize, this seems fine def test_uniform_gradient(self): @@ -129,5 +130,18 @@ class TestRealizeMeansRealize(unittest.TestCase): y = x * 2 y.sum().gradient(x)[0].realize() +class TestViewGradient(unittest.TestCase): + def test_expand(self): + # this test shows that if Tensors collapse to the views and create a disconnected graph + # there's no way to recover the proper gradient + x = Tensor.randn(5,2) + a = 
Tensor([3.], requires_grad=True) + aex = a.expand(10) + (aex.reshape(5,2) * x).sum().backward() + np.testing.assert_allclose(aex.grad.numpy(), x.reshape(10).numpy()) + # NOTE: aex.grad is *not* a.grad.expand(10)! + with self.assertRaises(AssertionError): + np.testing.assert_allclose(aex.grad.numpy(), a.grad.expand(10).numpy()) + if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/unit/test_graph_rewrite.py b/tinygrad_repo/test/unit/test_graph_rewrite.py index 97816f2..327140b 100644 --- a/tinygrad_repo/test/unit/test_graph_rewrite.py +++ b/tinygrad_repo/test/unit/test_graph_rewrite.py @@ -204,7 +204,7 @@ class TestGEPAndVectorizeRewrite(unittest.TestCase): import inspect from tinygrad.uop.ops import graph_rewrite, _substitute, track_rewrites -from tinygrad.codegen.symbolic import symbolic_simple +from tinygrad.uop.symbolic import symbolic_simple class TestBottomUpRewrite(unittest.TestCase): def test_const_folding(self): @@ -253,6 +253,7 @@ class TestSubstitute(unittest.TestCase): # broken due to infinite recursion # NOTE: VIZ hangs and doesn't recover if you click this one + @unittest.skip("recursion error no longer raised") def test_assert_inf_recurse(self): a = UOp.variable('a', 0, 10) n1 = a.sin() @@ -275,6 +276,13 @@ class TestSubstitute(unittest.TestCase): ret = substitute(ret, {a.sin():a.sqrt(), n1.sin():n1.sqrt()}) self.assertIs(ret, a.sqrt().sqrt()) + def test_tagged_replace(self): + a = UOp.variable('a', 0, 10) + b = UOp.variable('b', 0, 10) + ret = (a+4).replace(tag=1) + ret = substitute(ret, {a:b}) + # the srcs are rewritten but we keep tag + self.assertIs(ret, (b+4).replace(tag=1)) if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/unit/test_helpers.py b/tinygrad_repo/test/unit/test_helpers.py index e820e31..b9c0616 100644 --- a/tinygrad_repo/test/unit/test_helpers.py +++ b/tinygrad_repo/test/unit/test_helpers.py @@ -1,6 +1,6 @@ import gzip, unittest from tinygrad import Variable -from tinygrad.helpers 
import Context, ContextVar +from tinygrad.helpers import Context, ContextVar, argfix from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv, polyN, time_to_str, cdiv, cmod, getbits from tinygrad.tensor import get_shape from tinygrad.codegen.lowerer import get_contraction, get_contraction_with_reduce @@ -352,5 +352,16 @@ class TestGetBits(unittest.TestCase): def test_single_bit(self): self.assertEqual(getbits(0b100000000, 8, 8), 1) +class TestArgFix(unittest.TestCase): + def test_none(self): + self.assertEqual(argfix(None), (None, )) + self.assertEqual(argfix(None, None), (None, None)) + def test_positional_arguments(self): + self.assertEqual(argfix(1, 2, 3), (1, 2, 3)) + def test_tuple(self): + self.assertEqual(argfix((1., 2., 3.)), (1., 2., 3.)) + def test_list(self): + self.assertEqual(argfix([True, False]), (True, False)) + if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/imported/test_indexing.py b/tinygrad_repo/test/unit/test_indexing.py similarity index 98% rename from tinygrad_repo/test/imported/test_indexing.py rename to tinygrad_repo/test/unit/test_indexing.py index 271aeb5..da5d619 100644 --- a/tinygrad_repo/test/imported/test_indexing.py +++ b/tinygrad_repo/test/unit/test_indexing.py @@ -21,18 +21,18 @@ def consec(shape, start=1): # creates strided tensor with base set to reference tensor's base, equivalent to torch.set_() def set_(reference: Tensor, shape, strides, offset): - raise NotImplementedError("need to implement without calling lazydata.view") - if reference.lazydata.base.realized is None: reference.realize() - assert reference.lazydata.base.realized, "base has to be realized before setting it to strided's base" - strided = Tensor(reference.lazydata.view(ShapeTracker((View.create(shape=shape, strides=strides, offset=offset),)))) - assert strided.lazydata.st.real_strides() == strides, "real_strides should equal strides for strided" + raise NotImplementedError("need to 
implement without calling uop.view") + if reference.uop.base.realized is None: reference.realize() + assert reference.uop.base.realized, "base has to be realized before setting it to strided's base" + strided = Tensor(reference.uop.view(ShapeTracker((View.create(shape=shape, strides=strides, offset=offset),)))) + assert strided.uop.st.real_strides() == strides, "real_strides should equal strides for strided" return strided def clone(original:Tensor): return original.clone() def copy_(src:Tensor, other:Tensor) -> Tensor: return src.clone() # this is fine for tested usecases since as geohotstan understands, # data_ptr is used to compare if operations needed between tensors is the same -def data_ptr(tensor:Tensor): return tensor.lazydata +def data_ptr(tensor:Tensor): return tensor.uop # https://pytorch.org/docs/stable/generated/torch.Tensor.index_put_.html def index_put_(tensor:Tensor, indices, values, accumulate) -> Tensor: @@ -971,9 +971,9 @@ class TestIndexing(unittest.TestCase): numpy_testing_assert_equal_helper((2, 0, 4), z.shape) # this isn't technically necessary, but matches NumPy stride calculations. 
# NOTE: this is empty and shouldn't have strides - #numpy_testing_assert_equal_helper((60, 20, 5), z.lazydata.st.real_strides()) + #numpy_testing_assert_equal_helper((60, 20, 5), z.uop.st.real_strides()) # NOTE tinygrad's int slicing implementation makes this not contiguous - # self.assertTrue(z.lazydata.st.contiguous) + # self.assertTrue(z.uop.st.contiguous) @unittest.skip("bool indexing not supported") def test_index_getitem_copy_bools_slices(self): diff --git a/tinygrad_repo/test/unit/test_keccak.py b/tinygrad_repo/test/unit/test_keccak.py new file mode 100644 index 0000000..c32a74f --- /dev/null +++ b/tinygrad_repo/test/unit/test_keccak.py @@ -0,0 +1,42 @@ +from typing_extensions import Callable +import hashlib, random, unittest +from tinygrad import Tensor, Device, getenv, dtypes +from tinygrad.device import is_dtype_supported + +@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64") +@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI") +class TestKeccak(unittest.TestCase): + def setUp(self) -> None: random.seed(1337) + + def test_shape_keeping(self): + s = (1, 2, 3, 4) + for i in range(len(s)): + out_shape = Tensor.randint(*s[i:], high=255, dtype=dtypes.uint8).keccak().shape + self.assertTupleEqual(s[i:-1], out_shape[:-1]) + + def test_sha3_224(self): self._test_preset("sha3_224", [143, 144]) + def test_sha3_256(self): self._test_preset("sha3_256", [135, 136]) + def test_shake_128(self): self._test_preset("shake_128", [167, 168], lambda d: hashlib.shake_128(d).digest(16)) + + def _test_preset(self, name: str, special_sizes: list[int], hasher: Callable[[bytes], bytes] | None = None): + def default_hasher(d: bytes) -> bytes: return getattr(hashlib, name)(d).digest() + if hasher is None: hasher = default_hasher + + for n in (special_sizes + [special_sizes[0] - 1]): + a, b = random.randbytes(n), random.randbytes(n) + + ha_ref, hb_ref = hasher(a), hasher(b) 
+ tres = Tensor.stack(*(Tensor(d) for d in (a, b))).keccak(name) + ha, hb = tres[0].data(), tres[1].data() + + self.assertEqual(ha_ref, ha) + self.assertEqual(ha_ref, Tensor(a).keccak(name).data()) + self.assertEqual(hb_ref, hb) + + def test_abc(self): + # https://www.di-mgt.com.au/sha_testvectors.html + out = Tensor(b"abc").keccak() + self.assertEqual(bytes(out.tolist()), bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532")) + +if __name__ == "__main__": + unittest.main() diff --git a/tinygrad_repo/test/unit/test_kernelize.py b/tinygrad_repo/test/unit/test_kernelize.py new file mode 100644 index 0000000..2b0bcee --- /dev/null +++ b/tinygrad_repo/test/unit/test_kernelize.py @@ -0,0 +1,33 @@ +import unittest +from tinygrad import Tensor +from tinygrad.uop import Ops + +class TestKernelize(unittest.TestCase): + def test_add_reshaped(self): + a = Tensor.ones(16,16).contiguous() + b = Tensor.zeros(16,16).contiguous() + ret = (a+b).sum(axis=1) + ret_reshaped_1 = ret.reshape(4,4) + ret_reshaped_2 = ret.reshape(2,8) + ret.kernelize() + self.assertIs(ret_reshaped_1.uop.src[0], ret_reshaped_2.uop.src[0]) + + def test_two_reduce(self): + a = Tensor.ones(16,16).contiguous() + a1 = a.sum(axis=1) + a0 = a1.sum(axis=0) + a0.kernelize() + self.assertIs(a1.uop.base.op, Ops.ASSIGN) + + def test_two_reduce_w_add(self): + a = Tensor.ones(16,16).contiguous() + a1 = a.sum(axis=1) + a0 = (a1+1).sum(axis=0) + a0.kernelize() + # NOTE: the +1 is fused with a1, so a1 is not kernelized + self.assertIs(a1.uop.base.op, Ops.REDUCE_AXIS) + # the input to the REDUCE_AXIS is an ASSIGN though + self.assertIs(a1.uop.base.src[0].base.op, Ops.ASSIGN) + +if __name__ == '__main__': + unittest.main() diff --git a/tinygrad_repo/test/test_masked_st.py b/tinygrad_repo/test/unit/test_masked_st.py similarity index 82% rename from tinygrad_repo/test/test_masked_st.py rename to tinygrad_repo/test/unit/test_masked_st.py index c518d5b..ce88a71 100644 --- 
a/tinygrad_repo/test/test_masked_st.py +++ b/tinygrad_repo/test/unit/test_masked_st.py @@ -7,7 +7,7 @@ class TestMaskedShapeTracker(unittest.TestCase): b = Tensor([1,1]).pad(((0,3),)) c = a*b assert c.shape == a.shape - #assert c.lazydata.st.views[0].mask is not None + #assert c.uop.st.views[0].mask is not None ret = c.data() assert ret.tolist() == [1.0, 1.0, 0.0, 0.0, 0.0] @@ -16,7 +16,7 @@ class TestMaskedShapeTracker(unittest.TestCase): b = Tensor([1,1]).pad(((0,3),)) c = a*b assert c.shape == a.shape - #assert c.lazydata.st.views[0].mask is not None + #assert c.uop.st.views[0].mask is not None ret = c.data() assert ret.tolist() == [1.0, 1.0, 0.0, 0.0, 0.0] @@ -24,7 +24,7 @@ class TestMaskedShapeTracker(unittest.TestCase): a = Tensor([1,1]).pad(((0,2),)) b = Tensor([1,1]).pad(((0,2),)) c = a+b - #assert c.lazydata.st.views[0].mask is not None + #assert c.uop.st.views[0].mask is not None ret = c.data() assert ret.tolist() == [2.0, 2.0, 0.0, 0.0] diff --git a/tinygrad_repo/test/test_mnist_dataset.py b/tinygrad_repo/test/unit/test_mnist_dataset.py similarity index 93% rename from tinygrad_repo/test/test_mnist_dataset.py rename to tinygrad_repo/test/unit/test_mnist_dataset.py index 5606577..9db9a9e 100644 --- a/tinygrad_repo/test/test_mnist_dataset.py +++ b/tinygrad_repo/test/unit/test_mnist_dataset.py @@ -3,7 +3,6 @@ from tinygrad.helpers import GlobalCounters from tinygrad.nn.datasets import mnist class TestDataset(unittest.TestCase): - @unittest.expectedFailure def test_dataset_is_realized(self): X_train, _, _, _ = mnist() X_train[0].contiguous().realize() diff --git a/tinygrad_repo/test/test_rearrange_einops.py b/tinygrad_repo/test/unit/test_rearrange_einops.py similarity index 100% rename from tinygrad_repo/test/test_rearrange_einops.py rename to tinygrad_repo/test/unit/test_rearrange_einops.py diff --git a/tinygrad_repo/test/unit/test_rewrite_map.py b/tinygrad_repo/test/unit/test_rewrite_map.py index 0eed947..adc8cd2 100644 --- 
a/tinygrad_repo/test/unit/test_rewrite_map.py +++ b/tinygrad_repo/test/unit/test_rewrite_map.py @@ -1,7 +1,7 @@ import unittest from tinygrad import dtypes from tinygrad.uop.ops import UOp, graph_rewrite_map, _substitute -from tinygrad.codegen.symbolic import symbolic +from tinygrad.uop.symbolic import symbolic class TestRewriteMap(unittest.TestCase): def test_substitute(self): diff --git a/tinygrad_repo/test/test_rewrite_tracked_childen.py b/tinygrad_repo/test/unit/test_rewrite_tracked_childen.py similarity index 93% rename from tinygrad_repo/test/test_rewrite_tracked_childen.py rename to tinygrad_repo/test/unit/test_rewrite_tracked_childen.py index a1dea79..688e2ef 100644 --- a/tinygrad_repo/test/test_rewrite_tracked_childen.py +++ b/tinygrad_repo/test/unit/test_rewrite_tracked_childen.py @@ -25,7 +25,7 @@ class TestRewriteTrackedChildren(unittest.TestCase): a = Tensor(2) b = Tensor(3) c = a + b - sink = c.lazydata.sink() + sink = c.uop.sink() sink = graph_rewrite(sink, rewrite, track_children=True) def test_simple_child(self): @@ -35,8 +35,8 @@ class TestRewriteTrackedChildren(unittest.TestCase): a = Tensor(2) b = Tensor(3) c = a + b - sink = c.lazydata - view_w_child = a.lazydata.src[0] + sink = c.uop + view_w_child = a.uop.src[0] print([x().arg for x in view_w_child.children]) print([x.arg for x in sink.get_children_map()[view_w_child]]) self.assertSetEqual(set([x.arg for x in sink.get_children_map()[view_w_child]]), set((2,3))) @@ -57,7 +57,7 @@ class TestRewriteTrackedChildren(unittest.TestCase): extra = PatternMatcher([(UPat(Ops.REDUCE_AXIS, name="r"), print_children)]) a = Tensor.empty(3, 3) r = (a+0).sum() - graph_rewrite(r.lazydata, merge_views+sym+extra, track_children=True) + graph_rewrite(r.uop, merge_views+sym+extra, track_children=True) if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/unit/test_search.py b/tinygrad_repo/test/unit/test_search.py new file mode 100644 index 0000000..be16686 --- /dev/null +++ 
b/tinygrad_repo/test/unit/test_search.py @@ -0,0 +1,55 @@ +import unittest +from tinygrad import Tensor, Device +from tinygrad.codegen.kernel import Kernel +from tinygrad.device import Buffer +from tinygrad.engine.search import get_test_global_size, bufs_from_lin +from tinygrad.helpers import GlobalCounters +from extra.optimization.helpers import time_linearizer + +class TestSearchUtil(unittest.TestCase): + def test_get_test_global_size(self): + self.assertEqual(get_test_global_size([256, 256, 256], 65536, {}), ([256, 16, 16], 256.0)) + self.assertEqual(get_test_global_size([65536, 1, 1], 256, {}), ([256, 1, 1], 256.0)) + self.assertEqual(get_test_global_size([77, 1, 1], 16, {}), ([9, 1, 1], 77/9)) + + def test_bufs_from_lin(self): + a = Tensor([1,2,3,4]).realize() + si = (a+1).schedule()[0] + rawbufs = bufs_from_lin(lin:=Kernel(si.ast)) + assert len(rawbufs) == len(lin.membufs) == 2 + assert all(r is not None for r in rawbufs) + assert all(isinstance(r, Buffer) for r in rawbufs) + assert all(r.size > 0 for r in rawbufs) + + def test_bufs_from_lin_alt(self): + a = Tensor.randn(4, 4).realize() + b = a+a[0] + si = b.schedule()[0] + rawbufs = bufs_from_lin(k:=Kernel(si.ast)) + assert len(rawbufs) == len(k.membufs) == 2 + assert all(r is not None for r in rawbufs) + assert all(isinstance(r, Buffer) for r in rawbufs) + assert all(r.size > 0 for r in rawbufs) + +class TestTimeLinearizer(unittest.TestCase): + @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WebGPU timestamps are low precision, tm is 0") + def test_reasonable_time(self): + a = Tensor([1,2,3,4]).realize() + si = (a+1).schedule()[0] + # create fresh empty buffers + rawbufs = [Buffer(b.device, b.size, b.dtype).allocate() for b in si.bufs] + tm = time_linearizer(Kernel(si.ast), rawbufs, allow_test_size=False, cnt=10, disable_cache=True) + assert tm > 0 and tm != float('inf') + + # Ensure that the kernel count is not incremented by time_linearizer when clearing l2 + def test_kernel_count(self): + ast = 
Tensor.zeros(16).contiguous().kernelize().uop.src[1].arg.ast + lin = Kernel(ast) + bufs = bufs_from_lin(lin) + + kernel_count = GlobalCounters.kernel_count + time_linearizer(lin, bufs, allow_test_size=False, cnt=2, disable_cache=True, clear_l2=True) + assert GlobalCounters.kernel_count == kernel_count, "kernel count was incremented by time_linearizer" + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tinygrad_repo/test/unit/test_shapetracker.py b/tinygrad_repo/test/unit/test_shapetracker.py index 223acce..23fda92 100644 --- a/tinygrad_repo/test/unit/test_shapetracker.py +++ b/tinygrad_repo/test/unit/test_shapetracker.py @@ -836,29 +836,29 @@ class TestConsecutive(unittest.TestCase): self.ones = Tensor.ones(2, 4) def test_unmodified(self): - assert self.t.lazydata.st.consecutive - assert self.t.reshape(4, 2).lazydata.st.consecutive - assert self.t.reshape(1, 8).lazydata.st.consecutive + assert self.t.uop.st.consecutive + assert self.t.reshape(4, 2).uop.st.consecutive + assert self.t.reshape(1, 8).uop.st.consecutive def test_sliced(self): - assert self.t[0].lazydata.st.consecutive - assert self.t[0, 1:2].lazydata.st.consecutive - assert self.t[1].lazydata.st.consecutive - assert not self.t[:, 0].lazydata.st.consecutive - assert not self.t[:, 1].lazydata.st.consecutive + assert self.t[0].uop.st.consecutive + assert self.t[0, 1:2].uop.st.consecutive + assert self.t[1].uop.st.consecutive + assert not self.t[:, 0].uop.st.consecutive + assert not self.t[:, 1].uop.st.consecutive def test_padded(self): - assert not self.t.pad(((1, 1), None)).lazydata.st.consecutive - assert not self.t.pad((None, (1, 1))).lazydata.st.consecutive + assert not self.t.pad(((1, 1), None)).uop.st.consecutive + assert not self.t.pad((None, (1, 1))).uop.st.consecutive def test_const(self): - assert self.const.lazydata.st.consecutive + assert self.const.uop.st.consecutive def test_ones(self): - assert not self.ones.lazydata.st.consecutive - assert not 
self.ones[0, :].lazydata.st.consecutive + assert not self.ones.uop.st.consecutive + assert not self.ones[0, :].uop.st.consecutive # consecutive if sliced into size 1 - assert self.ones[0, 0].lazydata.st.consecutive + assert self.ones[0, 0].uop.st.consecutive class TestRender(unittest.TestCase): def test_render(self): @@ -872,5 +872,46 @@ class TestRender(unittest.TestCase): self.assertEqual(idx.render(), "((ridx0*3)+ridx1)") self.assertEqual(valid.render(), "(ridx0<2)") +class TestVariableReshape(unittest.TestCase): + def test_reshape(self): + st = ShapeTracker.from_shape((3,)) + st = st.reshape((Variable("i", 1, 10),)) + assert len(st.views) == 1 + + def test_reshape_stride_0(self): + st = ShapeTracker.from_shape((3,), (0,)) + st = st.reshape((Variable("i", 1, 10).bind(3),)) + assert len(st.views) == 1, f"multiview {st}" + + def test_reshape_bound(self): + st = ShapeTracker.from_shape((3,)) + st = st.reshape((Variable("i", 1, 10).bind(3),)) + assert len(st.views) == 1 + + def test_add(self): + st1 = ShapeTracker.from_shape((3,)) + st2 = ShapeTracker.from_shape((Variable("i", 1, 10),)) + st = st1+st2 + assert len(st.views) == 1 + + def test_add_stride_0(self): + st1 = ShapeTracker.from_shape((3,), (0,)) + st2 = ShapeTracker.from_shape((Variable("i", 1, 10).bind(3),), (0,)) + st = st1+st2 + assert len(st.views) == 1, f"multiview {st}" + + def test_add_bound(self): + st1 = ShapeTracker.from_shape((3,)) + st2 = ShapeTracker.from_shape((Variable("i", 1, 10).bind(3),)) + st = st1+st2 + assert len(st.views) == 1 + + def test_simplify(self): + st1 = ShapeTracker.from_shape((3,)) + st2 = ShapeTracker.from_shape((Variable("i", 1, 10).bind(3),)) + st = ShapeTracker((st1.views[0], st2.views[0])) + st = st.simplify() + assert len(st.views) == 1 + if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/unit/test_simplify_valid_idx.py b/tinygrad_repo/test/unit/test_simplify_valid_idx.py index 656821b..8ef89d5 100644 --- 
a/tinygrad_repo/test/unit/test_simplify_valid_idx.py +++ b/tinygrad_repo/test/unit/test_simplify_valid_idx.py @@ -3,7 +3,7 @@ import unittest, itertools from tinygrad.codegen import full_rewrite_to_sink from tinygrad.dtype import dtypes from tinygrad.uop.ops import UOp, Ops -from tinygrad.codegen.symbolic import simplify_valid +from tinygrad.uop.symbolic import simplify_valid def get_gated_load_uop(valid:UOp, idx:UOp): return UOp(Ops.LOAD, dtypes.float, ( diff --git a/tinygrad_repo/test/unit/test_symbolic_failures.py b/tinygrad_repo/test/unit/test_symbolic_failures.py new file mode 100644 index 0000000..0fc5ee2 --- /dev/null +++ b/tinygrad_repo/test/unit/test_symbolic_failures.py @@ -0,0 +1,161 @@ +import unittest +from tinygrad import Variable, dtypes +from tinygrad.helpers import Context +from tinygrad.uop.ops import Ops, UOp + + +class TestFuzzFailure(unittest.TestCase): + def setUp(self): + self.context = Context(CORRECT_DIVMOD_FOLDING=1) + self.context.__enter__() + + def tearDown(self): + self.context.__exit__(None, None, None) + + def test_fuzz_failure1(self): + v1=Variable('v1', 0, 8) + v2=Variable('v2', 0, 2) + v3=Variable('v3', 0, 1) + expr = (((((((((((((((((((((((0//4)%2)//8)+-2)+-4)+-3)+v1)+-4)+v2)+-2)+v3)+v2)//3)%7)*1)//2)+v2)*-1)+2)+1)+0)+-3)+v3) + v1_val, v2_val, v3_val = v1.const_like(8), v2.const_like(0), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure2(self): + v1=Variable('v1', 0, 16) + v2=Variable('v2', 0, 5) + v3=Variable('v3', 0, 3) + expr = (((((((((((((((((((((((((0*4)//5)*2)*-1)*-2)+-4)*4)*2)*3)*4)+-4)*4)+v2)+v2)+v3)//3)+v2)+v1)//9)+3)+1)//1)+-4)//4)*2) + expr = (((((v1+(v2+(((v3+(v2*2))+1)//3)))+4)//9)+-57)//(9*4)) + v1_val, v2_val, v3_val = v1.const_like(6), v2.const_like(0), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, 
v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure3(self): + v1=Variable('v1', 0, 2) + v2=Variable('v2', 0, 1) + v3=Variable('v3', 0, 2) + expr = (((((((((((((((((((0//2)//3)+v3)+0)+-4)*-2)*-2)+-1)+2)+3)+v3)+0)//8)*-3)+0)*-2)*-4)*-2)//5) + v1_val, v2_val, v3_val = v1.const_like(0), v2.const_like(0), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure4(self): + v1=Variable('v1', 0, 2) + v2=Variable('v2', 0, 3) + v3=Variable('v3', 0, 4) + expr = (((((((((((((((((((((((((((((0*-2)+0)*-1)//9)//6)//8)+v1)*-4)+v2)//4)//8)+4)*3)+v1)+v3)//8)//7)+4)+v3)*-4)+1)+v1)*3)+4)*2)//5)//2)//3)*-4) + v1_val, v2_val, v3_val = v1.const_like(2), v2.const_like(0), v3.const_like(2) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure5(self): + v1=Variable('v1', 0, 1) + v2=Variable('v2', 0, 1) + v3=Variable('v3', 0, 3) + expr = ((((((((((((((0+v2)+v1)*0)+v2)//1)//7)+-2)+v2)+v1)*4)+-3)//5)+v2)+1) + v1_val, v2_val, v3_val = v1.const_like(0), v2.const_like(0), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure6(self): + v1=Variable('v1', 0, 8) + v2=Variable('v2', 0, 64) + v3=Variable('v3', 0, 128) + expr = (((((((((((((((((((((((((((((0//3)+4)+v1)//2)+-1)//1)*1)*-1)*4)//5)+v1)//6)+v1)*-1)+-4)+v2)+-2)*-3)+v3)+-4)+-2)*-1)//8)//4)*-4)+3)+v3)* + -2)+v2) + v1_val, v2_val, v3_val = v1.const_like(8), v2.const_like(3), v3.const_like(2) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + 
rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure7(self): + v1=Variable('v1', 0, 64) + v2=Variable('v2', 0, 5) + v3=Variable('v3', 0, 128) + expr = (((((((((((((((((((((((((((((0+v2)*-4)+0)//9)+-4)*-2)*3)*4)//9)+v3)+v1)//4)+v1)+v3)+-1)*4)//4)+v2)//7)//3)+v1)+v2)+v3)+1)*2)//4)*3)+-1)*1) + v1_val, v2_val, v3_val = v1.const_like(0), v2.const_like(2), v3.const_like(65) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure8(self): + v1=Variable('v1', 0, 2) + v2=Variable('v2', 0, 8) + v3=Variable('v3', 0, 9) + expr = (((((((0+-1)+2)+v1)*-2)//3)+v1)*-4) + v1_val, v2_val, v3_val = v1.const_like(0), v2.const_like(0), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure9(self): + v1=Variable('v1', 0, 256) + v2=Variable('v2', 0, 1) + v3=Variable('v3', 0, 8) + expr = (((((((((((((((((((((((((((((0*-2)//1)+3)*-2)+-3)*-4)*1)+v1)+0)%2)%8)%9)+v2)%9)+-4)//4)+-1)*-2)+0)+v1)+v1)+3)+v1)+4)+-4)+0)*2)+-3)%6) + v1_val, v2_val, v3_val = v1.const_like(0), v2.const_like(1), v3.const_like(0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) + + def test_fuzz_failure10(self): + v1=Variable("v1", 0, 256) + v2=Variable("v2", 0, 32) + v3=Variable("v3", 0, 32) + expr = UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.MAX, dtypes.int, arg=None, src=( + UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( + UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( + x5:=UOp(Ops.IDIV, dtypes.int, arg=None, src=( + 
UOp(Ops.WHERE, dtypes.int, arg=None, src=( + UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( + UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( + x9:=UOp(Ops.CONST, dtypes.int, arg=9, src=()), + x10:=UOp(Ops.DEFINE_VAR, dtypes.int, arg=('v1', 0, 256), src=()),)), + x11:=UOp(Ops.CONST, dtypes.bool, arg=True, src=()),)), + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.MUL, dtypes.int, arg=None, src=( + x10, + x14:=UOp(Ops.CONST, dtypes.int, arg=-4, src=()),)), + x14,)), + UOp(Ops.IDIV, dtypes.int, arg=None, src=( + x10, + x9,)),)), + x9,)), + x14,)), + x11,)), + x5, + UOp(Ops.IDIV, dtypes.int, arg=None, src=( + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.MOD, dtypes.int, arg=None, src=( + x19:=UOp(Ops.DEFINE_VAR, dtypes.int, arg=('v2', 0, 32), src=()), + UOp(Ops.CONST, dtypes.int, arg=3, src=()),)), + x19,)), + UOp(Ops.CONST, dtypes.int, arg=5, src=()),)),)), + x22:=UOp(Ops.CONST, dtypes.int, arg=-1, src=()),)), + UOp(Ops.MUL, dtypes.int, arg=None, src=( + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.ADD, dtypes.int, arg=None, src=( + UOp(Ops.MOD, dtypes.int, arg=None, src=( + UOp(Ops.MUL, dtypes.int, arg=None, src=( + x10, + UOp(Ops.CONST, dtypes.int, arg=-2, src=()),)), + UOp(Ops.CONST, dtypes.int, arg=6, src=()),)), + UOp(Ops.MOD, dtypes.int, arg=None, src=( + UOp(Ops.DEFINE_VAR, dtypes.int, arg=('v3', 0, 32), src=()), + UOp(Ops.CONST, dtypes.int, arg=1, src=()),)),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=()),)), + x22,)),)), + x22,)) + v1_val, v2_val, v3_val = UOp.const(dtypes.int, 9), UOp.const(dtypes.int, 0),UOp.const(dtypes.int, 0) + num = expr.simplify().substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + rn = expr.substitute({v1:v1_val, v2:v2_val, v3:v3_val}).ssimplify() + self.assertEqual(num, rn) diff --git a/tinygrad_repo/test/test_symbolic_shapetracker.py b/tinygrad_repo/test/unit/test_symbolic_shapetracker.py similarity index 94% rename from tinygrad_repo/test/test_symbolic_shapetracker.py rename to 
tinygrad_repo/test/unit/test_symbolic_shapetracker.py index 9c754e3..ed065d0 100644 --- a/tinygrad_repo/test/test_symbolic_shapetracker.py +++ b/tinygrad_repo/test/unit/test_symbolic_shapetracker.py @@ -49,11 +49,11 @@ class TestSymbolic(unittest.TestCase): j = Variable("j", 1, 5).bind(3) k = Variable("k", 1, 5).bind(3) t = Tensor.rand(3, 4).reshape(i, 4).cat(Tensor.rand(3, 4).reshape(j, 4), dim=0).cat(Tensor.rand(3, 4).reshape(k, 4), dim=0) - st = t.lazydata.st + st = t.uop.st self.assert_tuple_equal(st.shape, (i+j+k, 4)) assert st.real_strides() == (4, 1) t = Tensor.rand(3, 3).reshape(i, 3).cat(Tensor.rand(3, 3).reshape(i, 3), dim=0).cat(Tensor.rand(3, 3), dim=0) - st = t.lazydata.st + st = t.uop.st self.assert_tuple_equal(st.shape, (2*i+3, 3)) assert st.real_strides() == (3, 1) @@ -62,7 +62,7 @@ class TestSymbolic(unittest.TestCase): j = Variable("j", 1, 5).bind(4) k = Variable("k", 1, 5).bind(4) t = Tensor.rand(3, 4).reshape(3, i).cat(Tensor.rand(3, 4).reshape(3, j), dim=1).cat(Tensor.rand(3, 4).reshape(3, k), dim=1) - st = t.lazydata.st + st = t.uop.st self.assert_tuple_equal(st.shape, (3, i+j+k)) self.assert_tuple_equal(st.real_strides(), (i+j+k, 1)) @@ -113,7 +113,7 @@ class TestShapeTrackerUnbind(unittest.TestCase): v = Variable("v", 1, 100) bv = Variable("v", 1, 100).bind(3) t = Tensor.rand(3, 4).reshape(bv, 4) - unbound_st, var_val = t.lazydata.st.unbind() + unbound_st, var_val = t.uop.st.unbind() assert unbound_st == ShapeTracker((View.create(shape=(v, 4)),)) assert var_val == {v: 3} @@ -121,7 +121,7 @@ class TestShapeTrackerUnbind(unittest.TestCase): v = Variable("v", 1, 100) bv = Variable("v", 1, 100).bind(2) t = Tensor.rand(3, 4).shrink(((bv, bv+1), (0, 4))) - unbound_st, var_val = t.lazydata.st.unbind() + unbound_st, var_val = t.uop.st.unbind() assert unbound_st == ShapeTracker((View.create(shape=(1, 4), offset=4*v),)) assert var_val == {v: 2} @@ -180,8 +180,8 @@ class TestSymbolicReshapeFromNonContiguous(unittest.TestCase): vi = Variable("i", 1, 
5).bind(4) t = Tensor.ones(3, 4).reshape(3, vi) assert t.shape == (3, vi) - assert not t.lazydata.st.contiguous - assert len(t.lazydata.st.views) == 1 + assert not t.uop.st.contiguous + assert len(t.uop.st.views) == 1 def test_reshape_not_allowed(self): vi = Variable("i", 1, 5).bind(4) @@ -195,12 +195,12 @@ class TestSymbolicReshapeFromNonContiguous(unittest.TestCase): def test_reshape_from_padded(self): vi = Variable("i", 1, 5).bind(4) t = Tensor.ones(3, 4).contiguous().expand(2, 3, 4).pad(((1, 1), None, None)).shrink((None, None, (1, 3))) - st = t.lazydata.st + st = t.uop.st assert len(st.views) == 1 view = st.views[0] assert view.shape == (4, 3, 2) t = t.reshape(vi, 3, 2) - st2 = t.lazydata.st + st2 = t.uop.st assert len(st2.views) == 1 view2 = st2.views[0] # check only shape changed. strides, offset, mask, contiguous remained the same @@ -226,6 +226,13 @@ class TestSymbolicExpand(unittest.TestCase): a = a + 1 self.assertTupleEqual(a.shape, (3, vi)) + def test_pad_then_expand_into_symbols(self): + vi = Variable("i", 1, 10).bind(3) + a = Tensor(1).unsqueeze(0).pad((0, 24)).unsqueeze(0).expand((vi, 25)) + self.assertEqual(a.shape, (vi, 25)) + self.assertEqual(a.reshape(25*vi).shape, (vi*25,)) + self.assertEqual(a.reshape(vi*25).shape, (vi*25,)) + class TestSymbolicShrink(unittest.TestCase): def test_shrink_symbols(self): vi = Variable("i", 1, 5) @@ -237,7 +244,7 @@ class TestSymbolicPad(unittest.TestCase): v = Variable("v", 1, 100).bind(5) t = Tensor.ones(5).reshape(v).pad(((4, 0),)).reshape(9) assert t.shape == (9,) - st = t.lazydata.st + st = t.uop.st print(st) if __name__ == '__main__': diff --git a/tinygrad_repo/test/unit/test_tensor_uop_representation.py b/tinygrad_repo/test/unit/test_tensor_uop_representation.py index 9156d2b..a2dacee 100644 --- a/tinygrad_repo/test/unit/test_tensor_uop_representation.py +++ b/tinygrad_repo/test/unit/test_tensor_uop_representation.py @@ -8,21 +8,21 @@ realized_pattern = UPat(Ops.BUFFER) buffer_view_pattern = 
UPat(Ops.RESHAPE, src=(UPat(Ops.BUFFER),)) const_pattern = UPat(Ops.CONST, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),),))) def is_pattern_uop(u:UOp, pat:UPat): assert pat.match(u, {}), f"{u}\nis not\n{pat}" -def is_pattern(ten:Tensor, pat:UPat): is_pattern_uop(ten.lazydata, pat) +def is_pattern(ten:Tensor, pat:UPat): is_pattern_uop(ten.uop, pat) class TestTensorMutates(unittest.TestCase): def test_mutate_add(self): a = Tensor([1,2,3]) b = Tensor([4,5,6]) ret = a+b - pa = a.lazydata - pb = b.lazydata - pr = ret.lazydata + pa = a.uop + pb = b.uop + pr = ret.uop ret.schedule() - self.assertIsNot(pa, a.lazydata) - self.assertIsNot(pb, b.lazydata) - self.assertIsNot(pr, ret.lazydata) - for t in [a,b,ret]: is_pattern_uop(t.lazydata.base, realized_pattern) + self.assertIsNot(pa, a.uop) + self.assertIsNot(pb, b.uop) + self.assertIsNot(pr, ret.uop) + for t in [a,b,ret]: is_pattern_uop(t.uop.base, realized_pattern) def test_reshape_is_same_parent(self): a = Tensor([1,2,3]) @@ -30,11 +30,11 @@ class TestTensorMutates(unittest.TestCase): c = a+b d = (a+b).reshape(3,1) d.realize() - is_pattern_uop(d.lazydata.base, realized_pattern) - is_pattern_uop(c.lazydata.base, realized_pattern) + is_pattern_uop(d.uop.base, realized_pattern) + is_pattern_uop(c.uop.base, realized_pattern) # NOTE: we keep movement ops on top of the buffer view - is_pattern_uop(c.lazydata, UPat(Ops.BUFFER)) - is_pattern_uop(d.lazydata, UPat(Ops.VIEW, src=(realized_pattern,))) + is_pattern_uop(c.uop, UPat(Ops.BUFFER)) + is_pattern_uop(d.uop, UPat(Ops.VIEW, src=(realized_pattern,))) def test_reshape_is_same_child(self): a = Tensor([1,2,3]) @@ -42,41 +42,41 @@ class TestTensorMutates(unittest.TestCase): c = a+b d = (a+b).reshape(3,1) c.realize() - is_pattern_uop(c.lazydata.base, realized_pattern) - is_pattern_uop(d.lazydata.base, realized_pattern) + is_pattern_uop(c.uop.base, realized_pattern) + is_pattern_uop(d.uop.base, realized_pattern) class TestTensorUopRepresentation(unittest.TestCase): def 
test_realized(self): a = Tensor([1.,2,3]).realize() - print(a.lazydata) - is_pattern_uop(a.lazydata.base, realized_pattern) + print(a.uop) + is_pattern_uop(a.uop.base, realized_pattern) def test_add_realized(self): a = Tensor([1.,2,3]).realize() b = Tensor([4.,5,6]).realize() c = a+b - print(c.lazydata) + print(c.uop) is_pattern(c, UPat(Ops.ADD, src=(realized_pattern, realized_pattern))) def test_const_pattern(self): a = Tensor(1) - print(a.lazydata) + print(a.uop) is_pattern(a, const_pattern) # const in tensor has a DEVICE and VIEW src is_pattern(a, UPat.cvar("x")) # even cvar works! def test_consts_do_not_realize(self): a = Tensor(1) - print(a.lazydata) - pre_realize = a.lazydata + print(a.uop) + pre_realize = a.uop a.realize() - assert a.lazydata is pre_realize + assert a.uop is pre_realize def test_viewed_consts_do_not_realize(self): a = Tensor.ones(10, 10) - print(a.lazydata) + print(a.uop) a.realize() is_pattern(a, const_pattern) - self.assertEqual(a.lazydata.shape, (10, 10)) + self.assertEqual(a.uop.shape, (10, 10)) # currently, CONSTs have a "fake" BUFFER. 
this should be fixed # current: @@ -93,8 +93,8 @@ class TestTensorUopRepresentation(unittest.TestCase): # UOp(Ops.DEVICE, dtypes.void, arg="METAL", src=()),)),)),)) def test_consts_dont_have_buffers(self): a = Tensor.ones(10, 10) - print(a.lazydata) - buffers_in_parents = [x.op for x in a.lazydata.toposort() if x.op is Ops.BUFFER] + print(a.uop) + buffers_in_parents = [x.op for x in a.uop.toposort() if x.op is Ops.BUFFER] self.assertEqual(len(buffers_in_parents), 0) # currently, COPY has an extra BUFFER on the output @@ -112,7 +112,7 @@ class TestTensorUopRepresentation(unittest.TestCase): def test_copyin(self): a = Tensor([1.,2,3]).realize() c = a.to("TEST") # NOTE: this isn't checked - print(c.lazydata) + print(c.uop) is_pattern(c, UPat(Ops.COPY, src=(realized_pattern, UPat(Ops.DEVICE)))) def test_empty_buf(self): @@ -121,7 +121,7 @@ class TestTensorUopRepresentation(unittest.TestCase): vi = UOp.variable("i", 1, 3).bind(1) a = Tensor.empty(3, vi) is_pattern(a, UPat(Ops.RESHAPE, src=(UPat(Ops.BUFFER),))) - self.assertEqual(a.lazydata.base.buffer.size, 9) + self.assertEqual(a.uop.base.buffer.size, 9) if __name__ == '__main__': unittest.main() diff --git a/tinygrad_repo/test/unit/test_transcendental_helpers.py b/tinygrad_repo/test/unit/test_transcendental_helpers.py index 16a97ad..5a14b38 100644 --- a/tinygrad_repo/test/unit/test_transcendental_helpers.py +++ b/tinygrad_repo/test/unit/test_transcendental_helpers.py @@ -2,8 +2,8 @@ import unittest, math import numpy as np from tinygrad import dtypes from tinygrad.uop.ops import UOp, Ops -from tinygrad.codegen.transcendental import TRANSCENDENTAL_SUPPORTED_DTYPES, payne_hanek_reduction, cody_waite_reduction -from tinygrad.codegen.transcendental import frexp, rintk, xpow, xexp2, xlog2, trig_poly, pow2if +from tinygrad.uop.transcendental import TRANSCENDENTAL_SUPPORTED_DTYPES, payne_hanek_reduction, cody_waite_reduction +from tinygrad.uop.transcendental import frexp, rintk, xpow, xexp2, xlog2, trig_poly, pow2if from 
test.helpers import eval_uop class TestTranscendentalFunctions(unittest.TestCase): diff --git a/tinygrad_repo/test/unit/test_uop_symbolic.py b/tinygrad_repo/test/unit/test_uop_symbolic.py index 53b1d5f..ac32c9b 100644 --- a/tinygrad_repo/test/unit/test_uop_symbolic.py +++ b/tinygrad_repo/test/unit/test_uop_symbolic.py @@ -110,6 +110,9 @@ class TestSymbolic(unittest.TestCase): def test_neg(self): self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)") + def test_xor_0(self): + self.helper_test_variable(Variable("a", 0, 8) ^ 0, 0, 8, "a") + def test_add_1(self): self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(a+1)") diff --git a/tinygrad_repo/test/unit/test_verify_ast.py b/tinygrad_repo/test/unit/test_verify_ast.py index a5eaff8..fd8a921 100644 --- a/tinygrad_repo/test/unit/test_verify_ast.py +++ b/tinygrad_repo/test/unit/test_verify_ast.py @@ -29,46 +29,46 @@ class TestVerifyAST(unittest.TestCase): buf_0 = UOp(Ops.DEFINE_GLOBAL, dtype.ptr(), (), 0) buf_1 = UOp(Ops.DEFINE_GLOBAL, dtype.ptr(), (), 1) buf_2 = UOp(Ops.DEFINE_GLOBAL, dtype.ptr(), (), 2) - a = UOp(Ops.LOAD, dtype, (buf_1, ShapeTracker.from_shape((32, 1)).to_uop())) - b = UOp(Ops.LOAD, dtype, (buf_2, ShapeTracker.from_shape((32, 1)).to_uop())) - store = UOp(Ops.STORE, dtypes.void, (buf_0, ShapeTracker.from_shape((32, 1)).to_uop(), a+b)) + a = UOp(Ops.LOAD, dtype, (buf_1.view(ShapeTracker.from_shape((32, 1))),)) + b = UOp(Ops.LOAD, dtype, (buf_2.view(ShapeTracker.from_shape((32, 1))),)) + store = UOp(Ops.STORE, dtypes.void, (buf_0.view(ShapeTracker.from_shape((32, 1))), a+b)) helper_test_verify_ast(store) def test_exactly_one_full_shape(self): dtype = dtypes.int bufs = [UOp(Ops.DEFINE_GLOBAL, dtype.ptr(), (), i) for i in range(6)] - a = UOp(Ops.LOAD, dtype, (bufs[2], ShapeTracker.from_shape((32, 1)).to_uop())) - b = UOp(Ops.LOAD, dtype, (bufs[3], ShapeTracker.from_shape((32, 1)).to_uop())) + a = UOp(Ops.LOAD, dtype, (bufs[2].view(ShapeTracker.from_shape((32, 1))),)) + b = UOp(Ops.LOAD, 
dtype, (bufs[3].view(ShapeTracker.from_shape((32, 1))),)) st0 = UOp.store(bufs[0], ShapeTracker.from_shape((32, 1)).to_uop(), a+b) - a = UOp(Ops.LOAD, dtype, (bufs[4], ShapeTracker.from_shape((32, 32)).to_uop())) - b = UOp(Ops.LOAD, dtype, (bufs[5], ShapeTracker.from_shape((32, 32)).to_uop())) + a = UOp(Ops.LOAD, dtype, (bufs[4].view(ShapeTracker.from_shape((32, 32))),)) + b = UOp(Ops.LOAD, dtype, (bufs[5].view(ShapeTracker.from_shape((32, 32))),)) st1 = UOp.store(bufs[1], ShapeTracker.from_shape((32, 32)).to_uop(), a+b) with self.assertRaises(InvalidASTException): helper_test_verify_ast(st0, st1) def test_no_implicit_broadcasting(self): bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), i) for i in range(2)] - a = UOp(Ops.LOAD, dtypes.float, (bufs[1], ShapeTracker.from_shape((4, 32)).to_uop())) + a = UOp(Ops.LOAD, dtypes.float, (bufs[1].view(ShapeTracker.from_shape((4, 32))),)) b = a + UOp(Ops.REDUCE_AXIS, dtypes.float, (a,), (Ops.MAX, (1,))) - st = UOp(Ops.STORE, dtypes.void, (bufs[0], ShapeTracker.from_shape((4, 32)).to_uop(), b)) + st = UOp(Ops.STORE, dtypes.void, (bufs[0].view(ShapeTracker.from_shape((4, 32))), b)) with self.assertRaises(InvalidASTException): helper_test_verify_ast(st) def test_shrink_ok(self): bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), i) for i in range(2)] - a = UOp(Ops.LOAD, dtypes.float, (bufs[1], ShapeTracker((View((32, 32), strides=(32, 1), offset=0, mask=None, contiguous=True),)).to_uop())) - b = UOp(Ops.LOAD, dtypes.float, (bufs[1], ShapeTracker((View((32, 32), strides=(0, 1), offset=0, mask=None, contiguous=False),)).to_uop())) - st = UOp.store(bufs[0], ShapeTracker.from_shape((32, 32)).to_uop(), a+b) + a = UOp(Ops.LOAD, dtypes.float, (bufs[1].view(ShapeTracker((View((32, 32), strides=(32, 1), offset=0, mask=None, contiguous=True),))),)) + b = UOp(Ops.LOAD, dtypes.float, (bufs[1].view(ShapeTracker((View((32, 32), strides=(0, 1), offset=0, mask=None, contiguous=False),))),)) + st = 
UOp.store(bufs[0].view(ShapeTracker.from_shape((32, 32))), a+b) helper_test_verify_ast(st) def test_reduce_store(self): bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), i) for i in range(2)] - a = UOp(Ops.LOAD, dtypes.float, (bufs[1], ShapeTracker.from_shape((32, 1)).to_uop())) + a = UOp(Ops.LOAD, dtypes.float, (bufs[1].view(ShapeTracker.from_shape((32, 1))),)) r = UOp(Ops.REDUCE_AXIS, dtypes.float, (a,), (Ops.ADD, (0,))) st = UOp.store(bufs[0], ShapeTracker.from_shape((32, 1)).to_uop(), r) with self.assertRaises(InvalidASTException): helper_test_verify_ast(st) def test_reduce_add_store(self): bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), i) for i in range(2)] - a = UOp(Ops.LOAD, dtypes.float, (bufs[1], ShapeTracker.from_shape((32, 1)).to_uop())) + a = UOp(Ops.LOAD, dtypes.float, (bufs[1].view(ShapeTracker.from_shape((32, 1))),)) r = UOp(Ops.REDUCE_AXIS, dtypes.float, (a,), (Ops.ADD, (0,))) st = UOp.store(bufs[0], ShapeTracker.from_shape((32, 1)).to_uop(), r+a) with self.assertRaises(InvalidASTException): helper_test_verify_ast(st) @@ -83,7 +83,7 @@ class TestVerifyAST(unittest.TestCase): def test_assert_swizzle(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0) - a = UOp(Ops.LOAD, dtypes.float, (buf, ShapeTracker.from_shape((32, 1)).to_uop())) + a = UOp(Ops.LOAD, dtypes.float, (buf.view(ShapeTracker.from_shape((32, 1))),)) r = UOp(Ops.REDUCE_AXIS, dtypes.float, (a,), (Ops.ADD, (0,))) st = UOp.store(buf, ShapeTracker.from_shape((32, 1)).to_uop(), r.view(r.st.expand((32, 1)))+a) with self.assertRaisesRegex(InvalidASTException, "UOp verification failed"): helper_test_verify_ast(st) @@ -91,7 +91,7 @@ class TestVerifyAST(unittest.TestCase): def test_const_view_always_valid(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0) a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (), ShapeTracker.from_shape(())),)) - st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float)) + st = 
UOp.store(buf.view(ShapeTracker.from_shape(())), a.cast(dtypes.float)) helper_test_verify_ast(st) if __name__ == '__main__': diff --git a/tinygrad_repo/test/unit/test_viz.py b/tinygrad_repo/test/unit/test_viz.py index d7409c4..b1bd161 100644 --- a/tinygrad_repo/test/unit/test_viz.py +++ b/tinygrad_repo/test/unit/test_viz.py @@ -1,7 +1,7 @@ import unittest, decimal, json from tinygrad.dtype import dtypes from tinygrad.uop.ops import TRACK_MATCH_STATS, TrackedPatternMatcher, UOp, graph_rewrite, track_rewrites, UPat, Ops -from tinygrad.codegen.symbolic import symbolic +from tinygrad.uop.symbolic import symbolic, symbolic_simple from tinygrad.uop.ops import tracked_ctxs as contexts, tracked_keys as keys, _name_cnt, _substitute from tinygrad.device import ProfileDeviceEvent, ProfileRangeEvent, ProfileGraphEvent, ProfileGraphEntry from tinygrad.viz.serve import get_metadata, get_details, uop_to_json, to_perfetto @@ -35,7 +35,7 @@ class TestViz(unittest.TestCase): test(a*1) ret = get_metadata(keys, contexts) self.assertEqual(len(ret), 1) - key, val = ret[0] + key, val = ret[0]["name"], ret[0]["steps"] self.assertEqual(key, "test_1") self.assertEqual(val[0]["match_count"], 1) @@ -45,7 +45,7 @@ class TestViz(unittest.TestCase): def test(sink): return graph_rewrite(sink, symbolic) test((a+a)*1) ret = get_metadata(keys, contexts) - key, val = ret[0] + key, val = ret[0]["name"], ret[0]["steps"] self.assertEqual(len(ret), 1) # one context self.assertEqual(len(val), 1) # one graph_rewrite call in context self.assertEqual(key, "test_1") @@ -59,7 +59,7 @@ class TestViz(unittest.TestCase): b = graph_rewrite(b, symbolic) test(a*1, a*5) ret = get_metadata(keys, contexts) - key, val = ret[0] + key, val = ret[0]["name"], ret[0]["steps"] self.assertEqual(len(ret), 1) # one context self.assertEqual(len(val), 2) # two graph_rewrite calls in context self.assertEqual(key, "test_1") @@ -75,10 +75,10 @@ class TestViz(unittest.TestCase): do_rewrite(a*b) ret = get_metadata(keys, contexts) 
self.assertEqual(len(ret), 2) - key, m = ret[0] + key, m = ret[0]["name"], ret[0]["steps"] self.assertEqual(key, "do_rewrite_1") self.assertEqual(m[0]["match_count"], 1) - key, m = ret[1] + key, m = ret[1]["name"], ret[1]["steps"] self.assertEqual(key, "do_rewrite_2") self.assertEqual(m[0]["match_count"], 0) @@ -93,18 +93,18 @@ class TestViz(unittest.TestCase): self.assertEqual(len(ret), 1) def test_track_rewrites_name_fxn(self): - @track_rewrites(name_fxn=lambda r: f"output_{r}") + @track_rewrites(name_fxn=lambda _,ret: f"output_{ret}") def do_rewrite(x:UOp): x = graph_rewrite(x, symbolic) return x.render() expr = UOp.variable("a",0,10)*UOp.variable("b",0,10) do_rewrite(expr) - key = get_metadata(keys, contexts)[0][0] + key = get_metadata(keys, contexts)[0]["name"] self.assertEqual(key, "output_(a*b) n1") expr2 = UOp.variable("a",0,10)+UOp.variable("b",0,10) do_rewrite(expr2) - key = get_metadata(keys, contexts)[1][0] + key = get_metadata(keys, contexts)[1]["name"] self.assertEqual(key, "output_(a+b) n2") @unittest.expectedFailure @@ -131,7 +131,7 @@ class TestViz(unittest.TestCase): #UOp.substitute(a+b, {a+b:c}) ret = get_metadata(keys, contexts) self.assertEqual(len(ret), 1) - _, m = ret[0] + m = ret[0]["steps"] self.assertEqual(m[0]["match_count"], 1) # NOTE: calling graph_rewrite when the function isn't decorated with track_rewrites should not VIZ @@ -160,6 +160,10 @@ class TestViz(unittest.TestCase): self.assertEqual(lineno, inner_rewrite.__code__.co_firstlineno) self.assertEqual(fp, inner_rewrite.__code__.co_filename) + def test_upat_location(self): + for (pat, fn) in symbolic_simple.patterns: + self.assertIn("symbolic.py", pat.location[0]) + def test_nested_rewrite(self): def make_float(x:UOp, y:UOp): if x.dtype == dtypes.float: return None diff --git a/tinygrad_repo/test/web/test_viz.js b/tinygrad_repo/test/web/test_viz.js new file mode 100644 index 0000000..d39bc07 --- /dev/null +++ b/tinygrad_repo/test/web/test_viz.js @@ -0,0 +1,35 @@ +const { spawn } = 
require("child_process"); +const puppeteer = require("puppeteer"); + +async function main() { + // ** start viz server + const proc = spawn("python", ["-u", "-c", "from tinygrad import Tensor; Tensor.arange(4).realize()"], { env: { ...process.env, VIZ:"1" }, + stdio: ["inherit", "pipe", "inherit"]}); + await new Promise(resolve => proc.stdout.on("data", r => { + if (r.includes("ready")) resolve(); + })); + + // ** run browser tests + let browser, page; + try { + browser = await puppeteer.launch({ headless: true }); + page = await browser.newPage(); + const res = await page.goto("http://localhost:8000", { waitUntil:"domcontentloaded" }); + if (res.status() !== 200) throw new Error("Failed to load page"); + const scheduleSelector = await page.waitForSelector("ul"); + scheduleSelector.click(); + await page.waitForSelector("rect"); + await page.waitForFunction(() => { + const nodes = document.querySelectorAll("#nodes > g").length; + const edges = document.querySelectorAll("#edges > path").length; + return nodes > 0 && edges > 0; + }); + } finally { + // ** cleanups + if (page != null) await page.close(); + if (browser != null) await browser.close(); + proc.kill(); + } +} + +main(); diff --git a/tinygrad_repo/tinygrad/codegen/__init__.py b/tinygrad_repo/tinygrad/codegen/__init__.py index b89549c..2474c49 100644 --- a/tinygrad_repo/tinygrad/codegen/__init__.py +++ b/tinygrad_repo/tinygrad/codegen/__init__.py @@ -3,12 +3,13 @@ import functools from dataclasses import dataclass from tinygrad.helpers import QUANTIZE, DEVECTORIZE, TRANSCENDENTAL from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp +from tinygrad.uop.spec import type_verify from tinygrad.renderer import Renderer # import all pattern matchers here from tinygrad.codegen.lowerer import pm_quant, pm_lowerer, get_index -from tinygrad.codegen.symbolic import sym, symbolic_simple, gep_pushing -from tinygrad.codegen.expander import migrate_indexing, pm_store_ignore, pm_move_ignore, pm_delete_ignore, 
expander +from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing +from tinygrad.codegen.expander import migrate_indexing, expander from tinygrad.codegen.devectorizer import load_store_folding, load_store_indexing, devectorize, \ pm_reduce, ReduceContext, correct_load_store, pm_render, get_late_rewrite_patterns from tinygrad.codegen.linearize import block_create, pm_blockend_merge, block_merge, pm_finalize, BlockContext @@ -24,6 +25,12 @@ class RewriteStep: def apply_rewrites(sink:UOp, rewrites:list[RewriteStep]): return functools.reduce(lambda x,f: f(x), rewrites, sink) +rewrites_for_linearizer = [ + RewriteStep(block_create, ctx=BlockContext.from_sink, name="Linearizer: Create Blocks", bottom_up=True), + RewriteStep(pm_blockend_merge, name="Linearizer: Merge Blockends"), + RewriteStep(block_merge, name="Linearizer: Merge Blocks"), + RewriteStep(pm_finalize, name="Linearizer: Finalize")] + def get_rewrites_for_renderer(opts:Renderer, linearizer:bool=True) -> list[RewriteStep]: # cache with the values of the context vars return _get_rewrites_for_renderer(opts, linearizer, QUANTIZE.value, DEVECTORIZE.value, TRANSCENDENTAL.value) @@ -38,12 +45,8 @@ def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVEC # ** expander (expand_rewrite) ** ret.append(RewriteStep(sym+migrate_indexing, name="initial symbolic")) - # ignore (for masked stores) - ret.append(RewriteStep(pm_store_ignore, name="store_ignore")) - ret.append(RewriteStep(pm_move_ignore, name="move_ignore")) - - # expand + remove surviving ignores - ret.append(RewriteStep(pm_delete_ignore+sym+expander, name="expander")) + # expand + ret.append(RewriteStep(sym+expander, name="expander")) # ** devectorizer (full_graph_rewrite) ** # remove reduce @@ -65,14 +68,13 @@ def _get_rewrites_for_renderer(opts:Renderer, linearizer:bool, _QUANTIZE, _DEVEC pm_final_rewrite = symbolic_simple+get_late_rewrite_patterns(supported_ops, _TRANSCENDENTAL>=2)+pm_render+extra_matcher 
ret.append(RewriteStep(pm_final_rewrite, lambda _: opts, name="final rewrite")) - # ** linearizer ** - if linearizer: - ret.append(RewriteStep(block_create, ctx=BlockContext.from_sink, name="Linearizer: Create Blocks", bottom_up=True)) - ret.append(RewriteStep(pm_blockend_merge, name="Linearizer: Merge Blockends")) - ret.append(RewriteStep(block_merge, name="Linearizer: Merge Blocks")) - ret.append(RewriteStep(pm_finalize, name="Linearizer: Finalize")) - return ret + # return the list (with optional linearizer) + return ret + (rewrites_for_linearizer if linearizer else []) def full_rewrite_to_sink(sink:UOp, opts:Renderer|None=None, linearizer:bool=False) -> UOp: return apply_rewrites(sink, get_rewrites_for_renderer(opts if opts is not None else Renderer(), linearizer)) -def full_rewrite(sink:UOp, opts:Renderer|None=None) -> list[UOp]: return list(full_rewrite_to_sink(sink, opts, linearizer=True).arg.lst) + +def full_rewrite(sink:UOp, opts:Renderer|None=None) -> list[UOp]: + lst = list(full_rewrite_to_sink(sink, opts, linearizer=True).arg.lst) + if __debug__: type_verify(lst) + return lst diff --git a/tinygrad_repo/tinygrad/codegen/devectorizer.py b/tinygrad_repo/tinygrad/codegen/devectorizer.py index c3a4911..f59286a 100644 --- a/tinygrad_repo/tinygrad/codegen/devectorizer.py +++ b/tinygrad_repo/tinygrad/codegen/devectorizer.py @@ -4,10 +4,10 @@ from collections import defaultdict from dataclasses import dataclass from tinygrad.device import is_dtype_supported from tinygrad.dtype import dtypes, ImageDType, PtrDType, promo_lattice, DType -from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, resolve, graph_rewrite, GroupOp, identity_element -from tinygrad.codegen.symbolic import split_uop, uop_given_valid, parse_valid, simplify_valid, sym, symbolic_flat +from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, graph_rewrite, GroupOp, identity_element +from tinygrad.uop.symbolic import split_uop, uop_given_valid, parse_valid, simplify_valid, sym, 
symbolic_flat from tinygrad.helpers import getenv, flatten, AMX, prod, partition -from tinygrad.codegen.transcendental import xexp2, xlog2, xsin, xpow, TRANSCENDENTAL_SUPPORTED_DTYPES +from tinygrad.uop.transcendental import xexp2, xlog2, xsin, xpow, TRANSCENDENTAL_SUPPORTED_DTYPES from tinygrad.renderer import Renderer # ***** image load valid simplification ***** @@ -175,9 +175,10 @@ def get_late_rewrite_patterns(ops, force_transcendental=False): # rewrite MUL/IDIV to SHL+SHR: x*(2**y) -> shl(x,y) and x//(2**y) -> shr(x,y) if Ops.SHL in ops: pat += [(UPat.var("x", dtypes.ints)*UPat.cvar("c"), lambda c,x: x << v if (v:=powers_of_two.get(c.arg, 0)) else None)] if Ops.SHR in ops: - # no reason to check x>=0 for uints + # no reason to check x<0 for uints pat += [(UPat.var("x", dtypes.uints)//UPat.cvar("c"), lambda x,c: x >> v if (v:=powers_of_two.get(c.arg, 0)) else None)] - pat += [(UPat.var("x", dtypes.sints)//UPat.cvar("c"), lambda x,c: x >> v if (v:=powers_of_two.get(c.arg, 0)) and resolve(x>=0,False) else None)] + pat += [(UPat.var("x", dtypes.ints)//UPat.cvar("c"), lambda x,c: (x+(l.const_like(l.vmin) if (l:=(x<0)).vmin==l.vmax else l).where( + c-1, 0)) >> v if (v:=powers_of_two.get(c.arg, 0)) else None)] # (x+(x<0).where(c-1, 0)) >> v if not getenv("DISABLE_FAST_IDIV"): pat += [(UPat.var("x", dtypes.ints)//UPat.cvar("d"), lambda ctx, x, d: fast_idiv(ctx, x, d.arg))] pat += [(UPat.var("x", dtypes.ints)%UPat.cvar("d"), lambda ctx, x, d: x - d*f if (f:=fast_idiv(ctx, x, d.arg)) is not None else None)] diff --git a/tinygrad_repo/tinygrad/codegen/expander.py b/tinygrad_repo/tinygrad/codegen/expander.py index be59b4c..17a5faa 100644 --- a/tinygrad_repo/tinygrad/codegen/expander.py +++ b/tinygrad_repo/tinygrad/codegen/expander.py @@ -114,25 +114,3 @@ migrate_indexing = PatternMatcher([ # create gate MUST BE BEFORE expander (UPat(Ops.STORE, name="root"), create_gate), ]) - -# **** IGNORE support **** - -pm_store_ignore = PatternMatcher([ - (UPat().index(UPat(), 
UPat(name="mask")).store(UPat()).named("store"), - lambda store,mask: store.replace(src=(store.src[0], UOp(Ops.IGNORE, src=(store.src[1], mask)))) if store.src[1].op is not Ops.IGNORE else None), -]) - -pm_move_ignore = PatternMatcher([ - # IGNORE on SELF is nothing - (UPat(Ops.IGNORE, src=(UPat(name="x"), UPat(name="x"))), lambda x: x.const_like(True)), - # IGNORE on a CONST is nothing - (UPat(Ops.IGNORE, src=(UPat((Ops.CONST, Ops.VCONST), name="c"), UPat())), lambda c: c), - # move the IGNOREs - (UPat(Ops.IGNORE, src=(UPat((*GroupOp.ALU, Ops.CAST, Ops.VECTORIZE), name="alu"), UPat.var("mask")), name="ig"), - lambda ig,alu,mask: alu.replace(src=tuple(UOp(Ops.IGNORE, x.dtype, (x, mask)) for x in alu.src))), -]) - -pm_delete_ignore = PatternMatcher([ - # IGNORE on SELF is nothing - (UPat(Ops.IGNORE, src=(UPat(name="x"), UPat())), lambda x: x), -]) diff --git a/tinygrad_repo/tinygrad/codegen/kernel.py b/tinygrad_repo/tinygrad/codegen/kernel.py index 6bead92..4445795 100644 --- a/tinygrad_repo/tinygrad/codegen/kernel.py +++ b/tinygrad_repo/tinygrad/codegen/kernel.py @@ -10,8 +10,8 @@ from tinygrad.uop.spec import type_verify, ast_spec from tinygrad.device import Device from tinygrad.renderer import Renderer, TensorCore, ProgramSpec, Opt, OptOps from tinygrad.dtype import ImageDType -from tinygrad.helpers import all_same, colored, ansilen, dedup, getenv, prod, round_up, all_int, to_function_name, diskcache_put, unwrap, ContextVar -from tinygrad.helpers import DEBUG, TC_SELECT, TC_OPT, AMX, CAPTURE_PROCESS_REPLAY +from tinygrad.helpers import all_same, colored, ansilen, dedup, getenv, prod, round_up, all_int, to_function_name, unwrap +from tinygrad.helpers import DEBUG, TC_SELECT, TC_OPT, AMX from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import strides_for_shape from tinygrad.codegen.lowerer import get_contraction @@ -99,7 +99,7 @@ class Kernel: return ret @property - def membufs(self) -> list[UOp]: return dedup([x.src[0] for x in 
self.bufs if x.op in {Ops.LOAD, Ops.STORE}]) + def membufs(self) -> list[UOp]: return dedup([x.src[0].base for x in self.bufs if x.op in {Ops.LOAD, Ops.STORE}]) def upcasted_axis(self, i:int) -> list[tuple[int, Optional[sint], bool]]: upcasted_shape, upcasted_stride = self.sts[i].shape[self.first_upcast:], self.sts[i].real_strides()[self.first_upcast:] @@ -452,11 +452,11 @@ class Kernel: def fixup_ast(op:UOp) -> UOp: ret = op.replace(src=tuple(fixup_ast(x) for x in op.src)) # noqa: F821 if op.op in GroupOp.Buffer and op in self.bufs: - st_uop = self.sts[self.bufs.index(op)].to_uop() + st = self.sts[self.bufs.index(op)] # NOTE: if CONST got masked after applying opts, we create a new VALID - if op.op is Ops.CONST and any(v.mask is not None for v in unwrap(st_uop.st).views): return op.valid(unwrap(st_uop.st)) + if op.op is Ops.CONST and any(v.mask is not None for v in st.views): return op.view(st).valid() # otherwise we just replace the VIEW source - return ret.replace(src=(st_uop,)) if len(op.src) == 1 else ret.replace(src=(ret.src[0], st_uop, *ret.src[2:])) + return ret.replace(src=(ret.src[0].replace(arg=st),)+ret.src[1:]) if op.op is Ops.SINK: return ret.replace(arg = KernelInfo(to_function_name(self.name) if name_override is None else name_override, self.local_dims, self.upcasted, self.dont_use_locals)) @@ -491,8 +491,8 @@ class Kernel: st = store_st = ShapeTracker.from_shape(local_shape) local_buffer = UOp(Ops.DEFINE_LOCAL, tc.dtype_in.ptr(size=st.real_size(), local=True), (), f"temp{i}") if swizzle: store_st = get_tc_swizzle_st(store_st.shape, *swizzle) - local_store = UOp.store(local_buffer, store_st.to_uop(), srcs[i]) - srcs[i] = UOp(Ops.LOAD, tc.dtype_in, (local_buffer, st.to_uop(), local_store)) + local_store = UOp.store(local_buffer.view(store_st), srcs[i]) + srcs[i] = UOp(Ops.LOAD, tc.dtype_in, (local_buffer.view(st), local_store)) tc_reduce_axes = tuple(tcd + ax for ax, _ in tc.get_reduce_axes()) if self.use_tensor_cores == 1: # real WMMA, use 
CONTRACT/UNROLL to get the vectorization right @@ -515,14 +515,14 @@ class Kernel: tuple([self.full_shape[i] if self.sts[reduce_idx].shape[i] != self.sts[reduce_idx+1].shape[i] else 1 \ for i in range(self.first_reduce, self.first_reduce+self.group_for_reduces)]) + \ (1,) * (self.shape_len - self.upcasted - self.group_for_reduces - self.first_reduce) + tuple([x[0] for x in self.upcasted_axis(0)]) - st_uop = ShapeTracker.from_shape(local_shape).to_uop() - local_size = st_uop.arg.real_size() + st = ShapeTracker.from_shape(local_shape) + local_size = st.real_size() local_buffer = UOp(Ops.DEFINE_LOCAL, op.dtype.ptr(local_size, local=True), (), f"temp{self.reduceops.index(op)}") - local_load = UOp(Ops.LOAD, op.dtype, (local_buffer, st_uop, UOp.store(local_buffer, st_uop, ret))) + local_load = UOp(Ops.LOAD, op.dtype, (local_buffer.view(st), UOp.store(local_buffer.view(st), ret))) grouped_reduce = UOp(Ops.REDUCE_AXIS, op.dtype, (local_load,), arg=(op.arg[0], grouped_axes)) if op is self.reduceops[-1]: return grouped_reduce - st_uop = ShapeTracker.from_shape(tuple([1 if i in grouped_axes else a for i,a in enumerate(local_shape)])).to_uop() - return UOp(Ops.LOAD, op.dtype, (local_buffer, st_uop, UOp.store(local_buffer, st_uop, grouped_reduce))) + st = ShapeTracker.from_shape(tuple([1 if i in grouped_axes else a for i,a in enumerate(local_shape)])) + return UOp(Ops.LOAD, op.dtype, (local_buffer.view(st), UOp.store(local_buffer.view(st), grouped_reduce))) return ret fixed_ast = fixup_ast(self.ast) @@ -566,19 +566,10 @@ class Kernel: self.linearize(name_override, ast_transform) assert self.uops[-1].op is Ops.SINK, "last uop must be sink" src = self.opts.render(self.uops) - - if CAPTURE_PROCESS_REPLAY: - import sys - frm = sys._getframe(1) - while (f_back:=frm.f_back) is not None and "unittest" not in f_back.f_code.co_filename: frm = f_back - loc = f"{frm.f_code.co_filename.split('/')[-1]}:{frm.f_lineno} {frm.f_code.co_name}" - diskcache_put("kernel_process_replay", 
str(id(self)), (self.ast, self.opts, self.applied_opts, self.uops[-1].arg.name, loc, ContextVar._cache, - src)) - # group non-local bufs by the op type (LOAD or STORE) and the buffer arg. take the max access of that buffer in bytes # TODO: these max and min don't work on symbolic, and results are very wrong. mem_bytes = sum(max(x.src[0].dtype.nbytes() for x in group) - for _, group in itertools.groupby([x for x in self.ast.toposort() if x.op in GroupOp.Buffer and x.src[0].op is Ops.DEFINE_GLOBAL], - key=lambda x: (x.op, x.src[0].arg))) + for _, group in itertools.groupby([x for x in self.ast.toposort() if x.op in GroupOp.Buffer and x.src[0].base.op is Ops.DEFINE_GLOBAL], + key=lambda x: (x.op, x.src[0].base.arg))) return ProgramSpec(self.name if not name_override else name_override, src, self.opts.device, self.ast, self.uops, self.applied_opts, mem_bytes, global_size=[1,1,1] if self.opts.has_local else None, local_size=[1,1,1] if self.opts.has_local else None) diff --git a/tinygrad_repo/tinygrad/codegen/linearize.py b/tinygrad_repo/tinygrad/codegen/linearize.py index bd9231e..d205299 100644 --- a/tinygrad_repo/tinygrad/codegen/linearize.py +++ b/tinygrad_repo/tinygrad/codegen/linearize.py @@ -3,8 +3,7 @@ import heapq from collections import defaultdict from dataclasses import dataclass, replace from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat, GroupOp -from tinygrad.helpers import dedup, partition, all_same, flatten -from tinygrad.uop.spec import type_verify +from tinygrad.helpers import dedup, partition, all_same, flatten, getenv # NOTE: any toposort should be valid here, unlike last time this isn't required, it's just for speed def block_reorder(lst:list[UOp]) -> list[UOp]: @@ -50,13 +49,13 @@ def disp(y:UOp) -> str: return "" @dataclass(frozen=True, eq=False) -class BasicBlock2: +class BasicBlock: lst: tuple[UOp, ...] ctx: tuple[UOp, ...] 
= () end: UOp|None = None cnt: int = 0 child_ctx: tuple[UOp, ...]|None = None - def __lt__(self, _:BasicBlock2): raise RuntimeError("no comparing basic blocks") + def __lt__(self, _:BasicBlock): raise RuntimeError("no comparing basic blocks") def __repr__(self): return f"{(str(disp(self.end))+' ') if self.end is not None else ''}"+f'f{self.cnt} '+\ f"{[disp(y) for y in self.ctx]} {[disp(y) for y in self.child_ctx] if self.child_ctx is not None else '-'} "+\ @@ -72,7 +71,7 @@ class BlockContext: child_count: dict[UOp, int] block_ctxs: dict[UOp, tuple[UOp, ...]] child_ctxs: dict[UOp, tuple[UOp, ...]] - def last_ctx(self, u): return ret if (ret:=self.child_ctxs.get(u)) is not None else self.block_ctxs[u] + def last_ctx(self, u): return self.child_ctxs.get(u, self.block_ctxs[u]) @staticmethod def from_sink(sink:UOp) -> BlockContext: # get children and all block contexts @@ -114,7 +113,7 @@ def add_blockends(base_block:UOp, new_ctx:tuple[UOp, ...], current_ctx:tuple[UOp r:UOp = ends_to_add.pop(-1) new_ctx = tuple([z for z in new_ctx if z is not r]) end_uop = UOp(Ops.ENDIF if r.op is Ops.IF else Ops.ENDRANGE, src=(r,)) - base_block = UOp(Ops.BLOCKEND, src=(base_block,)*cnt, arg=BasicBlock2((end_uop,), tuple(new_ctx), end=r, cnt=cnt)) + base_block = UOp(Ops.BLOCKEND, src=(base_block,)*cnt, arg=BasicBlock((end_uop,), tuple(new_ctx), end=r, cnt=cnt)) return base_block def make_block_bottom_up(ctx:BlockContext, x:UOp): @@ -158,8 +157,9 @@ def make_block_bottom_up(ctx:BlockContext, x:UOp): base_block = UOp(Ops.BLOCKSTART, src=tuple(v), arg=(new_ctx, new_child_ctx)) srcs.append(add_blockends(base_block, new_ctx, current_ctx)) - lst = block_reorder(lst[::-1]) - bb = BasicBlock2(tuple(lst), ctx=current_ctx, cnt=child_count, child_ctx=child_ctx) + lst = lst[::-1] + if getenv("BLOCK_REORDER", 1): lst = block_reorder(lst) + bb = BasicBlock(tuple(lst), ctx=current_ctx, cnt=child_count, child_ctx=child_ctx) return UOp(Ops.BLOCK, src=tuple(srcs), arg=bb) block_create = 
PatternMatcher([ @@ -179,7 +179,7 @@ def merge_blockends(sink:UOp) -> UOp|None: for k,v in blockends_to_arg.items(): # NOTE: if any BLOCKEND is the parent of any other with the same arg, this algo fails if len(v) > 1: - bb = BasicBlock2(v[0].arg.lst, _sort_ctx(flatten([y.arg.ctx for y in v])), k, cnt=sum(y.arg.cnt for y in v)) + bb = BasicBlock(v[0].arg.lst, _sort_ctx(flatten([y.arg.ctx for y in v])), k, cnt=sum(y.arg.cnt for y in v)) out = UOp(Ops.BLOCKEND, src=tuple(flatten([x.src for x in v])), arg=bb) # NOTE: bb.ctx != u.arg.ctx can cause problems here for u in v: new_forks[u] = out @@ -211,9 +211,8 @@ def remove_blockend(x:UOp): # if there's any remaining blocks that need to go in this BLOCKEND, we don't remove it if any(x.arg.end in y.arg.ctx for y in x.src if y.op in {Ops.BLOCK, Ops.BLOCKEND}): return None - parent_blocks = [y for y in x.src if y.op is Ops.BLOCK and y.arg.child_ctx is not None and x.arg.end in y.arg.child_ctx] - assert all_same(parent_blocks), f"should never have two parent blocks (has {len(parent_blocks)})" - if len(parent_blocks) > 0: + if (parent_blocks := [y for y in x.src if y.op is Ops.BLOCK and y.arg.child_ctx is not None and x.arg.end in y.arg.child_ctx]): + assert all_same(parent_blocks), f"should never have two parent blocks (has {len(parent_blocks)})" parent_block = parent_blocks[0] assert len(parent_blocks) == parent_block.arg.cnt # range needs DEFINE_ACC to be before the range (never in DEFINE_ACC for if) @@ -221,7 +220,7 @@ def remove_blockend(x:UOp): # NOTE: we have to add a barrier at the start if barrier is used in the range if x.op is Ops.BLOCKEND and any(y.op is Ops.BARRIER for y in late_ops) and late_ops[-1].op is Ops.ENDRANGE: late_ops = [UOp(Ops.BARRIER)] + late_ops - arg = BasicBlock2(tuple(early_ops)+parent_block.arg.lst+tuple(late_ops), tuple([y for y in x.arg.ctx if y is not x.arg.end]), cnt=x.arg.cnt) + arg = BasicBlock(tuple(early_ops)+parent_block.arg.lst+tuple(late_ops), tuple([y for y in x.arg.ctx if y is not 
x.arg.end]), cnt=x.arg.cnt) return UOp(Ops.BLOCK, src=tuple(y for y in x.src if y is not parent_block)+parent_block.src, arg=arg) block_merge = PatternMatcher([ @@ -237,9 +236,6 @@ def finalize(sink:UOp) -> UOp: # place the early things lst = sorted(dedup(sink.src), key=lambda x: x.tuplize) + list(sink.arg.lst) - - if __debug__: type_verify(lst) - - return UOp(Ops.BLOCKFINAL, arg=BasicBlock2(tuple(lst))) + return UOp(Ops.BLOCKFINAL, arg=BasicBlock(tuple(lst))) pm_finalize = PatternMatcher([(UPat(Ops.BLOCK, name="sink"), finalize)]) diff --git a/tinygrad_repo/tinygrad/codegen/lowerer.py b/tinygrad_repo/tinygrad/codegen/lowerer.py index 5790fc1..bbbd13e 100644 --- a/tinygrad_repo/tinygrad/codegen/lowerer.py +++ b/tinygrad_repo/tinygrad/codegen/lowerer.py @@ -6,7 +6,7 @@ from tinygrad.dtype import dtypes, PtrDType, least_upper_dtype from tinygrad.uop.ops import KernelInfo, UOp, Ops, PatternMatcher, UPat, sint, sint_to_uop from tinygrad.renderer import Renderer from tinygrad.helpers import all_int, prod, partition, flatten, unwrap -from tinygrad.codegen.symbolic import symbolic +from tinygrad.uop.symbolic import symbolic # returns the axes to create new_shape if new_shape can be created by combining axis from old_shape def get_contraction(old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) -> list[list[int]]|None: @@ -83,7 +83,6 @@ def get_grouped_dims(prefix, dims:tuple[sint, ...], max_sizes:tuple[int, ...]|No class IndexContext: idxs: list[UOp] ridxs: list[UOp] - acc_num: int = 0 def get_index(ast:UOp, opts:Renderer) -> IndexContext: ki = ast.arg if isinstance(ast.arg, KernelInfo) else KernelInfo() @@ -92,7 +91,7 @@ def get_index(ast:UOp, opts:Renderer) -> IndexContext: first_upcasted = len(full_shape)-ki.upcasted # if there's no reduce, this is first_upcasted. 
assumes reduces are at the end first_reduce = min([first_upcasted]+flatten(x.axis_arg for x in ast.toposort() if x.op is Ops.REDUCE_AXIS)) - local_loads = [x for x in ast.toposort() if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL] + local_loads = [x for x in ast.toposort() if x.op is Ops.LOAD and x.src[0].base.op is Ops.DEFINE_LOCAL] # NOTE: sum up the reduced axes looking across all local loads, yields the number of grouped reduces group_for_reduces = sum([any(l.st_arg.shape[i]!=ast.src[0].st_arg.shape[i] for l in local_loads) for i in range(first_reduce,first_upcasted)]) global_dims = first_reduce-ki.local_dims @@ -138,23 +137,22 @@ def lower_reduce_axis(ctx: IndexContext, x: UOp): # REDUCE supports both "horizonal" reduction and range reduction. the horizonal elements are taken in the nearest group return UOp(Ops.REDUCE, x.dtype, (ret,)+tuple(reduce_range), alu_op) -def lower_load_store(ctx: IndexContext, x: UOp): - idx, valid = x.st_arg.to_indexed_uops(ctx.ridxs if x.op is Ops.LOAD and x.src[0].op is Ops.DEFINE_LOCAL else ctx.idxs) - buf = x.src[0] +def lower_load_store(ctx: IndexContext, x: UOp, buf: UOp): + idx, valid = x.st_arg.to_indexed_uops(ctx.ridxs if x.op is Ops.LOAD and buf.op is Ops.DEFINE_LOCAL else ctx.idxs) if x.op is Ops.LOAD: - barrier = (UOp(Ops.BARRIER, dtypes.void, (x.src[2],)),) if x.src[0].op is Ops.DEFINE_LOCAL else () + barrier = (UOp(Ops.BARRIER, dtypes.void, (x.src[1],)),) if buf.op is Ops.DEFINE_LOCAL else () return UOp(Ops.LOAD, x.dtype, (buf.index(idx, valid),) + barrier) # NOTE: only store the local reduceop in the threads that are actually doing the reduce - if cast(PtrDType, x.src[0].dtype).local and x.src[2].op is Ops.REDUCE: - reduce_input = x.src[2].src[0] + if cast(PtrDType, buf.dtype).local and x.src[1].op is Ops.REDUCE: + reduce_input = x.src[1].src[0] store_back = reduce_input.op is Ops.LOAD and cast(PtrDType, reduce_input.src[0].dtype).local else: store_back = False # NOTE: If we're storing the reduced value back 
into each thread, need to zero-out the reduced axes - if store_back: idx, _ = x.st_arg.to_indexed_uops([u.const_like(0) if u in x.src[2].src else u for u in ctx.idxs]) - if (not cast(PtrDType, x.src[0].dtype).local) or store_back: + if store_back: idx, _ = x.st_arg.to_indexed_uops([u.const_like(0) if u in x.src[1].src else u for u in ctx.idxs]) + if (not cast(PtrDType, buf.dtype).local) or store_back: for oidx, ridx in zip(ctx.idxs, ctx.ridxs): if oidx is not ridx: valid = valid * oidx.eq(0) - return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[2])) + return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[1])) def lower_const(x:UOp): assert all(v.mask is None for v in unwrap(x.st).views), f"VIEW in CONST/DEFINE_VAR source must be unmasked, got {x.st}" @@ -165,9 +163,8 @@ pm_lowerer = PatternMatcher([ (UPat((Ops.CONST, Ops.DEFINE_VAR), src=(UPat(Ops.VIEW),), name="x"), lower_const), (UPat(Ops.VALID, src=(UPat(Ops.VIEW),), name="x"), lambda ctx,x: x.st_arg.to_indexed_uops(ctx.idxs)[1]), # rewrite LOAD/STORE VIEW to LOAD/STORE with indexed - (UPat((Ops.LOAD, Ops.STORE), src=(UPat(), UPat(Ops.VIEW)), allow_any_len=True, name="x"), lower_load_store), + (UPat((Ops.LOAD, Ops.STORE), src=(UPat.var("buf").view(),), allow_any_len=True, name="x"), lower_load_store), (UPat(Ops.INDEX, src=(UPat.var("b"), UPat.var("idx"), UPat.const(dtypes.bool, True))), lambda b, idx: b.index(idx)), - (UPat(Ops.IGNORE, name="x"), lambda x: x.src[0]), ]) # **** this is the "quantization preprocessor", it makes ONNX quantized models, and probably also others, actually use ints **** @@ -185,7 +182,7 @@ pm_quant = symbolic+PatternMatcher([ lambda x,v,cadd,cmul: x*v.where(cmul, 0)+v.where(cadd*cmul, 0)), # MUL after reduce - (UPat(Ops.REDUCE_AXIS, src=(UPat.var("x") * UPat.cvar("c"),), name="r"), lambda x,c,r: r.replace(src=(x,))*c), + (UPat(Ops.REDUCE_AXIS, src=(UPat.var("x") * UPat.cvar("c"),), name="r"), lambda x,c,r: r.replace(src=(x,))*c.arg), # CAST after reduce (doesn't 
work if it's a size change) (UPat(Ops.REDUCE_AXIS, src=(UPat(Ops.CAST, src=(UPat.var("x"),)),), name="r"), lambda x,r: r.replace(dtype=x.dtype, src=(x,)).cast(r.dtype) if dtypes.is_float(r.dtype) else None), @@ -195,10 +192,10 @@ pm_quant = symbolic+PatternMatcher([ lambda x,y,c1,c2: (x+y)*c1 if abs(c1.arg-c2.arg) < 1e-9 else None), # mul 0 * c1 is 0 (UPat(Ops.VALID, src=(UPat(Ops.VIEW, name="v"),)).where(UPat.cvar("c1"), UPat(Ops.CONST, arg=0)) * - UPat(Ops.LOAD, src=(UPat(), UPat(Ops.VIEW, name="v"))).cast(dtypes.int).cast(dtypes.float).named("ld"), lambda ld,v,c1: ld*c1), + UPat(Ops.LOAD, src=(UPat().view(name="v"),)).cast(dtypes.int).cast(dtypes.float).named("ld"), lambda ld,v,c1: ld*c1), # mul (with plus) 0 * c1 is 0 (UPat(Ops.VALID, src=(UPat(Ops.VIEW, name="v"),)).where(UPat.cvar("c1"), UPat(Ops.CONST, arg=0)) * - (UPat(Ops.LOAD, src=(UPat(), UPat(Ops.VIEW, name="v"))).cast(dtypes.int) + \ + (UPat(Ops.LOAD, src=(UPat().view(name="v"),)).cast(dtypes.int) + \ UPat(Ops.VALID, src=(UPat(Ops.VIEW, name="v"),)).where(UPat.cvar(), UPat(Ops.CONST, arg=0))).cast(dtypes.float).named("ld"), lambda ld,v,c1: ld*c1), diff --git a/tinygrad_repo/tinygrad/device.py b/tinygrad_repo/tinygrad/device.py index 7a3b183..9213766 100644 --- a/tinygrad_repo/tinygrad/device.py +++ b/tinygrad_repo/tinygrad/device.py @@ -2,9 +2,9 @@ from __future__ import annotations from dataclasses import dataclass, replace from collections import defaultdict from typing import Optional, Any, Generic, TypeVar, Iterator, Generator -import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time +import importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \ - cpu_time_execution, colored, Context, 
round_up, DISABLE_COMPILER_CACHE + cpu_time_execution, colored, Context, round_up, DISABLE_COMPILER_CACHE, ALLOW_DEVICE_USAGE from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype from tinygrad.renderer import Renderer @@ -22,11 +22,11 @@ class _Device: def __getitem__(self, ix:str) -> Compiled: return self.__get_canonicalized_item(self.canonicalize(ix)) @functools.cache # this class is a singleton, pylint: disable=method-cache-max-size-none def __get_canonicalized_item(self, ix:str) -> Compiled: - cpn = multiprocessing.current_process().name - assert (cpn == "MainProcess") or ix.split(":")[0] in ["DISK", "NPY", "PYTHON"], f"can only open device {ix} from parent, not {cpn}" - x = ix.split(":")[0].upper() - ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) \ - if (cname.lower() == x.lower() + "device")][0](ix) + assert ALLOW_DEVICE_USAGE or ix.split(":")[0] in ["DISK", "NPY", "PYTHON"], f"usage of device {ix} disallowed" + base = __name__.split('.')[0] # tinygrad + x = ix.split(":")[0].lower() + ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.runtime.ops_{x}')) \ + if (cname.lower() == x + "device")][0](ix) if DEBUG >= 1: print(f"opened device {ix} from pid:{os.getpid()}") self._opened_devices.add(ix) return ret @@ -94,7 +94,10 @@ class BufferSpec: class MultiBuffer: def __init__(self, device:tuple[str, ...], size:int, dtype:DType): self.bufs = [Buffer(d, size, dtype) for d in device] - self.size, self.dtype = size, dtype + @property + def size(self): return self.bufs[0].size + @property + def dtype(self): return self.bufs[0].dtype def ref(self, cnt): for b in self.bufs: b.ref(cnt) return self @@ -132,6 +135,7 @@ class Buffer: def allocate(self, opaque=None, external_ptr=None) -> Buffer: assert not self.is_allocated(), "can't allocate already allocated buffer" if DEBUG >= 7: print(f"buffer: allocate {self.nbytes} bytes on {self.device}") + if 
(mbs:=getenv("MAX_BUFFER_SIZE", 0)) > 0 and self.size > mbs: raise RuntimeError(f"buffer of size {self.size/1e6:.2f}M is too large") self.allocator:Allocator = Device[self.device].allocator if external_ptr is not None: self.options = replace(self.options, external_ptr=external_ptr) if self.options else BufferSpec(external_ptr=external_ptr) @@ -146,9 +150,9 @@ class Buffer: return self def deallocate(self): assert self.is_allocated(), "buffer must be allocated to deallocate" - if DEBUG >= 7: print(f"buffer: deallocate {self.nbytes} bytes on {self.device}") + if DEBUG is not None and DEBUG >= 7: print(f"buffer: deallocate {self.nbytes} bytes on {self.device}") if self._base is None and (self.options is None or self.options.external_ptr is None): - if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes + if GlobalCounters is not None and not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes self.allocator.free(self._buf, self.nbytes, self.options) elif self._base is not None: self._base.allocated_views -= 1 del self._buf @@ -202,12 +206,15 @@ DeviceType = TypeVar('DeviceType', bound='Compiled') # TODO: size, dest, src are the same type. can we enforce this? 
class Allocator(Generic[DeviceType]): - def __init__(self, dev:DeviceType): self.dev: DeviceType = dev + def __init__(self, dev:DeviceType): + self.dev: DeviceType = dev + self.default_buffer_spec: BufferSpec = BufferSpec() # overridden in LRUAllocator def alloc(self, size:int, options:Optional[BufferSpec]=None): assert size > 0, f"alloc size must be positive, getting {size}" - return self._alloc(size, options if options is not None else BufferSpec()) - def free(self, opaque, size:int, options:Optional[BufferSpec]=None): self._free(opaque, options if options is not None else BufferSpec()) + return self._alloc(size, options if options is not None else self.default_buffer_spec) + def free(self, opaque, size:int, options:Optional[BufferSpec]=None): + self._free(opaque, options if options is not None else self.default_buffer_spec) # implemented by the runtime def _alloc(self, size:int, options:BufferSpec): raise NotImplementedError("need alloc") diff --git a/tinygrad_repo/tinygrad/dtype.py b/tinygrad_repo/tinygrad/dtype.py index 90c3c1e..d7fe21a 100644 --- a/tinygrad_repo/tinygrad/dtype.py +++ b/tinygrad_repo/tinygrad/dtype.py @@ -42,6 +42,10 @@ class DType(metaclass=DTypeMetaClass): return PtrDType(self.priority, self.itemsize, self.name, self.fmt, self.count, None, self, local, 1, size) def scalar(self) -> DType: return self._scalar if self._scalar is not None else self def nbytes(self): raise RuntimeError("only ptr types have nbytes") + @property + def min(self): return dtypes.min(self) + @property + def max(self): return dtypes.max(self) @dataclass(frozen=True, eq=False) class PtrDType(DType): diff --git a/tinygrad_repo/tinygrad/engine/grouper.py b/tinygrad_repo/tinygrad/engine/grouper.py index fc71a7f..b901f06 100644 --- a/tinygrad_repo/tinygrad/engine/grouper.py +++ b/tinygrad_repo/tinygrad/engine/grouper.py @@ -1,41 +1,20 @@ -from collections import defaultdict, deque from dataclasses import dataclass -from tinygrad.uop.ops import UOp, Ops, GroupOp, 
PatternMatcher, UPat, graph_rewrite, graph_rewrite_map, identity_element, resolve -from tinygrad.uop.ops import can_pad, sint, track_rewrites, _substitute -from tinygrad.codegen.lowerer import get_contraction_with_reduce, get_contraction -from tinygrad.codegen.symbolic import symbolic_simple -from tinygrad.helpers import Metadata, all_int, all_same, colored, prod, dedup, unwrap, getenv, pluralize, ContextVar, Context, diskcache_put, flatten -from tinygrad.helpers import FUSE_CONV_BW, FUSE_ARANGE, DEBUG, DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES, SPLIT_REDUCEOP, CAPTURE_PROCESS_REPLAY +from tinygrad.uop.ops import UOp, Ops, GroupOp, PatternMatcher, UPat, graph_rewrite, graph_rewrite_map, identity_element, resolve, can_pad, sint +from tinygrad.uop.ops import track_rewrites, _substitute +from tinygrad.uop.spec import type_verify, tensor_uop_spec +from tinygrad.codegen.lowerer import get_contraction_with_reduce +from tinygrad.uop.symbolic import symbolic_simple +from tinygrad.helpers import Metadata, all_int, all_same, colored, prod, dedup, unwrap, getenv, pluralize +from tinygrad.helpers import FUSE_CONV_BW, FUSE_ARANGE, DEBUG, DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES, SPLIT_REDUCEOP from tinygrad.dtype import ImageDType from tinygrad.engine.multi import multi_pm, replace_allreduce from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View, strides_for_shape -from tinygrad.uop.spec import type_verify, sched_spec # creation can recurse a lot import sys sys.setrecursionlimit(10000) -# **** UOp merge views - -merge_views = PatternMatcher([ - # merge adjacent views - (UPat(Ops.VIEW, src=(UPat(Ops.VIEW, name="v1"),), name="v2"), lambda v1,v2: v1.replace(arg=v1.arg+v2.arg)), - # merge view on load/store/valid - (UPat(Ops.VIEW, src=(UPat((Ops.LOAD, Ops.STORE, Ops.VALID), name="x"),), name="view"), - lambda x,view: x.replace(src=tuple((s.st+view.st).to_uop() if s.op is Ops.VIEW else s for s in x.src))), - # merge view on const if it's not masked - 
(UPat((Ops.CONST, Ops.DEFINE_VAR), name="x").view(name="view"), - lambda x,view: x.replace(src=(x.src[0].replace(arg=x.st+view.st),)) if all(v.mask is None for v in (x.st+view.st).views) else None), - # replace view with base if it's contiguous and the shapes match - (UPat(GroupOp.All-{Ops.DEVICE}, name="x").view(name="view"), lambda x,view: x if view.st.contiguous and x.shape == view.shape else None), - # replace masked view with zero if it can collapse - (UPat(Ops.VIEW, src=(UPat(),), name="view"), - lambda view: view.const_like(0) if (mask:=view.st.views[-1].mask) is not None and any((x[1]-x[0]) == 0 for x in mask) else None), - # movement ops apply a new view on the base - (UPat(GroupOp.Movement, src=(UPat.var("x"),), name="mop"), lambda mop,x: x.view(mop.st)), -]) - # **** schedule simplifier def simplify_stride0_reduce(reduce:UOp, x:UOp): @@ -71,15 +50,8 @@ def copy_reorder_view(copy:UOp, view:UOp, base:UOp): if prod(view.shape) < prod(base.shape): return view.contiguous().copy_to_device(copy.device) return base.copy_to_device(copy.device).view(view.arg) -def mselect_reorder_view(ms:UOp, view:UOp, base:UOp): - st = unwrap(view.st) - # replace dnum in ShapeTracker with literal const for this mselect - if (dnums:=[x for x in st.vars() if x.arg[0] == '_device_num']): - assert len(dnums) == 1, f"view must have exactly 0 or 1 dnum, got {dnums}" - st = st.substitute({dnums[0]:dnums[0].const_like(ms.arg)}) - return base.mselect(ms.arg).view(st) - -ALWAYS_CONTIGUOUS = {Ops.CONTIGUOUS, Ops.ASSIGN, Ops.COPY, Ops.BUFFER, Ops.BUFFER_VIEW, Ops.CONST, Ops.BIND, Ops.DEVICE, Ops.MSELECT} +ALWAYS_CONTIGUOUS = {Ops.CONTIGUOUS, Ops.ASSIGN, Ops.COPY, Ops.BUFFER, Ops.BUFFER_VIEW, + Ops.CONST, Ops.BIND, Ops.DEVICE, Ops.MSELECT, Ops.MSTACK, Ops.GBARRIER} sym = symbolic_simple+PatternMatcher([ # UOp with size 0 is zero @@ -87,9 +59,6 @@ sym = symbolic_simple+PatternMatcher([ and not (root.base.op is Ops.CONST and root.base.arg == 0) else None), # DETACH and CONTIGUOUS_BACKWARD are 
NOOPs here (UPat((Ops.DETACH, Ops.CONTIGUOUS_BACKWARD), name="x"), lambda x: x.src[0]), - # MULTI in SINK just flattens srcs - (UPat(Ops.SINK, name="x"), - lambda x: UOp.sink(*new_src) if (new_src:=tuple(flatten([s.src if s.op is Ops.MULTI else [s] for s in x.src]))) != x.src else None), # reduce of size 0 is the identity element (UPat(Ops.REDUCE_AXIS, name="reduce", src=(UPat.var("x"),)), lambda reduce,x: reduce.const_like(identity_element(reduce.arg[0], reduce.dtype)) if x.size == 0 and reduce.size != 0 else None), @@ -103,11 +72,11 @@ sym = symbolic_simple+PatternMatcher([ (UPat(Ops.COPY, name="c", src=(UPat.var("x"), UPat(Ops.DEVICE))), lambda c,x: x if c.device == x.device else None), # store a shrink before COPY, otherwise view after the COPY (UPat(Ops.COPY, src=(UPat(Ops.VIEW, src=(UPat.var("base"),), name="view"), UPat(Ops.DEVICE)), name="copy"), copy_reorder_view), - # MSELECT must select a base, if there are views apply them after selecting the base - (UPat(Ops.MSELECT, src=(UPat(Ops.VIEW, src=(UPat.var("base"),), name="view"),), name="ms"), mselect_reorder_view), # remove cast to image when it's already a contiguous image (UPat(Ops.CAST, name="cast", src=(UPat(Ops.VIEW, name="vm", src=(UPat(Ops.CONTIGUOUS, name="base"),)),)), lambda cast,base,vm: base.view(vm.st) if isinstance(cast.dtype, ImageDType) and isinstance(base.dtype, ImageDType) else None), + # CAST before masking constants + (UPat.cvar("x").view().cast(name="c"), lambda x,c: x.cast(c.dtype).view(c.src[0].arg)), # make things that can't be images not images (UPat(GroupOp.All-{Ops.BUFFER, Ops.VIEW, Ops.CONST, Ops.DEVICE}, name="u"), lambda u: u.replace(dtype=dt.base) if isinstance(dt:=u.dtype,ImageDType) and (prod(u.shape) != prod(dt.shape) or not any(u.shape[x]%4 == 0 for x in u.st.unit_stride_axes())) else None), @@ -122,7 +91,8 @@ sym = symbolic_simple+PatternMatcher([ # double ASSIGN to same target is one ASSIGN (UPat(Ops.ASSIGN, src=(UPat.var("t"), UPat(Ops.ASSIGN, src=(UPat.var("t"), 
UPat.var("x"))))), lambda x,t: t.assign(x.contiguous())), # ASSIGN to unrealized replaces the UOp - (UPat(Ops.ASSIGN, src=(UPat.var("t"), UPat.var("x"))), lambda x,t: x.contiguous() if t.base.op not in {Ops.BUFFER, Ops.BUFFER_VIEW} else None), + (UPat(Ops.ASSIGN, src=(UPat.var("t"), UPat.var("x"))), lambda x,t: x.contiguous() if t.base.op not in {Ops.BUFFER, Ops.BUFFER_VIEW} and + not (t.base.op is Ops.MSTACK and all(x.op is Ops.BUFFER for x in t.base.src)) else None), # put CAST to smaller dtype before EXPAND (UPat(Ops.CAST, name="cast", src=(UPat(Ops.VIEW, name="vm"),)), lambda cast,vm: vm.base.cast(cast.dtype).view(vm.st) if cast.dtype.itemsize <= vm.dtype.itemsize and resolve(prod(vm.shape) > vm.st.real_size()) else None), @@ -145,6 +115,10 @@ replace_contiguous = PatternMatcher([ def realize(ctx:dict[UOp, None], tr:UOp) -> None: ctx[tr] = None +def realize_parents(ctx:dict[UOp, None], rb:UOp) -> None: + for s in rb.src: + if s.op not in ALWAYS_CONTIGUOUS: ctx[s] = None + def realize_before_view(ctx:dict[UOp, None], view:UOp, tr:UOp) -> None: st = unwrap(view.st) # always realize unsafe pad ops before masked view @@ -161,11 +135,11 @@ do_realize = PatternMatcher([ (UPat({Ops.ASSIGN, Ops.CONTIGUOUS, *GroupOp.Meta}, name="tr"), realize), # realize before expand or unsafe pad ops (UPat(Ops.VIEW, src=(UPat(GroupOp.All-ALWAYS_CONTIGUOUS, name="tr"),), name="view"), realize_before_view), - # realize before COPY and MSELECT - (UPat((Ops.COPY, Ops.MSELECT), src=(UPat(GroupOp.All-ALWAYS_CONTIGUOUS, name="tr"),), allow_any_len=True), realize), + # realize parents of COPY, MSELECT, MSTACK + (UPat((Ops.COPY, Ops.MSELECT, Ops.MSTACK), name="rb"), realize_parents), ]) -def recursive_group(tr:UOp, st:ShapeTracker, r:UOp, children:defaultdict[UOp, dict[UOp, None]], realizes:dict[UOp, None], +def recursive_group(tr:UOp, st:ShapeTracker, r:UOp, children:dict[UOp, dict[UOp, None]], realizes:dict[UOp, None], reduce_for_op:dict[UOp, UOp], group:dict[UOp, None], 
cache:dict[tuple[UOp, ShapeTracker], None]) -> None: if (tr, st) in cache: return cache.setdefault((tr, st)) @@ -175,7 +149,7 @@ def recursive_group(tr:UOp, st:ShapeTracker, r:UOp, children:defaultdict[UOp, di # max one reduceop per kernel if not st.contiguous or st.size != rsize or tr in reduce_for_op: group.setdefault(r) return group.setdefault(tr) - for tr_next in children[tr]: + for tr_next in children.get(tr, {}): # max one reduceop per kernel if tr_next.op is Ops.REDUCE_AXIS: return group.setdefault(r) # can only fuse contiguous @@ -189,12 +163,12 @@ def group_realizes(sink:UOp) -> dict[UOp, None]: if DONT_GROUP_REDUCES: return realizes # construct children graph (only for bases) - children: defaultdict[UOp, dict[UOp, None]] = defaultdict(dict) + children: dict[UOp, dict[UOp, None]] = {} assigns: dict[UOp, None] = {} for u in (toposort:=sink.toposort()): if u.op in {Ops.VIEW, Ops.SINK}: continue if u.op is Ops.ASSIGN: assigns[u.buf_uop] = None - for s in u.src: children[s.base][u] = None + for s in u.src: children.setdefault(s.base, {})[u] = None # find all reduces, and pair them to a elementwise op. 
if they can't be cleanly paired, force realize the reduce (or a contig child) reduce_for_op: dict[UOp, UOp] = {} @@ -208,13 +182,17 @@ def group_realizes(sink:UOp) -> dict[UOp, None]: recursive_group(r, unwrap(r.st), r, children, realizes, reduce_for_op, group, cache={}) # max one reduceop per kernel can_chase = all(tr not in reduce_for_op for tr in group) + for u in r.toposort(gate=lambda u: u not in realizes): + if u.op is Ops.REDUCE_AXIS and u.src[0].base.op is Ops.CONST: + can_chase = False + break # TODO: forced_realize exists because the scheduler is incapable of checking for self-contained DAGs forced_realize = r in group # can only have one output if not forced_realize and len(group) > 1: forced_realize = True # can only fuse assign if no other assign_target is used in the kernel if not forced_realize and (assign_targets:={x.buf_uop for x in group if x.op is Ops.ASSIGN}): - parents = deque((r, *group)) + parents = [r, *group] while parents and not forced_realize: p = parents.pop().base if p.op is Ops.BUFFER and p in assigns and p not in assign_targets: forced_realize, can_chase = True, False @@ -225,8 +203,8 @@ def group_realizes(sink:UOp) -> dict[UOp, None]: if can_chase: # can chase this down to contiguous children st = unwrap(tr.st) - while len(children[tr]) == 1: - tr_next = next(iter(children[tr])) + while len(lst:=children.get(tr, {})) == 1: + tr_next = next(iter(lst)) st_childs = dedup(unwrap(s.st) for s in tr_next.src if s.base is tr) if len(st_childs) > 1: break if st.size != st_childs[0].size: break @@ -242,7 +220,7 @@ def group_realizes(sink:UOp) -> dict[UOp, None]: # fuse double reduces with no other child for reduceop in double_reduces: top_reduce = reduceop.src[0].base - if len(children[top_reduce]) == 1: del realizes[top_reduce] + if len(children.get(top_reduce, {})) == 1: del realizes[top_reduce] return realizes # **** create kernels @@ -261,7 +239,7 @@ def create_kernel(x:UOp, b:UOp|None=None): buffer = b.base if b.size == b.base.size else 
UOp(Ops.BUFFER_VIEW, b.dtype, (b.base,), (b.size, b.arg.views[0].offset)) return buffer.assign(kernel).reshape(x.shape) -DONT_PLACE_IN_KERNEL = {Ops.KERNEL, Ops.ASSIGN, Ops.BUFFER, Ops.MSELECT} +DONT_PLACE_IN_KERNEL = {Ops.KERNEL, Ops.ASSIGN, Ops.BUFFER, Ops.MSELECT, Ops.MSTACK, Ops.MULTI} def append_to_kernel(x:UOp): new_srcs: list[UOp] = [] metadata = x.arg.metadata @@ -280,15 +258,29 @@ create_kernels = PatternMatcher([ (UPat(Ops.GBARRIER, src=(UPat.var("x"),)), create_kernel), # walk back the local graph until we reach a realized source (UPat(Ops.KERNEL, name="x"), append_to_kernel), - # remove extra views and constants from SINK - (UPat(Ops.SINK, name="x"), - lambda x: x.replace(src=new_src) if (new_src:=tuple(dedup(s.base for s in x.src if s.base.op not in {Ops.CONST, Ops.BIND}))) != x.src else None), # push RESHAPE through MSELECT (UPat(Ops.MSELECT, src=(UPat(Ops.RESHAPE, name="r"),), name="ms"), lambda ms,r: r.src[0].mselect(ms.arg).reshape(r.arg)), + # push RESHAPE through MSTACK + (UPat(Ops.MSTACK, src=UPat(Ops.RESHAPE), name="ms"), + lambda ms: UOp(Ops.MSTACK, ms.dtype, tuple(x.src[0] for x in ms.src)).reshape(ms.src[0].arg)), ]) # **** swizzler +merge_views = PatternMatcher([ + # merge adjacent views + (UPat(Ops.VIEW, src=(UPat(Ops.VIEW, name="v1"),), name="v2"), lambda v1,v2: v1.replace(arg=v1.arg+v2.arg)), + # replace MovementOps with VIEW + (UPat(GroupOp.Movement, src=(UPat.var("x"),), name="mop"), lambda mop,x: x.base.view(mop.st)), + # remove NOOP views + (UPat.var("x").view(name="view"), lambda x,view: x if x.st is not None and view.st.contiguous and view.shape == x.shape else None), + (UPat(GroupOp.All-{Ops.DEFINE_GLOBAL}).view(name="view"), + lambda view: view.const_like(0) if (mask:=view.st.views[-1].mask) is not None and any((x[1]-x[0]) == 0 for x in mask) else None), + # only unmaksed VIEW on CONST replaces the ShapeTracker + (UPat(Ops.VIEW, src=(UPat((Ops.CONST, Ops.DEFINE_VAR), name="x"),), name="view"), + lambda x,view: 
x.replace(src=(x.src[0].replace(arg=x.st+view.st),)) if all(v.mask is None for v in (x.st+view.st).views) else None), +]) + def reduce_push_add_ones(src:UOp, r:UOp, view:UOp): # contiguous, expand, and the same with ones removed if unwrap(view.st).contiguous and len(r.shape) < len(view.shape) and \ @@ -312,37 +304,41 @@ def reduce_push_add_ones(src:UOp, r:UOp, view:UOp): return None view_left = merge_views+PatternMatcher([ - # view before elementwise ops - (UPat(Ops.VIEW, src=(UPat({*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.BIND}, name="e"),), name="view"), - lambda e,view: e.replace(src=tuple(s.view(s.st+view.st) if s.op is Ops.VIEW else s.view(view.st) for s in e.src))), + # view before elementwise and buffer ops + (UPat(Ops.VIEW, src=(UPat({*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.BIND, Ops.LOAD, Ops.STORE, Ops.VALID}, name="e"),), name="view"), + lambda e,view: e.replace(src=tuple(s.view(view.st) for s in e.src))), # if there's ones added after reduce, put this before the reduce (UPat(Ops.VIEW, src=(UPat(Ops.REDUCE_AXIS, src=(UPat.var("src"),), name="r"),), name="view"), reduce_push_add_ones), ]) def apply_swizzle(u:UOp) -> UOp: return graph_rewrite(u, view_left, name="Sub View Left") +# change reduceop axes and input ShapeTrackers, view gets replaced with a reshape. 
def swizzle_reduceop(r:UOp, src:UOp, view:UOp, fuse=False): - if (st:=unwrap(view.st)).contiguous and st.size == r.size: return None + # contiguous and same size can push to children + # if there's a reduce child, shapes match with ones removed + if unwrap(view.st).contiguous and view.size == r.size and \ + (not (len(r.arg) == 3 and r.arg[2]) or # arg[2] = True is fuse marker + tuple((i,x) for i,x in enumerate(r.shape) if resolve(x != 1)) == tuple((i,x) for i,x in enumerate(view.shape) if resolve(x != 1))): + return None + # swizzle the input input_st = ShapeTracker.from_shape(src.shape) tmp = input_st.permute(tuple(i for i in range(len(input_st.shape)) if i not in r.axis_arg)+r.axis_arg) prshape = prod(rshape:=tmp.shape[-len(r.axis_arg):]) strides = strides_for_shape(rshape) nv = [View.create(v.shape+rshape, tuple(x*prshape for x in v.strides)+strides, - v.offset*prshape, v.mask+tuple((0,s) for s in rshape) if v.mask is not None else None) for v in st.views] - # create a new reduceop for the swizzled input - new_input_st = tmp + ShapeTracker(tuple(nv)) - new_axis = tuple(range(len(st.shape), len(st.shape) + len(r.axis_arg))) - swizzled_src = apply_swizzle(src.view(src.arg+new_input_st if src.op is Ops.VIEW else new_input_st)) - if fuse: red = UOp(Ops.REDUCE_AXIS, r.dtype, (swizzled_src.fuse(),), (r.arg[0], new_axis, True)) - else: red = UOp(Ops.REDUCE_AXIS, r.dtype, (swizzled_src,), (r.arg[0], new_axis)) - return red.view(ShapeTracker.from_shape(st.shape)) + v.offset*prshape, v.mask+tuple((0,s) for s in rshape) if v.mask is not None else None) for v in unwrap(view.st).views] + new_view = tmp + ShapeTracker(tuple(nv)) + swizzled_input = apply_swizzle(src.view(new_view)) + # create a new reduceop + new_axis = tuple(range(len(view.shape), len(view.shape) + len(r.axis_arg))) + if fuse: red = UOp(Ops.REDUCE_AXIS, r.dtype, (swizzled_input.fuse(),), (r.arg[0], new_axis, True)) + else: red = UOp(Ops.REDUCE_AXIS, r.dtype, (swizzled_input,), (r.arg[0], new_axis)) + return 
red.reshape(view.shape) def reduceop_view_right(src:UOp, v:UOp, r:UOp): assert unwrap(v.st).contiguous and v.size == src.size, f"can't compute new axis for {src.shape} -> {r.shape}" - if (contraction:=get_contraction(v.shape, src.shape)) is None: return None - new_axis: list[int] = [] - for i,pairs in enumerate(contraction): - if any(x in r.axis_arg for x in pairs): new_axis.append(i) + new_axis = [i for i,(s,u) in enumerate(zip(src.shape, r.shape)) if s != u] return src.r(r.arg[0], tuple(new_axis)).reshape(r.shape) def elementwise_view_right(root:UOp): @@ -350,7 +346,7 @@ def elementwise_view_right(root:UOp): assert all_same([x.base.size for x in swizzles]), f"swizzle inputs must have the same size {swizzles}" # place view after applying the elementwise op new_st = ShapeTracker.from_shape(swizzles[0].base.shape) - new_src = [x.base if x.base.shape==new_st.shape else apply_swizzle(x.view(x.arg+new_st) if x.op is Ops.VIEW else x.view(new_st)) for x in root.src] + new_src = [x.base if x.base.shape==new_st.shape else apply_swizzle(x.view(new_st)) for x in root.src] # reshape to match downstream shapes return root.replace(src=tuple(new_src)).reshape(root.shape) @@ -361,7 +357,7 @@ view_right = merge_views+PatternMatcher([ # apply view after reduceops (UPat(Ops.REDUCE_AXIS, src=(UPat(Ops.VIEW, src=(UPat(GroupOp.All-ALWAYS_CONTIGUOUS, name="src"),), name="v"),), name="r"), reduceop_view_right), # apply view after elementwise ops - (UPat(GroupOp.All-{Ops.SINK}, name="root"), elementwise_view_right), + (UPat(GroupOp.All-{Ops.SINK, Ops.GBARRIER}, name="root"), elementwise_view_right), # merge axes for double reduce (invert of SPLIT_REDUCEOP=1) (UPat(Ops.REDUCE_AXIS, src=(UPat(Ops.REDUCE_AXIS, name="r1"),), name="r2"), lambda r1,r2: r1.replace(arg=(r1.arg[0], r2.arg[1]+r1.arg[1])) if r1.arg[0] is r2.arg[0] else None), @@ -371,15 +367,15 @@ view_right = merge_views+PatternMatcher([ add_buffer_ops = PatternMatcher([ # LOAD - (UPat(Ops.BUFFER, name="x"), lambda ctx,x: 
UOp.load(UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(x.size), (), ctx.index(x)), x.st.to_uop())), + (UPat(Ops.BUFFER, name="x"), lambda ctx,x: UOp.load(UOp(Ops.DEFINE_GLOBAL, x.dtype.ptr(x.size), (), ctx.index(x)).view(x.st),)), # STORE (except for meta ops) (UPat(Ops.SINK, src=(UPat(GroupOp.Meta, name="x"),)), lambda x:x), (UPat(Ops.SINK, src=UPat(GroupOp.All-{Ops.STORE}), name="sink"), lambda ctx,sink: - UOp.sink(*[UOp.store(UOp(Ops.DEFINE_GLOBAL, (s:=x.base).dtype.ptr(ctx[i].size), (), i), s.st.to_uop(), s) for i,x in enumerate(sink.src)])), + UOp.sink(*[UOp.store(UOp(Ops.DEFINE_GLOBAL, (s:=x.base).dtype.ptr(ctx[i].size), (), i).view(s.st), s) for i,x in enumerate(sink.src)])), # passthrough ASSIGN (UPat(Ops.ASSIGN, name="x"), lambda x: x.src[1]), # VALID - (UPat(Ops.VIEW, src=(UPat((Ops.CONST, Ops.DEFINE_VAR), name="x"),), name="view"), lambda x,view: x.valid(view.arg)), + (UPat(Ops.VIEW, src=(UPat.cvar(),), name="self"), UOp.valid), ]) def check_load_st(glbl:UOp, view:UOp): @@ -396,15 +392,17 @@ fix_kernel_ops = PatternMatcher([ # remove CONTIGUOUS/DEVICE from kernel AST (UPat((Ops.CONTIGUOUS, Ops.MSELECT), src=(UPat.var("x"),)), lambda x: x), (UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),), name="view"), lambda view: view.replace(src=())), - # no ImageDType after load - (UPat(GroupOp.All-{Ops.DEFINE_GLOBAL}, name="x"), lambda x: x.replace(dtype=x.dtype.base) if isinstance(x.dtype, ImageDType) else None), + # no ImageDType after index + (UPat(GroupOp.All-{Ops.DEFINE_GLOBAL, Ops.VIEW}, name="x"), lambda x: x.replace(dtype=x.dtype.base) if isinstance(x.dtype, ImageDType) else None), # if this kernel also assigns to the loaded buffer, ensure we can index it correctly - (UPat(Ops.LOAD, src=(UPat.var("glbl"), UPat.var("view"))), check_load_st), + (UPat(Ops.LOAD, src=(UPat.var("glbl").view(name="view"),)), check_load_st), ]) replace_globals = PatternMatcher([ # replace ASSIGN with the target BUFFER (UPat(Ops.ASSIGN, src=(UPat(Ops.BUFFER), UPat(Ops.KERNEL)), name="assign", 
allow_any_len=True), lambda assign: assign.src[0]), + # HACK: select the 0 branch of MSTACK (the device is wrong after this, is that okay?) + (UPat(Ops.MSTACK, name="x"), lambda x: x.src[0]), ]) def fix_kernel_ast(k:UOp) -> UOp|None: @@ -414,8 +412,13 @@ def fix_kernel_ast(k:UOp) -> UOp|None: # push views to edges ast = graph_rewrite(graph_rewrite(ast, view_left, name="Main View Left"), view_right, name="Main View Right") # replace buffer with define_global + add load/store last - bufs = tuple(s.buf_uop if s.op is not Ops.MSELECT else s.src[0].buf_uop for s in k.src) - ast = graph_rewrite(ast, merge_views+add_buffer_ops+fix_kernel_ops, bufs, bottom_up=True, name="replace buffer") + bufs = [] + for s in k.src: + s = s.buf_uop + # traverse back through MSELECT and MSTACK. HACK: 0 branch of MSTACK only + while s.op in {Ops.MSELECT, Ops.MSTACK}: s = s.src[0] + bufs.append(s) + ast = graph_rewrite(ast, view_left+add_buffer_ops+fix_kernel_ops, bufs, bottom_up=True, name="replace buffer") if ast.op is Ops.SINK and not all_same([x.device for x in k.src]): raise RuntimeError(f"all buffers must be on the same device: {tuple(b.buf_uop.buffer for b in k.src)}") return k.replace(arg=Kernel(ast, k.arg.metadata)) @@ -448,7 +451,7 @@ pm_fuse = PatternMatcher([ # FUSE elementwise. 
(UPat(Ops.VIEW, src=(UPat({*GroupOp.ALU, Ops.CAST}, name="alu"),), name="view").fuse(), - lambda alu, view: alu.replace(src=tuple(x.view(x.arg+view.arg if x.op is Ops.VIEW else view.arg).fuse() for x in alu.src))), + lambda alu, view: alu.replace(src=tuple(apply_swizzle(x.view(view.arg)).fuse() for x in alu.src))), # push FUSE through to srcs (UPat(Ops.FUSE, name="x"), lambda x: x.src[0].replace(src=tuple(y.fuse() for y in x.src[0].src))), @@ -499,17 +502,6 @@ do_fuse = PatternMatcher([ (UPat(Ops.REDUCE_AXIS, name="root"), fuse_arange), ]) -PROCESS_REPLAY_CAPTURE:dict[int,bytes] = {} -if CAPTURE_PROCESS_REPLAY: - import atexit - @atexit.register - def save_process_replay(): - for k,v in PROCESS_REPLAY_CAPTURE.items(): diskcache_put("schedule_process_replay", k, v, prepickled=True) - -def get_name(becomes_map:dict[UOp, UOp]) -> str: - assigned_kernels = {u.base.buf_uop:u.base.src[1] for u in becomes_map.values() if u.base.op is Ops.ASSIGN}.values() - return f"Schedule {pluralize('Kernel', len(set(assigned_kernels)))}" - add_gbarrier = PatternMatcher([(UPat(GroupOp.All-{Ops.GBARRIER, Ops.ASSIGN}, name="x"), lambda ctx,x: x.replace(tag=1).gbarrier() if x in ctx and x.tag is None else None)]) @@ -523,21 +515,27 @@ def limit_bufs(root:UOp): # count number of unique buffers flowing into this op bufs: set[UOp] = set() def gate_input(u:UOp): - if (is_buffer:=(u.op in {Ops.BUFFER, Ops.GBARRIER, Ops.ASSIGN})): bufs.add(u) - return not is_buffer + if (is_load:=(u.op in {Ops.BUFFER, Ops.GBARRIER, Ops.ASSIGN, Ops.MSTACK})): bufs.add(u) + return not is_load root.toposort(gate=gate_input) # NOTE: this -1 is for the output buffer if len(bufs)>=MAX_BUFS-1: return root.replace(src=tuple(s if s.base in bufs else s.replace(tag=1).gbarrier() for s in root.src)) -split_kernels = PatternMatcher([ +finalize_gbarrier = PatternMatcher([ + # if an op takes more than one input, check combined LOADs don't exceed device limits (UPat(set.union(GroupOp.Binary, GroupOp.Ternary), name="root"), 
limit_bufs), + # merge gbarrier (UPat((Ops.GBARRIER, Ops.CONTIGUOUS), src=(UPat(Ops.GBARRIER),), name="x"), lambda x: x.src[0]), + # add contiguous to VIEW before GBARRIER + (UPat(Ops.GBARRIER, src=(UPat(Ops.VIEW,),), name="x"), lambda x: x.src[0].contiguous().gbarrier()), + # remove gbarrier on constants without a contiguous + (UPat(Ops.GBARRIER, src=(UPat(Ops.CONST),), name="x"), lambda x: x.src[0]), ]) remove_tags = PatternMatcher([(UPat(GroupOp.All, name="x"), lambda x: x.replace(tag=None) if x.tag is not None else None)]) -@track_rewrites(name_fxn=get_name) +@track_rewrites(name_fxn=lambda big_sink,ret: f"Schedule {pluralize('Kernel',len([u for u in ret[big_sink].toposort() if u.op is Ops.KERNEL]))}") def get_kernelize_map(big_sink:UOp) -> dict[UOp, UOp]: # multi + merge_views + simplify tensor_map = graph_rewrite_map(big_sink, multi_pm+replace_allreduce+do_fuse+merge_views+sym+replace_contiguous, ctx={}, name="merge_views") @@ -548,7 +546,8 @@ def get_kernelize_map(big_sink:UOp) -> dict[UOp, UOp]: # insert gbarriers in places determined by the realize map realize_map = group_realizes(tensor_map[big_sink]) tensor_map = graph_rewrite_map(tensor_map[big_sink], add_gbarrier, realize_map, bottom_up=True, input_map=tensor_map, name="insert_gbarrier") - tensor_map = graph_rewrite_map(tensor_map[big_sink], split_kernels, input_map=tensor_map, name="split_kernels") + # optionally reorder gbarriers or insert more (top down) + tensor_map = graph_rewrite_map(tensor_map[big_sink], finalize_gbarrier, input_map=tensor_map, name="finalize_gbarrier") tensor_map = graph_rewrite_map(tensor_map[big_sink], remove_tags, input_map=tensor_map, name="remove_tags") # TODO: move view_left/view_right here @@ -563,6 +562,7 @@ def get_kernelize_map(big_sink:UOp) -> dict[UOp, UOp]: if u.op is not Ops.ASSIGN: continue kernel_assign[u.buf_uop] = u for s in u.src[1].src: + # TODO: this is probably broken for MSELECT/MSTACK if s.op is not Ops.BUFFER or s is u.buf_uop or 
(a:=kernel_assign.get(s)) is None: continue if any(x.op is Ops.ASSIGN and x.buf_uop is s for x in u.toposort()): raise RuntimeError(f"cycle detected in graph, kernel for {u.buf_uop} must either depend on ASSIGN or BUFFER") @@ -579,22 +579,6 @@ def get_kernelize_map(big_sink:UOp) -> dict[UOp, UOp]: if getenv("VIZ"): graph_rewrite(sched_sink, PatternMatcher([]), name="View Memory Graph") # verify Kernels match the spec - type_verify(list(toposort:=sched_sink.toposort()), sched_spec) + if __debug__: type_verify(list(sched_sink.toposort()), tensor_uop_spec) - # capture process replay - if CAPTURE_PROCESS_REPLAY: - with Context(PICKLE_BUFFERS=0): - import pickle - PROCESS_REPLAY_CAPTURE[id(big_sink)] = pickle.dumps((big_sink, ContextVar._cache, [u.arg.ast for u in toposort if u.op is Ops.KERNEL])) - - # map tensors to buffer/assign/const - # TODO: this is not right, and causes TestDataset.test_dataset_is_realized to fail unless I unprincipledly add Ops.COPY, which breaks others - becomes_map: dict[UOp, UOp] = {} - for k,v in tensor_map.items(): - if k is v: continue - op = v.base.op - if op in {Ops.BUFFER, Ops.ASSIGN}: becomes_map[k] = v - if op is Ops.CONST and all_int(v.shape): becomes_map[k] = v - if op is Ops.MULTI and all(x.base in becomes_map for x in v.base.src): becomes_map[k] = v - - return becomes_map + return tensor_map diff --git a/tinygrad_repo/tinygrad/engine/jit.py b/tinygrad_repo/tinygrad/engine/jit.py index 23f2425..c63810f 100644 --- a/tinygrad_repo/tinygrad/engine/jit.py +++ b/tinygrad_repo/tinygrad/engine/jit.py @@ -176,7 +176,7 @@ class CapturedJit(Generic[ReturnType]): self.__post_init__() # reset the graph state def replan_buffers_memory_layout(self): - blacklist = [t.lazydata.buffer for t in get_parameters(self.ret)] + blacklist = [t.uop.buffer for t in get_parameters(self.ret)] asgn = _internal_memory_planner([[b for item in self.jit_cache for b in item.bufs if b is not None and b not in blacklist]], ignore_checks=True) self.jit_cache = 
[ExecItem(item.prg, [asgn.get(b,b) if b is not None else None for b in item.bufs]) for item in self.jit_cache] for old, new in asgn.items(): @@ -210,9 +210,9 @@ class CapturedJit(Generic[ReturnType]): def _prepare_jit_inputs(args, kwargs): input_tensors: list[tuple[int|str, Tensor]] = [(name,t) for name,t in list(enumerate(args))+sorted(kwargs.items()) if t.__class__ is Tensor] names, tensors = [name for name,_ in input_tensors], [t for _,t in input_tensors] - if len(unrealized_tensors := [x for x in tensors if not x.lazydata.is_realized]): Tensor.realize(*unrealized_tensors) + if len(unrealized_tensors := [x for x in tensors if not x.uop.is_realized]): Tensor.realize(*unrealized_tensors) # TODO: this multi unpack stuff is not well tested. - lbs: list[UOp] = flatten([t.lazydata.src if t.lazydata.op is Ops.MULTI else [t.lazydata] for t in tensors]) + lbs: list[UOp] = flatten([t.uop.src if t.uop.op is Ops.MULTI else [t.uop] for t in tensors]) input_buffers: list[Buffer] = flatten([rb.bufs if isinstance(rb:=lb.base.realized, MultiBuffer) else [rb] for lb in lbs if lb.base.realized is not None]) assert len(set(input_buffers)) == len(input_buffers), "duplicate inputs to JIT" diff --git a/tinygrad_repo/tinygrad/engine/multi.py b/tinygrad_repo/tinygrad/engine/multi.py index d76b538..0f544e0 100644 --- a/tinygrad_repo/tinygrad/engine/multi.py +++ b/tinygrad_repo/tinygrad/engine/multi.py @@ -1,6 +1,7 @@ +from typing import cast import functools, itertools, operator -from tinygrad.helpers import all_same, all_int, prod, DEBUG, RING, getenv -from tinygrad.uop.ops import Ops, UOp, sint, PatternMatcher, UPat, GroupOp +from tinygrad.helpers import all_same, all_int, prod, DEBUG, RING, getenv, unwrap +from tinygrad.uop.ops import Ops, UOp, sint, PatternMatcher, UPat, GroupOp, resolve # *** allreduce implementation *** @@ -38,11 +39,68 @@ def handle_allreduce(buf:UOp, red:UOp) -> UOp|None: .alu(red.arg, chunk.copy_to_device(buf.device[dest], dest)) 
reduced_chunks.append(reduced_chunk) - # allgather + reassemble - pads = [((s,numel-e),) for s,e in chunks] - return functools.reduce(operator.add, [c.copy_to_device(buf.device).pad(pad) for pad,c in zip(pads, reduced_chunks)]).reshape(shape) + # allgather + copied_chunks = [] + for i,c in enumerate(reduced_chunks): + this_chunk = [None] * len(buf.device) + this_chunk[(i+len(buf.device)-1)%n_lbs] = c + for step in range(n_lbs-1): + dest = (i+step)%n_lbs + this_chunk[dest] = c = c.copy_to_device(buf.device[dest]) + copied_chunks.append(UOp(Ops.MSTACK, buf.dtype, tuple(cast(list[UOp], this_chunk)))) -replace_allreduce = PatternMatcher([(UPat(Ops.ALLREDUCE, src=(UPat.var("buf"), UPat()), name="red"), handle_allreduce),]) + # reassemble + pads = [((s,numel-e),) for s,e in chunks] + return functools.reduce(operator.add, [c.pad(pad) for pad,c in zip(pads, copied_chunks)]).reshape(shape) + +# ***** multi rewrite MSELECT/MSTACK ***** + +def _replace_dnum(st, val): + # replace dnum in ShapeTracker with literal const for this mselect + if (dnums:=[x for x in st.vars() if x.op is Ops.DEFINE_VAR and x.arg[0] == '_device_num']): + assert len(dnums) == 1, f"view must have exactly 0 or 1 dnum, got {dnums}" + st = st.substitute({dnums[0]:dnums[0].const_like(val)}) + return st + +def mstack_reorder_view(ms:UOp): + args = [x.arg for x in ms.src] + if not all_same(args) or len([x for x in args[0].vars() if x.arg[0] == '_device_num']) != 0: return None + return UOp(Ops.MSTACK, ms.dtype, tuple(x.src[0] for x in ms.src)).view(args[0]) + +def mstack_early_shrink(view:UOp, ms:UOp): + if resolve(prod(view.shape) >= prod(ms.shape)) or _replace_dnum(view.st, 0) == view.st: return None + ret = [] + for i, x in enumerate(ms.src): + new_view = _replace_dnum(view.st, i) + if x.op is Ops.COPY: + # if src device doesn't have a renderer, we have to view after the copy + # TODO: a way to understand this + if x.src[0].device in {"DISK", "NPY"}: + ret.append(x.view(new_view)) + else: + 
ret.append(x.src[0].view(new_view).copy_to_device(x.device)) + else: + ret.append(x.view(new_view).contiguous()) + return ms.replace(src=tuple(ret)) + +replace_allreduce = PatternMatcher([ + (UPat(Ops.ALLREDUCE, src=(UPat.var("buf"), UPat()), name="red"), handle_allreduce), + # BROADCAST: explicitly expand broadcast copies and combine with MSTACK + (UPat(Ops.COPY, name="c", src=(UPat(GroupOp.All-{Ops.CONST}, name="x"), UPat(Ops.DEVICE))), lambda c,x: + UOp(Ops.MSTACK, c.dtype, tuple(x.copy_to_device(d) for d in c.device)) if isinstance(c.device, tuple) and isinstance(x.device, str) else None), + # COPY_TO_ONE: if copying from multidevice to one, MSELECT the first (TODO: a little from each?) + (UPat(Ops.COPY, name="c", src=(UPat(GroupOp.All-{Ops.CONST}, name="x"), UPat(Ops.DEVICE))), lambda c,x: + x.mselect(0).copy_to_device(c.device) if isinstance(c.device, str) and isinstance(x.device, tuple) else None), + # MSELECT on MSTACK is replaced with nothing + (UPat(Ops.MSELECT, src=(UPat(Ops.MSTACK, name="mstack"),), name="ms"), lambda mstack, ms: mstack.src[ms.arg]), + # MSELECT must select a base, if there are views apply them after selecting the base + (UPat(Ops.MSELECT, src=(UPat(Ops.VIEW, src=(UPat.var("base"),), name="view"),), name="ms"), lambda ms, view, base: + base.mselect(ms.arg).view(_replace_dnum(unwrap(view.st), ms.arg))), + # move view through MSTACK + (UPat(Ops.MSTACK, src=UPat(Ops.VIEW), name="ms"), mstack_reorder_view), + # move shrink before MSTACK + (UPat(Ops.VIEW, src=(UPat(Ops.MSTACK, name="ms"),), name="view"), mstack_early_shrink), +]) # ***** multi functions ***** diff --git a/tinygrad_repo/tinygrad/engine/realize.py b/tinygrad_repo/tinygrad/engine/realize.py index fc4352d..db0b0fc 100644 --- a/tinygrad_repo/tinygrad/engine/realize.py +++ b/tinygrad_repo/tinygrad/engine/realize.py @@ -13,7 +13,7 @@ from tinygrad.engine.schedule import ScheduleItem # **************** Program Creation **************** logkerns, logkerns_level = 
open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1) -def get_kernel(renderer:Renderer, ast:UOp) -> Kernel: +def get_program(renderer:Renderer, ast:UOp) -> ProgramSpec: k = Kernel(ast, opts=renderer) if not NOOPT: if not k.apply_tensor_cores(getenv("TC", 1)): k.apply_opts(hand_coded_optimizations(k)) @@ -23,7 +23,7 @@ def get_kernel(renderer:Renderer, ast:UOp) -> Kernel: rawbufs = bufs_from_lin(kb, allocate=False) k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1))) if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"]) - return k + return k.to_program() # **************** Runners **************** @@ -109,7 +109,7 @@ def get_runner(device:str, ast:UOp) -> CompiledRunner: if bret:=method_cache.get(bkey): method_cache[ckey] = ret = CompiledRunner(replace(bret.p, device=device), bret.lib) else: - prg: ProgramSpec = get_kernel(Device[device].renderer, ast).to_program() + prg: ProgramSpec = get_program(Device[device].renderer, ast) method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, device=device)) return ret diff --git a/tinygrad_repo/tinygrad/engine/schedule.py b/tinygrad_repo/tinygrad/engine/schedule.py index ea074ae..9a64639 100644 --- a/tinygrad_repo/tinygrad/engine/schedule.py +++ b/tinygrad_repo/tinygrad/engine/schedule.py @@ -21,7 +21,8 @@ def unbind_view(ctx:list[dict[Variable, int]], x:UOp): if any(x.op is Ops.BIND for x in st.vars()): st, var_vals = st.unbind() ctx.append(var_vals) - return x.replace(arg=st) if st != x.st else None + return x.replace(arg=st) + return None def unbind_bind(ctx:list[dict[Variable, int]], x:UOp): var, val = x.unbind() @@ -47,10 +48,13 @@ def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[ if s.op is Ops.ASSIGN: children[s.src[1]].append(k) in_degree[k] += 1 - elif s.op is Ops.MSELECT: - if s.src[0].op is not Ops.BUFFER: - children[s.src[0].src[1]].append(k) - in_degree[k] += 1 + elif 
s.op in {Ops.MSELECT, Ops.MSTACK}: + for ss in s.src: + if ss.op is Ops.MSELECT: ss = ss.src[0] + if ss.op is not Ops.BUFFER: + assert ss.op is Ops.ASSIGN + children[ss.src[1]].append(k) + in_degree[k] += 1 elif s.op is Ops.BUFFER: pass # a BUFFER is already realized, nothing to do here else: @@ -73,28 +77,12 @@ def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[ buffers[k.src[0]] = base.view(k.size, ast.dtype, ast.arg[1]*base.dtype.itemsize) ubufs = tuple(s.buf_uop.buffer for s in k.src) if any(isinstance(x, MultiBuffer) for x in ubufs): - if ast.op is Ops.COPY: - assert ast.arg is None, "copy arg is no longer supported" - if isinstance(ubufs[1], MultiBuffer): # src is multiple buffers, none selected - if isinstance(ubufs[0], MultiBuffer): - # COPY ALL -> ALL - assert len(ubufs[0].bufs) == len(ubufs[1].bufs), "all to all copy must have matching buffer length" - for b1,b2 in zip(ubufs[0].bufs, ubufs[1].bufs): schedule.append(ScheduleItem(ast, (b1, b2), k.arg.metadata)) - else: - # COPY ANY -> ONE. 
Currently we just select the first - schedule.append(ScheduleItem(ast, (ubufs[0], ubufs[1].bufs[0]), k.arg.metadata)) - else: - assert isinstance(ubufs[1], Buffer), "src can't be MultiBuffer" - if isinstance(ubufs[0], MultiBuffer): - # COPY ONE -> ALL (BROADCAST) - for b in ubufs[0].bufs: schedule.append(ScheduleItem(ast, (b, ubufs[1]), k.arg.metadata)) - else: schedule.append(ScheduleItem(ast, (ubufs[0], ubufs[1]), k.arg.metadata)) # COPY ONE -> ONE - else: - assert all(isinstance(x, MultiBuffer) for x in ubufs), "kernel must all be multibuffer" - dnums = [x for x in ast.variables() if x.arg[0] == '_device_num'] - for i,bufs in enumerate(zip(*[x.bufs for x in cast(tuple[MultiBuffer, ...], ubufs)])): - schedule.append(ScheduleItem(ast, bufs, k.arg.metadata, {dnums[0]:i} if len(dnums) else {})) + assert all(isinstance(x, MultiBuffer) for x in ubufs), "kernel must all be multibuffer" + dnums = [x for x in ast.variables() if x.arg[0] == '_device_num'] + for i,bufs in enumerate(zip(*[x.bufs for x in cast(tuple[MultiBuffer, ...], ubufs)])): + schedule.append(ScheduleItem(ast, bufs, k.arg.metadata, {dnums[0]:i} if len(dnums) else {})) else: + # ONE -> ONE schedule.append(ScheduleItem(ast, cast(tuple[Buffer, ...], ubufs), k.arg.metadata)) for x in children[k]: in_degree[x] -= 1 diff --git a/tinygrad_repo/tinygrad/engine/search.py b/tinygrad_repo/tinygrad/engine/search.py index 739b74d..9e92df3 100644 --- a/tinygrad_repo/tinygrad/engine/search.py +++ b/tinygrad_repo/tinygrad/engine/search.py @@ -25,21 +25,21 @@ actions += [Opt(op=OptOps.TC, axis=axis, arg=(-1, getenv("TC_OPT", 2), getenv("T actions += [Opt(op=OptOps.SWAP, axis=axis_0, arg=axis_1) for axis_0 in range(5) for axis_1 in range(axis_0+1, 5)] if getenv("NOLOCALS"): actions += [Opt(op=OptOps.NOLOCALS)] -def _get_test_global_size(global_size, max_global_size, var_vals): - test_global_size, factor = [sym_infer(sz, var_vals) for sz in global_size], 1 +def get_test_global_size(global_size, max_global_size, var_vals): 
+ test_global_size = [sym_infer(sz, var_vals) for sz in global_size] + input_size = prod(test_global_size) while prod(test_global_size) > max_global_size: for j in range(len(global_size)-1,-1,-1): if test_global_size[j] > 16: test_global_size[j] //= 2 - factor *= 2 break - return test_global_size, factor + return test_global_size, input_size / prod(test_global_size) def _time_program(p:ProgramSpec, lib:bytes, var_vals:dict[Variable, int], rawbufs:list[Buffer], early_stop:Optional[float]=None, allow_test_size:int=True, max_global_size:Optional[int]=65536, clear_l2=False, cnt=3, name="test") -> list[float]: factor = 1 if allow_test_size and p.global_size is not None and max_global_size is not None: - global_size, factor = _get_test_global_size(p.global_size, max_global_size, var_vals) + global_size, factor = get_test_global_size(p.global_size, max_global_size, var_vals) p = replace(p, global_size=global_size) try: car = CompiledRunner(p, precompiled=lib) except AssertionError: return [math.inf] * cnt @@ -81,8 +81,10 @@ def _try_compile_linearized_w_idx(x:tuple[int,Kernel], compiler:Compiler) -> tup if hasattr(signal, "alarm"): signal.alarm(0) return x[0], ret -# workers should ignore ctrl c -def _init_worker(): signal.signal(signal.SIGINT, signal.SIG_IGN) +# workers should not open devices and should ignore ctrl c +def _init_worker(): + Context(ALLOW_DEVICE_USAGE=0).__enter__() + signal.signal(signal.SIGINT, signal.SIG_IGN) def _ensure_buffer_alloc(bufs:list[Buffer]) -> list[Buffer]: return [buf.ensure_allocated() if buf is not None else buf for buf in bufs] @@ -92,7 +94,7 @@ def _ensure_buffer_alloc(bufs:list[Buffer]) -> list[Buffer]: return [buf.ensure_ def bufs_from_lin(lin:Kernel, allocate:bool=True) -> list[Buffer]: bufsts: defaultdict[int, list[UOp]] = defaultdict(list) for x in lin.bufs: - if x.src[0].op is Ops.DEFINE_GLOBAL: bufsts[x.src[0].arg].append(x) + if x.src[0].base.op is Ops.DEFINE_GLOBAL: bufsts[x.src[0].base.arg].append(x) # TODO: Nones are staying 
in here if buffers are optimized out! # TODO: add a test for this rawbufs: list[Optional[Buffer]] = [None]*(max(bufsts)+1) diff --git a/tinygrad_repo/tinygrad/frontend/onnx.py b/tinygrad_repo/tinygrad/frontend/onnx.py index 5c88616..db34a70 100644 --- a/tinygrad_repo/tinygrad/frontend/onnx.py +++ b/tinygrad_repo/tinygrad/frontend/onnx.py @@ -1,5 +1,7 @@ # type: ignore import sys, pathlib sys.path.append(pathlib.Path(__file__).parent.parent.as_posix()) -try: from extra.onnx import OnnxRunner # noqa: F401 # pylint: disable=unused-import +try: + from extra.onnx import OnnxRunner # noqa: F401 # pylint: disable=unused-import + from extra.onnx_parser import onnx_load # noqa: F401 # pylint: disable=unused-import except ImportError as e: raise ImportError("onnx frontend not in release\nTo fix, install tinygrad from a git checkout with pip install -e .") from e \ No newline at end of file diff --git a/tinygrad_repo/tinygrad/helpers.py b/tinygrad_repo/tinygrad/helpers.py index d21935f..1c2f7ff 100644 --- a/tinygrad_repo/tinygrad/helpers.py +++ b/tinygrad_repo/tinygrad/helpers.py @@ -117,8 +117,9 @@ PICKLE_BUFFERS, PROFILE, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("PROF CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1) DISABLE_COMPILER_CACHE = ContextVar("DISABLE_COMPILER_CACHE", 0) DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES = ContextVar("DONT_REALIZE_EXPAND", 0), ContextVar("DONT_GROUP_REDUCES", 0) -QUANTIZE, VALIDATE_WITH_CPU, IGNORE_OOB = ContextVar("QUANTIZE", 0), ContextVar("VALIDATE_WITH_CPU", 0), ContextVar("IGNORE_OOB", 1) -CORRECT_DIVMOD_FOLDING = ContextVar("CORRECT_DIVMOD_FOLDING", 0) +QUANTIZE, VALIDATE_WITH_CPU = ContextVar("QUANTIZE", 0), ContextVar("VALIDATE_WITH_CPU", 0) +CORRECT_DIVMOD_FOLDING, FUSE_OPTIM = ContextVar("CORRECT_DIVMOD_FOLDING", 0), ContextVar("FUSE_OPTIM", 0) +ALLOW_DEVICE_USAGE = ContextVar("ALLOW_DEVICE_USAGE", 1) @dataclass(frozen=True) class 
Metadata: @@ -175,7 +176,7 @@ class Profiling(contextlib.ContextDecorator): cache_dir: str = os.path.join(getenv("XDG_CACHE_HOME", os.path.expanduser("~/Library/Caches" if OSX else "~/.cache")), "tinygrad") CACHEDB: str = getenv("CACHEDB", os.path.abspath(os.path.join(cache_dir, "cache.db"))) -VERSION = 19 +VERSION = 20 _db_connection = None def db_connection(): global _db_connection @@ -220,17 +221,13 @@ def diskcache_put(table:str, key:Union[dict, str, int], val:Any, prepickled=Fals cur.close() return val -def diskcache(func): - def wrapper(*args, **kwargs) -> bytes: +def diskcache(func:Callable[..., T]): + def wrapper(*args, **kwargs) -> T: table, key = f"cache_{func.__name__}", hashlib.sha256(pickle.dumps((args, kwargs))).hexdigest() if (ret:=diskcache_get(table, key)) is not None: return ret return diskcache_put(table, key, func(*args, **kwargs)) return wrapper -# *** process replay *** - -CAPTURE_PROCESS_REPLAY = getenv("CAPTURE_PROCESS_REPLAY") - # *** http support *** def _ensure_downloads_dir() -> pathlib.Path: diff --git a/tinygrad_repo/tinygrad/nn/optim.py b/tinygrad_repo/tinygrad/nn/optim.py index e6736bc..9b469fe 100644 --- a/tinygrad_repo/tinygrad/nn/optim.py +++ b/tinygrad_repo/tinygrad/nn/optim.py @@ -1,5 +1,6 @@ # sorted in order of increasing complexity -from tinygrad.helpers import dedup, flatten, getenv, unwrap +import itertools +from tinygrad.helpers import dedup, flatten, getenv, unwrap, FUSE_OPTIM from tinygrad.tensor import Tensor from tinygrad.dtype import dtypes, least_upper_dtype @@ -7,7 +8,7 @@ class Optimizer: """ Base class for all optimizers. 
""" - def __init__(self, params: list[Tensor], lr: float): + def __init__(self, params: list[Tensor], lr: float, fused=FUSE_OPTIM): # if it's None, but being put into an optimizer, set it to True for x in params: if x.requires_grad is None: x.requires_grad = True @@ -16,9 +17,15 @@ class Optimizer: assert len(self.params) != 0, "optimizer must have at least one param" self.device = self.params[0].device self.buffers: list[Tensor] = dedup([x for x in params if not x.requires_grad]) # buffers are still realized + self.fused = fused # store lr in at least float32 precision self.lr = Tensor(lr if getenv("CONST_LR") else [lr], requires_grad=False, device=self.device, dtype=least_upper_dtype(dtypes.default_float, dtypes.float32)) + if self.fused: self.pos_params = list(itertools.accumulate(self.params, lambda x,y: x+y.flatten().shape[0], initial=0)) + + def _new_optim_param(self) -> list[Tensor]: + if self.fused: return [Tensor.zeros(self.pos_params[-1], dtype=dtypes.float32, device=self.device, requires_grad=False).contiguous()] + return [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params] def zero_grad(self): """ @@ -39,9 +46,17 @@ class Optimizer: if not Tensor.training: raise RuntimeError( f"""Tensor.training={Tensor.training}, Tensor.training must be enabled to use the optimizer. 
- help: Consider setting Tensor.training=True before calling Optimizer.step().""") - return self.schedule_step_with_grads([unwrap(t.grad) for t in self.params])+self.params+self.buffers + if self.fused: + # optimizer fusion just concatentates all the buffers, runs the _step, then splits them back up + out, extra = self._step([Tensor.cat(*[t.flatten() for t in self.params], dim=0)], + [Tensor.cat(*[unwrap(t.grad).flatten() for t in self.params], dim=0)]) + updated_params = [out[0][self.pos_params[i]:self.pos_params[i+1]].reshape(tt.shape) for i, tt in enumerate(self.params)] + else: + updated_params, extra = self._step(self.params, [unwrap(t.grad) for t in self.params]) + for i, tt in enumerate(self.params): tt.assign(updated_params[i]) + return extra+self.params+self.buffers - def schedule_step_with_grads(self, grads:list[Tensor]) -> list[Tensor]: raise NotImplementedError + def _step(self, params:list[Tensor], grads:list[Tensor]) -> tuple[list[Tensor], list[Tensor]]: raise NotImplementedError class OptimizerGroup(Optimizer): """ @@ -55,7 +70,7 @@ class OptimizerGroup(Optimizer): def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()] # LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD. -def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False): +def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False, fused=FUSE_OPTIM): """ Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay. 
@@ -63,7 +78,7 @@ def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov - Described: https://paperswithcode.com/method/sgd """ - return LARS(params, lr, momentum, weight_decay, nesterov, classic, tcoef=0.0) + return LARS(params, lr, momentum, weight_decay, nesterov, classic, tcoef=0.0, fused=fused) class LARS(Optimizer): """ @@ -72,14 +87,14 @@ class LARS(Optimizer): - Described: https://paperswithcode.com/method/lars - Paper: https://arxiv.org/abs/1708.03888v3 """ - def __init__(self, params:list[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=False, classic=True, tcoef=0.001): - super().__init__(params, lr) + def __init__(self, params:list[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-4, nesterov=False, classic=True, tcoef=0.001, fused=FUSE_OPTIM): + super().__init__(params, lr, fused) self.momentum, self.wd, self.nesterov, self.classic, self.tcoef = momentum, weight_decay, nesterov, classic, tcoef - self.b = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params] \ - if self.momentum else [] + self.b = self._new_optim_param() if self.momentum else [] - def schedule_step_with_grads(self, grads:list[Tensor]) -> list[Tensor]: - for i, (t, g) in enumerate(zip(self.params, grads)): + def _step(self, params:list[Tensor], grads:list[Tensor]) -> tuple[list[Tensor], list[Tensor]]: + ret = [] + for i, (t, g) in enumerate(zip(params, grads)): if self.tcoef != 0: r1 = t.detach().square().sum().sqrt() r2 = g.square().sum().sqrt() @@ -95,26 +110,26 @@ class LARS(Optimizer): g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i] # popular momentum does pre learning rate update if not self.classic: g = g * r * self.lr - t.assign((t.detach() - g).cast(t.dtype)) - return self.b + ret.append((t.detach() - g).cast(t.dtype)) + return ret, self.b # LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's 
just Adam/W. -def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01): +def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01, fused=FUSE_OPTIM): """ AdamW optimizer with optional weight decay. - Described: https://paperswithcode.com/method/adamw - Paper: https://arxiv.org/abs/1711.05101v3 """ - return LAMB(params, lr, b1, b2, eps, weight_decay, adam=True) -def Adam(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8): + return LAMB(params, lr, b1, b2, eps, weight_decay, adam=True, fused=fused) +def Adam(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, fused=FUSE_OPTIM): """ Adam optimizer. - Described: https://paperswithcode.com/method/adam - Paper: https://arxiv.org/abs/1412.6980 """ - return LAMB(params, lr, b1, b2, eps, 0.0, adam=True) + return LAMB(params, lr, b1, b2, eps, 0.0, adam=True, fused=fused) class LAMB(Optimizer): """ @@ -123,17 +138,18 @@ class LAMB(Optimizer): - Described: https://paperswithcode.com/method/lamb - Paper: https://arxiv.org/abs/1904.00962 """ - def __init__(self, params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, adam=False): - super().__init__(params, lr) + def __init__(self, params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, adam=False, fused=FUSE_OPTIM): + super().__init__(params, lr, fused) self.b1, self.b2, self.eps, self.wd, self.adam = b1, b2, eps, weight_decay, adam self.b1_t, self.b2_t = (Tensor.ones((1,), dtype=dtypes.float32, device=self.device, requires_grad=False).contiguous() for _ in [b1, b2]) - self.m = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params] - self.v = [Tensor.zeros(*t.shape, dtype=dtypes.float32, device=t.device, requires_grad=False).contiguous() for t in self.params] + self.m = self._new_optim_param() + self.v = self._new_optim_param() - def schedule_step_with_grads(self, grads:list[Tensor]) -> 
list[Tensor]: + def _step(self, params:list[Tensor], grads:list[Tensor]) -> tuple[list[Tensor], list[Tensor]]: + ret = [] self.b1_t *= self.b1 self.b2_t *= self.b2 - for i, (t, g) in enumerate(zip(self.params, grads)): + for i, (t, g) in enumerate(zip(params, grads)): self.m[i].assign(self.b1 * self.m[i] + (1.0 - self.b1) * g) self.v[i].assign(self.b2 * self.v[i] + (1.0 - self.b2) * (g * g)) m_hat = self.m[i] / (1.0 - self.b1_t) @@ -145,5 +161,5 @@ class LAMB(Optimizer): r: Tensor|float = Tensor.where(r1 > 0, Tensor.where(r2 > 0, r1 / r2, 1.0), 1.0) else: r = 1.0 - t.assign((t.detach() - self.lr * r * up).cast(t.dtype)) - return [self.b1_t, self.b2_t] + self.m + self.v + ret.append((t.detach() - self.lr * r * up).cast(t.dtype)) + return ret, [self.b1_t, self.b2_t] + self.m + self.v diff --git a/tinygrad_repo/tinygrad/nn/state.py b/tinygrad_repo/tinygrad/nn/state.py index 9ab16ca..6b56a45 100644 --- a/tinygrad_repo/tinygrad/nn/state.py +++ b/tinygrad_repo/tinygrad/nn/state.py @@ -155,7 +155,7 @@ def load_state_dict(model, state_dict:dict[str, Tensor], strict=True, verbose=Tr raise ValueError(f'Shape mismatch in layer `{k}`: Expected shape {v.shape}, but found {state_dict[k].shape} in state dict.') if isinstance(v.device, tuple): if isinstance(state_dict[k].device, tuple): v.replace(state_dict[k]) - else: v.replace(state_dict[k].shard(v.device, v.lazydata.axis)) + else: v.replace(state_dict[k].shard(v.device, v.uop.axis)) else: v.replace(state_dict[k].to(v.device)) if realize: v.realize() if consume: del state_dict[k] diff --git a/tinygrad_repo/tinygrad/renderer/cstyle.py b/tinygrad_repo/tinygrad/renderer/cstyle.py index debdd97..15592b8 100644 --- a/tinygrad_repo/tinygrad/renderer/cstyle.py +++ b/tinygrad_repo/tinygrad/renderer/cstyle.py @@ -50,8 +50,9 @@ base_rewrite = PatternMatcher([ (UPat(Ops.LOAD, src=(UPat.var('bidx'),), allow_any_len=True), lambda ctx,bidx: f"*{ctx[bidx]}"), (UPat(Ops.STORE, src=(UPat.var('bidx'), UPat.var("var")), allow_any_len=True), lambda 
ctx,bidx,var: f"*{ctx[bidx]} = {ctx[var]};"), # alu/gep + # TODO: look for left-associative (UPat(GroupOp.ALU, name="x"), lambda ctx,x: ctx.code_for_op[x.op]( - *([strip_parens(ctx[v]) if v.op == x.op and x.op in {Ops.ADD, Ops.MUL, Ops.XOR} else ctx[v] for v in x.src]), x.dtype)), + *([strip_parens(ctx[v]) if v.op == x.op and x.op in {Ops.ADD, Ops.MUL, Ops.XOR, Ops.OR, Ops.AND} else ctx[v] for v in x.src]), x.dtype)), (UPat(Ops.GEP, name="x"), lambda ctx,x: ctx[x.src[0]] + \ (f"[{x.arg[0]}]" if x.src[0].dtype.count > ctx.gep_arr_threshold else f".{'xyzwabcd'[x.arg[0]]}")), # custom passes through with format @@ -75,7 +76,7 @@ extra_pm = PatternMatcher([ def uops_to_dtypes(uops:list[UOp]) -> list[DType]: return dedup(u.dtype for u in uops if not isinstance(u.dtype, (ImageDType, PtrDType))) class CStyleLanguage(Renderer): - kernel_prefix: str = "" + kernel_typedef: str = "void" buffer_prefix: str = "" buffer_suffix: str = "" smem_align: str = "" @@ -103,12 +104,12 @@ class CStyleLanguage(Renderer): string_rewrite = base_rewrite extra_matcher = extra_pm - def get_kernel_modifier(self, uops:list[UOp]) -> str: return "" def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str: tmp = "const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" if any(isinstance(dtype, ImageDType) for _,(dtype,_) in bufs) else "" # noqa: E501 buftypes = [(name, self.render_dtype(dtype, mutable)+self.buffer_suffix if isinstance(dtype, (ImageDType, PtrDType)) else self.arg_int_prefix if dtype == dtypes.int else None) for name,(dtype,mutable) in bufs] - prg = ''.join([f"{self.kernel_prefix}void {self.get_kernel_modifier(uops)}{function_name}(",] + + launch_bounds = prod(u.arg[1] for u in uops if u.op is Ops.SPECIAL and u.arg[0][0] == "l") + prg = ''.join([f"{self.kernel_typedef.format(launch_bounds=launch_bounds)} {function_name}(",] + [', '.join([f'{t} {name}' for name,t in 
buftypes] + self.extra_args)] + [") {\n" + tmp] + ['\n'.join(kernel), "\n}"]) return prg if prefix is None else "\n".join(prefix)+f"\n{prg}" @@ -200,7 +201,7 @@ class ClangRenderer(CStyleLanguage): (UPat(Ops.SQRT, name="alu"), no_vectorized_alu),]) + CStyleLanguage.extra_matcher if sys.platform == 'win32': - kernel_prefix = "__attribute__((ms_abi)) " + kernel_typedef = "__attribute__((ms_abi)) void" def render_vector_prefix(self, dt:DType) -> str: # round (down) to power of two (this is actually the default clang behavior) alignment = 2**int(math.log2(dt.itemsize)) if getenv("ALIGNED", 1) else 1 @@ -233,7 +234,7 @@ class OpenCLRenderer(CStyleLanguage): device = "GPU" # language options - kernel_prefix = "__kernel " + kernel_typedef = "__kernel void" buffer_prefix = "__global " smem_align = "__attribute__ ((aligned (16))) " smem_prefix = "__local " @@ -259,7 +260,7 @@ class OpenCLRenderer(CStyleLanguage): return super().render_kernel(function_name, kernel, bufs, uops, prefix) class IntelRenderer(OpenCLRenderer): - device, suffix, kernel_prefix = "GPU", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel " + device, suffix, kernel_typedef = "GPU", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel void" tensor_cores = [TensorCore(dims=(8,8,16), threads=8, elements_per_thread=(16,16,8), dtype_in=dtypes.half, dtype_out=dtypes.float, opts=("l0","l0","l0","u1","u1","u1"), swizzle=(((4,5,6),(0,1,2,3,7,8,9)), ((0,1,2),(7,8,9,3,4,5,6))))] @@ -285,7 +286,7 @@ class MetalRenderer(CStyleLanguage): def __init__(self): self.tensor_cores = MetalRenderer.tensor_cores if hasattr(os, 'uname') and os.uname().machine == "arm64" else [] # language options - kernel_prefix = "kernel " + kernel_typedef = "kernel void" buffer_prefix = "device " smem_prefix = "threadgroup __attribute__((aligned(16))) " arg_int_prefix = "constant int&" @@ -345,7 +346,8 @@ class CUDARenderer(CStyleLanguage): def __reduce__(self): return self.__class__, (self.arch,) # 
language options - kernel_prefix = "extern \"C\" __global__ " + # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html + kernel_typedef = "extern \"C\" __global__ void __launch_bounds__({launch_bounds})" smem_prefix = "__shared__ __align__(16) " smem_prefix_for_cast = False barrier = "__syncthreads();" @@ -395,11 +397,6 @@ class CUDARenderer(CStyleLanguage): return super().render_kernel(function_name, kernel, bufs, uops, prefix=prefix) - def get_kernel_modifier(self, uops:list[UOp]) -> str: - maxThreadsPerBlock = prod(u.arg[1] for u in uops if u.op is Ops.SPECIAL and u.arg[0][0] == "l") - # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html - return f"__launch_bounds__({maxThreadsPerBlock}) " - def cast_float_to_bf16(x: UOp) -> UOp: assert x.dtype == dtypes.float, "cast float -> bf16 must start with float" x = x.bitcast(dtypes.uint) @@ -425,7 +422,9 @@ class AMDRenderer(CStyleLanguage): @staticmethod def get_tensor_cores(arch): - return {"gfx942": AMDRenderer.tensor_cores_mfma, "gfx1201": AMDRenderer.tensor_cores_rdna4}.get(arch.split(":")[0], AMDRenderer.tensor_cores) + return {"gfx942": AMDRenderer.tensor_cores_mfma, + "gfx1200": AMDRenderer.tensor_cores_rdna4, + "gfx1201": AMDRenderer.tensor_cores_rdna4}.get(arch.split(":")[0], AMDRenderer.tensor_cores) def __init__(self, arch:str): # gfx942 => MI300, gfx1100 => RX 7900, gfx1201 => RX 9700 self.arch = arch self.tensor_cores = self.get_tensor_cores(arch) @@ -440,8 +439,10 @@ class AMDRenderer(CStyleLanguage): for dt, n in [(dtype.name, dtype.itemsize * 8) for dtype in [dtypes.float, dtypes.double, dtypes.half]] for name, atr in [("fmax", "const"), ("exp2", "pure"), ("log2", "pure"), ("sqrt", "const"), ("sin", "")]] - kernel_prefix = "\n".join(f'extern "C" __attribute__((device{f", {atr}" if atr else ""})) {dto} {meth}({dti});' for meth,dti,dto,atr in ockl+ocml) - kernel_prefix += '\nextern "C" __attribute__((global))' + kernel_typedef = "\n".join(f'extern "C" __attribute__((device{f", 
{atr}" if atr else ""})) {dto} {meth}({dti});' for meth,dti,dto,atr in ockl+ocml) + # https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size + # NOTE: this makes hlb_cifar10 twice as fast, there may be more gains in tweaking these parameters + kernel_typedef += '\nextern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, {launch_bounds})))' code_for_workitem = {"g": lambda x: f"__ockl_get_group_id({x})", "l": lambda x: f"__ockl_get_local_id({x})", "i": lambda x: f"(__ockl_get_group_id({x})*__ockl_get_local_size({x})+__ockl_get_local_id({x}))"} code_for_op = { **CStyleLanguage.code_for_op, @@ -500,12 +501,6 @@ class AMDRenderer(CStyleLanguage): for (int n = 0; n < 8; n++) { d[n] = c_frag[n*2]; } return d;\n}""") return super().render_kernel(function_name, kernel, bufs, uops, prefix) - def get_kernel_modifier(self, uops:list[UOp]) -> str: - requiredMaxThreadsPerBlock = prod(u.arg[1] for u in uops if u.op is Ops.SPECIAL and u.arg[0][0] == "l") - # https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size - # NOTE: this makes hlb_cifar10 twice as fast, there may be more gains in tweaking these parameters - return f"__attribute__((amdgpu_flat_work_group_size(1, {requiredMaxThreadsPerBlock})))" - class NVRenderer(CUDARenderer): device = "NV" class HIPRenderer(AMDRenderer): device = "HIP" class QCOMRenderer(OpenCLRenderer): device = "QCOM" diff --git a/tinygrad_repo/tinygrad/renderer/llvmir.py b/tinygrad_repo/tinygrad/renderer/llvmir.py index 69b6452..9be32fd 100644 --- a/tinygrad_repo/tinygrad/renderer/llvmir.py +++ b/tinygrad_repo/tinygrad/renderer/llvmir.py @@ -44,8 +44,11 @@ def render_wmma_amx(ctx, wmma: UOp) -> str: f' call void asm sideeffect "nop\\0Anop\\0Anop\\0A.word ({0x201000 + (17 << 5) + 1})", "~{{memory}}"() #0; AMX clr', # clr f' {ctx[wmma]} = load {ldt(wmma.dtype)}, ptr {ctx[wmma]}_amx2, align {wmma.dtype.itemsize}']) -def render_wmma_amd(ctx, wmma: UOp) -> str: +def 
render_wmma_amd(ctx, wmma: UOp, arch: str) -> str: dt_map = {dtypes.half: "f16", dtypes.float: "f32", dtypes.bfloat16: "bf16", dtypes.ushort: "bf16"} + # https://github.com/llvm/llvm-project/blob/main/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl + if arch.split(":")[0] == "gfx942": return f" {ctx[wmma]} = call {ldt(wmma.dtype)} @llvm.amdgcn.mfma.{dt_map[wmma.src[-1].dtype.scalar()]}" + \ + f".16x16x16{dt_map[wmma.src[0].dtype.scalar()]}(" + ", ".join([f"{ldt(w.dtype)} {ctx[w]}" for w in wmma.src]) + ", i32 0, i32 0, i32 0)" # https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll # example: %wmma0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %v99,<16 x half> %v100,<8 x float> %v101) return f" {ctx[wmma]} = call {ldt(wmma.dtype)} @llvm.amdgcn.wmma.{dt_map[wmma.src[-1].dtype.scalar()]}.16x16x16." + \ @@ -216,7 +219,6 @@ class AMDLLVMRenderer(LLVMRenderer): f"<8 x half> {ctx[y]}, <8 x half> zeroinitializer, <16 x i32> <{', '.join([f'i32 {i}, i32 {j}' for i, j in zip(range(0, 8), range(8, 16))])}>"), (UPat(Ops.CAST, name="x", dtype=dtypes.half.vec(8), src=UPat.var("y", dtypes.half.vec(16))), lambda ctx, x, y: f" {ctx[x]}= shufflevector <16 x half> {ctx[y]}, <16 x half> undef, <8 x i32> <{', '.join([f'i32 {x}' for x in range(0, 16, 2)])}>"), - (UPat(Ops.WMMA, name="wmma"), render_wmma_amd), ]) + base_rewrite extra_matcher = LLVMRenderer.extra_matcher def _render_footer(self, uops: list[UOp]) -> str: @@ -228,6 +230,7 @@ class AMDLLVMRenderer(LLVMRenderer): def __init__(self, arch:str): self.arch = arch self.tensor_cores = AMDRenderer.get_tensor_cores(arch) + self.string_rewrite += PatternMatcher([(UPat(Ops.WMMA, name="wmma"), lambda ctx, wmma, arch=arch: render_wmma_amd(ctx, wmma, arch))]) if self.arch.split(":")[0] == "gfx1100": self.extra_matcher += PatternMatcher([ (UPat(Ops.WMMA, name="x", dtype=dtypes.half.vec(8)), diff --git a/tinygrad_repo/tinygrad/renderer/wgsl.py 
b/tinygrad_repo/tinygrad/renderer/wgsl.py index 03337f8..3083e50 100644 --- a/tinygrad_repo/tinygrad/renderer/wgsl.py +++ b/tinygrad_repo/tinygrad/renderer/wgsl.py @@ -2,7 +2,6 @@ from tinygrad.dtype import DType, PtrDType, dtypes from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat from tinygrad.renderer.cstyle import CStyleLanguage, base_rewrite, extra_pm from tinygrad.helpers import strip_parens -import math def sign_extend(val:UOp, sext_am:int): return (UOp.where((val >> (sext_am - 1)) > 0, UOp.const(dtypes.uint32, 0xffffffff) << sext_am, UOp.const(dtypes.uint32, 0)) \ @@ -30,14 +29,11 @@ def is_packed(dt:DType) -> bool: return dt.itemsize < 4 and dt.base != dtypes.ha wgsl_matcher = PatternMatcher([ (UPat((Ops.CMPLT, Ops.XOR), src=(UPat(name="a", dtype=dtypes.bool), UPat.var("b")), name="c"), lambda a,b,c: a.cast(dtypes.int).alu(c.op, b.cast(dtypes.int)).cast(dtypes.bool)), - (UPat(Ops.LOAD, name="l", src=(UPat.var("b"),)), lambda l,b: packed_load(l, b, l.dtype) if is_packed(l.dtype) else None), - (UPat(Ops.LOAD, name="l", src=(UPat.var("b"), UPat.cvar("c"))), - lambda l,b,c: packed_load(l,b,l.dtype,c.cast(dtypes.uint32)) if is_packed(l.dtype) else None), + (UPat.load(UPat.var("b"), UPat.cvar("c"), name="l"),lambda l,b,c: packed_load(l,b,l.dtype,c.cast(dtypes.uint32)) if is_packed(l.dtype) else None), + (UPat.load(UPat.var("b"), name='l', allow_any_len=True), lambda l,b: packed_load(l, b, l.dtype) if is_packed(l.dtype) else None), (UPat.store(UPat.var("bidx"), UPat.var("var"), allow_any_len=True), lambda bidx,var: packed_store(bidx,var) if is_packed(var.dtype) else None), - # TODO: why is this needed, and only for this MUL order - (UPat(Ops.MUL, src=(UPat.var("a"), UPat.var("g").where(UPat.cvar("c1"), UPat.cvar("c2")))), - lambda a,g,c1,c2: g.where(c1, a) if math.isnan(c1.arg) and c2.arg == 1.0 else None), - (UPat.var("a") << UPat.var("b"),lambda a,b:(a.bitcast(dtypes.uint32)<> UPat.var("y"), lambda x,y: UOp(Ops.SHR, x.dtype, (x,y.cast(dtypes.uint))) if 
y.dtype != dtypes.uint else None), ]) + extra_pm class WGSLRenderer(CStyleLanguage): @@ -57,7 +53,8 @@ class WGSLRenderer(CStyleLanguage): (UPat.cvar("x", dtype=dtypes.bool), lambda x: "true" if x.arg else "false"), (UPat(Ops.CONST, dtype=(dtypes.uchar, dtypes.ushort, dtypes.uint32), name="x"), lambda x: f"bitcast({x.arg})" if x.arg < 0 else f"{x.arg&0xFFFFFFFF}u"), - (UPat(Ops.DEFINE_LOCAL, name="x"), lambda ctx,x: f"var {ctx[x]}: array<{ctx.buf_map(x.dtype.base)}, {x.dtype.size}>;"), + (UPat(Ops.DEFINE_LOCAL, name="x"), lambda ctx,x: + f"var {ctx[x]}: array<{ctx.buf_map(x.dtype.base)},{x.dtype.size//(4//x.dtype.itemsize) if is_packed(x.dtype) else x.dtype.size}>;"), (UPat(Ops.BITCAST, dtype=dtypes.half, name="x", src=(UPat(dtype=(dtypes.short, dtypes.ushort, dtypes.uint32),),)), lambda ctx,x: f"bitcast>({ctx[x.src[0]]})[0]"), (UPat(Ops.BITCAST, dtype=(dtypes.char, dtypes.uchar), name="x"), lambda ctx,x: f"bitcast<{ctx.type_map[x.dtype]}>({ctx[x.src[0]]}&0xFF)"), diff --git a/tinygrad_repo/tinygrad/runtime/autogen/am/smu_v14_0_3.py b/tinygrad_repo/tinygrad/runtime/autogen/am/smu_v14_0_2.py similarity index 100% rename from tinygrad_repo/tinygrad/runtime/autogen/am/smu_v14_0_3.py rename to tinygrad_repo/tinygrad/runtime/autogen/am/smu_v14_0_2.py diff --git a/tinygrad_repo/tinygrad/runtime/autogen/comgr.py b/tinygrad_repo/tinygrad/runtime/autogen/comgr.py index d4e05ec..3159606 100644 --- a/tinygrad_repo/tinygrad/runtime/autogen/comgr.py +++ b/tinygrad_repo/tinygrad/runtime/autogen/comgr.py @@ -226,7 +226,8 @@ amd_comgr_data_kind_s__enumvalues = { 17: 'AMD_COMGR_DATA_KIND_AR', 18: 'AMD_COMGR_DATA_KIND_BC_BUNDLE', 19: 'AMD_COMGR_DATA_KIND_AR_BUNDLE', - 19: 'AMD_COMGR_DATA_KIND_LAST', + 20: 'AMD_COMGR_DATA_KIND_OBJ_BUNDLE', + 20: 'AMD_COMGR_DATA_KIND_LAST', } AMD_COMGR_DATA_KIND_UNDEF = 0 AMD_COMGR_DATA_KIND_SOURCE = 1 @@ -242,7 +243,8 @@ AMD_COMGR_DATA_KIND_FATBIN = 16 AMD_COMGR_DATA_KIND_AR = 17 AMD_COMGR_DATA_KIND_BC_BUNDLE = 18 AMD_COMGR_DATA_KIND_AR_BUNDLE = 19 
-AMD_COMGR_DATA_KIND_LAST = 19 +AMD_COMGR_DATA_KIND_OBJ_BUNDLE = 20 +AMD_COMGR_DATA_KIND_LAST = 20 amd_comgr_data_kind_s = ctypes.c_uint32 # enum amd_comgr_data_kind_t = amd_comgr_data_kind_s amd_comgr_data_kind_t__enumvalues = amd_comgr_data_kind_s__enumvalues @@ -515,6 +517,24 @@ try: amd_comgr_action_info_get_option_list_item.argtypes = [amd_comgr_action_info_t, size_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_char)] except AttributeError: pass +try: + amd_comgr_action_info_set_bundle_entry_ids = _libraries['libamd_comgr.so'].amd_comgr_action_info_set_bundle_entry_ids + amd_comgr_action_info_set_bundle_entry_ids.restype = amd_comgr_status_t + amd_comgr_action_info_set_bundle_entry_ids.argtypes = [amd_comgr_action_info_t, ctypes.POINTER(ctypes.c_char) * 0, size_t] +except AttributeError: + pass +try: + amd_comgr_action_info_get_bundle_entry_id_count = _libraries['libamd_comgr.so'].amd_comgr_action_info_get_bundle_entry_id_count + amd_comgr_action_info_get_bundle_entry_id_count.restype = amd_comgr_status_t + amd_comgr_action_info_get_bundle_entry_id_count.argtypes = [amd_comgr_action_info_t, ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + amd_comgr_action_info_get_bundle_entry_id = _libraries['libamd_comgr.so'].amd_comgr_action_info_get_bundle_entry_id + amd_comgr_action_info_get_bundle_entry_id.restype = amd_comgr_status_t + amd_comgr_action_info_get_bundle_entry_id.argtypes = [amd_comgr_action_info_t, size_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass try: amd_comgr_action_info_set_working_directory_path = _libraries['libamd_comgr.so'].amd_comgr_action_info_set_working_directory_path amd_comgr_action_info_set_working_directory_path.restype = amd_comgr_status_t @@ -560,7 +580,8 @@ amd_comgr_action_kind_s__enumvalues = { 15: 'AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC', 16: 'AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE', 17: 
'AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE', - 17: 'AMD_COMGR_ACTION_LAST', + 18: 'AMD_COMGR_ACTION_UNBUNDLE', + 18: 'AMD_COMGR_ACTION_LAST', } AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR = 0 AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS = 1 @@ -580,7 +601,8 @@ AMD_COMGR_ACTION_COMPILE_SOURCE_TO_FATBIN = 14 AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC = 15 AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE = 16 AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE = 17 -AMD_COMGR_ACTION_LAST = 17 +AMD_COMGR_ACTION_UNBUNDLE = 18 +AMD_COMGR_ACTION_LAST = 18 amd_comgr_action_kind_s = ctypes.c_uint32 # enum amd_comgr_action_kind_t = amd_comgr_action_kind_s amd_comgr_action_kind_t__enumvalues = amd_comgr_action_kind_s__enumvalues @@ -801,12 +823,13 @@ __all__ = \ 'AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE', 'AMD_COMGR_ACTION_OPTIMIZE_BC_TO_BC', 'AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR', - 'AMD_COMGR_DATA_KIND_AR', 'AMD_COMGR_DATA_KIND_AR_BUNDLE', - 'AMD_COMGR_DATA_KIND_BC', 'AMD_COMGR_DATA_KIND_BC_BUNDLE', - 'AMD_COMGR_DATA_KIND_BYTES', 'AMD_COMGR_DATA_KIND_DIAGNOSTIC', + 'AMD_COMGR_ACTION_UNBUNDLE', 'AMD_COMGR_DATA_KIND_AR', + 'AMD_COMGR_DATA_KIND_AR_BUNDLE', 'AMD_COMGR_DATA_KIND_BC', + 'AMD_COMGR_DATA_KIND_BC_BUNDLE', 'AMD_COMGR_DATA_KIND_BYTES', + 'AMD_COMGR_DATA_KIND_DIAGNOSTIC', 'AMD_COMGR_DATA_KIND_EXECUTABLE', 'AMD_COMGR_DATA_KIND_FATBIN', 'AMD_COMGR_DATA_KIND_INCLUDE', 'AMD_COMGR_DATA_KIND_LAST', - 'AMD_COMGR_DATA_KIND_LOG', + 'AMD_COMGR_DATA_KIND_LOG', 'AMD_COMGR_DATA_KIND_OBJ_BUNDLE', 'AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER', 'AMD_COMGR_DATA_KIND_RELOCATABLE', 'AMD_COMGR_DATA_KIND_SOURCE', 'AMD_COMGR_DATA_KIND_UNDEF', 'AMD_COMGR_LANGUAGE_HC', @@ -828,6 +851,8 @@ __all__ = \ 'AMD_COMGR_SYMBOL_TYPE_OBJECT', 'AMD_COMGR_SYMBOL_TYPE_SECTION', 'AMD_COMGR_SYMBOL_TYPE_UNKNOWN', 'amd_comgr_action_data_count', 'amd_comgr_action_data_get_data', + 'amd_comgr_action_info_get_bundle_entry_id', + 'amd_comgr_action_info_get_bundle_entry_id_count', 
'amd_comgr_action_info_get_isa_name', 'amd_comgr_action_info_get_language', 'amd_comgr_action_info_get_logging', @@ -835,6 +860,7 @@ __all__ = \ 'amd_comgr_action_info_get_option_list_item', 'amd_comgr_action_info_get_options', 'amd_comgr_action_info_get_working_directory_path', + 'amd_comgr_action_info_set_bundle_entry_ids', 'amd_comgr_action_info_set_isa_name', 'amd_comgr_action_info_set_language', 'amd_comgr_action_info_set_logging', diff --git a/tinygrad_repo/tinygrad/runtime/autogen/cuda.py b/tinygrad_repo/tinygrad/runtime/autogen/cuda.py index a30c8f5..55c101a 100644 --- a/tinygrad_repo/tinygrad/runtime/autogen/cuda.py +++ b/tinygrad_repo/tinygrad/runtime/autogen/cuda.py @@ -1,7 +1,7 @@ # mypy: ignore-errors # -*- coding: utf-8 -*- # -# TARGET arch is: [] +# TARGET arch is: ['-D__CUDA_API_VERSION_INTERNAL'] # WORD_SIZE is: 8 # POINTER_SIZE is: 8 # LONGDOUBLE_SIZE is: 16 @@ -166,6 +166,14 @@ class struct_CUfunc_st(Structure): pass CUfunction = ctypes.POINTER(struct_CUfunc_st) +class struct_CUlib_st(Structure): + pass + +CUlibrary = ctypes.POINTER(struct_CUlib_st) +class struct_CUkern_st(Structure): + pass + +CUkernel = ctypes.POINTER(struct_CUkern_st) class struct_CUarray_st(Structure): pass @@ -303,6 +311,51 @@ CUctx_flags_enum = ctypes.c_uint32 # enum CUctx_flags = CUctx_flags_enum CUctx_flags__enumvalues = CUctx_flags_enum__enumvalues +# values for enumeration 'CUevent_sched_flags_enum' +CUevent_sched_flags_enum__enumvalues = { + 0: 'CU_EVENT_SCHED_AUTO', + 1: 'CU_EVENT_SCHED_SPIN', + 2: 'CU_EVENT_SCHED_YIELD', + 4: 'CU_EVENT_SCHED_BLOCKING_SYNC', +} +CU_EVENT_SCHED_AUTO = 0 +CU_EVENT_SCHED_SPIN = 1 +CU_EVENT_SCHED_YIELD = 2 +CU_EVENT_SCHED_BLOCKING_SYNC = 4 +CUevent_sched_flags_enum = ctypes.c_uint32 # enum +CUevent_sched_flags = CUevent_sched_flags_enum +CUevent_sched_flags__enumvalues = CUevent_sched_flags_enum__enumvalues + +# values for enumeration 'cl_event_flags_enum' +cl_event_flags_enum__enumvalues = { + 0: 'NVCL_EVENT_SCHED_AUTO', + 1: 
'NVCL_EVENT_SCHED_SPIN', + 2: 'NVCL_EVENT_SCHED_YIELD', + 4: 'NVCL_EVENT_SCHED_BLOCKING_SYNC', +} +NVCL_EVENT_SCHED_AUTO = 0 +NVCL_EVENT_SCHED_SPIN = 1 +NVCL_EVENT_SCHED_YIELD = 2 +NVCL_EVENT_SCHED_BLOCKING_SYNC = 4 +cl_event_flags_enum = ctypes.c_uint32 # enum +cl_event_flags = cl_event_flags_enum +cl_event_flags__enumvalues = cl_event_flags_enum__enumvalues + +# values for enumeration 'cl_context_flags_enum' +cl_context_flags_enum__enumvalues = { + 0: 'NVCL_CTX_SCHED_AUTO', + 1: 'NVCL_CTX_SCHED_SPIN', + 2: 'NVCL_CTX_SCHED_YIELD', + 4: 'NVCL_CTX_SCHED_BLOCKING_SYNC', +} +NVCL_CTX_SCHED_AUTO = 0 +NVCL_CTX_SCHED_SPIN = 1 +NVCL_CTX_SCHED_YIELD = 2 +NVCL_CTX_SCHED_BLOCKING_SYNC = 4 +cl_context_flags_enum = ctypes.c_uint32 # enum +cl_context_flags = cl_context_flags_enum +cl_context_flags__enumvalues = cl_context_flags_enum__enumvalues + # values for enumeration 'CUstream_flags_enum' CUstream_flags_enum__enumvalues = { 0: 'CU_STREAM_DEFAULT', @@ -385,16 +438,29 @@ CUstreamBatchMemOpType_enum__enumvalues = { 2: 'CU_STREAM_MEM_OP_WRITE_VALUE_32', 4: 'CU_STREAM_MEM_OP_WAIT_VALUE_64', 5: 'CU_STREAM_MEM_OP_WRITE_VALUE_64', + 6: 'CU_STREAM_MEM_OP_BARRIER', 3: 'CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES', } CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1 CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2 CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4 CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5 +CU_STREAM_MEM_OP_BARRIER = 6 CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 CUstreamBatchMemOpType_enum = ctypes.c_uint32 # enum CUstreamBatchMemOpType = CUstreamBatchMemOpType_enum CUstreamBatchMemOpType__enumvalues = CUstreamBatchMemOpType_enum__enumvalues + +# values for enumeration 'CUstreamMemoryBarrier_flags_enum' +CUstreamMemoryBarrier_flags_enum__enumvalues = { + 0: 'CU_STREAM_MEMORY_BARRIER_TYPE_SYS', + 1: 'CU_STREAM_MEMORY_BARRIER_TYPE_GPU', +} +CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0 +CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 1 +CUstreamMemoryBarrier_flags_enum = ctypes.c_uint32 # enum +CUstreamMemoryBarrier_flags = 
CUstreamMemoryBarrier_flags_enum +CUstreamMemoryBarrier_flags__enumvalues = CUstreamMemoryBarrier_flags_enum__enumvalues class union_CUstreamBatchMemOpParams_union(Union): pass @@ -455,17 +521,41 @@ struct_CUstreamMemOpFlushRemoteWritesParams_st._fields_ = [ ('flags', ctypes.c_uint32), ] +class struct_CUstreamMemOpMemoryBarrierParams_st(Structure): + pass + +struct_CUstreamMemOpMemoryBarrierParams_st._pack_ = 1 # source:False +struct_CUstreamMemOpMemoryBarrierParams_st._fields_ = [ + ('operation', CUstreamBatchMemOpType), + ('flags', ctypes.c_uint32), +] + union_CUstreamBatchMemOpParams_union._pack_ = 1 # source:False union_CUstreamBatchMemOpParams_union._fields_ = [ ('operation', CUstreamBatchMemOpType), ('waitValue', struct_CUstreamMemOpWaitValueParams_st), ('writeValue', struct_CUstreamMemOpWriteValueParams_st), ('flushRemoteWrites', struct_CUstreamMemOpFlushRemoteWritesParams_st), + ('memoryBarrier', struct_CUstreamMemOpMemoryBarrierParams_st), ('pad', ctypes.c_uint64 * 6), ] CUstreamBatchMemOpParams_v1 = union_CUstreamBatchMemOpParams_union CUstreamBatchMemOpParams = union_CUstreamBatchMemOpParams_union +class struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st(Structure): + pass + +struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st._pack_ = 1 # source:False +struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st._fields_ = [ + ('ctx', ctypes.POINTER(struct_CUctx_st)), + ('count', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('paramArray', ctypes.POINTER(union_CUstreamBatchMemOpParams_union)), + ('flags', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), +] + +CUDA_BATCH_MEM_OP_NODE_PARAMS = struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st # values for enumeration 'CUoccupancy_flags_enum' CUoccupancy_flags_enum__enumvalues = { @@ -690,9 +780,9 @@ CUdevice_attribute_enum__enumvalues = { 89: 'CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS', 90: 'CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED', 91: 'CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM', - 92: 
'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS', - 93: 'CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS', - 94: 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR', + 92: 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1', + 93: 'CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1', + 94: 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1', 95: 'CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH', 96: 'CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH', 97: 'CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN', @@ -719,7 +809,16 @@ CUdevice_attribute_enum__enumvalues = { 117: 'CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS', 118: 'CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING', 119: 'CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES', - 120: 'CU_DEVICE_ATTRIBUTE_MAX', + 120: 'CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH', + 121: 'CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED', + 122: 'CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS', + 123: 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR', + 124: 'CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED', + 125: 'CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED', + 126: 'CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT', + 127: 'CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED', + 129: 'CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS', + 130: 'CU_DEVICE_ATTRIBUTE_MAX', } CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2 @@ -817,9 +916,9 @@ CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88 CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89 CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91 -CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92 -CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93 -CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94 +CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1 = 92 +CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 = 93 +CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 = 94 
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95 CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96 CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97 @@ -846,7 +945,16 @@ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118 CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119 -CU_DEVICE_ATTRIBUTE_MAX = 120 +CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120 +CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121 +CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 122 +CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 123 +CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124 +CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED = 125 +CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT = 126 +CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127 +CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129 +CU_DEVICE_ATTRIBUTE_MAX = 130 CUdevice_attribute_enum = ctypes.c_uint32 # enum CUdevice_attribute = CUdevice_attribute_enum CUdevice_attribute__enumvalues = CUdevice_attribute_enum__enumvalues @@ -889,6 +997,9 @@ CUpointer_attribute_enum__enumvalues = { 15: 'CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE', 16: 'CU_POINTER_ATTRIBUTE_ACCESS_FLAGS', 17: 'CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE', + 18: 'CU_POINTER_ATTRIBUTE_MAPPING_SIZE', + 19: 'CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR', + 20: 'CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID', } CU_POINTER_ATTRIBUTE_CONTEXT = 1 CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2 @@ -907,6 +1018,9 @@ CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14 CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15 CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16 CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 +CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18 +CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19 +CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 CUpointer_attribute_enum = ctypes.c_uint32 # enum CUpointer_attribute = CUpointer_attribute_enum CUpointer_attribute__enumvalues = 
CUpointer_attribute_enum__enumvalues @@ -923,7 +1037,13 @@ CUfunction_attribute_enum__enumvalues = { 7: 'CU_FUNC_ATTRIBUTE_CACHE_MODE_CA', 8: 'CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES', 9: 'CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT', - 10: 'CU_FUNC_ATTRIBUTE_MAX', + 10: 'CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET', + 11: 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH', + 12: 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT', + 13: 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH', + 14: 'CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED', + 15: 'CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE', + 16: 'CU_FUNC_ATTRIBUTE_MAX', } CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0 CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1 @@ -935,7 +1055,13 @@ CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6 CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7 CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8 CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9 -CU_FUNC_ATTRIBUTE_MAX = 10 +CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10 +CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11 +CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12 +CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13 +CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14 +CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15 +CU_FUNC_ATTRIBUTE_MAX = 16 CUfunction_attribute_enum = ctypes.c_uint32 # enum CUfunction_attribute = CUfunction_attribute_enum CUfunction_attribute__enumvalues = CUfunction_attribute_enum__enumvalues @@ -1070,7 +1196,13 @@ CUjit_option_enum__enumvalues = { 22: 'CU_JIT_PREC_DIV', 23: 'CU_JIT_PREC_SQRT', 24: 'CU_JIT_FMA', - 25: 'CU_JIT_NUM_OPTIONS', + 25: 'CU_JIT_REFERENCED_KERNEL_NAMES', + 26: 'CU_JIT_REFERENCED_KERNEL_COUNT', + 27: 'CU_JIT_REFERENCED_VARIABLE_NAMES', + 28: 'CU_JIT_REFERENCED_VARIABLE_COUNT', + 29: 'CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES', + 30: 'CU_JIT_POSITION_INDEPENDENT_CODE', + 31: 'CU_JIT_NUM_OPTIONS', } CU_JIT_MAX_REGISTERS = 0 CU_JIT_THREADS_PER_BLOCK = 1 @@ -1097,15 +1229,19 @@ CU_JIT_FTZ = 21 
CU_JIT_PREC_DIV = 22 CU_JIT_PREC_SQRT = 23 CU_JIT_FMA = 24 -CU_JIT_NUM_OPTIONS = 25 +CU_JIT_REFERENCED_KERNEL_NAMES = 25 +CU_JIT_REFERENCED_KERNEL_COUNT = 26 +CU_JIT_REFERENCED_VARIABLE_NAMES = 27 +CU_JIT_REFERENCED_VARIABLE_COUNT = 28 +CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29 +CU_JIT_POSITION_INDEPENDENT_CODE = 30 +CU_JIT_NUM_OPTIONS = 31 CUjit_option_enum = ctypes.c_uint32 # enum CUjit_option = CUjit_option_enum CUjit_option__enumvalues = CUjit_option_enum__enumvalues # values for enumeration 'CUjit_target_enum' CUjit_target_enum__enumvalues = { - 20: 'CU_TARGET_COMPUTE_20', - 21: 'CU_TARGET_COMPUTE_21', 30: 'CU_TARGET_COMPUTE_30', 32: 'CU_TARGET_COMPUTE_32', 35: 'CU_TARGET_COMPUTE_35', @@ -1121,9 +1257,11 @@ CUjit_target_enum__enumvalues = { 75: 'CU_TARGET_COMPUTE_75', 80: 'CU_TARGET_COMPUTE_80', 86: 'CU_TARGET_COMPUTE_86', + 87: 'CU_TARGET_COMPUTE_87', + 89: 'CU_TARGET_COMPUTE_89', + 90: 'CU_TARGET_COMPUTE_90', + 65626: 'CU_TARGET_COMPUTE_90A', } -CU_TARGET_COMPUTE_20 = 20 -CU_TARGET_COMPUTE_21 = 21 CU_TARGET_COMPUTE_30 = 30 CU_TARGET_COMPUTE_32 = 32 CU_TARGET_COMPUTE_35 = 35 @@ -1139,6 +1277,10 @@ CU_TARGET_COMPUTE_72 = 72 CU_TARGET_COMPUTE_75 = 75 CU_TARGET_COMPUTE_80 = 80 CU_TARGET_COMPUTE_86 = 86 +CU_TARGET_COMPUTE_87 = 87 +CU_TARGET_COMPUTE_89 = 89 +CU_TARGET_COMPUTE_90 = 90 +CU_TARGET_COMPUTE_90A = 65626 CUjit_target_enum = ctypes.c_uint32 # enum CUjit_target = CUjit_target_enum CUjit_target__enumvalues = CUjit_target_enum__enumvalues @@ -1326,7 +1468,28 @@ struct_CUDA_KERNEL_NODE_PARAMS_st._fields_ = [ ] CUDA_KERNEL_NODE_PARAMS_v1 = struct_CUDA_KERNEL_NODE_PARAMS_st -CUDA_KERNEL_NODE_PARAMS = struct_CUDA_KERNEL_NODE_PARAMS_st +class struct_CUDA_KERNEL_NODE_PARAMS_v2_st(Structure): + pass + +struct_CUDA_KERNEL_NODE_PARAMS_v2_st._pack_ = 1 # source:False +struct_CUDA_KERNEL_NODE_PARAMS_v2_st._fields_ = [ + ('func', ctypes.POINTER(struct_CUfunc_st)), + ('gridDimX', ctypes.c_uint32), + ('gridDimY', ctypes.c_uint32), + ('gridDimZ', ctypes.c_uint32), + 
('blockDimX', ctypes.c_uint32), + ('blockDimY', ctypes.c_uint32), + ('blockDimZ', ctypes.c_uint32), + ('sharedMemBytes', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('kernelParams', ctypes.POINTER(ctypes.POINTER(None))), + ('extra', ctypes.POINTER(ctypes.POINTER(None))), + ('kern', ctypes.POINTER(struct_CUkern_st)), + ('ctx', ctypes.POINTER(struct_CUctx_st)), +] + +CUDA_KERNEL_NODE_PARAMS_v2 = struct_CUDA_KERNEL_NODE_PARAMS_v2_st +CUDA_KERNEL_NODE_PARAMS = struct_CUDA_KERNEL_NODE_PARAMS_v2_st class struct_CUDA_MEMSET_NODE_PARAMS_st(Structure): pass @@ -1368,6 +1531,7 @@ CUgraphNodeType_enum__enumvalues = { 9: 'CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT', 10: 'CU_GRAPH_NODE_TYPE_MEM_ALLOC', 11: 'CU_GRAPH_NODE_TYPE_MEM_FREE', + 12: 'CU_GRAPH_NODE_TYPE_BATCH_MEM_OP', } CU_GRAPH_NODE_TYPE_KERNEL = 0 CU_GRAPH_NODE_TYPE_MEMCPY = 1 @@ -1381,10 +1545,41 @@ CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8 CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9 CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10 CU_GRAPH_NODE_TYPE_MEM_FREE = 11 +CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 CUgraphNodeType_enum = ctypes.c_uint32 # enum CUgraphNodeType = CUgraphNodeType_enum CUgraphNodeType__enumvalues = CUgraphNodeType_enum__enumvalues +# values for enumeration 'CUgraphInstantiateResult_enum' +CUgraphInstantiateResult_enum__enumvalues = { + 0: 'CUDA_GRAPH_INSTANTIATE_SUCCESS', + 1: 'CUDA_GRAPH_INSTANTIATE_ERROR', + 2: 'CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE', + 3: 'CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED', + 4: 'CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED', +} +CUDA_GRAPH_INSTANTIATE_SUCCESS = 0 +CUDA_GRAPH_INSTANTIATE_ERROR = 1 +CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2 +CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3 +CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4 +CUgraphInstantiateResult_enum = ctypes.c_uint32 # enum +CUgraphInstantiateResult = CUgraphInstantiateResult_enum +CUgraphInstantiateResult__enumvalues = CUgraphInstantiateResult_enum__enumvalues +class 
struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st(Structure): + pass + +struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st._pack_ = 1 # source:False +struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st._fields_ = [ + ('flags', ctypes.c_uint64), + ('hUploadStream', ctypes.POINTER(struct_CUstream_st)), + ('hErrNode_out', ctypes.POINTER(struct_CUgraphNode_st)), + ('result_out', CUgraphInstantiateResult), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +CUDA_GRAPH_INSTANTIATE_PARAMS = struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st + # values for enumeration 'CUsynchronizationPolicy_enum' CUsynchronizationPolicy_enum__enumvalues = { 1: 'CU_SYNC_POLICY_AUTO', @@ -1400,28 +1595,143 @@ CUsynchronizationPolicy_enum = ctypes.c_uint32 # enum CUsynchronizationPolicy = CUsynchronizationPolicy_enum CUsynchronizationPolicy__enumvalues = CUsynchronizationPolicy_enum__enumvalues -# values for enumeration 'CUkernelNodeAttrID_enum' -CUkernelNodeAttrID_enum__enumvalues = { - 1: 'CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW', - 2: 'CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE', +# values for enumeration 'CUclusterSchedulingPolicy_enum' +CUclusterSchedulingPolicy_enum__enumvalues = { + 0: 'CU_CLUSTER_SCHEDULING_POLICY_DEFAULT', + 1: 'CU_CLUSTER_SCHEDULING_POLICY_SPREAD', + 2: 'CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING', } -CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1 -CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 -CUkernelNodeAttrID_enum = ctypes.c_uint32 # enum -CUkernelNodeAttrID = CUkernelNodeAttrID_enum -CUkernelNodeAttrID__enumvalues = CUkernelNodeAttrID_enum__enumvalues -class union_CUkernelNodeAttrValue_union(Union): +CU_CLUSTER_SCHEDULING_POLICY_DEFAULT = 0 +CU_CLUSTER_SCHEDULING_POLICY_SPREAD = 1 +CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2 +CUclusterSchedulingPolicy_enum = ctypes.c_uint32 # enum +CUclusterSchedulingPolicy = CUclusterSchedulingPolicy_enum +CUclusterSchedulingPolicy__enumvalues = CUclusterSchedulingPolicy_enum__enumvalues + +# values for enumeration 'CUlaunchMemSyncDomain_enum' 
+CUlaunchMemSyncDomain_enum__enumvalues = { + 0: 'CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT', + 1: 'CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE', +} +CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT = 0 +CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE = 1 +CUlaunchMemSyncDomain_enum = ctypes.c_uint32 # enum +CUlaunchMemSyncDomain = CUlaunchMemSyncDomain_enum +CUlaunchMemSyncDomain__enumvalues = CUlaunchMemSyncDomain_enum__enumvalues +class struct_CUlaunchMemSyncDomainMap_st(Structure): pass -union_CUkernelNodeAttrValue_union._pack_ = 1 # source:False -union_CUkernelNodeAttrValue_union._fields_ = [ - ('accessPolicyWindow', CUaccessPolicyWindow), - ('cooperative', ctypes.c_int32), - ('PADDING_0', ctypes.c_ubyte * 28), +struct_CUlaunchMemSyncDomainMap_st._pack_ = 1 # source:False +struct_CUlaunchMemSyncDomainMap_st._fields_ = [ + ('default_', ctypes.c_ubyte), + ('remote', ctypes.c_ubyte), ] -CUkernelNodeAttrValue_v1 = union_CUkernelNodeAttrValue_union -CUkernelNodeAttrValue = union_CUkernelNodeAttrValue_union +CUlaunchMemSyncDomainMap = struct_CUlaunchMemSyncDomainMap_st + +# values for enumeration 'CUlaunchAttributeID_enum' +CUlaunchAttributeID_enum__enumvalues = { + 0: 'CU_LAUNCH_ATTRIBUTE_IGNORE', + 1: 'CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW', + 2: 'CU_LAUNCH_ATTRIBUTE_COOPERATIVE', + 3: 'CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY', + 4: 'CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION', + 5: 'CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE', + 6: 'CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION', + 7: 'CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT', + 8: 'CU_LAUNCH_ATTRIBUTE_PRIORITY', + 9: 'CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP', + 10: 'CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN', +} +CU_LAUNCH_ATTRIBUTE_IGNORE = 0 +CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1 +CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2 +CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 +CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4 +CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 +CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6 
+CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7 +CU_LAUNCH_ATTRIBUTE_PRIORITY = 8 +CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9 +CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10 +CUlaunchAttributeID_enum = ctypes.c_uint32 # enum +CUlaunchAttributeID = CUlaunchAttributeID_enum +CUlaunchAttributeID__enumvalues = CUlaunchAttributeID_enum__enumvalues +class union_CUlaunchAttributeValue_union(Union): + pass + +class struct_CUlaunchAttributeValue_union_clusterDim(Structure): + pass + +struct_CUlaunchAttributeValue_union_clusterDim._pack_ = 1 # source:False +struct_CUlaunchAttributeValue_union_clusterDim._fields_ = [ + ('x', ctypes.c_uint32), + ('y', ctypes.c_uint32), + ('z', ctypes.c_uint32), +] + +class struct_CUlaunchAttributeValue_union_programmaticEvent(Structure): + pass + +struct_CUlaunchAttributeValue_union_programmaticEvent._pack_ = 1 # source:False +struct_CUlaunchAttributeValue_union_programmaticEvent._fields_ = [ + ('event', ctypes.POINTER(struct_CUevent_st)), + ('flags', ctypes.c_int32), + ('triggerAtBlockStart', ctypes.c_int32), +] + +union_CUlaunchAttributeValue_union._pack_ = 1 # source:False +union_CUlaunchAttributeValue_union._fields_ = [ + ('pad', ctypes.c_char * 64), + ('accessPolicyWindow', CUaccessPolicyWindow), + ('cooperative', ctypes.c_int32), + ('syncPolicy', CUsynchronizationPolicy), + ('clusterDim', struct_CUlaunchAttributeValue_union_clusterDim), + ('clusterSchedulingPolicyPreference', CUclusterSchedulingPolicy), + ('programmaticStreamSerializationAllowed', ctypes.c_int32), + ('programmaticEvent', struct_CUlaunchAttributeValue_union_programmaticEvent), + ('priority', ctypes.c_int32), + ('memSyncDomainMap', CUlaunchMemSyncDomainMap), + ('memSyncDomain', CUlaunchMemSyncDomain), + ('PADDING_0', ctypes.c_ubyte * 60), +] + +CUlaunchAttributeValue = union_CUlaunchAttributeValue_union +class struct_CUlaunchAttribute_st(Structure): + pass + +struct_CUlaunchAttribute_st._pack_ = 1 # source:False +struct_CUlaunchAttribute_st._fields_ = [ + ('id', 
CUlaunchAttributeID), + ('pad', ctypes.c_char * 4), + ('value', CUlaunchAttributeValue), +] + +CUlaunchAttribute = struct_CUlaunchAttribute_st +class struct_CUlaunchConfig_st(Structure): + pass + +struct_CUlaunchConfig_st._pack_ = 1 # source:False +struct_CUlaunchConfig_st._fields_ = [ + ('gridDimX', ctypes.c_uint32), + ('gridDimY', ctypes.c_uint32), + ('gridDimZ', ctypes.c_uint32), + ('blockDimX', ctypes.c_uint32), + ('blockDimY', ctypes.c_uint32), + ('blockDimZ', ctypes.c_uint32), + ('sharedMemBytes', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('hStream', ctypes.POINTER(struct_CUstream_st)), + ('attrs', ctypes.POINTER(struct_CUlaunchAttribute_st)), + ('numAttrs', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), +] + +CUlaunchConfig = struct_CUlaunchConfig_st +CUkernelNodeAttrID = CUlaunchAttributeID_enum +CUkernelNodeAttrID__enumvalues = CUlaunchAttributeID_enum__enumvalues +CUkernelNodeAttrValue_v1 = union_CUlaunchAttributeValue_union +CUkernelNodeAttrValue = union_CUlaunchAttributeValue_union # values for enumeration 'CUstreamCaptureStatus_enum' CUstreamCaptureStatus_enum__enumvalues = { @@ -1448,29 +1758,10 @@ CU_STREAM_CAPTURE_MODE_RELAXED = 2 CUstreamCaptureMode_enum = ctypes.c_uint32 # enum CUstreamCaptureMode = CUstreamCaptureMode_enum CUstreamCaptureMode__enumvalues = CUstreamCaptureMode_enum__enumvalues - -# values for enumeration 'CUstreamAttrID_enum' -CUstreamAttrID_enum__enumvalues = { - 1: 'CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW', - 3: 'CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY', -} -CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1 -CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 -CUstreamAttrID_enum = ctypes.c_uint32 # enum -CUstreamAttrID = CUstreamAttrID_enum -CUstreamAttrID__enumvalues = CUstreamAttrID_enum__enumvalues -class union_CUstreamAttrValue_union(Union): - pass - -union_CUstreamAttrValue_union._pack_ = 1 # source:False -union_CUstreamAttrValue_union._fields_ = [ - ('accessPolicyWindow', CUaccessPolicyWindow), - 
('syncPolicy', CUsynchronizationPolicy), - ('PADDING_0', ctypes.c_ubyte * 28), -] - -CUstreamAttrValue_v1 = union_CUstreamAttrValue_union -CUstreamAttrValue = union_CUstreamAttrValue_union +CUstreamAttrID = CUlaunchAttributeID_enum +CUstreamAttrID__enumvalues = CUlaunchAttributeID_enum__enumvalues +CUstreamAttrValue_v1 = union_CUlaunchAttributeValue_union +CUstreamAttrValue = union_CUlaunchAttributeValue_union # values for enumeration 'CUdriverProcAddress_flags_enum' CUdriverProcAddress_flags_enum__enumvalues = { @@ -1485,6 +1776,19 @@ CUdriverProcAddress_flags_enum = ctypes.c_uint32 # enum CUdriverProcAddress_flags = CUdriverProcAddress_flags_enum CUdriverProcAddress_flags__enumvalues = CUdriverProcAddress_flags_enum__enumvalues +# values for enumeration 'CUdriverProcAddressQueryResult_enum' +CUdriverProcAddressQueryResult_enum__enumvalues = { + 0: 'CU_GET_PROC_ADDRESS_SUCCESS', + 1: 'CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND', + 2: 'CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT', +} +CU_GET_PROC_ADDRESS_SUCCESS = 0 +CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND = 1 +CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2 +CUdriverProcAddressQueryResult_enum = ctypes.c_uint32 # enum +CUdriverProcAddressQueryResult = CUdriverProcAddressQueryResult_enum +CUdriverProcAddressQueryResult__enumvalues = CUdriverProcAddressQueryResult_enum__enumvalues + # values for enumeration 'CUexecAffinityType_enum' CUexecAffinityType_enum__enumvalues = { 0: 'CU_EXEC_AFFINITY_TYPE_SM_COUNT', @@ -1523,6 +1827,31 @@ struct_CUexecAffinityParam_st._fields_ = [ CUexecAffinityParam_v1 = struct_CUexecAffinityParam_st CUexecAffinityParam = struct_CUexecAffinityParam_st +# values for enumeration 'CUlibraryOption_enum' +CUlibraryOption_enum__enumvalues = { + 0: 'CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE', + 1: 'CU_LIBRARY_BINARY_IS_PRESERVED', + 2: 'CU_LIBRARY_NUM_OPTIONS', +} +CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE = 0 +CU_LIBRARY_BINARY_IS_PRESERVED = 1 +CU_LIBRARY_NUM_OPTIONS = 2 
+CUlibraryOption_enum = ctypes.c_uint32 # enum +CUlibraryOption = CUlibraryOption_enum +CUlibraryOption__enumvalues = CUlibraryOption_enum__enumvalues +class struct_CUlibraryHostUniversalFunctionAndDataTable_st(Structure): + pass + +struct_CUlibraryHostUniversalFunctionAndDataTable_st._pack_ = 1 # source:False +struct_CUlibraryHostUniversalFunctionAndDataTable_st._fields_ = [ + ('functionTable', ctypes.POINTER(None)), + ('functionWindowSize', ctypes.c_uint64), + ('dataTable', ctypes.POINTER(None)), + ('dataWindowSize', ctypes.c_uint64), +] + +CUlibraryHostUniversalFunctionAndDataTable = struct_CUlibraryHostUniversalFunctionAndDataTable_st + # values for enumeration 'cudaError_enum' cudaError_enum__enumvalues = { 0: 'CUDA_SUCCESS', @@ -1535,6 +1864,7 @@ cudaError_enum__enumvalues = { 7: 'CUDA_ERROR_PROFILER_ALREADY_STARTED', 8: 'CUDA_ERROR_PROFILER_ALREADY_STOPPED', 34: 'CUDA_ERROR_STUB_LIBRARY', + 46: 'CUDA_ERROR_DEVICE_UNAVAILABLE', 100: 'CUDA_ERROR_NO_DEVICE', 101: 'CUDA_ERROR_INVALID_DEVICE', 102: 'CUDA_ERROR_DEVICE_NOT_LICENSED', @@ -1599,6 +1929,9 @@ cudaError_enum__enumvalues = { 807: 'CUDA_ERROR_MPS_SERVER_NOT_READY', 808: 'CUDA_ERROR_MPS_MAX_CLIENTS_REACHED', 809: 'CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED', + 810: 'CUDA_ERROR_MPS_CLIENT_TERMINATED', + 811: 'CUDA_ERROR_CDP_NOT_SUPPORTED', + 812: 'CUDA_ERROR_CDP_VERSION_MISMATCH', 900: 'CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED', 901: 'CUDA_ERROR_STREAM_CAPTURE_INVALIDATED', 902: 'CUDA_ERROR_STREAM_CAPTURE_MERGE', @@ -1611,6 +1944,7 @@ cudaError_enum__enumvalues = { 909: 'CUDA_ERROR_TIMEOUT', 910: 'CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE', 911: 'CUDA_ERROR_EXTERNAL_DEVICE', + 912: 'CUDA_ERROR_INVALID_CLUSTER_SIZE', 999: 'CUDA_ERROR_UNKNOWN', } CUDA_SUCCESS = 0 @@ -1623,6 +1957,7 @@ CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6 CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 CUDA_ERROR_STUB_LIBRARY = 34 +CUDA_ERROR_DEVICE_UNAVAILABLE = 46 CUDA_ERROR_NO_DEVICE = 100 CUDA_ERROR_INVALID_DEVICE = 
101 CUDA_ERROR_DEVICE_NOT_LICENSED = 102 @@ -1687,6 +2022,9 @@ CUDA_ERROR_MPS_RPC_FAILURE = 806 CUDA_ERROR_MPS_SERVER_NOT_READY = 807 CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808 CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809 +CUDA_ERROR_MPS_CLIENT_TERMINATED = 810 +CUDA_ERROR_CDP_NOT_SUPPORTED = 811 +CUDA_ERROR_CDP_VERSION_MISMATCH = 812 CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 @@ -1699,6 +2037,7 @@ CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 CUDA_ERROR_TIMEOUT = 909 CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910 CUDA_ERROR_EXTERNAL_DEVICE = 911 +CUDA_ERROR_INVALID_CLUSTER_SIZE = 912 CUDA_ERROR_UNKNOWN = 999 cudaError_enum = ctypes.c_uint32 # enum CUresult = cudaError_enum @@ -1875,6 +2214,18 @@ struct_CUDA_ARRAY_SPARSE_PROPERTIES_st._fields_ = [ CUDA_ARRAY_SPARSE_PROPERTIES_v1 = struct_CUDA_ARRAY_SPARSE_PROPERTIES_st CUDA_ARRAY_SPARSE_PROPERTIES = struct_CUDA_ARRAY_SPARSE_PROPERTIES_st +class struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st(Structure): + pass + +struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st._pack_ = 1 # source:False +struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st._fields_ = [ + ('size', ctypes.c_uint64), + ('alignment', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 4), +] + +CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 = struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st +CUDA_ARRAY_MEMORY_REQUIREMENTS = struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st class struct_CUDA_RESOURCE_DESC_st(Structure): pass @@ -2064,6 +2415,102 @@ struct_CUDA_RESOURCE_VIEW_DESC_st._fields_ = [ CUDA_RESOURCE_VIEW_DESC_v1 = struct_CUDA_RESOURCE_VIEW_DESC_st CUDA_RESOURCE_VIEW_DESC = struct_CUDA_RESOURCE_VIEW_DESC_st +class struct_CUtensorMap_st(Structure): + pass + +struct_CUtensorMap_st._pack_ = 1 # source:False +struct_CUtensorMap_st._fields_ = [ + ('opaque', ctypes.c_uint64 * 16), +] + +CUtensorMap = struct_CUtensorMap_st + +# values for enumeration 'CUtensorMapDataType_enum' +CUtensorMapDataType_enum__enumvalues = { + 0: 
'CU_TENSOR_MAP_DATA_TYPE_UINT8', + 1: 'CU_TENSOR_MAP_DATA_TYPE_UINT16', + 2: 'CU_TENSOR_MAP_DATA_TYPE_UINT32', + 3: 'CU_TENSOR_MAP_DATA_TYPE_INT32', + 4: 'CU_TENSOR_MAP_DATA_TYPE_UINT64', + 5: 'CU_TENSOR_MAP_DATA_TYPE_INT64', + 6: 'CU_TENSOR_MAP_DATA_TYPE_FLOAT16', + 7: 'CU_TENSOR_MAP_DATA_TYPE_FLOAT32', + 8: 'CU_TENSOR_MAP_DATA_TYPE_FLOAT64', + 9: 'CU_TENSOR_MAP_DATA_TYPE_BFLOAT16', + 10: 'CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ', + 11: 'CU_TENSOR_MAP_DATA_TYPE_TFLOAT32', + 12: 'CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ', +} +CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0 +CU_TENSOR_MAP_DATA_TYPE_UINT16 = 1 +CU_TENSOR_MAP_DATA_TYPE_UINT32 = 2 +CU_TENSOR_MAP_DATA_TYPE_INT32 = 3 +CU_TENSOR_MAP_DATA_TYPE_UINT64 = 4 +CU_TENSOR_MAP_DATA_TYPE_INT64 = 5 +CU_TENSOR_MAP_DATA_TYPE_FLOAT16 = 6 +CU_TENSOR_MAP_DATA_TYPE_FLOAT32 = 7 +CU_TENSOR_MAP_DATA_TYPE_FLOAT64 = 8 +CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 = 9 +CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ = 10 +CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 = 11 +CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ = 12 +CUtensorMapDataType_enum = ctypes.c_uint32 # enum +CUtensorMapDataType = CUtensorMapDataType_enum +CUtensorMapDataType__enumvalues = CUtensorMapDataType_enum__enumvalues + +# values for enumeration 'CUtensorMapInterleave_enum' +CUtensorMapInterleave_enum__enumvalues = { + 0: 'CU_TENSOR_MAP_INTERLEAVE_NONE', + 1: 'CU_TENSOR_MAP_INTERLEAVE_16B', + 2: 'CU_TENSOR_MAP_INTERLEAVE_32B', +} +CU_TENSOR_MAP_INTERLEAVE_NONE = 0 +CU_TENSOR_MAP_INTERLEAVE_16B = 1 +CU_TENSOR_MAP_INTERLEAVE_32B = 2 +CUtensorMapInterleave_enum = ctypes.c_uint32 # enum +CUtensorMapInterleave = CUtensorMapInterleave_enum +CUtensorMapInterleave__enumvalues = CUtensorMapInterleave_enum__enumvalues + +# values for enumeration 'CUtensorMapSwizzle_enum' +CUtensorMapSwizzle_enum__enumvalues = { + 0: 'CU_TENSOR_MAP_SWIZZLE_NONE', + 1: 'CU_TENSOR_MAP_SWIZZLE_32B', + 2: 'CU_TENSOR_MAP_SWIZZLE_64B', + 3: 'CU_TENSOR_MAP_SWIZZLE_128B', +} +CU_TENSOR_MAP_SWIZZLE_NONE = 0 +CU_TENSOR_MAP_SWIZZLE_32B = 1 
+CU_TENSOR_MAP_SWIZZLE_64B = 2 +CU_TENSOR_MAP_SWIZZLE_128B = 3 +CUtensorMapSwizzle_enum = ctypes.c_uint32 # enum +CUtensorMapSwizzle = CUtensorMapSwizzle_enum +CUtensorMapSwizzle__enumvalues = CUtensorMapSwizzle_enum__enumvalues + +# values for enumeration 'CUtensorMapL2promotion_enum' +CUtensorMapL2promotion_enum__enumvalues = { + 0: 'CU_TENSOR_MAP_L2_PROMOTION_NONE', + 1: 'CU_TENSOR_MAP_L2_PROMOTION_L2_64B', + 2: 'CU_TENSOR_MAP_L2_PROMOTION_L2_128B', + 3: 'CU_TENSOR_MAP_L2_PROMOTION_L2_256B', +} +CU_TENSOR_MAP_L2_PROMOTION_NONE = 0 +CU_TENSOR_MAP_L2_PROMOTION_L2_64B = 1 +CU_TENSOR_MAP_L2_PROMOTION_L2_128B = 2 +CU_TENSOR_MAP_L2_PROMOTION_L2_256B = 3 +CUtensorMapL2promotion_enum = ctypes.c_uint32 # enum +CUtensorMapL2promotion = CUtensorMapL2promotion_enum +CUtensorMapL2promotion__enumvalues = CUtensorMapL2promotion_enum__enumvalues + +# values for enumeration 'CUtensorMapFloatOOBfill_enum' +CUtensorMapFloatOOBfill_enum__enumvalues = { + 0: 'CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE', + 1: 'CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA', +} +CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0 +CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA = 1 +CUtensorMapFloatOOBfill_enum = ctypes.c_uint32 # enum +CUtensorMapFloatOOBfill = CUtensorMapFloatOOBfill_enum +CUtensorMapFloatOOBfill__enumvalues = CUtensorMapFloatOOBfill_enum__enumvalues class struct_CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st(Structure): pass @@ -2456,6 +2903,17 @@ CUmemAllocationGranularity_flags_enum = ctypes.c_uint32 # enum CUmemAllocationGranularity_flags = CUmemAllocationGranularity_flags_enum CUmemAllocationGranularity_flags__enumvalues = CUmemAllocationGranularity_flags_enum__enumvalues +# values for enumeration 'CUmemRangeHandleType_enum' +CUmemRangeHandleType_enum__enumvalues = { + 1: 'CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD', + 2147483647: 'CU_MEM_RANGE_HANDLE_TYPE_MAX', +} +CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 1 +CU_MEM_RANGE_HANDLE_TYPE_MAX = 2147483647 +CUmemRangeHandleType_enum = ctypes.c_uint32 # enum 
+CUmemRangeHandleType = CUmemRangeHandleType_enum +CUmemRangeHandleType__enumvalues = CUmemRangeHandleType_enum__enumvalues + # values for enumeration 'CUarraySparseSubresourceType_enum' CUarraySparseSubresourceType_enum__enumvalues = { 0: 'CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL', @@ -2628,6 +3086,7 @@ CUgraphExecUpdateResult_enum__enumvalues = { 5: 'CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED', 6: 'CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED', 7: 'CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE', + 8: 'CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED', } CU_GRAPH_EXEC_UPDATE_SUCCESS = 0 CU_GRAPH_EXEC_UPDATE_ERROR = 1 @@ -2637,9 +3096,23 @@ CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 4 CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 5 CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 6 CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 7 +CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 8 CUgraphExecUpdateResult_enum = ctypes.c_uint32 # enum CUgraphExecUpdateResult = CUgraphExecUpdateResult_enum CUgraphExecUpdateResult__enumvalues = CUgraphExecUpdateResult_enum__enumvalues +class struct_CUgraphExecUpdateResultInfo_st(Structure): + pass + +struct_CUgraphExecUpdateResultInfo_st._pack_ = 1 # source:False +struct_CUgraphExecUpdateResultInfo_st._fields_ = [ + ('result', CUgraphExecUpdateResult), + ('PADDING_0', ctypes.c_ubyte * 4), + ('errorNode', ctypes.POINTER(struct_CUgraphNode_st)), + ('errorFromNode', ctypes.POINTER(struct_CUgraphNode_st)), +] + +CUgraphExecUpdateResultInfo_v1 = struct_CUgraphExecUpdateResultInfo_st +CUgraphExecUpdateResultInfo = struct_CUgraphExecUpdateResultInfo_st # values for enumeration 'CUmemPool_attribute_enum' CUmemPool_attribute_enum__enumvalues = { @@ -2775,6 +3248,8 @@ CUgraphDebugDot_flags_enum__enumvalues = { 1024: 'CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES', 2048: 'CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS', 4096: 'CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS', + 8192: 'CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS', + 
16384: 'CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO', } CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1 CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 2 @@ -2789,6 +3264,8 @@ CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 512 CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1024 CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 2048 CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 4096 +CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 8192 +CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 16384 CUgraphDebugDot_flags_enum = ctypes.c_uint32 # enum CUgraphDebugDot_flags = CUgraphDebugDot_flags_enum CUgraphDebugDot_flags__enumvalues = CUgraphDebugDot_flags_enum__enumvalues @@ -2814,8 +3291,14 @@ CUuserObjectRetain_flags__enumvalues = CUuserObjectRetain_flags_enum__enumvalues # values for enumeration 'CUgraphInstantiate_flags_enum' CUgraphInstantiate_flags_enum__enumvalues = { 1: 'CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH', + 2: 'CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD', + 4: 'CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH', + 8: 'CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY', } CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 +CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD = 2 +CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH = 4 +CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = 8 CUgraphInstantiate_flags_enum = ctypes.c_uint32 # enum CUgraphInstantiate_flags = CUgraphInstantiate_flags_enum CUgraphInstantiate_flags__enumvalues = CUgraphInstantiate_flags_enum__enumvalues @@ -2921,6 +3404,12 @@ try: cuDeviceGetDefaultMemPool.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUmemPoolHandle_st)), CUdevice] except AttributeError: pass +try: + cuDeviceGetExecAffinitySupport = _libraries['libcuda.so'].cuDeviceGetExecAffinitySupport + cuDeviceGetExecAffinitySupport.restype = CUresult + cuDeviceGetExecAffinitySupport.argtypes = [ctypes.POINTER(ctypes.c_int32), CUexecAffinityType, CUdevice] +except AttributeError: + pass try: cuFlushGPUDirectRDMAWrites = _libraries['libcuda.so'].cuFlushGPUDirectRDMAWrites 
cuFlushGPUDirectRDMAWrites.restype = CUresult @@ -2969,12 +3458,6 @@ try: cuDevicePrimaryCtxReset_v2.argtypes = [CUdevice] except AttributeError: pass -try: - cuDeviceGetExecAffinitySupport = _libraries['libcuda.so'].cuDeviceGetExecAffinitySupport - cuDeviceGetExecAffinitySupport.restype = CUresult - cuDeviceGetExecAffinitySupport.argtypes = [ctypes.POINTER(ctypes.c_int32), CUexecAffinityType, CUdevice] -except AttributeError: - pass try: cuCtxCreate_v2 = _libraries['libcuda.so'].cuCtxCreate_v2 cuCtxCreate_v2.restype = CUresult @@ -3029,6 +3512,12 @@ try: cuCtxGetFlags.argtypes = [ctypes.POINTER(ctypes.c_uint32)] except AttributeError: pass +try: + cuCtxGetId = _libraries['libcuda.so'].cuCtxGetId + cuCtxGetId.restype = CUresult + cuCtxGetId.argtypes = [CUcontext, ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass try: cuCtxSynchronize = _libraries['libcuda.so'].cuCtxSynchronize cuCtxSynchronize.restype = CUresult @@ -3138,6 +3627,23 @@ try: cuModuleUnload.argtypes = [CUmodule] except AttributeError: pass + +# values for enumeration 'CUmoduleLoadingMode_enum' +CUmoduleLoadingMode_enum__enumvalues = { + 1: 'CU_MODULE_EAGER_LOADING', + 2: 'CU_MODULE_LAZY_LOADING', +} +CU_MODULE_EAGER_LOADING = 1 +CU_MODULE_LAZY_LOADING = 2 +CUmoduleLoadingMode_enum = ctypes.c_uint32 # enum +CUmoduleLoadingMode = CUmoduleLoadingMode_enum +CUmoduleLoadingMode__enumvalues = CUmoduleLoadingMode_enum__enumvalues +try: + cuModuleGetLoadingMode = _libraries['libcuda.so'].cuModuleGetLoadingMode + cuModuleGetLoadingMode.restype = CUresult + cuModuleGetLoadingMode.argtypes = [ctypes.POINTER(CUmoduleLoadingMode_enum)] +except AttributeError: + pass try: cuModuleGetFunction = _libraries['libcuda.so'].cuModuleGetFunction cuModuleGetFunction.restype = CUresult @@ -3150,18 +3656,6 @@ try: cuModuleGetGlobal_v2.argtypes = [ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), CUmodule, ctypes.POINTER(ctypes.c_char)] except AttributeError: pass -try: - cuModuleGetTexRef = 
_libraries['libcuda.so'].cuModuleGetTexRef - cuModuleGetTexRef.restype = CUresult - cuModuleGetTexRef.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUtexref_st)), CUmodule, ctypes.POINTER(ctypes.c_char)] -except AttributeError: - pass -try: - cuModuleGetSurfRef = _libraries['libcuda.so'].cuModuleGetSurfRef - cuModuleGetSurfRef.restype = CUresult - cuModuleGetSurfRef.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUsurfref_st)), CUmodule, ctypes.POINTER(ctypes.c_char)] -except AttributeError: - pass try: cuLinkCreate_v2 = _libraries['libcuda.so'].cuLinkCreate_v2 cuLinkCreate_v2.restype = CUresult @@ -3192,6 +3686,90 @@ try: cuLinkDestroy.argtypes = [CUlinkState] except AttributeError: pass +try: + cuModuleGetTexRef = _libraries['libcuda.so'].cuModuleGetTexRef + cuModuleGetTexRef.restype = CUresult + cuModuleGetTexRef.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUtexref_st)), CUmodule, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuModuleGetSurfRef = _libraries['libcuda.so'].cuModuleGetSurfRef + cuModuleGetSurfRef.restype = CUresult + cuModuleGetSurfRef.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUsurfref_st)), CUmodule, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuLibraryLoadData = _libraries['libcuda.so'].cuLibraryLoadData + cuLibraryLoadData.restype = CUresult + cuLibraryLoadData.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUlib_st)), ctypes.POINTER(None), ctypes.POINTER(CUjit_option_enum), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_uint32, ctypes.POINTER(CUlibraryOption_enum), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_uint32] +except AttributeError: + pass +try: + cuLibraryLoadFromFile = _libraries['libcuda.so'].cuLibraryLoadFromFile + cuLibraryLoadFromFile.restype = CUresult + cuLibraryLoadFromFile.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUlib_st)), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(CUjit_option_enum), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_uint32, 
ctypes.POINTER(CUlibraryOption_enum), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_uint32] +except AttributeError: + pass +try: + cuLibraryUnload = _libraries['libcuda.so'].cuLibraryUnload + cuLibraryUnload.restype = CUresult + cuLibraryUnload.argtypes = [CUlibrary] +except AttributeError: + pass +try: + cuLibraryGetKernel = _libraries['libcuda.so'].cuLibraryGetKernel + cuLibraryGetKernel.restype = CUresult + cuLibraryGetKernel.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUkern_st)), CUlibrary, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuLibraryGetModule = _libraries['libcuda.so'].cuLibraryGetModule + cuLibraryGetModule.restype = CUresult + cuLibraryGetModule.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUmod_st)), CUlibrary] +except AttributeError: + pass +try: + cuKernelGetFunction = _libraries['libcuda.so'].cuKernelGetFunction + cuKernelGetFunction.restype = CUresult + cuKernelGetFunction.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUfunc_st)), CUkernel] +except AttributeError: + pass +try: + cuLibraryGetGlobal = _libraries['libcuda.so'].cuLibraryGetGlobal + cuLibraryGetGlobal.restype = CUresult + cuLibraryGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), CUlibrary, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuLibraryGetManaged = _libraries['libcuda.so'].cuLibraryGetManaged + cuLibraryGetManaged.restype = CUresult + cuLibraryGetManaged.argtypes = [ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), CUlibrary, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuLibraryGetUnifiedFunction = _libraries['libcuda.so'].cuLibraryGetUnifiedFunction + cuLibraryGetUnifiedFunction.restype = CUresult + cuLibraryGetUnifiedFunction.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), CUlibrary, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuKernelGetAttribute = _libraries['libcuda.so'].cuKernelGetAttribute + 
cuKernelGetAttribute.restype = CUresult + cuKernelGetAttribute.argtypes = [ctypes.POINTER(ctypes.c_int32), CUfunction_attribute, CUkernel, CUdevice] +except AttributeError: + pass +try: + cuKernelSetAttribute = _libraries['libcuda.so'].cuKernelSetAttribute + cuKernelSetAttribute.restype = CUresult + cuKernelSetAttribute.argtypes = [CUfunction_attribute, ctypes.c_int32, CUkernel, CUdevice] +except AttributeError: + pass +try: + cuKernelSetCacheConfig = _libraries['libcuda.so'].cuKernelSetCacheConfig + cuKernelSetCacheConfig.restype = CUresult + cuKernelSetCacheConfig.argtypes = [CUkernel, CUfunc_cache, CUdevice] +except AttributeError: + pass try: cuMemGetInfo_v2 = _libraries['libcuda.so'].cuMemGetInfo_v2 cuMemGetInfo_v2.restype = CUresult @@ -3313,219 +3891,219 @@ try: except AttributeError: pass try: - cuMemcpy = _libraries['libcuda.so'].cuMemcpy - cuMemcpy.restype = CUresult - cuMemcpy.argtypes = [CUdeviceptr, CUdeviceptr, size_t] + cuMemcpy_ptds = _libraries['libcuda.so'].cuMemcpy_ptds + cuMemcpy_ptds.restype = CUresult + cuMemcpy_ptds.argtypes = [CUdeviceptr, CUdeviceptr, size_t] except AttributeError: pass try: - cuMemcpyPeer = _libraries['libcuda.so'].cuMemcpyPeer - cuMemcpyPeer.restype = CUresult - cuMemcpyPeer.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t] + cuMemcpyPeer_ptds = _libraries['libcuda.so'].cuMemcpyPeer_ptds + cuMemcpyPeer_ptds.restype = CUresult + cuMemcpyPeer_ptds.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t] except AttributeError: pass try: - cuMemcpyHtoD_v2 = _libraries['libcuda.so'].cuMemcpyHtoD_v2 - cuMemcpyHtoD_v2.restype = CUresult - cuMemcpyHtoD_v2.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t] + cuMemcpyHtoD_v2_ptds = _libraries['libcuda.so'].cuMemcpyHtoD_v2_ptds + cuMemcpyHtoD_v2_ptds.restype = CUresult + cuMemcpyHtoD_v2_ptds.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t] except AttributeError: pass try: - cuMemcpyDtoH_v2 = _libraries['libcuda.so'].cuMemcpyDtoH_v2 - 
cuMemcpyDtoH_v2.restype = CUresult - cuMemcpyDtoH_v2.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t] + cuMemcpyDtoH_v2_ptds = _libraries['libcuda.so'].cuMemcpyDtoH_v2_ptds + cuMemcpyDtoH_v2_ptds.restype = CUresult + cuMemcpyDtoH_v2_ptds.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t] except AttributeError: pass try: - cuMemcpyDtoD_v2 = _libraries['libcuda.so'].cuMemcpyDtoD_v2 - cuMemcpyDtoD_v2.restype = CUresult - cuMemcpyDtoD_v2.argtypes = [CUdeviceptr, CUdeviceptr, size_t] + cuMemcpyDtoD_v2_ptds = _libraries['libcuda.so'].cuMemcpyDtoD_v2_ptds + cuMemcpyDtoD_v2_ptds.restype = CUresult + cuMemcpyDtoD_v2_ptds.argtypes = [CUdeviceptr, CUdeviceptr, size_t] except AttributeError: pass try: - cuMemcpyDtoA_v2 = _libraries['libcuda.so'].cuMemcpyDtoA_v2 - cuMemcpyDtoA_v2.restype = CUresult - cuMemcpyDtoA_v2.argtypes = [CUarray, size_t, CUdeviceptr, size_t] + cuMemcpyDtoA_v2_ptds = _libraries['libcuda.so'].cuMemcpyDtoA_v2_ptds + cuMemcpyDtoA_v2_ptds.restype = CUresult + cuMemcpyDtoA_v2_ptds.argtypes = [CUarray, size_t, CUdeviceptr, size_t] except AttributeError: pass try: - cuMemcpyAtoD_v2 = _libraries['libcuda.so'].cuMemcpyAtoD_v2 - cuMemcpyAtoD_v2.restype = CUresult - cuMemcpyAtoD_v2.argtypes = [CUdeviceptr, CUarray, size_t, size_t] + cuMemcpyAtoD_v2_ptds = _libraries['libcuda.so'].cuMemcpyAtoD_v2_ptds + cuMemcpyAtoD_v2_ptds.restype = CUresult + cuMemcpyAtoD_v2_ptds.argtypes = [CUdeviceptr, CUarray, size_t, size_t] except AttributeError: pass try: - cuMemcpyHtoA_v2 = _libraries['libcuda.so'].cuMemcpyHtoA_v2 - cuMemcpyHtoA_v2.restype = CUresult - cuMemcpyHtoA_v2.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t] + cuMemcpyHtoA_v2_ptds = _libraries['libcuda.so'].cuMemcpyHtoA_v2_ptds + cuMemcpyHtoA_v2_ptds.restype = CUresult + cuMemcpyHtoA_v2_ptds.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t] except AttributeError: pass try: - cuMemcpyAtoH_v2 = _libraries['libcuda.so'].cuMemcpyAtoH_v2 - cuMemcpyAtoH_v2.restype = CUresult - 
cuMemcpyAtoH_v2.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t] + cuMemcpyAtoH_v2_ptds = _libraries['libcuda.so'].cuMemcpyAtoH_v2_ptds + cuMemcpyAtoH_v2_ptds.restype = CUresult + cuMemcpyAtoH_v2_ptds.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t] except AttributeError: pass try: - cuMemcpyAtoA_v2 = _libraries['libcuda.so'].cuMemcpyAtoA_v2 - cuMemcpyAtoA_v2.restype = CUresult - cuMemcpyAtoA_v2.argtypes = [CUarray, size_t, CUarray, size_t, size_t] + cuMemcpyAtoA_v2_ptds = _libraries['libcuda.so'].cuMemcpyAtoA_v2_ptds + cuMemcpyAtoA_v2_ptds.restype = CUresult + cuMemcpyAtoA_v2_ptds.argtypes = [CUarray, size_t, CUarray, size_t, size_t] except AttributeError: pass try: - cuMemcpy2D_v2 = _libraries['libcuda.so'].cuMemcpy2D_v2 - cuMemcpy2D_v2.restype = CUresult - cuMemcpy2D_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] + cuMemcpy2D_v2_ptds = _libraries['libcuda.so'].cuMemcpy2D_v2_ptds + cuMemcpy2D_v2_ptds.restype = CUresult + cuMemcpy2D_v2_ptds.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] except AttributeError: pass try: - cuMemcpy2DUnaligned_v2 = _libraries['libcuda.so'].cuMemcpy2DUnaligned_v2 - cuMemcpy2DUnaligned_v2.restype = CUresult - cuMemcpy2DUnaligned_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] + cuMemcpy2DUnaligned_v2_ptds = _libraries['libcuda.so'].cuMemcpy2DUnaligned_v2_ptds + cuMemcpy2DUnaligned_v2_ptds.restype = CUresult + cuMemcpy2DUnaligned_v2_ptds.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] except AttributeError: pass try: - cuMemcpy3D_v2 = _libraries['libcuda.so'].cuMemcpy3D_v2 - cuMemcpy3D_v2.restype = CUresult - cuMemcpy3D_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_st)] + cuMemcpy3D_v2_ptds = _libraries['libcuda.so'].cuMemcpy3D_v2_ptds + cuMemcpy3D_v2_ptds.restype = CUresult + cuMemcpy3D_v2_ptds.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_st)] except AttributeError: pass try: - cuMemcpy3DPeer = _libraries['libcuda.so'].cuMemcpy3DPeer - cuMemcpy3DPeer.restype = CUresult - 
cuMemcpy3DPeer.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st)] + cuMemcpy3DPeer_ptds = _libraries['libcuda.so'].cuMemcpy3DPeer_ptds + cuMemcpy3DPeer_ptds.restype = CUresult + cuMemcpy3DPeer_ptds.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st)] except AttributeError: pass try: - cuMemcpyAsync = _libraries['libcuda.so'].cuMemcpyAsync - cuMemcpyAsync.restype = CUresult - cuMemcpyAsync.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] + cuMemcpyAsync_ptsz = _libraries['libcuda.so'].cuMemcpyAsync_ptsz + cuMemcpyAsync_ptsz.restype = CUresult + cuMemcpyAsync_ptsz.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] except AttributeError: pass try: - cuMemcpyPeerAsync = _libraries['libcuda.so'].cuMemcpyPeerAsync - cuMemcpyPeerAsync.restype = CUresult - cuMemcpyPeerAsync.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream] + cuMemcpyPeerAsync_ptsz = _libraries['libcuda.so'].cuMemcpyPeerAsync_ptsz + cuMemcpyPeerAsync_ptsz.restype = CUresult + cuMemcpyPeerAsync_ptsz.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream] except AttributeError: pass try: - cuMemcpyHtoDAsync_v2 = _libraries['libcuda.so'].cuMemcpyHtoDAsync_v2 - cuMemcpyHtoDAsync_v2.restype = CUresult - cuMemcpyHtoDAsync_v2.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t, CUstream] + cuMemcpyHtoDAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpyHtoDAsync_v2_ptsz + cuMemcpyHtoDAsync_v2_ptsz.restype = CUresult + cuMemcpyHtoDAsync_v2_ptsz.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t, CUstream] except AttributeError: pass try: - cuMemcpyDtoHAsync_v2 = _libraries['libcuda.so'].cuMemcpyDtoHAsync_v2 - cuMemcpyDtoHAsync_v2.restype = CUresult - cuMemcpyDtoHAsync_v2.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t, CUstream] + cuMemcpyDtoHAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpyDtoHAsync_v2_ptsz + cuMemcpyDtoHAsync_v2_ptsz.restype = CUresult + cuMemcpyDtoHAsync_v2_ptsz.argtypes = [ctypes.POINTER(None), CUdeviceptr, 
size_t, CUstream] except AttributeError: pass try: - cuMemcpyDtoDAsync_v2 = _libraries['libcuda.so'].cuMemcpyDtoDAsync_v2 - cuMemcpyDtoDAsync_v2.restype = CUresult - cuMemcpyDtoDAsync_v2.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] + cuMemcpyDtoDAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpyDtoDAsync_v2_ptsz + cuMemcpyDtoDAsync_v2_ptsz.restype = CUresult + cuMemcpyDtoDAsync_v2_ptsz.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] except AttributeError: pass try: - cuMemcpyHtoAAsync_v2 = _libraries['libcuda.so'].cuMemcpyHtoAAsync_v2 - cuMemcpyHtoAAsync_v2.restype = CUresult - cuMemcpyHtoAAsync_v2.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t, CUstream] + cuMemcpyHtoAAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpyHtoAAsync_v2_ptsz + cuMemcpyHtoAAsync_v2_ptsz.restype = CUresult + cuMemcpyHtoAAsync_v2_ptsz.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t, CUstream] except AttributeError: pass try: - cuMemcpyAtoHAsync_v2 = _libraries['libcuda.so'].cuMemcpyAtoHAsync_v2 - cuMemcpyAtoHAsync_v2.restype = CUresult - cuMemcpyAtoHAsync_v2.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t, CUstream] + cuMemcpyAtoHAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpyAtoHAsync_v2_ptsz + cuMemcpyAtoHAsync_v2_ptsz.restype = CUresult + cuMemcpyAtoHAsync_v2_ptsz.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t, CUstream] except AttributeError: pass try: - cuMemcpy2DAsync_v2 = _libraries['libcuda.so'].cuMemcpy2DAsync_v2 - cuMemcpy2DAsync_v2.restype = CUresult - cuMemcpy2DAsync_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st), CUstream] + cuMemcpy2DAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpy2DAsync_v2_ptsz + cuMemcpy2DAsync_v2_ptsz.restype = CUresult + cuMemcpy2DAsync_v2_ptsz.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st), CUstream] except AttributeError: pass try: - cuMemcpy3DAsync_v2 = _libraries['libcuda.so'].cuMemcpy3DAsync_v2 - cuMemcpy3DAsync_v2.restype = CUresult - cuMemcpy3DAsync_v2.argtypes = 
[ctypes.POINTER(struct_CUDA_MEMCPY3D_st), CUstream] + cuMemcpy3DAsync_v2_ptsz = _libraries['libcuda.so'].cuMemcpy3DAsync_v2_ptsz + cuMemcpy3DAsync_v2_ptsz.restype = CUresult + cuMemcpy3DAsync_v2_ptsz.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_st), CUstream] except AttributeError: pass try: - cuMemcpy3DPeerAsync = _libraries['libcuda.so'].cuMemcpy3DPeerAsync - cuMemcpy3DPeerAsync.restype = CUresult - cuMemcpy3DPeerAsync.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st), CUstream] + cuMemcpy3DPeerAsync_ptsz = _libraries['libcuda.so'].cuMemcpy3DPeerAsync_ptsz + cuMemcpy3DPeerAsync_ptsz.restype = CUresult + cuMemcpy3DPeerAsync_ptsz.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st), CUstream] except AttributeError: pass try: - cuMemsetD8_v2 = _libraries['libcuda.so'].cuMemsetD8_v2 - cuMemsetD8_v2.restype = CUresult - cuMemsetD8_v2.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t] + cuMemsetD8_v2_ptds = _libraries['libcuda.so'].cuMemsetD8_v2_ptds + cuMemsetD8_v2_ptds.restype = CUresult + cuMemsetD8_v2_ptds.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t] except AttributeError: pass try: - cuMemsetD16_v2 = _libraries['libcuda.so'].cuMemsetD16_v2 - cuMemsetD16_v2.restype = CUresult - cuMemsetD16_v2.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t] + cuMemsetD16_v2_ptds = _libraries['libcuda.so'].cuMemsetD16_v2_ptds + cuMemsetD16_v2_ptds.restype = CUresult + cuMemsetD16_v2_ptds.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t] except AttributeError: pass try: - cuMemsetD32_v2 = _libraries['libcuda.so'].cuMemsetD32_v2 - cuMemsetD32_v2.restype = CUresult - cuMemsetD32_v2.argtypes = [CUdeviceptr, ctypes.c_uint32, size_t] + cuMemsetD32_v2_ptds = _libraries['libcuda.so'].cuMemsetD32_v2_ptds + cuMemsetD32_v2_ptds.restype = CUresult + cuMemsetD32_v2_ptds.argtypes = [CUdeviceptr, ctypes.c_uint32, size_t] except AttributeError: pass try: - cuMemsetD2D8_v2 = _libraries['libcuda.so'].cuMemsetD2D8_v2 - cuMemsetD2D8_v2.restype = CUresult - cuMemsetD2D8_v2.argtypes = 
[CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t] + cuMemsetD2D8_v2_ptds = _libraries['libcuda.so'].cuMemsetD2D8_v2_ptds + cuMemsetD2D8_v2_ptds.restype = CUresult + cuMemsetD2D8_v2_ptds.argtypes = [CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t] except AttributeError: pass try: - cuMemsetD2D16_v2 = _libraries['libcuda.so'].cuMemsetD2D16_v2 - cuMemsetD2D16_v2.restype = CUresult - cuMemsetD2D16_v2.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t] + cuMemsetD2D16_v2_ptds = _libraries['libcuda.so'].cuMemsetD2D16_v2_ptds + cuMemsetD2D16_v2_ptds.restype = CUresult + cuMemsetD2D16_v2_ptds.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t] except AttributeError: pass try: - cuMemsetD2D32_v2 = _libraries['libcuda.so'].cuMemsetD2D32_v2 - cuMemsetD2D32_v2.restype = CUresult - cuMemsetD2D32_v2.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t] + cuMemsetD2D32_v2_ptds = _libraries['libcuda.so'].cuMemsetD2D32_v2_ptds + cuMemsetD2D32_v2_ptds.restype = CUresult + cuMemsetD2D32_v2_ptds.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t] except AttributeError: pass try: - cuMemsetD8Async = _libraries['libcuda.so'].cuMemsetD8Async - cuMemsetD8Async.restype = CUresult - cuMemsetD8Async.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t, CUstream] + cuMemsetD8Async_ptsz = _libraries['libcuda.so'].cuMemsetD8Async_ptsz + cuMemsetD8Async_ptsz.restype = CUresult + cuMemsetD8Async_ptsz.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t, CUstream] except AttributeError: pass try: - cuMemsetD16Async = _libraries['libcuda.so'].cuMemsetD16Async - cuMemsetD16Async.restype = CUresult - cuMemsetD16Async.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t, CUstream] + cuMemsetD16Async_ptsz = _libraries['libcuda.so'].cuMemsetD16Async_ptsz + cuMemsetD16Async_ptsz.restype = CUresult + cuMemsetD16Async_ptsz.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t, CUstream] except AttributeError: pass try: - cuMemsetD32Async = 
_libraries['libcuda.so'].cuMemsetD32Async - cuMemsetD32Async.restype = CUresult - cuMemsetD32Async.argtypes = [CUdeviceptr, ctypes.c_uint32, size_t, CUstream] + cuMemsetD32Async_ptsz = _libraries['libcuda.so'].cuMemsetD32Async_ptsz + cuMemsetD32Async_ptsz.restype = CUresult + cuMemsetD32Async_ptsz.argtypes = [CUdeviceptr, ctypes.c_uint32, size_t, CUstream] except AttributeError: pass try: - cuMemsetD2D8Async = _libraries['libcuda.so'].cuMemsetD2D8Async - cuMemsetD2D8Async.restype = CUresult - cuMemsetD2D8Async.argtypes = [CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t, CUstream] + cuMemsetD2D8Async_ptsz = _libraries['libcuda.so'].cuMemsetD2D8Async_ptsz + cuMemsetD2D8Async_ptsz.restype = CUresult + cuMemsetD2D8Async_ptsz.argtypes = [CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t, CUstream] except AttributeError: pass try: - cuMemsetD2D16Async = _libraries['libcuda.so'].cuMemsetD2D16Async - cuMemsetD2D16Async.restype = CUresult - cuMemsetD2D16Async.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t, CUstream] + cuMemsetD2D16Async_ptsz = _libraries['libcuda.so'].cuMemsetD2D16Async_ptsz + cuMemsetD2D16Async_ptsz.restype = CUresult + cuMemsetD2D16Async_ptsz.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t, CUstream] except AttributeError: pass try: - cuMemsetD2D32Async = _libraries['libcuda.so'].cuMemsetD2D32Async - cuMemsetD2D32Async.restype = CUresult - cuMemsetD2D32Async.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t, CUstream] + cuMemsetD2D32Async_ptsz = _libraries['libcuda.so'].cuMemsetD2D32Async_ptsz + cuMemsetD2D32Async_ptsz.restype = CUresult + cuMemsetD2D32Async_ptsz.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t, CUstream] except AttributeError: pass try: @@ -3552,6 +4130,18 @@ try: cuMipmappedArrayGetSparseProperties.argtypes = [ctypes.POINTER(struct_CUDA_ARRAY_SPARSE_PROPERTIES_st), CUmipmappedArray] except AttributeError: pass +try: + cuArrayGetMemoryRequirements = 
_libraries['libcuda.so'].cuArrayGetMemoryRequirements + cuArrayGetMemoryRequirements.restype = CUresult + cuArrayGetMemoryRequirements.argtypes = [ctypes.POINTER(struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st), CUarray, CUdevice] +except AttributeError: + pass +try: + cuMipmappedArrayGetMemoryRequirements = _libraries['libcuda.so'].cuMipmappedArrayGetMemoryRequirements + cuMipmappedArrayGetMemoryRequirements.restype = CUresult + cuMipmappedArrayGetMemoryRequirements.argtypes = [ctypes.POINTER(struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st), CUmipmappedArray, CUdevice] +except AttributeError: + pass try: cuArrayGetPlane = _libraries['libcuda.so'].cuArrayGetPlane cuArrayGetPlane.restype = CUresult @@ -3594,6 +4184,12 @@ try: cuMipmappedArrayDestroy.argtypes = [CUmipmappedArray] except AttributeError: pass +try: + cuMemGetHandleForAddressRange = _libraries['libcuda.so'].cuMemGetHandleForAddressRange + cuMemGetHandleForAddressRange.restype = CUresult + cuMemGetHandleForAddressRange.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t, CUmemRangeHandleType, ctypes.c_uint64] +except AttributeError: + pass try: cuMemAddressReserve = _libraries['libcuda.so'].cuMemAddressReserve cuMemAddressReserve.restype = CUresult @@ -3625,9 +4221,9 @@ try: except AttributeError: pass try: - cuMemMapArrayAsync = _libraries['libcuda.so'].cuMemMapArrayAsync - cuMemMapArrayAsync.restype = CUresult - cuMemMapArrayAsync.argtypes = [ctypes.POINTER(struct_CUarrayMapInfo_st), ctypes.c_uint32, CUstream] + cuMemMapArrayAsync_ptsz = _libraries['libcuda.so'].cuMemMapArrayAsync_ptsz + cuMemMapArrayAsync_ptsz.restype = CUresult + cuMemMapArrayAsync_ptsz.argtypes = [ctypes.POINTER(struct_CUarrayMapInfo_st), ctypes.c_uint32, CUstream] except AttributeError: pass try: @@ -3679,15 +4275,15 @@ try: except AttributeError: pass try: - cuMemFreeAsync = _libraries['libcuda.so'].cuMemFreeAsync - cuMemFreeAsync.restype = CUresult - cuMemFreeAsync.argtypes = [CUdeviceptr, CUstream] + cuMemFreeAsync_ptsz = 
_libraries['libcuda.so'].cuMemFreeAsync_ptsz + cuMemFreeAsync_ptsz.restype = CUresult + cuMemFreeAsync_ptsz.argtypes = [CUdeviceptr, CUstream] except AttributeError: pass try: - cuMemAllocAsync = _libraries['libcuda.so'].cuMemAllocAsync - cuMemAllocAsync.restype = CUresult - cuMemAllocAsync.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUstream] + cuMemAllocAsync_ptsz = _libraries['libcuda.so'].cuMemAllocAsync_ptsz + cuMemAllocAsync_ptsz.restype = CUresult + cuMemAllocAsync_ptsz.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUstream] except AttributeError: pass try: @@ -3733,9 +4329,9 @@ try: except AttributeError: pass try: - cuMemAllocFromPoolAsync = _libraries['libcuda.so'].cuMemAllocFromPoolAsync - cuMemAllocFromPoolAsync.restype = CUresult - cuMemAllocFromPoolAsync.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUmemoryPool, CUstream] + cuMemAllocFromPoolAsync_ptsz = _libraries['libcuda.so'].cuMemAllocFromPoolAsync_ptsz + cuMemAllocFromPoolAsync_ptsz.restype = CUresult + cuMemAllocFromPoolAsync_ptsz.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUmemoryPool, CUstream] except AttributeError: pass try: @@ -3769,9 +4365,9 @@ try: except AttributeError: pass try: - cuMemPrefetchAsync = _libraries['libcuda.so'].cuMemPrefetchAsync - cuMemPrefetchAsync.restype = CUresult - cuMemPrefetchAsync.argtypes = [CUdeviceptr, size_t, CUdevice, CUstream] + cuMemPrefetchAsync_ptsz = _libraries['libcuda.so'].cuMemPrefetchAsync_ptsz + cuMemPrefetchAsync_ptsz.restype = CUresult + cuMemPrefetchAsync_ptsz.argtypes = [CUdeviceptr, size_t, CUdevice, CUstream] except AttributeError: pass try: @@ -3817,39 +4413,45 @@ try: except AttributeError: pass try: - cuStreamGetPriority = _libraries['libcuda.so'].cuStreamGetPriority - cuStreamGetPriority.restype = CUresult - cuStreamGetPriority.argtypes = [CUstream, ctypes.POINTER(ctypes.c_int32)] + cuStreamGetPriority_ptsz = _libraries['libcuda.so'].cuStreamGetPriority_ptsz + cuStreamGetPriority_ptsz.restype = CUresult + 
cuStreamGetPriority_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.c_int32)] except AttributeError: pass try: - cuStreamGetFlags = _libraries['libcuda.so'].cuStreamGetFlags - cuStreamGetFlags.restype = CUresult - cuStreamGetFlags.argtypes = [CUstream, ctypes.POINTER(ctypes.c_uint32)] + cuStreamGetFlags_ptsz = _libraries['libcuda.so'].cuStreamGetFlags_ptsz + cuStreamGetFlags_ptsz.restype = CUresult + cuStreamGetFlags_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.c_uint32)] except AttributeError: pass try: - cuStreamGetCtx = _libraries['libcuda.so'].cuStreamGetCtx - cuStreamGetCtx.restype = CUresult - cuStreamGetCtx.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUctx_st))] + cuStreamGetId_ptsz = _libraries['libcuda.so'].cuStreamGetId_ptsz + cuStreamGetId_ptsz.restype = CUresult + cuStreamGetId_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.c_uint64)] except AttributeError: pass try: - cuStreamWaitEvent = _libraries['libcuda.so'].cuStreamWaitEvent - cuStreamWaitEvent.restype = CUresult - cuStreamWaitEvent.argtypes = [CUstream, CUevent, ctypes.c_uint32] + cuStreamGetCtx_ptsz = _libraries['libcuda.so'].cuStreamGetCtx_ptsz + cuStreamGetCtx_ptsz.restype = CUresult + cuStreamGetCtx_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUctx_st))] except AttributeError: pass try: - cuStreamAddCallback = _libraries['libcuda.so'].cuStreamAddCallback - cuStreamAddCallback.restype = CUresult - cuStreamAddCallback.argtypes = [CUstream, CUstreamCallback, ctypes.POINTER(None), ctypes.c_uint32] + cuStreamWaitEvent_ptsz = _libraries['libcuda.so'].cuStreamWaitEvent_ptsz + cuStreamWaitEvent_ptsz.restype = CUresult + cuStreamWaitEvent_ptsz.argtypes = [CUstream, CUevent, ctypes.c_uint32] except AttributeError: pass try: - cuStreamBeginCapture_v2 = _libraries['libcuda.so'].cuStreamBeginCapture_v2 - cuStreamBeginCapture_v2.restype = CUresult - cuStreamBeginCapture_v2.argtypes = [CUstream, CUstreamCaptureMode] + cuStreamAddCallback_ptsz = 
_libraries['libcuda.so'].cuStreamAddCallback_ptsz + cuStreamAddCallback_ptsz.restype = CUresult + cuStreamAddCallback_ptsz.argtypes = [CUstream, CUstreamCallback, ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamBeginCapture_v2_ptsz = _libraries['libcuda.so'].cuStreamBeginCapture_v2_ptsz + cuStreamBeginCapture_v2_ptsz.restype = CUresult + cuStreamBeginCapture_v2_ptsz.argtypes = [CUstream, CUstreamCaptureMode] except AttributeError: pass try: @@ -3859,51 +4461,45 @@ try: except AttributeError: pass try: - cuStreamEndCapture = _libraries['libcuda.so'].cuStreamEndCapture - cuStreamEndCapture.restype = CUresult - cuStreamEndCapture.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st))] + cuStreamEndCapture_ptsz = _libraries['libcuda.so'].cuStreamEndCapture_ptsz + cuStreamEndCapture_ptsz.restype = CUresult + cuStreamEndCapture_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st))] except AttributeError: pass try: - cuStreamIsCapturing = _libraries['libcuda.so'].cuStreamIsCapturing - cuStreamIsCapturing.restype = CUresult - cuStreamIsCapturing.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum)] + cuStreamIsCapturing_ptsz = _libraries['libcuda.so'].cuStreamIsCapturing_ptsz + cuStreamIsCapturing_ptsz.restype = CUresult + cuStreamIsCapturing_ptsz.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum)] except AttributeError: pass try: - cuStreamGetCaptureInfo = _libraries['libcuda.so'].cuStreamGetCaptureInfo - cuStreamGetCaptureInfo.restype = CUresult - cuStreamGetCaptureInfo.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64)] + cuStreamGetCaptureInfo_v2_ptsz = _libraries['libcuda.so'].cuStreamGetCaptureInfo_v2_ptsz + cuStreamGetCaptureInfo_v2_ptsz.restype = CUresult + cuStreamGetCaptureInfo_v2_ptsz.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64), 
ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st)), ctypes.POINTER(ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st))), ctypes.POINTER(ctypes.c_uint64)] except AttributeError: pass try: - cuStreamGetCaptureInfo_v2 = _libraries['libcuda.so'].cuStreamGetCaptureInfo_v2 - cuStreamGetCaptureInfo_v2.restype = CUresult - cuStreamGetCaptureInfo_v2.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st)), ctypes.POINTER(ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st))), ctypes.POINTER(ctypes.c_uint64)] + cuStreamUpdateCaptureDependencies_ptsz = _libraries['libcuda.so'].cuStreamUpdateCaptureDependencies_ptsz + cuStreamUpdateCaptureDependencies_ptsz.restype = CUresult + cuStreamUpdateCaptureDependencies_ptsz.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamUpdateCaptureDependencies = _libraries['libcuda.so'].cuStreamUpdateCaptureDependencies - cuStreamUpdateCaptureDependencies.restype = CUresult - cuStreamUpdateCaptureDependencies.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.c_uint32] + cuStreamAttachMemAsync_ptsz = _libraries['libcuda.so'].cuStreamAttachMemAsync_ptsz + cuStreamAttachMemAsync_ptsz.restype = CUresult + cuStreamAttachMemAsync_ptsz.argtypes = [CUstream, CUdeviceptr, size_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamAttachMemAsync = _libraries['libcuda.so'].cuStreamAttachMemAsync - cuStreamAttachMemAsync.restype = CUresult - cuStreamAttachMemAsync.argtypes = [CUstream, CUdeviceptr, size_t, ctypes.c_uint32] + cuStreamQuery_ptsz = _libraries['libcuda.so'].cuStreamQuery_ptsz + cuStreamQuery_ptsz.restype = CUresult + cuStreamQuery_ptsz.argtypes = [CUstream] except AttributeError: pass try: - cuStreamQuery = _libraries['libcuda.so'].cuStreamQuery - cuStreamQuery.restype = CUresult - cuStreamQuery.argtypes = 
[CUstream] -except AttributeError: - pass -try: - cuStreamSynchronize = _libraries['libcuda.so'].cuStreamSynchronize - cuStreamSynchronize.restype = CUresult - cuStreamSynchronize.argtypes = [CUstream] + cuStreamSynchronize_ptsz = _libraries['libcuda.so'].cuStreamSynchronize_ptsz + cuStreamSynchronize_ptsz.restype = CUresult + cuStreamSynchronize_ptsz.argtypes = [CUstream] except AttributeError: pass try: @@ -3913,21 +4509,21 @@ try: except AttributeError: pass try: - cuStreamCopyAttributes = _libraries['libcuda.so'].cuStreamCopyAttributes - cuStreamCopyAttributes.restype = CUresult - cuStreamCopyAttributes.argtypes = [CUstream, CUstream] + cuStreamCopyAttributes_ptsz = _libraries['libcuda.so'].cuStreamCopyAttributes_ptsz + cuStreamCopyAttributes_ptsz.restype = CUresult + cuStreamCopyAttributes_ptsz.argtypes = [CUstream, CUstream] except AttributeError: pass try: - cuStreamGetAttribute = _libraries['libcuda.so'].cuStreamGetAttribute - cuStreamGetAttribute.restype = CUresult - cuStreamGetAttribute.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUstreamAttrValue_union)] + cuStreamGetAttribute_ptsz = _libraries['libcuda.so'].cuStreamGetAttribute_ptsz + cuStreamGetAttribute_ptsz.restype = CUresult + cuStreamGetAttribute_ptsz.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] except AttributeError: pass try: - cuStreamSetAttribute = _libraries['libcuda.so'].cuStreamSetAttribute - cuStreamSetAttribute.restype = CUresult - cuStreamSetAttribute.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUstreamAttrValue_union)] + cuStreamSetAttribute_ptsz = _libraries['libcuda.so'].cuStreamSetAttribute_ptsz + cuStreamSetAttribute_ptsz.restype = CUresult + cuStreamSetAttribute_ptsz.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] except AttributeError: pass try: @@ -3937,15 +4533,15 @@ try: except AttributeError: pass try: - cuEventRecord = _libraries['libcuda.so'].cuEventRecord - 
cuEventRecord.restype = CUresult - cuEventRecord.argtypes = [CUevent, CUstream] + cuEventRecord_ptsz = _libraries['libcuda.so'].cuEventRecord_ptsz + cuEventRecord_ptsz.restype = CUresult + cuEventRecord_ptsz.argtypes = [CUevent, CUstream] except AttributeError: pass try: - cuEventRecordWithFlags = _libraries['libcuda.so'].cuEventRecordWithFlags - cuEventRecordWithFlags.restype = CUresult - cuEventRecordWithFlags.argtypes = [CUevent, CUstream, ctypes.c_uint32] + cuEventRecordWithFlags_ptsz = _libraries['libcuda.so'].cuEventRecordWithFlags_ptsz + cuEventRecordWithFlags_ptsz.restype = CUresult + cuEventRecordWithFlags_ptsz.argtypes = [CUevent, CUstream, ctypes.c_uint32] except AttributeError: pass try: @@ -4003,15 +4599,15 @@ try: except AttributeError: pass try: - cuSignalExternalSemaphoresAsync = _libraries['libcuda.so'].cuSignalExternalSemaphoresAsync - cuSignalExternalSemaphoresAsync.restype = CUresult - cuSignalExternalSemaphoresAsync.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st), ctypes.c_uint32, CUstream] + cuSignalExternalSemaphoresAsync_ptsz = _libraries['libcuda.so'].cuSignalExternalSemaphoresAsync_ptsz + cuSignalExternalSemaphoresAsync_ptsz.restype = CUresult + cuSignalExternalSemaphoresAsync_ptsz.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st), ctypes.c_uint32, CUstream] except AttributeError: pass try: - cuWaitExternalSemaphoresAsync = _libraries['libcuda.so'].cuWaitExternalSemaphoresAsync - cuWaitExternalSemaphoresAsync.restype = CUresult - cuWaitExternalSemaphoresAsync.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st), ctypes.c_uint32, CUstream] + cuWaitExternalSemaphoresAsync_ptsz = _libraries['libcuda.so'].cuWaitExternalSemaphoresAsync_ptsz + cuWaitExternalSemaphoresAsync_ptsz.restype = CUresult + 
cuWaitExternalSemaphoresAsync_ptsz.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st), ctypes.c_uint32, CUstream] except AttributeError: pass try: @@ -4021,33 +4617,33 @@ try: except AttributeError: pass try: - cuStreamWaitValue32 = _libraries['libcuda.so'].cuStreamWaitValue32 - cuStreamWaitValue32.restype = CUresult - cuStreamWaitValue32.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] + cuStreamWaitValue32_v2_ptsz = _libraries['libcuda.so'].cuStreamWaitValue32_v2_ptsz + cuStreamWaitValue32_v2_ptsz.restype = CUresult + cuStreamWaitValue32_v2_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamWaitValue64 = _libraries['libcuda.so'].cuStreamWaitValue64 - cuStreamWaitValue64.restype = CUresult - cuStreamWaitValue64.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] + cuStreamWaitValue64_v2_ptsz = _libraries['libcuda.so'].cuStreamWaitValue64_v2_ptsz + cuStreamWaitValue64_v2_ptsz.restype = CUresult + cuStreamWaitValue64_v2_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamWriteValue32 = _libraries['libcuda.so'].cuStreamWriteValue32 - cuStreamWriteValue32.restype = CUresult - cuStreamWriteValue32.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] + cuStreamWriteValue32_v2_ptsz = _libraries['libcuda.so'].cuStreamWriteValue32_v2_ptsz + cuStreamWriteValue32_v2_ptsz.restype = CUresult + cuStreamWriteValue32_v2_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamWriteValue64 = _libraries['libcuda.so'].cuStreamWriteValue64 - cuStreamWriteValue64.restype = CUresult - cuStreamWriteValue64.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] + cuStreamWriteValue64_v2_ptsz = _libraries['libcuda.so'].cuStreamWriteValue64_v2_ptsz + cuStreamWriteValue64_v2_ptsz.restype = 
CUresult + cuStreamWriteValue64_v2_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] except AttributeError: pass try: - cuStreamBatchMemOp = _libraries['libcuda.so'].cuStreamBatchMemOp - cuStreamBatchMemOp.restype = CUresult - cuStreamBatchMemOp.argtypes = [CUstream, ctypes.c_uint32, ctypes.POINTER(union_CUstreamBatchMemOpParams_union), ctypes.c_uint32] + cuStreamBatchMemOp_v2_ptsz = _libraries['libcuda.so'].cuStreamBatchMemOp_v2_ptsz + cuStreamBatchMemOp_v2_ptsz.restype = CUresult + cuStreamBatchMemOp_v2_ptsz.argtypes = [CUstream, ctypes.c_uint32, ctypes.POINTER(union_CUstreamBatchMemOpParams_union), ctypes.c_uint32] except AttributeError: pass try: @@ -4081,15 +4677,21 @@ try: except AttributeError: pass try: - cuLaunchKernel = _libraries['libcuda.so'].cuLaunchKernel - cuLaunchKernel.restype = CUresult - cuLaunchKernel.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(None))] + cuLaunchKernel_ptsz = _libraries['libcuda.so'].cuLaunchKernel_ptsz + cuLaunchKernel_ptsz.restype = CUresult + cuLaunchKernel_ptsz.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(None))] except AttributeError: pass try: - cuLaunchCooperativeKernel = _libraries['libcuda.so'].cuLaunchCooperativeKernel - cuLaunchCooperativeKernel.restype = CUresult - cuLaunchCooperativeKernel.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None))] + cuLaunchKernelEx_ptsz = _libraries['libcuda.so'].cuLaunchKernelEx_ptsz + cuLaunchKernelEx_ptsz.restype = CUresult + cuLaunchKernelEx_ptsz.argtypes = 
[ctypes.POINTER(struct_CUlaunchConfig_st), CUfunction, ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + cuLaunchCooperativeKernel_ptsz = _libraries['libcuda.so'].cuLaunchCooperativeKernel_ptsz + cuLaunchCooperativeKernel_ptsz.restype = CUresult + cuLaunchCooperativeKernel_ptsz.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None))] except AttributeError: pass try: @@ -4099,9 +4701,9 @@ try: except AttributeError: pass try: - cuLaunchHostFunc = _libraries['libcuda.so'].cuLaunchHostFunc - cuLaunchHostFunc.restype = CUresult - cuLaunchHostFunc.argtypes = [CUstream, CUhostFn, ctypes.POINTER(None)] + cuLaunchHostFunc_ptsz = _libraries['libcuda.so'].cuLaunchHostFunc_ptsz + cuLaunchHostFunc_ptsz.restype = CUresult + cuLaunchHostFunc_ptsz.argtypes = [CUstream, CUhostFn, ctypes.POINTER(None)] except AttributeError: pass try: @@ -4171,21 +4773,21 @@ try: except AttributeError: pass try: - cuGraphAddKernelNode = _libraries['libcuda.so'].cuGraphAddKernelNode - cuGraphAddKernelNode.restype = CUresult - cuGraphAddKernelNode.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] + cuGraphAddKernelNode_v2 = _libraries['libcuda.so'].cuGraphAddKernelNode_v2 + cuGraphAddKernelNode_v2.restype = CUresult + cuGraphAddKernelNode_v2.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_v2_st)] except AttributeError: pass try: - cuGraphKernelNodeGetParams = _libraries['libcuda.so'].cuGraphKernelNodeGetParams - cuGraphKernelNodeGetParams.restype = CUresult - cuGraphKernelNodeGetParams.argtypes = [CUgraphNode, 
ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] + cuGraphKernelNodeGetParams_v2 = _libraries['libcuda.so'].cuGraphKernelNodeGetParams_v2 + cuGraphKernelNodeGetParams_v2.restype = CUresult + cuGraphKernelNodeGetParams_v2.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_v2_st)] except AttributeError: pass try: - cuGraphKernelNodeSetParams = _libraries['libcuda.so'].cuGraphKernelNodeSetParams - cuGraphKernelNodeSetParams.restype = CUresult - cuGraphKernelNodeSetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] + cuGraphKernelNodeSetParams_v2 = _libraries['libcuda.so'].cuGraphKernelNodeSetParams_v2 + cuGraphKernelNodeSetParams_v2.restype = CUresult + cuGraphKernelNodeSetParams_v2.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_v2_st)] except AttributeError: pass try: @@ -4332,6 +4934,30 @@ try: cuGraphExternalSemaphoresWaitNodeSetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_EXT_SEM_WAIT_NODE_PARAMS_st)] except AttributeError: pass +try: + cuGraphAddBatchMemOpNode = _libraries['libcuda.so'].cuGraphAddBatchMemOpNode + cuGraphAddBatchMemOpNode.restype = CUresult + cuGraphAddBatchMemOpNode.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.POINTER(struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphBatchMemOpNodeGetParams = _libraries['libcuda.so'].cuGraphBatchMemOpNodeGetParams + cuGraphBatchMemOpNodeGetParams.restype = CUresult + cuGraphBatchMemOpNodeGetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphBatchMemOpNodeSetParams = _libraries['libcuda.so'].cuGraphBatchMemOpNodeSetParams + cuGraphBatchMemOpNodeSetParams.restype = CUresult + cuGraphBatchMemOpNodeSetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st)] +except 
AttributeError: + pass +try: + cuGraphExecBatchMemOpNodeSetParams = _libraries['libcuda.so'].cuGraphExecBatchMemOpNodeSetParams + cuGraphExecBatchMemOpNodeSetParams.restype = CUresult + cuGraphExecBatchMemOpNodeSetParams.argtypes = [CUgraphExec, CUgraphNode, ctypes.POINTER(struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st)] +except AttributeError: + pass try: cuGraphAddMemAllocNode = _libraries['libcuda.so'].cuGraphAddMemAllocNode cuGraphAddMemAllocNode.restype = CUresult @@ -4440,12 +5066,6 @@ try: cuGraphDestroyNode.argtypes = [CUgraphNode] except AttributeError: pass -try: - cuGraphInstantiate_v2 = _libraries['libcuda.so'].cuGraphInstantiate_v2 - cuGraphInstantiate_v2.restype = CUresult - cuGraphInstantiate_v2.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphExec_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), ctypes.POINTER(ctypes.c_char), size_t] -except AttributeError: - pass try: cuGraphInstantiateWithFlags = _libraries['libcuda.so'].cuGraphInstantiateWithFlags cuGraphInstantiateWithFlags.restype = CUresult @@ -4453,9 +5073,21 @@ try: except AttributeError: pass try: - cuGraphExecKernelNodeSetParams = _libraries['libcuda.so'].cuGraphExecKernelNodeSetParams - cuGraphExecKernelNodeSetParams.restype = CUresult - cuGraphExecKernelNodeSetParams.argtypes = [CUgraphExec, CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] + cuGraphInstantiateWithParams_ptsz = _libraries['libcuda.so'].cuGraphInstantiateWithParams_ptsz + cuGraphInstantiateWithParams_ptsz.restype = CUresult + cuGraphInstantiateWithParams_ptsz.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphExec_st)), CUgraph, ctypes.POINTER(struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphExecGetFlags = _libraries['libcuda.so'].cuGraphExecGetFlags + cuGraphExecGetFlags.restype = CUresult + cuGraphExecGetFlags.argtypes = [CUgraphExec, ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + cuGraphExecKernelNodeSetParams_v2 = 
_libraries['libcuda.so'].cuGraphExecKernelNodeSetParams_v2 + cuGraphExecKernelNodeSetParams_v2.restype = CUresult + cuGraphExecKernelNodeSetParams_v2.argtypes = [CUgraphExec, CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_v2_st)] except AttributeError: pass try: @@ -4507,15 +5139,27 @@ try: except AttributeError: pass try: - cuGraphUpload = _libraries['libcuda.so'].cuGraphUpload - cuGraphUpload.restype = CUresult - cuGraphUpload.argtypes = [CUgraphExec, CUstream] + cuGraphNodeSetEnabled = _libraries['libcuda.so'].cuGraphNodeSetEnabled + cuGraphNodeSetEnabled.restype = CUresult + cuGraphNodeSetEnabled.argtypes = [CUgraphExec, CUgraphNode, ctypes.c_uint32] except AttributeError: pass try: - cuGraphLaunch = _libraries['libcuda.so'].cuGraphLaunch - cuGraphLaunch.restype = CUresult - cuGraphLaunch.argtypes = [CUgraphExec, CUstream] + cuGraphNodeGetEnabled = _libraries['libcuda.so'].cuGraphNodeGetEnabled + cuGraphNodeGetEnabled.restype = CUresult + cuGraphNodeGetEnabled.argtypes = [CUgraphExec, CUgraphNode, ctypes.POINTER(ctypes.c_uint32)] +except AttributeError: + pass +try: + cuGraphUpload_ptsz = _libraries['libcuda.so'].cuGraphUpload_ptsz + cuGraphUpload_ptsz.restype = CUresult + cuGraphUpload_ptsz.argtypes = [CUgraphExec, CUstream] +except AttributeError: + pass +try: + cuGraphLaunch_ptsz = _libraries['libcuda.so'].cuGraphLaunch_ptsz + cuGraphLaunch_ptsz.restype = CUresult + cuGraphLaunch_ptsz.argtypes = [CUgraphExec, CUstream] except AttributeError: pass try: @@ -4531,9 +5175,9 @@ try: except AttributeError: pass try: - cuGraphExecUpdate = _libraries['libcuda.so'].cuGraphExecUpdate - cuGraphExecUpdate.restype = CUresult - cuGraphExecUpdate.argtypes = [CUgraphExec, CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), ctypes.POINTER(CUgraphExecUpdateResult_enum)] + cuGraphExecUpdate_v2 = _libraries['libcuda.so'].cuGraphExecUpdate_v2 + cuGraphExecUpdate_v2.restype = CUresult + cuGraphExecUpdate_v2.argtypes = [CUgraphExec, CUgraph, 
ctypes.POINTER(struct_CUgraphExecUpdateResultInfo_st)] except AttributeError: pass try: @@ -4545,13 +5189,13 @@ except AttributeError: try: cuGraphKernelNodeGetAttribute = _libraries['libcuda.so'].cuGraphKernelNodeGetAttribute cuGraphKernelNodeGetAttribute.restype = CUresult - cuGraphKernelNodeGetAttribute.argtypes = [CUgraphNode, CUkernelNodeAttrID, ctypes.POINTER(union_CUkernelNodeAttrValue_union)] + cuGraphKernelNodeGetAttribute.argtypes = [CUgraphNode, CUkernelNodeAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] except AttributeError: pass try: cuGraphKernelNodeSetAttribute = _libraries['libcuda.so'].cuGraphKernelNodeSetAttribute cuGraphKernelNodeSetAttribute.restype = CUresult - cuGraphKernelNodeSetAttribute.argtypes = [CUgraphNode, CUkernelNodeAttrID, ctypes.POINTER(union_CUkernelNodeAttrValue_union)] + cuGraphKernelNodeSetAttribute.argtypes = [CUgraphNode, CUkernelNodeAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] except AttributeError: pass try: @@ -4620,6 +5264,18 @@ try: cuOccupancyAvailableDynamicSMemPerBlock.argtypes = [ctypes.POINTER(ctypes.c_uint64), CUfunction, ctypes.c_int32, ctypes.c_int32] except AttributeError: pass +try: + cuOccupancyMaxPotentialClusterSize = _libraries['libcuda.so'].cuOccupancyMaxPotentialClusterSize + cuOccupancyMaxPotentialClusterSize.restype = CUresult + cuOccupancyMaxPotentialClusterSize.argtypes = [ctypes.POINTER(ctypes.c_int32), CUfunction, ctypes.POINTER(struct_CUlaunchConfig_st)] +except AttributeError: + pass +try: + cuOccupancyMaxActiveClusters = _libraries['libcuda.so'].cuOccupancyMaxActiveClusters + cuOccupancyMaxActiveClusters.restype = CUresult + cuOccupancyMaxActiveClusters.argtypes = [ctypes.POINTER(ctypes.c_int32), CUfunction, ctypes.POINTER(struct_CUlaunchConfig_st)] +except AttributeError: + pass try: cuTexRefSetArray = _libraries['libcuda.so'].cuTexRefSetArray cuTexRefSetArray.restype = CUresult @@ -4842,6 +5498,24 @@ try: cuSurfObjectGetResourceDesc.argtypes = 
[ctypes.POINTER(struct_CUDA_RESOURCE_DESC_st), CUsurfObject] except AttributeError: pass +try: + cuTensorMapEncodeTiled = _libraries['libcuda.so'].cuTensorMapEncodeTiled + cuTensorMapEncodeTiled.restype = CUresult + cuTensorMapEncodeTiled.argtypes = [ctypes.POINTER(struct_CUtensorMap_st), CUtensorMapDataType, cuuint32_t, ctypes.POINTER(None), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32), CUtensorMapInterleave, CUtensorMapSwizzle, CUtensorMapL2promotion, CUtensorMapFloatOOBfill] +except AttributeError: + pass +try: + cuTensorMapEncodeIm2col = _libraries['libcuda.so'].cuTensorMapEncodeIm2col + cuTensorMapEncodeIm2col.restype = CUresult + cuTensorMapEncodeIm2col.argtypes = [ctypes.POINTER(struct_CUtensorMap_st), CUtensorMapDataType, cuuint32_t, ctypes.POINTER(None), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_int32), ctypes.POINTER(ctypes.c_int32), cuuint32_t, cuuint32_t, ctypes.POINTER(ctypes.c_uint32), CUtensorMapInterleave, CUtensorMapSwizzle, CUtensorMapL2promotion, CUtensorMapFloatOOBfill] +except AttributeError: + pass +try: + cuTensorMapReplaceAddress = _libraries['libcuda.so'].cuTensorMapReplaceAddress + cuTensorMapReplaceAddress.restype = CUresult + cuTensorMapReplaceAddress.argtypes = [ctypes.POINTER(struct_CUtensorMap_st), ctypes.POINTER(None)] +except AttributeError: + pass try: cuDeviceCanAccessPeer = _libraries['libcuda.so'].cuDeviceCanAccessPeer cuDeviceCanAccessPeer.restype = CUresult @@ -4896,6 +5570,759 @@ try: cuGraphicsResourceSetMapFlags_v2.argtypes = [CUgraphicsResource, ctypes.c_uint32] except AttributeError: pass +try: + cuGraphicsMapResources_ptsz = _libraries['libcuda.so'].cuGraphicsMapResources_ptsz + cuGraphicsMapResources_ptsz.restype = CUresult + cuGraphicsMapResources_ptsz.argtypes = [ctypes.c_uint32, ctypes.POINTER(ctypes.POINTER(struct_CUgraphicsResource_st)), CUstream] +except AttributeError: + pass +try: 
+ cuGraphicsUnmapResources_ptsz = _libraries['libcuda.so'].cuGraphicsUnmapResources_ptsz + cuGraphicsUnmapResources_ptsz.restype = CUresult + cuGraphicsUnmapResources_ptsz.argtypes = [ctypes.c_uint32, ctypes.POINTER(ctypes.POINTER(struct_CUgraphicsResource_st)), CUstream] +except AttributeError: + pass +try: + cuGetProcAddress_v2 = _libraries['libcuda.so'].cuGetProcAddress_v2 + cuGetProcAddress_v2.restype = CUresult + cuGetProcAddress_v2.argtypes = [ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_int32, cuuint64_t, ctypes.POINTER(CUdriverProcAddressQueryResult_enum)] +except AttributeError: + pass +try: + cuGetExportTable = _libraries['libcuda.so'].cuGetExportTable + cuGetExportTable.restype = CUresult + cuGetExportTable.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(struct_CUuuid_st)] +except AttributeError: + pass +try: + cuMemHostRegister = _libraries['libcuda.so'].cuMemHostRegister + cuMemHostRegister.restype = CUresult + cuMemHostRegister.argtypes = [ctypes.POINTER(None), size_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuGraphicsResourceSetMapFlags = _libraries['libcuda.so'].cuGraphicsResourceSetMapFlags + cuGraphicsResourceSetMapFlags.restype = CUresult + cuGraphicsResourceSetMapFlags.argtypes = [CUgraphicsResource, ctypes.c_uint32] +except AttributeError: + pass +try: + cuLinkCreate = _libraries['libcuda.so'].cuLinkCreate + cuLinkCreate.restype = CUresult + cuLinkCreate.argtypes = [ctypes.c_uint32, ctypes.POINTER(CUjit_option_enum), ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(struct_CUlinkState_st))] +except AttributeError: + pass +try: + cuLinkAddData = _libraries['libcuda.so'].cuLinkAddData + cuLinkAddData.restype = CUresult + cuLinkAddData.argtypes = [CUlinkState, CUjitInputType, ctypes.POINTER(None), size_t, ctypes.POINTER(ctypes.c_char), ctypes.c_uint32, ctypes.POINTER(CUjit_option_enum), ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + 
cuLinkAddFile = _libraries['libcuda.so'].cuLinkAddFile + cuLinkAddFile.restype = CUresult + cuLinkAddFile.argtypes = [CUlinkState, CUjitInputType, ctypes.POINTER(ctypes.c_char), ctypes.c_uint32, ctypes.POINTER(CUjit_option_enum), ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + cuTexRefSetAddress2D_v2 = _libraries['libcuda.so'].cuTexRefSetAddress2D_v2 + cuTexRefSetAddress2D_v2.restype = CUresult + cuTexRefSetAddress2D_v2.argtypes = [CUtexref, ctypes.POINTER(struct_CUDA_ARRAY_DESCRIPTOR_st), CUdeviceptr, size_t] +except AttributeError: + pass +CUdeviceptr_v1 = ctypes.c_uint32 +class struct_CUDA_MEMCPY2D_v1_st(Structure): + pass + +struct_CUDA_MEMCPY2D_v1_st._pack_ = 1 # source:False +struct_CUDA_MEMCPY2D_v1_st._fields_ = [ + ('srcXInBytes', ctypes.c_uint32), + ('srcY', ctypes.c_uint32), + ('srcMemoryType', CUmemorytype), + ('PADDING_0', ctypes.c_ubyte * 4), + ('srcHost', ctypes.POINTER(None)), + ('srcDevice', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('srcArray', ctypes.POINTER(struct_CUarray_st)), + ('srcPitch', ctypes.c_uint32), + ('dstXInBytes', ctypes.c_uint32), + ('dstY', ctypes.c_uint32), + ('dstMemoryType', CUmemorytype), + ('dstHost', ctypes.POINTER(None)), + ('dstDevice', ctypes.c_uint32), + ('PADDING_2', ctypes.c_ubyte * 4), + ('dstArray', ctypes.POINTER(struct_CUarray_st)), + ('dstPitch', ctypes.c_uint32), + ('WidthInBytes', ctypes.c_uint32), + ('Height', ctypes.c_uint32), + ('PADDING_3', ctypes.c_ubyte * 4), +] + +CUDA_MEMCPY2D_v1 = struct_CUDA_MEMCPY2D_v1_st +class struct_CUDA_MEMCPY3D_v1_st(Structure): + pass + +struct_CUDA_MEMCPY3D_v1_st._pack_ = 1 # source:False +struct_CUDA_MEMCPY3D_v1_st._fields_ = [ + ('srcXInBytes', ctypes.c_uint32), + ('srcY', ctypes.c_uint32), + ('srcZ', ctypes.c_uint32), + ('srcLOD', ctypes.c_uint32), + ('srcMemoryType', CUmemorytype), + ('PADDING_0', ctypes.c_ubyte * 4), + ('srcHost', ctypes.POINTER(None)), + ('srcDevice', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + 
('srcArray', ctypes.POINTER(struct_CUarray_st)), + ('reserved0', ctypes.POINTER(None)), + ('srcPitch', ctypes.c_uint32), + ('srcHeight', ctypes.c_uint32), + ('dstXInBytes', ctypes.c_uint32), + ('dstY', ctypes.c_uint32), + ('dstZ', ctypes.c_uint32), + ('dstLOD', ctypes.c_uint32), + ('dstMemoryType', CUmemorytype), + ('PADDING_2', ctypes.c_ubyte * 4), + ('dstHost', ctypes.POINTER(None)), + ('dstDevice', ctypes.c_uint32), + ('PADDING_3', ctypes.c_ubyte * 4), + ('dstArray', ctypes.POINTER(struct_CUarray_st)), + ('reserved1', ctypes.POINTER(None)), + ('dstPitch', ctypes.c_uint32), + ('dstHeight', ctypes.c_uint32), + ('WidthInBytes', ctypes.c_uint32), + ('Height', ctypes.c_uint32), + ('Depth', ctypes.c_uint32), + ('PADDING_4', ctypes.c_ubyte * 4), +] + +CUDA_MEMCPY3D_v1 = struct_CUDA_MEMCPY3D_v1_st +class struct_CUDA_ARRAY_DESCRIPTOR_v1_st(Structure): + pass + +struct_CUDA_ARRAY_DESCRIPTOR_v1_st._pack_ = 1 # source:False +struct_CUDA_ARRAY_DESCRIPTOR_v1_st._fields_ = [ + ('Width', ctypes.c_uint32), + ('Height', ctypes.c_uint32), + ('Format', CUarray_format), + ('NumChannels', ctypes.c_uint32), +] + +CUDA_ARRAY_DESCRIPTOR_v1 = struct_CUDA_ARRAY_DESCRIPTOR_v1_st +class struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st(Structure): + pass + +struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st._pack_ = 1 # source:False +struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st._fields_ = [ + ('Width', ctypes.c_uint32), + ('Height', ctypes.c_uint32), + ('Depth', ctypes.c_uint32), + ('Format', CUarray_format), + ('NumChannels', ctypes.c_uint32), + ('Flags', ctypes.c_uint32), +] + +CUDA_ARRAY3D_DESCRIPTOR_v1 = struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st +try: + cuDeviceTotalMem = _libraries['libcuda.so'].cuDeviceTotalMem + cuDeviceTotalMem.restype = CUresult + cuDeviceTotalMem.argtypes = [ctypes.POINTER(ctypes.c_uint32), CUdevice] +except AttributeError: + pass +try: + cuCtxCreate = _libraries['libcuda.so'].cuCtxCreate + cuCtxCreate.restype = CUresult + cuCtxCreate.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUctx_st)), 
ctypes.c_uint32, CUdevice] +except AttributeError: + pass +try: + cuModuleGetGlobal = _libraries['libcuda.so'].cuModuleGetGlobal + cuModuleGetGlobal.restype = CUresult + cuModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32), CUmodule, ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass +try: + cuMemGetInfo = _libraries['libcuda.so'].cuMemGetInfo + cuMemGetInfo.restype = CUresult + cuMemGetInfo.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32)] +except AttributeError: + pass +try: + cuMemAlloc = _libraries['libcuda.so'].cuMemAlloc + cuMemAlloc.restype = CUresult + cuMemAlloc.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemAllocPitch = _libraries['libcuda.so'].cuMemAllocPitch + cuMemAllocPitch.restype = CUresult + cuMemAllocPitch.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32), ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemFree = _libraries['libcuda.so'].cuMemFree + cuMemFree.restype = CUresult + cuMemFree.argtypes = [CUdeviceptr_v1] +except AttributeError: + pass +try: + cuMemGetAddressRange = _libraries['libcuda.so'].cuMemGetAddressRange + cuMemGetAddressRange.restype = CUresult + cuMemGetAddressRange.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32), CUdeviceptr_v1] +except AttributeError: + pass +try: + cuMemAllocHost = _libraries['libcuda.so'].cuMemAllocHost + cuMemAllocHost.restype = CUresult + cuMemAllocHost.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemHostGetDevicePointer = _libraries['libcuda.so'].cuMemHostGetDevicePointer + cuMemHostGetDevicePointer.restype = CUresult + cuMemHostGetDevicePointer.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyHtoD = 
_libraries['libcuda.so'].cuMemcpyHtoD + cuMemcpyHtoD.restype = CUresult + cuMemcpyHtoD.argtypes = [CUdeviceptr_v1, ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyDtoH = _libraries['libcuda.so'].cuMemcpyDtoH + cuMemcpyDtoH.restype = CUresult + cuMemcpyDtoH.argtypes = [ctypes.POINTER(None), CUdeviceptr_v1, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyDtoD = _libraries['libcuda.so'].cuMemcpyDtoD + cuMemcpyDtoD.restype = CUresult + cuMemcpyDtoD.argtypes = [CUdeviceptr_v1, CUdeviceptr_v1, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyDtoA = _libraries['libcuda.so'].cuMemcpyDtoA + cuMemcpyDtoA.restype = CUresult + cuMemcpyDtoA.argtypes = [CUarray, ctypes.c_uint32, CUdeviceptr_v1, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyAtoD = _libraries['libcuda.so'].cuMemcpyAtoD + cuMemcpyAtoD.restype = CUresult + cuMemcpyAtoD.argtypes = [CUdeviceptr_v1, CUarray, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyHtoA = _libraries['libcuda.so'].cuMemcpyHtoA + cuMemcpyHtoA.restype = CUresult + cuMemcpyHtoA.argtypes = [CUarray, ctypes.c_uint32, ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyAtoH = _libraries['libcuda.so'].cuMemcpyAtoH + cuMemcpyAtoH.restype = CUresult + cuMemcpyAtoH.argtypes = [ctypes.POINTER(None), CUarray, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyAtoA = _libraries['libcuda.so'].cuMemcpyAtoA + cuMemcpyAtoA.restype = CUresult + cuMemcpyAtoA.argtypes = [CUarray, ctypes.c_uint32, CUarray, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyHtoAAsync = _libraries['libcuda.so'].cuMemcpyHtoAAsync + cuMemcpyHtoAAsync.restype = CUresult + cuMemcpyHtoAAsync.argtypes = [CUarray, ctypes.c_uint32, ctypes.POINTER(None), ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemcpyAtoHAsync = _libraries['libcuda.so'].cuMemcpyAtoHAsync + 
cuMemcpyAtoHAsync.restype = CUresult + cuMemcpyAtoHAsync.argtypes = [ctypes.POINTER(None), CUarray, ctypes.c_uint32, ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemcpy2D = _libraries['libcuda.so'].cuMemcpy2D + cuMemcpy2D.restype = CUresult + cuMemcpy2D.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_v1_st)] +except AttributeError: + pass +try: + cuMemcpy2DUnaligned = _libraries['libcuda.so'].cuMemcpy2DUnaligned + cuMemcpy2DUnaligned.restype = CUresult + cuMemcpy2DUnaligned.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_v1_st)] +except AttributeError: + pass +try: + cuMemcpy3D = _libraries['libcuda.so'].cuMemcpy3D + cuMemcpy3D.restype = CUresult + cuMemcpy3D.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_v1_st)] +except AttributeError: + pass +try: + cuMemcpyHtoDAsync = _libraries['libcuda.so'].cuMemcpyHtoDAsync + cuMemcpyHtoDAsync.restype = CUresult + cuMemcpyHtoDAsync.argtypes = [CUdeviceptr_v1, ctypes.POINTER(None), ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemcpyDtoHAsync = _libraries['libcuda.so'].cuMemcpyDtoHAsync + cuMemcpyDtoHAsync.restype = CUresult + cuMemcpyDtoHAsync.argtypes = [ctypes.POINTER(None), CUdeviceptr_v1, ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemcpyDtoDAsync = _libraries['libcuda.so'].cuMemcpyDtoDAsync + cuMemcpyDtoDAsync.restype = CUresult + cuMemcpyDtoDAsync.argtypes = [CUdeviceptr_v1, CUdeviceptr_v1, ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemcpy2DAsync = _libraries['libcuda.so'].cuMemcpy2DAsync + cuMemcpy2DAsync.restype = CUresult + cuMemcpy2DAsync.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_v1_st), CUstream] +except AttributeError: + pass +try: + cuMemcpy3DAsync = _libraries['libcuda.so'].cuMemcpy3DAsync + cuMemcpy3DAsync.restype = CUresult + cuMemcpy3DAsync.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_v1_st), CUstream] +except AttributeError: + pass +try: + cuMemsetD8 = _libraries['libcuda.so'].cuMemsetD8 + 
cuMemsetD8.restype = CUresult + cuMemsetD8.argtypes = [CUdeviceptr_v1, ctypes.c_ubyte, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemsetD16 = _libraries['libcuda.so'].cuMemsetD16 + cuMemsetD16.restype = CUresult + cuMemsetD16.argtypes = [CUdeviceptr_v1, ctypes.c_uint16, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemsetD32 = _libraries['libcuda.so'].cuMemsetD32 + cuMemsetD32.restype = CUresult + cuMemsetD32.argtypes = [CUdeviceptr_v1, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemsetD2D8 = _libraries['libcuda.so'].cuMemsetD2D8 + cuMemsetD2D8.restype = CUresult + cuMemsetD2D8.argtypes = [CUdeviceptr_v1, ctypes.c_uint32, ctypes.c_ubyte, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemsetD2D16 = _libraries['libcuda.so'].cuMemsetD2D16 + cuMemsetD2D16.restype = CUresult + cuMemsetD2D16.argtypes = [CUdeviceptr_v1, ctypes.c_uint32, ctypes.c_uint16, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemsetD2D32 = _libraries['libcuda.so'].cuMemsetD2D32 + cuMemsetD2D32.restype = CUresult + cuMemsetD2D32.argtypes = [CUdeviceptr_v1, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + cuArrayCreate = _libraries['libcuda.so'].cuArrayCreate + cuArrayCreate.restype = CUresult + cuArrayCreate.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUarray_st)), ctypes.POINTER(struct_CUDA_ARRAY_DESCRIPTOR_v1_st)] +except AttributeError: + pass +try: + cuArrayGetDescriptor = _libraries['libcuda.so'].cuArrayGetDescriptor + cuArrayGetDescriptor.restype = CUresult + cuArrayGetDescriptor.argtypes = [ctypes.POINTER(struct_CUDA_ARRAY_DESCRIPTOR_v1_st), CUarray] +except AttributeError: + pass +try: + cuArray3DCreate = _libraries['libcuda.so'].cuArray3DCreate + cuArray3DCreate.restype = CUresult + cuArray3DCreate.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUarray_st)), ctypes.POINTER(struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st)] 
+except AttributeError: + pass +try: + cuArray3DGetDescriptor = _libraries['libcuda.so'].cuArray3DGetDescriptor + cuArray3DGetDescriptor.restype = CUresult + cuArray3DGetDescriptor.argtypes = [ctypes.POINTER(struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st), CUarray] +except AttributeError: + pass +try: + cuTexRefSetAddress = _libraries['libcuda.so'].cuTexRefSetAddress + cuTexRefSetAddress.restype = CUresult + cuTexRefSetAddress.argtypes = [ctypes.POINTER(ctypes.c_uint32), CUtexref, CUdeviceptr_v1, ctypes.c_uint32] +except AttributeError: + pass +try: + cuTexRefSetAddress2D = _libraries['libcuda.so'].cuTexRefSetAddress2D + cuTexRefSetAddress2D.restype = CUresult + cuTexRefSetAddress2D.argtypes = [CUtexref, ctypes.POINTER(struct_CUDA_ARRAY_DESCRIPTOR_v1_st), CUdeviceptr_v1, ctypes.c_uint32] +except AttributeError: + pass +try: + cuTexRefGetAddress = _libraries['libcuda.so'].cuTexRefGetAddress + cuTexRefGetAddress.restype = CUresult + cuTexRefGetAddress.argtypes = [ctypes.POINTER(ctypes.c_uint32), CUtexref] +except AttributeError: + pass +try: + cuGraphicsResourceGetMappedPointer = _libraries['libcuda.so'].cuGraphicsResourceGetMappedPointer + cuGraphicsResourceGetMappedPointer.restype = CUresult + cuGraphicsResourceGetMappedPointer.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32), CUgraphicsResource] +except AttributeError: + pass +try: + cuCtxDestroy = _libraries['libcuda.so'].cuCtxDestroy + cuCtxDestroy.restype = CUresult + cuCtxDestroy.argtypes = [CUcontext] +except AttributeError: + pass +try: + cuCtxPopCurrent = _libraries['libcuda.so'].cuCtxPopCurrent + cuCtxPopCurrent.restype = CUresult + cuCtxPopCurrent.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUctx_st))] +except AttributeError: + pass +try: + cuCtxPushCurrent = _libraries['libcuda.so'].cuCtxPushCurrent + cuCtxPushCurrent.restype = CUresult + cuCtxPushCurrent.argtypes = [CUcontext] +except AttributeError: + pass +try: + cuStreamDestroy = _libraries['libcuda.so'].cuStreamDestroy + 
cuStreamDestroy.restype = CUresult + cuStreamDestroy.argtypes = [CUstream] +except AttributeError: + pass +try: + cuEventDestroy = _libraries['libcuda.so'].cuEventDestroy + cuEventDestroy.restype = CUresult + cuEventDestroy.argtypes = [CUevent] +except AttributeError: + pass +try: + cuDevicePrimaryCtxRelease = _libraries['libcuda.so'].cuDevicePrimaryCtxRelease + cuDevicePrimaryCtxRelease.restype = CUresult + cuDevicePrimaryCtxRelease.argtypes = [CUdevice] +except AttributeError: + pass +try: + cuDevicePrimaryCtxReset = _libraries['libcuda.so'].cuDevicePrimaryCtxReset + cuDevicePrimaryCtxReset.restype = CUresult + cuDevicePrimaryCtxReset.argtypes = [CUdevice] +except AttributeError: + pass +try: + cuDevicePrimaryCtxSetFlags = _libraries['libcuda.so'].cuDevicePrimaryCtxSetFlags + cuDevicePrimaryCtxSetFlags.restype = CUresult + cuDevicePrimaryCtxSetFlags.argtypes = [CUdevice, ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemcpyHtoD_v2 = _libraries['libcuda.so'].cuMemcpyHtoD_v2 + cuMemcpyHtoD_v2.restype = CUresult + cuMemcpyHtoD_v2.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t] +except AttributeError: + pass +try: + cuMemcpyDtoH_v2 = _libraries['libcuda.so'].cuMemcpyDtoH_v2 + cuMemcpyDtoH_v2.restype = CUresult + cuMemcpyDtoH_v2.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t] +except AttributeError: + pass +try: + cuMemcpyDtoD_v2 = _libraries['libcuda.so'].cuMemcpyDtoD_v2 + cuMemcpyDtoD_v2.restype = CUresult + cuMemcpyDtoD_v2.argtypes = [CUdeviceptr, CUdeviceptr, size_t] +except AttributeError: + pass +try: + cuMemcpyDtoA_v2 = _libraries['libcuda.so'].cuMemcpyDtoA_v2 + cuMemcpyDtoA_v2.restype = CUresult + cuMemcpyDtoA_v2.argtypes = [CUarray, size_t, CUdeviceptr, size_t] +except AttributeError: + pass +try: + cuMemcpyAtoD_v2 = _libraries['libcuda.so'].cuMemcpyAtoD_v2 + cuMemcpyAtoD_v2.restype = CUresult + cuMemcpyAtoD_v2.argtypes = [CUdeviceptr, CUarray, size_t, size_t] +except AttributeError: + pass +try: + cuMemcpyHtoA_v2 = 
_libraries['libcuda.so'].cuMemcpyHtoA_v2 + cuMemcpyHtoA_v2.restype = CUresult + cuMemcpyHtoA_v2.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t] +except AttributeError: + pass +try: + cuMemcpyAtoH_v2 = _libraries['libcuda.so'].cuMemcpyAtoH_v2 + cuMemcpyAtoH_v2.restype = CUresult + cuMemcpyAtoH_v2.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t] +except AttributeError: + pass +try: + cuMemcpyAtoA_v2 = _libraries['libcuda.so'].cuMemcpyAtoA_v2 + cuMemcpyAtoA_v2.restype = CUresult + cuMemcpyAtoA_v2.argtypes = [CUarray, size_t, CUarray, size_t, size_t] +except AttributeError: + pass +try: + cuMemcpyHtoAAsync_v2 = _libraries['libcuda.so'].cuMemcpyHtoAAsync_v2 + cuMemcpyHtoAAsync_v2.restype = CUresult + cuMemcpyHtoAAsync_v2.argtypes = [CUarray, size_t, ctypes.POINTER(None), size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpyAtoHAsync_v2 = _libraries['libcuda.so'].cuMemcpyAtoHAsync_v2 + cuMemcpyAtoHAsync_v2.restype = CUresult + cuMemcpyAtoHAsync_v2.argtypes = [ctypes.POINTER(None), CUarray, size_t, size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpy2D_v2 = _libraries['libcuda.so'].cuMemcpy2D_v2 + cuMemcpy2D_v2.restype = CUresult + cuMemcpy2D_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] +except AttributeError: + pass +try: + cuMemcpy2DUnaligned_v2 = _libraries['libcuda.so'].cuMemcpy2DUnaligned_v2 + cuMemcpy2DUnaligned_v2.restype = CUresult + cuMemcpy2DUnaligned_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st)] +except AttributeError: + pass +try: + cuMemcpy3D_v2 = _libraries['libcuda.so'].cuMemcpy3D_v2 + cuMemcpy3D_v2.restype = CUresult + cuMemcpy3D_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_st)] +except AttributeError: + pass +try: + cuMemcpyHtoDAsync_v2 = _libraries['libcuda.so'].cuMemcpyHtoDAsync_v2 + cuMemcpyHtoDAsync_v2.restype = CUresult + cuMemcpyHtoDAsync_v2.argtypes = [CUdeviceptr, ctypes.POINTER(None), size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpyDtoHAsync_v2 = 
_libraries['libcuda.so'].cuMemcpyDtoHAsync_v2 + cuMemcpyDtoHAsync_v2.restype = CUresult + cuMemcpyDtoHAsync_v2.argtypes = [ctypes.POINTER(None), CUdeviceptr, size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpyDtoDAsync_v2 = _libraries['libcuda.so'].cuMemcpyDtoDAsync_v2 + cuMemcpyDtoDAsync_v2.restype = CUresult + cuMemcpyDtoDAsync_v2.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpy2DAsync_v2 = _libraries['libcuda.so'].cuMemcpy2DAsync_v2 + cuMemcpy2DAsync_v2.restype = CUresult + cuMemcpy2DAsync_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY2D_st), CUstream] +except AttributeError: + pass +try: + cuMemcpy3DAsync_v2 = _libraries['libcuda.so'].cuMemcpy3DAsync_v2 + cuMemcpy3DAsync_v2.restype = CUresult + cuMemcpy3DAsync_v2.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_st), CUstream] +except AttributeError: + pass +try: + cuMemsetD8_v2 = _libraries['libcuda.so'].cuMemsetD8_v2 + cuMemsetD8_v2.restype = CUresult + cuMemsetD8_v2.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t] +except AttributeError: + pass +try: + cuMemsetD16_v2 = _libraries['libcuda.so'].cuMemsetD16_v2 + cuMemsetD16_v2.restype = CUresult + cuMemsetD16_v2.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t] +except AttributeError: + pass +try: + cuMemsetD32_v2 = _libraries['libcuda.so'].cuMemsetD32_v2 + cuMemsetD32_v2.restype = CUresult + cuMemsetD32_v2.argtypes = [CUdeviceptr, ctypes.c_uint32, size_t] +except AttributeError: + pass +try: + cuMemsetD2D8_v2 = _libraries['libcuda.so'].cuMemsetD2D8_v2 + cuMemsetD2D8_v2.restype = CUresult + cuMemsetD2D8_v2.argtypes = [CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t] +except AttributeError: + pass +try: + cuMemsetD2D16_v2 = _libraries['libcuda.so'].cuMemsetD2D16_v2 + cuMemsetD2D16_v2.restype = CUresult + cuMemsetD2D16_v2.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t] +except AttributeError: + pass +try: + cuMemsetD2D32_v2 = _libraries['libcuda.so'].cuMemsetD2D32_v2 + 
cuMemsetD2D32_v2.restype = CUresult + cuMemsetD2D32_v2.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t] +except AttributeError: + pass +try: + cuMemcpy = _libraries['libcuda.so'].cuMemcpy + cuMemcpy.restype = CUresult + cuMemcpy.argtypes = [CUdeviceptr, CUdeviceptr, size_t] +except AttributeError: + pass +try: + cuMemcpyAsync = _libraries['libcuda.so'].cuMemcpyAsync + cuMemcpyAsync.restype = CUresult + cuMemcpyAsync.argtypes = [CUdeviceptr, CUdeviceptr, size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpyPeer = _libraries['libcuda.so'].cuMemcpyPeer + cuMemcpyPeer.restype = CUresult + cuMemcpyPeer.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t] +except AttributeError: + pass +try: + cuMemcpyPeerAsync = _libraries['libcuda.so'].cuMemcpyPeerAsync + cuMemcpyPeerAsync.restype = CUresult + cuMemcpyPeerAsync.argtypes = [CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream] +except AttributeError: + pass +try: + cuMemcpy3DPeer = _libraries['libcuda.so'].cuMemcpy3DPeer + cuMemcpy3DPeer.restype = CUresult + cuMemcpy3DPeer.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st)] +except AttributeError: + pass +try: + cuMemcpy3DPeerAsync = _libraries['libcuda.so'].cuMemcpy3DPeerAsync + cuMemcpy3DPeerAsync.restype = CUresult + cuMemcpy3DPeerAsync.argtypes = [ctypes.POINTER(struct_CUDA_MEMCPY3D_PEER_st), CUstream] +except AttributeError: + pass +try: + cuMemsetD8Async = _libraries['libcuda.so'].cuMemsetD8Async + cuMemsetD8Async.restype = CUresult + cuMemsetD8Async.argtypes = [CUdeviceptr, ctypes.c_ubyte, size_t, CUstream] +except AttributeError: + pass +try: + cuMemsetD16Async = _libraries['libcuda.so'].cuMemsetD16Async + cuMemsetD16Async.restype = CUresult + cuMemsetD16Async.argtypes = [CUdeviceptr, ctypes.c_uint16, size_t, CUstream] +except AttributeError: + pass +try: + cuMemsetD32Async = _libraries['libcuda.so'].cuMemsetD32Async + cuMemsetD32Async.restype = CUresult + cuMemsetD32Async.argtypes = [CUdeviceptr, 
ctypes.c_uint32, size_t, CUstream] +except AttributeError: + pass +try: + cuMemsetD2D8Async = _libraries['libcuda.so'].cuMemsetD2D8Async + cuMemsetD2D8Async.restype = CUresult + cuMemsetD2D8Async.argtypes = [CUdeviceptr, size_t, ctypes.c_ubyte, size_t, size_t, CUstream] +except AttributeError: + pass +try: + cuMemsetD2D16Async = _libraries['libcuda.so'].cuMemsetD2D16Async + cuMemsetD2D16Async.restype = CUresult + cuMemsetD2D16Async.argtypes = [CUdeviceptr, size_t, ctypes.c_uint16, size_t, size_t, CUstream] +except AttributeError: + pass +try: + cuMemsetD2D32Async = _libraries['libcuda.so'].cuMemsetD2D32Async + cuMemsetD2D32Async.restype = CUresult + cuMemsetD2D32Async.argtypes = [CUdeviceptr, size_t, ctypes.c_uint32, size_t, size_t, CUstream] +except AttributeError: + pass +try: + cuStreamGetPriority = _libraries['libcuda.so'].cuStreamGetPriority + cuStreamGetPriority.restype = CUresult + cuStreamGetPriority.argtypes = [CUstream, ctypes.POINTER(ctypes.c_int32)] +except AttributeError: + pass +try: + cuStreamGetId = _libraries['libcuda.so'].cuStreamGetId + cuStreamGetId.restype = CUresult + cuStreamGetId.argtypes = [CUstream, ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + cuStreamGetFlags = _libraries['libcuda.so'].cuStreamGetFlags + cuStreamGetFlags.restype = CUresult + cuStreamGetFlags.argtypes = [CUstream, ctypes.POINTER(ctypes.c_uint32)] +except AttributeError: + pass +try: + cuStreamGetCtx = _libraries['libcuda.so'].cuStreamGetCtx + cuStreamGetCtx.restype = CUresult + cuStreamGetCtx.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUctx_st))] +except AttributeError: + pass +try: + cuStreamWaitEvent = _libraries['libcuda.so'].cuStreamWaitEvent + cuStreamWaitEvent.restype = CUresult + cuStreamWaitEvent.argtypes = [CUstream, CUevent, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamAddCallback = _libraries['libcuda.so'].cuStreamAddCallback + cuStreamAddCallback.restype = CUresult + cuStreamAddCallback.argtypes = 
[CUstream, CUstreamCallback, ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamAttachMemAsync = _libraries['libcuda.so'].cuStreamAttachMemAsync + cuStreamAttachMemAsync.restype = CUresult + cuStreamAttachMemAsync.argtypes = [CUstream, CUdeviceptr, size_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamQuery = _libraries['libcuda.so'].cuStreamQuery + cuStreamQuery.restype = CUresult + cuStreamQuery.argtypes = [CUstream] +except AttributeError: + pass +try: + cuStreamSynchronize = _libraries['libcuda.so'].cuStreamSynchronize + cuStreamSynchronize.restype = CUresult + cuStreamSynchronize.argtypes = [CUstream] +except AttributeError: + pass +try: + cuEventRecord = _libraries['libcuda.so'].cuEventRecord + cuEventRecord.restype = CUresult + cuEventRecord.argtypes = [CUevent, CUstream] +except AttributeError: + pass +try: + cuEventRecordWithFlags = _libraries['libcuda.so'].cuEventRecordWithFlags + cuEventRecordWithFlags.restype = CUresult + cuEventRecordWithFlags.argtypes = [CUevent, CUstream, ctypes.c_uint32] +except AttributeError: + pass +try: + cuLaunchKernel = _libraries['libcuda.so'].cuLaunchKernel + cuLaunchKernel.restype = CUresult + cuLaunchKernel.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + cuLaunchKernelEx = _libraries['libcuda.so'].cuLaunchKernelEx + cuLaunchKernelEx.restype = CUresult + cuLaunchKernelEx.argtypes = [ctypes.POINTER(struct_CUlaunchConfig_st), CUfunction, ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + cuLaunchHostFunc = _libraries['libcuda.so'].cuLaunchHostFunc + cuLaunchHostFunc.restype = CUresult + cuLaunchHostFunc.argtypes = [CUstream, CUhostFn, ctypes.POINTER(None)] +except AttributeError: + pass try: 
cuGraphicsMapResources = _libraries['libcuda.so'].cuGraphicsMapResources cuGraphicsMapResources.restype = CUresult @@ -4908,40 +6335,315 @@ try: cuGraphicsUnmapResources.argtypes = [ctypes.c_uint32, ctypes.POINTER(ctypes.POINTER(struct_CUgraphicsResource_st)), CUstream] except AttributeError: pass +try: + cuStreamWriteValue32 = _libraries['libcuda.so'].cuStreamWriteValue32 + cuStreamWriteValue32.restype = CUresult + cuStreamWriteValue32.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue32 = _libraries['libcuda.so'].cuStreamWaitValue32 + cuStreamWaitValue32.restype = CUresult + cuStreamWaitValue32.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWriteValue64 = _libraries['libcuda.so'].cuStreamWriteValue64 + cuStreamWriteValue64.restype = CUresult + cuStreamWriteValue64.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue64 = _libraries['libcuda.so'].cuStreamWaitValue64 + cuStreamWaitValue64.restype = CUresult + cuStreamWaitValue64.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamBatchMemOp = _libraries['libcuda.so'].cuStreamBatchMemOp + cuStreamBatchMemOp.restype = CUresult + cuStreamBatchMemOp.argtypes = [CUstream, ctypes.c_uint32, ctypes.POINTER(union_CUstreamBatchMemOpParams_union), ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWriteValue32_ptsz = _libraries['libcuda.so'].cuStreamWriteValue32_ptsz + cuStreamWriteValue32_ptsz.restype = CUresult + cuStreamWriteValue32_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue32_ptsz = _libraries['libcuda.so'].cuStreamWaitValue32_ptsz + cuStreamWaitValue32_ptsz.restype = CUresult + cuStreamWaitValue32_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] 
+except AttributeError: + pass +try: + cuStreamWriteValue64_ptsz = _libraries['libcuda.so'].cuStreamWriteValue64_ptsz + cuStreamWriteValue64_ptsz.restype = CUresult + cuStreamWriteValue64_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue64_ptsz = _libraries['libcuda.so'].cuStreamWaitValue64_ptsz + cuStreamWaitValue64_ptsz.restype = CUresult + cuStreamWaitValue64_ptsz.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamBatchMemOp_ptsz = _libraries['libcuda.so'].cuStreamBatchMemOp_ptsz + cuStreamBatchMemOp_ptsz.restype = CUresult + cuStreamBatchMemOp_ptsz.argtypes = [CUstream, ctypes.c_uint32, ctypes.POINTER(union_CUstreamBatchMemOpParams_union), ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWriteValue32_v2 = _libraries['libcuda.so'].cuStreamWriteValue32_v2 + cuStreamWriteValue32_v2.restype = CUresult + cuStreamWriteValue32_v2.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue32_v2 = _libraries['libcuda.so'].cuStreamWaitValue32_v2 + cuStreamWaitValue32_v2.restype = CUresult + cuStreamWaitValue32_v2.argtypes = [CUstream, CUdeviceptr, cuuint32_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWriteValue64_v2 = _libraries['libcuda.so'].cuStreamWriteValue64_v2 + cuStreamWriteValue64_v2.restype = CUresult + cuStreamWriteValue64_v2.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamWaitValue64_v2 = _libraries['libcuda.so'].cuStreamWaitValue64_v2 + cuStreamWaitValue64_v2.restype = CUresult + cuStreamWaitValue64_v2.argtypes = [CUstream, CUdeviceptr, cuuint64_t, ctypes.c_uint32] +except AttributeError: + pass +try: + cuStreamBatchMemOp_v2 = _libraries['libcuda.so'].cuStreamBatchMemOp_v2 + cuStreamBatchMemOp_v2.restype = CUresult + cuStreamBatchMemOp_v2.argtypes = [CUstream, 
ctypes.c_uint32, ctypes.POINTER(union_CUstreamBatchMemOpParams_union), ctypes.c_uint32] +except AttributeError: + pass +try: + cuMemPrefetchAsync = _libraries['libcuda.so'].cuMemPrefetchAsync + cuMemPrefetchAsync.restype = CUresult + cuMemPrefetchAsync.argtypes = [CUdeviceptr, size_t, CUdevice, CUstream] +except AttributeError: + pass +try: + cuLaunchCooperativeKernel = _libraries['libcuda.so'].cuLaunchCooperativeKernel + cuLaunchCooperativeKernel.restype = CUresult + cuLaunchCooperativeKernel.argtypes = [CUfunction, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, CUstream, ctypes.POINTER(ctypes.POINTER(None))] +except AttributeError: + pass +try: + cuSignalExternalSemaphoresAsync = _libraries['libcuda.so'].cuSignalExternalSemaphoresAsync + cuSignalExternalSemaphoresAsync.restype = CUresult + cuSignalExternalSemaphoresAsync.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st), ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuWaitExternalSemaphoresAsync = _libraries['libcuda.so'].cuWaitExternalSemaphoresAsync + cuWaitExternalSemaphoresAsync.restype = CUresult + cuWaitExternalSemaphoresAsync.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUextSemaphore_st)), ctypes.POINTER(struct_CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st), ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuStreamBeginCapture = _libraries['libcuda.so'].cuStreamBeginCapture + cuStreamBeginCapture.restype = CUresult + cuStreamBeginCapture.argtypes = [CUstream] +except AttributeError: + pass +try: + cuStreamBeginCapture_ptsz = _libraries['libcuda.so'].cuStreamBeginCapture_ptsz + cuStreamBeginCapture_ptsz.restype = CUresult + cuStreamBeginCapture_ptsz.argtypes = [CUstream] +except AttributeError: + pass +try: + cuStreamBeginCapture_v2 = _libraries['libcuda.so'].cuStreamBeginCapture_v2 + cuStreamBeginCapture_v2.restype = 
CUresult + cuStreamBeginCapture_v2.argtypes = [CUstream, CUstreamCaptureMode] +except AttributeError: + pass +try: + cuStreamEndCapture = _libraries['libcuda.so'].cuStreamEndCapture + cuStreamEndCapture.restype = CUresult + cuStreamEndCapture.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st))] +except AttributeError: + pass +try: + cuStreamIsCapturing = _libraries['libcuda.so'].cuStreamIsCapturing + cuStreamIsCapturing.restype = CUresult + cuStreamIsCapturing.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum)] +except AttributeError: + pass +try: + cuStreamGetCaptureInfo = _libraries['libcuda.so'].cuStreamGetCaptureInfo + cuStreamGetCaptureInfo.restype = CUresult + cuStreamGetCaptureInfo.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + cuStreamGetCaptureInfo_ptsz = _libraries['libcuda.so'].cuStreamGetCaptureInfo_ptsz + cuStreamGetCaptureInfo_ptsz.restype = CUresult + cuStreamGetCaptureInfo_ptsz.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + cuStreamGetCaptureInfo_v2 = _libraries['libcuda.so'].cuStreamGetCaptureInfo_v2 + cuStreamGetCaptureInfo_v2.restype = CUresult + cuStreamGetCaptureInfo_v2.argtypes = [CUstream, ctypes.POINTER(CUstreamCaptureStatus_enum), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.POINTER(struct_CUgraph_st)), ctypes.POINTER(ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st))), ctypes.POINTER(ctypes.c_uint64)] +except AttributeError: + pass +try: + cuGraphAddKernelNode = _libraries['libcuda.so'].cuGraphAddKernelNode + cuGraphAddKernelNode.restype = CUresult + cuGraphAddKernelNode.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] +except AttributeError: + pass +try: + 
cuGraphKernelNodeGetParams = _libraries['libcuda.so'].cuGraphKernelNodeGetParams + cuGraphKernelNodeGetParams.restype = CUresult + cuGraphKernelNodeGetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphKernelNodeSetParams = _libraries['libcuda.so'].cuGraphKernelNodeSetParams + cuGraphKernelNodeSetParams.restype = CUresult + cuGraphKernelNodeSetParams.argtypes = [CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphExecKernelNodeSetParams = _libraries['libcuda.so'].cuGraphExecKernelNodeSetParams + cuGraphExecKernelNodeSetParams.restype = CUresult + cuGraphExecKernelNodeSetParams.argtypes = [CUgraphExec, CUgraphNode, ctypes.POINTER(struct_CUDA_KERNEL_NODE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphInstantiateWithParams = _libraries['libcuda.so'].cuGraphInstantiateWithParams + cuGraphInstantiateWithParams.restype = CUresult + cuGraphInstantiateWithParams.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphExec_st)), CUgraph, ctypes.POINTER(struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st)] +except AttributeError: + pass +try: + cuGraphExecUpdate = _libraries['libcuda.so'].cuGraphExecUpdate + cuGraphExecUpdate.restype = CUresult + cuGraphExecUpdate.argtypes = [CUgraphExec, CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), ctypes.POINTER(CUgraphExecUpdateResult_enum)] +except AttributeError: + pass +try: + cuGraphUpload = _libraries['libcuda.so'].cuGraphUpload + cuGraphUpload.restype = CUresult + cuGraphUpload.argtypes = [CUgraphExec, CUstream] +except AttributeError: + pass +try: + cuGraphLaunch = _libraries['libcuda.so'].cuGraphLaunch + cuGraphLaunch.restype = CUresult + cuGraphLaunch.argtypes = [CUgraphExec, CUstream] +except AttributeError: + pass +try: + cuStreamCopyAttributes = _libraries['libcuda.so'].cuStreamCopyAttributes + cuStreamCopyAttributes.restype = CUresult + cuStreamCopyAttributes.argtypes = 
[CUstream, CUstream] +except AttributeError: + pass +try: + cuStreamGetAttribute = _libraries['libcuda.so'].cuStreamGetAttribute + cuStreamGetAttribute.restype = CUresult + cuStreamGetAttribute.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] +except AttributeError: + pass +try: + cuStreamSetAttribute = _libraries['libcuda.so'].cuStreamSetAttribute + cuStreamSetAttribute.restype = CUresult + cuStreamSetAttribute.argtypes = [CUstream, CUstreamAttrID, ctypes.POINTER(union_CUlaunchAttributeValue_union)] +except AttributeError: + pass +try: + cuIpcOpenMemHandle = _libraries['libcuda.so'].cuIpcOpenMemHandle + cuIpcOpenMemHandle.restype = CUresult + cuIpcOpenMemHandle.argtypes = [ctypes.POINTER(ctypes.c_uint64), CUipcMemHandle, ctypes.c_uint32] +except AttributeError: + pass +try: + cuGraphInstantiate = _libraries['libcuda.so'].cuGraphInstantiate + cuGraphInstantiate.restype = CUresult + cuGraphInstantiate.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphExec_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), ctypes.POINTER(ctypes.c_char), size_t] +except AttributeError: + pass +try: + cuGraphInstantiate_v2 = _libraries['libcuda.so'].cuGraphInstantiate_v2 + cuGraphInstantiate_v2.restype = CUresult + cuGraphInstantiate_v2.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_CUgraphExec_st)), CUgraph, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), ctypes.POINTER(ctypes.c_char), size_t] +except AttributeError: + pass +try: + cuMemMapArrayAsync = _libraries['libcuda.so'].cuMemMapArrayAsync + cuMemMapArrayAsync.restype = CUresult + cuMemMapArrayAsync.argtypes = [ctypes.POINTER(struct_CUarrayMapInfo_st), ctypes.c_uint32, CUstream] +except AttributeError: + pass +try: + cuMemFreeAsync = _libraries['libcuda.so'].cuMemFreeAsync + cuMemFreeAsync.restype = CUresult + cuMemFreeAsync.argtypes = [CUdeviceptr, CUstream] +except AttributeError: + pass +try: + cuMemAllocAsync = _libraries['libcuda.so'].cuMemAllocAsync 
+ cuMemAllocAsync.restype = CUresult + cuMemAllocAsync.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUstream] +except AttributeError: + pass +try: + cuMemAllocFromPoolAsync = _libraries['libcuda.so'].cuMemAllocFromPoolAsync + cuMemAllocFromPoolAsync.restype = CUresult + cuMemAllocFromPoolAsync.argtypes = [ctypes.POINTER(ctypes.c_uint64), size_t, CUmemoryPool, CUstream] +except AttributeError: + pass +try: + cuStreamUpdateCaptureDependencies = _libraries['libcuda.so'].cuStreamUpdateCaptureDependencies + cuStreamUpdateCaptureDependencies.restype = CUresult + cuStreamUpdateCaptureDependencies.argtypes = [CUstream, ctypes.POINTER(ctypes.POINTER(struct_CUgraphNode_st)), size_t, ctypes.c_uint32] +except AttributeError: + pass try: cuGetProcAddress = _libraries['libcuda.so'].cuGetProcAddress cuGetProcAddress.restype = CUresult cuGetProcAddress.argtypes = [ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.POINTER(None)), ctypes.c_int32, cuuint64_t] except AttributeError: pass -try: - cuGetExportTable = _libraries['libcuda.so'].cuGetExportTable - cuGetExportTable.restype = CUresult - cuGetExportTable.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(struct_CUuuid_st)] -except AttributeError: - pass __all__ = \ - ['CUDA_ARRAY3D_DESCRIPTOR', 'CUDA_ARRAY3D_DESCRIPTOR_v2', - 'CUDA_ARRAY_DESCRIPTOR', 'CUDA_ARRAY_DESCRIPTOR_v2', + ['CUDA_ARRAY3D_DESCRIPTOR', 'CUDA_ARRAY3D_DESCRIPTOR_v1', + 'CUDA_ARRAY3D_DESCRIPTOR_v2', 'CUDA_ARRAY_DESCRIPTOR', + 'CUDA_ARRAY_DESCRIPTOR_v1', 'CUDA_ARRAY_DESCRIPTOR_v2', + 'CUDA_ARRAY_MEMORY_REQUIREMENTS', + 'CUDA_ARRAY_MEMORY_REQUIREMENTS_v1', 'CUDA_ARRAY_SPARSE_PROPERTIES', 'CUDA_ARRAY_SPARSE_PROPERTIES_v1', - 'CUDA_ERROR_ALREADY_ACQUIRED', 'CUDA_ERROR_ALREADY_MAPPED', - 'CUDA_ERROR_ARRAY_IS_MAPPED', 'CUDA_ERROR_ASSERT', - 'CUDA_ERROR_CAPTURED_EVENT', + 'CUDA_BATCH_MEM_OP_NODE_PARAMS', 'CUDA_ERROR_ALREADY_ACQUIRED', + 'CUDA_ERROR_ALREADY_MAPPED', 'CUDA_ERROR_ARRAY_IS_MAPPED', + 'CUDA_ERROR_ASSERT', 
'CUDA_ERROR_CAPTURED_EVENT', + 'CUDA_ERROR_CDP_NOT_SUPPORTED', 'CUDA_ERROR_CDP_VERSION_MISMATCH', 'CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE', 'CUDA_ERROR_CONTEXT_ALREADY_CURRENT', 'CUDA_ERROR_CONTEXT_ALREADY_IN_USE', 'CUDA_ERROR_CONTEXT_IS_DESTROYED', 'CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE', 'CUDA_ERROR_DEINITIALIZED', 'CUDA_ERROR_DEVICE_NOT_LICENSED', - 'CUDA_ERROR_ECC_UNCORRECTABLE', 'CUDA_ERROR_EXTERNAL_DEVICE', - 'CUDA_ERROR_FILE_NOT_FOUND', + 'CUDA_ERROR_DEVICE_UNAVAILABLE', 'CUDA_ERROR_ECC_UNCORRECTABLE', + 'CUDA_ERROR_EXTERNAL_DEVICE', 'CUDA_ERROR_FILE_NOT_FOUND', 'CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE', 'CUDA_ERROR_HARDWARE_STACK_ERROR', 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED', 'CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED', 'CUDA_ERROR_ILLEGAL_ADDRESS', 'CUDA_ERROR_ILLEGAL_INSTRUCTION', 'CUDA_ERROR_ILLEGAL_STATE', 'CUDA_ERROR_INVALID_ADDRESS_SPACE', - 'CUDA_ERROR_INVALID_CONTEXT', 'CUDA_ERROR_INVALID_DEVICE', + 'CUDA_ERROR_INVALID_CLUSTER_SIZE', 'CUDA_ERROR_INVALID_CONTEXT', + 'CUDA_ERROR_INVALID_DEVICE', 'CUDA_ERROR_INVALID_GRAPHICS_CONTEXT', 'CUDA_ERROR_INVALID_HANDLE', 'CUDA_ERROR_INVALID_IMAGE', 'CUDA_ERROR_INVALID_PC', 'CUDA_ERROR_INVALID_PTX', @@ -4951,6 +6653,7 @@ __all__ = \ 'CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING', 'CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES', 'CUDA_ERROR_LAUNCH_TIMEOUT', 'CUDA_ERROR_MAP_FAILED', 'CUDA_ERROR_MISALIGNED_ADDRESS', + 'CUDA_ERROR_MPS_CLIENT_TERMINATED', 'CUDA_ERROR_MPS_CONNECTION_FAILED', 'CUDA_ERROR_MPS_MAX_CLIENTS_REACHED', 'CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED', @@ -5002,12 +6705,21 @@ __all__ = \ 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1', 'CUDA_EXT_SEM_WAIT_NODE_PARAMS', 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1', + 'CUDA_GRAPH_INSTANTIATE_ERROR', 'CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH', + 'CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH', + 'CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD', + 'CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY', + 'CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE', + 
'CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED', + 'CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED', + 'CUDA_GRAPH_INSTANTIATE_PARAMS', 'CUDA_GRAPH_INSTANTIATE_SUCCESS', 'CUDA_HOST_NODE_PARAMS', 'CUDA_HOST_NODE_PARAMS_v1', 'CUDA_KERNEL_NODE_PARAMS', 'CUDA_KERNEL_NODE_PARAMS_v1', - 'CUDA_LAUNCH_PARAMS', 'CUDA_LAUNCH_PARAMS_v1', 'CUDA_MEMCPY2D', + 'CUDA_KERNEL_NODE_PARAMS_v2', 'CUDA_LAUNCH_PARAMS', + 'CUDA_LAUNCH_PARAMS_v1', 'CUDA_MEMCPY2D', 'CUDA_MEMCPY2D_v1', 'CUDA_MEMCPY2D_v2', 'CUDA_MEMCPY3D', 'CUDA_MEMCPY3D_PEER', - 'CUDA_MEMCPY3D_PEER_v1', 'CUDA_MEMCPY3D_v2', + 'CUDA_MEMCPY3D_PEER_v1', 'CUDA_MEMCPY3D_v1', 'CUDA_MEMCPY3D_v2', 'CUDA_MEMSET_NODE_PARAMS', 'CUDA_MEMSET_NODE_PARAMS_v1', 'CUDA_MEM_ALLOC_NODE_PARAMS', 'CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS', @@ -5040,12 +6752,14 @@ __all__ = \ 'CU_AD_FORMAT_UNSIGNED_INT32', 'CU_AD_FORMAT_UNSIGNED_INT8', 'CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL', 'CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL', - 'CU_COMPUTEMODE_DEFAULT', 'CU_COMPUTEMODE_EXCLUSIVE_PROCESS', - 'CU_COMPUTEMODE_PROHIBITED', 'CU_CTX_BLOCKING_SYNC', - 'CU_CTX_FLAGS_MASK', 'CU_CTX_LMEM_RESIZE_TO_MAX', - 'CU_CTX_MAP_HOST', 'CU_CTX_SCHED_AUTO', - 'CU_CTX_SCHED_BLOCKING_SYNC', 'CU_CTX_SCHED_MASK', - 'CU_CTX_SCHED_SPIN', 'CU_CTX_SCHED_YIELD', + 'CU_CLUSTER_SCHEDULING_POLICY_DEFAULT', + 'CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING', + 'CU_CLUSTER_SCHEDULING_POLICY_SPREAD', 'CU_COMPUTEMODE_DEFAULT', + 'CU_COMPUTEMODE_EXCLUSIVE_PROCESS', 'CU_COMPUTEMODE_PROHIBITED', + 'CU_CTX_BLOCKING_SYNC', 'CU_CTX_FLAGS_MASK', + 'CU_CTX_LMEM_RESIZE_TO_MAX', 'CU_CTX_MAP_HOST', + 'CU_CTX_SCHED_AUTO', 'CU_CTX_SCHED_BLOCKING_SYNC', + 'CU_CTX_SCHED_MASK', 'CU_CTX_SCHED_SPIN', 'CU_CTX_SCHED_YIELD', 'CU_CUBEMAP_FACE_NEGATIVE_X', 'CU_CUBEMAP_FACE_NEGATIVE_Y', 'CU_CUBEMAP_FACE_NEGATIVE_Z', 'CU_CUBEMAP_FACE_POSITIVE_X', 'CU_CUBEMAP_FACE_POSITIVE_Y', 'CU_CUBEMAP_FACE_POSITIVE_Z', @@ -5054,10 +6768,13 @@ __all__ = \ 'CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY', 
'CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER', 'CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS', + 'CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1', 'CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM', - 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS', + 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1', 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR', + 'CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1', 'CU_DEVICE_ATTRIBUTE_CLOCK_RATE', + 'CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH', 'CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR', 'CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR', 'CU_DEVICE_ATTRIBUTE_COMPUTE_MODE', @@ -5066,7 +6783,9 @@ __all__ = \ 'CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS', 'CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH', 'CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH', + 'CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST', + 'CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_ECC_ENABLED', 'CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED', @@ -5082,6 +6801,7 @@ __all__ = \ 'CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_INTEGRATED', + 'CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT', 'CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE', 'CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED', @@ -5149,6 +6869,7 @@ __all__ = \ 'CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE', 'CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES', + 'CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT', 'CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT', 'CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD', 'CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID', @@ -5166,11 +6887,13 @@ __all__ = \ 'CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT', 'CU_DEVICE_ATTRIBUTE_TCC_DRIVER', + 
'CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT', 'CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT', 'CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY', 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', + 'CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS', 'CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED', 'CU_DEVICE_ATTRIBUTE_WARP_SIZE', @@ -5182,6 +6905,8 @@ __all__ = \ 'CU_EVENT_BLOCKING_SYNC', 'CU_EVENT_DEFAULT', 'CU_EVENT_DISABLE_TIMING', 'CU_EVENT_INTERPROCESS', 'CU_EVENT_RECORD_DEFAULT', 'CU_EVENT_RECORD_EXTERNAL', + 'CU_EVENT_SCHED_AUTO', 'CU_EVENT_SCHED_BLOCKING_SYNC', + 'CU_EVENT_SCHED_SPIN', 'CU_EVENT_SCHED_YIELD', 'CU_EVENT_WAIT_DEFAULT', 'CU_EVENT_WAIT_EXTERNAL', 'CU_EXEC_AFFINITY_TYPE_MAX', 'CU_EXEC_AFFINITY_TYPE_SM_COUNT', 'CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE', @@ -5209,19 +6934,28 @@ __all__ = \ 'CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER', 'CU_FUNC_ATTRIBUTE_BINARY_VERSION', 'CU_FUNC_ATTRIBUTE_CACHE_MODE_CA', + 'CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE', + 'CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET', 'CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES', 'CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES', 'CU_FUNC_ATTRIBUTE_MAX', 'CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES', 'CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK', + 'CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED', 'CU_FUNC_ATTRIBUTE_NUM_REGS', 'CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT', 'CU_FUNC_ATTRIBUTE_PTX_VERSION', + 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH', + 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT', + 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH', 'CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES', 'CU_FUNC_CACHE_PREFER_EQUAL', 'CU_FUNC_CACHE_PREFER_L1', 'CU_FUNC_CACHE_PREFER_NONE', 'CU_FUNC_CACHE_PREFER_SHARED', 'CU_GET_PROC_ADDRESS_DEFAULT', 'CU_GET_PROC_ADDRESS_LEGACY_STREAM', 'CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM', + 
'CU_GET_PROC_ADDRESS_SUCCESS', + 'CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND', + 'CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT', 'CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES', 'CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE', 'CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER', @@ -5233,7 +6967,9 @@ __all__ = \ 'CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST', 'CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER', 'CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD', + 'CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS', 'CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS', + 'CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO', 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS', 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS', 'CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES', @@ -5246,6 +6982,7 @@ __all__ = \ 'CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS', 'CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES', 'CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE', 'CU_GRAPH_EXEC_UPDATE_ERROR', + 'CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED', 'CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED', 'CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED', 'CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED', @@ -5256,7 +6993,8 @@ __all__ = \ 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT', 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH', 'CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT', - 'CU_GRAPH_MEM_ATTR_USED_MEM_HIGH', 'CU_GRAPH_NODE_TYPE_EMPTY', + 'CU_GRAPH_MEM_ATTR_USED_MEM_HIGH', + 'CU_GRAPH_NODE_TYPE_BATCH_MEM_OP', 'CU_GRAPH_NODE_TYPE_EMPTY', 'CU_GRAPH_NODE_TYPE_EVENT_RECORD', 'CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL', 'CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT', 'CU_GRAPH_NODE_TYPE_GRAPH', @@ -5278,11 +7016,28 @@ __all__ = \ 'CU_JIT_LOG_VERBOSE', 'CU_JIT_LTO', 'CU_JIT_MAX_REGISTERS', 'CU_JIT_NEW_SM3X_OPT', 'CU_JIT_NUM_INPUT_TYPES', 'CU_JIT_NUM_OPTIONS', 'CU_JIT_OPTIMIZATION_LEVEL', - 'CU_JIT_PREC_DIV', 'CU_JIT_PREC_SQRT', 'CU_JIT_TARGET', + 'CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES', + 'CU_JIT_POSITION_INDEPENDENT_CODE', 'CU_JIT_PREC_DIV', + 'CU_JIT_PREC_SQRT', 'CU_JIT_REFERENCED_KERNEL_COUNT', + 
'CU_JIT_REFERENCED_KERNEL_NAMES', + 'CU_JIT_REFERENCED_VARIABLE_COUNT', + 'CU_JIT_REFERENCED_VARIABLE_NAMES', 'CU_JIT_TARGET', 'CU_JIT_TARGET_FROM_CUCONTEXT', 'CU_JIT_THREADS_PER_BLOCK', - 'CU_JIT_WALL_TIME', - 'CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW', - 'CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE', + 'CU_JIT_WALL_TIME', 'CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW', + 'CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION', + 'CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE', + 'CU_LAUNCH_ATTRIBUTE_COOPERATIVE', 'CU_LAUNCH_ATTRIBUTE_IGNORE', + 'CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN', + 'CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP', + 'CU_LAUNCH_ATTRIBUTE_PRIORITY', + 'CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT', + 'CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION', + 'CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY', + 'CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT', + 'CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE', + 'CU_LIBRARY_BINARY_IS_PRESERVED', + 'CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE', + 'CU_LIBRARY_NUM_OPTIONS', 'CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT', 'CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH', 'CU_LIMIT_MALLOC_HEAP_SIZE', 'CU_LIMIT_MAX', 'CU_LIMIT_MAX_L2_FETCH_GRANULARITY', @@ -5321,7 +7076,10 @@ __all__ = \ 'CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY', 'CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION', 'CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION', - 'CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY', 'CU_OCCUPANCY_DEFAULT', + 'CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY', + 'CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD', + 'CU_MEM_RANGE_HANDLE_TYPE_MAX', 'CU_MODULE_EAGER_LOADING', + 'CU_MODULE_LAZY_LOADING', 'CU_OCCUPANCY_DEFAULT', 'CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE', 'CU_POINTER_ATTRIBUTE_ACCESS_FLAGS', 'CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE', @@ -5335,6 +7093,9 @@ __all__ = \ 'CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE', 'CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE', 'CU_POINTER_ATTRIBUTE_IS_MANAGED', 'CU_POINTER_ATTRIBUTE_MAPPED', + 'CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR', + 'CU_POINTER_ATTRIBUTE_MAPPING_SIZE', + 
'CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID', 'CU_POINTER_ATTRIBUTE_MEMORY_TYPE', 'CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE', 'CU_POINTER_ATTRIBUTE_P2P_TOKENS', @@ -5370,13 +7131,13 @@ __all__ = \ 'CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE', 'CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE', 'CU_STREAM_ADD_CAPTURE_DEPENDENCIES', - 'CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW', - 'CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY', 'CU_STREAM_CAPTURE_MODE_GLOBAL', 'CU_STREAM_CAPTURE_MODE_RELAXED', 'CU_STREAM_CAPTURE_MODE_THREAD_LOCAL', 'CU_STREAM_CAPTURE_STATUS_ACTIVE', 'CU_STREAM_CAPTURE_STATUS_INVALIDATED', 'CU_STREAM_CAPTURE_STATUS_NONE', 'CU_STREAM_DEFAULT', + 'CU_STREAM_MEMORY_BARRIER_TYPE_GPU', + 'CU_STREAM_MEMORY_BARRIER_TYPE_SYS', 'CU_STREAM_MEM_OP_BARRIER', 'CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES', 'CU_STREAM_MEM_OP_WAIT_VALUE_32', 'CU_STREAM_MEM_OP_WAIT_VALUE_64', @@ -5388,8 +7149,7 @@ __all__ = \ 'CU_STREAM_WRITE_VALUE_DEFAULT', 'CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER', 'CU_SYNC_POLICY_AUTO', 'CU_SYNC_POLICY_BLOCKING_SYNC', 'CU_SYNC_POLICY_SPIN', - 'CU_SYNC_POLICY_YIELD', 'CU_TARGET_COMPUTE_20', - 'CU_TARGET_COMPUTE_21', 'CU_TARGET_COMPUTE_30', + 'CU_SYNC_POLICY_YIELD', 'CU_TARGET_COMPUTE_30', 'CU_TARGET_COMPUTE_32', 'CU_TARGET_COMPUTE_35', 'CU_TARGET_COMPUTE_37', 'CU_TARGET_COMPUTE_50', 'CU_TARGET_COMPUTE_52', 'CU_TARGET_COMPUTE_53', @@ -5397,39 +7157,68 @@ __all__ = \ 'CU_TARGET_COMPUTE_62', 'CU_TARGET_COMPUTE_70', 'CU_TARGET_COMPUTE_72', 'CU_TARGET_COMPUTE_75', 'CU_TARGET_COMPUTE_80', 'CU_TARGET_COMPUTE_86', - 'CU_TR_ADDRESS_MODE_BORDER', 'CU_TR_ADDRESS_MODE_CLAMP', - 'CU_TR_ADDRESS_MODE_MIRROR', 'CU_TR_ADDRESS_MODE_WRAP', - 'CU_TR_FILTER_MODE_LINEAR', 'CU_TR_FILTER_MODE_POINT', - 'CU_USER_OBJECT_NO_DESTRUCTOR_SYNC', 'CUaccessPolicyWindow', - 'CUaccessPolicyWindow_v1', 'CUaccessProperty', - 'CUaccessProperty__enumvalues', 'CUaccessProperty_enum', - 'CUaddress_mode', 'CUaddress_mode__enumvalues', - 'CUaddress_mode_enum', 'CUarray', 'CUarrayMapInfo', - 'CUarrayMapInfo_v1', 
'CUarraySparseSubresourceType', + 'CU_TARGET_COMPUTE_87', 'CU_TARGET_COMPUTE_89', + 'CU_TARGET_COMPUTE_90', 'CU_TARGET_COMPUTE_90A', + 'CU_TENSOR_MAP_DATA_TYPE_BFLOAT16', + 'CU_TENSOR_MAP_DATA_TYPE_FLOAT16', + 'CU_TENSOR_MAP_DATA_TYPE_FLOAT32', + 'CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ', + 'CU_TENSOR_MAP_DATA_TYPE_FLOAT64', + 'CU_TENSOR_MAP_DATA_TYPE_INT32', 'CU_TENSOR_MAP_DATA_TYPE_INT64', + 'CU_TENSOR_MAP_DATA_TYPE_TFLOAT32', + 'CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ', + 'CU_TENSOR_MAP_DATA_TYPE_UINT16', + 'CU_TENSOR_MAP_DATA_TYPE_UINT32', + 'CU_TENSOR_MAP_DATA_TYPE_UINT64', 'CU_TENSOR_MAP_DATA_TYPE_UINT8', + 'CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA', + 'CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE', + 'CU_TENSOR_MAP_INTERLEAVE_16B', 'CU_TENSOR_MAP_INTERLEAVE_32B', + 'CU_TENSOR_MAP_INTERLEAVE_NONE', + 'CU_TENSOR_MAP_L2_PROMOTION_L2_128B', + 'CU_TENSOR_MAP_L2_PROMOTION_L2_256B', + 'CU_TENSOR_MAP_L2_PROMOTION_L2_64B', + 'CU_TENSOR_MAP_L2_PROMOTION_NONE', 'CU_TENSOR_MAP_SWIZZLE_128B', + 'CU_TENSOR_MAP_SWIZZLE_32B', 'CU_TENSOR_MAP_SWIZZLE_64B', + 'CU_TENSOR_MAP_SWIZZLE_NONE', 'CU_TR_ADDRESS_MODE_BORDER', + 'CU_TR_ADDRESS_MODE_CLAMP', 'CU_TR_ADDRESS_MODE_MIRROR', + 'CU_TR_ADDRESS_MODE_WRAP', 'CU_TR_FILTER_MODE_LINEAR', + 'CU_TR_FILTER_MODE_POINT', 'CU_USER_OBJECT_NO_DESTRUCTOR_SYNC', + 'CUaccessPolicyWindow', 'CUaccessPolicyWindow_v1', + 'CUaccessProperty', 'CUaccessProperty__enumvalues', + 'CUaccessProperty_enum', 'CUaddress_mode', + 'CUaddress_mode__enumvalues', 'CUaddress_mode_enum', 'CUarray', + 'CUarrayMapInfo', 'CUarrayMapInfo_v1', + 'CUarraySparseSubresourceType', 'CUarraySparseSubresourceType__enumvalues', 'CUarraySparseSubresourceType_enum', 'CUarray_cubemap_face', 'CUarray_cubemap_face__enumvalues', 'CUarray_cubemap_face_enum', 'CUarray_format', 'CUarray_format__enumvalues', - 'CUarray_format_enum', 'CUcomputemode', + 'CUarray_format_enum', 'CUclusterSchedulingPolicy', + 'CUclusterSchedulingPolicy__enumvalues', + 'CUclusterSchedulingPolicy_enum', 'CUcomputemode', 
'CUcomputemode__enumvalues', 'CUcomputemode_enum', 'CUcontext', 'CUctx_flags', 'CUctx_flags__enumvalues', 'CUctx_flags_enum', 'CUdevice', 'CUdevice_P2PAttribute', 'CUdevice_P2PAttribute__enumvalues', 'CUdevice_P2PAttribute_enum', 'CUdevice_attribute', 'CUdevice_attribute__enumvalues', 'CUdevice_attribute_enum', 'CUdevice_v1', 'CUdeviceptr', - 'CUdeviceptr_v2', 'CUdevprop', 'CUdevprop_v1', + 'CUdeviceptr_v1', 'CUdeviceptr_v2', 'CUdevprop', 'CUdevprop_v1', + 'CUdriverProcAddressQueryResult', + 'CUdriverProcAddressQueryResult__enumvalues', + 'CUdriverProcAddressQueryResult_enum', 'CUdriverProcAddress_flags', 'CUdriverProcAddress_flags__enumvalues', 'CUdriverProcAddress_flags_enum', 'CUevent', 'CUevent_flags', 'CUevent_flags__enumvalues', 'CUevent_flags_enum', 'CUevent_record_flags', 'CUevent_record_flags__enumvalues', - 'CUevent_record_flags_enum', 'CUevent_wait_flags', - 'CUevent_wait_flags__enumvalues', 'CUevent_wait_flags_enum', - 'CUexecAffinityParam', 'CUexecAffinityParam_v1', - 'CUexecAffinitySmCount', 'CUexecAffinitySmCount_v1', - 'CUexecAffinityType', 'CUexecAffinityType__enumvalues', - 'CUexecAffinityType_enum', 'CUexternalMemory', - 'CUexternalMemoryHandleType', + 'CUevent_record_flags_enum', 'CUevent_sched_flags', + 'CUevent_sched_flags__enumvalues', 'CUevent_sched_flags_enum', + 'CUevent_wait_flags', 'CUevent_wait_flags__enumvalues', + 'CUevent_wait_flags_enum', 'CUexecAffinityParam', + 'CUexecAffinityParam_v1', 'CUexecAffinitySmCount', + 'CUexecAffinitySmCount_v1', 'CUexecAffinityType', + 'CUexecAffinityType__enumvalues', 'CUexecAffinityType_enum', + 'CUexternalMemory', 'CUexternalMemoryHandleType', 'CUexternalMemoryHandleType__enumvalues', 'CUexternalMemoryHandleType_enum', 'CUexternalSemaphore', 'CUexternalSemaphoreHandleType', @@ -5450,8 +7239,11 @@ __all__ = \ 'CUfunction_attribute_enum', 'CUgraph', 'CUgraphDebugDot_flags', 'CUgraphDebugDot_flags__enumvalues', 'CUgraphDebugDot_flags_enum', 'CUgraphExec', 'CUgraphExecUpdateResult', + 
'CUgraphExecUpdateResultInfo', 'CUgraphExecUpdateResultInfo_v1', 'CUgraphExecUpdateResult__enumvalues', - 'CUgraphExecUpdateResult_enum', 'CUgraphInstantiate_flags', + 'CUgraphExecUpdateResult_enum', 'CUgraphInstantiateResult', + 'CUgraphInstantiateResult__enumvalues', + 'CUgraphInstantiateResult_enum', 'CUgraphInstantiate_flags', 'CUgraphInstantiate_flags__enumvalues', 'CUgraphInstantiate_flags_enum', 'CUgraphMem_attribute', 'CUgraphMem_attribute__enumvalues', 'CUgraphMem_attribute_enum', @@ -5470,9 +7262,16 @@ __all__ = \ 'CUjit_fallback', 'CUjit_fallback__enumvalues', 'CUjit_fallback_enum', 'CUjit_option', 'CUjit_option__enumvalues', 'CUjit_option_enum', 'CUjit_target', 'CUjit_target__enumvalues', - 'CUjit_target_enum', 'CUkernelNodeAttrID', - 'CUkernelNodeAttrID__enumvalues', 'CUkernelNodeAttrID_enum', - 'CUkernelNodeAttrValue', 'CUkernelNodeAttrValue_v1', 'CUlimit', + 'CUjit_target_enum', 'CUkernel', 'CUkernelNodeAttrID', + 'CUkernelNodeAttrID__enumvalues', 'CUkernelNodeAttrValue', + 'CUkernelNodeAttrValue_v1', 'CUlaunchAttribute', + 'CUlaunchAttributeID', 'CUlaunchAttributeID__enumvalues', + 'CUlaunchAttributeID_enum', 'CUlaunchAttributeValue', + 'CUlaunchConfig', 'CUlaunchMemSyncDomain', + 'CUlaunchMemSyncDomainMap', 'CUlaunchMemSyncDomain__enumvalues', + 'CUlaunchMemSyncDomain_enum', 'CUlibrary', + 'CUlibraryHostUniversalFunctionAndDataTable', 'CUlibraryOption', + 'CUlibraryOption__enumvalues', 'CUlibraryOption_enum', 'CUlimit', 'CUlimit__enumvalues', 'CUlimit_enum', 'CUlinkState', 'CUmemAccessDesc', 'CUmemAccessDesc_v1', 'CUmemAccess_flags', 'CUmemAccess_flags__enumvalues', 'CUmemAccess_flags_enum', @@ -5497,22 +7296,25 @@ __all__ = \ 'CUmemPoolProps', 'CUmemPoolProps_v1', 'CUmemPoolPtrExportData', 'CUmemPoolPtrExportData_v1', 'CUmemPool_attribute', 'CUmemPool_attribute__enumvalues', 'CUmemPool_attribute_enum', - 'CUmem_advise', 'CUmem_advise__enumvalues', 'CUmem_advise_enum', + 'CUmemRangeHandleType', 'CUmemRangeHandleType__enumvalues', + 
'CUmemRangeHandleType_enum', 'CUmem_advise', + 'CUmem_advise__enumvalues', 'CUmem_advise_enum', 'CUmem_range_attribute', 'CUmem_range_attribute__enumvalues', 'CUmem_range_attribute_enum', 'CUmemoryPool', 'CUmemorytype', 'CUmemorytype__enumvalues', 'CUmemorytype_enum', - 'CUmipmappedArray', 'CUmodule', 'CUoccupancyB2DSize', - 'CUoccupancy_flags', 'CUoccupancy_flags__enumvalues', - 'CUoccupancy_flags_enum', 'CUpointer_attribute', - 'CUpointer_attribute__enumvalues', 'CUpointer_attribute_enum', - 'CUresourceViewFormat', 'CUresourceViewFormat__enumvalues', - 'CUresourceViewFormat_enum', 'CUresourcetype', - 'CUresourcetype__enumvalues', 'CUresourcetype_enum', 'CUresult', - 'CUresult__enumvalues', 'CUshared_carveout', - 'CUshared_carveout__enumvalues', 'CUshared_carveout_enum', - 'CUsharedconfig', 'CUsharedconfig__enumvalues', - 'CUsharedconfig_enum', 'CUstream', 'CUstreamAttrID', - 'CUstreamAttrID__enumvalues', 'CUstreamAttrID_enum', + 'CUmipmappedArray', 'CUmodule', 'CUmoduleLoadingMode', + 'CUmoduleLoadingMode__enumvalues', 'CUmoduleLoadingMode_enum', + 'CUoccupancyB2DSize', 'CUoccupancy_flags', + 'CUoccupancy_flags__enumvalues', 'CUoccupancy_flags_enum', + 'CUpointer_attribute', 'CUpointer_attribute__enumvalues', + 'CUpointer_attribute_enum', 'CUresourceViewFormat', + 'CUresourceViewFormat__enumvalues', 'CUresourceViewFormat_enum', + 'CUresourcetype', 'CUresourcetype__enumvalues', + 'CUresourcetype_enum', 'CUresult', 'CUresult__enumvalues', + 'CUshared_carveout', 'CUshared_carveout__enumvalues', + 'CUshared_carveout_enum', 'CUsharedconfig', + 'CUsharedconfig__enumvalues', 'CUsharedconfig_enum', 'CUstream', + 'CUstreamAttrID', 'CUstreamAttrID__enumvalues', 'CUstreamAttrValue', 'CUstreamAttrValue_v1', 'CUstreamBatchMemOpParams', 'CUstreamBatchMemOpParams_v1', 'CUstreamBatchMemOpType', 'CUstreamBatchMemOpType__enumvalues', @@ -5520,6 +7322,9 @@ __all__ = \ 'CUstreamCaptureMode', 'CUstreamCaptureMode__enumvalues', 'CUstreamCaptureMode_enum', 'CUstreamCaptureStatus', 
'CUstreamCaptureStatus__enumvalues', 'CUstreamCaptureStatus_enum', + 'CUstreamMemoryBarrier_flags', + 'CUstreamMemoryBarrier_flags__enumvalues', + 'CUstreamMemoryBarrier_flags_enum', 'CUstreamUpdateCaptureDependencies_flags', 'CUstreamUpdateCaptureDependencies_flags__enumvalues', 'CUstreamUpdateCaptureDependencies_flags_enum', @@ -5530,26 +7335,46 @@ __all__ = \ 'CUstream_flags__enumvalues', 'CUstream_flags_enum', 'CUsurfObject', 'CUsurfObject_v1', 'CUsurfref', 'CUsynchronizationPolicy', 'CUsynchronizationPolicy__enumvalues', - 'CUsynchronizationPolicy_enum', 'CUtexObject', 'CUtexObject_v1', - 'CUtexref', 'CUuserObject', 'CUuserObjectRetain_flags', + 'CUsynchronizationPolicy_enum', 'CUtensorMap', + 'CUtensorMapDataType', 'CUtensorMapDataType__enumvalues', + 'CUtensorMapDataType_enum', 'CUtensorMapFloatOOBfill', + 'CUtensorMapFloatOOBfill__enumvalues', + 'CUtensorMapFloatOOBfill_enum', 'CUtensorMapInterleave', + 'CUtensorMapInterleave__enumvalues', 'CUtensorMapInterleave_enum', + 'CUtensorMapL2promotion', 'CUtensorMapL2promotion__enumvalues', + 'CUtensorMapL2promotion_enum', 'CUtensorMapSwizzle', + 'CUtensorMapSwizzle__enumvalues', 'CUtensorMapSwizzle_enum', + 'CUtexObject', 'CUtexObject_v1', 'CUtexref', 'CUuserObject', + 'CUuserObjectRetain_flags', 'CUuserObjectRetain_flags__enumvalues', 'CUuserObjectRetain_flags_enum', 'CUuserObject_flags', 'CUuserObject_flags__enumvalues', 'CUuserObject_flags_enum', - 'CUuuid', 'cuArray3DCreate_v2', 'cuArray3DGetDescriptor_v2', - 'cuArrayCreate_v2', 'cuArrayDestroy', 'cuArrayGetDescriptor_v2', + 'CUuuid', 'NVCL_CTX_SCHED_AUTO', 'NVCL_CTX_SCHED_BLOCKING_SYNC', + 'NVCL_CTX_SCHED_SPIN', 'NVCL_CTX_SCHED_YIELD', + 'NVCL_EVENT_SCHED_AUTO', 'NVCL_EVENT_SCHED_BLOCKING_SYNC', + 'NVCL_EVENT_SCHED_SPIN', 'NVCL_EVENT_SCHED_YIELD', + 'cl_context_flags', 'cl_context_flags__enumvalues', + 'cl_context_flags_enum', 'cl_event_flags', + 'cl_event_flags__enumvalues', 'cl_event_flags_enum', + 'cuArray3DCreate', 'cuArray3DCreate_v2', 
'cuArray3DGetDescriptor', + 'cuArray3DGetDescriptor_v2', 'cuArrayCreate', 'cuArrayCreate_v2', + 'cuArrayDestroy', 'cuArrayGetDescriptor', + 'cuArrayGetDescriptor_v2', 'cuArrayGetMemoryRequirements', 'cuArrayGetPlane', 'cuArrayGetSparseProperties', 'cuCtxAttach', - 'cuCtxCreate_v2', 'cuCtxCreate_v3', 'cuCtxDestroy_v2', - 'cuCtxDetach', 'cuCtxDisablePeerAccess', 'cuCtxEnablePeerAccess', - 'cuCtxGetApiVersion', 'cuCtxGetCacheConfig', 'cuCtxGetCurrent', - 'cuCtxGetDevice', 'cuCtxGetExecAffinity', 'cuCtxGetFlags', + 'cuCtxCreate', 'cuCtxCreate_v2', 'cuCtxCreate_v3', 'cuCtxDestroy', + 'cuCtxDestroy_v2', 'cuCtxDetach', 'cuCtxDisablePeerAccess', + 'cuCtxEnablePeerAccess', 'cuCtxGetApiVersion', + 'cuCtxGetCacheConfig', 'cuCtxGetCurrent', 'cuCtxGetDevice', + 'cuCtxGetExecAffinity', 'cuCtxGetFlags', 'cuCtxGetId', 'cuCtxGetLimit', 'cuCtxGetSharedMemConfig', - 'cuCtxGetStreamPriorityRange', 'cuCtxPopCurrent_v2', - 'cuCtxPushCurrent_v2', 'cuCtxResetPersistingL2Cache', - 'cuCtxSetCacheConfig', 'cuCtxSetCurrent', 'cuCtxSetLimit', - 'cuCtxSetSharedMemConfig', 'cuCtxSynchronize', - 'cuDestroyExternalMemory', 'cuDestroyExternalSemaphore', - 'cuDeviceCanAccessPeer', 'cuDeviceComputeCapability', - 'cuDeviceGet', 'cuDeviceGetAttribute', 'cuDeviceGetByPCIBusId', + 'cuCtxGetStreamPriorityRange', 'cuCtxPopCurrent', + 'cuCtxPopCurrent_v2', 'cuCtxPushCurrent', 'cuCtxPushCurrent_v2', + 'cuCtxResetPersistingL2Cache', 'cuCtxSetCacheConfig', + 'cuCtxSetCurrent', 'cuCtxSetLimit', 'cuCtxSetSharedMemConfig', + 'cuCtxSynchronize', 'cuDestroyExternalMemory', + 'cuDestroyExternalSemaphore', 'cuDeviceCanAccessPeer', + 'cuDeviceComputeCapability', 'cuDeviceGet', + 'cuDeviceGetAttribute', 'cuDeviceGetByPCIBusId', 'cuDeviceGetCount', 'cuDeviceGetDefaultMemPool', 'cuDeviceGetExecAffinitySupport', 'cuDeviceGetGraphMemAttribute', 'cuDeviceGetLuid', 'cuDeviceGetMemPool', 'cuDeviceGetName', @@ -5557,149 +7382,239 @@ __all__ = \ 'cuDeviceGetPCIBusId', 'cuDeviceGetProperties', 
'cuDeviceGetTexture1DLinearMaxWidth', 'cuDeviceGetUuid', 'cuDeviceGetUuid_v2', 'cuDeviceGraphMemTrim', - 'cuDevicePrimaryCtxGetState', 'cuDevicePrimaryCtxRelease_v2', + 'cuDevicePrimaryCtxGetState', 'cuDevicePrimaryCtxRelease', + 'cuDevicePrimaryCtxRelease_v2', 'cuDevicePrimaryCtxReset', 'cuDevicePrimaryCtxReset_v2', 'cuDevicePrimaryCtxRetain', - 'cuDevicePrimaryCtxSetFlags_v2', 'cuDeviceSetGraphMemAttribute', - 'cuDeviceSetMemPool', 'cuDeviceTotalMem_v2', 'cuDriverGetVersion', - 'cuEventCreate', 'cuEventDestroy_v2', 'cuEventElapsedTime', - 'cuEventQuery', 'cuEventRecord', 'cuEventRecordWithFlags', - 'cuEventSynchronize', 'cuExternalMemoryGetMappedBuffer', + 'cuDevicePrimaryCtxSetFlags', 'cuDevicePrimaryCtxSetFlags_v2', + 'cuDeviceSetGraphMemAttribute', 'cuDeviceSetMemPool', + 'cuDeviceTotalMem', 'cuDeviceTotalMem_v2', 'cuDriverGetVersion', + 'cuEventCreate', 'cuEventDestroy', 'cuEventDestroy_v2', + 'cuEventElapsedTime', 'cuEventQuery', 'cuEventRecord', + 'cuEventRecordWithFlags', 'cuEventRecordWithFlags_ptsz', + 'cuEventRecord_ptsz', 'cuEventSynchronize', + 'cuExternalMemoryGetMappedBuffer', 'cuExternalMemoryGetMappedMipmappedArray', 'cuFlushGPUDirectRDMAWrites', 'cuFuncGetAttribute', 'cuFuncGetModule', 'cuFuncSetAttribute', 'cuFuncSetBlockShape', 'cuFuncSetCacheConfig', 'cuFuncSetSharedMemConfig', 'cuFuncSetSharedSize', 'cuGetErrorName', 'cuGetErrorString', - 'cuGetExportTable', 'cuGetProcAddress', - 'cuGraphAddChildGraphNode', 'cuGraphAddDependencies', - 'cuGraphAddEmptyNode', 'cuGraphAddEventRecordNode', - 'cuGraphAddEventWaitNode', + 'cuGetExportTable', 'cuGetProcAddress', 'cuGetProcAddress_v2', + 'cuGraphAddBatchMemOpNode', 'cuGraphAddChildGraphNode', + 'cuGraphAddDependencies', 'cuGraphAddEmptyNode', + 'cuGraphAddEventRecordNode', 'cuGraphAddEventWaitNode', 'cuGraphAddExternalSemaphoresSignalNode', 'cuGraphAddExternalSemaphoresWaitNode', 'cuGraphAddHostNode', - 'cuGraphAddKernelNode', 'cuGraphAddMemAllocNode', - 'cuGraphAddMemFreeNode', 
'cuGraphAddMemcpyNode', - 'cuGraphAddMemsetNode', 'cuGraphChildGraphNodeGetGraph', + 'cuGraphAddKernelNode', 'cuGraphAddKernelNode_v2', + 'cuGraphAddMemAllocNode', 'cuGraphAddMemFreeNode', + 'cuGraphAddMemcpyNode', 'cuGraphAddMemsetNode', + 'cuGraphBatchMemOpNodeGetParams', + 'cuGraphBatchMemOpNodeSetParams', 'cuGraphChildGraphNodeGetGraph', 'cuGraphClone', 'cuGraphCreate', 'cuGraphDebugDotPrint', 'cuGraphDestroy', 'cuGraphDestroyNode', 'cuGraphEventRecordNodeGetEvent', 'cuGraphEventRecordNodeSetEvent', 'cuGraphEventWaitNodeGetEvent', 'cuGraphEventWaitNodeSetEvent', + 'cuGraphExecBatchMemOpNodeSetParams', 'cuGraphExecChildGraphNodeSetParams', 'cuGraphExecDestroy', 'cuGraphExecEventRecordNodeSetEvent', 'cuGraphExecEventWaitNodeSetEvent', 'cuGraphExecExternalSemaphoresSignalNodeSetParams', 'cuGraphExecExternalSemaphoresWaitNodeSetParams', - 'cuGraphExecHostNodeSetParams', 'cuGraphExecKernelNodeSetParams', + 'cuGraphExecGetFlags', 'cuGraphExecHostNodeSetParams', + 'cuGraphExecKernelNodeSetParams', + 'cuGraphExecKernelNodeSetParams_v2', 'cuGraphExecMemcpyNodeSetParams', 'cuGraphExecMemsetNodeSetParams', 'cuGraphExecUpdate', + 'cuGraphExecUpdate_v2', 'cuGraphExternalSemaphoresSignalNodeGetParams', 'cuGraphExternalSemaphoresSignalNodeSetParams', 'cuGraphExternalSemaphoresWaitNodeGetParams', 'cuGraphExternalSemaphoresWaitNodeSetParams', 'cuGraphGetEdges', 'cuGraphGetNodes', 'cuGraphGetRootNodes', 'cuGraphHostNodeGetParams', 'cuGraphHostNodeSetParams', - 'cuGraphInstantiateWithFlags', 'cuGraphInstantiate_v2', + 'cuGraphInstantiate', 'cuGraphInstantiateWithFlags', + 'cuGraphInstantiateWithParams', + 'cuGraphInstantiateWithParams_ptsz', 'cuGraphInstantiate_v2', 'cuGraphKernelNodeCopyAttributes', 'cuGraphKernelNodeGetAttribute', 'cuGraphKernelNodeGetParams', - 'cuGraphKernelNodeSetAttribute', 'cuGraphKernelNodeSetParams', - 'cuGraphLaunch', 'cuGraphMemAllocNodeGetParams', - 'cuGraphMemFreeNodeGetParams', 'cuGraphMemcpyNodeGetParams', - 'cuGraphMemcpyNodeSetParams', 
'cuGraphMemsetNodeGetParams', - 'cuGraphMemsetNodeSetParams', 'cuGraphNodeFindInClone', - 'cuGraphNodeGetDependencies', 'cuGraphNodeGetDependentNodes', - 'cuGraphNodeGetType', 'cuGraphReleaseUserObject', - 'cuGraphRemoveDependencies', 'cuGraphRetainUserObject', - 'cuGraphUpload', 'cuGraphicsMapResources', + 'cuGraphKernelNodeGetParams_v2', 'cuGraphKernelNodeSetAttribute', + 'cuGraphKernelNodeSetParams', 'cuGraphKernelNodeSetParams_v2', + 'cuGraphLaunch', 'cuGraphLaunch_ptsz', + 'cuGraphMemAllocNodeGetParams', 'cuGraphMemFreeNodeGetParams', + 'cuGraphMemcpyNodeGetParams', 'cuGraphMemcpyNodeSetParams', + 'cuGraphMemsetNodeGetParams', 'cuGraphMemsetNodeSetParams', + 'cuGraphNodeFindInClone', 'cuGraphNodeGetDependencies', + 'cuGraphNodeGetDependentNodes', 'cuGraphNodeGetEnabled', + 'cuGraphNodeGetType', 'cuGraphNodeSetEnabled', + 'cuGraphReleaseUserObject', 'cuGraphRemoveDependencies', + 'cuGraphRetainUserObject', 'cuGraphUpload', 'cuGraphUpload_ptsz', + 'cuGraphicsMapResources', 'cuGraphicsMapResources_ptsz', 'cuGraphicsResourceGetMappedMipmappedArray', + 'cuGraphicsResourceGetMappedPointer', 'cuGraphicsResourceGetMappedPointer_v2', + 'cuGraphicsResourceSetMapFlags', 'cuGraphicsResourceSetMapFlags_v2', 'cuGraphicsSubResourceGetMappedArray', 'cuGraphicsUnmapResources', - 'cuGraphicsUnregisterResource', 'cuImportExternalMemory', - 'cuImportExternalSemaphore', 'cuInit', 'cuIpcCloseMemHandle', - 'cuIpcGetEventHandle', 'cuIpcGetMemHandle', - 'cuIpcOpenEventHandle', 'cuIpcOpenMemHandle_v2', 'cuLaunch', - 'cuLaunchCooperativeKernel', - 'cuLaunchCooperativeKernelMultiDevice', 'cuLaunchGrid', - 'cuLaunchGridAsync', 'cuLaunchHostFunc', 'cuLaunchKernel', - 'cuLinkAddData_v2', 'cuLinkAddFile_v2', 'cuLinkComplete', - 'cuLinkCreate_v2', 'cuLinkDestroy', 'cuMemAddressFree', - 'cuMemAddressReserve', 'cuMemAdvise', 'cuMemAllocAsync', - 'cuMemAllocFromPoolAsync', 'cuMemAllocHost_v2', - 'cuMemAllocManaged', 'cuMemAllocPitch_v2', 'cuMemAlloc_v2', - 'cuMemCreate', 
'cuMemExportToShareableHandle', 'cuMemFreeAsync', - 'cuMemFreeHost', 'cuMemFree_v2', 'cuMemGetAccess', + 'cuGraphicsUnmapResources_ptsz', 'cuGraphicsUnregisterResource', + 'cuImportExternalMemory', 'cuImportExternalSemaphore', 'cuInit', + 'cuIpcCloseMemHandle', 'cuIpcGetEventHandle', 'cuIpcGetMemHandle', + 'cuIpcOpenEventHandle', 'cuIpcOpenMemHandle', + 'cuIpcOpenMemHandle_v2', 'cuKernelGetAttribute', + 'cuKernelGetFunction', 'cuKernelSetAttribute', + 'cuKernelSetCacheConfig', 'cuLaunch', 'cuLaunchCooperativeKernel', + 'cuLaunchCooperativeKernelMultiDevice', + 'cuLaunchCooperativeKernel_ptsz', 'cuLaunchGrid', + 'cuLaunchGridAsync', 'cuLaunchHostFunc', 'cuLaunchHostFunc_ptsz', + 'cuLaunchKernel', 'cuLaunchKernelEx', 'cuLaunchKernelEx_ptsz', + 'cuLaunchKernel_ptsz', 'cuLibraryGetGlobal', 'cuLibraryGetKernel', + 'cuLibraryGetManaged', 'cuLibraryGetModule', + 'cuLibraryGetUnifiedFunction', 'cuLibraryLoadData', + 'cuLibraryLoadFromFile', 'cuLibraryUnload', 'cuLinkAddData', + 'cuLinkAddData_v2', 'cuLinkAddFile', 'cuLinkAddFile_v2', + 'cuLinkComplete', 'cuLinkCreate', 'cuLinkCreate_v2', + 'cuLinkDestroy', 'cuMemAddressFree', 'cuMemAddressReserve', + 'cuMemAdvise', 'cuMemAlloc', 'cuMemAllocAsync', + 'cuMemAllocAsync_ptsz', 'cuMemAllocFromPoolAsync', + 'cuMemAllocFromPoolAsync_ptsz', 'cuMemAllocHost', + 'cuMemAllocHost_v2', 'cuMemAllocManaged', 'cuMemAllocPitch', + 'cuMemAllocPitch_v2', 'cuMemAlloc_v2', 'cuMemCreate', + 'cuMemExportToShareableHandle', 'cuMemFree', 'cuMemFreeAsync', + 'cuMemFreeAsync_ptsz', 'cuMemFreeHost', 'cuMemFree_v2', + 'cuMemGetAccess', 'cuMemGetAddressRange', 'cuMemGetAddressRange_v2', 'cuMemGetAllocationGranularity', - 'cuMemGetAllocationPropertiesFromHandle', 'cuMemGetInfo_v2', - 'cuMemHostAlloc', 'cuMemHostGetDevicePointer_v2', - 'cuMemHostGetFlags', 'cuMemHostRegister_v2', + 'cuMemGetAllocationPropertiesFromHandle', + 'cuMemGetHandleForAddressRange', 'cuMemGetInfo', + 'cuMemGetInfo_v2', 'cuMemHostAlloc', 'cuMemHostGetDevicePointer', + 
'cuMemHostGetDevicePointer_v2', 'cuMemHostGetFlags', + 'cuMemHostRegister', 'cuMemHostRegister_v2', 'cuMemHostUnregister', 'cuMemImportFromShareableHandle', - 'cuMemMap', 'cuMemMapArrayAsync', 'cuMemPoolCreate', - 'cuMemPoolDestroy', 'cuMemPoolExportPointer', + 'cuMemMap', 'cuMemMapArrayAsync', 'cuMemMapArrayAsync_ptsz', + 'cuMemPoolCreate', 'cuMemPoolDestroy', 'cuMemPoolExportPointer', 'cuMemPoolExportToShareableHandle', 'cuMemPoolGetAccess', 'cuMemPoolGetAttribute', 'cuMemPoolImportFromShareableHandle', 'cuMemPoolImportPointer', 'cuMemPoolSetAccess', 'cuMemPoolSetAttribute', 'cuMemPoolTrimTo', 'cuMemPrefetchAsync', - 'cuMemRangeGetAttribute', 'cuMemRangeGetAttributes', - 'cuMemRelease', 'cuMemRetainAllocationHandle', 'cuMemSetAccess', - 'cuMemUnmap', 'cuMemcpy', 'cuMemcpy2DAsync_v2', - 'cuMemcpy2DUnaligned_v2', 'cuMemcpy2D_v2', 'cuMemcpy3DAsync_v2', - 'cuMemcpy3DPeer', 'cuMemcpy3DPeerAsync', 'cuMemcpy3D_v2', - 'cuMemcpyAsync', 'cuMemcpyAtoA_v2', 'cuMemcpyAtoD_v2', - 'cuMemcpyAtoHAsync_v2', 'cuMemcpyAtoH_v2', 'cuMemcpyDtoA_v2', - 'cuMemcpyDtoDAsync_v2', 'cuMemcpyDtoD_v2', 'cuMemcpyDtoHAsync_v2', - 'cuMemcpyDtoH_v2', 'cuMemcpyHtoAAsync_v2', 'cuMemcpyHtoA_v2', - 'cuMemcpyHtoDAsync_v2', 'cuMemcpyHtoD_v2', 'cuMemcpyPeer', - 'cuMemcpyPeerAsync', 'cuMemsetD16Async', 'cuMemsetD16_v2', - 'cuMemsetD2D16Async', 'cuMemsetD2D16_v2', 'cuMemsetD2D32Async', - 'cuMemsetD2D32_v2', 'cuMemsetD2D8Async', 'cuMemsetD2D8_v2', - 'cuMemsetD32Async', 'cuMemsetD32_v2', 'cuMemsetD8Async', - 'cuMemsetD8_v2', 'cuMipmappedArrayCreate', + 'cuMemPrefetchAsync_ptsz', 'cuMemRangeGetAttribute', + 'cuMemRangeGetAttributes', 'cuMemRelease', + 'cuMemRetainAllocationHandle', 'cuMemSetAccess', 'cuMemUnmap', + 'cuMemcpy', 'cuMemcpy2D', 'cuMemcpy2DAsync', 'cuMemcpy2DAsync_v2', + 'cuMemcpy2DAsync_v2_ptsz', 'cuMemcpy2DUnaligned', + 'cuMemcpy2DUnaligned_v2', 'cuMemcpy2DUnaligned_v2_ptds', + 'cuMemcpy2D_v2', 'cuMemcpy2D_v2_ptds', 'cuMemcpy3D', + 'cuMemcpy3DAsync', 'cuMemcpy3DAsync_v2', + 
'cuMemcpy3DAsync_v2_ptsz', 'cuMemcpy3DPeer', + 'cuMemcpy3DPeerAsync', 'cuMemcpy3DPeerAsync_ptsz', + 'cuMemcpy3DPeer_ptds', 'cuMemcpy3D_v2', 'cuMemcpy3D_v2_ptds', + 'cuMemcpyAsync', 'cuMemcpyAsync_ptsz', 'cuMemcpyAtoA', + 'cuMemcpyAtoA_v2', 'cuMemcpyAtoA_v2_ptds', 'cuMemcpyAtoD', + 'cuMemcpyAtoD_v2', 'cuMemcpyAtoD_v2_ptds', 'cuMemcpyAtoH', + 'cuMemcpyAtoHAsync', 'cuMemcpyAtoHAsync_v2', + 'cuMemcpyAtoHAsync_v2_ptsz', 'cuMemcpyAtoH_v2', + 'cuMemcpyAtoH_v2_ptds', 'cuMemcpyDtoA', 'cuMemcpyDtoA_v2', + 'cuMemcpyDtoA_v2_ptds', 'cuMemcpyDtoD', 'cuMemcpyDtoDAsync', + 'cuMemcpyDtoDAsync_v2', 'cuMemcpyDtoDAsync_v2_ptsz', + 'cuMemcpyDtoD_v2', 'cuMemcpyDtoD_v2_ptds', 'cuMemcpyDtoH', + 'cuMemcpyDtoHAsync', 'cuMemcpyDtoHAsync_v2', + 'cuMemcpyDtoHAsync_v2_ptsz', 'cuMemcpyDtoH_v2', + 'cuMemcpyDtoH_v2_ptds', 'cuMemcpyHtoA', 'cuMemcpyHtoAAsync', + 'cuMemcpyHtoAAsync_v2', 'cuMemcpyHtoAAsync_v2_ptsz', + 'cuMemcpyHtoA_v2', 'cuMemcpyHtoA_v2_ptds', 'cuMemcpyHtoD', + 'cuMemcpyHtoDAsync', 'cuMemcpyHtoDAsync_v2', + 'cuMemcpyHtoDAsync_v2_ptsz', 'cuMemcpyHtoD_v2', + 'cuMemcpyHtoD_v2_ptds', 'cuMemcpyPeer', 'cuMemcpyPeerAsync', + 'cuMemcpyPeerAsync_ptsz', 'cuMemcpyPeer_ptds', 'cuMemcpy_ptds', + 'cuMemsetD16', 'cuMemsetD16Async', 'cuMemsetD16Async_ptsz', + 'cuMemsetD16_v2', 'cuMemsetD16_v2_ptds', 'cuMemsetD2D16', + 'cuMemsetD2D16Async', 'cuMemsetD2D16Async_ptsz', + 'cuMemsetD2D16_v2', 'cuMemsetD2D16_v2_ptds', 'cuMemsetD2D32', + 'cuMemsetD2D32Async', 'cuMemsetD2D32Async_ptsz', + 'cuMemsetD2D32_v2', 'cuMemsetD2D32_v2_ptds', 'cuMemsetD2D8', + 'cuMemsetD2D8Async', 'cuMemsetD2D8Async_ptsz', 'cuMemsetD2D8_v2', + 'cuMemsetD2D8_v2_ptds', 'cuMemsetD32', 'cuMemsetD32Async', + 'cuMemsetD32Async_ptsz', 'cuMemsetD32_v2', 'cuMemsetD32_v2_ptds', + 'cuMemsetD8', 'cuMemsetD8Async', 'cuMemsetD8Async_ptsz', + 'cuMemsetD8_v2', 'cuMemsetD8_v2_ptds', 'cuMipmappedArrayCreate', 'cuMipmappedArrayDestroy', 'cuMipmappedArrayGetLevel', + 'cuMipmappedArrayGetMemoryRequirements', 'cuMipmappedArrayGetSparseProperties', 
'cuModuleGetFunction', - 'cuModuleGetGlobal_v2', 'cuModuleGetSurfRef', 'cuModuleGetTexRef', - 'cuModuleLoad', 'cuModuleLoadData', 'cuModuleLoadDataEx', - 'cuModuleLoadFatBinary', 'cuModuleUnload', + 'cuModuleGetGlobal', 'cuModuleGetGlobal_v2', + 'cuModuleGetLoadingMode', 'cuModuleGetSurfRef', + 'cuModuleGetTexRef', 'cuModuleLoad', 'cuModuleLoadData', + 'cuModuleLoadDataEx', 'cuModuleLoadFatBinary', 'cuModuleUnload', 'cuOccupancyAvailableDynamicSMemPerBlock', 'cuOccupancyMaxActiveBlocksPerMultiprocessor', 'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags', + 'cuOccupancyMaxActiveClusters', 'cuOccupancyMaxPotentialBlockSize', - 'cuOccupancyMaxPotentialBlockSizeWithFlags', 'cuParamSetSize', + 'cuOccupancyMaxPotentialBlockSizeWithFlags', + 'cuOccupancyMaxPotentialClusterSize', 'cuParamSetSize', 'cuParamSetTexRef', 'cuParamSetf', 'cuParamSeti', 'cuParamSetv', 'cuPointerGetAttribute', 'cuPointerGetAttributes', 'cuPointerSetAttribute', 'cuSignalExternalSemaphoresAsync', - 'cuStreamAddCallback', 'cuStreamAttachMemAsync', - 'cuStreamBatchMemOp', 'cuStreamBeginCapture_v2', - 'cuStreamCopyAttributes', 'cuStreamCreate', - 'cuStreamCreateWithPriority', 'cuStreamDestroy_v2', - 'cuStreamEndCapture', 'cuStreamGetAttribute', - 'cuStreamGetCaptureInfo', 'cuStreamGetCaptureInfo_v2', - 'cuStreamGetCtx', 'cuStreamGetFlags', 'cuStreamGetPriority', - 'cuStreamIsCapturing', 'cuStreamQuery', 'cuStreamSetAttribute', - 'cuStreamSynchronize', 'cuStreamUpdateCaptureDependencies', - 'cuStreamWaitEvent', 'cuStreamWaitValue32', 'cuStreamWaitValue64', - 'cuStreamWriteValue32', 'cuStreamWriteValue64', - 'cuSurfObjectCreate', 'cuSurfObjectDestroy', - 'cuSurfObjectGetResourceDesc', 'cuSurfRefGetArray', - 'cuSurfRefSetArray', 'cuTexObjectCreate', 'cuTexObjectDestroy', - 'cuTexObjectGetResourceDesc', 'cuTexObjectGetResourceViewDesc', - 'cuTexObjectGetTextureDesc', 'cuTexRefCreate', 'cuTexRefDestroy', + 'cuSignalExternalSemaphoresAsync_ptsz', 'cuStreamAddCallback', + 'cuStreamAddCallback_ptsz', 
'cuStreamAttachMemAsync', + 'cuStreamAttachMemAsync_ptsz', 'cuStreamBatchMemOp', + 'cuStreamBatchMemOp_ptsz', 'cuStreamBatchMemOp_v2', + 'cuStreamBatchMemOp_v2_ptsz', 'cuStreamBeginCapture', + 'cuStreamBeginCapture_ptsz', 'cuStreamBeginCapture_v2', + 'cuStreamBeginCapture_v2_ptsz', 'cuStreamCopyAttributes', + 'cuStreamCopyAttributes_ptsz', 'cuStreamCreate', + 'cuStreamCreateWithPriority', 'cuStreamDestroy', + 'cuStreamDestroy_v2', 'cuStreamEndCapture', + 'cuStreamEndCapture_ptsz', 'cuStreamGetAttribute', + 'cuStreamGetAttribute_ptsz', 'cuStreamGetCaptureInfo', + 'cuStreamGetCaptureInfo_ptsz', 'cuStreamGetCaptureInfo_v2', + 'cuStreamGetCaptureInfo_v2_ptsz', 'cuStreamGetCtx', + 'cuStreamGetCtx_ptsz', 'cuStreamGetFlags', + 'cuStreamGetFlags_ptsz', 'cuStreamGetId', 'cuStreamGetId_ptsz', + 'cuStreamGetPriority', 'cuStreamGetPriority_ptsz', + 'cuStreamIsCapturing', 'cuStreamIsCapturing_ptsz', + 'cuStreamQuery', 'cuStreamQuery_ptsz', 'cuStreamSetAttribute', + 'cuStreamSetAttribute_ptsz', 'cuStreamSynchronize', + 'cuStreamSynchronize_ptsz', 'cuStreamUpdateCaptureDependencies', + 'cuStreamUpdateCaptureDependencies_ptsz', 'cuStreamWaitEvent', + 'cuStreamWaitEvent_ptsz', 'cuStreamWaitValue32', + 'cuStreamWaitValue32_ptsz', 'cuStreamWaitValue32_v2', + 'cuStreamWaitValue32_v2_ptsz', 'cuStreamWaitValue64', + 'cuStreamWaitValue64_ptsz', 'cuStreamWaitValue64_v2', + 'cuStreamWaitValue64_v2_ptsz', 'cuStreamWriteValue32', + 'cuStreamWriteValue32_ptsz', 'cuStreamWriteValue32_v2', + 'cuStreamWriteValue32_v2_ptsz', 'cuStreamWriteValue64', + 'cuStreamWriteValue64_ptsz', 'cuStreamWriteValue64_v2', + 'cuStreamWriteValue64_v2_ptsz', 'cuSurfObjectCreate', + 'cuSurfObjectDestroy', 'cuSurfObjectGetResourceDesc', + 'cuSurfRefGetArray', 'cuSurfRefSetArray', + 'cuTensorMapEncodeIm2col', 'cuTensorMapEncodeTiled', + 'cuTensorMapReplaceAddress', 'cuTexObjectCreate', + 'cuTexObjectDestroy', 'cuTexObjectGetResourceDesc', + 'cuTexObjectGetResourceViewDesc', 'cuTexObjectGetTextureDesc', + 
'cuTexRefCreate', 'cuTexRefDestroy', 'cuTexRefGetAddress', 'cuTexRefGetAddressMode', 'cuTexRefGetAddress_v2', 'cuTexRefGetArray', 'cuTexRefGetBorderColor', 'cuTexRefGetFilterMode', 'cuTexRefGetFlags', 'cuTexRefGetFormat', 'cuTexRefGetMaxAnisotropy', 'cuTexRefGetMipmapFilterMode', 'cuTexRefGetMipmapLevelBias', 'cuTexRefGetMipmapLevelClamp', - 'cuTexRefGetMipmappedArray', 'cuTexRefSetAddress2D_v3', - 'cuTexRefSetAddressMode', 'cuTexRefSetAddress_v2', - 'cuTexRefSetArray', 'cuTexRefSetBorderColor', - 'cuTexRefSetFilterMode', 'cuTexRefSetFlags', 'cuTexRefSetFormat', + 'cuTexRefGetMipmappedArray', 'cuTexRefSetAddress', + 'cuTexRefSetAddress2D', 'cuTexRefSetAddress2D_v2', + 'cuTexRefSetAddress2D_v3', 'cuTexRefSetAddressMode', + 'cuTexRefSetAddress_v2', 'cuTexRefSetArray', + 'cuTexRefSetBorderColor', 'cuTexRefSetFilterMode', + 'cuTexRefSetFlags', 'cuTexRefSetFormat', 'cuTexRefSetMaxAnisotropy', 'cuTexRefSetMipmapFilterMode', 'cuTexRefSetMipmapLevelBias', 'cuTexRefSetMipmapLevelClamp', 'cuTexRefSetMipmappedArray', 'cuThreadExchangeStreamCaptureMode', 'cuUserObjectCreate', 'cuUserObjectRelease', 'cuUserObjectRetain', - 'cuWaitExternalSemaphoresAsync', 'cudaError_enum', 'cuuint32_t', - 'cuuint64_t', 'size_t', 'struct_CUDA_ARRAY3D_DESCRIPTOR_st', + 'cuWaitExternalSemaphoresAsync', + 'cuWaitExternalSemaphoresAsync_ptsz', 'cudaError_enum', + 'cuuint32_t', 'cuuint64_t', 'size_t', + 'struct_CUDA_ARRAY3D_DESCRIPTOR_st', + 'struct_CUDA_ARRAY3D_DESCRIPTOR_v1_st', 'struct_CUDA_ARRAY_DESCRIPTOR_st', + 'struct_CUDA_ARRAY_DESCRIPTOR_v1_st', + 'struct_CUDA_ARRAY_MEMORY_REQUIREMENTS_st', 'struct_CUDA_ARRAY_SPARSE_PROPERTIES_st', 'struct_CUDA_ARRAY_SPARSE_PROPERTIES_st_tileExtent', + 'struct_CUDA_BATCH_MEM_OP_NODE_PARAMS_st', 'struct_CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st', 'struct_CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st', 'struct_CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st_0_win32', @@ -5716,10 +7631,13 @@ __all__ = \ 'struct_CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st_params', 
'struct_CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st', 'struct_CUDA_EXT_SEM_WAIT_NODE_PARAMS_st', + 'struct_CUDA_GRAPH_INSTANTIATE_PARAMS_st', 'struct_CUDA_HOST_NODE_PARAMS_st', 'struct_CUDA_KERNEL_NODE_PARAMS_st', + 'struct_CUDA_KERNEL_NODE_PARAMS_v2_st', 'struct_CUDA_LAUNCH_PARAMS_st', 'struct_CUDA_MEMCPY2D_st', - 'struct_CUDA_MEMCPY3D_PEER_st', 'struct_CUDA_MEMCPY3D_st', + 'struct_CUDA_MEMCPY2D_v1_st', 'struct_CUDA_MEMCPY3D_PEER_st', + 'struct_CUDA_MEMCPY3D_st', 'struct_CUDA_MEMCPY3D_v1_st', 'struct_CUDA_MEMSET_NODE_PARAMS_st', 'struct_CUDA_MEM_ALLOC_NODE_PARAMS_st', 'struct_CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st', @@ -5737,9 +7655,15 @@ __all__ = \ 'struct_CUexecAffinityParam_st', 'struct_CUexecAffinitySmCount_st', 'struct_CUextMemory_st', 'struct_CUextSemaphore_st', 'struct_CUfunc_st', - 'struct_CUgraphExec_st', 'struct_CUgraphNode_st', - 'struct_CUgraph_st', 'struct_CUgraphicsResource_st', - 'struct_CUipcEventHandle_st', 'struct_CUipcMemHandle_st', + 'struct_CUgraphExecUpdateResultInfo_st', 'struct_CUgraphExec_st', + 'struct_CUgraphNode_st', 'struct_CUgraph_st', + 'struct_CUgraphicsResource_st', 'struct_CUipcEventHandle_st', + 'struct_CUipcMemHandle_st', 'struct_CUkern_st', + 'struct_CUlaunchAttributeValue_union_clusterDim', + 'struct_CUlaunchAttributeValue_union_programmaticEvent', + 'struct_CUlaunchAttribute_st', 'struct_CUlaunchConfig_st', + 'struct_CUlaunchMemSyncDomainMap_st', 'struct_CUlib_st', + 'struct_CUlibraryHostUniversalFunctionAndDataTable_st', 'struct_CUlinkState_st', 'struct_CUmemAccessDesc_st', 'struct_CUmemAllocationProp_st', 'struct_CUmemAllocationProp_st_allocFlags', @@ -5747,10 +7671,12 @@ __all__ = \ 'struct_CUmemPoolProps_st', 'struct_CUmemPoolPtrExportData_st', 'struct_CUmipmappedArray_st', 'struct_CUmod_st', 'struct_CUstreamMemOpFlushRemoteWritesParams_st', + 'struct_CUstreamMemOpMemoryBarrierParams_st', 'struct_CUstreamMemOpWaitValueParams_st', 'struct_CUstreamMemOpWriteValueParams_st', 'struct_CUstream_st', - 'struct_CUsurfref_st', 
'struct_CUtexref_st', - 'struct_CUuserObject_st', 'struct_CUuuid_st', + 'struct_CUsurfref_st', 'struct_CUtensorMap_st', + 'struct_CUtexref_st', 'struct_CUuserObject_st', + 'struct_CUuuid_st', 'union_CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st_handle', 'union_CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st_handle', 'union_CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st_0_nvSciSync', @@ -5760,8 +7686,7 @@ __all__ = \ 'union_CUarrayMapInfo_st_resource', 'union_CUarrayMapInfo_st_subresource', 'union_CUexecAffinityParam_st_param', - 'union_CUkernelNodeAttrValue_union', - 'union_CUstreamAttrValue_union', + 'union_CUlaunchAttributeValue_union', 'union_CUstreamBatchMemOpParams_union', 'union_CUstreamMemOpWaitValueParams_st_0', 'union_CUstreamMemOpWriteValueParams_st_0'] diff --git a/tinygrad_repo/tinygrad/runtime/autogen/hsa.py b/tinygrad_repo/tinygrad/runtime/autogen/hsa.py index bf73137..bb65911 100644 --- a/tinygrad_repo/tinygrad/runtime/autogen/hsa.py +++ b/tinygrad_repo/tinygrad/runtime/autogen/hsa.py @@ -372,7 +372,8 @@ c__EA_hsa_extension_t__enumvalues = { 512: 'HSA_EXTENSION_AMD_PROFILER', 513: 'HSA_EXTENSION_AMD_LOADER', 514: 'HSA_EXTENSION_AMD_AQLPROFILE', - 514: 'HSA_AMD_LAST_EXTENSION', + 515: 'HSA_EXTENSION_AMD_PC_SAMPLING', + 515: 'HSA_AMD_LAST_EXTENSION', } HSA_EXTENSION_FINALIZER = 0 HSA_EXTENSION_IMAGES = 1 @@ -383,7 +384,8 @@ HSA_AMD_FIRST_EXTENSION = 512 HSA_EXTENSION_AMD_PROFILER = 512 HSA_EXTENSION_AMD_LOADER = 513 HSA_EXTENSION_AMD_AQLPROFILE = 514 -HSA_AMD_LAST_EXTENSION = 514 +HSA_EXTENSION_AMD_PC_SAMPLING = 515 +HSA_AMD_LAST_EXTENSION = 515 c__EA_hsa_extension_t = ctypes.c_uint32 # enum hsa_extension_t = c__EA_hsa_extension_t hsa_extension_t__enumvalues = c__EA_hsa_extension_t__enumvalues @@ -2088,6 +2090,7 @@ c__EA_hsa_code_symbol_info_t__enumvalues = { 15: 'HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK', 18: 'HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION', 16: 'HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION', + 19: 'HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE', } 
HSA_CODE_SYMBOL_INFO_TYPE = 0 HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1 @@ -2108,6 +2111,7 @@ HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14 HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15 HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18 HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 c__EA_hsa_code_symbol_info_t = ctypes.c_uint32 # enum hsa_code_symbol_info_t = c__EA_hsa_code_symbol_info_t hsa_code_symbol_info_t__enumvalues = c__EA_hsa_code_symbol_info_t__enumvalues @@ -2594,6 +2598,7 @@ c__Ea_HSA_STATUS_ERROR_INVALID_MEMORY_POOL__enumvalues = { 43: 'HSA_STATUS_ERROR_MEMORY_FAULT', 44: 'HSA_STATUS_CU_MASK_REDUCED', 45: 'HSA_STATUS_ERROR_OUT_OF_REGISTERS', + 46: 'HSA_STATUS_ERROR_RESOURCE_BUSY', } HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40 HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41 @@ -2601,6 +2606,7 @@ HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42 HSA_STATUS_ERROR_MEMORY_FAULT = 43 HSA_STATUS_CU_MASK_REDUCED = 44 HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45 +HSA_STATUS_ERROR_RESOURCE_BUSY = 46 c__Ea_HSA_STATUS_ERROR_INVALID_MEMORY_POOL = ctypes.c_uint32 # enum # values for enumeration 'c__EA_hsa_amd_iommu_version_t' @@ -2976,9 +2982,11 @@ hsa_amd_memory_pool_info_t__enumvalues = c__EA_hsa_amd_memory_pool_info_t__enumv hsa_amd_memory_pool_flag_s__enumvalues = { 0: 'HSA_AMD_MEMORY_POOL_STANDARD_FLAG', 1: 'HSA_AMD_MEMORY_POOL_PCIE_FLAG', + 2: 'HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG', } HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0 HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1 +HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = 2 hsa_amd_memory_pool_flag_s = ctypes.c_uint32 # enum hsa_amd_memory_pool_flag_t = hsa_amd_memory_pool_flag_s hsa_amd_memory_pool_flag_t__enumvalues = hsa_amd_memory_pool_flag_s__enumvalues @@ -3524,6 +3532,12 @@ try: hsa_amd_vmem_address_reserve.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), size_t, uint64_t, uint64_t] except AttributeError: pass +try: + hsa_amd_vmem_address_reserve_align = 
_libraries['libhsa-runtime64.so'].hsa_amd_vmem_address_reserve_align + hsa_amd_vmem_address_reserve_align.restype = hsa_status_t + hsa_amd_vmem_address_reserve_align.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), size_t, uint64_t, uint64_t, uint64_t] +except AttributeError: + pass try: hsa_amd_vmem_address_free = _libraries['libhsa-runtime64.so'].hsa_amd_vmem_address_free hsa_amd_vmem_address_free.restype = hsa_status_t @@ -3627,6 +3641,23 @@ try: hsa_amd_agent_set_async_scratch_limit.argtypes = [hsa_agent_t, size_t] except AttributeError: pass + +# values for enumeration 'c__EA_hsa_queue_info_attribute_t' +c__EA_hsa_queue_info_attribute_t__enumvalues = { + 0: 'HSA_AMD_QUEUE_INFO_AGENT', + 1: 'HSA_AMD_QUEUE_INFO_DOORBELL_ID', +} +HSA_AMD_QUEUE_INFO_AGENT = 0 +HSA_AMD_QUEUE_INFO_DOORBELL_ID = 1 +c__EA_hsa_queue_info_attribute_t = ctypes.c_uint32 # enum +hsa_queue_info_attribute_t = c__EA_hsa_queue_info_attribute_t +hsa_queue_info_attribute_t__enumvalues = c__EA_hsa_queue_info_attribute_t__enumvalues +try: + hsa_amd_queue_get_info = _libraries['libhsa-runtime64.so'].hsa_amd_queue_get_info + hsa_amd_queue_get_info.restype = hsa_status_t + hsa_amd_queue_get_info.argtypes = [ctypes.POINTER(struct_hsa_queue_s), hsa_queue_info_attribute_t, ctypes.POINTER(None)] +except AttributeError: + pass amd_queue_properties32_t = ctypes.c_uint32 # values for enumeration 'amd_queue_properties_t' @@ -5077,6 +5108,7 @@ __all__ = \ 'HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT', 'HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT', 'HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED', + 'HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG', 'HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED', 'HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED', 'HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED', @@ -5096,10 +5128,10 @@ __all__ = \ 'HSA_AMD_MEMORY_POOL_PCIE_FLAG', 'HSA_AMD_MEMORY_POOL_STANDARD_FLAG', 'HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU', - 'HSA_AMD_PACKET_TYPE_BARRIER_VALUE', - 
'HSA_AMD_QUEUE_PRIORITY_HIGH', 'HSA_AMD_QUEUE_PRIORITY_LOW', - 'HSA_AMD_QUEUE_PRIORITY_NORMAL', 'HSA_AMD_REGION_INFO_BASE', - 'HSA_AMD_REGION_INFO_BUS_WIDTH', + 'HSA_AMD_PACKET_TYPE_BARRIER_VALUE', 'HSA_AMD_QUEUE_INFO_AGENT', + 'HSA_AMD_QUEUE_INFO_DOORBELL_ID', 'HSA_AMD_QUEUE_PRIORITY_HIGH', + 'HSA_AMD_QUEUE_PRIORITY_LOW', 'HSA_AMD_QUEUE_PRIORITY_NORMAL', + 'HSA_AMD_REGION_INFO_BASE', 'HSA_AMD_REGION_INFO_BUS_WIDTH', 'HSA_AMD_REGION_INFO_HOST_ACCESSIBLE', 'HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY', 'HSA_AMD_SDMA_ENGINE_0', 'HSA_AMD_SDMA_ENGINE_1', @@ -5149,6 +5181,7 @@ __all__ = \ 'HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT', 'HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE', 'HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE', + 'HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE', 'HSA_CODE_SYMBOL_INFO_LINKAGE', 'HSA_CODE_SYMBOL_INFO_MODULE_NAME', 'HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH', @@ -5192,8 +5225,9 @@ __all__ = \ 'HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT', 'HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE', 'HSA_EXTENSION_AMD_AQLPROFILE', 'HSA_EXTENSION_AMD_LOADER', - 'HSA_EXTENSION_AMD_PROFILER', 'HSA_EXTENSION_FINALIZER', - 'HSA_EXTENSION_IMAGES', 'HSA_EXTENSION_PERFORMANCE_COUNTERS', + 'HSA_EXTENSION_AMD_PC_SAMPLING', 'HSA_EXTENSION_AMD_PROFILER', + 'HSA_EXTENSION_FINALIZER', 'HSA_EXTENSION_IMAGES', + 'HSA_EXTENSION_PERFORMANCE_COUNTERS', 'HSA_EXTENSION_PROFILING_EVENTS', 'HSA_EXTENSION_STD_LAST', 'HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS', 'HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS', @@ -5366,6 +5400,7 @@ __all__ = \ 'HSA_STATUS_ERROR_OUT_OF_REGISTERS', 'HSA_STATUS_ERROR_OUT_OF_RESOURCES', 'HSA_STATUS_ERROR_REFCOUNT_OVERFLOW', + 'HSA_STATUS_ERROR_RESOURCE_BUSY', 'HSA_STATUS_ERROR_RESOURCE_FREE', 'HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED', 'HSA_STATUS_ERROR_VARIABLE_UNDEFINED', 'HSA_STATUS_INFO_BREAK', @@ -5496,12 +5531,13 @@ __all__ = \ 'c__EA_hsa_machine_model_t', 'c__EA_hsa_packet_header_t', 'c__EA_hsa_packet_header_width_t', 
'c__EA_hsa_packet_type_t', 'c__EA_hsa_profile_t', 'c__EA_hsa_queue_feature_t', - 'c__EA_hsa_queue_type_t', 'c__EA_hsa_region_global_flag_t', - 'c__EA_hsa_region_info_t', 'c__EA_hsa_region_segment_t', - 'c__EA_hsa_round_method_t', 'c__EA_hsa_signal_condition_t', - 'c__EA_hsa_status_t', 'c__EA_hsa_symbol_kind_t', - 'c__EA_hsa_symbol_linkage_t', 'c__EA_hsa_system_info_t', - 'c__EA_hsa_variable_allocation_t', 'c__EA_hsa_variable_segment_t', + 'c__EA_hsa_queue_info_attribute_t', 'c__EA_hsa_queue_type_t', + 'c__EA_hsa_region_global_flag_t', 'c__EA_hsa_region_info_t', + 'c__EA_hsa_region_segment_t', 'c__EA_hsa_round_method_t', + 'c__EA_hsa_signal_condition_t', 'c__EA_hsa_status_t', + 'c__EA_hsa_symbol_kind_t', 'c__EA_hsa_symbol_linkage_t', + 'c__EA_hsa_system_info_t', 'c__EA_hsa_variable_allocation_t', + 'c__EA_hsa_variable_segment_t', 'c__EA_hsa_ven_amd_aqlprofile_att_marker_channel_t', 'c__EA_hsa_ven_amd_aqlprofile_block_name_t', 'c__EA_hsa_ven_amd_aqlprofile_event_type_t', @@ -5595,7 +5631,8 @@ __all__ = \ 'hsa_amd_profiling_get_dispatch_time', 'hsa_amd_profiling_set_profiler_enabled', 'hsa_amd_queue_cu_get_mask', 'hsa_amd_queue_cu_set_mask', - 'hsa_amd_queue_priority_s', 'hsa_amd_queue_priority_t', + 'hsa_amd_queue_get_info', 'hsa_amd_queue_priority_s', + 'hsa_amd_queue_priority_t', 'hsa_amd_queue_priority_t__enumvalues', 'hsa_amd_queue_set_priority', 'hsa_amd_region_info_s', 'hsa_amd_region_info_t', 'hsa_amd_region_info_t__enumvalues', @@ -5616,7 +5653,9 @@ __all__ = \ 'hsa_amd_svm_model_t__enumvalues', 'hsa_amd_svm_prefetch_async', 'hsa_amd_system_event_callback_t', 'hsa_amd_vendor_packet_header_t', 'hsa_amd_vmem_address_free', - 'hsa_amd_vmem_address_reserve', 'hsa_amd_vmem_alloc_handle_t', + 'hsa_amd_vmem_address_reserve', + 'hsa_amd_vmem_address_reserve_align', + 'hsa_amd_vmem_alloc_handle_t', 'hsa_amd_vmem_export_shareable_handle', 'hsa_amd_vmem_get_access', 'hsa_amd_vmem_get_alloc_properties_from_handle', 'hsa_amd_vmem_handle_create', 
'hsa_amd_vmem_handle_release', @@ -5741,6 +5780,8 @@ __all__ = \ 'hsa_queue_cas_write_index_screlease', 'hsa_queue_create', 'hsa_queue_destroy', 'hsa_queue_feature_t', 'hsa_queue_feature_t__enumvalues', 'hsa_queue_inactivate', + 'hsa_queue_info_attribute_t', + 'hsa_queue_info_attribute_t__enumvalues', 'hsa_queue_load_read_index_acquire', 'hsa_queue_load_read_index_relaxed', 'hsa_queue_load_read_index_scacquire', diff --git a/tinygrad_repo/tinygrad/runtime/autogen/io_uring.py b/tinygrad_repo/tinygrad/runtime/autogen/io_uring.py index 2d73208..420d750 100644 --- a/tinygrad_repo/tinygrad/runtime/autogen/io_uring.py +++ b/tinygrad_repo/tinygrad/runtime/autogen/io_uring.py @@ -159,12 +159,24 @@ def char_pointer_cast(string, encoding='utf-8'): LIB_URING_H = True # macro _XOPEN_SOURCE = 500 # macro +_GNU_SOURCE = True # macro # def uring_unlikely(cond): # macro # return __builtin_expect(!!(cond),0) # def uring_likely(cond): # macro # return __builtin_expect(!!(cond),1) +IOURINGINLINE = True # macro +__NR_io_uring_setup = 425 # macro +__NR_io_uring_enter = 426 # macro +__NR_io_uring_register = 427 # macro +def io_uring_cqe_index(ring, ptr, mask): # macro + return (((ptr)&(mask))<cq.khead;(cqe=(head!=io_uring_smp_load_acquire((ring)->cq.ktail)?&(ring)->cq.cqes[head&(*(ring)->cq.kring_mask)]:NULL));head++) +# return (head=*(ring)->cq.khead;(cqe=(head!=io_uring_smp_load_acquire((ring)->cq.ktail)?&(ring)->cq.cqes[io_uring_cqe_index(ring,head,(ring)->cq.ring_mask)]:NULL));head++) +LIBURING_HAVE_DATA64 = True # macro +def UNUSED(x): # macro + return (void)(x) +# def IO_URING_CHECK_VERSION(major, minor): # macro +# return (major>IO_URING_VERSION_MAJOR or (major==IO_URING_VERSION_MAJOR and minor>=IO_URING_VERSION_MINOR)) class struct_io_uring_sq(Structure): pass @@ -185,16 +197,29 @@ struct_io_uring_sq._fields_ = [ ('sqe_tail', ctypes.c_uint32), ('ring_sz', ctypes.c_uint64), ('ring_ptr', ctypes.POINTER(None)), - ('pad', ctypes.c_uint32 * 4), + ('ring_mask', ctypes.c_uint32), + 
('ring_entries', ctypes.c_uint32), + ('pad', ctypes.c_uint32 * 2), ] class union_io_uring_sqe_0(Union): pass +class struct_io_uring_sqe_0_0(Structure): + pass + +struct_io_uring_sqe_0_0._pack_ = 1 # source:False +struct_io_uring_sqe_0_0._fields_ = [ + ('cmd_op', ctypes.c_uint32), + ('__pad1', ctypes.c_uint32), +] + union_io_uring_sqe_0._pack_ = 1 # source:False +union_io_uring_sqe_0._anonymous_ = ('_0',) union_io_uring_sqe_0._fields_ = [ ('off', ctypes.c_uint64), ('addr2', ctypes.c_uint64), + ('_0', struct_io_uring_sqe_0_0), ] class union_io_uring_sqe_1(Union): @@ -227,6 +252,9 @@ union_io_uring_sqe_2._fields_ = [ ('rename_flags', ctypes.c_uint32), ('unlink_flags', ctypes.c_uint32), ('hardlink_flags', ctypes.c_uint32), + ('xattr_flags', ctypes.c_uint32), + ('msg_ring_flags', ctypes.c_uint32), + ('uring_cmd_flags', ctypes.c_uint32), ] class union_io_uring_sqe_3(Union): @@ -241,14 +269,45 @@ union_io_uring_sqe_3._fields_ = [ class union_io_uring_sqe_4(Union): pass +class struct_io_uring_sqe_4_0(Structure): + pass + +struct_io_uring_sqe_4_0._pack_ = 1 # source:False +struct_io_uring_sqe_4_0._fields_ = [ + ('addr_len', ctypes.c_uint16), + ('__pad3', ctypes.c_uint16 * 1), +] + union_io_uring_sqe_4._pack_ = 1 # source:False +union_io_uring_sqe_4._anonymous_ = ('_0',) union_io_uring_sqe_4._fields_ = [ ('splice_fd_in', ctypes.c_int32), ('file_index', ctypes.c_uint32), + ('_0', struct_io_uring_sqe_4_0), +] + +class union_io_uring_sqe_5(Union): + pass + +class struct_io_uring_sqe_5_0(Structure): + pass + +struct_io_uring_sqe_5_0._pack_ = 1 # source:False +struct_io_uring_sqe_5_0._fields_ = [ + ('addr3', ctypes.c_uint64), + ('__pad2', ctypes.c_uint64 * 1), +] + +union_io_uring_sqe_5._pack_ = 1 # source:False +union_io_uring_sqe_5._anonymous_ = ('_0',) +union_io_uring_sqe_5._fields_ = [ + ('_0', struct_io_uring_sqe_5_0), + ('cmd', ctypes.c_ubyte * 0), + ('PADDING_0', ctypes.c_ubyte * 16), ] struct_io_uring_sqe._pack_ = 1 # source:False -struct_io_uring_sqe._anonymous_ = ('_0', 
'_1', '_2', '_3', '_4',) +struct_io_uring_sqe._anonymous_ = ('_0', '_1', '_2', '_3', '_4', '_5',) struct_io_uring_sqe._fields_ = [ ('opcode', ctypes.c_ubyte), ('flags', ctypes.c_ubyte), @@ -262,7 +321,7 @@ struct_io_uring_sqe._fields_ = [ ('_3', union_io_uring_sqe_3), ('personality', ctypes.c_uint16), ('_4', union_io_uring_sqe_4), - ('__pad2', ctypes.c_uint64 * 2), + ('_5', union_io_uring_sqe_5), ] class struct_io_uring_cq(Structure): @@ -282,7 +341,9 @@ struct_io_uring_cq._fields_ = [ ('cqes', ctypes.POINTER(struct_io_uring_cqe)), ('ring_sz', ctypes.c_uint64), ('ring_ptr', ctypes.POINTER(None)), - ('pad', ctypes.c_uint32 * 4), + ('ring_mask', ctypes.c_uint32), + ('ring_entries', ctypes.c_uint32), + ('pad', ctypes.c_uint32 * 2), ] struct_io_uring_cqe._pack_ = 1 # source:False @@ -290,6 +351,7 @@ struct_io_uring_cqe._fields_ = [ ('user_data', ctypes.c_uint64), ('res', ctypes.c_int32), ('flags', ctypes.c_uint32), + ('big_cqe', ctypes.c_uint64 * 0), ] class struct_io_uring(Structure): @@ -302,7 +364,10 @@ struct_io_uring._fields_ = [ ('flags', ctypes.c_uint32), ('ring_fd', ctypes.c_int32), ('features', ctypes.c_uint32), - ('pad', ctypes.c_uint32 * 3), + ('enter_ring_fd', ctypes.c_int32), + ('int_flags', ctypes.c_ubyte), + ('pad', ctypes.c_ubyte * 3), + ('pad2', ctypes.c_uint32), ] class struct_io_uring_probe(Structure): @@ -368,7 +433,7 @@ struct_io_sqring_offsets._fields_ = [ ('dropped', ctypes.c_uint32), ('array', ctypes.c_uint32), ('resv1', ctypes.c_uint32), - ('resv2', ctypes.c_uint64), + ('user_addr', ctypes.c_uint64), ] class struct_io_cqring_offsets(Structure): @@ -384,7 +449,7 @@ struct_io_cqring_offsets._fields_ = [ ('cqes', ctypes.c_uint32), ('flags', ctypes.c_uint32), ('resv1', ctypes.c_uint32), - ('resv2', ctypes.c_uint64), + ('user_addr', ctypes.c_uint64), ] struct_io_uring_params._pack_ = 1 # source:False @@ -401,6 +466,13 @@ struct_io_uring_params._fields_ = [ ('cq_off', struct_io_cqring_offsets), ] +size_t = ctypes.c_uint64 +try: + 
io_uring_queue_init_mem = _libraries['FIXME_STUB'].io_uring_queue_init_mem + io_uring_queue_init_mem.restype = ctypes.c_int32 + io_uring_queue_init_mem.argtypes = [ctypes.c_uint32, ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_params), ctypes.POINTER(None), size_t] +except AttributeError: + pass try: io_uring_queue_init_params = _libraries['FIXME_STUB'].io_uring_queue_init_params io_uring_queue_init_params.restype = ctypes.c_int32 @@ -479,9 +551,9 @@ try: except AttributeError: pass try: - io_uring_get_sqe = _libraries['FIXME_STUB'].io_uring_get_sqe - io_uring_get_sqe.restype = ctypes.POINTER(struct_io_uring_sqe) - io_uring_get_sqe.argtypes = [ctypes.POINTER(struct_io_uring)] + io_uring_submit_and_wait_timeout = _libraries['FIXME_STUB'].io_uring_submit_and_wait_timeout + io_uring_submit_and_wait_timeout.restype = ctypes.c_int32 + io_uring_submit_and_wait_timeout.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.POINTER(struct_io_uring_cqe)), ctypes.c_uint32, ctypes.POINTER(struct___kernel_timespec), ctypes.POINTER(struct_c__SA___sigset_t)] except AttributeError: pass class struct_iovec(Structure): @@ -505,6 +577,12 @@ try: io_uring_register_buffers_tags.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_iovec), ctypes.POINTER(ctypes.c_uint64), ctypes.c_uint32] except AttributeError: pass +try: + io_uring_register_buffers_sparse = _libraries['FIXME_STUB'].io_uring_register_buffers_sparse + io_uring_register_buffers_sparse.restype = ctypes.c_int32 + io_uring_register_buffers_sparse.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.c_uint32] +except AttributeError: + pass try: io_uring_register_buffers_update_tag = _libraries['FIXME_STUB'].io_uring_register_buffers_update_tag io_uring_register_buffers_update_tag.restype = ctypes.c_int32 @@ -529,6 +607,12 @@ try: io_uring_register_files_tags.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.c_int32), ctypes.POINTER(ctypes.c_uint64), ctypes.c_uint32] 
except AttributeError: pass +try: + io_uring_register_files_sparse = _libraries['FIXME_STUB'].io_uring_register_files_sparse + io_uring_register_files_sparse.restype = ctypes.c_int32 + io_uring_register_files_sparse.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.c_uint32] +except AttributeError: + pass try: io_uring_register_files_update_tag = _libraries['FIXME_STUB'].io_uring_register_files_update_tag io_uring_register_files_update_tag.restype = ctypes.c_int32 @@ -623,7 +707,6 @@ try: __io_uring_sqring_wait.argtypes = [ctypes.POINTER(struct_io_uring)] except AttributeError: pass -size_t = ctypes.c_uint64 class struct_c__SA_cpu_set_t(Structure): pass @@ -650,6 +733,162 @@ try: io_uring_register_iowq_max_workers.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.c_uint32)] except AttributeError: pass +try: + io_uring_register_ring_fd = _libraries['FIXME_STUB'].io_uring_register_ring_fd + io_uring_register_ring_fd.restype = ctypes.c_int32 + io_uring_register_ring_fd.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +try: + io_uring_unregister_ring_fd = _libraries['FIXME_STUB'].io_uring_unregister_ring_fd + io_uring_unregister_ring_fd.restype = ctypes.c_int32 + io_uring_unregister_ring_fd.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +try: + io_uring_close_ring_fd = _libraries['FIXME_STUB'].io_uring_close_ring_fd + io_uring_close_ring_fd.restype = ctypes.c_int32 + io_uring_close_ring_fd.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +class struct_io_uring_buf_reg(Structure): + pass + +struct_io_uring_buf_reg._pack_ = 1 # source:False +struct_io_uring_buf_reg._fields_ = [ + ('ring_addr', ctypes.c_uint64), + ('ring_entries', ctypes.c_uint32), + ('bgid', ctypes.c_uint16), + ('flags', ctypes.c_uint16), + ('resv', ctypes.c_uint64 * 3), +] + +try: + io_uring_register_buf_ring = _libraries['FIXME_STUB'].io_uring_register_buf_ring + io_uring_register_buf_ring.restype = 
ctypes.c_int32 + io_uring_register_buf_ring.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_buf_reg), ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_unregister_buf_ring = _libraries['FIXME_STUB'].io_uring_unregister_buf_ring + io_uring_unregister_buf_ring.restype = ctypes.c_int32 + io_uring_unregister_buf_ring.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.c_int32] +except AttributeError: + pass +class struct_io_uring_sync_cancel_reg(Structure): + pass + +struct_io_uring_sync_cancel_reg._pack_ = 1 # source:False +struct_io_uring_sync_cancel_reg._fields_ = [ + ('addr', ctypes.c_uint64), + ('fd', ctypes.c_int32), + ('flags', ctypes.c_uint32), + ('timeout', struct___kernel_timespec), + ('pad', ctypes.c_uint64 * 4), +] + +try: + io_uring_register_sync_cancel = _libraries['FIXME_STUB'].io_uring_register_sync_cancel + io_uring_register_sync_cancel.restype = ctypes.c_int32 + io_uring_register_sync_cancel.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_sync_cancel_reg)] +except AttributeError: + pass +try: + io_uring_register_file_alloc_range = _libraries['FIXME_STUB'].io_uring_register_file_alloc_range + io_uring_register_file_alloc_range.restype = ctypes.c_int32 + io_uring_register_file_alloc_range.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_get_events = _libraries['FIXME_STUB'].io_uring_get_events + io_uring_get_events.restype = ctypes.c_int32 + io_uring_get_events.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +try: + io_uring_submit_and_get_events = _libraries['FIXME_STUB'].io_uring_submit_and_get_events + io_uring_submit_and_get_events.restype = ctypes.c_int32 + io_uring_submit_and_get_events.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +try: + io_uring_enter = _libraries['FIXME_STUB'].io_uring_enter + io_uring_enter.restype = ctypes.c_int32 + 
io_uring_enter.argtypes = [ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.POINTER(struct_c__SA___sigset_t)] +except AttributeError: + pass +try: + io_uring_enter2 = _libraries['FIXME_STUB'].io_uring_enter2 + io_uring_enter2.restype = ctypes.c_int32 + io_uring_enter2.argtypes = [ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.POINTER(struct_c__SA___sigset_t), size_t] +except AttributeError: + pass +try: + io_uring_setup = _libraries['FIXME_STUB'].io_uring_setup + io_uring_setup.restype = ctypes.c_int32 + io_uring_setup.argtypes = [ctypes.c_uint32, ctypes.POINTER(struct_io_uring_params)] +except AttributeError: + pass +try: + io_uring_register = _libraries['FIXME_STUB'].io_uring_register + io_uring_register.restype = ctypes.c_int32 + io_uring_register.argtypes = [ctypes.c_uint32, ctypes.c_uint32, ctypes.POINTER(None), ctypes.c_uint32] +except AttributeError: + pass +class struct_io_uring_buf_ring(Structure): + pass + +class union_io_uring_buf_ring_0(Union): + pass + +class struct_io_uring_buf_ring_0_0(Structure): + pass + +struct_io_uring_buf_ring_0_0._pack_ = 1 # source:False +struct_io_uring_buf_ring_0_0._fields_ = [ + ('resv1', ctypes.c_uint64), + ('resv2', ctypes.c_uint32), + ('resv3', ctypes.c_uint16), + ('tail', ctypes.c_uint16), +] + +class struct_io_uring_buf(Structure): + pass + +struct_io_uring_buf._pack_ = 1 # source:False +struct_io_uring_buf._fields_ = [ + ('addr', ctypes.c_uint64), + ('len', ctypes.c_uint32), + ('bid', ctypes.c_uint16), + ('resv', ctypes.c_uint16), +] + +union_io_uring_buf_ring_0._pack_ = 1 # source:False +union_io_uring_buf_ring_0._anonymous_ = ('_0',) +union_io_uring_buf_ring_0._fields_ = [ + ('_0', struct_io_uring_buf_ring_0_0), + ('bufs', struct_io_uring_buf * 0), + ('PADDING_0', ctypes.c_ubyte * 16), +] + +struct_io_uring_buf_ring._pack_ = 1 # source:False +struct_io_uring_buf_ring._anonymous_ = ('_0',) +struct_io_uring_buf_ring._fields_ = [ + ('_0', union_io_uring_buf_ring_0), 
+] + +try: + io_uring_setup_buf_ring = _libraries['FIXME_STUB'].io_uring_setup_buf_ring + io_uring_setup_buf_ring.restype = ctypes.POINTER(struct_io_uring_buf_ring) + io_uring_setup_buf_ring.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.c_uint32, ctypes.c_int32, ctypes.c_uint32, ctypes.POINTER(ctypes.c_int32)] +except AttributeError: + pass +try: + io_uring_free_buf_ring = _libraries['FIXME_STUB'].io_uring_free_buf_ring + io_uring_free_buf_ring.restype = ctypes.c_int32 + io_uring_free_buf_ring.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_buf_ring), ctypes.c_uint32, ctypes.c_int32] +except AttributeError: + pass try: __io_uring_get_cqe = _libraries['FIXME_STUB'].__io_uring_get_cqe __io_uring_get_cqe.restype = ctypes.c_int32 @@ -680,6 +919,20 @@ try: io_uring_cqe_get_data.argtypes = [ctypes.POINTER(struct_io_uring_cqe)] except AttributeError: pass +__u64 = ctypes.c_uint64 +# LIBURING_UDATA_TIMEOUT = ((__u64)-1) # macro +try: + io_uring_sqe_set_data64 = _libraries['FIXME_STUB'].io_uring_sqe_set_data64 + io_uring_sqe_set_data64.restype = None + io_uring_sqe_set_data64.argtypes = [ctypes.POINTER(struct_io_uring_sqe), __u64] +except AttributeError: + pass +try: + io_uring_cqe_get_data64 = _libraries['FIXME_STUB'].io_uring_cqe_get_data64 + io_uring_cqe_get_data64.restype = __u64 + io_uring_cqe_get_data64.argtypes = [ctypes.POINTER(struct_io_uring_cqe)] +except AttributeError: + pass try: io_uring_sqe_set_flags = _libraries['FIXME_STUB'].io_uring_sqe_set_flags io_uring_sqe_set_flags.restype = None @@ -692,8 +945,6 @@ try: __io_uring_set_target_fixed_file.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_uint32] except AttributeError: pass -__u64 = ctypes.c_uint64 -# LIBURING_UDATA_TIMEOUT = ((__u64)-1) # macro try: io_uring_prep_rw = _libraries['FIXME_STUB'].io_uring_prep_rw io_uring_prep_rw.restype = None @@ -719,6 +970,12 @@ try: io_uring_prep_readv.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, 
ctypes.POINTER(struct_iovec), ctypes.c_uint32, __u64] except AttributeError: pass +try: + io_uring_prep_readv2 = _libraries['FIXME_STUB'].io_uring_prep_readv2 + io_uring_prep_readv2.restype = None + io_uring_prep_readv2.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_iovec), ctypes.c_uint32, __u64, ctypes.c_int32] +except AttributeError: + pass try: io_uring_prep_read_fixed = _libraries['FIXME_STUB'].io_uring_prep_read_fixed io_uring_prep_read_fixed.restype = None @@ -731,6 +988,12 @@ try: io_uring_prep_writev.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_iovec), ctypes.c_uint32, __u64] except AttributeError: pass +try: + io_uring_prep_writev2 = _libraries['FIXME_STUB'].io_uring_prep_writev2 + io_uring_prep_writev2.restype = None + io_uring_prep_writev2.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_iovec), ctypes.c_uint32, __u64, ctypes.c_int32] +except AttributeError: + pass try: io_uring_prep_write_fixed = _libraries['FIXME_STUB'].io_uring_prep_write_fixed io_uring_prep_write_fixed.restype = None @@ -759,6 +1022,12 @@ try: io_uring_prep_recvmsg.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_msghdr), ctypes.c_uint32] except AttributeError: pass +try: + io_uring_prep_recvmsg_multishot = _libraries['FIXME_STUB'].io_uring_prep_recvmsg_multishot + io_uring_prep_recvmsg_multishot.restype = None + io_uring_prep_recvmsg_multishot.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_msghdr), ctypes.c_uint32] +except AttributeError: + pass try: io_uring_prep_sendmsg = _libraries['FIXME_STUB'].io_uring_prep_sendmsg io_uring_prep_sendmsg.restype = None @@ -786,13 +1055,13 @@ except AttributeError: try: io_uring_prep_poll_remove = _libraries['FIXME_STUB'].io_uring_prep_poll_remove io_uring_prep_poll_remove.restype = None - io_uring_prep_poll_remove.argtypes = 
[ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(None)] + io_uring_prep_poll_remove.argtypes = [ctypes.POINTER(struct_io_uring_sqe), __u64] except AttributeError: pass try: io_uring_prep_poll_update = _libraries['FIXME_STUB'].io_uring_prep_poll_update io_uring_prep_poll_update.restype = None - io_uring_prep_poll_update.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(None), ctypes.POINTER(None), ctypes.c_uint32, ctypes.c_uint32] + io_uring_prep_poll_update.argtypes = [ctypes.POINTER(struct_io_uring_sqe), __u64, __u64, ctypes.c_uint32, ctypes.c_uint32] except AttributeError: pass try: @@ -846,12 +1115,36 @@ try: io_uring_prep_accept_direct.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_sockaddr), ctypes.POINTER(ctypes.c_uint32), ctypes.c_int32, ctypes.c_uint32] except AttributeError: pass +try: + io_uring_prep_multishot_accept = _libraries['FIXME_STUB'].io_uring_prep_multishot_accept + io_uring_prep_multishot_accept.restype = None + io_uring_prep_multishot_accept.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_sockaddr), ctypes.POINTER(ctypes.c_uint32), ctypes.c_int32] +except AttributeError: + pass +try: + io_uring_prep_multishot_accept_direct = _libraries['FIXME_STUB'].io_uring_prep_multishot_accept_direct + io_uring_prep_multishot_accept_direct.restype = None + io_uring_prep_multishot_accept_direct.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_sockaddr), ctypes.POINTER(ctypes.c_uint32), ctypes.c_int32] +except AttributeError: + pass +try: + io_uring_prep_cancel64 = _libraries['FIXME_STUB'].io_uring_prep_cancel64 + io_uring_prep_cancel64.restype = None + io_uring_prep_cancel64.argtypes = [ctypes.POINTER(struct_io_uring_sqe), __u64, ctypes.c_int32] +except AttributeError: + pass try: io_uring_prep_cancel = _libraries['FIXME_STUB'].io_uring_prep_cancel io_uring_prep_cancel.restype = None io_uring_prep_cancel.argtypes = 
[ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(None), ctypes.c_int32] except AttributeError: pass +try: + io_uring_prep_cancel_fd = _libraries['FIXME_STUB'].io_uring_prep_cancel_fd + io_uring_prep_cancel_fd.restype = None + io_uring_prep_cancel_fd.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass try: io_uring_prep_link_timeout = _libraries['FIXME_STUB'].io_uring_prep_link_timeout io_uring_prep_link_timeout.restype = None @@ -871,11 +1164,10 @@ try: io_uring_prep_files_update.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_int32), ctypes.c_uint32, ctypes.c_int32] except AttributeError: pass -off_t = ctypes.c_int64 try: io_uring_prep_fallocate = _libraries['FIXME_STUB'].io_uring_prep_fallocate io_uring_prep_fallocate.restype = None - io_uring_prep_fallocate.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, off_t, off_t] + io_uring_prep_fallocate.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, __u64, __u64] except AttributeError: pass mode_t = ctypes.c_uint32 @@ -897,6 +1189,12 @@ try: io_uring_prep_close.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32] except AttributeError: pass +try: + io_uring_prep_close_direct = _libraries['FIXME_STUB'].io_uring_prep_close_direct + io_uring_prep_close_direct.restype = None + io_uring_prep_close_direct.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_uint32] +except AttributeError: + pass try: io_uring_prep_read = _libraries['FIXME_STUB'].io_uring_prep_read io_uring_prep_read.restype = None @@ -918,6 +1216,7 @@ try: io_uring_prep_statx.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.c_uint32, ctypes.POINTER(struct_statx)] except AttributeError: pass +off_t = ctypes.c_int64 try: io_uring_prep_fadvise = _libraries['FIXME_STUB'].io_uring_prep_fadvise io_uring_prep_fadvise.restype = 
None @@ -936,12 +1235,107 @@ try: io_uring_prep_send.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, ctypes.c_int32] except AttributeError: pass +__u16 = ctypes.c_uint16 +try: + io_uring_prep_send_set_addr = _libraries['FIXME_STUB'].io_uring_prep_send_set_addr + io_uring_prep_send_set_addr.restype = None + io_uring_prep_send_set_addr.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(struct_sockaddr), __u16] +except AttributeError: + pass +try: + io_uring_prep_sendto = _libraries['FIXME_STUB'].io_uring_prep_sendto + io_uring_prep_sendto.restype = None + io_uring_prep_sendto.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, ctypes.c_int32, ctypes.POINTER(struct_sockaddr), socklen_t] +except AttributeError: + pass +try: + io_uring_prep_send_zc = _libraries['FIXME_STUB'].io_uring_prep_send_zc + io_uring_prep_send_zc.restype = None + io_uring_prep_send_zc.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_send_zc_fixed = _libraries['FIXME_STUB'].io_uring_prep_send_zc_fixed + io_uring_prep_send_zc_fixed.restype = None + io_uring_prep_send_zc_fixed.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, ctypes.c_int32, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_sendmsg_zc = _libraries['FIXME_STUB'].io_uring_prep_sendmsg_zc + io_uring_prep_sendmsg_zc.restype = None + io_uring_prep_sendmsg_zc.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(struct_msghdr), ctypes.c_uint32] +except AttributeError: + pass try: io_uring_prep_recv = _libraries['FIXME_STUB'].io_uring_prep_recv io_uring_prep_recv.restype = None io_uring_prep_recv.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, 
ctypes.c_int32] except AttributeError: pass +try: + io_uring_prep_recv_multishot = _libraries['FIXME_STUB'].io_uring_prep_recv_multishot + io_uring_prep_recv_multishot.restype = None + io_uring_prep_recv_multishot.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(None), size_t, ctypes.c_int32] +except AttributeError: + pass +class struct_io_uring_recvmsg_out(Structure): + pass + +struct_io_uring_recvmsg_out._pack_ = 1 # source:False +struct_io_uring_recvmsg_out._fields_ = [ + ('namelen', ctypes.c_uint32), + ('controllen', ctypes.c_uint32), + ('payloadlen', ctypes.c_uint32), + ('flags', ctypes.c_uint32), +] + +try: + io_uring_recvmsg_validate = _libraries['FIXME_STUB'].io_uring_recvmsg_validate + io_uring_recvmsg_validate.restype = ctypes.POINTER(struct_io_uring_recvmsg_out) + io_uring_recvmsg_validate.argtypes = [ctypes.POINTER(None), ctypes.c_int32, ctypes.POINTER(struct_msghdr)] +except AttributeError: + pass +try: + io_uring_recvmsg_name = _libraries['FIXME_STUB'].io_uring_recvmsg_name + io_uring_recvmsg_name.restype = ctypes.POINTER(None) + io_uring_recvmsg_name.argtypes = [ctypes.POINTER(struct_io_uring_recvmsg_out)] +except AttributeError: + pass +class struct_cmsghdr(Structure): + pass + +struct_cmsghdr._pack_ = 1 # source:False +struct_cmsghdr._fields_ = [ + ('cmsg_len', ctypes.c_uint64), + ('cmsg_level', ctypes.c_int32), + ('cmsg_type', ctypes.c_int32), + ('__cmsg_data', ctypes.c_ubyte * 0), +] + +try: + io_uring_recvmsg_cmsg_firsthdr = _libraries['FIXME_STUB'].io_uring_recvmsg_cmsg_firsthdr + io_uring_recvmsg_cmsg_firsthdr.restype = ctypes.POINTER(struct_cmsghdr) + io_uring_recvmsg_cmsg_firsthdr.argtypes = [ctypes.POINTER(struct_io_uring_recvmsg_out), ctypes.POINTER(struct_msghdr)] +except AttributeError: + pass +try: + io_uring_recvmsg_cmsg_nexthdr = _libraries['FIXME_STUB'].io_uring_recvmsg_cmsg_nexthdr + io_uring_recvmsg_cmsg_nexthdr.restype = ctypes.POINTER(struct_cmsghdr) + io_uring_recvmsg_cmsg_nexthdr.argtypes = 
[ctypes.POINTER(struct_io_uring_recvmsg_out), ctypes.POINTER(struct_msghdr), ctypes.POINTER(struct_cmsghdr)] +except AttributeError: + pass +try: + io_uring_recvmsg_payload = _libraries['FIXME_STUB'].io_uring_recvmsg_payload + io_uring_recvmsg_payload.restype = ctypes.POINTER(None) + io_uring_recvmsg_payload.argtypes = [ctypes.POINTER(struct_io_uring_recvmsg_out), ctypes.POINTER(struct_msghdr)] +except AttributeError: + pass +try: + io_uring_recvmsg_payload_length = _libraries['FIXME_STUB'].io_uring_recvmsg_payload_length + io_uring_recvmsg_payload_length.restype = ctypes.c_uint32 + io_uring_recvmsg_payload_length.argtypes = [ctypes.POINTER(struct_io_uring_recvmsg_out), ctypes.c_int32, ctypes.POINTER(struct_msghdr)] +except AttributeError: + pass class struct_open_how(Structure): pass @@ -997,10 +1391,22 @@ try: io_uring_prep_unlinkat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32] except AttributeError: pass +try: + io_uring_prep_unlink = _libraries['FIXME_STUB'].io_uring_prep_unlink + io_uring_prep_unlink.restype = None + io_uring_prep_unlink.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.c_int32] +except AttributeError: + pass try: io_uring_prep_renameat = _libraries['FIXME_STUB'].io_uring_prep_renameat io_uring_prep_renameat.restype = None - io_uring_prep_renameat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32] + io_uring_prep_renameat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_rename = _libraries['FIXME_STUB'].io_uring_prep_rename + io_uring_prep_rename.restype = None + io_uring_prep_rename.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), 
ctypes.POINTER(ctypes.c_char)] except AttributeError: pass try: @@ -1015,18 +1421,108 @@ try: io_uring_prep_mkdirat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), mode_t] except AttributeError: pass +try: + io_uring_prep_mkdir = _libraries['FIXME_STUB'].io_uring_prep_mkdir + io_uring_prep_mkdir.restype = None + io_uring_prep_mkdir.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), mode_t] +except AttributeError: + pass try: io_uring_prep_symlinkat = _libraries['FIXME_STUB'].io_uring_prep_symlinkat io_uring_prep_symlinkat.restype = None io_uring_prep_symlinkat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.c_char)] except AttributeError: pass +try: + io_uring_prep_symlink = _libraries['FIXME_STUB'].io_uring_prep_symlink + io_uring_prep_symlink.restype = None + io_uring_prep_symlink.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char)] +except AttributeError: + pass try: io_uring_prep_linkat = _libraries['FIXME_STUB'].io_uring_prep_linkat io_uring_prep_linkat.restype = None io_uring_prep_linkat.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.c_int32] except AttributeError: pass +try: + io_uring_prep_link = _libraries['FIXME_STUB'].io_uring_prep_link + io_uring_prep_link.restype = None + io_uring_prep_link.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32] +except AttributeError: + pass +try: + io_uring_prep_msg_ring_cqe_flags = _libraries['FIXME_STUB'].io_uring_prep_msg_ring_cqe_flags + io_uring_prep_msg_ring_cqe_flags.restype = None + io_uring_prep_msg_ring_cqe_flags.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_uint32, __u64, ctypes.c_uint32, ctypes.c_uint32] 
+except AttributeError: + pass +try: + io_uring_prep_msg_ring = _libraries['FIXME_STUB'].io_uring_prep_msg_ring + io_uring_prep_msg_ring.restype = None + io_uring_prep_msg_ring.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_uint32, __u64, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_msg_ring_fd = _libraries['FIXME_STUB'].io_uring_prep_msg_ring_fd + io_uring_prep_msg_ring_fd.restype = None + io_uring_prep_msg_ring_fd.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, __u64, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_msg_ring_fd_alloc = _libraries['FIXME_STUB'].io_uring_prep_msg_ring_fd_alloc + io_uring_prep_msg_ring_fd_alloc.restype = None + io_uring_prep_msg_ring_fd_alloc.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, __u64, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_getxattr = _libraries['FIXME_STUB'].io_uring_prep_getxattr + io_uring_prep_getxattr.restype = None + io_uring_prep_getxattr.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_setxattr = _libraries['FIXME_STUB'].io_uring_prep_setxattr + io_uring_prep_setxattr.restype = None + io_uring_prep_setxattr.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_fgetxattr = _libraries['FIXME_STUB'].io_uring_prep_fgetxattr + io_uring_prep_fgetxattr.restype = None + io_uring_prep_fgetxattr.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_fsetxattr = 
_libraries['FIXME_STUB'].io_uring_prep_fsetxattr + io_uring_prep_fsetxattr.restype = None + io_uring_prep_fsetxattr.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_socket = _libraries['FIXME_STUB'].io_uring_prep_socket + io_uring_prep_socket.restype = None + io_uring_prep_socket.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_socket_direct = _libraries['FIXME_STUB'].io_uring_prep_socket_direct + io_uring_prep_socket_direct.restype = None + io_uring_prep_socket_direct.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, ctypes.c_uint32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_socket_direct_alloc = _libraries['FIXME_STUB'].io_uring_prep_socket_direct_alloc + io_uring_prep_socket_direct_alloc.restype = None + io_uring_prep_socket_direct_alloc.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, ctypes.c_uint32] +except AttributeError: + pass +try: + io_uring_prep_cmd_sock = _libraries['FIXME_STUB'].io_uring_prep_cmd_sock + io_uring_prep_cmd_sock.restype = None + io_uring_prep_cmd_sock.argtypes = [ctypes.POINTER(struct_io_uring_sqe), ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, ctypes.c_int32, ctypes.POINTER(None), ctypes.c_int32] +except AttributeError: + pass try: io_uring_sq_ready = _libraries['FIXME_STUB'].io_uring_sq_ready io_uring_sq_ready.restype = ctypes.c_uint32 @@ -1051,6 +1547,12 @@ try: io_uring_cq_ready.argtypes = [ctypes.POINTER(struct_io_uring)] except AttributeError: pass +try: + io_uring_cq_has_overflow = _libraries['FIXME_STUB'].io_uring_cq_has_overflow + io_uring_cq_has_overflow.restype = ctypes.c_bool + io_uring_cq_has_overflow.argtypes = 
[ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass try: io_uring_cq_eventfd_enabled = _libraries['FIXME_STUB'].io_uring_cq_eventfd_enabled io_uring_cq_eventfd_enabled.restype = ctypes.c_bool @@ -1069,6 +1571,12 @@ try: io_uring_wait_cqe_nr.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.POINTER(struct_io_uring_cqe)), ctypes.c_uint32] except AttributeError: pass +try: + __io_uring_peek_cqe = _libraries['FIXME_STUB'].__io_uring_peek_cqe + __io_uring_peek_cqe.restype = ctypes.c_int32 + __io_uring_peek_cqe.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.POINTER(struct_io_uring_cqe)), ctypes.POINTER(ctypes.c_uint32)] +except AttributeError: + pass try: io_uring_peek_cqe = _libraries['FIXME_STUB'].io_uring_peek_cqe io_uring_peek_cqe.restype = ctypes.c_int32 @@ -1081,6 +1589,55 @@ try: io_uring_wait_cqe.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(ctypes.POINTER(struct_io_uring_cqe))] except AttributeError: pass +try: + _io_uring_get_sqe = _libraries['FIXME_STUB']._io_uring_get_sqe + _io_uring_get_sqe.restype = ctypes.POINTER(struct_io_uring_sqe) + _io_uring_get_sqe.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass +__u32 = ctypes.c_uint32 +try: + io_uring_buf_ring_mask = _libraries['FIXME_STUB'].io_uring_buf_ring_mask + io_uring_buf_ring_mask.restype = ctypes.c_int32 + io_uring_buf_ring_mask.argtypes = [__u32] +except AttributeError: + pass +try: + io_uring_buf_ring_init = _libraries['FIXME_STUB'].io_uring_buf_ring_init + io_uring_buf_ring_init.restype = None + io_uring_buf_ring_init.argtypes = [ctypes.POINTER(struct_io_uring_buf_ring)] +except AttributeError: + pass +try: + io_uring_buf_ring_add = _libraries['FIXME_STUB'].io_uring_buf_ring_add + io_uring_buf_ring_add.restype = None + io_uring_buf_ring_add.argtypes = [ctypes.POINTER(struct_io_uring_buf_ring), ctypes.POINTER(None), ctypes.c_uint32, ctypes.c_uint16, ctypes.c_int32, ctypes.c_int32] +except AttributeError: + pass 
+try: + io_uring_buf_ring_advance = _libraries['FIXME_STUB'].io_uring_buf_ring_advance + io_uring_buf_ring_advance.restype = None + io_uring_buf_ring_advance.argtypes = [ctypes.POINTER(struct_io_uring_buf_ring), ctypes.c_int32] +except AttributeError: + pass +try: + __io_uring_buf_ring_cq_advance = _libraries['FIXME_STUB'].__io_uring_buf_ring_cq_advance + __io_uring_buf_ring_cq_advance.restype = None + __io_uring_buf_ring_cq_advance.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_buf_ring), ctypes.c_int32, ctypes.c_int32] +except AttributeError: + pass +try: + io_uring_buf_ring_cq_advance = _libraries['FIXME_STUB'].io_uring_buf_ring_cq_advance + io_uring_buf_ring_cq_advance.restype = None + io_uring_buf_ring_cq_advance.argtypes = [ctypes.POINTER(struct_io_uring), ctypes.POINTER(struct_io_uring_buf_ring), ctypes.c_int32] +except AttributeError: + pass +try: + io_uring_get_sqe = _libraries['FIXME_STUB'].io_uring_get_sqe + io_uring_get_sqe.restype = ctypes.POINTER(struct_io_uring_sqe) + io_uring_get_sqe.argtypes = [ctypes.POINTER(struct_io_uring)] +except AttributeError: + pass ssize_t = ctypes.c_int64 try: io_uring_mlock_size = _libraries['FIXME_STUB'].io_uring_mlock_size @@ -1094,7 +1651,26 @@ try: io_uring_mlock_size_params.argtypes = [ctypes.c_uint32, ctypes.POINTER(struct_io_uring_params)] except AttributeError: pass +try: + io_uring_major_version = _libraries['FIXME_STUB'].io_uring_major_version + io_uring_major_version.restype = ctypes.c_int32 + io_uring_major_version.argtypes = [] +except AttributeError: + pass +try: + io_uring_minor_version = _libraries['FIXME_STUB'].io_uring_minor_version + io_uring_minor_version.restype = ctypes.c_int32 + io_uring_minor_version.argtypes = [] +except AttributeError: + pass +try: + io_uring_check_version = _libraries['FIXME_STUB'].io_uring_check_version + io_uring_check_version.restype = ctypes.c_bool + io_uring_check_version.argtypes = [ctypes.c_int32, ctypes.c_int32] +except AttributeError: + 
pass LINUX_IO_URING_H = True # macro +IORING_FILE_INDEX_ALLOC = (~0) # macro IORING_SETUP_IOPOLL = (1<<0) # macro IORING_SETUP_SQPOLL = (1<<1) # macro IORING_SETUP_SQ_AFF = (1<<2) # macro @@ -1102,30 +1678,69 @@ IORING_SETUP_CQSIZE = (1<<3) # macro IORING_SETUP_CLAMP = (1<<4) # macro IORING_SETUP_ATTACH_WQ = (1<<5) # macro IORING_SETUP_R_DISABLED = (1<<6) # macro +IORING_SETUP_SUBMIT_ALL = (1<<7) # macro +IORING_SETUP_COOP_TASKRUN = (1<<8) # macro +IORING_SETUP_TASKRUN_FLAG = (1<<9) # macro +IORING_SETUP_SQE128 = (1<<10) # macro +IORING_SETUP_CQE32 = (1<<11) # macro +# def io_uring_cqe_shift(ring): # macro +# return (!!((ring)->flags&IORING_SETUP_CQE32)) +IORING_SETUP_SINGLE_ISSUER = (1<<12) # macro +IORING_SETUP_DEFER_TASKRUN = (1<<13) # macro +IORING_SETUP_NO_MMAP = (1<<14) # macro +IORING_SETUP_REGISTERED_FD_ONLY = (1<<15) # macro +IORING_SETUP_NO_SQARRAY = (1<<16) # macro +IORING_URING_CMD_FIXED = (1<<0) # macro +IORING_URING_CMD_MASK = (1<<0) # macro IORING_FSYNC_DATASYNC = (1<<0) # macro IORING_TIMEOUT_ABS = (1<<0) # macro IORING_TIMEOUT_UPDATE = (1<<1) # macro IORING_TIMEOUT_BOOTTIME = (1<<2) # macro IORING_TIMEOUT_REALTIME = (1<<3) # macro IORING_LINK_TIMEOUT_UPDATE = (1<<4) # macro +IORING_TIMEOUT_ETIME_SUCCESS = (1<<5) # macro +IORING_TIMEOUT_MULTISHOT = (1<<6) # macro IORING_TIMEOUT_CLOCK_MASK = ((1<<2)|(1<<3)) # macro IORING_TIMEOUT_UPDATE_MASK = ((1<<1)|(1<<4)) # macro SPLICE_F_FD_IN_FIXED = (1<<31) # macro IORING_POLL_ADD_MULTI = (1<<0) # macro IORING_POLL_UPDATE_EVENTS = (1<<1) # macro IORING_POLL_UPDATE_USER_DATA = (1<<2) # macro +IORING_POLL_ADD_LEVEL = (1<<3) # macro +IORING_ASYNC_CANCEL_ALL = (1<<0) # macro +IORING_ASYNC_CANCEL_FD = (1<<1) # macro +IORING_ASYNC_CANCEL_ANY = (1<<2) # macro +IORING_ASYNC_CANCEL_FD_FIXED = (1<<3) # macro +IORING_ASYNC_CANCEL_USERDATA = (1<<4) # macro +IORING_ASYNC_CANCEL_OP = (1<<5) # macro +IORING_RECVSEND_POLL_FIRST = (1<<0) # macro +IORING_RECV_MULTISHOT = (1<<1) # macro +IORING_RECVSEND_FIXED_BUF = (1<<2) # 
macro +IORING_SEND_ZC_REPORT_USAGE = (1<<3) # macro +IORING_NOTIF_USAGE_ZC_COPIED = (1<<31) # macro +IORING_ACCEPT_MULTISHOT = (1<<0) # macro +IORING_MSG_RING_CQE_SKIP = (1<<0) # macro +IORING_MSG_RING_FLAGS_PASS = (1<<1) # macro +IORING_FIXED_FD_NO_CLOEXEC = (1<<0) # macro IORING_CQE_F_BUFFER = (1<<0) # macro IORING_CQE_F_MORE = (1<<1) # macro +IORING_CQE_F_SOCK_NONEMPTY = (1<<2) # macro +IORING_CQE_F_NOTIF = (1<<3) # macro IORING_OFF_SQ_RING = 0 # macro IORING_OFF_CQ_RING = 0x8000000 # macro IORING_OFF_SQES = 0x10000000 # macro +IORING_OFF_PBUF_RING = 0x80000000 # macro +IORING_OFF_PBUF_SHIFT = 16 # macro +IORING_OFF_MMAP_MASK = 0xf8000000 # macro IORING_SQ_NEED_WAKEUP = (1<<0) # macro IORING_SQ_CQ_OVERFLOW = (1<<1) # macro +IORING_SQ_TASKRUN = (1<<2) # macro IORING_CQ_EVENTFD_DISABLED = (1<<0) # macro IORING_ENTER_GETEVENTS = (1<<0) # macro IORING_ENTER_SQ_WAKEUP = (1<<1) # macro IORING_ENTER_SQ_WAIT = (1<<2) # macro IORING_ENTER_EXT_ARG = (1<<3) # macro +IORING_ENTER_REGISTERED_RING = (1<<4) # macro IORING_FEAT_SINGLE_MMAP = (1<<0) # macro IORING_FEAT_NODROP = (1<<1) # macro IORING_FEAT_SUBMIT_STABLE = (1<<2) # macro @@ -1137,6 +1752,10 @@ IORING_FEAT_SQPOLL_NONFIXED = (1<<7) # macro IORING_FEAT_EXT_ARG = (1<<8) # macro IORING_FEAT_NATIVE_WORKERS = (1<<9) # macro IORING_FEAT_RSRC_TAGS = (1<<10) # macro +IORING_FEAT_CQE_SKIP = (1<<11) # macro +IORING_FEAT_LINKED_FILE = (1<<12) # macro +IORING_FEAT_REG_REG_RING = (1<<13) # macro +IORING_RSRC_REGISTER_SPARSE = (1<<0) # macro IORING_REGISTER_FILES_SKIP = (-2) # macro IO_URING_OP_SUPPORTED = (1<<0) # macro @@ -1148,6 +1767,7 @@ c__Ea_IOSQE_FIXED_FILE_BIT__enumvalues = { 3: 'IOSQE_IO_HARDLINK_BIT', 4: 'IOSQE_ASYNC_BIT', 5: 'IOSQE_BUFFER_SELECT_BIT', + 6: 'IOSQE_CQE_SKIP_SUCCESS_BIT', } IOSQE_FIXED_FILE_BIT = 0 IOSQE_IO_DRAIN_BIT = 1 @@ -1155,6 +1775,7 @@ IOSQE_IO_LINK_BIT = 2 IOSQE_IO_HARDLINK_BIT = 3 IOSQE_ASYNC_BIT = 4 IOSQE_BUFFER_SELECT_BIT = 5 +IOSQE_CQE_SKIP_SUCCESS_BIT = 6 c__Ea_IOSQE_FIXED_FILE_BIT = 
ctypes.c_uint32 # enum IOSQE_FIXED_FILE = (1<EC_NONE and ecode {reloc_sysfs}'") + assert FileIOInterface(reloc_sysfs, os.O_RDONLY).read()[0] == "0", f"Failed to disable migration of locked pages. Please run {cmd} manually." + # Try to init vfio. Use it if success. if getenv("VFIO", 0): try: @@ -711,6 +717,9 @@ class PCIIface: self._setup_adev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I')) self.doorbell_cpu_addr = dbell.addr + if first_dev: + FileIOInterface.anon_mmap((alloc:=self.adev.mm.va_allocator).base, alloc.size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, 0) + pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND) @@ -729,20 +738,18 @@ class PCIIface: def _map_pci_range(self, bar, off=0, addr=0, size=None, fmt='B'): fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1) libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK) - assert loc != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(addr)}" return MMIOInterface(loc, sz, fmt=fmt) def alloc(self, size:int, host=False, uncached=False, cpu_access=False): if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory. 
vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE) - va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) - assert va != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(vaddr)}" + FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS|MAP_POPULATE|MAP_LOCKED|MAP_FIXED, 0) # Read pagemap to get the physical address of each page. The pages are locked. - self.pagemap.seek(va // mmap.PAGESIZE * 8) + self.pagemap.seek(vaddr // mmap.PAGESIZE * 8) paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))] am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True) - return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access), + return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=True), view=MMIOInterface(am_mapping.va_addr, size, fmt='B')) am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access) @@ -886,8 +893,8 @@ class AMDDevice(HCQCompiled): max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000 self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000) - super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch), - AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self), + super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 1) else AMDRenderer(self.arch), + AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 1) else HIPCompiler(self.arch), 
functools.partial(AMDProgram, self), AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size), kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000) diff --git a/tinygrad_repo/tinygrad/runtime/ops_cuda.py b/tinygrad_repo/tinygrad/runtime/ops_cuda.py index af50545..f62577b 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_cuda.py +++ b/tinygrad_repo/tinygrad/runtime/ops_cuda.py @@ -46,7 +46,8 @@ class CUDAProgram: if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem)) def __del__(self): - if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module)) + try: check(cuda.cuModuleUnload(self.module)) + except AttributeError: pass def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False): check(cuda.cuCtxSetCurrent(self.dev.context)) @@ -67,8 +68,10 @@ class CUDAAllocator(LRUAllocator['CUDADevice']): if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01))) return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) def _free(self, opaque, options:BufferSpec): - if options.host: check(cuda.cuMemFreeHost(opaque)) - else: check(cuda.cuMemFree_v2(opaque)) + try: + if options.host: check(cuda.cuMemFreeHost(opaque)) + else: check(cuda.cuMemFree_v2(opaque)) + except (TypeError, AttributeError): pass def _copyin(self, dest, src:memoryview): check(cuda.cuCtxSetCurrent(self.dev.context)) host_mem = self.alloc(len(src), BufferSpec(host=True)) diff --git a/tinygrad_repo/tinygrad/runtime/ops_disk.py b/tinygrad_repo/tinygrad/runtime/ops_disk.py index fb68034..34d4999 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_disk.py +++ b/tinygrad_repo/tinygrad/runtime/ops_disk.py @@ -72,7 +72,7 @@ class DiskBuffer: MAP_LOCKED, 
MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000) class DiskAllocator(Allocator): - def __init__(self, dev:DiskDevice): self.dev = dev + def __init__(self, dev:DiskDevice): super().__init__(dev) def _alloc(self, size:int, options): self.dev._might_open(size) return DiskBuffer(self.dev, size) diff --git a/tinygrad_repo/tinygrad/runtime/ops_dsp.py b/tinygrad_repo/tinygrad/runtime/ops_dsp.py index 449f893..ccbdda2 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_dsp.py +++ b/tinygrad_repo/tinygrad/runtime/ops_dsp.py @@ -36,7 +36,7 @@ class DSPRenderer(ClangRenderer): device = "DSP" supports_float4 = True buffer_suffix = " restrict __attribute__((align_value(128)))" - kernel_prefix = "__attribute__((noinline)) " + kernel_typedef = "__attribute__((noinline)) void" pre_matcher = dsp_pm extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher string_rewrite = dsp_string+ClangRenderer.string_rewrite diff --git a/tinygrad_repo/tinygrad/runtime/ops_gpu.py b/tinygrad_repo/tinygrad/runtime/ops_gpu.py index e07c377..0a8304d 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_gpu.py +++ b/tinygrad_repo/tinygrad/runtime/ops_gpu.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Optional, cast -import ctypes, functools, hashlib, contextlib +import ctypes, functools, hashlib from tinygrad.runtime.autogen import opencl as cl from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer @@ -41,8 +41,10 @@ class CLProgram: self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status) def __del__(self): - with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel)) - with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program)) + try: check(cl.clReleaseKernel(self.kernel)) + except (TypeError, AttributeError): pass + try: 
check(cl.clReleaseProgram(self.program)) + except (TypeError, AttributeError): pass def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501 for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b)) @@ -65,7 +67,9 @@ class CLAllocator(LRUAllocator['CLDevice']): cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]), options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options) return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options) - def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0])) + def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): + try: check(cl.clReleaseMemObject(opaque[0])) + except AttributeError: pass def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview): if dest[1].image is not None: check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0), diff --git a/tinygrad_repo/tinygrad/runtime/ops_nv.py b/tinygrad_repo/tinygrad/runtime/ops_nv.py index 54a5323..f9decbd 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_nv.py +++ b/tinygrad_repo/tinygrad/runtime/ops_nv.py @@ -159,7 +159,7 @@ class NVComputeQueue(NVCommandQueue): qmd = QMD(dev=prg.dev, addr=cast(int, qmd_buf.va_addr)) # Save qmd for later update - self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width')) + self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width' if qmd.ver<4 else 'grid_width')) self.bind_sints_to_mem(*(local_size[:2]), mem=qmd_buf.cpu_view(), fmt='H', 
offset=qmd.field_offset('cta_thread_dimension0')) self.bind_sints_to_mem(local_size[2], mem=qmd_buf.cpu_view(), fmt='B', offset=qmd.field_offset('cta_thread_dimension2')) qmd.set_constant_buf_addr(0, args_state.buf.va_addr) @@ -179,8 +179,9 @@ class NVComputeQueue(NVCommandQueue): if self.active_qmd.read(f'release{i}_enable') == 0: self.active_qmd.write(**{f'release{i}_enable': 1}) self.bind_sints_to_mem(signal.value_addr, mem=self.active_qmd_buf.cpu_view(), fmt='Q', mask=0xfffffffff, - offset=self.active_qmd.field_offset(f'release{i}_address_lower')) - self.bind_sints_to_mem(value, mem=self.active_qmd_buf.cpu_view(), fmt='Q', offset=self.active_qmd.field_offset(f'release{i}_payload_lower')) + offset=self.active_qmd.field_offset(f'release{i}_address_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_addr_lower')) + self.bind_sints_to_mem(value, mem=self.active_qmd_buf.cpu_view(), fmt='Q', + offset=self.active_qmd.field_offset(f'release{i}_payload_lower' if self.active_qmd.ver<4 else f'release_semaphore{i}_payload_lower')) return self self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value), @@ -253,19 +254,22 @@ class NVProgram(HCQProgram): if dev.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A: self.constbuffer_0[188:192], self.constbuffer_0[223] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window)], 0xfffdc0 - qmd = {'qmd_major_version':5, 'unknown_13':0x1, 'program_address_upper':hi32(self.prog_addr>>4),'program_address_lower':lo32(self.prog_addr>>4)} + qmd = {'qmd_major_version':5, 'qmd_type':nv_gpu.NVCEC0_QMDV05_00_QMD_TYPE_GRID_CTA, 'register_count':self.regs_usage, + 'program_address_upper_shifted4':hi32(self.prog_addr>>4), 'program_address_lower_shifted4':lo32(self.prog_addr>>4), + 'shared_memory_size_shifted7':self.shmem_usage>>7, 'shader_local_memory_high_size_shifted4':self.dev.slm_per_thread>>4} else: self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), 
*data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)] - qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'cwd_membar_type':nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, - 'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr)} + qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'shader_local_memory_high_size':self.dev.slm_per_thread, + 'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr), 'shared_memory_size':self.shmem_usage, + 'register_count_v':self.regs_usage} smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1 self.qmd:QMD = QMD(dev, **qmd, qmd_group_id=0x3f, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1, invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, barrier_count=1, - constant_buffer_invalidate_0=1, register_count_v=self.regs_usage, shader_local_memory_high_size=self.dev.slm_per_thread, + cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, constant_buffer_invalidate_0=1, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg, max_sm_config_shared_mem_size=0x1a, - shared_memory_size=self.shmem_usage, program_prefetch_size=min(self.prog_sz>>8, 0x1ff), sass_version=dev.sass_version, + program_prefetch_size=min(self.prog_sz>>8, 0x1ff), sass_version=dev.sass_version, program_prefetch_addr_upper_shifted=self.prog_addr>>40, program_prefetch_addr_lower_shifted=self.prog_addr>>8) for i,(addr,sz) in self.constbufs.items(): @@ -298,8 +302,10 @@ class NVAllocator(HCQAllocator['NVDevice']): return self.dev._gpu_alloc(size, cpu_access=options.cpu_access, tag=f"user memory ({options})") def _free(self, opaque:HCQBuffer, options:BufferSpec): - self.dev.synchronize() - self.dev._gpu_free(opaque) + try: + self.dev.synchronize() + self.dev._gpu_free(opaque) + except 
AttributeError: pass def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf) @@ -436,9 +442,9 @@ class NVDevice(HCQCompiled[NVSignal]): if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid: raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?") + self.fd_dev = self._new_gpu_fd() self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id) self.gpu_minor = NVDevice.gpus_info[self.device_id].minor_number - self.fd_dev = self._new_gpu_fd() device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root, vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES) diff --git a/tinygrad_repo/tinygrad/runtime/ops_remote.py b/tinygrad_repo/tinygrad/runtime/ops_remote.py index 9790d96..00440fd 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_remote.py +++ b/tinygrad_repo/tinygrad/runtime/ops_remote.py @@ -246,7 +246,9 @@ class RemoteAllocator(Allocator['RemoteDevice']): self.dev.q(BufferAlloc(buffer_num:=next(self.dev.buffer_num), size, options)) return buffer_num # TODO: options should not be here in any Allocator - def _free(self, opaque:int, options): self.dev.q(BufferFree(opaque)) + def _free(self, opaque:int, options): + try: self.dev.q(BufferFree(opaque)) + except (TypeError, AttributeError): pass def _copyin(self, dest:int, src:memoryview): self.dev.q(CopyIn(dest, self.dev.conn.req.h(src))) def _copyout(self, dest:memoryview, src:int): resp = self.dev.q(CopyOut(src), wait=True) diff --git a/tinygrad_repo/tinygrad/runtime/ops_webgpu.py b/tinygrad_repo/tinygrad/runtime/ops_webgpu.py index f7da78e..1386c96 100644 --- a/tinygrad_repo/tinygrad/runtime/ops_webgpu.py +++ b/tinygrad_repo/tinygrad/runtime/ops_webgpu.py @@ -189,7 +189,8 @@ class WebGpuAllocator(Allocator['WGPUDevPtr']): buffer_data = read_buffer(self.dev, src) dest[:] = 
buffer_data[:dest.nbytes] if webgpu.wgpuBufferGetSize(src) > dest.nbytes else buffer_data def _free(self, opaque:WGPUBufPtr, options:BufferSpec): - webgpu.wgpuBufferDestroy(opaque) + try: webgpu.wgpuBufferDestroy(opaque) + except AttributeError: pass class WebGpuDevice(Compiled): def __init__(self, device:str): diff --git a/tinygrad_repo/tinygrad/runtime/support/am/amdev.py b/tinygrad_repo/tinygrad/runtime/support/am/amdev.py index 1bbdfbd..5884069 100644 --- a/tinygrad_repo/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad_repo/tinygrad/runtime/support/am/amdev.py @@ -18,7 +18,7 @@ class AMRegister(AMDReg): def write(self, _am_val:int=0, **kwargs): self.adev.wreg(self.addr, _am_val | self.encode(**kwargs)) - def update(self, **kwargs): self.write(self.encode(**{**self.read_bitfields(), **kwargs})) + def update(self, **kwargs): self.write(self.read() & ~self.fields_mask(*kwargs.keys()), **kwargs) class AMFirmware: def __init__(self, adev): @@ -139,7 +139,7 @@ class AMPageTableTraverseContext: while size > 0: pt, pte_idx, pte_covers = self.pt_stack[-1] if self.create_pts: - while pte_covers > size: pt, pte_idx, pte_covers = self.level_down() + while pte_covers > size or self.vaddr & (pte_covers-1) != 0: pt, pte_idx, pte_covers = self.level_down() else: while pt.lv!=am.AMDGPU_VM_PTB and not self.adev.gmc.is_pte_huge_page(pt.entries[pte_idx]): pt, pte_idx, pte_covers = self.level_down() @@ -152,7 +152,7 @@ class AMPageTableTraverseContext: self.level_up() class AMMemoryManager: - va_allocator = TLSFAllocator(512 * (1 << 30), base=0x7F0000000000) # global for all devices. + va_allocator = TLSFAllocator(512 * (1 << 30), base=0x200000000000) # global for all devices. def __init__(self, adev:AMDev, vram_size:int): self.adev, self.vram_size = adev, vram_size @@ -265,7 +265,7 @@ class AMDev: # all blocks that are initialized only during the initial AM boot. # To determine if the GPU is in the third state, AM uses regSCRATCH_REG7 as a flag. 
self.is_booting, self.smi_dev = True, False # During boot only boot memory can be allocated. This flag is to validate this. - self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000004)) and (getenv("AM_RESET", 0) != 1) + self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000005)) and (getenv("AM_RESET", 0) != 1) # Memory manager & firmware self.mm = AMMemoryManager(self, self.vram_size) @@ -280,13 +280,13 @@ class AMDev: self.gfx:AM_GFX = AM_GFX(self) self.sdma:AM_SDMA = AM_SDMA(self) - if self.partial_boot and (self.reg("regGCVM_CONTEXT0_CNTL").read() != 0): - if DEBUG >= 2: print(f"am {self.devfmt}: MEC is active. Issue a full reset.") - self.partial_boot = False - # Init sw for all IP blocks for ip in [self.soc, self.gmc, self.ih, self.psp, self.smu, self.gfx, self.sdma]: ip.init_sw() + if self.partial_boot and (self.reg("regGCVM_CONTEXT0_CNTL").read() != 0 or self.reg(self.gmc.pf_status_reg("GC")).read() != 0): + if DEBUG >= 2: print(f"am {self.devfmt}: Malformed state. 
Issuing a full reset.") + self.partial_boot = False + # Init hw for IP blocks where it is needed if not self.partial_boot: if self.psp.is_sos_alive() and self.smu.is_smu_alive(): self.smu.mode1_reset() diff --git a/tinygrad_repo/tinygrad/runtime/support/am/ip.py b/tinygrad_repo/tinygrad/runtime/support/am/ip.py index 7412878..59cc734 100644 --- a/tinygrad_repo/tinygrad/runtime/support/am/ip.py +++ b/tinygrad_repo/tinygrad/runtime/support/am/ip.py @@ -38,10 +38,12 @@ class AM_GMC(AM_IP): # GFX11/GFX12 has 44-bit address space self.address_space_mask = (1 << 44) - 1 - self.memscratch_paddr = self.adev.mm.palloc(0x1000, zero=not self.adev.partial_boot, boot=True) - self.dummy_page_paddr = self.adev.mm.palloc(0x1000, zero=not self.adev.partial_boot, boot=True) + self.memscratch_paddr = self.adev.mm.palloc(0x1000, zero=False, boot=True) + self.dummy_page_paddr = self.adev.mm.palloc(0x1000, zero=False, boot=True) self.hub_initted = {"MM": False, "GC": False} + self.pf_status_reg = lambda ip: f"reg{ip}VM_L2_PROTECTION_FAULT_STATUS{'_LO32' if self.adev.ip_ver[am.GC_HWIP] >= (12,0,0) else ''}" + def init_hw(self): self.init_hub("MM") def flush_hdp(self): self.adev.wreg(self.adev.reg("regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL").read() // 4, 0x0) @@ -127,15 +129,14 @@ class AM_GMC(AM_IP): def on_interrupt(self): for ip in ["MM", "GC"]: - st = self.adev.reg(f"reg{ip}VM_L2_PROTECTION_FAULT_STATUS{'_LO32' if self.adev.ip_ver[am.GC_HWIP] >= (12,0,0) else ''}").read() - va = (self.adev.reg(f'reg{ip}VM_L2_PROTECTION_FAULT_ADDR_LO32').read() - | (self.adev.reg(f'reg{ip}VM_L2_PROTECTION_FAULT_ADDR_HI32').read()) << 32) << 12 - if st: raise RuntimeError(f"{ip}VM_L2_PROTECTION_FAULT_STATUS: {st:#x} {va:#x}") + va = (self.adev.reg(f'reg{ip}VM_L2_PROTECTION_FAULT_ADDR_HI32').read()<<32) | self.adev.reg(f'reg{ip}VM_L2_PROTECTION_FAULT_ADDR_LO32').read() + if self.adev.reg(self.pf_status_reg(ip)).read(): + raise RuntimeError(f"{ip}VM_L2_PROTECTION_FAULT_STATUS: 
{self.adev.reg(self.pf_status_reg(ip)).read_bitfields()} {va<<12:#x}") class AM_SMU(AM_IP): def init_sw(self): self.smu_mod = self.adev._ip_module("smu", am.MP1_HWIP, prever_prefix='v') - self.driver_table_paddr = self.adev.mm.palloc(0x4000, zero=not self.adev.partial_boot, boot=True) + self.driver_table_paddr = self.adev.mm.palloc(0x4000, zero=False, boot=True) def init_hw(self): self._send_msg(self.smu_mod.PPSMC_MSG_SetDriverDramAddrHigh, hi32(self.adev.paddr2mc(self.driver_table_paddr))) @@ -377,7 +378,7 @@ class AM_PSP(AM_IP): self.msg1_addr, self.msg1_view = self.adev.paddr2mc(self.msg1_paddr), self.adev.vram.view(self.msg1_paddr, am.PSP_1_MEG, 'B') self.cmd_paddr = self.adev.mm.palloc(am.PSP_CMD_BUFFER_SIZE, zero=False, boot=True) - self.fence_paddr = self.adev.mm.palloc(am.PSP_FENCE_BUFFER_SIZE, zero=not self.adev.partial_boot, boot=True) + self.fence_paddr = self.adev.mm.palloc(am.PSP_FENCE_BUFFER_SIZE, zero=True, boot=True) self.ring_size = 0x10000 self.ring_paddr = self.adev.mm.palloc(self.ring_size, zero=False, boot=True) diff --git a/tinygrad_repo/tinygrad/runtime/support/amd.py b/tinygrad_repo/tinygrad/runtime/support/amd.py index 3703486..b29e618 100644 --- a/tinygrad_repo/tinygrad/runtime/support/amd.py +++ b/tinygrad_repo/tinygrad/runtime/support/amd.py @@ -11,10 +11,9 @@ class AMDReg: def encode(self, **kwargs) -> int: return functools.reduce(int.__or__, (value << self.fields[name][0] for name,value in kwargs.items()), 0) def decode(self, val: int) -> dict: return {name:getbits(val, start, end) for name,(start,end) in self.fields.items()} - def field_mask(self, field_name) -> int: - start, end = self.fields[field_name] - num_bits = end - start + 1 - return ((1 << num_bits) - 1) << start + + def fields_mask(self, *names) -> int: + return functools.reduce(int.__or__, ((((1 << (self.fields[nm][1]-self.fields[nm][0]+1)) - 1) << self.fields[nm][0]) for nm in names), 0) @property def addr(self): return self.bases[self.segment] + self.offset @@ -41,7 
+40,7 @@ def fixup_ip_version(ip:str, version:tuple[int, ...]) -> list[tuple[int, ...]]: return version if ip in ['nbio', 'nbif']: version = _apply_ovrd({(3,3): (2,3,0)}) - elif ip == 'mp': version = _apply_ovrd({(14,0,3): (14,0,2)}) + elif ip in ['mp', 'smu']: version = _apply_ovrd({(14,0,3): (14,0,2)}) return [version, version[:2], version[:2]+(0,), version[:1]+(0, 0)] diff --git a/tinygrad_repo/tinygrad/runtime/support/elf.py b/tinygrad_repo/tinygrad/runtime/support/elf.py index 26495ca..3276e6a 100644 --- a/tinygrad_repo/tinygrad/runtime/support/elf.py +++ b/tinygrad_repo/tinygrad/runtime/support/elf.py @@ -1,6 +1,7 @@ -import struct, tinygrad.runtime.autogen.libc as libc +import struct from dataclasses import dataclass from tinygrad.helpers import getbits, i2u +import tinygrad.runtime.autogen.libc as libc @dataclass(frozen=True) class ElfSection: name:str; header:libc.Elf64_Shdr; content:bytes # noqa: E702 diff --git a/tinygrad_repo/tinygrad/runtime/support/hcq.py b/tinygrad_repo/tinygrad/runtime/support/hcq.py index d24c2fb..8dcf154 100644 --- a/tinygrad_repo/tinygrad/runtime/support/hcq.py +++ b/tinygrad_repo/tinygrad/runtime/support/hcq.py @@ -26,7 +26,10 @@ class FileIOInterface: def __del__(self): if hasattr(self, 'fd'): os.close(self.fd) def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg) - def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset) + def mmap(self, start, sz, prot, flags, offset): + x = libc.mmap(start, sz, prot, flags, self.fd, offset) + if x == 0xffffffffffffffff: raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}") + return x def read(self, size=None, binary=False, offset=None): if offset is not None: self.seek(offset) with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size) @@ -36,7 +39,10 @@ class FileIOInterface: def listdir(self): return os.listdir(self.path) def seek(self, offset): 
os.lseek(self.fd, offset, os.SEEK_SET) @staticmethod - def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset) + def anon_mmap(start, sz, prot, flags, offset): + x = libc.mmap(start, sz, prot, flags, -1, offset) + if x == 0xffffffffffffffff: raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}") + return x @staticmethod def munmap(buf, sz): return libc.munmap(buf, sz) @staticmethod diff --git a/tinygrad_repo/tinygrad/runtime/support/llvm.py b/tinygrad_repo/tinygrad/runtime/support/llvm.py index bb1490e..d20de02 100644 --- a/tinygrad_repo/tinygrad/runtime/support/llvm.py +++ b/tinygrad_repo/tinygrad/runtime/support/llvm.py @@ -10,13 +10,13 @@ if sys.platform == 'win32': elif OSX: # Will raise FileNotFoundError if brew is not installed # `brew --prefix` will return even if formula is not installed - if not os.path.exists(brew_prefix:=subprocess.check_output(['brew', '--prefix', 'llvm@19']).decode().strip()): - raise FileNotFoundError('LLVM not found, you can install it with `brew install llvm@19`') + if not os.path.exists(brew_prefix:=subprocess.check_output(['brew', '--prefix', 'llvm@20']).decode().strip()): + raise FileNotFoundError('LLVM not found, you can install it with `brew install llvm@20`') LLVM_PATH: str|None = os.path.join(brew_prefix, 'lib', 'libLLVM.dylib') else: LLVM_PATH = ctypes.util.find_library('LLVM') # use newer LLVM if possible - for ver in reversed(range(14, 19+1)): + for ver in reversed(range(14, 20+1)): if LLVM_PATH is not None: break LLVM_PATH = ctypes.util.find_library(f'LLVM-{ver}') if LLVM_PATH is None: diff --git a/tinygrad_repo/tinygrad/runtime/support/webgpu.py b/tinygrad_repo/tinygrad/runtime/support/webgpu.py index 24fb75c..9b5362e 100644 --- a/tinygrad_repo/tinygrad/runtime/support/webgpu.py +++ b/tinygrad_repo/tinygrad/runtime/support/webgpu.py @@ -1,4 +1,4 @@ -import ctypes, ctypes.util, os, subprocess +import ctypes, ctypes.util, os, subprocess, 
platform from tinygrad.helpers import OSX if OSX: @@ -8,4 +8,5 @@ if OSX: else: if (WEBGPU_PATH:=ctypes.util.find_library('webgpu_dawn')) is None: raise FileNotFoundError("dawn library not found. " + - "Install it with `sudo curl -L https://github.com/wpmed92/pydawn/releases/download/v0.1.6/libwebgpu_dawn.so -o /usr/lib/libwebgpu_dawn.so`") + "Install it with `sudo curl -L https://github.com/wpmed92/pydawn/releases/download/v0.3.0/" + + f"libwebgpu_dawn_{platform.machine()}.so -o /usr/lib/libwebgpu_dawn.so`") diff --git a/tinygrad_repo/tinygrad/shape/shapetracker.py b/tinygrad_repo/tinygrad/shape/shapetracker.py index 031fb64..ea716d9 100644 --- a/tinygrad_repo/tinygrad/shape/shapetracker.py +++ b/tinygrad_repo/tinygrad/shape/shapetracker.py @@ -7,7 +7,7 @@ from tinygrad.helpers import merge_dicts, getenv from tinygrad.shape.view import View, strides_for_shape, unravel from tinygrad.dtype import dtypes from tinygrad.uop.ops import UOp, Ops, graph_rewrite, Variable, sint, sint_to_uop, Context, PatternMatcher, UPat, GroupOp -from tinygrad.codegen.symbolic import split_uop, symbolic_flat, uop_given_valid, simplify_valid +from tinygrad.uop.symbolic import split_uop, symbolic_flat, uop_given_valid, simplify_valid # If a node overflow, its srcs need to be checked to see if this overflow is the result of an ALU operation, # or that the node simply inherits the dtype from srcs. Upcast is either `Ops.CAST`+`replace` or just `replace`. 
diff --git a/tinygrad_repo/tinygrad/shape/view.py b/tinygrad_repo/tinygrad/shape/view.py index bc7ba07..5351ce4 100644 --- a/tinygrad_repo/tinygrad/shape/view.py +++ b/tinygrad_repo/tinygrad/shape/view.py @@ -3,7 +3,7 @@ import functools, operator, itertools from dataclasses import dataclass from typing import Optional, cast, Sequence from tinygrad.dtype import dtypes -from tinygrad.uop.ops import resolve, UOp, Variable, sint, sym_infer, smax, smin, sint_to_uop, Ops +from tinygrad.uop.ops import resolve, UOp, Variable, sint, sym_infer, smax, smin, sint_to_uop, Ops, ssimplify from tinygrad.helpers import prod, all_int, argsort, flatten, ceildiv @functools.cache @@ -51,7 +51,7 @@ def _reshape_mask(_mask:Optional[tuple[tuple[sint, sint], ...]], old_shape:tuple curr_stride, old_dim, new_dim, mask = 1, next(r_shape, 1), next(r_new_shape, 1), next(r_masks, (0,1)) while len(new_mask) < len(new_shape): - (l, r), next_stride = mask, new_dim * curr_stride + (l, r), next_stride = mask, ssimplify(new_dim * curr_stride) # need to split mask if old_dim == next_stride: # simply copy the mask and get next batch for merging @@ -66,7 +66,7 @@ def _reshape_mask(_mask:Optional[tuple[tuple[sint, sint], ...]], old_shape:tuple next_mask = next(r_masks, (0, 1)) # combine if the mask can unfold continuously if mask != (0, old_dim) and l != r and next_mask[1] - next_mask[0] != 1: return None - mask, old_dim = (next_mask[0] * old_dim + l, (next_mask[1] - 1) * old_dim + r), old_dim * next(r_shape, 1) + mask, old_dim = (next_mask[0] * old_dim + l, (next_mask[1] - 1) * old_dim + r), ssimplify(old_dim * next(r_shape, 1)) return tuple(reversed(new_mask)) @@ -160,7 +160,11 @@ class View: if vm1.mask: if (new_vm1 := vm1.shrink(vm1.mask)) == vm1 or (merged := vm2 + new_vm1) is None: return None return merged.pad(tuple((b,s-e) for (b,e),s in zip(vm1.mask, vm1.shape))) - if not all_int(vm1.shape): return None + if not all_int(vm1.shape): + # if all strides are 0 and vm2 is unmasked, return vm1 + if 
all(x == 0 for x in vm2.strides+vm1.strides) and vm2.mask is None: return vm1 + # TODO: handle more cases + return None # Project vm1's offset and strides on to vm2. origin = unravel(vm2.shape, vm1.offset) @@ -256,8 +260,8 @@ class View: # NOTE: does not check multiple of symbolic shape assert all(resolve(s == ns) or s == 1 for s,ns in zip(self.shape, new_shape)), f"can't expand {self.shape} into {new_shape}" if 0 in self.shape: return View.create(new_shape) - # TODO: this resolve may not be needed, but it's hard because vars need to be sorted - mask = tuple([(((0,0) if m != (0,1) else (0,ns)) if resolve(s != ns, False) else m) \ + # TODO: resolve may not be needed, but it's hard because vars need to be canonicalized + mask = tuple([(((0,0) if m != (0,1) else (0,ns)) if resolve(s != ns) and resolve(s == 1, False) else m) \ for m,s,ns in zip(self.mask, self.shape, new_shape)]) if self.mask else None return View.create(new_shape, self.strides, self.offset, mask) diff --git a/tinygrad_repo/tinygrad/tensor.py b/tinygrad_repo/tinygrad/tensor.py index 42ff44b..8ce86cb 100644 --- a/tinygrad_repo/tinygrad/tensor.py +++ b/tinygrad_repo/tinygrad/tensor.py @@ -20,7 +20,7 @@ from tinygrad.engine.grouper import get_kernelize_map all_tensors: set[weakref.ref[Tensor]] = set() def _find_all_tensors_for_uops(all_uops: set[UOp]) -> list[Tensor]: - return [t for tref in all_tensors if (t:=tref()) is not None and t.lazydata in all_uops] + return [t for tref in all_tensors if (t:=tref()) is not None and t.uop in all_uops] def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str|None=None) -> None: # get all children of keys in applied_map @@ -36,21 +36,19 @@ def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str|None=None) -> Non # NOTE: this uses all_tensors, but it's fast if len(fixed_tensors := _find_all_tensors_for_uops(all_uops)): # potentially rewrite all the discovered Tensors - sink = UOp.sink(*[t.lazydata for t in fixed_tensors]) + sink = UOp.sink(*[t.uop for t 
in fixed_tensors]) new_sink = sink.substitute(applied_map, name=name) - # set the relevant lazydata to the realized UOps + # set the relevant uop to the realized UOps for t,s,ns in zip(fixed_tensors, sink.src, new_sink.src): if s is ns: continue - t.lazydata = ns + t.uop = ns # **** Tensor helper functions **** # this tracks the tensor.py METADATA _METADATA: contextvars.ContextVar[Optional[Metadata]] = contextvars.ContextVar("_METADATA", default=None) -def _metaop(op, shape:tuple[sint,...], dtype:DType, device:str|tuple[str, ...], arg=None) -> UOp: return UOp.metaop(op, shape, dtype, device, arg) - def _fromnp(x: 'np.ndarray') -> UOp: # type: ignore [name-defined] # noqa: F821 ret = UOp.new_buffer("NPY", x.size, _from_np_dtype(x.dtype)) # fake realize @@ -121,9 +119,8 @@ class Tensor(MathTrait): np.set_printoptions(precision=4) ``` """ - __slots__ = "lazydata", "requires_grad", "grad" + __slots__ = "uop", "requires_grad", "grad" training: ClassVar[bool] = False - no_grad: ClassVar[bool] = False def __init__(self, data:ConstType|bytes|list|tuple|UOp|'np.ndarray'|pathlib.Path|None, # type: ignore [name-defined] # noqa: F821 device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None): @@ -141,20 +138,24 @@ class Tensor(MathTrait): # create a UOp from the different types of inputs if isinstance(data, UOp): assert dtype is None or dtype==data.dtype, "dtype doesn't match, and casting isn't supported" - if data.op is Ops.BIND: data = _metaop(Ops.BIND, tuple(), dtype or data.dtype, device, data) - elif data is None: data = _metaop(Ops.CONST, (0,), dtype or dtypes.default_float, device, arg=0) - elif isinstance(data, get_args(ConstType)): data = _metaop(Ops.CONST, tuple(), dtype or dtypes.from_py(data), device, data) + if data.op is Ops.BIND: + var, val = data.unbind() + # give the bound constant a device + const = UOp.const(var.dtype, val, device, ()) + data = data.replace(src=(var.replace(src=const.src), const)) + elif data is None: data = 
UOp.const(dtype or dtypes.default_float, 0, device, ()) + elif isinstance(data, get_args(ConstType)): data = UOp.const(dtype or dtypes.from_py(data), data, device, ()) elif isinstance(data, bytes): data = _frompy(data, dtypes.uint8 if dtype is None else dtype) elif isinstance(data, (list, tuple)): if dtype is None: if (d := fully_flatten(data)) and all(isinstance(s, bool) for s in d): dtype = dtypes.bool else: dtype = dtypes.default_int if d and all_int(d) else dtypes.default_float # NOTE: this works because all_int([True, False]) is True - if dtype in [dtypes.bfloat16, *dtypes.fp8s]: data = Tensor(_frompy(data, dtypes.float32), device=device).cast(dtype).lazydata + if dtype in [dtypes.bfloat16, *dtypes.fp8s]: data = Tensor(_frompy(data, dtypes.float32), device=device).cast(dtype).uop else: data = _frompy(data, dtype) elif str(type(data)) == "": import numpy as np assert isinstance(data, np.ndarray), f"expected np.ndarray, got {data}" - if data.shape == (): data = _metaop(Ops.CONST, tuple(), dtype or _from_np_dtype(data.dtype), device, data.item()) + if data.shape == (): data = UOp.const(dtype or _from_np_dtype(data.dtype), data.item(), device, ()) else: data = _fromnp(data.astype(npdtype) if dtype is not None and (npdtype:=_to_np_dtype(dtype)) is not None else data) # type: ignore [name-defined] elif isinstance(data, pathlib.Path): dtype = dtype or dtypes.uint8 @@ -164,19 +165,19 @@ class Tensor(MathTrait): if not isinstance(data, UOp): raise RuntimeError(f"can't create Tensor from {data!r} with type {type(data)}") # data might be on a different device - if isinstance(device, str): self.lazydata:UOp = data if data.device == device else data.copy_to_device(device) + if isinstance(device, str): self.uop:UOp = data if data.device == device else data.copy_to_device(device) # if device is a tuple, we should have/construct a MultiLazyBuffer - elif isinstance(data, UOp) and isinstance(data.device, str): self.lazydata = Tensor(data).shard(device).lazydata + elif 
isinstance(data.device, str): self.uop = Tensor(data).shard(device).uop else: assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" - self.lazydata = data + self.uop = data # add to all_tensors after construction succeeds all_tensors.add(weakref.ref(self)) def __del__(self): all_tensors.discard(weakref.ref(self)) def _apply_uop(self, fxn:Callable, *x:Tensor, **kwargs) -> Tensor: - new_uop: UOp = fxn(*[t.lazydata for t in (self,)+x], **kwargs) + new_uop: UOp = fxn(*[t.uop for t in (self,)+x], **kwargs) if (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,) needs_input_grad = [t.requires_grad for t in (self,)+x] return Tensor(new_uop, device=new_uop.device, requires_grad=True if any(needs_input_grad) else None if None in needs_input_grad else False) @@ -185,6 +186,9 @@ class Tensor(MathTrait): lhs,rhs = self._broadcasted(x, reverse) return lhs._apply_uop(fxn, rhs) + # _binop is used by MathTrait + def _binop(self, op, x, reverse): return self._apply_broadcasted_uop(lambda *u: UOp.alu(u[0], op, *u[1:]), x, reverse) + def requires_grad_(self, requires_grad=True) -> Tensor: self.requires_grad = requires_grad return self @@ -194,15 +198,10 @@ class Tensor(MathTrait): def __enter__(self): self.prev, Tensor.training = Tensor.training, self.mode def __exit__(self, exc_type, exc_value, traceback): Tensor.training = self.prev - class test(ContextDecorator): - def __init__(self, mode:bool = True): self.mode = mode - def __enter__(self): self.prev, Tensor.no_grad = Tensor.no_grad, self.mode - def __exit__(self, exc_type, exc_value, traceback): Tensor.no_grad = self.prev - def __repr__(self): - ld = self.lazydata + ld = self.uop ld_repr = f"" - return f"" + return f"" # Python has a non moving GC, so this should be okay def __hash__(self): return id(self) @@ -214,13 +213,13 @@ class Tensor(MathTrait): return self.shape[0] @property - def device(self) -> str|tuple[str, ...]: return self.lazydata.device + def 
device(self) -> str|tuple[str, ...]: return self.uop.device @property - def shape(self) -> tuple[sint, ...]: return self.lazydata.shape + def shape(self) -> tuple[sint, ...]: return self.uop.shape @property - def dtype(self) -> DType: return self.lazydata.dtype + def dtype(self) -> DType: return self.uop.dtype # ***** data handlers **** @@ -230,7 +229,7 @@ class Tensor(MathTrait): NOTE: Kernelize can be called multiple times on a Tensor """ - big_sink = UOp.sink(*[x.lazydata for x in (self,)+lst]) + big_sink = UOp.sink(*[x.uop for x in (self,)+lst]) # verify Tensors match the spec if __debug__: type_verify(list(big_sink.toposort()), tensor_uop_spec) @@ -247,7 +246,7 @@ class Tensor(MathTrait): """ st = time.perf_counter() self.kernelize(*lst) - sink = UOp.sink(*[x.lazydata for x in (self,)+lst]) + sink = UOp.sink(*[x.uop for x in (self,)+lst]) # remove all ASSIGNs, after scheduling, the tensors are just buffers remove_assign_map = {u:u.buf_uop for u in sink.toposort() if u.op is Ops.ASSIGN} @@ -276,31 +275,31 @@ class Tensor(MathTrait): """ # used for replacing a Tensor with a new version of it (potentially with a different device and dtype) assert self.shape == x.shape or allow_shape_mismatch, f"replace shape mismatch {self.shape} != {x.shape}" - self.lazydata = x.lazydata + self.uop = x.uop return self def assign(self, x) -> Tensor: # TODO: this is a hack for writing to DISK. 
remove with working assign if isinstance(self.device, str) and self.device.startswith("DISK"): if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype) - cast(Buffer, self.contiguous().realize().lazydata.base.buffer).ensure_allocated().copyin(x._data()) + cast(Buffer, self.contiguous().realize().uop.base.buffer).ensure_allocated().copyin(x._data()) return self if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype) - if self.lazydata is x.lazydata: return self # a self assign is a NOOP + if self.uop is x.uop: return self # a self assign is a NOOP # NOTE: we allow cross device assign assert self.shape == x.shape, f"assign shape mismatch {self.shape} != {x.shape}" assert self.device == x.device, f"assign device mismatch {self.device} != {x.device}" assert self.dtype == x.dtype, f"assign dtype mismatch {self.dtype} != {x.dtype}" - self.lazydata = self.lazydata.assign(x.lazydata) + self.uop = self.uop.assign(x.uop) return self def detach(self) -> Tensor: """ Returns a new tensor with the same data as this tensor, but detached from the autograd graph. 
""" - return Tensor(self.lazydata.detach(), device=self.device, requires_grad=False) + return Tensor(self.uop.detach(), device=self.device, requires_grad=False) - def _buffer(self) -> Buffer: return cast(Buffer, self.cast(self.dtype.base).contiguous().to("CPU").realize().lazydata.base.buffer) + def _buffer(self) -> Buffer: return cast(Buffer, self.cast(self.dtype.base).contiguous().to("CPU").realize().uop.base.buffer) def _data(self) -> memoryview: return self._buffer().as_buffer() def data(self) -> memoryview: @@ -376,7 +375,7 @@ class Tensor(MathTrait): device = tuple(Device.canonicalize(x) for x in device) if isinstance(device, (tuple, list)) else Device.canonicalize(device) if device == self.device: return self if not isinstance(device, str): return self.shard(device) - ret = Tensor(self.lazydata, device, requires_grad=self.requires_grad) + ret = Tensor(self.uop, device, requires_grad=self.requires_grad) if self.grad is not None: ret.grad = self.grad.to(device) return ret @@ -394,12 +393,12 @@ class Tensor(MathTrait): ```python exec="true" source="above" session="tensor" result="python" t = Tensor.empty(2, 4) - print(t.shard((t.device, t.device), axis=1).lazydata) + print(t.shard((t.device, t.device), axis=1).uop) ``` """ assert isinstance(self.device, str), "can't shard a MultiLazyBuffer" devices = tuple(Device.canonicalize(x) for x in devices) - mlb = self.lazydata.shard(devices, self._resolve_dim(axis)) if axis is not None else self.lazydata.copy_to_device(devices) + mlb = self.uop.shard(devices, self._resolve_dim(axis)) if axis is not None else self.uop.copy_to_device(devices) return Tensor(mlb, device=devices, requires_grad=self.requires_grad) def shard_(self, devices:tuple[str, ...], axis:int|None=None) -> Tensor: @@ -448,19 +447,19 @@ class Tensor(MathTrait): """ r = Tensor.empty(*shape, **kwargs) assert isinstance(r.device, str) - cast(Buffer, r.lazydata.buffer).allocate(external_ptr=ptr) + cast(Buffer, r.uop.buffer).allocate(external_ptr=ptr) return r 
@staticmethod def from_url(url:str, gunzip:bool=False, **kwargs) -> Tensor: """ - Create a Tensor from a URL. + Creates a Tensor from a URL. This is the preferred way to access Internet resources. It currently returns a DISK Tensor, but in the future it may return an HTTP Tensor. This also will soon become lazy (when possible) and not print progress without DEBUG. - THe `gunzip` flag will gzip extract the resource and return an extracted Tensor. + The `gunzip` flag will gzip extract the resource and return an extracted Tensor. """ return Tensor(fetch(url, gunzip=gunzip), **kwargs) @@ -520,12 +519,13 @@ class Tensor(MathTrait): Tensor._device_seeds[device] = Tensor( [int.from_bytes(hashlib.sha256(len(Tensor._device_seeds).to_bytes(4, "big")).digest(), "big"), Tensor._seed], device=device, dtype=dtypes.uint32, requires_grad=False) - Tensor._device_rng_counters[device] = Tensor([0], device=device, dtype=dtypes.uint32, requires_grad=False) + Tensor._device_rng_counters[device] = Tensor([num], device=device, dtype=dtypes.uint32, requires_grad=False) # increment rng counter for devices else: Tensor._device_rng_counters[device].assign(Tensor._device_rng_counters[device] + num).contiguous() # threefry random bits - counts0 = (Tensor.arange(ceildiv(num, 2), device=device, dtype=dtypes.uint32, requires_grad=False)+Tensor._device_rng_counters[device]) + bits_count = Tensor._device_rng_counters[device] - num + counts0 = (Tensor.arange(ceildiv(num, 2), device=device, dtype=dtypes.uint32, requires_grad=False)+bits_count) counts1 = counts0 + ceildiv(num, 2) bits = Tensor._threefry_random_bits(Tensor._device_seeds[device], counts0, counts1)[:num] @@ -722,7 +722,7 @@ class Tensor(MathTrait): dtype = kwargs.pop("dtype", self.dtype) if isinstance(self.device, tuple): if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `rand_like` of a multi device tensor") - return Tensor.rand(*self.shape, dtype=dtype, **kwargs).shard(self.device, self.lazydata.axis) + 
return Tensor.rand(*self.shape, dtype=dtype, **kwargs).shard(self.device, self.uop.axis) return Tensor.rand(*self.shape, device=kwargs.pop("device", self.device), dtype=dtype, **kwargs) # ***** rng hlops ***** @@ -875,12 +875,30 @@ class Tensor(MathTrait): return Tensor.normal(*shape, mean=0.0, std=std, **kwargs) @staticmethod - def randperm(n: int, *, device=None, dtype=dtypes.int32, **kwargs) -> Tensor: + def randperm(n:int, device=None, dtype=dtypes.int32, **kwargs) -> Tensor: + """ + Returns a tensor with a random permutation of integers from `0` to `n-1`. + + ```python exec="true" source="above" session="tensor" result="python" + Tensor.manual_seed(42) + print(Tensor.randperm(6).numpy()) + ``` + """ r = Tensor.rand(n, device=device, **kwargs) _, indices = r.sort() return indices.cast(dtype) def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor: + """ + Returns a tensor with `num_samples` indices sampled from a multinomial distribution weighted by `self`. + + NOTE: `replacement=False` for `num_samples > 1` is not supported yet. + ```python exec="true" source="above" session="tensor" result="python" + Tensor.manual_seed(42) + t = Tensor([1, 2, 3, 4]) + print(t.multinomial(20, replacement=True).numpy()) + ``` + """ assert 1 <= self.ndim <= 2 and num_samples > 0, f"{self.ndim=} must be 1 or 2 dim, {num_samples=} must be positive" assert replacement or num_samples == 1, "no replacement only supports num_samples = 1" weight = self.unsqueeze(0) if self.ndim == 1 else self @@ -893,7 +911,7 @@ class Tensor(MathTrait): def gradient(self, *targets:Tensor, gradient:Tensor|None=None, materialize_grads=False) -> list[Tensor]: """ - Compute the gradient of the targets with respect to self. + Computes the gradient of the targets with respect to self. 
```python exec="true" source="above" session="tensor" result="python" x = Tensor.eye(3) @@ -908,18 +926,16 @@ class Tensor(MathTrait): assert gradient is not None or self.shape == tuple(), "when no gradient is provided, backward must be called on a scalar tensor" if not (self.is_floating_point() and all(t.is_floating_point() for t in targets)): raise RuntimeError("only float Tensors have gradient") if gradient is None: gradient = Tensor(1.0, dtype=self.dtype, device=self.device, requires_grad=False) - rets = [] - target_uops = [x.lazydata for x in targets] - grads = compute_gradient(self.lazydata, gradient.lazydata, set(target_uops)) + target_uops = [x.uop for x in targets] + grads = compute_gradient(self.uop, gradient.uop, set(target_uops)) ret = [] for x in target_uops: if (y:=grads.get(x)) is None: if materialize_grads: y = x.const_like(0) - else: raise RuntimeError(f"{x}\n\nnot found in\n\n{self.lazydata}") + else: raise RuntimeError(f"{x}\n\nnot found in\n\n{self.uop}") ret.append(y) - rets.append(ret) # create returned Tensors - return [Tensor(u, device=t.device) for t,u in zip(targets, rets[0])] + return [Tensor(u, device=t.device) for t,u in zip(targets, ret)] def backward(self, gradient:Tensor|None=None) -> Tensor: """ @@ -931,9 +947,9 @@ class Tensor(MathTrait): print(t.grad.numpy()) ``` """ - all_uops = self.lazydata.toposort() + all_uops = self.uop.toposort() tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \ - t.lazydata in all_uops and t.requires_grad and not Tensor.no_grad] + t.uop in all_uops and t.requires_grad] # clear contexts for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient, materialize_grads=True)): assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}" @@ -944,7 +960,7 @@ class Tensor(MathTrait): def view(self, shape:tuple[sint, ...], *args) -> Tensor: """`.view` is an alias for `.reshape`.""" - return 
self.reshape(argfix(shape, *args)) + return self.reshape(shape, *args) def reshape(self, shape, *args) -> Tensor: """ @@ -1127,17 +1143,19 @@ class Tensor(MathTrait): boundary = [index, index+1] if index >= 0 else [index+size, index+size+1] case slice(): if index.step == 0: raise ValueError(f"{index=} cannot have 0 as step") - if all(isinstance(s, int) or s is None for s in (index.start,index.stop,index.step)): + start, stop = 0 if index.start is None else index.start, size if index.stop is None else index.stop + step = 1 if index.step is None else index.step + boundary, stride = [start, stop], step + if all(isinstance(s, int) for s in (start,stop,step)): # handle int slicing *boundary, stride = index.indices(cast(SupportsIndex, size)) if stride * (boundary[1] - boundary[0]) < 0: boundary = [0, 0] elif stride < 0: boundary = [boundary[1] + 1, boundary[0] + 1] # update size for slice size = ceildiv((boundary[1] - boundary[0]), abs(stride)) - elif index.step is None and all(isinstance(s,(int,UOp))for s in (index.start,index.stop)) and resolve((index.stop-index.start) > 0, False): + elif (step == 1) and isinstance(step, int) and all(isinstance(s,(int,UOp)) for s in (start, stop)) and resolve((stop-start) > 0, False): # simple symbolic slice - boundary = [index.start, index.stop] - size = (index.stop - index.start).ssimplify() + size = cast(UOp|int, cast(UOp, (stop - start)).ssimplify()) else: raise TypeError(f"slice {index=} is not supported") case None: pass # do nothing case _: raise IndexError(f"{type(index).__name__} indexing is not supported") @@ -1197,7 +1215,7 @@ class Tensor(MathTrait): def __getitem__(self, indices) -> Tensor: """ - Retrieve a sub-tensor using indexing. + Retrieves a sub-tensor using indexing. 
Supported Index Types: `int | slice | Tensor | None | list | tuple | Ellipsis` @@ -1240,14 +1258,14 @@ class Tensor(MathTrait): self.realize()._getitem(indices).assign(v) return # NOTE: check that setitem target is valid first - if not unwrap(self.lazydata.st).contiguous: raise RuntimeError("setitem target needs to be contiguous") + if not unwrap(self.uop.st).contiguous: raise RuntimeError("setitem target needs to be contiguous") if isinstance(v, get_args(ConstType)): v = Tensor(v, device=self.device, dtype=self.dtype) if not isinstance(v, Tensor): raise TypeError(f"can't set a {type(v).__name__} to a Tensor") if self.requires_grad or v.requires_grad: raise NotImplementedError("setitem with requires_grad is not supported") res = self.realize()._getitem(indices, v) # if shapes match and data is not shared it's a copy and we assign to self - if res.shape == self.shape and res.lazydata is not self.lazydata: + if res.shape == self.shape and res.uop is not self.uop: self.assign(res).realize() else: # no copy, basic setitem v = v.cast(res.dtype)._broadcast_to(_broadcast_shape(res.shape, v.shape)).contiguous() @@ -1309,7 +1327,7 @@ class Tensor(MathTrait): def repeat_interleave(self, repeats:int, dim:int|None=None) -> Tensor: """ - Repeat elements of a tensor. + Repeats elements of a tensor. ```python exec="true" source="above" session="tensor" result="python" t = Tensor([1, 2, 3]) @@ -1616,6 +1634,24 @@ class Tensor(MathTrait): idxs = counts.scatter(0, mask_cumsum, 1, reduce='add').cumsum() return x[idxs] + def masked_fill(self:Tensor, mask:Tensor, value:Tensor|ConstType) -> Tensor: + """ + Replaces `self` with `value` wherever the elements of `mask` are True. 
+ + ```python exec="true" source="above" session="tensor" result="python" + t = Tensor([1, 2, 3, 4, 5]) + mask = Tensor([True, False, True, False, False]) + print(t.masked_fill(mask, -12).numpy()) + ``` + ```python exec="true" source="above" session="tensor" result="python" + t = Tensor([1, 2, 3, 4, 5]) + mask = Tensor([True, False, True, False, False]) + value = Tensor([-1, -2, -3, -4, -5]) + print(t.masked_fill(mask, value).numpy()) + ``` + """ + return mask.where(value, self) + # ***** reduce ops ***** def _reduce(self, op:Ops, axis:int|Sequence[int]|None=None, keepdim=False) -> Tensor: @@ -1900,6 +1936,56 @@ class Tensor(MathTrait): """ return self.std(axis, keepdim, correction), self.mean(axis, keepdim) + def keccak(self, cfg:str|tuple[int, int] = "sha3_256"): + """ + Calculates a Keccak hash over the last dimension. Uses "sha3_256" by default. + + ```python exec="false" source="above" session="tensor" result="python" + t = Tensor(b"Hello World!").keccak() + print(t.data().hex()) + ``` + """ + + # https://keccak.team/keccak_specs_summary.html + + def ctensor(l: Sequence[ConstType], dtype: DType = dtypes.uint64): return Tensor.stack(*(Tensor(v, dtype=dtype, device=self.device) for v in l)) + rot_offsets = [44, 43, 21, 14, 28, 20, 3, 45, 61, 1, 6, 25, 8, 18, 27, 36, 10, 15, 56, 62, 55, 39, 41, 2] + rot_offsets_v0, rot_offsets_v1 = ctensor([0] + [1 << v for v in rot_offsets]), ctensor([1] + [1 << (64 - v) for v in rot_offsets]) + + # calculated from π step + reorder_indexes = [0,6,12,18,24,3,9,10,16,22,1,7,13,19,20,4,5,11,17,23,2,8,14,15,21] + rnd_const_masks = [ctensor([v]).pad((0, 24)) for v in (1, 0x8082, 0x800000000000808a, 0x8000000080008000, 0x808b, 0x80000001, 0x8000000080008081, + 0x8000000000008009, 0x8a, 0x88, 0x80008009, 0x8000000a, 0x8000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, 0x800a, 0x800000008000000a, 0x8000000080008081, 0x8000000000008080, 0x80000001, 0x8000000080008008)] + + 
rate, dsbyte = { "sha3_224": (144, 6), "sha3_256": (136, 6), "shake_128": (168, 31) }[cfg] if isinstance(cfg, str) else cfg + data, data_pad = self.bitcast(dtypes.uint8).reshape(prod(self.shape[:-1]), -1), rate - (self.shape[-1] * self.dtype.itemsize % rate) + # pad batches then pad blocks + data = data.pad((None, (0, data_pad))).reshape(data.shape[0], -1, rate).pad((None, None, (0, 200 - rate))).flatten(1) + + # create pad mask + lbe = data.shape[1] + rate - data_pad - 200 + if data_pad == 1: mb = [(lbe, 0), (1, dsbyte ^ 0x80), (data.shape[-1] - lbe - 1, 0)] + else: mb = [(lbe, 0), (1, dsbyte), (data.shape[-1] + rate - lbe - 202, 0), (1, 0x80), (200 - rate, 0)] + pad_mask = Tensor.cat(*(Tensor(v, dtype=dtypes.uint8, device=data.device).expand(l) for l, v in mb if l > 0)) + + data = (data ^ pad_mask).reshape(data.shape[0], -1, 200).bitcast(dtypes.uint64) + + state = Tensor.zeros((data.shape[0], 25), device=self.device, dtype=dtypes.uint64) + for k in range(int(data.shape[1])): + state = state.bitwise_xor(data[:,k].reshape(-1, 25)) + for i in range(24): # f1600 + # θ step + p = state.reshape((-1, 5, 5)).transpose(2, 1) + t1 = (p[:,:,0] ^ p[:,:,1] ^ p[:,:,2] ^ p[:,:,3] ^ p[:,:,4]).roll(-1, 1) # xor reduce + state = state ^ (t1.roll(2, 1).bitwise_xor((t1 << 1) ^ (t1 >> 63)).unsqueeze(2).expand((-1, -1, 5)).transpose(2, 1).flatten(1)) + # ρ and π steps + state = state[:,reorder_indexes] + state = (state * rot_offsets_v0).bitwise_or(state // rot_offsets_v1).reshape((-1, 5, 5)) + # χ and ι step + state = state.bitwise_xor((state.roll(shifts=-1, dims=2) ^ -1) & state.roll(shifts=-2, dims=2)).flatten(1) ^ rnd_const_masks[i] + return state.bitcast(dtypes.uint8)[:,:(200 - rate) // 2].reshape(*self.shape[:-1], -1) + def _softmax(self, axis, dtype:DTypeLike|None=None) -> tuple[Tensor, Tensor, Tensor]: m = self - self.max(axis=axis, keepdim=True).detach() if dtype is not None: m = m.cast(dtype) @@ -2590,7 +2676,7 @@ class Tensor(MathTrait): def _pre_scatter(self, dim:int, 
index:Tensor, src:Tensor) -> tuple[Tensor, Tensor]: index, dim = index.to(self.device), self._resolve_dim(dim) - assert index.ndim == self.ndim == src.ndim, f"self.ndim, index.ndim and src.dim must all equal, {self.ndim=} {index.ndim=} {src.ndim=}" + assert index.ndim == self.ndim == src.ndim, f"self.ndim, index.ndim and src.ndim must all equal, {self.ndim=} {index.ndim=} {src.ndim=}" assert all((d == dim or self_ >= index_) and src_ >= index_ for d,(self_,index_,src_) in enumerate(zip(self.shape, index.shape, src.shape))), \ f"All dimensions of {index.shape=} should be <= to all dimensions of {src.shape=} and all dimensions except dimension {dim} of {self.shape=}" if self.dtype != src.dtype: raise RuntimeError(f"expect {self.dtype=} to be equal to {src.dtype=}") @@ -2669,14 +2755,13 @@ class Tensor(MathTrait): """ src, mask = self._pre_scatter(dim, index, src) def _inv_mask(a:Tensor|ConstType, b:Tensor|ConstType) -> Tensor: return mask.any(-1).logical_not().where(a, b) - # TODO: should not overwrite dtype here? 
- if reduce == "sum": return mask.where(src, 0).sum(-1, dtype=self.dtype).add(self if include_self else _inv_mask(self, 0)) - if reduce == "prod": return mask.where(src, 1).prod(-1, dtype=self.dtype).mul(self if include_self else _inv_mask(self, 1)) + if reduce == "sum": return mask.where(src, 0).sum(-1).add(self if include_self else _inv_mask(self, 0)) + if reduce == "prod": return mask.where(src, 1).prod(-1).mul(self if include_self else _inv_mask(self, 1)) if reduce == "amax": return mask.where(src, m := dtypes.min(src.dtype)).max(-1).maximum(self if include_self else _inv_mask(self, m)) if reduce == "amin": return mask.where(src, m := dtypes.max(src.dtype)).min(-1).minimum(self if include_self else _inv_mask(self, m)) if reduce == "mean": - count = mask.where(1, 0).sum(-1, dtype=self.dtype).add(1 if include_self else _inv_mask(1, 0)) - return mask.where(src, 0).sum(-1, dtype=self.dtype).add(self if include_self else _inv_mask(self, 0)).div(count) + count = mask.where(1, 0).sum(-1).add(1 if include_self else _inv_mask(1, 0)) + return mask.where(src, 0).sum(-1).add(self if include_self else _inv_mask(self, 0)).div(count) raise RuntimeError(f"{reduce=} must be one of 'sum', 'prod', 'mean', 'amax', 'amin'") def sort(self, dim:int=-1, descending:bool=False) -> tuple[Tensor, Tensor]: @@ -2783,7 +2868,7 @@ class Tensor(MathTrait): def fuse(self) -> Tensor: """ - Make this a single kernel back to Ops.CONTIGUOUS on the inputs. + Makes this a single kernel back to Ops.CONTIGUOUS on the inputs. Useful for single kernel softmax and flash attention. Careful, this can break codegen or make kernels really slow. @@ -2872,7 +2957,7 @@ class Tensor(MathTrait): def hardsigmoid(self, alpha:float=1/6, beta:float=0.5) -> Tensor: """ Applies the Hardsigmoid function element-wise. 
- NOTE: default `alpha` and `beta` values is taken from torch + NOTE: default `alpha` and `beta` values are taken from torch - Described: https://paperswithcode.com/method/hard-sigmoid - See: https://pytorch.org/docs/stable/generated/torch.nn.functional.hardsigmoid.html @@ -3103,7 +3188,7 @@ class Tensor(MathTrait): def reciprocal(self) -> Tensor: """ - Compute `1/x` element-wise. + Computes `1/x` element-wise. ```python exec="true" source="above" session="tensor" result="python" print(Tensor([1., 2., 3., 4.]).reciprocal().numpy()) @@ -3408,26 +3493,6 @@ class Tensor(MathTrait): # broadcast return x._broadcast_to(out_shape:=_broadcast_shape(x.shape, y.shape)), y._broadcast_to(out_shape) - def add(self, x:Tensor|ConstType, reverse=False) -> Tensor: - """ - Adds `self` and `x`. - Equivalent to `self + x`. - Supports broadcasting to a common shape, type promotion, and integer, float, boolean inputs. - - ```python exec="true" source="above" session="tensor" result="python" - Tensor.manual_seed(42) - t = Tensor.randn(4) - print(t.numpy()) - ``` - ```python exec="true" source="above" session="tensor" result="python" - print(t.add(20).numpy()) - ``` - ```python exec="true" source="above" session="tensor" result="python" - print(t.add(Tensor([[2.0], [3.5]])).numpy()) - ``` - """ - return self._apply_broadcasted_uop(UOp.add, x, reverse) - def sub(self, x:Tensor|ConstType, reverse=False) -> Tensor: """ Subtracts `x` from `self`. @@ -3449,39 +3514,6 @@ class Tensor(MathTrait): a, b = self._broadcasted(x, reverse) return a + (-b) - def mul(self, x:Tensor|ConstType, reverse=False) -> Tensor: - """ - Multiplies `self` and `x`. - Equivalent to `self * x`. - Supports broadcasting to a common shape, type promotion, and integer, float, boolean inputs. 
- - ```python exec="true" source="above" session="tensor" result="python" - Tensor.manual_seed(42) - t = Tensor.randn(4) - print(t.numpy()) - ``` - ```python exec="true" source="above" session="tensor" result="python" - print(t.mul(3).numpy()) - ``` - ```python exec="true" source="above" session="tensor" result="python" - print(t.mul(Tensor([[-1.0], [2.0]])).numpy()) - ``` - """ - return self._apply_broadcasted_uop(UOp.mul, x, reverse) - - def idiv(self, x:Tensor|ConstType, reverse=False) -> Tensor: - """ - Divides `self` by `x`. - Equivalent to `self // x`. - Supports broadcasting to a common shape, type promotion, and integer inputs. - `idiv` performs integer division (truncate towards zero). - - ```python exec="true" source="above" session="tensor" result="python" - print(Tensor([-4, 7, 5, 4, -7, 8]).idiv(Tensor([2, -3, 8, -2, 3, 5])).numpy()) - ``` - """ - return self._apply_broadcasted_uop(UOp.idiv, x, reverse) - def div(self, x:Tensor|ConstType, reverse=False, rounding_mode:Literal["trunc", "floor"]|None=None) -> Tensor: """ Divides `self` by `x`. @@ -3547,7 +3579,7 @@ class Tensor(MathTrait): def bitwise_and(self, x:Tensor|ConstType, reverse=False) -> Tensor: """ - Compute the bitwise AND of `self` and `x`. + Computes the bitwise AND of `self` and `x`. Equivalent to `self & x`. Supports broadcasting to a common shape, type promotion, and integer, boolean inputs. ```python exec="true" source="above" session="tensor" result="python" @@ -3562,7 +3594,7 @@ class Tensor(MathTrait): def bitwise_or(self, x:Tensor|ConstType, reverse=False) -> Tensor: """ - Compute the bitwise OR of `self` and `x`. + Computes the bitwise OR of `self` and `x`. Equivalent to `self | x`. Supports broadcasting to a common shape, type promotion, and integer, boolean inputs. ```python exec="true" source="above" session="tensor" result="python" @@ -3577,7 +3609,7 @@ class Tensor(MathTrait): def bitwise_not(self) -> Tensor: """ - Compute the bitwise NOT of `self`. 
+ Computes the bitwise NOT of `self`. Equivalent to `~self`. ```python exec="true" source="above" session="tensor" result="python" print(Tensor([0, 2, 5, 255], dtype="int8").bitwise_not().numpy()) @@ -3632,9 +3664,9 @@ class Tensor(MathTrait): # TODO: int pow if not base.is_floating_point(): raise RuntimeError("base needs to be float") - # NOTE: pow(int, float) -> int ret = base._apply_uop(UOp.pow, exponent) - return ret.round().cast(self.dtype) if not dtypes.is_float(self.dtype) else ret + # NOTE: pow(int, float) -> int + return ret.round().cast(self.dtype) if not reverse and not dtypes.is_float(self.dtype) else ret def maximum(self, x:Tensor|ConstType) -> Tensor: """ @@ -3665,7 +3697,7 @@ class Tensor(MathTrait): def where(self:Tensor, x:Tensor|ConstType|sint, y:Tensor|ConstType|sint) -> Tensor: """ - Return a tensor of elements selected from either `x` or `y`, depending on `self`. + Returns a tensor of elements selected from either `x` or `y`, depending on `self`. `output_i = x_i if self_i else y_i`. ```python exec="true" source="above" session="tensor" result="python" @@ -3687,11 +3719,9 @@ class Tensor(MathTrait): cond, y = cond._broadcasted(y, match_dtype=False) return cond.cast(dtypes.bool)._apply_uop(UOp.where, *x._broadcasted(y)) - def masked_fill(self:Tensor, mask:Tensor, value:Tensor|ConstType) -> Tensor: return mask.where(value, self) - def copysign(self, other) -> Tensor: """ - Return a tensor of with the magnitude of `self` and the sign of `other`, elementwise. + Returns a tensor of with the magnitude of `self` and the sign of `other`, elementwise. """ # NOTE: torch always return in float, we return based on the broadcasting rule. other = self._broadcasted(other)[1] @@ -3930,7 +3960,7 @@ class Tensor(MathTrait): def cross_entropy(self, Y:Tensor, reduction:ReductionStr="mean", label_smoothing:float=0.0) -> Tensor: """ - Compute the cross entropy loss between input logits and target. + Computes the cross entropy loss between input logits and target. 
NOTE: `self` are logits and `Y` are the target labels or class probabilities. @@ -3955,7 +3985,7 @@ class Tensor(MathTrait): def nll_loss(self, Y:Tensor, weight:Tensor|None=None, ignore_index:int|None=None, reduction:ReductionStr="mean") -> Tensor: """ - Compute the negative log likelihood loss between log-probabilities and target labels. + Computes the negative log likelihood loss between log-probabilities and target labels. NOTE: `self` is log-probabilities and `Y` is the Y labels or class probabilities. @@ -4038,7 +4068,7 @@ class Tensor(MathTrait): def size(self, dim:int|None=None) -> sint|tuple[sint, ...]: """ - Return the size of the tensor. If `dim` is specified, return the length along dimension `dim`. Otherwise return the shape of the tensor. + Returns the size of the tensor. If `dim` is specified, return the length along dimension `dim`. Otherwise return the shape of the tensor. ```python exec="true" source="above" session="tensor" result="python" t = Tensor([[4, 5, 6], [7, 8, 9]]) @@ -4100,7 +4130,10 @@ class Tensor(MathTrait): if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os: new_uint, old_uint = to_dtype(f"uint{8*ns}"), to_dtype(f"uint{8*os}") tmp = self.bitcast(old_uint) - if ns > os: return functools.reduce(Tensor.add, (tmp[..., i::ns//os].cast(new_uint) << 8*i*os for i in range(ns//os))).bitcast(dtype) + if ns > os: + tmp = tmp.reshape(self.shape[:-1] + (self.shape[-1]//(rate := ns//os), rate)) + nones = (None,) * (tmp.ndim - 1) + return functools.reduce(Tensor.add, (tmp.shrink(nones + ((i, i+1),)).cast(new_uint)<<8*i*os for i in range(rate))).squeeze(-1).bitcast(dtype) return Tensor.stack(*(tmp>>8*i*ns for i in range(os//ns)), dim=-1).flatten(-2).cast(new_uint).bitcast(dtype) return self._apply_uop(UOp.bitcast, dtype=dt) if self.dtype != dt else self diff --git a/tinygrad_repo/tinygrad/uop/__init__.py b/tinygrad_repo/tinygrad/uop/__init__.py index efae111..277bb40 100644 --- 
a/tinygrad_repo/tinygrad/uop/__init__.py +++ b/tinygrad_repo/tinygrad/uop/__init__.py @@ -1 +1,99 @@ -from tinygrad.uop.ops import UOp, Ops # noqa: F401 \ No newline at end of file +from enum import auto, IntEnum, Enum + +# wrapper around IntEnum that preserves Enum.__str__ and makes auto() unique across all FastEnum subclasses +class FastEnum(IntEnum): + def __str__(self): return Enum.__str__(self) + @staticmethod + def _generate_next_value_(_, __, ___, last_values): return 1 + max([0, *last_values, *[max(c) for c in FastEnum.__subclasses__()]]) + +# the order of these Ops controls the order of the toposort +class Ops(FastEnum): + # uops that aren't rendered + SINK = auto(); CONTIGUOUS = auto(); CONTIGUOUS_BACKWARD = auto(); DETACH = auto(); KERNEL = auto(); UNIQUE = auto() # noqa: E702 + + # MetaOps + COPY = auto(); BUFFER_VIEW = auto(); MSELECT = auto(); MSTACK = auto() # noqa: E702 + + # blocks in linearizer + BLOCK = auto(); BLOCKSTART = auto(); BLOCKEND = auto(); BLOCKFINAL = auto() # noqa: E702 + + # movement ops! 
+ RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); FLIP = auto() # noqa: E702 + + # misc ops + UNROLL = auto(); CONTRACT = auto() # noqa: E702 + VIEW = auto(); DEFINE_GLOBAL = auto(); BUFFER = auto() # noqa: E702 + DEFINE_VAR = auto(); DEFINE_LOCAL = auto(); DEFINE_ACC = auto() # noqa: E702 + VALID = auto(); SPECIAL = auto(); NOOP = auto() # noqa: E702 + + # reduce + REDUCE_AXIS = auto(); REDUCE = auto(); ALLREDUCE = auto() # noqa: E702 + + # helper ops + GEP = auto(); VECTORIZE = auto(); CAT = auto(); PTRCAT = auto() # noqa: E702 + + # UnaryOps + CAST = auto(); BITCAST = auto(); EXP2 = auto(); LOG2 = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702 + + # load/store before math + LOAD = auto(); STORE = auto() # noqa: E702 + + # early INDEX + INDEX = auto() + + # math ops + WMMA = auto() + + # BinaryOps + ADD = auto(); MUL = auto(); SHL = auto(); SHR = auto(); IDIV = auto(); MAX = auto(); MOD = auto(); CMPLT = auto(); CMPNE = auto() # noqa: E702 + XOR = auto(); OR = auto(); AND = auto(); THREEFRY = auto(); SUB = auto(); FDIV = auto(); POW = auto() # noqa: E702 + + # TernaryOps + WHERE = auto(); MULACC = auto() # noqa: E702 + + # assignment ops + ASSIGN = auto() + BIND = auto() + + # control flow ops + BARRIER = auto(); RANGE = auto(); IF = auto(); ENDRANGE = auto(); ENDIF = auto(); GBARRIER = auto() # noqa: E702 + + # consts last! 
+ VCONST = auto(); CONST = auto() # noqa: E702 + + # device + DEVICE = auto() + MULTI = auto() + + # CUSTOMI is inline + CUSTOM = auto(); CUSTOMI = auto() # noqa: E702 + FUSE = auto() + +class GroupOp: + Unary = {Ops.EXP2, Ops.LOG2, Ops.SIN, Ops.SQRT, Ops.RECIP, Ops.NEG} + Binary = {Ops.ADD, Ops.MUL, Ops.IDIV, Ops.MAX, Ops.MOD, Ops.CMPLT, Ops.CMPNE, Ops.XOR, Ops.SHL, Ops.SHR, Ops.OR, Ops.AND, Ops.THREEFRY, + Ops.SUB, Ops.FDIV, Ops.POW} + Ternary = {Ops.WHERE, Ops.MULACC} + ALU = set.union(Unary, Binary, Ternary) + + Irreducible = {Ops.CONST, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.RANGE} + Movement = {Ops.RESHAPE, Ops.EXPAND, Ops.PERMUTE, Ops.PAD, Ops.SHRINK, Ops.FLIP} + + Buffer = {Ops.LOAD, Ops.STORE, Ops.VALID, Ops.CONST, Ops.DEFINE_VAR} + Block = {Ops.BLOCK, Ops.BLOCKEND, Ops.BLOCKSTART} + + # BinaryOps that can be flipped + Commutative = {Ops.ADD, Ops.MUL, Ops.MAX, Ops.CMPNE, Ops.XOR, Ops.AND, Ops.OR} + + # BinaryOps where f(f(a,b),c) = f(a,f(b,c)) + Associative = {Ops.ADD, Ops.MUL, Ops.AND, Ops.OR, Ops.MAX} + + # BinaryOps that satisfy f(x,x)=x see https://en.wikipedia.org/wiki/Idempotence + Idempotent = {Ops.OR, Ops.AND, Ops.MAX} + + # do not preserve f(0) = 0 + UnsafePad = {Ops.RECIP, Ops.LOG2, Ops.EXP2, Ops.IDIV, Ops.POW} + + Meta = {Ops.COPY, Ops.BUFFER_VIEW} + + All = set(Ops) diff --git a/tinygrad_repo/tinygrad/uop/mathtraits.py b/tinygrad_repo/tinygrad/uop/mathtraits.py new file mode 100644 index 0000000..1433dde --- /dev/null +++ b/tinygrad_repo/tinygrad/uop/mathtraits.py @@ -0,0 +1,124 @@ +from tinygrad.uop import Ops +from tinygrad.helpers import T +from tinygrad.dtype import dtypes + +class MathTrait: + # required to implement + def alu(self:T, arg:Ops, *src) -> T: raise NotImplementedError + def const_like(self:T, b) -> T: raise NotImplementedError + + # great functions you get! 
+ def ufix(self, x): return self.const_like(x) if not isinstance(x, MathTrait) else x + def _binop(self, op, x, reverse): return self.ufix(x).alu(op, self) if reverse else self.alu(op, self.ufix(x)) + def logical_not(self): return self.ne(True) + def neg(self): + if (dtype:=getattr(self, 'dtype')) is None: raise TypeError(f"MathTraits __neg__ requires a dtype, {self=}") + return self.logical_not() if dtype.scalar() == dtypes.bool else self*(-1) + def add(self, x, reverse=False): + """ + Adds `self` and `x`. + Equivalent to `self + x`. + Supports broadcasting to a common shape, type promotion, and integer, float, boolean inputs. + ```python exec="true" source="above" session="tensor" result="python" + Tensor.manual_seed(42) + t = Tensor.randn(4) + print(t.numpy()) + ``` + ```python exec="true" source="above" session="tensor" result="python" + print(t.add(20).numpy()) + ``` + ```python exec="true" source="above" session="tensor" result="python" + print(t.add(Tensor([[2.0], [3.5]])).numpy()) + ``` + """ + return self._binop(Ops.ADD, x, reverse) + def mul(self, x, reverse=False): + """ + Multiplies `self` and `x`. + Equivalent to `self * x`. + Supports broadcasting to a common shape, type promotion, and integer, float, boolean inputs. + + ```python exec="true" source="above" session="tensor" result="python" + Tensor.manual_seed(42) + t = Tensor.randn(4) + print(t.numpy()) + ``` + ```python exec="true" source="above" session="tensor" result="python" + print(t.mul(3).numpy()) + ``` + ```python exec="true" source="above" session="tensor" result="python" + print(t.mul(Tensor([[-1.0], [2.0]])).numpy()) + ``` + """ + return self._binop(Ops.MUL, x, reverse) + def bitwise_and(self, x, reverse=False): return self._binop(Ops.AND, x, reverse) + def bitwise_or(self, x, reverse=False): return self._binop(Ops.OR, x, reverse) + def bitwise_xor(self, x, reverse=False): return self._binop(Ops.XOR, x, reverse) + def idiv(self, x, reverse=False): + """ + Divides `self` by `x`. 
+ Equivalent to `self // x`. + Supports broadcasting to a common shape, type promotion, and integer inputs. + `idiv` performs integer division (truncate towards zero). + + ```python exec="true" source="above" session="tensor" result="python" + print(Tensor([-4, 7, 5, 4, -7, 8]).idiv(Tensor([2, -3, 8, -2, 3, 5])).numpy()) + ``` + """ + return self._binop(Ops.IDIV, x, reverse) + def mod(self, x, reverse=False): return self._binop(Ops.MOD, x, reverse) + def sub(self, x, reverse=False): return self.ufix(x).alu(Ops.ADD, -self) if reverse else self.alu(Ops.ADD, self.ufix(-x)) + def div(self, x, reverse=False): return (self.ufix(x)*self.alu(Ops.RECIP)) if reverse else (self*self.ufix(x).alu(Ops.RECIP)) + + def __neg__(self): return self.neg() + + def __add__(self, x): return self.add(x) + def __sub__(self, x): return self.sub(x) + def __mul__(self, x): return self.mul(x) + def __truediv__(self, x): return self.div(x) + def __floordiv__(self, x): return self.idiv(x) # TODO: idiv is trunc div, not floordiv + def __mod__(self, x): return self.mod(x) + def __and__(self, x): return self.bitwise_and(x) + def __or__(self, x): return self.bitwise_or(x) + def __xor__(self, x): return self.bitwise_xor(x) + + def __radd__(self, x): return self.add(x, True) + def __rsub__(self, x): return self.sub(x, True) + def __rmul__(self, x): return self.mul(x, True) + def __rtruediv__(self, x): return self.div(x, True) + def __rfloordiv__(self, x): return self.idiv(x, True) + def __rand__(self, x): return self.bitwise_and(x, True) + def __ror__(self, x): return self.bitwise_or(x, True) + def __rxor__(self, x): return self.bitwise_xor(x, True) + def __rmod__(self, x): return self.mod(x, True) + + def __lt__(self, x): return self.alu(Ops.CMPLT, self.ufix(x)) + def __gt__(self, x): return self.ufix(x).alu(Ops.CMPLT, self) + def __ge__(self, x): return (self < x).logical_not() + def __le__(self, x): return (self > x).logical_not() + + def ne(self, x): return self.alu(Ops.CMPNE, self.ufix(x)) + def 
eq(self, x): return self.ne(x).logical_not() + def __ne__(self, x): return self.ne(x) + # NOTE: __eq__ isn't overridden, and means the same thing as is by default + + def lshift(self, x, reverse=False): return self._binop(Ops.SHL, x, reverse) + def rshift(self, x, reverse=False): return self._binop(Ops.SHR, x, reverse) + def __lshift__(self, x): return self.lshift(x) + def __rshift__(self, x): return self.rshift(x) + def __rlshift__(self, x): return self.lshift(x, True) + def __rrshift__(self, x): return self.rshift(x, True) + + def maximum(self, x): return self.alu(Ops.MAX, self.ufix(x)) + def minimum(self, x): return -(-self).maximum(-x) + def where(self, x, y): + if type(self) is type(x): return self.alu(Ops.WHERE, x, x.ufix(y)) + if type(self) is type(y): return self.alu(Ops.WHERE, y.ufix(x), y) + raise RuntimeError("where needs at least one UOp arg") + def threefry(self, seed): return self.alu(Ops.THREEFRY, seed) + def reciprocal(self): return self.alu(Ops.RECIP) + def sqrt(self): return self.alu(Ops.SQRT) + def sin(self): return self.alu(Ops.SIN) + def log2(self): return self.alu(Ops.LOG2) + def exp2(self): return self.alu(Ops.EXP2) + def pow(self, x): return self.alu(Ops.POW, self.ufix(x)) diff --git a/tinygrad_repo/tinygrad/uop/ops.py b/tinygrad_repo/tinygrad/uop/ops.py index 1812bca..5975539 100644 --- a/tinygrad_repo/tinygrad/uop/ops.py +++ b/tinygrad_repo/tinygrad/uop/ops.py @@ -1,188 +1,16 @@ from __future__ import annotations -from typing import Any, Optional, Union, Callable, cast, TYPE_CHECKING, Type, get_args, Sequence +from typing import Any, Optional, Union, Callable, cast, TYPE_CHECKING, Type, Sequence import sys, time, functools, itertools, math, operator, hashlib, os, types, pickle, pathlib, inspect, weakref -from enum import auto, IntEnum, Enum from dataclasses import dataclass, field +from tinygrad.uop import Ops, GroupOp +from tinygrad.uop.mathtraits import MathTrait from tinygrad.dtype import ConstType, ImageDType, dtypes, DType, truncate 
from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Context, partition, temp, unwrap, T, argfix, Metadata, flatten -from tinygrad.helpers import PICKLE_BUFFERS, dedup, cdiv, cmod +from tinygrad.helpers import PICKLE_BUFFERS, dedup, cdiv, cmod, diskcache_put if TYPE_CHECKING: from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.device import Buffer, MultiBuffer -# wrapper around IntEnum that preserves Enum.__str__ and makes auto() unique across all FastEnum subclasses -class FastEnum(IntEnum): - def __str__(self): return Enum.__str__(self) - @staticmethod - def _generate_next_value_(_, __, ___, last_values): return 1 + max([0, *last_values, *[max(c) for c in FastEnum.__subclasses__()]]) - -class MathTrait: - # required to implement - def alu(self:T, arg:Ops, *src) -> T: raise NotImplementedError - def const_like(self:T, b:ConstLike) -> T: raise NotImplementedError - - # great functions you get! - def ufix(self, x): return self.const_like(x) if not isinstance(x, MathTrait) else x - def _binop(self, op, x, reverse): return self.ufix(x).alu(op, self) if reverse else self.alu(op, self.ufix(x)) - def logical_not(self): return self.ne(True) - def neg(self): - if (dtype:=getattr(self, 'dtype')) is None: raise TypeError(f"MathTraits __neg__ requires a dtype, {self=}") - return self.logical_not() if dtype.scalar() == dtypes.bool else self*(-1) - def add(self, x, reverse=False): return self._binop(Ops.ADD, x, reverse) - def mul(self, x, reverse=False): return self._binop(Ops.MUL, x, reverse) - def bitwise_and(self, x, reverse=False): return self._binop(Ops.AND, x, reverse) - def bitwise_or(self, x, reverse=False): return self._binop(Ops.OR, x, reverse) - def bitwise_xor(self, x, reverse=False): return self._binop(Ops.XOR, x, reverse) - def idiv(self, x, reverse=False): return self._binop(Ops.IDIV, x, reverse) - def mod(self, x, reverse=False): return self._binop(Ops.MOD, x, reverse) - def sub(self, x, reverse=False): return 
self.ufix(x).alu(Ops.ADD, -self) if reverse else self.alu(Ops.ADD, self.ufix(-x)) - def div(self, x, reverse=False): return (self.ufix(x)*self.alu(Ops.RECIP)) if reverse else (self*self.ufix(x).alu(Ops.RECIP)) - - def __neg__(self): return self.neg() - - def __add__(self, x): return self.add(x) - def __sub__(self, x): return self.sub(x) - def __mul__(self, x): return self.mul(x) - def __truediv__(self, x): return self.div(x) - def __floordiv__(self, x): return self.idiv(x) # TODO: idiv is trunc div, not floordiv - def __mod__(self, x): return self.mod(x) - def __and__(self, x): return self.bitwise_and(x) - def __or__(self, x): return self.bitwise_or(x) - def __xor__(self, x): return self.bitwise_xor(x) - - def __radd__(self, x): return self.add(x, True) - def __rsub__(self, x): return self.sub(x, True) - def __rmul__(self, x): return self.mul(x, True) - def __rtruediv__(self, x): return self.div(x, True) - def __rfloordiv__(self, x): return self.idiv(x, True) - def __rand__(self, x): return self.bitwise_and(x, True) - def __ror__(self, x): return self.bitwise_or(x, True) - def __rxor__(self, x): return self.bitwise_xor(x, True) - def __rmod__(self, x): return self.mod(x, True) - - def __lt__(self, x): return self.alu(Ops.CMPLT, self.ufix(x)) - def __gt__(self, x): return self.ufix(x).alu(Ops.CMPLT, self) - def __ge__(self, x): return (self < x).logical_not() - def __le__(self, x): return (self > x).logical_not() - - def ne(self, x): return self.alu(Ops.CMPNE, self.ufix(x)) - def eq(self, x): return self.ne(x).logical_not() - def __ne__(self, x): return self.ne(x) - # NOTE: __eq__ isn't overridden, and means the same thing as is by default - - def lshift(self, x, reverse=False): return self._binop(Ops.SHL, x, reverse) - def rshift(self, x, reverse=False): return self._binop(Ops.SHR, x, reverse) - def __lshift__(self, x): return self.lshift(x) - def __rshift__(self, x): return self.rshift(x) - def __rlshift__(self, x): return self.lshift(x, True) - def 
__rrshift__(self, x): return self.rshift(x, True) - - def maximum(self, x): return self.alu(Ops.MAX, self.ufix(x)) - def minimum(self, x): return -(-self).maximum(-x) - def where(self, x, y): - if type(self) is type(x): return self.alu(Ops.WHERE, x, x.ufix(y)) - if type(self) is type(y): return self.alu(Ops.WHERE, y.ufix(x), y) - raise RuntimeError("where needs at least one UOp arg") - def threefry(self, seed): return self.alu(Ops.THREEFRY, seed) - def reciprocal(self): return self.alu(Ops.RECIP) - def sqrt(self): return self.alu(Ops.SQRT) - def sin(self): return self.alu(Ops.SIN) - def log2(self): return self.alu(Ops.LOG2) - def exp2(self): return self.alu(Ops.EXP2) - def pow(self, x): return self.alu(Ops.POW, self.ufix(x)) - -# the order of these Ops controls the order of the toposort -class Ops(FastEnum): - # uops that aren't rendered - SINK = auto(); CONTIGUOUS = auto(); CONTIGUOUS_BACKWARD = auto(); DETACH = auto(); KERNEL = auto(); UNIQUE = auto() # noqa: E702 - - # MetaOps - COPY = auto(); BUFFER_VIEW = auto(); MSELECT = auto() # noqa: E702 - - # blocks in linearizer - BLOCK = auto(); BLOCKSTART = auto(); BLOCKEND = auto(); BLOCKFINAL = auto() # noqa: E702 - - # movement ops! 
- RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); FLIP = auto() # noqa: E702 - - # misc ops - UNROLL = auto(); CONTRACT = auto() # noqa: E702 - VIEW = auto(); DEFINE_GLOBAL = auto(); BUFFER = auto() # noqa: E702 - DEFINE_VAR = auto(); DEFINE_LOCAL = auto(); DEFINE_ACC = auto() # noqa: E702 - VALID = auto(); SPECIAL = auto(); NOOP = auto() # noqa: E702 - - # reduce - REDUCE_AXIS = auto(); REDUCE = auto(); ALLREDUCE = auto() # noqa: E702 - - # helper ops - GEP = auto(); VECTORIZE = auto(); CAT = auto(); PTRCAT = auto() # noqa: E702 - - # UnaryOps - CAST = auto(); BITCAST = auto(); EXP2 = auto(); LOG2 = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702 - - # load/store before math - LOAD = auto(); STORE = auto() # noqa: E702 - - # early INDEX - INDEX = auto() - - # math ops - WMMA = auto() - - # BinaryOps - ADD = auto(); MUL = auto(); SHL = auto(); SHR = auto(); IDIV = auto(); MAX = auto(); MOD = auto(); CMPLT = auto(); CMPNE = auto() # noqa: E702 - XOR = auto(); OR = auto(); AND = auto(); THREEFRY = auto(); SUB = auto(); FDIV = auto(); POW = auto() # noqa: E702 - - # TernaryOps - WHERE = auto(); MULACC = auto() # noqa: E702 - - # assignment ops - ASSIGN = auto() - BIND = auto() - - # control flow ops - BARRIER = auto(); RANGE = auto(); IF = auto(); ENDRANGE = auto(); ENDIF = auto(); GBARRIER = auto() # noqa: E702 - - # consts last! 
- VCONST = auto(); CONST = auto() # noqa: E702 - - # device - DEVICE = auto() - MULTI = auto() - - # CUSTOMI is inline - CUSTOM = auto(); CUSTOMI = auto() # noqa: E702 - IGNORE = auto(); FUSE = auto() # noqa: E702 - -class GroupOp: - Unary = {Ops.EXP2, Ops.LOG2, Ops.SIN, Ops.SQRT, Ops.RECIP, Ops.NEG} - Binary = {Ops.ADD, Ops.MUL, Ops.IDIV, Ops.MAX, Ops.MOD, Ops.CMPLT, Ops.CMPNE, Ops.XOR, Ops.SHL, Ops.SHR, Ops.OR, Ops.AND, Ops.THREEFRY, - Ops.SUB, Ops.FDIV, Ops.POW} - Ternary = {Ops.WHERE, Ops.MULACC} - ALU = set.union(Unary, Binary, Ternary) - - Irreducible = {Ops.CONST, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.RANGE} - Movement = {Ops.RESHAPE, Ops.EXPAND, Ops.PERMUTE, Ops.PAD, Ops.SHRINK, Ops.FLIP} - - Buffer = {Ops.LOAD, Ops.STORE, Ops.VALID, Ops.CONST, Ops.DEFINE_VAR} - Block = {Ops.BLOCK, Ops.BLOCKEND, Ops.BLOCKSTART} - - # BinaryOps that can be flipped - Commutative = {Ops.ADD, Ops.MUL, Ops.MAX, Ops.CMPNE, Ops.XOR, Ops.AND, Ops.OR} - - # BinaryOps where f(f(a,b),c) = f(a,f(b,c)) - Associative = {Ops.ADD, Ops.MUL, Ops.AND, Ops.OR, Ops.MAX} - - # BinaryOps that satisfy f(x,x)=x see https://en.wikipedia.org/wiki/Idempotence - Idempotent = {Ops.OR, Ops.AND, Ops.MAX} - - # do not preserve f(0) = 0 - UnsafePad = {Ops.RECIP, Ops.LOG2, Ops.EXP2, Ops.IDIV, Ops.POW} - - Meta = {Ops.COPY, Ops.BUFFER_VIEW} - - All = set(Ops) - # https://en.wikipedia.org/wiki/Identity_element def identity_element(op:Ops, dt:DType) -> ConstType: return dtypes.as_const({Ops.ADD:0, Ops.MUL:1, Ops.MAX:dtypes.min(dt)}[op], dt) @@ -263,8 +91,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @functools.cached_property def key(self) -> bytes: return hashlib.sha256(str((self.op, self.dtype, self.arg)).encode() + b"".join([s.key for s in self.src])).digest() - def __repr__(self): return pretty_print(self, lambda x: f"{type(self).__name__}({x.op}, {x.dtype}, arg={x.argstr()}, src=(%s))") + def __repr__(self): return pretty_print(self, lambda x: f"{type(self).__name__}({x.op}, {x.dtype}, 
arg={x.argstr()}{x.tagstr()}, src=(%s))") def argstr(self): return f'({", ".join(map(str, self.arg))})' if self.op is Ops.REDUCE_AXIS else repr(self.arg) + def tagstr(self): return f", tag={self.tag}" if self.tag is not None else "" @functools.cached_property def parents(self:UOp) -> dict[UOp, None]: @@ -342,7 +171,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def simplify(self): # late import! - from tinygrad.codegen.symbolic import symbolic + from tinygrad.uop.symbolic import symbolic with Context(TRACK_MATCH_STATS=0): return graph_rewrite(self, symbolic) def ssimplify(self) -> Union[UOp, ConstType]: return ret.arg if (ret:=self.simplify()).op is Ops.CONST else ret @@ -378,8 +207,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): def index(self, idx:UOp, valid:UOp|None=None): return UOp(Ops.INDEX, self.dtype, (self,idx,valid) if valid is not None else (self,idx)) def const_like(self, b:ConstLike): # constants can optionally have a DEVICE source - if self._device is None: return UOp.const(self.dtype, b) - return UOp.metaop(Ops.CONST, self.shape, self.dtype, self.device, b) + return UOp.const(self.dtype, b, device=self._device, shape=self.shape if self.st is not None else None) def broadcast(self, count:int): assert self.dtype.count == 1 if count == 1: return self @@ -407,16 +235,17 @@ class UOp(MathTrait, metaclass=UOpMetaClass): if arg in {Ops.CMPLT, Ops.CMPNE}: out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool return UOp(arg, out_dtype, (self,)+src) @staticmethod - def const(dtype:DType, b:ConstLike): + def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None): if isinstance(b, UOp): return b.unbind()[0] if b.op is Ops.BIND else b if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same - return UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype)) - def valid(self, st:ShapeTracker): - 
assert self.op in {Ops.CONST, Ops.DEFINE_VAR} and any(v.mask is not None for v in st.views), f"attempting to create VALID with {self.op} {st}" - from tinygrad.shape.shapetracker import ShapeTracker - # NOTE: only VALID has a masked ShapeTracker, the CONST operands are unmasked - unmasked_st = ShapeTracker.from_shape(()).reshape((1,)*len(st.shape)).expand(st.shape).to_uop() - return UOp(Ops.VALID, dtypes.bool, (st.to_uop(),)).where(self.replace(src=(unmasked_st,)), UOp.const(self.dtype, 0).replace(src=(unmasked_st,))) + ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype)) + if shape is not None: + from tinygrad.shape.shapetracker import ShapeTracker + ret = ret.replace(src=(ShapeTracker.from_shape(()).reshape((1,)*len(shape)).expand(shape).to_uop(),)) + if device is not None: + ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device).view(unwrap(ret.st)),)) + return ret + def valid(self): return UOp.where(UOp(Ops.VALID, dtypes.bool, (UOp(Ops.VIEW, arg=self.st),)), self.const_like(self.base.arg), 0) @staticmethod def range(dtype:DType, end:sint, idx:int): return UOp(Ops.RANGE, dtype=dtype, src=(sint_to_uop(end),), arg=idx) def r(self, op:Ops, axis:tuple[int, ...]): @@ -476,18 +305,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass): # *** from LazyBuffer *** - @staticmethod - def metaop(op:Ops, shape:tuple[sint, ...], dtype:DType, device:str|tuple[str, ...], arg=None) -> UOp: - from tinygrad.shape.shapetracker import ShapeTracker - # Tensor const is CONST(VIEW(DEVICE)) -> RESHAPE -> EXPAND - if op is Ops.CONST: - assert isinstance(arg, get_args(ConstType)), f"trying to create CONST with {arg=}" - return UOp.const(dtype, unwrap(arg)).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg=device),), - ShapeTracker.from_shape(())),)).reshape((1,)*len(shape)).expand(shape) - # Tensor variable binding is BIND(VAR(VIEW(DEVICE)), CONST(VIEW(DEVICE))) - assert op is Ops.BIND, f"unknown op {op}" - var, val = arg.unbind() - return 
var.replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg=device),), ShapeTracker.from_shape(shape)),)).bind(val) def copy_to_device(self, device:str|tuple[str, ...]|UOp, arg=None): assert arg is None or isinstance(self.device, tuple) inp = self if arg is None else UOp(Ops.MSELECT, self.dtype, src=(self,), arg=arg) @@ -501,8 +318,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @property def base(self) -> UOp: if (self.op is Ops.VIEW and len(self.src) != 0) or self.op in GroupOp.Movement: return self.src[0].base + if self.op is Ops.MULTI: return self.src[0].base # MULTI is really a VIEW return self - def view(self, new_st:ShapeTracker) -> UOp: return UOp(Ops.VIEW, self.dtype, (self.base,), new_st) + def view(self, new_st:ShapeTracker) -> UOp: return UOp(Ops.VIEW, self.dtype, (self,), new_st) def _mop(self, op:Ops, arg): ret = UOp(op, self.dtype, (self,), arg) @@ -535,12 +353,14 @@ class UOp(MathTrait, metaclass=UOpMetaClass): if self.op is Ops.MSELECT: assert isinstance(self.src[0].device, tuple), "mselect must be on tuple device" return self.src[0].device[self.arg] + if self.op is Ops.MSTACK: return tuple(cast(str, x.device) for x in self.src) if self.op in {Ops.COPY, Ops.BUFFER, Ops.ALLREDUCE}: return self.src[1].device return dsrcs[0]._device if len(dsrcs:=[x for x in self.src if x._device is not None]) != 0 else None @property def buf_uop(self) -> UOp: if self.op is Ops.BUFFER: return self if self.op is Ops.MSELECT: return self.src[0].buf_uop.mselect(self.arg) + if self.op is Ops.MSTACK: return UOp(Ops.MSTACK, self.dtype, src=tuple(x.buf_uop for x in self.src)) assert self.op is Ops.ASSIGN, f"must be ASSIGN {self.op}" return self.src[0].base @property @@ -553,6 +373,11 @@ class UOp(MathTrait, metaclass=UOpMetaClass): ret = self.src[0].buffer assert isinstance(ret, MultiBuffer) return ret.bufs[self.arg] + if self.op is Ops.MSTACK: + ret = MultiBuffer.__new__(MultiBuffer) + ret.bufs = [cast(Buffer, x.buffer) for x in self.src] + assert all_same([x.size 
for x in ret.bufs]) and all_same([x.dtype for x in ret.bufs]), "multibuffers mismatch buffers" + return ret assert self.op is Ops.BUFFER, f"must be BUFFER {self.op}" if (cret:=buffers.get(self)) is not None: return cret rdtype = self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base @@ -561,7 +386,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass): buffers[self] = ret return ret @property - def realized(self) -> Optional[Buffer|MultiBuffer]: return self.buffer if self.op is Ops.BUFFER and self.buffer.is_allocated() else None + def realized(self) -> Optional[Buffer|MultiBuffer]: + # NOTE: this is used by the JIT to determine which inputs we capture + return self.buffer if self.op in {Ops.BUFFER, Ops.MSTACK} and self.buffer.is_allocated() else None @property def is_realized(self) -> bool: return all(x.base.realized is not None for x in self.base.src) if self.base.op is Ops.MULTI else self.base.realized is not None @@ -725,8 +552,9 @@ def print_uops(uops:list[UOp]): def get_location() -> tuple[str, int]: frm = sys._getframe(1) - # skip over ops.py (unless there's nothing but ops.py) - while pathlib.Path(frm.f_code.co_filename).name == "ops.py" and frm.f_back is not None and not frm.f_back.f_code.co_filename.startswith(" UOp: + # apply rewrite rules until a fixed point is reached. 
may return `uop` itself if PatternMatcher doesn't match + new_n: UOp|None = uop + while new_n is not None: last_n, new_n = new_n, self.rewrite(new_n, ctx) + return last_n + # *** tracking pattern matcher *** TRACK_MATCH_STATS = ContextVar("TRACK_MATCH_STATS", 2 if getenv("VIZ") else 0) @@ -889,13 +723,21 @@ match_stats:dict[UPat, list[Union[int, float]]] = dict() class TrackedGraphRewrite: loc: tuple[str, int] # location that called graph_rewrite sink: UOp # the sink input to graph_rewrite - bottom_up: bool matches: list[tuple[UOp, UOp, UPat]] # before+after of all the matches - name: str|None - depth: int + name: str|None # optional name of the rewrite + depth: int # depth if it's a subrewrite + bottom_up: bool tracked_keys:list[Any] = [] tracked_ctxs:list[list[TrackedGraphRewrite]] = [] _name_cnt:dict[str, int] = {} + +if getenv("CAPTURE_PROCESS_REPLAY"): + replay_capture: dict[str, bytes] = {} + import atexit + @atexit.register + def save_to_diskcache(): + for k,v in replay_capture.items(): diskcache_put("process_replay", k, v, prepickled=True) + def track_rewrites(named=False, name_fxn:Callable|None=None): def _decorator(func): def __wrapper(*args, **kwargs): @@ -904,7 +746,16 @@ def track_rewrites(named=False, name_fxn:Callable|None=None): tracked_keys.append(f"{func.__name__}_{_name_cnt[func.__name__]}" if count_names else args[0]) tracked_ctxs.append([]) ret = func(*args, **kwargs) - if TRACK_MATCH_STATS >= 2 and name_fxn is not None: tracked_keys[-1] = f"{name_fxn(ret)} n{_name_cnt[func.__name__]}" + if TRACK_MATCH_STATS >= 2 and name_fxn is not None: tracked_keys[-1] = f"{name_fxn(*args, **kwargs, ret=ret)} n{_name_cnt[func.__name__]}" + if getenv("CAPTURE_PROCESS_REPLAY"): + # find the unittest frame we're capturing in + frm = sys._getframe(1) + while (f_back:=frm.f_back) is not None and "unittest" not in f_back.f_code.co_filename: frm = f_back + loc = f"{frm.f_code.co_filename.split('/')[-1]}:{frm.f_lineno} {frm.f_code.co_name}" + # capture global 
context vars and all the args passed in + with Context(PICKLE_BUFFERS=0): + inputs = (func.__name__, args, kwargs, ContextVar._cache) + replay_capture[hashlib.sha256(pickle.dumps(inputs)).hexdigest()] = pickle.dumps(inputs+(loc, ret)) return ret return __wrapper return _decorator @@ -915,7 +766,7 @@ def track_matches(func): if tracking:=(TRACK_MATCH_STATS >= 2 and tracked_ctxs): loc = ((frm:=sys._getframe(1)).f_code.co_filename, frm.f_lineno) depth = len(active_rewrites) - tracked_ctxs[-1].append(ctx:=TrackedGraphRewrite(loc, args[0], kwargs.get("bottom_up", False),[], kwargs.get("name", None), depth)) + tracked_ctxs[-1].append(ctx:=TrackedGraphRewrite(loc, args[0], [], kwargs.get("name", None), depth, kwargs.get("bottom_up", False))) active_rewrites.append(ctx) ret = func(*args, **kwargs) if tracking: active_rewrites.pop() @@ -975,32 +826,46 @@ class RewriteContext: self.pm: PatternMatcher = pm self.ctx = ctx self.replace: dict[UOp, UOp] = {} - def top_down_rewrite(self, n:UOp) -> UOp: - if (rn := self.replace.get(n)) is not None: return rn - new_src = tuple([self.top_down_rewrite(x) for x in n.src]) - new_n = self.pm.rewrite(n, self.ctx) if new_src == n.src else UOp(n.op, n.dtype, new_src, n.arg) - self.replace[n] = ret = n if new_n is None else self.top_down_rewrite(new_n) - return ret - def bottom_up_rewrite(self, n:UOp) -> UOp: - if (rn := self.replace.get(n)) is not None: return rn - new_n: UOp|None = n - while new_n is not None: last_n, new_n = new_n, self.pm.rewrite(new_n, self.ctx) - new_src = tuple([self.bottom_up_rewrite(x) for x in last_n.src]) - self.replace[n] = ret = last_n if new_src == last_n.src else self.bottom_up_rewrite(UOp(last_n.op, last_n.dtype, new_src, last_n.arg)) - return ret + + def unified_rewrite(self, root:UOp, bottom_up=False) -> UOp: + stack: list[tuple[UOp, int, UOp]] = [(root, 0, root)] + while stack: + n, stage, new_n = stack.pop() + if n in self.replace: continue # skip any nodes we have seen + if stage == 0: + # if bottom up, 
we rewrite this node early. in both cases, we add its parents to the stack + if bottom_up: new_n = self.pm.fixed_point_rewrite(new_n, self.ctx) + stack.append((n, 1, new_n)) + for x in reversed(new_n.src): stack.append((x, 0, x)) + elif stage == 1: + if (new_src:=tuple([self.replace[x] for x in new_n.src])) == new_n.src: + # if top down, do the rewrite. if no rewrite or bottom up, we are done rewriting this node so we add it to the dict + if bottom_up or (new_src_n:=self.pm.rewrite(new_n, self.ctx)) is None: + self.replace[n] = new_n + continue + else: + # if srcs changed from rewrites, construct a new UOp with the new srcs + new_src_n = UOp(new_n.op, new_n.dtype, new_src, new_n.arg, new_n.tag) + # trigger a rewrite of new_src_n, then after that rewrite is done, link it back to n + stack.append((n, 2, new_src_n)) + stack.append((new_src_n, 0, new_src_n)) + else: + # in stage 2, we link the result of new_n to the result of n + self.replace[n] = self.replace[new_n] + return self.replace[root] @track_matches def graph_rewrite(sink:UOp, pm:PatternMatcher, ctx=None, bottom_up=False, name=None) -> UOp: rewrite_ctx = RewriteContext(pm, ctx) - return rewrite_ctx.bottom_up_rewrite(sink) if bottom_up else rewrite_ctx.top_down_rewrite(sink) + return rewrite_ctx.unified_rewrite(sink, bottom_up) @track_matches def graph_rewrite_map(sink:UOp, pm:PatternMatcher, ctx=None, bottom_up=False, name=None, input_map:dict[UOp, UOp]|None=None) -> dict[UOp, UOp]: rewrite_ctx = RewriteContext(pm, ctx) new_map: dict[UOp, UOp] = {} for k in sink.toposort(): - new_map[k] = v = rewrite_ctx.bottom_up_rewrite(k) if bottom_up else rewrite_ctx.top_down_rewrite(k) - if k.metadata is not None: all_metadata[v] = tuple(dedup(all_metadata.get(v, ())))+k.metadata + new_map[k] = v = rewrite_ctx.unified_rewrite(k, bottom_up) + if k is not v and k.metadata is not None: all_metadata[v] = tuple(dedup(all_metadata.get(v, ())))+k.metadata if input_map is not None: for k,v in input_map.items(): new_map[k] = 
new_map.get(v,v) return new_map diff --git a/tinygrad_repo/tinygrad/uop/spec.py b/tinygrad_repo/tinygrad/uop/spec.py index 1e65d5a..d577b09 100644 --- a/tinygrad_repo/tinygrad/uop/spec.py +++ b/tinygrad_repo/tinygrad/uop/spec.py @@ -1,7 +1,7 @@ from typing import cast, Callable from tinygrad.uop.ops import PatternMatcher, UPat, GroupOp, Ops, UOp, print_uops, python_alu, graph_rewrite, resolve from tinygrad.dtype import DType, ImageDType, dtypes, PtrDType -from tinygrad.helpers import all_same, prod, DEBUG, IGNORE_OOB, Context +from tinygrad.helpers import all_same, prod, DEBUG, ContextVar, Context try: import z3 @@ -25,12 +25,17 @@ try: (UPat(Ops.LOAD, name="x"), lambda x,ctx: UOp(Ops.NOOP, arg=create_bounded(f"load{ctx[1].setdefault(x, len(ctx[1]))}", x.vmin, x.vmax, ctx[0]))), (UPat(Ops.CONST, name="x"), lambda x,ctx: UOp(Ops.NOOP, arg=(z3.BoolVal if dtypes.is_bool(x.dtype) else z3.IntVal)(x.arg, ctx=ctx[0].ctx))), (UPat(Ops.CAST, name="x"), lambda x: x.src[0]), + (UPat(Ops.XOR, src=UPat(Ops.NOOP), name="x"), + lambda x: UOp(Ops.NOOP, arg=z3.BV2Int(z3_alu[x.op](*(z3.Int2BV(s.arg, x.dtype.itemsize*8) for s in x.src))))), (UPat(GroupOp.ALU, src=UPat(Ops.NOOP), name="x"), lambda x: UOp(Ops.NOOP, arg=z3_alu[x.op](*(s.arg for s in x.src)))), ]) z3_imported = True except (ImportError, AttributeError): z3_imported = False +# if you have z3 installed, by default we check the bounds +IGNORE_OOB = ContextVar("IGNORE_OOB", int(not z3_imported)) + buffer_spec = PatternMatcher([ (UPat(Ops.UNIQUE, dtypes.void, ()), lambda: True), (UPat(Ops.DEVICE, dtypes.void, (), name="d"), lambda d: @@ -39,6 +44,9 @@ buffer_spec = PatternMatcher([ lambda buf: isinstance(buf.arg, int) and isinstance(buf.dtype, (DType, ImageDType))), (UPat(Ops.BUFFER_VIEW, src=(UPat(Ops.BUFFER),), name="buf_view"), lambda buf_view: isinstance(buf_view.arg, tuple) and len(buf_view.arg) == 2 and all(isinstance(arg, (int, UOp)) for arg in buf_view.arg)), + (UPat(Ops.BUFFER_VIEW, src=(UPat(Ops.MSTACK, 
src=UPat(Ops.BUFFER)),)), lambda: True), + # allow VIEW here. TODO: what views specifically are allowed? does this mess with gradient? + (UPat(Ops.VIEW), lambda: True), ]) def validate_kernel(k:UOp): @@ -48,13 +56,16 @@ def validate_kernel(k:UOp): assign_spec = PatternMatcher([ # KERNEL can attach to an ASSIGN to describe the compute required to realize a BUFFER - (UPat(Ops.KERNEL, src=UPat((Ops.BUFFER, Ops.BUFFER_VIEW, Ops.ASSIGN, Ops.MSELECT)), name="k"), validate_kernel), + (UPat(Ops.KERNEL, src=UPat((Ops.BUFFER, Ops.BUFFER_VIEW, Ops.ASSIGN, Ops.MSELECT, Ops.MSTACK)), name="k"), validate_kernel), # ASSIGN has a target and a value. It can also optionally depend on other assigns (UPat(Ops.ASSIGN, name="x"), lambda x: len(x.src) >= 2 and all(s.op is Ops.ASSIGN for s in x.src[2:])), # MSELECT chooses one of the multi buffers (UPat(Ops.MSELECT, name="x"), lambda x: isinstance(x.src[0].device, tuple) and x.arg < len(x.src[0].device)), + + # MSTACK combines buffers into multi + (UPat(Ops.MSTACK, name="x"), lambda x: all(isinstance(x.device, str) for x in x.src)), ]) # *** this is the spec of a Tensor in UOp *** @@ -72,8 +83,9 @@ tensor_uop_spec = buffer_spec+assign_spec+PatternMatcher([ (UPat(Ops.BIND, dtypes.int, (UPat(Ops.DEFINE_VAR), UPat.cvar(dtype=dtypes.int)), arg=None), lambda: True), # Tensor const has a device and an unmasked ShapeTracker of stride 0 + # NOTE: variables in shape can cause multiple views in this ShapeTracker and other issues, see TestSymbolicJit.test_ones_sum (UPat(Ops.CONST, src=(UPat(Ops.VIEW, name="st", src=(UPat(Ops.DEVICE),)),)), - lambda st: st.st.views[0].mask is None and len(st.st.views) == 1 and all(s == 0 for s in st.st.views[0].strides)), + lambda st: len(st.st.views) == 1 and all(v.mask is None for v in st.st.views)), # DETACH and CONTIGUOUS change how we interpret the source UOp # CONTIGUOUS ensures the source UOp realizes @@ -127,12 +139,12 @@ spec = PatternMatcher([ (UPat(Ops.VALID, dtypes.bool, (UPat(Ops.VIEW),)), lambda: True), 
(UPat(Ops.CONST, name="x"), lambda x: type(x.arg) is type(dtypes.as_const(x.arg, x.dtype))), - # early LOAD has a - (UPat(Ops.LOAD, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW))), lambda: True), - (UPat(Ops.LOAD, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW), UPat(Ops.STORE))), lambda: True), + # early LOAD has a + (UPat(Ops.LOAD, src=(UPat(Ops.VIEW, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)),)),)), lambda: True), + (UPat(Ops.LOAD, src=(UPat(Ops.VIEW, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)),)), UPat(Ops.STORE))), lambda: True), - # early STORE has a - (UPat(Ops.STORE, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)), UPat(Ops.VIEW), UPat())), lambda: True), + # early STORE has a + (UPat(Ops.STORE, src=(UPat(Ops.VIEW, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL)),)), UPat())), lambda: True), # **** new style load/store **** @@ -188,12 +200,6 @@ spec = PatternMatcher([ (UPat((Ops.LOAD, Ops.STORE), src=(UPat(dtype=dtypes.int64),), allow_any_len=True), lambda: True), ]) -# *** schedule spec only allows buffers, assigns and kernels in the graph *** - -sched_spec = buffer_spec+assign_spec+PatternMatcher([ - (UPat(GroupOp.All-{Ops.SINK}), lambda: False), -]) - # *** this is the UOp AST spec *** def verify_sink_dims(sink:UOp): @@ -207,6 +213,7 @@ ast_spec = PatternMatcher([ # shapes must have either 1 or n in each dimension (UPat(Ops.SINK, src=UPat(Ops.STORE), name="sink"), verify_sink_dims), # VIEW can only exist in the edges + (UPat(Ops.VIEW, src=(UPat((Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL),))), lambda: True), (UPat(Ops.VIEW, name="view"), lambda view: len(view.src) == 0), # all parent UOps must have the same shape (UPat(GroupOp.All-{Ops.SINK}, name="root"), lambda root: all_same([x.shape for x in root.src if x.st is not None])), diff --git a/tinygrad_repo/tinygrad/codegen/symbolic.py b/tinygrad_repo/tinygrad/uop/symbolic.py similarity index 99% rename from tinygrad_repo/tinygrad/codegen/symbolic.py rename to 
tinygrad_repo/tinygrad/uop/symbolic.py index f293e66..3764b67 100644 --- a/tinygrad_repo/tinygrad/codegen/symbolic.py +++ b/tinygrad_repo/tinygrad/uop/symbolic.py @@ -5,7 +5,7 @@ from collections import defaultdict from tinygrad.uop.ops import Ops, PatternMatcher, UPat, UOp, GroupOp, exec_alu from tinygrad.dtype import ConstType, dtypes, PtrDType from tinygrad.helpers import partition, all_same, prod, flatten, get_single_element, cdiv, cmod, CORRECT_DIVMOD_FOLDING -from tinygrad.codegen.transcendental import xpow +from tinygrad.uop.transcendental import xpow # ******** phase 1 of symbolic used to live in ops, it's the most generic folding rules ******** @@ -25,6 +25,7 @@ symbolic_simple = PatternMatcher([ # ** self folding ** (UPat.var("x") + 0, lambda x: x), # x+0 -> x (UPat.var("x") * 1, lambda x: x), # x*1 -> x + (UPat.var("x", dtype=dtypes.ints) ^ 0, lambda x: x), # x^0 -> x (UPat.var("x") // UPat.var("x"), lambda x: x.const_like(1)), # x//x -> 1 (UPat.var("x") // 1, lambda x: x), # x//1 -> x (UPat.var("x") // -1, lambda x: -x), # x//-1 -> -x diff --git a/tinygrad_repo/tinygrad/codegen/transcendental.py b/tinygrad_repo/tinygrad/uop/transcendental.py similarity index 100% rename from tinygrad_repo/tinygrad/codegen/transcendental.py rename to tinygrad_repo/tinygrad/uop/transcendental.py diff --git a/tinygrad_repo/tinygrad/viz/index.html b/tinygrad_repo/tinygrad/viz/index.html index a6d847a..f0fba8e 100644 --- a/tinygrad_repo/tinygrad/viz/index.html +++ b/tinygrad_repo/tinygrad/viz/index.html @@ -44,11 +44,17 @@ ul.active { opacity: 1; } + ul > ul { + display: none; + } + ul.expanded > ul { + display: block; + } ul.disabled { opacity: 0.4; pointer-events: none; } - svg { + .graph svg { width: 100%; height: 100%; } @@ -56,6 +62,21 @@ cursor: default; user-select: none; } + g.clickable * { + cursor: pointer; + user-select: auto; + } + g.tag circle { + r: 5; + fill: #FFD700; + stroke: #B8860B; + stroke-width: 0.8; + } + g.tag text { + text-anchor: middle; + 
font-size: 6px; + fill: black; + } .label :is(text, p) { color: #08090e; font-weight: 350; @@ -78,10 +99,10 @@ position: relative; height: 100%; } - .metadata > * + *, .rewrite-container > * + *, .kernel-list > * + * { + .metadata > * + *, .rewrite-container > * + *, .ctx-list > * + * { margin-top: 12px; } - .kernel-list > ul > * + * { + .ctx-list > ul > * + * { margin-top: 4px; } .graph { @@ -89,12 +110,12 @@ inset: 0; z-index: 1; } - .kernel-list-parent { + .ctx-list-parent { width: 15%; padding-top: 50px; border-right: 1px solid #4a4b56; } - .kernel-list { + .ctx-list { width: 100%; height: 100%; overflow-y: auto; @@ -189,7 +210,7 @@ -
+
Rendering new layout...
diff --git a/tinygrad_repo/tinygrad/viz/js/index.js b/tinygrad_repo/tinygrad/viz/js/index.js index e295490..be91b1a 100644 --- a/tinygrad_repo/tinygrad/viz/js/index.js +++ b/tinygrad_repo/tinygrad/viz/js/index.js @@ -28,7 +28,7 @@ async function renderDag(graph, additions, recenter=false) { if (timeout != null) clearTimeout(timeout); const progressMessage = document.querySelector(".progress-message"); timeout = setTimeout(() => {progressMessage.style.display = "block"}, 2000); - worker.postMessage({graph, additions}); + worker.postMessage({graph, additions, ctxs}); worker.onmessage = (e) => { progressMessage.style.display = "none"; clearTimeout(timeout); @@ -37,15 +37,20 @@ async function renderDag(graph, additions, recenter=false) { // draw nodes const STROKE_WIDTH = 1.4; const nodes = d3.select("#nodes").selectAll("g").data(g.nodes().map(id => g.node(id)), d => d).join("g") - .attr("transform", d => `translate(${d.x},${d.y})`); + .attr("transform", d => `translate(${d.x},${d.y})`).classed("clickable", d => d.ref != null) + .on("click", (_,d) => setCtxWithHistory(d.ref)); nodes.selectAll("rect").data(d => [d]).join("rect").attr("width", d => d.width).attr("height", d => d.height).attr("fill", d => d.color) - .attr("x", d => -d.width/2).attr("y", d => -d.height/2).attr("style", d => `stroke:#4a4b57; stroke-width:${STROKE_WIDTH}px; ${d.style}`); + .attr("x", d => -d.width/2).attr("y", d => -d.height/2).attr("style", d => d.style ?? `stroke:#4a4b57; stroke-width:${STROKE_WIDTH}px;`); nodes.selectAll("g.label").data(d => [d]).join("g").attr("class", "label").attr("transform", d => { const x = (d.width-d.padding*2)/2; const y = (d.height-d.padding*2)/2+STROKE_WIDTH; return `translate(-${x}, -${y})`; }).selectAll("text").data(d => [d.label.split("\n")]).join("text").selectAll("tspan").data(d => d).join("tspan").text(d => d).attr("x", "0") .attr("dy", 14).attr("xml:space", "preserve"); + const tags = nodes.selectAll("g.tag").data(d => d.tag != null ? 
[d] : []).join("g").attr("class", "tag") + .attr("transform", d => `translate(${-d.width/2+8}, ${-d.height/2+8})`); + tags.selectAll("circle").data(d => [d]).join("circle"); + tags.selectAll("text").data(d => [d.tag]).join("text").text(d => d).attr("dy", "0.35em"); // draw edges const line = d3.line().x(d => d.x).y(d => d.y).curve(d3.curveBasis); d3.select("#edges").selectAll("path.edgePath").data(g.edges()).join("path").attr("class", "edgePath").attr("d", (e) => { @@ -69,11 +74,9 @@ async function renderDag(graph, additions, recenter=false) { const x = p2.x - ux * offset; const y = p2.y - uy * offset; return `translate(${x}, ${y})` - }); - edgeLabels.selectAll("circle").data(e => [g.edge(e).label]).join("circle").attr("r", 5).attr("fill", "#FFD700").attr("stroke", "#B8860B") - .attr("stroke-width", 0.8); - edgeLabels.selectAll("text").data(e => [g.edge(e).label]).join("text").text(d => d).attr("text-anchor", "middle").attr("dy", "0.35em") - .attr("font-size", "6px").attr("fill", "black"); + }).attr("class", "tag"); + edgeLabels.selectAll("circle").data(e => [g.edge(e).label]).join("circle"); + edgeLabels.selectAll("text").data(e => [g.edge(e).label]).join("text").text(d => d).attr("dy", "0.35em"); if (recenter) document.getElementById("zoom-to-fit-btn").click(); }; @@ -168,6 +171,9 @@ function renderMemoryGraph(graph) { b.y.push(b.y.at(-1)); } // ** render traces + // clear existing groups + document.querySelector(".progress-message").style.display = "none"; + for (c of document.getElementById("render").children) c.innerHTML = ""; const render = d3.select("#bars"); const yscale = d3.scaleLinear().domain([0, peak]).range([576, 0]); const xscale = d3.scaleLinear().domain([0, timestep]).range([0, 1024]); @@ -201,31 +207,29 @@ function renderMemoryGraph(graph) { d3.select(e.currentTarget).attr("stroke", null).attr("stroke-width", null); document.getElementById("current-buf")?.remove() }); - // TODO: add the toposort graph here - 
document.querySelector(".progress-message").style.display = "none"; - d3.select("#nodes").html(""); - d3.select("#edges").html(""); + // TODO: add the kernel line here document.getElementById("zoom-to-fit-btn").click(); } // ** zoom and recentering -const zoom = d3.zoom().on("zoom", (e) => d3.select("#render").attr("transform", e.transform)); -d3.select("#graph-svg").call(zoom); +const svgZoom = d3.zoom().on("zoom", (e) => d3.select("#render").attr("transform", e.transform)); +d3.select("#graph-svg").call(svgZoom); + // zoom to fit into view document.getElementById("zoom-to-fit-btn").addEventListener("click", () => { const svg = d3.select("#graph-svg"); - svg.call(zoom.transform, d3.zoomIdentity); + svg.call(svgZoom.transform, d3.zoomIdentity); const mainRect = rect(".main-container"); - const x0 = rect(".kernel-list-parent").right; + const x0 = rect(".ctx-list-parent").right; const x1 = rect(".metadata-parent").left; const pad = 16; const R = { x: x0+pad, y: mainRect.top+pad, width: (x1>0 ? 
x1-x0 : mainRect.width)-2*pad, height: mainRect.height-2*pad }; const r = rect("#render"); if (r.width === 0) return; const scale = Math.min(R.width/r.width, R.height/r.height); - const [tx, ty] = [R.x+(R.width-r.width*scale)/2, R.y+(R.height-r.height*scale)/2]; - svg.call(zoom.transform, d3.zoomIdentity.translate(tx, ty).scale(scale)); + const [tx, ty] = [R.x+(R.width-r.width*scale)/2-r.left*scale, R.y+(R.height-r.height*scale)/2]; + svg.call(svgZoom.transform, d3.zoomIdentity.translate(tx, ty).scale(scale)); }); // **** main VIZ interfacae @@ -245,6 +249,11 @@ function codeBlock(st, language, { loc, wrap }) { return ret; } +function setActive(e) { + e.classList.add("active"); + requestAnimationFrame(() => e.scrollIntoView({ behavior: "auto", block: "nearest" })); +} + // ** hljs extra definitions for UOps and float4 hljs.registerLanguage("python", (hljs) => ({ ...hljs.getLanguage("python"), @@ -264,69 +273,92 @@ hljs.registerLanguage("cpp", (hljs) => ({ var ret = []; var cache = {}; -var kernels = null; +var ctxs = null; const evtSources = []; -const state = {currentKernel:-1, currentUOp:0, currentRewrite:0, expandKernel:false}; +// VIZ displays graph rewrites in 3 levels, from bottom-up: +// rewrite: a single UOp transformation +// step: collection of rewrites +// context: collection of steps +const state = {currentCtx:-1, currentStep:0, currentRewrite:0, expandSteps:false}; function setState(ns) { + const { currentCtx:prevCtx, currentStep:prevStep } = state; Object.assign(state, ns); + // update element styles if needed + document.getElementById(`ctx-${state.currentCtx}`)?.classList.toggle("expanded", state.expandSteps); + if (state.currentCtx !== prevCtx) { + document.getElementById(`ctx-${prevCtx}`)?.classList.remove("active", "expanded"); + setActive(document.getElementById(`ctx-${state.currentCtx}`)); + } + if (state.currentCtx !== prevCtx || state.currentStep !== prevStep) { + 
document.getElementById(`step-${prevCtx}-${prevStep}`)?.classList.remove("active"); + setActive(document.getElementById(`step-${state.currentCtx}-${state.currentStep}`)); + } + // re-render main(); } + +// set a new context and keep the old one in browser history +function setCtxWithHistory(newCtx) { + if (newCtx == null) return; + // NOTE: browser does a structured clone, passing a mutable object is safe. + history.replaceState(state, ""); + history.pushState(state, ""); + setState({ expandSteps:true, currentCtx:newCtx, currentStep:0, currentRewrite:0 }); +} + +window.addEventListener("popstate", (e) => { + if (e.state != null) setState(e.state); +}); + async function main() { - const { currentKernel, currentUOp, currentRewrite, expandKernel } = state; - // ** left sidebar kernel list - if (kernels == null) { - kernels = await (await fetch("/kernels")).json(); - setState({ currentKernel:-1 }); - } - const kernelList = document.querySelector(".kernel-list"); - kernelList.innerHTML = ""; - for (const [i,k] of kernels.entries()) { - const ul = kernelList.appendChild(document.createElement("ul")); - if (i === currentKernel) { - ul.className = "active"; - requestAnimationFrame(() => ul.scrollIntoView({ behavior: "auto", block: "nearest" })); - } - const p = ul.appendChild(document.createElement("p")); - p.innerHTML = k[0].replace(/\u001b\[(\d+)m(.*?)\u001b\[0m/g, (_, code, st) => { - const colors = ['gray','red','green','yellow','blue','magenta','cyan','white']; - return `${st}`; - }); - p.onclick = () => { - setState(i === currentKernel ? 
{ expandKernel:!expandKernel } : { expandKernel:true, currentKernel:i, currentUOp:0, currentRewrite:0 }); - } - for (const [j,u] of k[1].entries()) { - const inner = ul.appendChild(document.createElement("ul")); - if (i === currentKernel && j === currentUOp) { - inner.className = "active"; - requestAnimationFrame(() => inner.scrollIntoView({ behavior: "auto", block: "nearest" })); + // ** left sidebar context list + if (ctxs == null) { + ctxs = await (await fetch("/ctxs")).json(); + const ctxList = document.querySelector(".ctx-list"); + for (const [i,{name, steps}] of ctxs.entries()) { + const ul = ctxList.appendChild(document.createElement("ul")); + ul.id = `ctx-${i}`; + const p = ul.appendChild(document.createElement("p")); + p.innerHTML = name.replace(/\u001b\[(\d+)m(.*?)\u001b\[0m/g, (_, code, st) => { + const colors = ['gray','red','green','yellow','blue','magenta','cyan','white']; + return `${st}`; + }); + p.onclick = () => { + setState(i === state.currentCtx ? { expandSteps:!state.expandSteps } : { expandSteps:true, currentCtx:i, currentStep:0, currentRewrite:0 }); } - inner.innerText = `${u.name ?? u.loc[0].replaceAll("\\", "/").split("/").pop()+':'+u.loc[1]} - ${u.match_count}`; - inner.style.marginLeft = `${8*u.depth}px`; - inner.style.display = i === currentKernel && expandKernel ? "block" : "none"; - inner.onclick = (e) => { - e.stopPropagation(); - setState({ currentUOp:j, currentKernel:i, currentRewrite:0 }); + for (const [j,u] of steps.entries()) { + const inner = ul.appendChild(document.createElement("ul")); + inner.id = `step-${i}-${j}`; + inner.innerText = `${u.name ?? 
u.loc[0].replaceAll("\\", "/").split("/").pop()+':'+u.loc[1]} - ${u.match_count}`; + inner.style.marginLeft = `${8*u.depth}px`; + inner.onclick = (e) => { + e.stopPropagation(); + setState({ currentStep:j, currentCtx:i, currentRewrite:0 }); + } } } + return setState({ currentCtx:-1 }); } // ** center graph - if (currentKernel == -1) return; - const kernel = kernels[currentKernel][1][currentUOp]; - const cacheKey = `kernel=${currentKernel}&idx=${currentUOp}`; + const { currentCtx, currentStep, currentRewrite, expandSteps } = state; + if (currentCtx == -1) return; + const ctx = ctxs[currentCtx]; + const step = ctx.steps[currentStep]; + const ckey = `ctx=${currentCtx}&idx=${currentStep}`; // close any pending event sources let activeSrc = null; for (const e of evtSources) { - if (e.url.split("?")[1] !== cacheKey) e.close(); + if (e.url.split("?")[1] !== ckey) e.close(); else if (e.readyState === EventSource.OPEN) activeSrc = e; } - if (cacheKey in cache) { - ret = cache[cacheKey]; + if (ckey in cache) { + ret = cache[ckey]; } - // if we don't have a complete cache yet we start streaming this kernel - if (!(cacheKey in cache) || (cache[cacheKey].length !== kernel.match_count+1 && activeSrc == null)) { + // if we don't have a complete cache yet we start streaming rewrites in this step + if (!(ckey in cache) || (cache[ckey].length !== step.match_count+1 && activeSrc == null)) { ret = []; - cache[cacheKey] = ret; - const eventSource = new EventSource(`/kernels?kernel=${currentKernel}&idx=${currentUOp}`); + cache[ckey] = ret; + const eventSource = new EventSource(`/ctxs?${ckey}`); evtSources.push(eventSource); eventSource.onmessage = (e) => { if (e.data === "END") return eventSource.close(); @@ -340,20 +372,20 @@ async function main() { }; } if (ret.length === 0) return; - if (kernel.name == "View Memory Graph") { + if (step.name == "View Memory Graph") { renderMemoryGraph(ret[currentRewrite].graph); } else { renderDag(ret[currentRewrite].graph, 
ret[currentRewrite].changed_nodes || [], recenter=currentRewrite === 0); } // ** right sidebar code blocks const metadata = document.querySelector(".metadata"); - const [code, lang] = kernel.kernel_code != null ? [kernel.kernel_code, "cpp"] : [ret[currentRewrite].uop, "python"]; - metadata.replaceChildren(codeBlock(kernel.code_line, "python", { loc:kernel.loc, wrap:true }), codeBlock(code, lang, { wrap:false })); + const [code, lang] = ctx.kernel_code != null ? [ctx.kernel_code, "cpp"] : [ret[currentRewrite].uop, "python"]; + metadata.replaceChildren(codeBlock(step.code_line, "python", { loc:step.loc, wrap:true }), codeBlock(code, lang, { wrap:false })); // ** rewrite steps - if (kernel.match_count >= 1) { + if (step.match_count >= 1) { const rewriteList = metadata.appendChild(document.createElement("div")); rewriteList.className = "rewrite-list"; - for (let s=0; s<=kernel.match_count; s++) { + for (let s=0; s<=step.match_count; s++) { const ul = rewriteList.appendChild(document.createElement("ul")); ul.innerText = s; ul.id = `rewrite-${s}`; @@ -407,36 +439,37 @@ function appendResizer(element, { minWidth, maxWidth }, left=false) { }, { once: true }); }); } -appendResizer(document.querySelector(".kernel-list-parent"), { minWidth: 15, maxWidth: 50 }, left=true); +appendResizer(document.querySelector(".ctx-list-parent"), { minWidth: 15, maxWidth: 50 }, left=true); appendResizer(document.querySelector(".metadata-parent"), { minWidth: 20, maxWidth: 50 }); // **** keyboard shortcuts document.addEventListener("keydown", async function(event) { - const { currentKernel, currentUOp, currentRewrite, expandKernel } = state; - // up and down change the UOp or kernel from the list + const { currentCtx, currentStep, currentRewrite, expandSteps } = state; + // up and down change the step or context from the list + const changeStep = expandSteps && ctxs[currentCtx].steps?.length; if (event.key == "ArrowUp") { event.preventDefault(); - if (expandKernel) { - return setState({ 
currentRewrite:0, currentUOp:Math.max(0, currentUOp-1) }); + if (changeStep) { + return setState({ currentRewrite:0, currentStep:Math.max(0, currentStep-1) }); } - return setState({ currentUOp:0, currentRewrite:0, currentKernel:Math.max(0, currentKernel-1) }); + return setState({ currentStep:0, currentRewrite:0, currentCtx:Math.max(0, currentCtx-1), expandSteps:false }); } if (event.key == "ArrowDown") { event.preventDefault(); - if (expandKernel) { - const totalUOps = kernels[currentKernel][1].length-1; - return setState({ currentRewrite:0, currentUOp:Math.min(totalUOps, currentUOp+1) }); + if (changeStep) { + const totalUOps = ctxs[currentCtx].steps.length-1; + return setState({ currentRewrite:0, currentStep:Math.min(totalUOps, currentStep+1) }); } - return setState({ currentUOp:0, currentRewrite:0, currentKernel:Math.min(kernels.length-1, currentKernel+1) }); + return setState({ currentStep:0, currentRewrite:0, currentCtx:Math.min(ctxs.length-1, currentCtx+1), expandSteps:false }); } // enter toggles focus on a single rewrite stage if (event.key == "Enter") { event.preventDefault() - if (state.currentKernel === -1) { - return setState({ currentKernel:0, expandKernel:true }); + if (currentCtx === -1) { + return setState({ currentCtx:0, expandSteps:true }); } - return setState({ currentUOp:0, currentRewrite:0, expandKernel:!expandKernel }); + return setState({ expandSteps:!expandSteps }); } // left and right go through rewrites in a single UOp if (event.key == "ArrowLeft") { diff --git a/tinygrad_repo/tinygrad/viz/js/worker.js b/tinygrad_repo/tinygrad/viz/js/worker.js index 973a484..5ff131e 100644 --- a/tinygrad_repo/tinygrad/viz/js/worker.js +++ b/tinygrad_repo/tinygrad/viz/js/worker.js @@ -5,18 +5,22 @@ const ctx = canvas.getContext("2d"); ctx.font = `${LINE_HEIGHT}px sans-serif`; onmessage = (e) => { - const { graph, additions } = e.data; + const { graph, additions, ctxs } = e.data; const g = new dagre.graphlib.Graph({ compound: true }); g.setGraph({ rankdir: 
"LR" }).setDefaultEdgeLabel(function() { return {}; }); - if (additions.length !== 0) g.setNode("addition", {label:"", style:"fill: rgba(26, 27, 38, 0.5); stroke: none;", padding:0}); - for (const [k, {label, src, color}] of Object.entries(graph)) { + if (additions.length !== 0) g.setNode("addition", {label:"", style:"fill: rgba(26, 27, 38, 0.5);", padding:0}); + for (let [k, {label, src, ref, ...rest }] of Object.entries(graph)) { + const idx = ref ? ctxs.findIndex(k => k.ref === ref) : -1; + // replace colors in label + if (idx != -1) label += `\ncodegen@${ctxs[idx].name.replace(/\x1b\[\d+m(.*?)\x1b\[0m/g, "$1")}`; // adjust node dims by label size + add padding let [width, height] = [0, 0]; for (line of label.split("\n")) { width = Math.max(width, ctx.measureText(line).width); height += LINE_HEIGHT; } - g.setNode(k, {label, color, width:width+NODE_PADDING*2, height:height+NODE_PADDING*2, padding:NODE_PADDING}); + g.setNode(k, {width:width+NODE_PADDING*2, height:height+NODE_PADDING*2, padding:NODE_PADDING, label, ref:idx==-1 ? null : idx, ...rest}); + // add edges const edgeCounts = {} for (const s of src) edgeCounts[s] = (edgeCounts[s] || 0)+1; for (const s of src) g.setEdge(s, k, { label: edgeCounts[s] > 1 ? 
edgeCounts[s] : null }); diff --git a/tinygrad_repo/tinygrad/viz/serve.py b/tinygrad_repo/tinygrad/viz/serve.py index 273cf40..e2ebbe0 100755 --- a/tinygrad_repo/tinygrad/viz/serve.py +++ b/tinygrad_repo/tinygrad/viz/serve.py @@ -12,34 +12,27 @@ from tinygrad.dtype import dtypes uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.VCONST: "#e0e0e0", Ops.REDUCE: "#FF5B5B", Ops.DEFINE_GLOBAL: "#ffe0b0", Ops.DEFINE_LOCAL: "#ffe0d0", Ops.DEFINE_ACC: "#f0ffe0", Ops.REDUCE_AXIS: "#FF6B6B", Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#909090", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff", - Ops.INDEX: "#e8ffa0", Ops.WMMA: "#efefc0", Ops.VIEW: "#C8F9D4", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55", Ops.IGNORE: "#00C000", + Ops.INDEX: "#e8ffa0", Ops.WMMA: "#efefc0", Ops.VIEW: "#C8F9D4", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55", **{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80", Ops.BUFFER_VIEW: "#E5EAFF", Ops.BLOCK: "#C4A484", Ops.BLOCKEND: "#C4A4A4", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0", Ops.FUSE: "#FFa500", - Ops.ALLREDUCE: "#ff40a0", Ops.GBARRIER: "#FFC14D", Ops.MSELECT: "#d040a0"} + Ops.ALLREDUCE: "#ff40a0", Ops.GBARRIER: "#FFC14D", Ops.MSELECT: "#d040a0", Ops.MSTACK: "#d040a0"} # VIZ API # ** Metadata for a track_rewrites scope -class GraphRewriteMetadata(TypedDict): - loc: tuple[str, int] # [path, lineno] calling graph_rewrite - match_count: int # total match count in this context - code_line: str # source code calling graph_rewrite - kernel_code: str|None # optionally render the final kernel code - name: str|None # optional name of the rewrite - depth: int # depth if it's a subrewrite - @functools.cache def render_program(k:Kernel): try: return k.opts.render(k.uops) except Exception as e: return f"ISSUE RENDERING KERNEL: {e}\nast = {k.ast}\nopts = {k.applied_opts}" -def to_metadata(k:Any, v:TrackedGraphRewrite) -> GraphRewriteMetadata: - return {"loc":v.loc, 
"match_count":len(v.matches), "name":v.name, "depth":v.depth, "code_line":lines(v.loc[0])[v.loc[1]-1].strip(), - "kernel_code":render_program(k) if isinstance(k, Kernel) else None} - -def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[tuple[str, list[GraphRewriteMetadata]]]: - return [(k.name if isinstance(k, Kernel) else str(k), [to_metadata(k, v) for v in vals]) for k,vals in zip(keys, contexts)] +def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[dict]: + ret = [] + for k,v in zip(keys, contexts): + steps = [{"name":s.name, "loc":s.loc, "depth":s.depth, "match_count":len(s.matches), "code_line":lines(s.loc[0])[s.loc[1]-1].strip()} for s in v] + if isinstance(k, Kernel): ret.append({"name":k.name, "kernel_code":render_program(k), "ref":id(k.ast), "steps":steps}) + else: ret.append({"name":str(k), "steps":steps}) + return ret # ** Complete rewrite details for a graph_rewrite call @@ -82,10 +75,11 @@ def uop_to_json(x:UOp) -> dict[int, dict]: label += "\n" # NOTE: kernel already has metadata in arg if TRACEMETA >= 2 and u.metadata is not None and u.op is not Ops.KERNEL: label += "\n"+repr(u.metadata) - graph[id(u)] = {"label":label, "src":[id(x) for x in u.src if x not in excluded], "color":uops_colors.get(u.op, "#ffffff")} + graph[id(u)] = {"label":label, "src":[id(x) for x in u.src if x not in excluded], "color":uops_colors.get(u.op, "#ffffff"), + "ref":id(u.arg.ast) if u.op is Ops.KERNEL else None, "tag":u.tag} return graph -def get_details(k:Any, ctx:TrackedGraphRewrite) -> Generator[GraphRewriteDetails, None, None]: +def get_details(ctx:TrackedGraphRewrite) -> Generator[GraphRewriteDetails, None, None]: yield {"graph":uop_to_json(next_sink:=ctx.sink), "uop":str(ctx.sink), "changed_nodes":None, "diff":None, "upat":None} replaces: dict[UOp, UOp] = {} for u0,u1,upat in tqdm(ctx.matches): @@ -139,23 +133,23 @@ class Handler(BaseHTTPRequestHandler): if url.path.endswith(".js"): content_type = 
"application/javascript" if url.path.endswith(".css"): content_type = "text/css" except FileNotFoundError: status_code = 404 - elif url.path == "/kernels": - if "kernel" in (query:=parse_qs(url.query)): - kidx, ridx = int(query["kernel"][0]), int(query["idx"][0]) + elif url.path == "/ctxs": + if "ctx" in (query:=parse_qs(url.query)): + kidx, ridx = int(query["ctx"][0]), int(query["idx"][0]) try: # stream details self.send_response(200) self.send_header("Content-Type", "text/event-stream") self.send_header("Cache-Control", "no-cache") self.end_headers() - for r in get_details(contexts[0][kidx], contexts[1][kidx][ridx]): + for r in get_details(contexts[1][kidx][ridx]): self.wfile.write(f"data: {json.dumps(r)}\n\n".encode("utf-8")) self.wfile.flush() self.wfile.write("data: END\n\n".encode("utf-8")) return self.wfile.flush() # pass if client closed connection except (BrokenPipeError, ConnectionResetError): return - ret, content_type = json.dumps(kernels).encode(), "application/json" + ret, content_type = json.dumps(ctxs).encode(), "application/json" elif url.path == "/get_profile" and perfetto_profile is not None: ret, content_type = perfetto_profile, "application/json" else: status_code = 404 @@ -200,7 +194,7 @@ if __name__ == "__main__": contexts, profile = load_pickle(args.kernels), load_pickle(args.profile) # NOTE: this context is a tuple of list[keys] and list[values] - kernels = get_metadata(*contexts) if contexts is not None else [] + ctxs = get_metadata(*contexts) if contexts is not None else [] perfetto_profile = to_perfetto(profile) if profile is not None else None @@ -208,7 +202,7 @@ if __name__ == "__main__": reloader_thread = threading.Thread(target=reloader) reloader_thread.start() print(f"*** started viz on {HOST}:{PORT}") - print(colored(f"*** ready in {(time.perf_counter()-st)*1e3:4.2f}ms", "green")) + print(colored(f"*** ready in {(time.perf_counter()-st)*1e3:4.2f}ms", "green"), flush=True) if len(getenv("BROWSER", "")) > 0: 
webbrowser.open(f"{HOST}:{PORT}{'/profiler' if contexts is None else ''}") try: server.serve_forever() except KeyboardInterrupt: