import ctypes, array
from hexdump import hexdump
from tinygrad.runtime.ops_gpu import GPUDevice
from tinygrad.helpers import getenv, to_mv, mv_address
from tinygrad.dtype import dtypes
from tinygrad import Tensor, TinyJit
from tinygrad.runtime.autogen import opencl as cl
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import

# ---- part 1: a plain OpenCL buffer -------------------------------------------
# Allocate a 0x100-byte read/write buffer directly through the OpenCL API.
gdev = GPUDevice()
status = ctypes.c_int32()
cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status)
assert status.value == 0

# Fill it with the values 0..63 so there is something recognizable to read back.
data = memoryview(array.array('I', range(64)))
cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, 0x100, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # the write was enqueued with blocking=False, so wait for it to complete

# Recover the raw gpu pointer that backs the OpenCL buffer.
## step 1: dump the memory of the handle object, then read its first 8 bytes as
## a 64-bit value; that value is used below as the address of the buffer descriptor.
hexdump(to_mv(ctypes.addressof(cl_buf), 0x40))
cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]

## step 2: dump the descriptor and pick out the device pointer.
hexdump(to_mv(cl_buf_desc_ptr, 0x100))
# 64-bit word 20 (byte offset 0xA0) of the descriptor is a raw gpu pointer.
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]
# Wrap the externally managed buffer in a QCOM tensor and run a computation on it.
x = Tensor.from_blob(rawbuf_ptr, (8, 8), dtype=dtypes.int, device='QCOM')
y = (x + 1).numpy()
print(y)

# The result has been read back to the host, so it is safe to free the object.
cl.clReleaseMemObject(cl_buf)

# ---- part 2: the same flow, driven through a jitted function ------------------
@TinyJit
def calc(x):
  """Return `x + 2`; wrapped in TinyJit so repeated calls go through the jit."""
  return x + 2

for i in range(4):
  # A fresh 2x2 buffer of 4-byte elements (2*2*4 bytes) is created on every iteration.
  status = ctypes.c_int32()
  cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 2*2*4, None, status)
  assert status.value == 0

  # Contents depend on the iteration: i, i+1, i+2, i+3.
  data = memoryview(array.array('I', [v + i for v in range(2*2)]))
  cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, 2*2*4, mv_address(data), 0, None, None)
  cl.clFinish(gdev.queue)  # wait for the non-blocking write to complete

  # Same two-step pointer lookup as for the first buffer: handle -> descriptor -> word 20.
  cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]
  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]

  # The tensor is passed as a keyword argument and is not bound to a name of its own.
  y = calc(x=Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).numpy()
  print(f'jit {i}\n', y)

  # The result has been read back to the host, so it is safe to free the object.
  cl.clReleaseMemObject(cl_buf)

# ---- part 3: now images! -------------------------------------------------------
h, w = 128, 128
status = ctypes.c_int32()
img_format = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
cl_img = cl.clCreateImage2D(gdev.context, cl.CL_MEM_READ_WRITE, img_format, w, h, 0, None, status)
assert status.value == 0

# Fill the image with something for fun: h*w pixels, 4 float channels each.
data = memoryview(array.array('f', range(h*w*4)))
origin = (ctypes.c_size_t * 3)(0, 0, 0)
region = (ctypes.c_size_t * 3)(w, h, 1)
cl.clEnqueueWriteImage(gdev.queue, cl_img, False, origin, region, 0, 0, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # wait for the non-blocking write to complete

# Recover the raw gpu pointer that backs the OpenCL image.
## step 1: dump the handle object's memory and read its first 8 bytes as the descriptor address.
hexdump(to_mv(ctypes.addressof(cl_img), 0x40))
cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_img), 8).cast('Q')[0]

## step 2: dump the descriptor and pick out the device pointer.
hexdump(to_mv(cl_buf_desc_ptr, 0x100))
# 64-bit word 20 (byte offset 0xA0) of the descriptor is a raw gpu pointer.
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]
# Wrap the externally managed image in a QCOM tensor.
# Image dtypes and the OpenCL formats they correspond to:
#   dtypes.imageh = cl.cl_image_format(cl.CL_RGBA, cl.CL_HALF_FLOAT)
#   dtypes.imagef = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
img_dtype = dtypes.imagef((h, w))
flat_shape = (h*w*4,)  # the tensor itself is flat: h*w pixels times 4 channels
x = Tensor.from_blob(rawbuf_ptr, flat_shape, dtype=img_dtype, device='QCOM')
y = (x + 1).numpy()
print(y)

# The result has been read back to the host, so it is safe to free the object.
cl.clReleaseMemObject(cl_img)