carrot/tinygrad_repo/extra/qcom_gpu_driver/qcom_opencl_interop.py
Vehicle Researcher 8eb8330d95 openpilot v0.9.9 release
date: 2025-03-08T09:09:29
master commit: ce355250be726f9bc8f0ac165a6cde41586a983d
2025-03-08 09:09:31 +00:00

88 lines
3.3 KiB
Python

import ctypes, array
from hexdump import hexdump
from tinygrad.runtime.ops_gpu import GPUDevice
from tinygrad.helpers import getenv, to_mv, mv_address
from tinygrad.dtype import dtypes
from tinygrad import Tensor, TinyJit
from tinygrad.runtime.autogen import opencl as cl
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
# Allocate a plain OpenCL buffer directly through the OpenCL API.
BUF_NBYTES = 0x100
gdev = GPUDevice()
status = ctypes.c_int32()
cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, BUF_NBYTES, None, status)
assert status.value == 0

# Upload the values 0..63 as unsigned ints so the buffer has recognizable contents.
# `data` must stay referenced until clFinish returns: the write is enqueued non-blocking.
data = memoryview(array.array('I', range(64)))
cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, BUF_NBYTES, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # block until the enqueued write has completed

# Dig the raw GPU pointer out of the OpenCL handle.
## Step 1: dump the memory at the handle and read its first 8 bytes as the descriptor address.
handle_addr = ctypes.addressof(cl_buf)
hexdump(to_mv(handle_addr, 0x40))
cl_buf_desc_ptr = to_mv(handle_addr, 8).cast('Q')[0]
## Step 2: dump the descriptor and read the 64-bit word at byte offset 0xA0 (word index 20),
## which holds the raw GPU pointer.
hexdump(to_mv(cl_buf_desc_ptr, 0x100))
desc_words = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')
rawbuf_ptr = desc_words[0xA0 // 8]

# Wrap the externally managed memory in a QCOM tensor and compute on it.
x = Tensor.from_blob(rawbuf_ptr, (8, 8), dtype=dtypes.int, device='QCOM')
y = (x + 1).numpy()
print(y)

# All calculations are finished, so it is safe to free the OpenCL object.
cl.clReleaseMemObject(cl_buf)
# all together with jit
@TinyJit
def calc(x):
  """Return `x + 2`; the function is wrapped by the TinyJit decorator."""
  return x + 2
# Call the jitted function four times, each time on a freshly created OpenCL buffer.
for i in range(4):
  # 2x2 elements of 4 bytes each.
  cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 2*2*4, None, status := ctypes.c_int32())
  assert status.value == 0

  # Fill the buffer with i, i+1, i+2, i+3 so every iteration has distinct input.
  # `data` must stay referenced until clFinish returns: the write is enqueued non-blocking.
  data = memoryview(array.array('I', [elem + i for elem in range(2*2)]))
  cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, 2*2*4, mv_address(data), 0, None, None)
  cl.clFinish(gdev.queue)  # wait for the write to complete

  # Read the descriptor address from the handle, then the raw GPU pointer from the
  # descriptor (64-bit word index 20, i.e. byte offset 0xA0).
  cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]
  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20]

  # `x=` is a keyword argument to calc, not an assignment to a module-level name.
  y = calc(x=Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).numpy()
  print(f'jit {i}\n', y)

  # All calculations are done, so it is safe to free the object.
  cl.clReleaseMemObject(cl_buf)
# now images!
h, w = 128, 128
img_format = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
status = ctypes.c_int32()
cl_img = cl.clCreateImage2D(gdev.context, cl.CL_MEM_READ_WRITE, img_format, w, h, 0, None, status)
assert status.value == 0

# Fill the image with increasing float values: h*w pixels, 4 floats (RGBA) per pixel.
# `data` must stay referenced until clFinish returns: the write is enqueued non-blocking.
data = memoryview(array.array('f', range(h * w * 4)))
origin = (ctypes.c_size_t * 3)(0, 0, 0)
region = (ctypes.c_size_t * 3)(w, h, 1)
cl.clEnqueueWriteImage(gdev.queue, cl_img, False, origin, region, 0, 0, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # block until the enqueued write has completed

# Dig the raw GPU pointer out of the OpenCL image handle.
## Step 1: dump the memory at the handle and read its first 8 bytes as the descriptor address.
handle_addr = ctypes.addressof(cl_img)
hexdump(to_mv(handle_addr, 0x40))
cl_buf_desc_ptr = to_mv(handle_addr, 8).cast('Q')[0]
## Step 2: dump the descriptor and read the 64-bit word at byte offset 0xA0 (word index 20),
## which holds the raw GPU pointer.
hexdump(to_mv(cl_buf_desc_ptr, 0x100))
desc_words = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')
rawbuf_ptr = desc_words[0xA0 // 8]

# Wrap the externally managed image memory in a QCOM tensor and compute on it.
# For reference, the image dtypes correspond to these OpenCL formats:
#   dtypes.imageh = cl.cl_image_format(cl.CL_RGBA, cl.CL_HALF_FLOAT)
#   dtypes.imagef = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
image_dtype = dtypes.imagef((h, w))
x = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=image_dtype, device='QCOM')
y = (x + 1).numpy()
print(y)

# All calculations are finished, so it is safe to free the OpenCL object.
cl.clReleaseMemObject(cl_img)