88 lines
3.3 KiB
Python
88 lines
3.3 KiB
Python
import ctypes, array
|
|
from hexdump import hexdump
|
|
from tinygrad.runtime.ops_gpu import GPUDevice
|
|
from tinygrad.helpers import getenv, to_mv, mv_address
|
|
from tinygrad.dtype import dtypes
|
|
from tinygrad import Tensor, TinyJit
|
|
from tinygrad.runtime.autogen import opencl as cl
|
|
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
|
|
|
# Create a raw 0x100-byte OpenCL buffer on the GPU device.
gdev = GPUDevice()
status = ctypes.c_int32()
cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status)
assert status.value == 0

# Fill it with 64 consecutive unsigned ints, just for fun.
# The write is enqueued as non-blocking, so `data` has to stay alive until clFinish returns.
data = memoryview(array.array('I', range(64)))
cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, 0x100, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # wait for the write to complete

# Recover the raw GPU pointer that backs the OpenCL buffer.

## Buffer descriptor: the first 8 bytes stored at the address of `cl_buf` are read as its pointer.
cl_buf_addr = ctypes.addressof(cl_buf)
hexdump(to_mv(cl_buf_addr, 0x40))
cl_buf_desc_ptr = to_mv(cl_buf_addr, 8).cast('Q')[0]

## Device pointer: dump 0x100 bytes of the descriptor; byte offset 0xA0 holds the raw GPU pointer.
desc_mv = to_mv(cl_buf_desc_ptr, 0x100)
hexdump(desc_mv)
rawbuf_ptr = desc_mv.cast('Q')[0xA0 // 8]

# Wrap the externally managed buffer in a QCOM tensor and compute on it.
x = Tensor.from_blob(rawbuf_ptr, (8, 8), dtype=dtypes.int, device='QCOM')
y = (x + 1).numpy()
print(y)

# All calculations are done, so it is safe to free the object.
cl.clReleaseMemObject(cl_buf)
|
|
|
|
# all together with jit
|
|
@TinyJit
def calc(x):
  """Return `x + 2`; the function is wrapped by TinyJit."""
  return x + 2
|
|
|
|
for i in range(4):
  # Each iteration allocates a fresh 2*2*4-byte OpenCL buffer and fills it with
  # four consecutive unsigned ints starting at the iteration index.
  status = ctypes.c_int32()
  cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 2*2*4, None, status)
  assert status.value == 0
  data = memoryview(array.array('I', range(i, i + 2*2)))
  cl.clEnqueueWriteBuffer(gdev.queue, cl_buf, False, 0, 2*2*4, mv_address(data), 0, None, None)
  cl.clFinish(gdev.queue)  # wait for the write to complete

  # Descriptor pointer first, then the raw GPU pointer at byte offset 0xA0 inside the descriptor.
  cl_buf_desc_ptr = to_mv(ctypes.addressof(cl_buf), 8).cast('Q')[0]
  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[0xA0 // 8]

  # Run the jitted function on a tensor that wraps the externally managed buffer.
  y = calc(x=Tensor.from_blob(rawbuf_ptr, (2, 2), dtype=dtypes.int, device='QCOM')).numpy()
  print(f'jit {i}\n', y)

  # All calculations are done, so it is safe to free the object.
  cl.clReleaseMemObject(cl_buf)
|
|
|
|
# Now the same thing with images!

h, w = 128, 128
img_format = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
status = ctypes.c_int32()
cl_img = cl.clCreateImage2D(gdev.context, cl.CL_MEM_READ_WRITE, img_format, w, h, 0, None, status)
assert status.value == 0

# Fill the whole image with consecutive float values (4 channels per pixel), just for fun.
# The write is enqueued as non-blocking, so `data` has to stay alive until clFinish returns.
data = memoryview(array.array('f', range(h*w*4)))
size3 = ctypes.c_size_t * 3
origin, region = size3(0, 0, 0), size3(w, h, 1)
cl.clEnqueueWriteImage(gdev.queue, cl_img, False, origin, region, 0, 0, mv_address(data), 0, None, None)
cl.clFinish(gdev.queue)  # wait for the write to complete

# Recover the raw GPU pointer that backs the OpenCL image.

## Image descriptor: the first 8 bytes stored at the address of `cl_img` are read as its pointer.
cl_img_addr = ctypes.addressof(cl_img)
hexdump(to_mv(cl_img_addr, 0x40))
cl_buf_desc_ptr = to_mv(cl_img_addr, 8).cast('Q')[0]

## Device pointer: dump 0x100 bytes of the descriptor; byte offset 0xA0 holds the raw GPU pointer.
desc_mv = to_mv(cl_buf_desc_ptr, 0x100)
hexdump(desc_mv)
rawbuf_ptr = desc_mv.cast('Q')[0xA0 // 8]

# Wrap the externally managed image in a QCOM tensor and compute on it.
# Image dtype <-> OpenCL image format correspondence:
#   dtypes.imageh = cl.cl_image_format(cl.CL_RGBA, cl.CL_HALF_FLOAT)
#   dtypes.imagef = cl.cl_image_format(cl.CL_RGBA, cl.CL_FLOAT)
x = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=dtypes.imagef((h,w)), device='QCOM')
y = (x + 1).numpy()
print(y)

# All calculations are done, so it is safe to free the object.
cl.clReleaseMemObject(cl_img)
|