Skip to content

Commit 6ea84f9

Browse files
committed
opencl: refactor q4_0 get_tensor
1 parent f71c17f commit 6ea84f9

1 file changed

Lines changed: 14 additions & 57 deletions

File tree

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 14 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5672,8 +5672,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
56725672

56735673
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
56745674
if (use_adreno_kernels(backend_ctx, tensor)) {
5675-
cl_int err;
5676-
cl_kernel kernel;
5675+
ggml_cl_buffer buf_trans_q;
5676+
ggml_cl_buffer buf_trans_d;
5677+
ggml_cl_buffer buf_unpacked;
56775678

56785679
cl_int M = tensor->ne[1]; // ne01
56795680
cl_int K = tensor->ne[0]; // ne00
@@ -5685,72 +5686,28 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
56855686
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
56865687
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
56875688

5688-
cl_mem buf_trans_q;
5689-
cl_mem buf_trans_d;
5690-
5691-
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
5692-
size_q, NULL, &err), err));
5693-
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
5694-
size_d, NULL, &err), err));
5695-
5696-
kernel = backend_ctx->kernel_transpose_16_buf;
5697-
5698-
// transpose q back
5699-
cl_int stride_k_q = K/4;
5700-
size_t local_size_q[3] = {64, 1, 1};
5701-
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
5702-
5703-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
5704-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
5705-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
5706-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
5707-
5708-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5709-
global_size_q, local_size_q, 0, NULL, NULL));
5710-
5711-
// transpose scales back
5712-
cl_int stride_k_d = K/32;
5713-
size_t local_size_d[3] = {64, 1, 1};
5714-
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
5715-
5716-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
5717-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
5718-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
5719-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
5720-
5721-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5722-
global_size_d, local_size_d, 0, NULL, NULL));
5689+
buf_trans_q.allocate(backend_ctx->context, size_q);
5690+
buf_trans_d.allocate(backend_ctx->context, size_d);
5691+
buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
57235692

5724-
// unpack
5725-
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
5726-
ggml_nbytes(tensor), NULL, &err);
5727-
CL_CHECK(err);
5693+
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
5694+
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
57285695

57295696
cl_uchar mask_0F = 0x0F;
57305697
cl_uchar mask_F0 = 0xF0;
57315698

57325699
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
57335700
size_t local_work_size[] = {1, 1, 1};
57345701

5735-
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
5736-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
5737-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
5738-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
5702+
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
5703+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
5704+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer));
5705+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_unpacked.buffer));
57395706
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
57405707
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
57415708

5742-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5743-
global_work_size, local_work_size, 0, NULL, NULL));
5744-
5745-
// read back to host
5746-
CL_CHECK(clEnqueueReadBuffer(
5747-
queue, data_device, CL_TRUE, offset,
5748-
size, data, 0, NULL, NULL));
5749-
5750-
CL_CHECK(clReleaseMemObject(data_device));
5751-
CL_CHECK(clReleaseMemObject(buf_trans_q));
5752-
CL_CHECK(clReleaseMemObject(buf_trans_d));
5753-
5709+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5710+
CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
57545711
return;
57555712
}
57565713
#endif

0 commit comments

Comments
 (0)