Skip to content

Commit f71c17f

Browse files
committed
opencl: simplify adreno q4_0 set_tensor
1 parent c0b1354 commit f71c17f

1 file changed

Lines changed: 7 additions & 156 deletions

File tree

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 7 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -4938,164 +4938,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
49384938
// Only do transpose for large, non batched matrix
49394939
// TODO: use preallocated images instead of sub-buffer then image
49404940
if (use_adreno_kernels(backend_ctx, tensor)) {
4941-
// <----------------------------------------------------------------------------------> //
4942-
// start transpose
4943-
// <----------------------------------------------------------------------------------> //
4944-
int M = tensor->ne[1]; // ne01
4945-
int K = tensor->ne[0]; // ne00
4946-
4947-
//For matrix-vector multiplication kernel, we assume K is a multiple of 32
4948-
GGML_ASSERT(K % 32 == 0);
4949-
//For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
4950-
GGML_ASSERT(M % 4 == 0);
4951-
4952-
// transpose is out of place, so we need to allocate transposed buffers
4953-
// <----------------------------------------------------------------------------------> //
4954-
// use sub_buffer of max buffer size instead
4955-
4956-
size_t q_size_bytes = K * M / 8 * sizeof(float);
4957-
backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
4958-
4959-
cl_buffer_region region;
4960-
region.origin = 0;
4961-
region.size = q_size_bytes;
4962-
cl_mem qT_d = clCreateSubBuffer(
4963-
backend_ctx->prealloc_quant_trans.buffer,
4964-
0,
4965-
CL_BUFFER_CREATE_TYPE_REGION,
4966-
&region,
4967-
&err);
4968-
CL_CHECK(err);
4969-
4970-
bool K_tile_trans = true;
4971-
if ((K / 32) % 4 != 0){
4972-
K_tile_trans =false;
4973-
}
4974-
4975-
size_t d_size_bytes = M * (K / 32) * 2;
4976-
backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
4977-
4978-
region.origin = 0;
4979-
region.size = d_size_bytes;
4980-
cl_mem dT_d = clCreateSubBuffer(
4981-
backend_ctx->prealloc_scales_trans.buffer,
4982-
0,
4983-
CL_BUFFER_CREATE_TYPE_REGION,
4984-
&region,
4985-
&err);
4986-
CL_CHECK(err);
4987-
4988-
// <----------------------------------------------------------------------------------> //
4989-
4990-
4991-
// create images from the buffers
4992-
// <----------------------------------------------------------------------------------> //
4993-
cl_mem q_d_image1D;
4994-
cl_mem d_d_image1D;
4995-
cl_mem qT_d_image1D;
4996-
cl_mem dT_d_image1D;
4997-
4998-
cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
4999-
cl_image_desc img_desc_1d;
5000-
5001-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5002-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5003-
img_desc_1d.image_width = M * K / 4 / 4;
5004-
img_desc_1d.buffer = extra->q;
5005-
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5006-
CL_CHECK(err);
5007-
5008-
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5009-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5010-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5011-
img_desc_1d.image_width = M * K / 4 / 4;
5012-
img_desc_1d.buffer = qT_d;
5013-
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5014-
CL_CHECK(err);
5015-
5016-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5017-
if (K_tile_trans) {
5018-
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5019-
img_desc_1d.image_width = M * K / 32 / 4;
5020-
} else {
5021-
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
5022-
img_desc_1d.image_width = M * K / 32;
5023-
}
5024-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5025-
img_desc_1d.buffer = extra->d;
5026-
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5027-
CL_CHECK(err);
5028-
5029-
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5030-
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5031-
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5032-
img_desc_1d.image_width = M * K / 32 / 4;
5033-
img_desc_1d.buffer = dT_d;
5034-
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5035-
CL_CHECK(err);
5036-
// <----------------------------------------------------------------------------------> //
5037-
5038-
// set up and call the transpose kernels
5039-
// <----------------------------------------------------------------------------------> //
5040-
// weights
5041-
int height_q = M / 4;
5042-
int width_q = K / 4 / 4;
5043-
kernel = backend_ctx->kernel_transpose_16;
5044-
5045-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
5046-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
5047-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
5048-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
5049-
5050-
size_t local_size_q[3] = {4, 16, 1};
5051-
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
5052-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
5053-
CL_CHECK(clWaitForEvents(1, &evt));
5054-
5055-
// scales
5056-
int height_s = M / 4;
5057-
int width_s = K / 32 / 4;
5058-
5059-
kernel = backend_ctx->kernel_transpose_16;
5060-
if (!K_tile_trans) {
5061-
kernel = backend_ctx->kernel_transpose_16_4x1;
5062-
width_s = K / 32;
5063-
}
5064-
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
5065-
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
5066-
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
5067-
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
5068-
5069-
size_t local_size_s[3] = {4, 16, 1};
5070-
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
5071-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
5072-
CL_CHECK(clWaitForEvents(1, &evt));
5073-
// <----------------------------------------------------------------------------------> //
4941+
int M = tensor->ne[1];
4942+
int K = tensor->ne[0];
50744943

5075-
// copy transposed buffer contents to original buffers
5076-
// <----------------------------------------------------------------------------------> //
5077-
// weights
5078-
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
5079-
CL_CHECK(clWaitForEvents(1, &evt));
4944+
GGML_ASSERT(K % 32 == 0);
50804945

5081-
// scales
5082-
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
5083-
CL_CHECK(clWaitForEvents(1, &evt));
5084-
// <----------------------------------------------------------------------------------> //
5085-
5086-
// deallocate transpose buffers
5087-
// <----------------------------------------------------------------------------------> //
5088-
CL_CHECK(clReleaseMemObject(qT_d));
5089-
CL_CHECK(clReleaseMemObject(dT_d));
5090-
5091-
// deallocate temporary images
5092-
CL_CHECK(clReleaseMemObject(q_d_image1D));
5093-
CL_CHECK(clReleaseMemObject(d_d_image1D));
5094-
CL_CHECK(clReleaseMemObject(qT_d_image1D));
5095-
CL_CHECK(clReleaseMemObject(dT_d_image1D));
5096-
// <----------------------------------------------------------------------------------> //
5097-
// end transpose
5098-
// <----------------------------------------------------------------------------------> //
4946+
// Transpose q as ushort
4947+
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
4948+
// Transpose d as ushort
4949+
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
50994950
}
51004951
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
51014952

0 commit comments

Comments
 (0)