@@ -4938,164 +4938,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
49384938 // Only do transpose for large, non batched matrix
49394939 // TODO: use preallocated images instead of sub-buffer then image
49404940 if (use_adreno_kernels(backend_ctx, tensor)) {
4941- // <----------------------------------------------------------------------------------> //
4942- // start transpose
4943- // <----------------------------------------------------------------------------------> //
4944- int M = tensor->ne[1]; // ne01
4945- int K = tensor->ne[0]; // ne00
4946-
4947- //For matrix-vector multiplication kernel, we assume K is a multiple of 32
4948- GGML_ASSERT(K % 32 == 0);
4949- //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
4950- GGML_ASSERT(M % 4 == 0);
4951-
4952- // transpose is out of place, so we need to allocate transposed buffers
4953- // <----------------------------------------------------------------------------------> //
4954- // use sub_buffer of max buffer size instead
4955-
4956- size_t q_size_bytes = K * M / 8 * sizeof(float);
4957- backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
4958-
4959- cl_buffer_region region;
4960- region.origin = 0;
4961- region.size = q_size_bytes;
4962- cl_mem qT_d = clCreateSubBuffer(
4963- backend_ctx->prealloc_quant_trans.buffer,
4964- 0,
4965- CL_BUFFER_CREATE_TYPE_REGION,
4966- ®ion,
4967- &err);
4968- CL_CHECK(err);
4969-
4970- bool K_tile_trans = true;
4971- if ((K / 32) % 4 != 0){
4972- K_tile_trans =false;
4973- }
4974-
4975- size_t d_size_bytes = M * (K / 32) * 2;
4976- backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
4977-
4978- region.origin = 0;
4979- region.size = d_size_bytes;
4980- cl_mem dT_d = clCreateSubBuffer(
4981- backend_ctx->prealloc_scales_trans.buffer,
4982- 0,
4983- CL_BUFFER_CREATE_TYPE_REGION,
4984-	            &region,
4985- &err);
4986- CL_CHECK(err);
4987-
4988- // <----------------------------------------------------------------------------------> //
4989-
4990-
4991- // create images from the buffers
4992- // <----------------------------------------------------------------------------------> //
4993- cl_mem q_d_image1D;
4994- cl_mem d_d_image1D;
4995- cl_mem qT_d_image1D;
4996- cl_mem dT_d_image1D;
4997-
4998- cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
4999- cl_image_desc img_desc_1d;
5000-
5001- memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5002- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5003- img_desc_1d.image_width = M * K / 4 / 4;
5004- img_desc_1d.buffer = extra->q;
5005- q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5006- CL_CHECK(err);
5007-
5008- img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5009- memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5010- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5011- img_desc_1d.image_width = M * K / 4 / 4;
5012- img_desc_1d.buffer = qT_d;
5013- qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5014- CL_CHECK(err);
5015-
5016- memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5017- if (K_tile_trans) {
5018- img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5019- img_desc_1d.image_width = M * K / 32 / 4;
5020- } else {
5021- img_fmt_1d = { CL_R, CL_HALF_FLOAT };
5022- img_desc_1d.image_width = M * K / 32;
5023- }
5024- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5025- img_desc_1d.buffer = extra->d;
5026- d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5027- CL_CHECK(err);
5028-
5029- img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
5030- memset(&img_desc_1d, 0, sizeof(img_desc_1d));
5031- img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
5032- img_desc_1d.image_width = M * K / 32 / 4;
5033- img_desc_1d.buffer = dT_d;
5034- dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
5035- CL_CHECK(err);
5036- // <----------------------------------------------------------------------------------> //
5037-
5038- // set up and call the transpose kernels
5039- // <----------------------------------------------------------------------------------> //
5040- // weights
5041- int height_q = M / 4;
5042- int width_q = K / 4 / 4;
5043- kernel = backend_ctx->kernel_transpose_16;
5044-
5045- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
5046- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
5047- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
5048- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
5049-
5050- size_t local_size_q[3] = {4, 16, 1};
5051- size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
5052- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
5053- CL_CHECK(clWaitForEvents(1, &evt));
5054-
5055- // scales
5056- int height_s = M / 4;
5057- int width_s = K / 32 / 4;
5058-
5059- kernel = backend_ctx->kernel_transpose_16;
5060- if (!K_tile_trans) {
5061- kernel = backend_ctx->kernel_transpose_16_4x1;
5062- width_s = K / 32;
5063- }
5064- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
5065- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
5066- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
5067- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
5068-
5069- size_t local_size_s[3] = {4, 16, 1};
5070- size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
5071- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
5072- CL_CHECK(clWaitForEvents(1, &evt));
5073- // <----------------------------------------------------------------------------------> //
4941+ int M = tensor->ne[1];
4942+ int K = tensor->ne[0];
50744943
5075- // copy transposed buffer contents to original buffers
5076- // <----------------------------------------------------------------------------------> //
5077- // weights
5078- CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
5079- CL_CHECK(clWaitForEvents(1, &evt));
4944+ GGML_ASSERT(K % 32 == 0);
50804945
5081- // scales
5082- CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
5083- CL_CHECK(clWaitForEvents(1, &evt));
5084- // <----------------------------------------------------------------------------------> //
5085-
5086- // deallocate transpose buffers
5087- // <----------------------------------------------------------------------------------> //
5088- CL_CHECK(clReleaseMemObject(qT_d));
5089- CL_CHECK(clReleaseMemObject(dT_d));
5090-
5091- // deallocate temporary images
5092- CL_CHECK(clReleaseMemObject(q_d_image1D));
5093- CL_CHECK(clReleaseMemObject(d_d_image1D));
5094- CL_CHECK(clReleaseMemObject(qT_d_image1D));
5095- CL_CHECK(clReleaseMemObject(dT_d_image1D));
5096- // <----------------------------------------------------------------------------------> //
5097- // end transpose
5098- // <----------------------------------------------------------------------------------> //
4946+ // Transpose q as ushort
4947+ transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
4948+ // Transpose d as ushort
4949+ transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
50994950 }
51004951 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
51014952
0 commit comments