@@ -5672,8 +5672,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
56725672
56735673#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
56745674 if (use_adreno_kernels(backend_ctx, tensor)) {
5675- cl_int err;
5676- cl_kernel kernel;
5675+ ggml_cl_buffer buf_trans_q;
5676+ ggml_cl_buffer buf_trans_d;
5677+ ggml_cl_buffer buf_unpacked;
56775678
56785679 cl_int M = tensor->ne[1]; // ne01
56795680 cl_int K = tensor->ne[0]; // ne00
@@ -5685,72 +5686,28 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
56855686 size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
56865687 GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
56875688
5688- cl_mem buf_trans_q;
5689- cl_mem buf_trans_d;
5690-
5691- CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
5692- size_q, NULL, &err), err));
5693- CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
5694- size_d, NULL, &err), err));
5695-
5696- kernel = backend_ctx->kernel_transpose_16_buf;
5697-
5698- // transpose q back
5699- cl_int stride_k_q = K/4;
5700- size_t local_size_q[3] = {64, 1, 1};
5701- size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
5702-
5703- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
5704- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
5705- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
5706- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
5707-
5708- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5709- global_size_q, local_size_q, 0, NULL, NULL));
5710-
5711- // transpose scales back
5712- cl_int stride_k_d = K/32;
5713- size_t local_size_d[3] = {64, 1, 1};
5714- size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
5715-
5716- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
5717- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
5718- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
5719- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
5720-
5721- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5722- global_size_d, local_size_d, 0, NULL, NULL));
5689+ buf_trans_q.allocate(backend_ctx->context, size_q);
5690+ buf_trans_d.allocate(backend_ctx->context, size_d);
5691+ buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
57235692
5724- // unpack
5725- cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
5726- ggml_nbytes(tensor), NULL, &err);
5727- CL_CHECK(err);
5693+ transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
5694+ transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
57285695
57295696 cl_uchar mask_0F = 0x0F;
57305697 cl_uchar mask_F0 = 0xF0;
57315698
57325699 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
57335700 size_t local_work_size[] = {1, 1, 1};
57345701
5735- kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
5736- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
5737- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
5738- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device ));
5702+ cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
5703+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer ));
5704+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer ));
5705+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_unpacked.buffer ));
57395706 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
57405707 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
57415708
5742- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5743- global_work_size, local_work_size, 0, NULL, NULL));
5744-
5745- // read back to host
5746- CL_CHECK(clEnqueueReadBuffer(
5747- queue, data_device, CL_TRUE, offset,
5748- size, data, 0, NULL, NULL));
5749-
5750- CL_CHECK(clReleaseMemObject(data_device));
5751- CL_CHECK(clReleaseMemObject(buf_trans_q));
5752- CL_CHECK(clReleaseMemObject(buf_trans_d));
5753-
5709+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5710+ CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
57545711 return;
57555712 }
57565713#endif
0 commit comments