Skip to content

Commit e19220e

Browse files
committed
[update] llm_vlm add pause action & perf post process
1 parent 1f30c63 commit e19220e

3 files changed

Lines changed: 382 additions & 31 deletions

File tree

projects/llm_framework/main_vlm/src/main.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ class llm_task {
125125
CONFIG_AUTO_SET(file_body["mode_param"], b_use_mmap_load_embed);
126126
CONFIG_AUTO_SET(file_body["mode_param"], b_dynamic_load_axmodel_layer);
127127
CONFIG_AUTO_SET(file_body["mode_param"], max_token_len);
128+
CONFIG_AUTO_SET(file_body["mode_param"], temperature);
129+
CONFIG_AUTO_SET(file_body["mode_param"], top_p);
128130

129131
if (mode_config_.filename_tokenizer_model.find("http:") != std::string::npos) {
130132
std::string tokenizer_file;
@@ -171,7 +173,11 @@ class llm_task {
171173
}
172174
};
173175
lLaMa_ = std::make_unique<LLM>();
174-
if (!lLaMa_->Init(mode_config_)) return -2;
176+
if (!lLaMa_->Init(mode_config_)) {
177+
lLaMa_->Deinit();
178+
lLaMa_.reset();
179+
return -2;
180+
}
175181

176182
} catch (...) {
177183
SLOGE("config false");
@@ -293,6 +299,33 @@ class llm_llm : public StackFlow {
293299
}
294300
}
295301

302+
void task_pause(const std::weak_ptr<llm_task> llm_task_obj_weak,
303+
const std::weak_ptr<llm_channel_obj> llm_channel_weak)
304+
{
305+
auto llm_task_obj = llm_task_obj_weak.lock();
306+
auto llm_channel = llm_channel_weak.lock();
307+
if (!(llm_task_obj && llm_channel)) {
308+
return;
309+
}
310+
llm_task_obj->lLaMa_->Stop();
311+
}
312+
313+
void pause(const std::string &work_id, const std::string &object, const std::string &data) override
314+
{
315+
SLOGI("llm_asr::work:%s", data.c_str());
316+
317+
nlohmann::json error_body;
318+
int work_id_num = sample_get_work_id_num(work_id);
319+
if (llm_task_.find(work_id_num) == llm_task_.end()) {
320+
error_body["code"] = -6;
321+
error_body["message"] = "Unit Does Not Exist";
322+
send("None", "None", error_body, work_id);
323+
return;
324+
}
325+
task_pause(llm_task_[work_id_num], get_channel(work_id_num));
326+
send("None", "None", LLM_NO_ERROR, work_id);
327+
}
328+
296329
void task_user_data(const std::weak_ptr<llm_task> llm_task_obj_weak,
297330
const std::weak_ptr<llm_channel_obj> llm_channel_weak, const std::string &object,
298331
const std::string &data)

projects/llm_framework/main_vlm/src/runner/LLM.hpp

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <cstring>  // std::memcpy for safe bf16 -> fp32 conversion

#include "timer.hpp"
#include "opencv2/opencv.hpp"
#include "ax_sys_api.h"
#include "LLMPostprocess.hpp"
1516

1617
typedef std::function<void(int*, int, const char*, float, void*)> LLMRuningCallback;
1718

@@ -26,6 +27,8 @@ struct LLMAttrType
2627

2728
std::string filename_post_axmodel = "tinyllama-int8/tinyllama_post.axmodel";
2829

30+
bool b_use_topk = false;
31+
2932
std::string filename_vpm_encoder_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel";
3033
std::string filename_vpm_resampler_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel";
3134
int vpm_width = 280;
@@ -44,13 +47,13 @@ struct LLMAttrType
4447
int kv_cache_num = 1024; // auto calc
4548
int kv_cache_size = 256; // auto calc
4649

50+
float temperature = 0.7f;
51+
float top_p = 0.9f;
4752
bool b_use_mmap_load_embed = false;
4853
bool b_dynamic_load_axmodel_layer = false;
4954

5055
bool b_use_mmap_load_layer = true;
5156

52-
bool b_use_topk = false;
53-
5457
// bool b_live_print = true;
5558
LLMRuningCallback runing_callback = nullptr;
5659
void *reserve = nullptr;
@@ -84,41 +87,37 @@ class LLM
8487

8588
bool b_stop = false;
8689

87-
static int FindMax(unsigned short *p, int n, float *val = 0)
90+
int post_process(unsigned short *p, int n, std::vector<int> &history, float *val = 0)
8891
{
89-
float max_val = -MAXFLOAT;
90-
int max_index = 0;
92+
std::vector<float> logits(n);
9193
for (int i = 0; i < n; i++)
9294
{
9395
unsigned int proc = p[i] << 16;
94-
float tmp = *reinterpret_cast<float *>(&proc);
95-
if (tmp > max_val)
96-
{
97-
max_val = tmp;
98-
max_index = i;
99-
}
96+
logits[i] = *reinterpret_cast<float *>(&proc);
10097
}
98+
LLMPostprocess postprocess;
99+
postprocess.set_temperature(true, _attr.temperature);
100+
postprocess.set_repetition_penalty(true, 1.2f);
101+
// postprocess.set_top_k_sampling(true, 40);
102+
postprocess.set_top_p_sampling(true, _attr.top_p);
101103

102-
// for (int i = 0; i < n; i += 4)
103-
// {
104-
// uint16x4_t bf16_data = vld1_u16(&p[i]);
105-
// uint32x4_t float_data = vmovl_u16(bf16_data);
106-
// float32x4_t tmp_floats = vreinterpretq_f32_u32(vshlq_n_u32(float_data, 16));
104+
return postprocess.apply(logits, history);
107105

108-
// for (int j = 0; j < 4; j++)
106+
// float max_val = -MAXFLOAT;
107+
// int max_index = 0;
108+
// for (int i = 0; i < n; i++)
109+
// {
110+
// unsigned int proc = p[i] << 16;
111+
// float tmp = *reinterpret_cast<float *>(&proc);
112+
// if (tmp > max_val)
109113
// {
110-
// float tmp = vgetq_lane_f32(tmp_floats, j);
111-
// if (tmp > max_val)
112-
// {
113-
// max_val = tmp;
114-
// max_index = i + j;
115-
// }
114+
// max_val = tmp;
115+
// max_index = i;
116116
// }
117117
// }
118-
119-
if (val)
120-
*val = max_val;
121-
return max_index;
118+
// if (val)
119+
// *val = max_val;
120+
// return max_index;
122121
}
123122

124123
public:
@@ -552,7 +551,7 @@ class LLM
552551
AX_SYS_MinvalidateCache(output_post.phyAddr, output_post.pVirAddr, output_post.nSize);
553552
unsigned short *post_out = (unsigned short *)output_post.pVirAddr;
554553
float max_val = -MAXFLOAT;
555-
max_index = FindMax(post_out, _attr.tokens_embed_num, &max_val);
554+
max_index = post_process(post_out, _attr.tokens_embed_num, token_ids, &max_val);
556555
}
557556
next_token = max_index;
558557

@@ -654,7 +653,7 @@ class LLM
654653
AX_SYS_MinvalidateCache(output_post.phyAddr, output_post.pVirAddr, output_post.nSize);
655654
unsigned short *post_out = (unsigned short *)output_post.pVirAddr;
656655
float max_val = -MAXFLOAT;
657-
max_index = FindMax(post_out, _attr.tokens_embed_num, &max_val);
656+
max_index = post_process(post_out, _attr.tokens_embed_num, token_ids, &max_val);
658657
}
659658
next_token = max_index;
660659

@@ -676,7 +675,7 @@ class LLM
676675
if (_attr.runing_callback)
677676
{
678677
cached_token.push_back(max_index);
679-
if (cached_token.size() >= 3)
678+
if (cached_token.size() >= 5)
680679
{
681680
float t_cost_ms = t_cost.cost();
682681
float token_per_sec = token_ids.size() / (t_cost_ms / 1000);

0 commit comments

Comments
 (0)