@@ -739,17 +739,25 @@ struct LLM::Impl {
                    std::vector<std::vector<unsigned short>> &kv_v,
                    int _precompute_len, int input_num_token)
     {
+        // Always start from the largest group by default, then pick the first group that fits.
+        _attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
         for (size_t i = 0; i < _attr.prefill_max_kv_cache_num_grp.size(); i++)
         {
             if (_precompute_len + input_num_token <= _attr.prefill_max_kv_cache_num_grp[i]) { _attr.prefill_grpid = (int)i + 1; break; }
         }
         int kv_cache_num = _attr.prefill_max_kv_cache_num_grp[_attr.prefill_grpid - 1];
         ALOGI("prefill_grpid:%d kv_cache_num:%d precompute_len:%d input_num_token:%d", _attr.prefill_grpid, kv_cache_num, _precompute_len, input_num_token);
-        _attr.prefill_max_token_num = ALIGN_DOWN(_attr.prefill_max_token_num - _precompute_len, _attr.prefill_token_num);
-        ALOGI("current prefill_max_token_num:%d", _attr.prefill_max_token_num);
-        if (_precompute_len == 0) { ALOGI("first run"); return 0; }
+        // Remaining prefill budget should be derived from the model capacity, not accumulated across calls.
+        // Otherwise, a failed prefill (e.g. context overflow) can make it negative and break `/reset`.
+        const int max_cap = _attr.prefill_max_kv_cache_num_grp.empty() ? 0 : _attr.prefill_max_kv_cache_num_grp.back();
+        int remaining = max_cap - _precompute_len;
+        if (remaining < 0) remaining = 0;
+        remaining = ALIGN_DOWN(remaining, _attr.prefill_token_num);
+        _attr.prefill_max_token_num = remaining;
+        ALOGI("current prefill_max_token_num:%d", remaining);
         if (_precompute_len + input_num_token > kv_cache_num) { ALOGE("precompute_len(%d) + input_num_token(%d) > kv_cache_num(%d)", _precompute_len, input_num_token, kv_cache_num); return -1; }
-        if (input_num_token > _attr.prefill_max_token_num) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, _attr.prefill_max_token_num); return -1; }
+        if (input_num_token > remaining) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, remaining); return -1; }
+        if (_precompute_len == 0) { ALOGI("first run"); return 0; }
         if (!b_os_kvcache) return 0;
         if (kv_k.size() != kv_v.size() || (int)kv_k.size() != _attr.axmodel_num) { ALOGE("kv cache size mismatch"); return -1; }
         for (int i = 0; i < _attr.axmodel_num; i++)
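
The hunk above changes two things in `SetKVCache`: the prefill group now defaults to the largest group before the first-fit scan, and the remaining prefill budget is recomputed from the model's maximum capacity on every call instead of being decremented in place. The standalone sketch below models that logic under assumptions: the group sizes, the `ALIGN_DOWN` definition (round down to a multiple), and the helper names `pick_group` / `remaining_budget` are illustrative, not part of the library.

```cpp
// Minimal, standalone sketch of the group selection and prefill-budget logic above.
// The group sizes and the ALIGN_DOWN definition here are assumptions for illustration.
#include <cstdio>
#include <vector>

#define ALIGN_DOWN(x, a) (((x) / (a)) * (a))

// Pick the first KV-cache group that can hold precompute_len + input_num_token;
// default to the largest group (index grps.size()) when nothing smaller fits.
static int pick_group(const std::vector<int> &grps, int precompute_len, int input_num_token)
{
    int grpid = (int)grps.size();
    for (size_t i = 0; i < grps.size(); i++)
    {
        if (precompute_len + input_num_token <= grps[i]) { grpid = (int)i + 1; break; }
    }
    return grpid;
}

// Remaining prefill budget: model capacity minus what is already cached,
// clamped at zero and rounded down to a whole number of prefill chunks.
static int remaining_budget(const std::vector<int> &grps, int precompute_len, int prefill_token_num)
{
    const int max_cap = grps.empty() ? 0 : grps.back();
    int remaining = max_cap - precompute_len;
    if (remaining < 0) remaining = 0;
    return ALIGN_DOWN(remaining, prefill_token_num);
}

int main()
{
    const std::vector<int> grps = {128, 256, 512, 1024};            // hypothetical group sizes
    printf("grpid=%d\n", pick_group(grps, 300, 100));               // -> 3 (the 512 group fits)
    printf("remaining=%d\n", remaining_budget(grps, 300, 128));     // -> 640 (1024 - 300, rounded down)
    printf("remaining=%d\n", remaining_budget(grps, 2000, 128));    // -> 0 (overflowed, clamped)
    return 0;
}
```

Because the budget is recomputed from the largest group each call, a turn that overflows the context leaves it at 0 rather than at a negative value, which is what previously broke a subsequent `/reset`.
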
@@ -777,6 +785,11 @@ struct LLM::Impl {
     void ResetKVCache()
     {
         last_tokens_ids.clear(); k_caches.clear(); v_caches.clear(); precompute_len = 0;
+        _attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
+        if (!_attr.prefill_max_kv_cache_num_grp.empty())
+        {
+            _attr.prefill_max_token_num = _attr.prefill_max_kv_cache_num_grp.back();
+        }
         for (int i = 0; i < _attr.axmodel_num; i++)
         {
             auto &lyr = llama_layers[i]; int devid = LLM_DEVID(lyr);
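
`ResetKVCache` now also restores the prefill group id and token budget to their defaults, so state left behind by a failed prefill does not leak into the next conversation. A minimal sketch of that invariant, with a hypothetical `Attr` struct and group sizes standing in for the real ones:

```cpp
// Sketch of the reset invariant: after a failed prefill the budget may be 0,
// but a reset must bring it back to the full model capacity (the largest group).
// The Attr struct and the group sizes below are assumptions for illustration.
#include <cassert>
#include <vector>

struct Attr
{
    std::vector<int> prefill_max_kv_cache_num_grp = {128, 256, 512, 1024}; // hypothetical
    int prefill_grpid = 4;
    int prefill_max_token_num = 1024;
};

static void reset_prefill_state(Attr &attr)
{
    attr.prefill_grpid = (int)attr.prefill_max_kv_cache_num_grp.size();
    if (!attr.prefill_max_kv_cache_num_grp.empty())
        attr.prefill_max_token_num = attr.prefill_max_kv_cache_num_grp.back();
}

int main()
{
    Attr attr;
    attr.prefill_max_token_num = 0; // state left behind by a context-overflow failure
    reset_prefill_state(attr);      // what ResetKVCache now does in addition to clearing caches
    assert(attr.prefill_max_token_num == 1024 && attr.prefill_grpid == 4);
    return 0;
}
```
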
@@ -1194,7 +1207,12 @@ struct LLM::Impl {
             if (!new_tokens.empty()) { precompute_len = (int)new_tokens.size() - 1; tokens_diff = {new_tokens.back()}; }
             else { ResetKVCache(); precompute_len = 0; }
         }
-        SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
+        const int kv_ret = SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
+        if (kv_ret != 0)
+        {
+            ALOGE("SetKVCache failed");
+            return history;
+        }
         std::vector<unsigned short> out_embed(tokens_diff.size() * _attr.tokens_embed_size);
         for (size_t i = 0; i < tokens_diff.size(); i++)
         {
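
Finally, the caller now checks the `SetKVCache` return code and returns the unchanged history instead of continuing into the embedding and prefill path with an inconsistent cache. A hedged sketch of that call pattern, with `set_kv_cache` and `run_turn` as hypothetical stand-ins rather than the library API:

```cpp
// Sketch of the error-propagation pattern above: bail out before touching the
// embedding/prefill path when the KV cache could not be set up.
#include <cstdio>
#include <string>

static int set_kv_cache(int precompute_len, int input_num_token, int kv_cache_num)
{
    if (precompute_len + input_num_token > kv_cache_num) { fprintf(stderr, "context overflow\n"); return -1; }
    return 0;
}

static std::string run_turn(const std::string &history, int precompute_len, int input_num_token)
{
    const int kv_ret = set_kv_cache(precompute_len, input_num_token, 1024);
    if (kv_ret != 0)
    {
        // Return the unchanged history instead of prefilling with an inconsistent cache.
        return history;
    }
    // ... embedding lookup and prefill would follow here ...
    return history + " <new tokens>";
}

int main()
{
    printf("%s\n", run_turn("hi", 0, 8).c_str());    // proceeds
    printf("%s\n", run_turn("hi", 1020, 8).c_str()); // rejected, history unchanged
    return 0;
}
```
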