Skip to content

Commit 7696701

Browse files
author
LittleMouse
committed
feat: Add PaddleOCR-VL support and enhance cache handling
- Updated VLMType to include PaddleOCRVL (ID 6).
- Implemented PaddleOCRVL image processing in image_processor.cpp and its header.
- Enhanced disk cache loading to guard against corrupted files, and added limits on key length and block count.
- Introduced a port availability check in net_utils for better error handling.
- Updated the vision module to handle the PaddleOCR-VL input format and normalization.
- Improved error handling during cache save operations to prevent corruption.
- Adjusted the mRoPE configuration to accommodate PaddleOCR-VL.
1 parent 0fe032e commit 7696701

12 files changed

Lines changed: 552 additions & 82 deletions

File tree

projects/llm_framework/SConstruct

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import shutil
55
os.environ['SDK_PATH'] = os.path.normpath(str(Path(os.getcwd())/'..'/'..'/'SDK'))
66
os.environ['EXT_COMPONENTS_PATH'] = os.path.normpath(str(Path(os.getcwd())/'..'/'..'/'ext_components'))
77

8-
version = 'v0.1.7'
8+
version = 'v0.1.8'
99
static_lib = 'static_lib'
1010
update = False
1111

projects/llm_framework/main_llm2/src/main.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ class llm_task {
109109
return {};
110110
}
111111

112-
const auto now = std::chrono::steady_clock::now().time_since_epoch().count();
112+
const auto now = std::chrono::steady_clock::now().time_since_epoch().count();
113113
const auto path = tmpdir / ("img_" + std::to_string(now) + "_" + std::to_string(getpid()) + ".jpg");
114114

115115
std::ofstream ofs(path, std::ios::binary);
@@ -153,7 +153,10 @@ class llm_task {
153153
model_ = config_body.at("model");
154154
response_format_ = config_body.at("response_format");
155155
enoutput_ = config_body.at("enoutput");
156-
prompt_ = config_body.at("prompt");
156+
157+
if (config_body.contains("prompt")) {
158+
mode_config_.system_prompt = config_body.at("prompt").get<std::string>();
159+
}
157160

158161
if (config_body.contains("input")) {
159162
if (config_body["input"].is_string()) {
@@ -200,6 +203,9 @@ class llm_task {
200203
SLOGI("base_model %s", base_model.c_str());
201204

202205
CONFIG_AUTO_SET(file_body["mode_param"], system_prompt);
206+
if (!config_body.contains("system_prompt") && config_body.contains("prompt")) {
207+
mode_config_.system_prompt = config_body.at("prompt").get<std::string>();
208+
}
203209

204210
CONFIG_AUTO_SET(file_body["mode_param"], template_filename_axmodel);
205211
CONFIG_AUTO_SET(file_body["mode_param"], axmodel_num);
@@ -271,7 +277,8 @@ class llm_task {
271277
};
272278

273279
if (!parse_vlm_type(config_body, "vlm_type") && !parse_vlm_type(config_body, "VLM_TYPE")) {
274-
parse_vlm_type(file_body["mode_param"], "vlm_type") || parse_vlm_type(file_body["mode_param"], "VLM_TYPE");
280+
parse_vlm_type(file_body["mode_param"], "vlm_type") ||
281+
parse_vlm_type(file_body["mode_param"], "VLM_TYPE");
275282
}
276283

277284
mode_config_.template_filename_axmodel = base_model + mode_config_.template_filename_axmodel;

projects/llm_framework/main_llm2/src/runner/LLM.cpp

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -739,17 +739,25 @@ struct LLM::Impl {
739739
std::vector<std::vector<unsigned short>> &kv_v,
740740
int _precompute_len, int input_num_token)
741741
{
742+
// Always start from the largest group by default, then pick the first group that fits.
743+
_attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
742744
for (size_t i = 0; i < _attr.prefill_max_kv_cache_num_grp.size(); i++)
743745
{
744746
if (_precompute_len + input_num_token <= _attr.prefill_max_kv_cache_num_grp[i]) { _attr.prefill_grpid = (int)i + 1; break; }
745747
}
746748
int kv_cache_num = _attr.prefill_max_kv_cache_num_grp[_attr.prefill_grpid - 1];
747749
ALOGI("prefill_grpid:%d kv_cache_num:%d precompute_len:%d input_num_token:%d", _attr.prefill_grpid, kv_cache_num, _precompute_len, input_num_token);
748-
_attr.prefill_max_token_num = ALIGN_DOWN(_attr.prefill_max_token_num - _precompute_len, _attr.prefill_token_num);
749-
ALOGI("current prefill_max_token_num:%d", _attr.prefill_max_token_num);
750-
if (_precompute_len == 0) { ALOGI("first run"); return 0; }
750+
// Remaining prefill budget should be derived from the model capacity, not accumulated across calls.
751+
// Otherwise, a failed prefill (e.g. context overflow) can make it negative and break `/reset`.
752+
const int max_cap = _attr.prefill_max_kv_cache_num_grp.empty() ? 0 : _attr.prefill_max_kv_cache_num_grp.back();
753+
int remaining = max_cap - _precompute_len;
754+
if (remaining < 0) remaining = 0;
755+
remaining = ALIGN_DOWN(remaining, _attr.prefill_token_num);
756+
_attr.prefill_max_token_num = remaining;
757+
ALOGI("current prefill_max_token_num:%d", remaining);
751758
if (_precompute_len + input_num_token > kv_cache_num) { ALOGE("precompute_len(%d) + input_num_token(%d) > kv_cache_num(%d)", _precompute_len, input_num_token, kv_cache_num); return -1; }
752-
if (input_num_token > _attr.prefill_max_token_num) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, _attr.prefill_max_token_num); return -1; }
759+
if (input_num_token > remaining) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, remaining); return -1; }
760+
if (_precompute_len == 0) { ALOGI("first run"); return 0; }
753761
if (!b_os_kvcache) return 0;
754762
if (kv_k.size() != kv_v.size() || (int)kv_k.size() != _attr.axmodel_num) { ALOGE("kv cache size mismatch"); return -1; }
755763
for (int i = 0; i < _attr.axmodel_num; i++)
@@ -777,6 +785,11 @@ struct LLM::Impl {
777785
void ResetKVCache()
778786
{
779787
last_tokens_ids.clear(); k_caches.clear(); v_caches.clear(); precompute_len = 0;
788+
_attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
789+
if (!_attr.prefill_max_kv_cache_num_grp.empty())
790+
{
791+
_attr.prefill_max_token_num = _attr.prefill_max_kv_cache_num_grp.back();
792+
}
780793
for (int i = 0; i < _attr.axmodel_num; i++)
781794
{
782795
auto &lyr = llama_layers[i]; int devid = LLM_DEVID(lyr);
@@ -1194,7 +1207,12 @@ struct LLM::Impl {
11941207
if (!new_tokens.empty()) { precompute_len = (int)new_tokens.size() - 1; tokens_diff = {new_tokens.back()}; }
11951208
else { ResetKVCache(); precompute_len = 0; }
11961209
}
1197-
SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
1210+
const int kv_ret = SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
1211+
if (kv_ret != 0)
1212+
{
1213+
ALOGE("SetKVCache failed");
1214+
return history;
1215+
}
11981216
std::vector<unsigned short> out_embed(tokens_diff.size() * _attr.tokens_embed_size);
11991217
for (size_t i = 0; i < tokens_diff.size(); i++)
12001218
{

projects/llm_framework/main_llm2/src/runner/LLM.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct LLMAttrType {
5555
// ---- vision / VLM (optional, runtime switch by `vlm_type`) ----
5656
// If `vlm_type != VLMType::None`, vision encoder will be initialized and used.
5757
// See `VLMType` in `src/runner/VLMType.hpp`.
58-
// Names/ids (via magic_enum): `None(0)`, `Qwen2_5VL(1)`, `Qwen3VL(2)`, `InternVL3(3)`, `FastVLM(4)`, `SmolVLM2(5)`.
58+
// Names/ids (via magic_enum): `None(0)`, `Qwen2_5VL(1)`, `Qwen3VL(2)`, `InternVL3(3)`, `FastVLM(4)`, `SmolVLM2(5)`, `PaddleOCRVL(6)`.
5959
VLMType vlm_type = VLMType::None;
6060

6161
// Vision encoder axmodel (image/video encoder). Required if `vlm_type != VLMType::None`.

projects/llm_framework/main_llm2/src/runner/VLMType.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ enum class VLMType : int {
2121
InternVL3 = 3,
2222
FastVLM = 4,
2323
SmolVLM2 = 5,
24+
PaddleOCRVL = 6,
2425
};
2526

2627
inline constexpr std::string_view VLMTypeName(VLMType t) {

projects/llm_framework/main_llm2/src/runner/utils/image_processor.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,47 @@ int Qwen2VideoProcessor(std::vector<axcv::Mat>& src,
128128
return 0;
129129
}
130130

131+
int PaddleOCRVLImageProcessor(axcv::Mat& src,
132+
std::vector<unsigned char>& output,
133+
int tgt_h, int tgt_w,
134+
int patch_size) {
135+
// Resize to target size and convert BGR->RGB.
136+
axcv::Mat img_rs;
137+
if (axcv::width(src) != tgt_w || axcv::height(src) != tgt_h) {
138+
axcv::resize(src, img_rs, tgt_w, tgt_h);
139+
} else {
140+
img_rs = src;
141+
}
142+
axcv::Mat rgb;
143+
axcv::cvtColorBGR2RGB(img_rs, rgb);
144+
145+
const int grid_h = tgt_h / patch_size;
146+
const int grid_w = tgt_w / patch_size;
147+
const int N = grid_h * grid_w;
148+
const int C = 3;
149+
150+
// Output layout: [N, C, pH, pW] matching PaddleOCR-VL VIT input format (1, N, C, pH, pW).
151+
output.resize((size_t)N * C * patch_size * patch_size);
152+
153+
size_t idx = 0;
154+
for (int n = 0; n < N; n++) {
155+
const int gh = n / grid_w;
156+
const int gw = n % grid_w;
157+
for (int c = 0; c < C; c++) {
158+
for (int ph = 0; ph < patch_size; ph++) {
159+
const int row = gh * patch_size + ph;
160+
const uint8_t* row_ptr = axcv::row_ptr(rgb, row);
161+
for (int pw = 0; pw < patch_size; pw++) {
162+
const int col = gw * patch_size + pw;
163+
output[idx++] = row_ptr[col * C + c];
164+
}
165+
}
166+
}
167+
}
168+
169+
return 0;
170+
}
171+
131172
static std::vector<axcv::Mat> splitImageSafe(axcv::Mat src, int rows, int cols, int tile_w, int tile_h) {
132173
std::vector<axcv::Mat> subImages;
133174

projects/llm_framework/main_llm2/src/runner/utils/image_processor.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ int Qwen2VideoProcessor(std::vector<axcv::Mat>& src,
1717
int merge_size = 2,
1818
int patch_size = 14);
1919

20+
// PaddleOCR-VL image processor: produces patches in [N, C, pH, pW] format
21+
// (channel-first per patch, row-major across patches, no spatial merge).
22+
// This is the format expected by the PaddleOCR-VL VIT axmodel.
23+
int PaddleOCRVLImageProcessor(axcv::Mat& src,
24+
std::vector<unsigned char>& output,
25+
int tgt_h, int tgt_w,
26+
int patch_size = 14);
27+
2028
// SmolVLM2 image processor:
2129
// For each input image, outputs 5 blocks: 2x2 tiles (4) + global resized image (1).
2230
int Smolvlm2ImageProcessor(std::vector<axcv::Mat>& src,

projects/llm_framework/main_llm2/src/runner/utils/logger.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,22 @@ static void init_from_env()
133133
ws.stdout_is_console = (ws.stdout_handle != nullptr) && GetConsoleMode(ws.stdout_handle, &mode);
134134
ws.stderr_is_console = (ws.stderr_handle != nullptr) && GetConsoleMode(ws.stderr_handle, &mode);
135135

136+
// Try to make UTF-8 work out of the box on Windows terminals, so Chinese prompts
137+
// and box drawing characters render correctly (e.g. "退出", "├──").
138+
// Can be disabled by setting AXLLM_WIN_UTF8=0/false/off/no.
139+
const auto win_utf8 = getenv_sv("AXLLM_WIN_UTF8");
140+
const bool enable_utf8 =
141+
!(str_eq_nocase(win_utf8, "0") || str_eq_nocase(win_utf8, "false") || str_eq_nocase(win_utf8, "off") || str_eq_nocase(win_utf8, "no"));
142+
// Don't rely on stdout/stderr being a console handle: on modern Windows terminals (ConPTY),
143+
// the std handles may be pipes even though the process is attached to a console.
144+
const UINT out_cp = GetConsoleOutputCP();
145+
const UINT in_cp = GetConsoleCP();
146+
if (enable_utf8 && (out_cp != 0 || in_cp != 0))
147+
{
148+
if (out_cp != 0 && out_cp != CP_UTF8) SetConsoleOutputCP(CP_UTF8);
149+
if (in_cp != 0 && in_cp != CP_UTF8) SetConsoleCP(CP_UTF8);
150+
}
151+
136152
if (ws.stdout_is_console)
137153
{
138154
CONSOLE_SCREEN_BUFFER_INFO info{};

0 commit comments

Comments (0)