Skip to content

Commit 0eaeebe

Browse files
committed
[update] Organize the tokenizer script.
1 parent c66bc50 commit 0eaeebe

22 files changed

Lines changed: 128 additions & 39 deletions

ext_components/StackFlow/stackflow/StackFlowUtil.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
*/
66
#include "StackFlowUtil.h"
77
#include <vector>
8+
#include <glob.h>
9+
#include <fstream>
810
#include "pzmq.hpp"
911

1012
std::string StackFlows::sample_json_str_get(const std::string &json_str, const std::string &json_key)
@@ -298,4 +300,33 @@ std::list<std::string> StackFlows::get_config_file_paths(std::string &base_model
298300
config_file_paths.push_back(base_model_path + std::string("../share/") + std::string("./mode_") + mode_name +
299301
".json");
300302
return config_file_paths;
303+
}
304+
305+
/// Expand one or more glob(3) patterns into a flat list of matching paths.
/// Tilde (~) and brace ({a,b}) expansion are enabled for every pattern.
/// Patterns that match nothing or fail to expand are silently skipped, so
/// the result contains only paths that actually matched.
///
/// @param patterns shell-style patterns, e.g. {"/etc/*.conf", "~/cfg/mode_*.json"}
/// @return matched paths, in pattern order then glob's sorted order per pattern
std::vector<std::string> StackFlows::glob_files(const std::vector<std::string> &patterns)
{
    std::vector<std::string> files;
    for (const auto &pattern : patterns) {
        glob_t glob_result;
        memset(&glob_result, 0, sizeof(glob_result));
        int ret = glob(pattern.c_str(), GLOB_TILDE | GLOB_BRACE, nullptr, &glob_result);
        if (ret == 0) {
            for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
                files.emplace_back(glob_result.gl_pathv[i]);
            }
        }
        // glob() without GLOB_APPEND reinitializes the glob_t on every call,
        // so freeing only once after the loop (as before) leaked the results
        // of all but the last pattern. Release the buffer per iteration.
        // globfree() on a zeroed struct is safe for the error paths.
        globfree(&glob_result);
    }
    return files;
}
327+
328+
/// Report whether the file at filePath can be opened for reading.
/// Note this is an accessibility probe (open succeeds), not a pure
/// existence check: an existing but unreadable file reports false.
bool StackFlows::file_exists(const std::string &filePath)
{
    return std::ifstream(filePath).good();
}

ext_components/StackFlow/stackflow/StackFlowUtil.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <cstring>
99
#include <unordered_map>
1010
#include <list>
11+
#include <vector>
1112
#define WORK_ID_NONE -100
1213

1314
#define RPC_PUSH_PARAM(_obj, _data1, _data2) \
@@ -33,4 +34,6 @@ int decode_base64(const std::string &in, std::string &out);
3334
int encode_base64(const std::string &in, std::string &out);
3435
std::string unit_call(const std::string &unit_name, const std::string &unit_action, const std::string &data);
3536
std::list<std::string> get_config_file_paths(std::string &base_model_path, std::string &base_model_config_path, const std::string &mode_name);
37+
std::vector<std::string> glob_files(const std::vector<std::string> &patterns);
38+
bool file_exists(const std::string& filePath);
3639
}; // namespace StackFlows

projects/llm_framework/main_kws/SConstruct

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ LDFLAGS += ['-l:libcargs.a', '-l:libonnxruntime.a',
3131
'-l:libsherpa-onnx-core.a', '-l:libkaldi-native-fbank-core.a',
3232
'-l:libkaldi-decoder-core.a', '-l:libssentencepiece_core.a']
3333

34+
STATIC_FILES += Glob('llm-kws_text2token.py')
3435
STATIC_FILES += Glob('mode_*.json')
3536

3637
env['COMPONENTS'].append({'target':'llm_kws',
File renamed without changes.

projects/llm_framework/main_kws/src/main.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,13 @@ class llm_task {
175175
temp_awake_key << kws_;
176176
temp_awake_key.close();
177177
std::ostringstream awake_key_compile_cmd;
178-
awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/text2token.py ";
178+
if (file_exists("/opt/m5stack/scripts/text2token.py"))
179+
awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/text2token.py ";
180+
else if (file_exists("/opt/m5stack/scripts/llm-kws_text2token.py"))
181+
awake_key_compile_cmd << "/bin/bash /opt/m5stack/scripts/llm-kws_text2token.py ";
182+
else {
183+
SLOGE("text2token.py or llm-kws_text2token.py not found!");
184+
}
179185
awake_key_compile_cmd << "--text /tmp/kws_awake.txt.tmp ";
180186
awake_key_compile_cmd << "--tokens " << mode_config_.model_config.tokens << " ";
181187
if (file_body["mode_param"].contains("text2token-tokens-type")) {

projects/llm_framework/main_llm/SConstruct

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,7 @@ static_file = Glob('../static_lib/module-llm/libabsl_*')
3434
static_file += [AFile('../static_lib/module-llm/libre2.a'), AFile('../static_lib/module-llm/libsentencepiece.a'), AFile('../static_lib/module-llm/libsentencepiece_train.a')]
3535
STATIC_LIB += static_file * 4
3636

37-
STATIC_FILES += [AFile('llama3.2-1B-prefill-ax630c_tokenizer.py'),
38-
AFile('openbuddy-llama3.2-1B-ax630c_tokenizer.py'),
39-
AFile('qwen2.5-coder-0.5B-ax630c_tokenizer.py')
40-
]
37+
STATIC_FILES += Glob('tokenizer_*.py')
4138
STATIC_FILES += Glob('mode_*.json')
4239

4340
env['COMPONENTS'].append({'target':'llm_llm',

projects/llm_framework/main_llm/mode_llama3.2-1B-prefill-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":128256,
2929
"tokens_embed_size":2048,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_llama3.2-1B-prefill-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_openbuddy-llama3.2-1B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":128256,
2929
"tokens_embed_size":2048,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_openbuddy-llama3.2-1B-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_qwen2.5-1.5B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":151936,
2929
"tokens_embed_size":1536,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_qwen2.5-1.5B-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_qwen2.5-coder-0.5B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":151936,
2929
"tokens_embed_size":896,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_qwen2.5-coder-0.5B-ax630c.py"]
3233
}
3334
}

0 commit comments

Comments
 (0)