Skip to content

Commit 0eaeebe

Browse files
committed
[update] Organize the tokenizer script.
1 parent c66bc50 commit 0eaeebe

22 files changed

Lines changed: 128 additions & 39 deletions

ext_components/StackFlow/stackflow/StackFlowUtil.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
*/
66
#include "StackFlowUtil.h"
77
#include <vector>
8+
#include <glob.h>
9+
#include <fstream>
810
#include "pzmq.hpp"
911

1012
std::string StackFlows::sample_json_str_get(const std::string &json_str, const std::string &json_key)
@@ -298,4 +300,33 @@ std::list<std::string> StackFlows::get_config_file_paths(std::string &base_model
298300
config_file_paths.push_back(base_model_path + std::string("../share/") + std::string("./mode_") + mode_name +
299301
".json");
300302
return config_file_paths;
303+
}
304+
305+
/// Expand one or more glob(3) patterns into a flat list of matching paths.
/// Tilde (~) and brace ({a,b}) expansion are enabled for every pattern.
/// Patterns that match nothing or fail to expand are silently skipped, so
/// the result contains only paths that actually matched.
///
/// @param patterns shell-style patterns, e.g. {"/etc/*.conf", "~/cfg/mode_*.json"}
/// @return matched paths, in pattern order then glob's sorted order per pattern
std::vector<std::string> StackFlows::glob_files(const std::vector<std::string> &patterns)
{
    std::vector<std::string> files;
    for (const auto &pattern : patterns) {
        glob_t glob_result;
        memset(&glob_result, 0, sizeof(glob_result));
        int ret = glob(pattern.c_str(), GLOB_TILDE | GLOB_BRACE, nullptr, &glob_result);
        if (ret == 0) {
            for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
                files.emplace_back(glob_result.gl_pathv[i]);
            }
        }
        // glob() without GLOB_APPEND reinitializes the glob_t on every call,
        // so freeing only once after the loop (as before) leaked the results
        // of all but the last pattern. Release the buffer per iteration.
        // globfree() on a zeroed struct is safe for the error paths.
        globfree(&glob_result);
    }
    return files;
}
327+
328+
/// Report whether the file at filePath can be opened for reading.
/// Note this is an accessibility probe (open succeeds), not a pure
/// existence check: an existing but unreadable file reports false.
bool StackFlows::file_exists(const std::string &filePath)
{
    return std::ifstream(filePath).good();
}

ext_components/StackFlow/stackflow/StackFlowUtil.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <cstring>
99
#include <unordered_map>
1010
#include <list>
11+
#include <vector>
1112
#define WORK_ID_NONE -100
1213

1314
#define RPC_PUSH_PARAM(_obj, _data1, _data2) \
@@ -33,4 +34,6 @@ int decode_base64(const std::string &in, std::string &out);
3334
int encode_base64(const std::string &in, std::string &out);
3435
std::string unit_call(const std::string &unit_name, const std::string &unit_action, const std::string &data);
3536
std::list<std::string> get_config_file_paths(std::string &base_model_path, std::string &base_model_config_path, const std::string &mode_name);
37+
std::vector<std::string> glob_files(const std::vector<std::string> &patterns);
38+
bool file_exists(const std::string& filePath);
3639
}; // namespace StackFlows

projects/llm_framework/main_kws/SConstruct

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ LDFLAGS += ['-l:libcargs.a', '-l:libonnxruntime.a',
3131
'-l:libsherpa-onnx-core.a', '-l:libkaldi-native-fbank-core.a',
3232
'-l:libkaldi-decoder-core.a', '-l:libssentencepiece_core.a']
3333

34+
STATIC_FILES += Glob('llm-kws_text2token.py')
3435
STATIC_FILES += Glob('mode_*.json')
3536

3637
env['COMPONENTS'].append({'target':'llm_kws',
File renamed without changes.

projects/llm_framework/main_kws/src/main.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,13 @@ class llm_task {
175175
temp_awake_key << kws_;
176176
temp_awake_key.close();
177177
std::ostringstream awake_key_compile_cmd;
178-
awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/text2token.py ";
178+
if (file_exists("/opt/m5stack/scripts/text2token.py"))
179+
awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/text2token.py ";
180+
else if (file_exists("/opt/m5stack/scripts/llm-kws_text2token.py"))
181+
awake_key_compile_cmd << "/bin/bash /opt/m5stack/scripts/llm-kws_text2token.py ";
182+
else {
183+
SLOGE("text2token.py or llm-kws_text2token.py not found!");
184+
}
179185
awake_key_compile_cmd << "--text /tmp/kws_awake.txt.tmp ";
180186
awake_key_compile_cmd << "--tokens " << mode_config_.model_config.tokens << " ";
181187
if (file_body["mode_param"].contains("text2token-tokens-type")) {

projects/llm_framework/main_llm/SConstruct

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,7 @@ static_file = Glob('../static_lib/module-llm/libabsl_*')
3434
static_file += [AFile('../static_lib/module-llm/libre2.a'), AFile('../static_lib/module-llm/libsentencepiece.a'), AFile('../static_lib/module-llm/libsentencepiece_train.a')]
3535
STATIC_LIB += static_file * 4
3636

37-
STATIC_FILES += [AFile('llama3.2-1B-prefill-ax630c_tokenizer.py'),
38-
AFile('openbuddy-llama3.2-1B-ax630c_tokenizer.py'),
39-
AFile('qwen2.5-coder-0.5B-ax630c_tokenizer.py')
40-
]
37+
STATIC_FILES += Glob('tokenizer_*.py')
4138
STATIC_FILES += Glob('mode_*.json')
4239

4340
env['COMPONENTS'].append({'target':'llm_llm',

projects/llm_framework/main_llm/mode_llama3.2-1B-prefill-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":128256,
2929
"tokens_embed_size":2048,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_llama3.2-1B-prefill-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_openbuddy-llama3.2-1B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":128256,
2929
"tokens_embed_size":2048,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_openbuddy-llama3.2-1B-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_qwen2.5-1.5B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":151936,
2929
"tokens_embed_size":1536,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_qwen2.5-1.5B-ax630c.py"]
3233
}
3334
}

projects/llm_framework/main_llm/mode_qwen2.5-coder-0.5B-ax630c.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"tokens_embed_num":151936,
2929
"tokens_embed_size":896,
3030
"b_use_mmap_load_embed":true,
31-
"b_dynamic_load_axmodel_layer":false
31+
"b_dynamic_load_axmodel_layer":false,
32+
"ext_scripts":["tokenizer_qwen2.5-coder-0.5B-ax630c.py"]
3233
}
3334
}

0 commit comments

Comments
 (0)