File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -8,6 +8,25 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
88 set_property (CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" )
99endif ()
1010
11+
# Polyfill `#embed` for compilers that lack it: at configure time a Python
# script rewrites the `#embed` array definitions found in the vocab header
# into a header of literal byte arrays, and USE_GENERATED_VOCAB tells the
# sources to include that generated header instead.
message(STATUS "polyfilling #embed via Python...")

# Locate a real interpreter rather than assuming a `python` binary is on PATH
# (many systems only ship `python3`).
find_package(Python3 COMPONENTS Interpreter REQUIRED)

set(VOCAB_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/src/vocab/vocab.hpp")
set(GENERATED_VOCAB_HPP "${CMAKE_CURRENT_SOURCE_DIR}/src/vocab/vocab_generated.h")

# Every argument is quoted so source trees containing spaces still work.
# The working directory is the header's directory so that relative
# `#embed "..."` paths are opened from the same place the compiler would
# look for them.
execute_process(
    COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/script/generate_vocab.py"
            "${VOCAB_HEADER}"
            "${GENERATED_VOCAB_HPP}"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/vocab"
    RESULT_VARIABLE result
)

if(NOT result EQUAL 0)
    message(FATAL_ERROR "Failed to run generate_vocab.py (result: ${result})")
endif()

# Same command the MSVC section below uses for its definitions.
add_compile_definitions(USE_GENERATED_VOCAB)
28+
29+
1130if (MSVC )
1231 add_compile_definitions (_CRT_SECURE_NO_WARNINGS )
1332 add_compile_definitions (_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING )
Original file line number Diff line number Diff line change 1+ import re
2+ import os
3+ import sys
4+
def file_to_c_array(filepath, bytes_per_line=16):
    """Render the bytes of a file as the body of a C array initializer.

    Args:
        filepath: Path of the file to read (opened in binary mode).
        bytes_per_line: Number of byte values emitted per output row.
            Must be at least 1.

    Returns:
        A string of comma-separated decimal byte values with a line break
        after every ``bytes_per_line`` values. An empty file yields ``""``.

    Raises:
        ValueError: If ``bytes_per_line`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if bytes_per_line < 1:
        raise ValueError("bytes_per_line must be >= 1")

    with open(filepath, "rb") as f:
        data = f.read()

    # Slice the payload into fixed-width rows so every row (including the
    # first) carries the same number of values; only the last may be shorter.
    rows = (
        ",".join(str(byte) for byte in data[start:start + bytes_per_line])
        for start in range(0, len(data), bytes_per_line)
    )
    # The comma stays at the end of a row so the joined text is still one
    # valid comma-separated initializer list.
    return ",\n".join(rows)
18+
def process_header(input_path, output_hpp):
    """Generate a header of literal byte arrays from ``#embed`` definitions.

    Scans ``input_path`` for definitions of the form
    ``char NAME[] = { #embed "FILE" };`` and writes, for each one, a
    ``static const unsigned char NAME[]`` array holding FILE's bytes to
    ``output_hpp``.

    Args:
        input_path: Header containing the ``#embed`` array definitions.
        output_hpp: Path of the header to generate (overwritten if present).

    Returns:
        None. When no ``#embed`` definition is found, a message is printed
        and ``output_hpp`` is left untouched.

    Raises:
        OSError: If the header, an embedded resource, or the output file
            cannot be opened.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    pattern = re.compile(r'char\s+(\w+)\[\]\s*=?\s*\{\s*#embed\s+"([^"]+)"\s*\};')
    matches = pattern.findall(content)
    if not matches:
        # Report the file that was actually scanned, not a fixed name.
        print(f"No #embed found in {input_path}")
        return

    # A quoted #embed path is looked up relative to the file that contains the
    # directive, so resolve against the header's directory instead of
    # whatever directory the script happens to be run from.
    base_dir = os.path.dirname(os.path.abspath(input_path))

    pieces = ["#pragma once\n\n"]
    for var_name, file_path in matches:
        print(f"Embedding {file_path} into {var_name}...")
        resource_path = os.path.join(base_dir, file_path)
        if not os.path.exists(resource_path) and os.path.exists(file_path):
            # Keep working for callers that relied on the old behaviour of
            # resolving the path against the current working directory.
            resource_path = file_path
        array_body = file_to_c_array(resource_path)
        pieces.append(
            f"static const unsigned char {var_name}[] = {{\n{array_body}\n}};\n"
        )

    with open(output_hpp, "w", encoding="utf-8") as f:
        f.write("".join(pieces))
36+
if __name__ == "__main__":
    # Usage: python generate_vocab.py <vocab.hpp> <out.hpp>
    # Fail with a readable usage line (and a non-zero exit status the build
    # can detect) instead of an IndexError traceback when arguments are missing.
    if len(sys.argv) < 3:
        print(f"Usage: python {sys.argv[0]} <vocab.hpp> <out.hpp>", file=sys.stderr)
        sys.exit(2)
    process_header(sys.argv[1], sys.argv[2])
Original file line number Diff line number Diff line change 11#include " vocab.h"
22
3+ #ifdef USE_GENERATED_VOCAB
4+ #include " vocab_generated.h"
5+ #else
36static unsigned char clip_merges_utf8_c_str[] = {
47 #embed " embed/merges.txt"
58};
@@ -18,6 +21,7 @@ static unsigned char qwen2_merges_utf8_c_str[] = {
1821static unsigned char umt5_tokenizer_json_str[] = {
1922 #embed " embed/umt5_tokenizer.json"
2023};
24+ #endif
2125
2226std::string load_clip_merges () {
2327 std::string merges_utf8_str (reinterpret_cast <const char *>(clip_merges_utf8_c_str), sizeof (clip_merges_utf8_c_str));
You can’t perform that action at this time.
0 commit comments