Skip to content

Commit 13af3e1

Browse files
committed
not really™ fix (better to change options)
1 parent bc841ed commit 13af3e1

3 files changed

Lines changed: 62 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,25 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
88
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
99
endif()
1010

11+
12+
# Polyfill for the `#embed` preprocessor directive: a Python script scans the
# vocab header for `#embed "file"` array initializers and writes a header in
# which each file's bytes are spelled out as an explicit array initializer.
message(STATUS "polyfilling #embed via Python...")

# Locate a Python 3 interpreter explicitly; a bare `python` on PATH may be
# missing or may refer to Python 2 on some systems.
find_package(Python3 COMPONENTS Interpreter REQUIRED)

# NOTE(review): the `#embed` directives visible in this change live in
# src/vocab/vocab.cpp -- confirm that vocab.hpp is the intended input.
set(VOCAB_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/src/vocab/vocab.hpp")
# NOTE(review): this writes into the source tree so that
# `#include "vocab_generated.h"` resolves next to vocab.cpp; consider
# generating into the binary dir and adding it to the include path instead.
set(GENERATED_VOCAB_HPP "${CMAKE_CURRENT_SOURCE_DIR}/src/vocab/vocab_generated.h")
set(VOCAB_GENERATOR_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/script/generate_vocab.py")

# The header is produced at configure time, so make CMake re-configure (and
# thus regenerate) whenever the generator script or its input changes.
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
    "${VOCAB_GENERATOR_SCRIPT}"
    "${VOCAB_HEADER}"
)

# Arguments are quoted so paths containing spaces or semicolons stay intact.
execute_process(
    COMMAND "${Python3_EXECUTABLE}" "${VOCAB_GENERATOR_SCRIPT}"
            "${VOCAB_HEADER}"
            "${GENERATED_VOCAB_HPP}"
    RESULT_VARIABLE result
)

# `result` is the exit code, or an error string if the process could not be
# started; either way anything other than 0 aborts the configure step.
if (NOT result EQUAL 0)
    message(FATAL_ERROR "Failed to run generate_vocab.py (exit status: ${result})")
endif()

# Tells vocab.cpp to include the generated header instead of using `#embed`.
add_compile_definitions(USE_GENERATED_VOCAB)
28+
29+
1130
if (MSVC)
1231
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
1332
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)

script/generate_vocab.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import re
2+
import os
3+
import sys
4+
5+
def file_to_c_array(filepath, bytes_per_line=16):
    """Render a file's bytes as the body of a C array initializer.

    Args:
        filepath: Path of the file to read (opened in binary mode).
        bytes_per_line: How many values to place on each output row.
            Must be at least 1. Defaults to 16.

    Returns:
        A string of decimal byte values separated by commas, with a line
        break after every ``bytes_per_line`` values. There is no trailing
        comma or newline; an empty file yields an empty string.

    Raises:
        ValueError: If ``bytes_per_line`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if bytes_per_line < 1:
        raise ValueError("bytes_per_line must be at least 1")

    with open(filepath, "rb") as f:
        data = f.read()

    # Slice into fixed-width rows so every row (including the first) holds
    # exactly bytes_per_line values; only the last row may be shorter.
    rows = [
        ",".join(str(b) for b in data[start:start + bytes_per_line])
        for start in range(0, len(data), bytes_per_line)
    ]
    return ",\n".join(rows)
18+
19+
def process_header(input_path, output_hpp):
    """Expand ``#embed`` array initializers from a header into a generated header.

    Scans ``input_path`` for declarations of the form
    ``char NAME[] = { #embed "FILE" };`` and writes ``output_hpp`` containing
    ``static const unsigned char NAME[] = { ...bytes... };`` for each match.

    Args:
        input_path: Source file to scan (read as UTF-8).
        output_hpp: Path of the header to generate.

    Raises:
        SystemExit: If no ``#embed`` declaration is found. The process then
            exits with a non-zero status so the calling build step notices
            that no header was produced.
        OSError: If the input, an embedded resource, or the output cannot be
            read or written.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    pattern = re.compile(r'char\s+(\w+)\[\]\s*=?\s*\{\s*#embed\s+"([^"]+)"\s*\};')
    matches = pattern.findall(content)
    if not matches:
        # Exiting non-zero (message goes to stderr) instead of returning
        # silently: the caller would otherwise assume the header exists.
        sys.exit(f"No #embed found in {input_path}")

    # Quoted #embed paths are written relative to the file that contains the
    # directive, not relative to wherever this script happens to be run from.
    base_dir = os.path.dirname(os.path.abspath(input_path))

    chunks = ['#pragma once\n\n']
    for var_name, file_path in matches:
        print(f"Embedding {file_path} into {var_name}...")
        # os.path.join leaves an absolute file_path untouched.
        resource = os.path.join(base_dir, file_path)
        if not os.path.exists(resource) and os.path.exists(file_path):
            # Fall back to the previous behaviour (relative to the CWD).
            resource = file_path
        hex_data = file_to_c_array(resource)
        chunks.append(f"static const unsigned char {var_name}[] = {{\n{hex_data}\n}};\n")
    out_content = "".join(chunks)

    # Leave an up-to-date header untouched so its timestamp does not change
    # and trigger needless recompilation on every run.
    try:
        with open(output_hpp, "r", encoding="utf-8") as f:
            if f.read() == out_content:
                return
    except (OSError, UnicodeDecodeError):
        pass  # Missing or unreadable output: (re)write it below.

    with open(output_hpp, "w", encoding="utf-8") as f:
        f.write(out_content)
36+
37+
if __name__ == "__main__":
    # Usage: python generate_vocab.py <vocab.hpp> <out.hpp>
    # Report missing arguments with a usage line (non-zero exit) rather than
    # an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <vocab.hpp> <out.hpp>")
    process_header(sys.argv[1], sys.argv[2])

src/vocab/vocab.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#include "vocab.h"
22

3+
#ifdef USE_GENERATED_VOCAB
4+
#include "vocab_generated.h"
5+
#else
36
static unsigned char clip_merges_utf8_c_str[] = {
47
#embed "embed/merges.txt"
58
};
@@ -18,6 +21,7 @@ static unsigned char qwen2_merges_utf8_c_str[] = {
1821
static unsigned char umt5_tokenizer_json_str[] = {
1922
#embed "embed/umt5_tokenizer.json"
2023
};
24+
#endif
2125

2226
std::string load_clip_merges() {
2327
std::string merges_utf8_str(reinterpret_cast<const char*>(clip_merges_utf8_c_str), sizeof(clip_merges_utf8_c_str));

0 commit comments

Comments
 (0)