Skip to content

Commit 43c3438

Browse files
author
LittleMouse
committed
[fix] Fix non-utf-8 characters
1 parent 16dfe70 commit 43c3438

1 file changed

Lines changed: 29 additions & 1 deletion

File tree

  • projects/llm_framework/main_whisper/src

projects/llm_framework/main_whisper/src/main.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,34 @@ class llm_task {
192192
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
193193
}
194194

195+
// Returns the length, in bytes, of the longest prefix of `str` that is
// well-formed UTF-8 and ends on a character boundary.
//
// Validation follows RFC 3629 / Unicode Table 3-7 rather than only checking
// the lead/continuation bit patterns, so the following are rejected:
//   * stray continuation bytes and truncated sequences,
//   * overlong encodings (C0/C1 leads, E0 80..9F, F0 80..8F),
//   * UTF-16 surrogates (ED A0..BF),
//   * code points above U+10FFFF (F4 90.., F5..FF leads).
// Scanning stops at the first offending byte; runs in a single pass.
static std::size_t utf8_valid_prefix_length(const std::string &str)
{
    const std::size_t size = str.size();
    std::size_t pos        = 0;

    while (pos < size) {
        const unsigned char lead = static_cast<unsigned char>(str[pos]);

        // Plain ASCII: one byte, always valid.
        if (lead < 0x80) {
            ++pos;
            continue;
        }

        // Number of continuation bytes expected after the lead byte, and the
        // permitted range for the FIRST continuation byte. That range is
        // narrower than 80..BF for a few leads, which is what excludes
        // overlong forms, surrogates and values beyond U+10FFFF.
        std::size_t trailing = 0;
        unsigned char lo     = 0x80;
        unsigned char hi     = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF) {
            trailing = 1;
        } else if ((lead >> 4) == 0b1110) {
            trailing = 2;
            if (lead == 0xE0) {
                lo = 0xA0;  // below this the 3-byte form is overlong
            } else if (lead == 0xED) {
                hi = 0x9F;  // above this lies the surrogate range D800..DFFF
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0) {
                lo = 0x90;  // below this the 4-byte form is overlong
            } else if (lead == 0xF4) {
                hi = 0x8F;  // above this the code point exceeds U+10FFFF
            }
        } else {
            break;  // stray continuation byte, C0/C1, or F5..FF
        }

        // Sequence is cut off by the end of the string.
        if (size - pos <= trailing) {
            break;
        }

        const unsigned char first = static_cast<unsigned char>(str[pos + 1]);
        if (first < lo || first > hi) {
            break;
        }

        bool well_formed = true;
        for (std::size_t k = 2; k <= trailing; ++k) {
            const unsigned char cont = static_cast<unsigned char>(str[pos + k]);
            if ((cont >> 6) != 0b10) {
                well_formed = false;
                break;
            }
        }
        if (!well_formed) {
            break;
        }

        pos += trailing + 1;
    }

    return pos;
}

// Returns true when `str` consists entirely of well-formed UTF-8 sequences
// (see utf8_valid_prefix_length for the exact rules). An empty string is valid.
bool is_valid_utf8(const std::string &str)
{
    return utf8_valid_prefix_length(str) == str.size();
}

// Truncates `s` in place to its longest well-formed UTF-8 prefix.
//
// This yields the same result as repeatedly dropping the last byte until the
// string validates, but needs only one scan instead of re-validating the
// whole string after every removed byte. Everything from the first offending
// byte onward is discarded; a string that is already valid is left untouched.
void fix_utf8_string(std::string &s)
{
    s.resize(utf8_valid_prefix_length(s));
}
222+
195223
int load_model(const nlohmann::json &config_body)
196224
{
197225
if (parse_config(config_body)) {
@@ -475,7 +503,7 @@ class llm_task {
475503
(uint32)mode_config_.token_tables[i].size(), str);
476504
s += str;
477505
}
478-
506+
fix_utf8_string(s);
479507
if (mode_config_.language == "en" || mode_config_.language == "ja") {
480508
if (out_callback_) out_callback_(s, true);
481509
} else {

0 commit comments

Comments
 (0)