Skip to content

Commit 5782f89

Browse files
committed
处理陌生英语单词
1 parent 3897870 commit 5782f89

3 files changed

Lines changed: 87 additions & 7 deletions

File tree

.clang-format

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,5 +163,4 @@ StatementMacros:
163163
- QT_REQUIRE_VERSION
164164
TabWidth: 4
165165
UseCRLF: false
166-
UseTab: Never
167-
...
166+
UseTab: Never

projects/llm_framework/main_melotts/mode_melotts-ja-jp.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"gbin": "g-jp.bin",
2222
"tokens": "tokens-jp.txt",
2323
"lexicon": "lexicon-jp.txt",
24-
"spacker_speed": 1.0,
24+
"spacker_speed": 1.1,
2525
"mode_rate": 44100,
2626
"audio_rate": 16000,
2727
"awake_delay": 1000

projects/llm_framework/main_melotts/src/runner/Lexicon.hpp

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,90 @@ class Lexicon {
109109
{
110110
return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
111111
}
112+
113+
/**
 * @brief Checks whether a token is a single character that may appear inside an English term.
 *
 * Accepted characters are the ASCII letters A-Z and a-z, the ASCII digits 0-9,
 * the hyphen '-' and the underscore '_'.
 *
 * @param s Token to test.
 * @return true only when @p s is exactly one byte long and that byte is one of
 *         the accepted characters; false for empty or multi-byte strings.
 */
bool is_english_token_char(const std::string& s)
{
    // Anything that is not exactly one byte (empty, multi-byte UTF-8, words) is rejected up front.
    if (s.size() != 1) {
        return false;
    }

    const char ch = s.front();

    const bool is_upper  = ch >= 'A' && ch <= 'Z';
    const bool is_lower  = ch >= 'a' && ch <= 'z';
    const bool is_digit  = ch >= '0' && ch <= '9';
    const bool is_joiner = ch == '-' || ch == '_';

    return is_upper || is_lower || is_digit || is_joiner;
}
119+
120+
void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
121+
{
122+
SLOGI("Processing unknown term: %s", word.c_str());
123+
124+
std::string orig_word = word;
125+
std::vector<std::string> parts;
126+
std::vector<std::string> phonetic_parts;
127+
128+
size_t start = 0;
129+
while (start < word.size()) {
130+
bool matched = false;
131+
132+
for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) {
133+
std::string sub_word = word.substr(start, len);
134+
std::string lower_sub_word = sub_word;
135+
std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(),
136+
[](unsigned char c) { return std::tolower(c); });
137+
138+
if (lexicon.find(lower_sub_word) != lexicon.end()) {
139+
// Substring found in lexicon
140+
auto& [sub_phones, sub_tones] = lexicon[lower_sub_word];
141+
phones.insert(phones.end(), sub_phones.begin(), sub_phones.end());
142+
tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());
143+
144+
parts.push_back(sub_word);
145+
phonetic_parts.push_back(phonesToString(sub_phones));
146+
147+
SLOGI(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
148+
149+
start += len;
150+
matched = true;
151+
break;
152+
}
153+
}
154+
155+
if (!matched) {
156+
std::string single_char = word.substr(start, 1);
157+
std::string lower_char = single_char;
158+
std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(),
159+
[](unsigned char c) { return std::tolower(c); });
160+
161+
if (lexicon.find(lower_char) != lexicon.end()) {
162+
auto& [char_phones, char_tones] = lexicon[lower_char];
163+
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
164+
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
165+
166+
parts.push_back(single_char);
167+
phonetic_parts.push_back(phonesToString(char_phones));
168+
169+
SLOGI(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
170+
} else {
171+
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
172+
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
173+
174+
parts.push_back(single_char);
175+
phonetic_parts.push_back("_unknown_");
176+
177+
SLOGI(" Unknown: '%s'", single_char.c_str());
178+
}
179+
180+
start++;
181+
}
182+
}
183+
184+
std::string parts_str, phonetic_str;
185+
for (size_t i = 0; i < parts.size(); i++) {
186+
if (i > 0) {
187+
parts_str += " ";
188+
phonetic_str += " ";
189+
}
190+
parts_str += parts[i];
191+
phonetic_str += phonetic_parts[i];
192+
}
193+
194+
SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
195+
}
112196
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
113197
{
114198
SLOGI("\n开始处理文本: \"%s\"", text.c_str());
@@ -139,10 +223,7 @@ class Lexicon {
139223
SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
140224
tonesToString(eng_tones).c_str());
141225
} else {
142-
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
143-
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
144-
SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_word.c_str(), phonesToString(unknown_token.first).c_str(),
145-
tonesToString(unknown_token.second).c_str());
226+
process_unknown_english(orig_word, phones, tones);
146227
}
147228
continue;
148229
}

0 commit comments

Comments
 (0)