@@ -109,6 +109,90 @@ class Lexicon {
109109 {
110110 return s.size () == 1 && ((s[0 ] >= ' A' && s[0 ] <= ' Z' ) || (s[0 ] >= ' a' && s[0 ] <= ' z' ));
111111 }
112+
/**
 * @brief Tests whether @p s is a single ASCII character allowed inside an English token.
 *
 * Accepted characters are the ASCII letters A-Z and a-z, the digits 0-9,
 * the hyphen '-' and the underscore '_'.
 *
 * @param s Candidate string; anything whose size is not exactly one byte is rejected.
 * @return true if @p s holds exactly one accepted character, false otherwise.
 */
bool is_english_token_char(const std::string& s)
{
    if (s.size() != 1) {
        return false;
    }

    const char c = s[0];
    const bool is_upper = (c >= 'A' && c <= 'Z');
    const bool is_lower = (c >= 'a' && c <= 'z');
    const bool is_digit = (c >= '0' && c <= '9');
    return is_upper || is_lower || is_digit || c == '-' || c == '_';
}
119+
120+ void process_unknown_english (const std::string& word, std::vector<int >& phones, std::vector<int >& tones)
121+ {
122+ SLOGI (" Processing unknown term: %s" , word.c_str ());
123+
124+ std::string orig_word = word;
125+ std::vector<std::string> parts;
126+ std::vector<std::string> phonetic_parts;
127+
128+ size_t start = 0 ;
129+ while (start < word.size ()) {
130+ bool matched = false ;
131+
132+ for (size_t len = std::min (word.size () - start, (size_t )10 ); len > 0 && !matched; --len) {
133+ std::string sub_word = word.substr (start, len);
134+ std::string lower_sub_word = sub_word;
135+ std::transform (lower_sub_word.begin (), lower_sub_word.end (), lower_sub_word.begin (),
136+ [](unsigned char c) { return std::tolower (c); });
137+
138+ if (lexicon.find (lower_sub_word) != lexicon.end ()) {
139+ // Substring found in lexicon
140+ auto & [sub_phones, sub_tones] = lexicon[lower_sub_word];
141+ phones.insert (phones.end (), sub_phones.begin (), sub_phones.end ());
142+ tones.insert (tones.end (), sub_tones.begin (), sub_tones.end ());
143+
144+ parts.push_back (sub_word);
145+ phonetic_parts.push_back (phonesToString (sub_phones));
146+
147+ SLOGI (" Matched: '%s' -> %s" , sub_word.c_str (), phonesToString (sub_phones).c_str ());
148+
149+ start += len;
150+ matched = true ;
151+ break ;
152+ }
153+ }
154+
155+ if (!matched) {
156+ std::string single_char = word.substr (start, 1 );
157+ std::string lower_char = single_char;
158+ std::transform (lower_char.begin (), lower_char.end (), lower_char.begin (),
159+ [](unsigned char c) { return std::tolower (c); });
160+
161+ if (lexicon.find (lower_char) != lexicon.end ()) {
162+ auto & [char_phones, char_tones] = lexicon[lower_char];
163+ phones.insert (phones.end (), char_phones.begin (), char_phones.end ());
164+ tones.insert (tones.end (), char_tones.begin (), char_tones.end ());
165+
166+ parts.push_back (single_char);
167+ phonetic_parts.push_back (phonesToString (char_phones));
168+
169+ SLOGI (" Single char: '%s' -> %s" , single_char.c_str (), phonesToString (char_phones).c_str ());
170+ } else {
171+ phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
172+ tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
173+
174+ parts.push_back (single_char);
175+ phonetic_parts.push_back (" _unknown_" );
176+
177+ SLOGI (" Unknown: '%s'" , single_char.c_str ());
178+ }
179+
180+ start++;
181+ }
182+ }
183+
184+ std::string parts_str, phonetic_str;
185+ for (size_t i = 0 ; i < parts.size (); i++) {
186+ if (i > 0 ) {
187+ parts_str += " " ;
188+ phonetic_str += " " ;
189+ }
190+ parts_str += parts[i];
191+ phonetic_str += phonetic_parts[i];
192+ }
193+
194+ SLOGI (" %s\t |\t Decomposed: %s\t |\t Phonetics: %s" , orig_word.c_str (), parts_str.c_str (), phonetic_str.c_str ());
195+ }
112196 void convert (const std::string& text, std::vector<int >& phones, std::vector<int >& tones)
113197 {
114198 SLOGI (" \n 开始处理文本: \" %s\" " , text.c_str ());
@@ -139,10 +223,7 @@ class Lexicon {
139223 SLOGI (" %s\t |\t %s\t |\t %s" , orig_word.c_str (), phonesToString (eng_phones).c_str (),
140224 tonesToString (eng_tones).c_str ());
141225 } else {
142- phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
143- tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
144- SLOGI (" %s\t |\t %s (未匹配)\t |\t %s" , orig_word.c_str (), phonesToString (unknown_token.first ).c_str (),
145- tonesToString (unknown_token.second ).c_str ());
226+ process_unknown_english (orig_word, phones, tones);
146227 }
147228 continue ;
148229 }
0 commit comments