77#include < algorithm>
88#include < sstream>
99#include < cassert>
10- #include < iostream> // 用于日志输出
10+ #include < iostream>
11+ #include " ../../../../../SDK/components/utilities/include/sample_log.h"
1112
/// Splits `s` on the single-character delimiter `delim`.
///
/// @param s      Text to split; taken by const reference so it is not copied.
/// @param delim  Delimiter character.
/// @return The non-empty fields in their original order. Empty fields produced by
///         leading, trailing or consecutive delimiters are dropped.
std::vector<std::string> split(const std::string& s, char delim)
{
    std::vector<std::string> result;
    std::stringstream ss(s);
    std::string item;
    // Qualified call: do not rely on argument-dependent lookup to find getline.
    while (std::getline(ss, item, delim)) {
        if (!item.empty()) {
            result.push_back(item);
        }
    }
    return result;
}
24-
2525class Lexicon {
2626private:
2727 std::unordered_map<std::string, std::pair<std::vector<int >, std::vector<int >>> lexicon;
28- size_t max_phrase_length; // 追踪词典中最长的词组长度
29- std::pair<std::vector<int >, std::vector<int >> unknown_token; // '_'的发音作为未知词的默认值
30- std::unordered_map<int , std::string> reverse_tokens; // 用于将音素ID转回音素符号,用于日志
28+ size_t max_phrase_length;
29+ std::pair<std::vector<int >, std::vector<int >> unknown_token;
30+ std::unordered_map<int , std::string> reverse_tokens;
3131
3232public:
33- Lexicon (const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0 ) {
33+ Lexicon (const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0 )
34+ {
3435 std::unordered_map<std::string, int > tokens;
35-
36- // 加载tokens
3736 std::ifstream ifs (tokens_filename);
3837 assert (ifs.is_open ());
39-
4038 std::string line;
4139 while (std::getline (ifs, line)) {
4240 auto splitted_line = split (line, ' ' );
4341 if (splitted_line.size () >= 2 ) {
4442 int token_id = std::stoi (splitted_line[1 ]);
4543 tokens.insert ({splitted_line[0 ], token_id});
46- reverse_tokens[token_id] = splitted_line[0 ]; // 建立反向映射
44+ reverse_tokens[token_id] = splitted_line[0 ];
4745 }
4846 }
4947 ifs.close ();
50-
51- // 加载lexicon
5248 ifs.open (lexicon_filename);
5349 assert (ifs.is_open ());
5450 while (std::getline (ifs, line)) {
5551 auto splitted_line = split (line, ' ' );
5652 if (splitted_line.empty ()) continue ;
57-
5853 std::string word_or_phrase = splitted_line[0 ];
59-
60- // 更新最长词组长度
61- auto chars = splitEachChar (word_or_phrase);
62- max_phrase_length = std::max (max_phrase_length, chars.size ());
63-
64- size_t phone_tone_len = splitted_line.size () - 1 ;
65- size_t half_len = phone_tone_len / 2 ;
54+ auto chars = splitEachChar (word_or_phrase);
55+ max_phrase_length = std::max (max_phrase_length, chars.size ());
56+ size_t phone_tone_len = splitted_line.size () - 1 ;
57+ size_t half_len = phone_tone_len / 2 ;
6658 std::vector<int > phones, tones;
67-
6859 for (size_t i = 0 ; i < phone_tone_len; i++) {
6960 auto phone_or_tone = splitted_line[i + 1 ];
7061 if (i < half_len) {
@@ -75,213 +66,161 @@ class Lexicon {
7566 tones.push_back (std::stoi (phone_or_tone));
7667 }
7768 }
78-
7969 lexicon[word_or_phrase] = std::make_pair (phones, tones);
8070 }
81-
82- // 添加特殊映射
83- lexicon[" 呣" ] = lexicon[" 母" ];
84- lexicon[" 嗯" ] = lexicon[" 恩" ];
85-
86- // 添加标点符号
8771 const std::vector<std::string> punctuation{" !" , " ?" , " …" , " ," , " ." , " '" , " -" };
8872 for (const auto & p : punctuation) {
8973 if (tokens.find (p) != tokens.end ()) {
90- int i = tokens[p];
74+ int i = tokens[p];
9175 lexicon[p] = std::make_pair (std::vector<int >{i}, std::vector<int >{0 });
9276 }
9377 }
94-
95- // 设置'_'作为未知词的发音
96- assert (tokens.find (" _" ) != tokens.end ()); // 确保tokens中包含"_"
78+ assert (tokens.find (" _" ) != tokens.end ());
9779 unknown_token = std::make_pair (std::vector<int >{tokens[" _" ]}, std::vector<int >{0 });
98-
99- // 空格映射到'_'的发音
100- lexicon[" " ] = unknown_token;
101-
102- // 中文标点转换映射
80+ lexicon[" " ] = unknown_token;
10381 lexicon[" ," ] = lexicon[" ," ];
10482 lexicon[" 。" ] = lexicon[" ." ];
10583 lexicon[" !" ] = lexicon[" !" ];
10684 lexicon[" ?" ] = lexicon[" ?" ];
107-
108- // 输出词典信息
109- std::cout << " 词典加载完成,包含 " << lexicon.size () << " 个条目,最长词组长度: " << max_phrase_length << std::endl;
85+ SLOGI (" 词典加载完成,包含 %zu 个条目,最长词组长度: %zu" , lexicon.size (), max_phrase_length);
11086 }
111-
/// Splits UTF-8 encoded `text` into one string per encoded character.
///
/// The byte length of each character is taken from its lead byte (1 to 4 bytes).
/// A byte that matches none of the multi-byte lead patterns is emitted as a
/// single-byte element. If the input ends in the middle of a multi-byte
/// sequence, the remaining bytes are emitted as the final element.
///
/// @param text  UTF-8 text to split.
/// @return One element per character, in input order.
std::vector<std::string> splitEachChar(const std::string& text)
{
    std::vector<std::string> words;
    const size_t len = text.length();
    size_t i = 0;
    while (i < len) {
        // Inspect the lead byte as unsigned so the bit masks are not affected by sign extension.
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        size_t next = 1;  // ASCII, or a byte that is not a recognised lead byte
        if ((lead & 0xE0) == 0xC0) {
            next = 2;  // 2-byte UTF-8 sequence
        } else if ((lead & 0xF0) == 0xE0) {
            next = 3;  // 3-byte UTF-8 sequence
        } else if ((lead & 0xF8) == 0xF0) {
            next = 4;  // 4-byte UTF-8 sequence
        }
        // substr clamps the count to the end of the string, so a truncated sequence is safe.
        words.push_back(text.substr(i, next));
        i += next;
    }
    return words;
}

/// Reports whether `s` is exactly one ASCII letter (A-Z or a-z).
bool is_english(const std::string& s)
{
    if (s.size() != 1) {
        return false;
    }
    const char ch = s[0];
    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
}
137-
138- // 根据词典中的内容,使用最长匹配算法处理输入文本
139- void convert (const std::string& text, std::vector<int >& phones, std::vector<int >& tones) {
140- std::cout << " \n 开始处理文本: \" " << text << " \" " << std::endl;
141- std::cout << " =======匹配结果=======" << std::endl;
142- std::cout << " 单元\t |\t 音素\t |\t 声调" << std::endl;
143- std::cout << " -----------------------------" << std::endl;
144-
145- // 在开头添加'_'边界标记
112+ void convert (const std::string& text, std::vector<int >& phones, std::vector<int >& tones)
113+ {
114+ SLOGI (" \n 开始处理文本: \" %s\" " , text.c_str ());
115+ SLOGI (" =======匹配结果=======" );
116+ SLOGI (" 单元\t |\t 音素\t |\t 声调" );
117+ SLOGI (" -----------------------------" );
146118 phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
147119 tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
148- std::cout << " <BOS> \t | \t " << phonesToString (unknown_token. first ) << " \t | \t "
149- << tonesToString (unknown_token.second ) << std::endl;
150-
120+
121+ SLOGI ( " <BOS> \t | \t %s \t | \t %s " , phonesToString (unknown_token.first ). c_str (),
122+ tonesToString (unknown_token. second ). c_str ());
151123 auto chars = splitEachChar (text);
152- int i = 0 ;
153-
124+ int i = 0 ;
154125 while (i < chars.size ()) {
155- // 处理英文单词
156126 if (is_english (chars[i])) {
157127 std::string eng_word;
158128 int start = i;
159129 while (i < chars.size () && is_english (chars[i])) {
160130 eng_word += chars[i++];
161131 }
162-
163- // 英文转小写
164- std::string orig_word = eng_word; // 保留原始单词用于日志
132+ std::string orig_word = eng_word;
165133 std::transform (eng_word.begin (), eng_word.end (), eng_word.begin (),
166- [](unsigned char c){ return std::tolower (c); });
167-
168- // 如果词典中有这个英文单词,使用它;否则使用'_'的发音
134+ [](unsigned char c) { return std::tolower (c); });
169135 if (lexicon.find (eng_word) != lexicon.end ()) {
170136 auto & [eng_phones, eng_tones] = lexicon[eng_word];
171137 phones.insert (phones.end (), eng_phones.begin (), eng_phones.end ());
172138 tones.insert (tones.end (), eng_tones.begin (), eng_tones.end ());
173-
174- // 打印匹配信息
175- std::cout << orig_word << " \t |\t " << phonesToString (eng_phones) << " \t |\t "
176- << tonesToString (eng_tones) << std::endl;
139+ SLOGI (" %s\t |\t %s\t |\t %s" , orig_word.c_str (), phonesToString (eng_phones).c_str (),
140+ tonesToString (eng_tones).c_str ());
177141 } else {
178- // 未找到单词,使用'_'的发音
179142 phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
180143 tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
181-
182- // 打印未匹配信息
183- std::cout << orig_word << " \t |\t " << phonesToString (unknown_token.first ) << " (未匹配)\t |\t "
184- << tonesToString (unknown_token.second ) << std::endl;
144+ SLOGI (" %s\t |\t %s (未匹配)\t |\t %s" , orig_word.c_str (), phonesToString (unknown_token.first ).c_str (),
145+ tonesToString (unknown_token.second ).c_str ());
185146 }
186147 continue ;
187148 }
188- // 处理非英文字符(如空格、标点)
189149 std::string c = chars[i++];
190- if (c == " " ) continue ; // 跳过空格
191- // 回退一步,用于最长匹配
150+ if (c == " " ) continue ;
192151 i--;
193-
194-
195- // 最长匹配算法处理中文/日文
196152 bool matched = false ;
197- // 尝试从最长的词组开始匹配
198153 for (size_t len = std::min (max_phrase_length, chars.size () - i); len > 0 && !matched; --len) {
199154 std::string phrase;
200155 for (size_t j = 0 ; j < len; ++j) {
201156 phrase += chars[i + j];
202157 }
203-
204158 if (lexicon.find (phrase) != lexicon.end ()) {
205159 auto & [phrase_phones, phrase_tones] = lexicon[phrase];
206160 phones.insert (phones.end (), phrase_phones.begin (), phrase_phones.end ());
207161 tones.insert (tones.end (), phrase_tones.begin (), phrase_tones.end ());
208-
209- // 打印匹配信息
210- std::cout << phrase << " \t |\t " << phonesToString (phrase_phones) << " \t |\t "
211- << tonesToString (phrase_tones) << std::endl;
212-
162+ SLOGI (" %s\t |\t %s\t |\t %s" , phrase.c_str (), phonesToString (phrase_phones).c_str (),
163+ tonesToString (phrase_tones).c_str ());
213164 i += len;
214165 matched = true ;
215166 break ;
216167 }
217168 }
218-
219- // 如果没有匹配到任何词组,使用'_'的发音
220169 if (!matched) {
221- std::string c = chars[i++];
222- std::string s = c;
223-
224- // 中文标点符号转换
225- std::string orig_char = s; // 保留原始字符用于日志
226- if (s == " , " ) s = " , " ;
227- else if (s == " 。 " ) s = " ." ;
228- else if (s == " !" ) s = " ! " ;
229- else if (s == " ? " ) s = " ? " ;
230-
231- // 如果词典中找不到,则使用'_'的发音
170+ std::string c = chars[i++];
171+ std::string s = c;
172+ std::string orig_char = s;
173+ if (s == " , " )
174+ s = " , " ;
175+ else if (s == " 。 " )
176+ s = " ." ;
177+ else if (s == " !" )
178+ s = " ! " ;
179+ else if (s == " ? " )
180+ s = " ? " ;
232181 if (lexicon.find (s) != lexicon.end ()) {
233182 auto & [char_phones, char_tones] = lexicon[s];
234183 phones.insert (phones.end (), char_phones.begin (), char_phones.end ());
235184 tones.insert (tones.end (), char_tones.begin (), char_tones.end ());
236-
237- // 打印匹配信息
238- std::cout << orig_char << " \t |\t " << phonesToString (char_phones) << " \t |\t "
239- << tonesToString (char_tones) << std::endl;
185+ SLOGI (" %s\t |\t %s\t |\t %s" , orig_char.c_str (), phonesToString (char_phones).c_str (),
186+ tonesToString (char_tones).c_str ());
240187 } else {
241188 phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
242189 tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
243-
244- // 打印未匹配信息
245- std::cout << orig_char << " \t |\t " << phonesToString (unknown_token.first ) << " (未匹配)\t |\t "
246- << tonesToString (unknown_token.second ) << std::endl;
190+ SLOGI (" %s\t |\t %s (未匹配)\t |\t %s" , orig_char.c_str (), phonesToString (unknown_token.first ).c_str (),
191+ tonesToString (unknown_token.second ).c_str ());
247192 }
248193 }
249194 }
250-
251- // 在末尾添加'_'边界标记
252195 phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
253196 tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
254- std::cout << " <EOS>\t |\t " << phonesToString (unknown_token.first ) << " \t |\t "
255- << tonesToString (unknown_token.second ) << std::endl;
256-
257- // 汇总打印最终结果
258- std::cout << " \n 处理结果汇总:" << std::endl;
259- std::cout << " 原文: " << text << std::endl;
260- std::cout << " 音素: " << phonesToString (phones) << std::endl;
261- std::cout << " 声调: " << tonesToString (tones) << std::endl;
262- std::cout << " ====================" << std::endl;
197+ SLOGI (" <EOS>\t |\t %s\t |\t %s" , phonesToString (unknown_token.first ).c_str (),
198+ tonesToString (unknown_token.second ).c_str ());
199+ SLOGI (" \n 处理结果汇总:" );
200+ SLOGI (" 原文: %s" , text.c_str ());
201+ SLOGI (" 音素: %s" , phonesToString (phones).c_str ());
202+ SLOGI (" 声调: %s" , tonesToString (tones).c_str ());
203+ SLOGI (" ====================" );
263204 }
264205
265206private:
266- // 处理单个字符
267- void processChar ( const std::string& c, std::vector< int >& phones, std::vector< int >& tones) {
207+ void processChar ( const std::string& c, std::vector< int >& phones, std::vector< int >& tones)
208+ {
268209 std::string s = c;
269-
270- // 中文标点符号转换
271- if (s == " , " ) s = " , " ;
272- else if (s == " 。 " ) s = " ." ;
273- else if (s == " !" ) s = " ! " ;
274- else if (s == " ? " ) s = " ? " ;
275-
276- // 如果词典中找不到,则使用'_'的发音
210+ if (s == " , " )
211+ s = " , " ;
212+ else if (s == " 。 " )
213+ s = " ." ;
214+ else if (s == " !" )
215+ s = " ! " ;
216+ else if (s == " ? " )
217+ s = " ? " ;
277218 auto & phones_and_tones = (lexicon.find (s) != lexicon.end ()) ? lexicon[s] : unknown_token;
278-
279219 phones.insert (phones.end (), phones_and_tones.first .begin (), phones_and_tones.first .end ());
280220 tones.insert (tones.end (), phones_and_tones.second .begin (), phones_and_tones.second .end ());
281221 }
282-
283- // 将音素ID数组转换为字符串用于日志输出
284- std::string phonesToString (const std::vector<int >& phones) {
222+ std::string phonesToString (const std::vector<int >& phones)
223+ {
285224 std::string result;
286225 for (auto id : phones) {
287226 if (!result.empty ()) result += " " ;
@@ -293,14 +232,13 @@ class Lexicon {
293232 }
294233 return result;
295234 }
296-
/// Formats tone values as a single space-separated string for log output.
///
/// @param tones  Tone values to format.
/// @return The decimal values joined by one space; empty when `tones` is empty.
std::string tonesToString(const std::vector<int>& tones)
{
    std::string result;
    for (const int tone : tones) {
        // Separator goes before every element except the first.
        if (!result.empty()) {
            result += " ";
        }
        result += std::to_string(tone);
    }
    return result;
}
306- };
244+ };
0 commit comments