Implement SOLA algorithm for smoother audio transitions

yuyun2000 · yuyun2000 · commit e3c70bc1273f · 2025-05-06T16:48:11.000+08:00
Apply the Synchronized Overlap-Add (SOLA) algorithm to smooth the connection between audio segments output by the decoder, resulting in more natural-sounding transitions between segments.
diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp
@@ -9,6 +9,7 @@
 #include "Lexicon.hpp"
 #include <ax_sys_api.h>
 #include "AudioFile.h"
+#include "SolaProcessor.h"
 #include "Lexicon.hpp"
 
 #include <signal.h>
@@ -263,49 +264,71 @@ class llm_task {
             auto encoder_output =
                 encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w,
                               mode_config_.get_length_scale(), mode_config_.sdp_ratio);
-            float *zp_data      = encoder_output.at(0).GetTensorMutableData<float>();
-            int audio_len       = encoder_output.at(2).GetTensorMutableData<int>()[0];
-            auto zp_info        = encoder_output.at(0).GetTensorTypeAndShapeInfo();
-            auto zp_shape       = zp_info.GetShape();
-            int zp_size         = decoder_->GetInputSize(0) / sizeof(float);
-            int dec_len         = zp_size / zp_shape[1];
-            int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
-            std::vector<float> decoder_output(audio_slice_len);
-            int dec_slice_num = int(std::ceil(zp_shape[2] * 1.0 / dec_len));
+            float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
+            int audio_len  = encoder_output.at(2).GetTensorMutableData<int>()[0];
+            auto zp_info   = encoder_output.at(0).GetTensorTypeAndShapeInfo();
+            auto zp_shape  = zp_info.GetShape();
+
+            // Decoder parameters setup
+            int zp_size                 = decoder_->GetInputSize(0) / sizeof(float);
+            int dec_len                 = zp_size / zp_shape[1];
+            int audio_slice_len         = decoder_->GetOutputSize(0) / sizeof(float);
+            const int pad_frames        = 16;
+            const int samples_per_frame = 512;
+            const int effective_frames  = dec_len - 2 * pad_frames;
+            int dec_slice_num =
+                static_cast<int>(std::ceil(static_cast<double>(zp_shape[2]) / static_cast<double>(effective_frames)));
+            SolaProcessor sola(pad_frames, samples_per_frame);
             std::vector<float> pcmlist;
+
             for (int i = 0; i < dec_slice_num; i++) {
+                int input_start = i * effective_frames;
+                if (i > 0) {
+                    input_start -= pad_frames;
+                }
+                input_start    = std::max(0, input_start);
+                int actual_len = std::min(dec_len, static_cast<int>(zp_shape[2] - input_start));
                 std::vector<float> zp(zp_size, 0);
-                int actual_size = (i + 1) * dec_len < zp_shape[2] ? dec_len : zp_shape[2] - i * dec_len;
+
                 for (int n = 0; n < zp_shape[1]; n++) {
-                    memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + i * dec_len,
-                           sizeof(float) * actual_size);
+                    int copy_size = std::min(actual_len, static_cast<int>(zp_shape[2] - input_start));
+                    if (copy_size > 0) {
+                        memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + input_start,
+                               sizeof(float) * copy_size);
+                    }
                 }
+                // Run decoder
+                std::vector<float> decoder_output(audio_slice_len);
                 decoder_->SetInput(zp.data(), 0);
                 decoder_->SetInput(g_matrix.data(), 1);
                 if (0 != decoder_->Run()) {
-                    printf("Run decoder model failed!\n");
                     throw std::string("decoder_ RunSync error");
                 }
                 decoder_->GetOutput(decoder_output.data(), 0);
-                actual_size = (i + 1) * audio_slice_len < audio_len ? audio_slice_len : audio_len - i * audio_slice_len;
-                if (decoder_output.size() > actual_size) {
-                    pcmlist.reserve(pcmlist.size() + actual_size);
-                    std::copy(decoder_output.begin(), decoder_output.begin() + actual_size,
-                              std::back_inserter(pcmlist));
-                } else {
-                    pcmlist.reserve(pcmlist.size() + decoder_output.size());
-                    std::copy(decoder_output.begin(), decoder_output.end(), std::back_inserter(pcmlist));
-                }
+                std::vector<float> processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len);
+
+                pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end());
             }
+
             double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f);
             std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
             int len;
             resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);
+
+            // Convert to 16-bit PCM
+            wav_pcm_data.reserve(len);
             std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
                            [](const auto val) { return (int16_t)(val * INT16_MAX); });
+
+            // Call callback function with output
             if (out_callback_)
                 out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish);
+
+        } catch (const std::exception &e) {
+            SLOGI("TTS processing exception: %s", e.what());
+            return true;
         } catch (...) {
+            SLOGI("TTS processing encountered unknown exception");
             return true;
         }
         return false;
diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
@@ -32,6 +32,7 @@ class Lexicon {
 public:
     Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
     {
+        SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename);
         std::unordered_map<std::string, int> tokens;
         std::ifstream ifs(tokens_filename);
         assert(ifs.is_open());
diff --git a/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h
@@ -0,0 +1,285 @@
+#ifndef SOLA_PROCESSOR_H
+#define SOLA_PROCESSOR_H
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <limits>
+#include <string>
+#include <vector>
+
+/**
+ * SolaProcessor - Synchronized Overlap-Add (SOLA) stitching of decoder output slices
+ *
+ * The first padFrames * samplesPerFrame samples of every slice are skipped. For
+ * each slice after the first, the processor searches the samples that follow
+ * for the offset that best correlates with the tail saved from the previous
+ * slice, crossfades the two at that offset, and then appends the rest of the
+ * slice's payload.
+ *
+ * Every read from a slice is bounds-checked: a slice shorter than expected is
+ * handled as if it were zero-padded rather than being read past its end.
+ */
+class SolaProcessor {
+public:
+    /**
+     * Constructor
+     *
+     * @param padFrames Number of padding frames at the beginning and end (negative values are treated as 0)
+     * @param samplesPerFrame Number of audio samples in each frame (negative values are treated as 0)
+     */
+    SolaProcessor(int padFrames, int samplesPerFrame)
+        : pad_frames_(std::max(0, padFrames)), samples_per_frame_(std::max(0, samplesPerFrame)), first_frame_(true)
+    {
+        Initialize();
+    }
+
+    /**
+     * Reset the processor to its initial state
+     */
+    void Reset()
+    {
+        first_frame_ = true;
+        std::fill(sola_buffer_.begin(), sola_buffer_.end(), 0.0f);
+    }
+
+    /**
+     * Process a single audio frame
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param frameIndex Current frame index (currently unused)
+     * @param totalFrames Total number of frames (currently unused)
+     * @param actualFrameLen Actual length of the frame
+     * @return Processed audio samples
+     */
+    std::vector<float> ProcessFrame(const std::vector<float>& decoder_output, int frameIndex, int totalFrames,
+                                    int actualFrameLen)
+    {
+        // Accepted for interface compatibility; the stitching does not depend on them.
+        (void)frameIndex;
+        (void)totalFrames;
+
+        std::vector<float> processed_output;
+
+        if (first_frame_) {
+            // The first frame has no predecessor to align with
+            ProcessFirstFrame(decoder_output, processed_output, actualFrameLen);
+            first_frame_ = false;
+        } else {
+            // Process subsequent frames with SOLA algorithm
+            ProcessSubsequentFrame(decoder_output, processed_output, actualFrameLen);
+        }
+
+        return processed_output;
+    }
+
+private:
+    /**
+     * Clamp value into [0, upper]; callers pass a non-negative upper
+     */
+    static int ClampToRange(int value, int upper)
+    {
+        return std::max(0, std::min(value, upper));
+    }
+
+    /**
+     * Initialize the SOLA processor parameters and buffers
+     */
+    void Initialize()
+    {
+        // Crossfade length and search range both equal the padding length in samples
+        sola_buffer_frame_ = pad_frames_ * samples_per_frame_;
+        sola_search_frame_ = pad_frames_ * samples_per_frame_;
+
+        // Create linear fade-in and fade-out windows
+        fade_in_window_.resize(sola_buffer_frame_);
+        fade_out_window_.resize(sola_buffer_frame_);
+
+        for (int i = 0; i < sola_buffer_frame_; i++) {
+            fade_in_window_[i]  = static_cast<float>(i) / sola_buffer_frame_;
+            fade_out_window_[i] = 1.0f - fade_in_window_[i];
+        }
+
+        // Initialize SOLA buffer
+        sola_buffer_.assign(sola_buffer_frame_, 0.0f);
+    }
+
+    /**
+     * Process the first audio frame
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Output buffer for processed audio
+     * @param actualFrameLen Actual length of the frame
+     */
+    void ProcessFirstFrame(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                           int actualFrameLen)
+    {
+        const int total       = static_cast<int>(decoder_output.size());
+        const int audio_start = std::min(pad_frames_ * samples_per_frame_, total);
+
+        // A frame shorter than its padding has no payload; clamping keeps the
+        // range below from running backwards or past the end of the data
+        const int audio_len =
+            ClampToRange((actualFrameLen - 2 * pad_frames_) * samples_per_frame_, total - audio_start);
+
+        // Add first frame data to output
+        processed_output.insert(processed_output.end(), decoder_output.begin() + audio_start,
+                                decoder_output.begin() + audio_start + audio_len);
+
+        // Save the samples after the payload for alignment with the next frame
+        UpdateSolaBuffer(decoder_output, audio_start + audio_len);
+    }
+
+    /**
+     * Process subsequent audio frames using SOLA algorithm
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Output buffer for processed audio
+     * @param actualFrameLen Actual length of the frame
+     */
+    void ProcessSubsequentFrame(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                                int actualFrameLen)
+    {
+        const int total       = static_cast<int>(decoder_output.size());
+        const int audio_start = std::min(pad_frames_ * samples_per_frame_, total);
+        const int available   = total - audio_start;
+
+        // 1. Prepare search window (zero-padded when the frame is too short to fill it)
+        std::vector<float> search_window(sola_buffer_frame_ + sola_search_frame_, 0.0f);
+        const int copy_len = std::min(available, static_cast<int>(search_window.size()));
+        std::copy(decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + copy_len,
+                  search_window.begin());
+
+        // 2. Find best alignment point, considering only offsets whose crossfade region lies inside the data
+        const int max_offset  = ClampToRange(available - sola_buffer_frame_, sola_search_frame_);
+        const int best_offset = FindBestOffset(search_window, max_offset);
+
+        // 3. Apply alignment offset
+        const int aligned_start = audio_start + best_offset;
+
+        // 4. Create smooth transition
+        const std::vector<float> crossfade_region = CreateCrossfade(decoder_output, aligned_start);
+
+        // 5. Add crossfade region to output
+        processed_output.insert(processed_output.end(), crossfade_region.begin(), crossfade_region.end());
+
+        // 6. Add remaining valid audio data
+        AddRemainingAudio(decoder_output, processed_output, aligned_start, actualFrameLen);
+    }
+
+    /**
+     * Find the best alignment offset using cross-correlation normalized by the window energy
+     *
+     * @param search_window Window of audio samples to search in; must hold at least
+     *                      sola_buffer_frame_ + max_offset samples
+     * @param max_offset Largest offset to consider (inclusive)
+     * @return Optimal offset for alignment
+     */
+    int FindBestOffset(const std::vector<float>& search_window, int max_offset) const
+    {
+        int best_offset = 0;
+        // The score is only divided by the window's energy, so it is not confined to [-1, 1];
+        // start from the lowest float so that the best candidate wins even when all scores are very negative
+        float best_correlation = std::numeric_limits<float>::lowest();
+
+        for (int offset = 0; offset <= max_offset; offset++) {
+            float correlation = 0.0f;
+            float energy      = 0.0f;
+
+            for (int j = 0; j < sola_buffer_frame_; j++) {
+                correlation += sola_buffer_[j] * search_window[j + offset];
+                energy += search_window[j + offset] * search_window[j + offset];
+            }
+
+            // Normalize correlation (a silent window scores 0)
+            float normalized_correlation = (energy > 1e-8) ? correlation / std::sqrt(energy) : 0.0f;
+
+            if (normalized_correlation > best_correlation) {
+                best_correlation = normalized_correlation;
+                best_offset      = offset;
+            }
+        }
+
+        return best_offset;
+    }
+
+    /**
+     * Create crossfade transition region
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param aligned_start Starting point after alignment (non-negative)
+     * @return Crossfaded audio samples
+     */
+    std::vector<float> CreateCrossfade(const std::vector<float>& decoder_output, int aligned_start) const
+    {
+        const int total = static_cast<int>(decoder_output.size());
+        std::vector<float> crossfade_region(sola_buffer_frame_);
+
+        for (int j = 0; j < sola_buffer_frame_; j++) {
+            // Samples past the end of the frame count as silence
+            const int index      = aligned_start + j;
+            const float incoming = (index < total) ? decoder_output[index] : 0.0f;
+
+            // Apply fade-in and fade-out window functions
+            crossfade_region[j] = incoming * fade_in_window_[j] + sola_buffer_[j] * fade_out_window_[j];
+        }
+
+        return crossfade_region;
+    }
+
+    /**
+     * Add remaining audio data and update buffer
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param processed_output Output buffer for processed audio
+     * @param aligned_start Starting point after alignment
+     * @param actualFrameLen Actual length of the frame
+     */
+    void AddRemainingAudio(const std::vector<float>& decoder_output, std::vector<float>& processed_output,
+                           int aligned_start, int actualFrameLen)
+    {
+        const int total           = static_cast<int>(decoder_output.size());
+        const int remaining_start = std::min(aligned_start + sola_buffer_frame_, total);
+
+        // The crossfade already covered sola_buffer_frame_ samples of the payload; clamp what is left to the data
+        const int remaining_len = ClampToRange(
+            (actualFrameLen - 2 * pad_frames_) * samples_per_frame_ - sola_buffer_frame_, total - remaining_start);
+
+        processed_output.insert(processed_output.end(), decoder_output.begin() + remaining_start,
+                                decoder_output.begin() + remaining_start + remaining_len);
+
+        // Update SOLA buffer
+        UpdateSolaBuffer(decoder_output, remaining_start + remaining_len);
+    }
+
+    /**
+     * Update SOLA buffer with new audio data
+     *
+     * @param decoder_output Raw audio data from decoder
+     * @param buffer_start Starting point for the new buffer data
+     */
+    void UpdateSolaBuffer(const std::vector<float>& decoder_output, int buffer_start)
+    {
+        const int total = static_cast<int>(decoder_output.size());
+        const int start = ClampToRange(buffer_start, total);
+        const int count = std::min(sola_buffer_frame_, total - start);
+
+        // Copy what is available and fill the rest with zeros
+        std::copy(decoder_output.begin() + start, decoder_output.begin() + start + count, sola_buffer_.begin());
+        std::fill(sola_buffer_.begin() + count, sola_buffer_.end(), 0.0f);
+    }
+
+private:
+    int pad_frames_;         // Number of padding frames
+    int samples_per_frame_;  // Number of samples per frame
+    int sola_buffer_frame_;  // SOLA buffer length
+    int sola_search_frame_;  // SOLA search window length
+
+    std::vector<float> fade_in_window_;   // Fade-in window
+    std::vector<float> fade_out_window_;  // Fade-out window
+    std::vector<float> sola_buffer_;      // SOLA buffer
+
+    bool first_frame_;  // Flag for first frame processing
+};
+
+#endif  // SOLA_PROCESSOR_H

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,7 @@ class Lexicon {`
`32`	`32`	`public:`
`33`	`33`	`Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)`
`34`	`34`	`{`
	`35`	`+ SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename);`
`35`	`36`	`std::unordered_map<std::string, int> tokens;`
`36`	`37`	`std::ifstream ifs(tokens_filename);`
`37`	`38`	`assert(ifs.is_open());`