Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions diffsynth_engine/pipelines/wan_s2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ def predict_noise(
drop_motion_frames=drop_motion_frames,
audio_mask=audio_mask,
void_audio_input=void_audio_input,
attn_kwargs=attn_kwargs,
#attn_kwargs=attn_kwargs, # I have raised issue for this, should be fixed in wan_s2v_dit.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This line is commented out with a note about an issue. While this is informative, it's generally better to remove dead code from the repository. The associated issue (#221) and version control history are the best places to track this. If you must keep it as a reminder, consider formatting it as a TODO comment for better visibility for developers and tools.

Suggested change
#attn_kwargs=attn_kwargs, # I have raised issue for this, should be fixed in wan_s2v_dit.py
# TODO(#221): Re-enable `attn_kwargs` once the TypeError in `WanS2VDiT.forward` is fixed.

)
return noise_pred

Expand All @@ -418,13 +418,14 @@ def __call__(
self,
audio: torch.Tensor,
prompt: str,
fps: int | None = 16, # fps of the generated video
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

There are a couple of issues with the new fps parameter:

  1. It's not used: The parameter is not used within the __call__ method. The pipeline will continue to use self.config.fps, making this new feature non-functional. You should use the fps parameter to temporarily update self.config.fps for the duration of this call. Using a try...finally block is a robust way to ensure the original configuration is restored, even if errors occur.

  2. Ambiguous type hint: The type hint int | None = 16 is confusing. It allows None but defaults to an integer. For better clarity, consider:

    • fps: int = 16 if fps must always be an integer.
    • fps: int | None = None if None is meant to signal using the default from the configuration. You would then handle the None case inside the method.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hrudaykolla As mentioned in this comment, the fps parameter is added but never used, which will cause the generated video and audio to be out of sync. This needs to be fixed.

negative_prompt: str = "",
cfg_scale: float | None = None,
num_inference_steps: int | None = None,
seed: int | None = None,
height: int | None = None,
width: int | None = None,
num_frames_per_clip: int = 80,
num_frames_per_clip: int = 80, # number of frames per clip and fps together determine the duration of each clip
ref_image: Image.Image | None = None,
pose_video: List[Image.Image] | None = None,
pose_video_fps: int | None = None,
Expand Down
14 changes: 9 additions & 5 deletions examples/wan_speech_to_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@ def wan_rs2v(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
audio_path = f"{input_data_dir}/sing.mp3"
audio = librosa.load(audio_path, sr=16000)[0]
audio = torch.from_numpy(audio)[None] # (1, audio_len)
fps_of_generated_video = 12 # The fps of the generated video, default is 16
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The value 12 for fps_of_generated_video is repeated across multiple functions in this example file. To improve maintainability and avoid magic numbers, consider defining it as a constant at the module level, for example: EXAMPLE_VIDEO_FPS = 12.

frames = pipe(
ref_image=Image.open(f"{input_data_dir}/woman.png").convert("RGB"),
audio=audio,
fps=fps_of_generated_video,
prompt="画面清晰,视频中,一个女人正在唱歌,表情动作十分投入",
negative_prompt="画面模糊,最差质量,画面模糊,细节模糊不清,情绪激动剧烈,手快速抖动,字幕,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
cfg_scale=4.5,
Expand All @@ -24,14 +26,15 @@ def wan_rs2v(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
num_clips=3,
ref_as_first_frame=True,
)
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rs2v.mp4")
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rs2v.mp4", fps=fps_of_generated_video)


def wan_rsp2v(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
audio_path = f"{input_data_dir}/sing.mp3"
audio = librosa.load(audio_path, sr=16000)[0]
audio = torch.from_numpy(audio)[None] # (1, audio_len)
pose_video = load_video(f"{input_data_dir}/pose.mp4")
fps_of_generated_video = 12 # The fps of the generated video, default is 16
frames = pipe(
ref_image=Image.open(f"{input_data_dir}/pose.png").convert("RGB"),
audio=audio,
Expand All @@ -46,15 +49,16 @@ def wan_rsp2v(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
num_clips=2,
ref_as_first_frame=False,
)
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rsp2v.mp4")
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rsp2v.mp4", fps=fps_of_generated_video)


def wan_rs2v_multi_speaker(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
audio_path = f"{input_data_dir}/sing2.mp3"
audio = librosa.load(audio_path, sr=16000)[0]
audio = load_audio(audio_path, sr=16000)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The function load_audio is used here, but it's not defined or imported in this file, which will cause a NameError. You likely intended to use librosa.load as you did before this change and in other functions in this file.

Suggested change
audio = load_audio(audio_path, sr=16000)
audio = librosa.load(audio_path, sr=16000)[0]

audio = torch.from_numpy(audio)[None] # (1, audio_len)
void_audio = librosa.load(f"{input_data_dir}/void_audio.mp3", sr=16000)[0]
void_audio = load_audio(f"{input_data_dir}/void_audio.mp3", sr=16000)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Similar to the line above, load_audio is not defined. This should probably be a call to librosa.load.

Suggested change
void_audio = load_audio(f"{input_data_dir}/void_audio.mp3", sr=16000)
void_audio = librosa.load(f"{input_data_dir}/void_audio.mp3", sr=16000)[0]

void_audio = torch.from_numpy(void_audio)[None] # (1, void_audio_len)
fps_of_generated_video = 12 # The fps of the generated video, default is 16
frames = pipe(
ref_image=Image.open(f"{input_data_dir}/2girl.png").convert("RGB"),
audio=audio,
Expand All @@ -70,7 +74,7 @@ def wan_rs2v_multi_speaker(pipe: WanSpeech2VideoPipeline, input_data_dir: str):
num_clips=2,
ref_as_first_frame=False,
)
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rs2v_multi_speaker.mp4")
save_video_with_audio(frames, audio_path=audio_path, target_video_path="wan_rs2v_multi_speaker.mp4", fps=fps_of_generated_video)


if __name__ == "__main__":
Expand Down