33import torch
44from torch import no_grad , LongTensor
55import argparse
6- from models_infer import spectrogram_torch
6+ from mel_processing import spectrogram_torch
77import utils
88from models_infer import SynthesizerTrn
99import gradio as gr
10- import torchaudio
10+ import librosa
1111import webbrowser
1212device = "cuda:0" if torch .cuda .is_available () else "cpu"
1313
@@ -20,15 +20,16 @@ def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
2020 original_speaker_id = speaker_ids [original_speaker ]
2121 target_speaker_id = speaker_ids [target_speaker ]
2222
23- audio = torch . tensor (audio ). type ( torch .float32 )
24- audio = audio .squeeze (). unsqueeze ( 0 )
25- audio = audio / max ( - audio .min (), audio . max ()) / 0.99
23+ audio = ( audio / np . iinfo (audio . dtype ). max ). astype ( np .float32 )
24+ if len ( audio .shape ) > 1 :
25+ audio = librosa . to_mono ( audio .transpose ( 1 , 0 ))
2626 if sampling_rate != hps .data .sampling_rate :
27- audio = torchaudio . transforms . Resample ( orig_freq = sampling_rate , new_freq = 22050 )( audio )
27+ audio = librosa . resample ( audio , orig_sr = sampling_rate , target_sr = hps . data . sampling_rate )
2828 with no_grad ():
2929 y = torch .FloatTensor (audio )
3030 y = y / max (- y .min (), y .max ()) / 0.99
3131 y = y .to (device )
32+ y = y .unsqueeze (0 )
3233 spec = spectrogram_torch (y , hps .data .filter_length ,
3334 hps .data .sampling_rate , hps .data .hop_length , hps .data .win_length ,
3435 center = False ).to (device )
0 commit comments