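upload files: switch VC_inference.py audio loading/resampling from torchaudio to librosa, import spectrogram_torch from mel_processing instead of models_infer, and add librosa to requirements_infer.txt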

Plachtaa · Plachtaa · commit 2834647ec55c · 2023-02-15T16:18:49.000+08:00
diff --git a/VC_inference.py b/VC_inference.py
@@ -3,11 +3,11 @@
 import torch
 from torch import no_grad, LongTensor
 import argparse
-from models_infer import spectrogram_torch
+from mel_processing import spectrogram_torch
 import utils
 from models_infer import SynthesizerTrn
 import gradio as gr
-import torchaudio
+import librosa
 import webbrowser
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
@@ -20,15 +20,16 @@ def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
         original_speaker_id = speaker_ids[original_speaker]
         target_speaker_id = speaker_ids[target_speaker]
 
-        audio = torch.tensor(audio).type(torch.float32)
-        audio = audio.squeeze().unsqueeze(0)
-        audio = audio / max(-audio.min(), audio.max()) / 0.99
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
         if sampling_rate != hps.data.sampling_rate:
-            audio = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=22050)(audio)
+            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
         with no_grad():
             y = torch.FloatTensor(audio)
             y = y / max(-y.min(), y.max()) / 0.99
             y = y.to(device)
+            y = y.unsqueeze(0)
             spec = spectrogram_torch(y, hps.data.filter_length,
                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                      center=False).to(device)
diff --git a/models_infer.py b/models_infer.py
@@ -400,24 +400,3 @@ def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
     o_hat = self.dec(z_hat * y_mask, g=g_tgt)
     return o_hat, y_mask, (z, z_p, z_hat)
 
-def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
-  if torch.min(y) < -1.:
-    print('min value is ', torch.min(y))
-  if torch.max(y) > 1.:
-    print('max value is ', torch.max(y))
-
-  global hann_window
-  dtype_device = str(y.dtype) + '_' + str(y.device)
-  wnsize_dtype_device = str(win_size) + '_' + dtype_device
-  if wnsize_dtype_device not in hann_window:
-    hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
-
-  y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
-                              mode='reflect')
-  y = y.squeeze(1)
-
-  spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                    center=center, pad_mode='reflect', normalized=False, onesided=True)
-
-  spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-  return spec
diff --git a/requirements_infer.txt b/requirements_infer.txt
@@ -1,4 +1,5 @@
 Cython
+librosa
 numpy
 scipy
 torch

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`Cython`
	`2`	`+librosa`
`2`	`3`	`numpy`
`3`	`4`	`scipy`
`4`	`5`	`torch`