Skip to content
This repository was archived by the owner on Oct 19, 2024. It is now read-only.

Commit 2097e0d

Browse files
committed
initial commit
0 parents  commit 2097e0d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+25744
-0
lines changed

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2021 Jaehyeon Kim
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# soft-vc-singingvc
2+

app.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import gradio as gr
import os

# Build the monotonic_align Cython extension in place before the model
# modules that rely on it are imported.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import logging

# numba logs verbosely at INFO level; raise its threshold to WARNING.
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)

import librosa
import torch

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
18+
def resize2d(source, target_len):
    """Linearly resample a 1-D f0 contour to ``target_len`` frames.

    Values below 0.001 (unvoiced / silent frames) are masked to NaN before
    interpolation and come back as 0 in the output.

    Args:
        source: 1-D array-like of f0 values.
        target_len: desired number of output frames.

    Returns:
        np.ndarray of length ``target_len`` with NaNs replaced by 0.
    """
    # Fix: work on a float copy — the original wrote NaNs into the caller's
    # array in place (mutation of a shared argument).
    source = np.array(source, dtype=np.float64)
    source[source < 0.001] = np.nan
    # target_len sample positions spread evenly over [0, len(source)).
    xs = np.arange(0, len(source) * target_len, len(source)) / target_len
    target = np.interp(xs, np.arange(0, len(source)), source)
    return np.nan_to_num(target)
22+
def convert_wav_22050_to_f0(audio):
    """Estimate the per-frame f0 contour of a 22050 Hz waveform.

    Uses librosa's probabilistic YIN (pyin); unvoiced frames, which pyin
    reports as NaN, are mapped to 0.

    Args:
        audio: 1-D float waveform, assumed to be sampled at 22050 Hz.

    Returns:
        np.ndarray of f0 values in Hz, 0 where unvoiced.
    """
    f0_raw = librosa.pyin(audio,
                          fmin=librosa.note_to_hz('C0'),
                          fmax=librosa.note_to_hz('C7'),
                          frame_length=1780)[0]
    # Fix: pyin marks unvoiced frames as NaN, and `NaN > 0` raises a numpy
    # RuntimeWarning; clear the NaNs first, then keep positive values only.
    f0_raw = np.nan_to_num(f0_raw)
    return np.where(f0_raw > 0, f0_raw, 0.0)
30+
31+
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids using the config's cleaners."""
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    # Interleave blank tokens between symbols when the model expects them.
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    tensor = torch.LongTensor(sequence)
    print(tensor.shape)
    return tensor
38+
39+
40+
# ---- One-time model / checkpoint setup (runs at import time) ----

import numpy as np

# Two hyper-parameter sets: the LJSpeech config supplies data parameters
# such as the output sampling rate; the VCTK config describes the
# multi-speaker synthesizer itself.
hps = utils.get_hparams_from_file("configs/ljs_base.json")
hps_ms = utils.get_hparams_from_file("configs/vctk_base.json")

# Multi-speaker synthesizer built from the VCTK config.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model,
)

# Soft-HuBERT content encoder fetched via torch.hub.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")

# Load generator weights; no optimizer state is needed for inference.
_ = utils.load_checkpoint("G_312000.pth", net_g_ms, None)
54+
55+
def vc_fn(input_audio, vc_transform):
    """Gradio callback: run soft-VC voice conversion on an uploaded recording.

    Args:
        input_audio: (sampling_rate, waveform ndarray) tuple from gr.Audio,
            or None when nothing was uploaded.
        vc_transform: multiplicative factor applied to the extracted f0.

    Returns:
        (message, audio) where audio is a (sampling_rate, waveform) tuple
        on success, or None on error.
    """
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "Error: Audio is too long", None

    # Fix: gr.Audio may deliver float arrays, and np.iinfo() raises
    # ValueError on float dtypes — only rescale integer PCM to [-1, 1].
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio / np.iinfo(audio.dtype).max
    audio = audio.astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    # Extract the pitch contour from a 22050 Hz copy of the signal.
    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
    f0 = convert_wav_22050_to_f0(audio22050)

    # HuBERT content units from the 16 kHz signal (batch, channel dims added).
    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
    print(source.shape)
    with torch.inference_mode():
        units = hubert.units(source)
        soft = units.squeeze(0).numpy()
    print(sampling_rate)

    # Match the f0 contour to the unit frame count, apply the pitch
    # transform, and inject it into the first unit dimension (scaled by 10).
    f0 = resize2d(f0, len(soft[:, 0])) * vc_transform
    soft[:, 0] = f0 / 10

    sid = torch.LongTensor([0])
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g_ms.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=0.1, noise_scale_w=0.1, length_scale=1,
        )[0][0, 0].data.float().numpy()

    return "Success", (hps.data.sampling_rate, audio)
89+
90+
91+
92+
# ---- Gradio UI ----
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
            vc_transform = gr.Number(label="transform", value=1.0)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
            # Wire the button to the conversion callback.
            vc_submit.click(
                vc_fn,
                [vc_input3, vc_transform],
                [vc_output1, vc_output2],
            )

app.launch()

0 commit comments

Comments (0)