import logging
import os

import gradio as gr
import librosa
import numpy as np
import torch

# Build the monotonic_align Cython extension in place. This must happen before
# the project imports below, since `models` depends on the built extension.
# NOTE(review): os.system with a fixed shell string; subprocess.run([...],
# shell=False) would be safer, kept as-is since the command is constant.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

# Silence numba's verbose logging (pulled in transitively via librosa).
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)

# Project-local modules (require the monotonic_align build above).
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
def resize2d(source, target_len):
    """Linearly resample a 1-D f0 contour to `target_len` points.

    Values below 0.001 (unvoiced frames) are treated as NaN during
    interpolation and mapped back to 0 in the output.

    Args:
        source: 1-D array-like of f0 values (length n). Not modified.
        target_len: desired number of output samples.

    Returns:
        np.ndarray of length `target_len`, NaNs replaced with 0.
    """
    # Work on a float copy: the original implementation clobbered the
    # caller's array in place (and would fail outright on integer dtypes,
    # which cannot hold NaN).
    contour = np.array(source, dtype=float)
    contour[contour < 0.001] = np.nan
    n = len(contour)
    # target_len query positions spaced n/target_len apart in source index space.
    positions = np.arange(0, n * target_len, n) / target_len
    resized = np.interp(positions, np.arange(0, n), contour)
    return np.nan_to_num(resized)
def convert_wav_22050_to_f0(audio):
    """Estimate an f0 contour for a 22050 Hz waveform via librosa's pyin.

    Unvoiced frames (NaN or non-positive pitch estimates) are zeroed.
    """
    pitch = librosa.pyin(
        audio,
        fmin=librosa.note_to_hz('C0'),
        fmax=librosa.note_to_hz('C7'),
        frame_length=1780,
    )[0]
    # pyin marks unvoiced frames as NaN; `NaN > 0` is False, so both NaN and
    # non-positive estimates fall through to 0.
    return np.where(pitch > 0, pitch, 0.0)
30+
def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol ids per the hps config."""
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank tokens (id 0) between symbols.
        sequence = commons.intersperse(sequence, 0)
    tensor = torch.LongTensor(sequence)
    print(tensor.shape)
    return tensor
38+
39+
# --- Module-level model construction (runs at import time) ---
# The LJSpeech config is used for hop_length and the output sampling rate;
# the synthesizer itself is built from the VCTK multi-speaker config.
hps = utils .get_hparams_from_file ("configs/ljs_base.json" )
hps_ms = utils .get_hparams_from_file ("configs/vctk_base.json" )
# NOTE(review): the third argument mixes hps_ms.train.segment_size with
# hps.data.hop_length — presumably both configs share the same hop_length;
# confirm against configs/ljs_base.json and configs/vctk_base.json.
net_g_ms = SynthesizerTrn (
len (symbols ),
hps_ms .data .filter_length // 2 + 1 ,
hps_ms .train .segment_size // hps .data .hop_length ,
n_speakers = hps_ms .data .n_speakers ,
** hps_ms .model )

# NOTE(review): numpy is imported mid-module here, after resize2d and
# convert_wav_22050_to_f0 are defined; this works because `np` is only
# looked up when those functions run, but it belongs at the top of the file.
import numpy as np

# Pre-trained HuBERT soft-unit encoder (downloaded via torch.hub on first run).
hubert = torch .hub .load ("bshall/hubert:main" , "hubert_soft" )

# Load generator weights; return value (iteration info) intentionally discarded.
_ = utils .load_checkpoint ("G_312000.pth" , net_g_ms , None )
54+
def vc_fn(input_audio, vc_transform):
    """Gradio callback: voice-convert an uploaded clip through the VITS model.

    Args:
        input_audio: (sampling_rate, np.ndarray) tuple from gr.Audio, or None.
        vc_transform: multiplicative factor applied to the extracted f0.

    Returns:
        Tuple of (status message, (sampling_rate, audio ndarray) or None).
    """
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "Error: Audio is too long", None

    # Normalize to float32 in [-1, 1]. gr.Audio typically yields int16, but
    # may deliver float arrays, on which np.iinfo would raise a ValueError.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)
    # Down-mix multi-channel input; librosa.to_mono expects (channels, samples).
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # HuBERT expects 16 kHz input.
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    # f0 is extracted at 22050 Hz to match the synthesizer's frame layout.
    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
    f0 = convert_wav_22050_to_f0(audio22050)

    # HuBERT soft units from the 16 kHz waveform; input shaped (1, 1, T).
    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
    with torch.inference_mode():
        units = hubert.units(source)
        soft = units.squeeze(0).numpy()

    # Resample f0 to the unit frame rate, apply the pitch transform, and pack
    # the scaled contour into feature channel 0 as the model expects.
    f0 = resize2d(f0, len(soft[:, 0])) * vc_transform
    soft[:, 0] = f0 / 10
    sid = torch.LongTensor([0])
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.1,
                               noise_scale_w=0.1, length_scale=1)[0][
            0, 0].data.float().numpy()

    return "Success", (hps.data.sampling_rate, audio)
89+
90+
91+
# Build and launch the Gradio UI. Components are created in display order
# inside the tab, so the creation sequence below defines the layout.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            audio_in = gr.Audio(label="Input Audio (30s limitation)")
            pitch_factor = gr.Number(label="transform", value=1.0)
            convert_btn = gr.Button("Convert", variant="primary")
            status_box = gr.Textbox(label="Output Message")
            audio_out = gr.Audio(label="Output Audio")
            # Wire the button to the conversion callback.
            convert_btn.click(vc_fn, [audio_in, pitch_factor],
                              [status_box, audio_out])

app.launch()
# (removed copied web-page artifact: "0 commit comments")