Skip to content

Commit e1700a8

Browse files
committed
update README.md
1 parent 21f8509 commit e1700a8

File tree

10 files changed

+245
-92
lines changed

10 files changed

+245
-92
lines changed

README.md

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,63 @@
11
# SoftVC VITS Singing Voice Conversion
2-
## 重大BUG修复
3-
+ 断音问题已解决,音质提升了一个档次
4-
+ 2.0版本已经移至 sovits_2.0分支
5-
+ 3.0版本使用FreeVC的代码结构,与旧版本不通用
6-
2+
## Update
3+
> 断音问题已解决,音质提升了一个档次\
4+
> 2.0版本已经移至 sovits_2.0分支\
5+
> 3.0版本使用FreeVC的代码结构,与旧版本不通用
76
## 模型简介
8-
歌声音色转换模型,通过SoftVC内容编码器提取源音频语音特征,与F0同时输入VITS替换原本的文本输入达到歌声转换的效果。
9-
> 目前模型使用 [coarse F0](https://github.com/PlayVoice/VI-SVC/blob/main/svc/prepare/preprocess_wave.py);曾尝试使用 [HarmoF0](https://github.com/wx-wei/harmof0) 进行f0提取,但效果不佳;也曾尝试使用 [icassp2022-vocal-transcription](https://github.com/keums/icassp2022-vocal-transcription) 提取midi替换f0输入,但效果同样不佳
7+
歌声音色转换模型,通过SoftVC内容编码器提取源音频语音特征,与F0同时输入VITS替换原本的文本输入达到歌声转换的效果。同时,更换声码器为 [NSF HiFiGAN](https://github.com/openvpi/DiffSinger/tree/refactor/modules/nsf_hifigan) 解决断音问题
8+
9+
## 数据集准备
10+
```shell
11+
仅需按照以下文件结构将数据集放入raw目录即可
12+
13+
raw
14+
├───speaker0
15+
│ ├───xxx1-xxx1.wav
16+
│ ├───...
17+
│ └───Lxx-0xx8.wav
18+
└───speaker1
19+
├───xx2-0xxx2.wav
20+
├───...
21+
└───xxx7-xxx007.wav
22+
```
23+
24+
## 预先下载的模型文件
25+
+ soft vc hubert:[hubert-soft-0d54a1f4.pt](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt)
26+
+ 放在hubert目录下
27+
+ 预训练模型文件 [G_0.pth D_0.pth](https://)
28+
+ 放在logs/48k 目录下
29+
+ 预训练模型为必选项,因为据测试从零开始训练有概率不收敛,同时也能加快训练速度
30+
+ 预训练模型删除了 optimizer、flow、speaker embedding 等无关权重,因此可以认为基本剔除了旧的音色信息
31+
```shell
32+
# 一键下载
33+
# hubert
34+
wget -P hubert/ https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt
35+
# G与D预训练模型
36+
wget -P logs/48k/ https://
37+
wget -P logs/48k/ https://
38+
39+
```
40+
## 数据预处理
41+
1. 重采样至 48kHz
42+
43+
```shell
44+
python resample.py
45+
```
46+
2. 自动划分训练集、验证集、测试集,并自动生成配置文件
47+
```shell
48+
python preprocess_flist_config.py
49+
```
50+
3. 生成hubert与f0
51+
```shell
52+
python preprocess_hubert_f0.py
53+
```
54+
执行完以上步骤后 dataset 目录便是预处理完成的数据,可以删除raw文件夹了
55+
56+
## 训练
57+
```shell
58+
python train.py -c configs/config.json -m 48k
59+
```
60+
61+
## 推理
1062

11-
模型推理、训练、一键脚本汇总整理仓库 [sovits_guide](https://github.com/IceKyrin/sovits_guide)
1263

configs/config.json

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
"eval_interval": 200,
55
"seed": 1234,
66
"epochs": 10000,
7-
"learning_rate": 2e-4,
8-
"betas": [0.8, 0.99],
9-
"eps": 1e-9,
7+
"learning_rate": 0.0002,
8+
"betas": [
9+
0.8,
10+
0.99
11+
],
12+
"eps": 1e-09,
1013
"batch_size": 16,
1114
"fp16_run": false,
1215
"lr_decay": 0.999875,
@@ -20,8 +23,8 @@
2023
"port": "8001"
2124
},
2225
"data": {
23-
"training_files":"filelists/train.txt",
24-
"validation_files":"filelists/val.txt",
26+
"training_files": "filelists/train.txt",
27+
"validation_files": "filelists/val.txt",
2528
"max_wav_value": 32768.0,
2629
"sampling_rate": 48000,
2730
"filter_length": 1280,
@@ -40,19 +43,49 @@
4043
"kernel_size": 3,
4144
"p_dropout": 0.1,
4245
"resblock": "1",
43-
"resblock_kernel_sizes": [3,7,11],
44-
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45-
"upsample_rates": [10,8,2,2],
46+
"resblock_kernel_sizes": [
47+
3,
48+
7,
49+
11
50+
],
51+
"resblock_dilation_sizes": [
52+
[
53+
1,
54+
3,
55+
5
56+
],
57+
[
58+
1,
59+
3,
60+
5
61+
],
62+
[
63+
1,
64+
3,
65+
5
66+
]
67+
],
68+
"upsample_rates": [
69+
10,
70+
8,
71+
2,
72+
2
73+
],
4674
"upsample_initial_channel": 512,
47-
"upsample_kernel_sizes": [16,16,4,4],
75+
"upsample_kernel_sizes": [
76+
16,
77+
16,
78+
4,
79+
4
80+
],
4881
"n_layers_q": 3,
4982
"use_spectral_norm": false,
5083
"gin_channels": 256,
51-
"ssl_dim": 256
84+
"ssl_dim": 256,
85+
"n_speakers": 4
5286
},
53-
"spk":{
54-
"nen": 0,
55-
"paimon": 1,
56-
"yunhao": 2
87+
"spk": {
88+
"paimon": 0,
89+
"nen": 1
5790
}
58-
}
91+
}

filelists/test.txt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
2-
./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav
3-
./dataset/48k/nen/kne110_005.wav
4-
./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav
5-
./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav
6-
./dataset/48k/nen/kne110_003.wav
7-
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
8-
./dataset/48k/nen/kne110_004.wav
1+
./dataset/48k/nen/kne110_006.wav
92
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
3+
./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav
104
./dataset/48k/nen/kne110_001.wav
11-
./dataset/48k/nen/kne110_006.wav
5+
./dataset/48k/nen/kne110_003.wav
126
./dataset/48k/nen/kne110_002.wav
7+
./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav
8+
./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav
9+
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
10+
./dataset/48k/nen/kne110_004.wav
11+
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
12+
./dataset/48k/nen/kne110_005.wav

filelists/val.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
1+
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
22
./dataset/48k/nen/kne110_006.wav
3-
./dataset/48k/nen/kne110_002.wav
4-
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
3+
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
4+
./dataset/48k/nen/kne110_003.wav

models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ def __init__(self,
280280
upsample_kernel_sizes,
281281
gin_channels,
282282
ssl_dim,
283+
n_speakers,
283284
**kwargs):
284285

285286
super().__init__()
@@ -300,7 +301,7 @@ def __init__(self,
300301
self.segment_size = segment_size
301302
self.gin_channels = gin_channels
302303
self.ssl_dim = ssl_dim
303-
self.emb_g = nn.Embedding(10, gin_channels)
304+
self.emb_g = nn.Embedding(n_speakers, gin_channels)
304305

305306
self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
306307
hps = {

preprocess_flist.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

preprocess_flist_config.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import os
2+
import argparse
3+
from tqdm import tqdm
4+
from random import shuffle
5+
import json
6+
# Default training configuration. The script fills in "model.n_speakers" and
# "spk" from the scanned dataset before dumping this dict to
# configs/config.json, so the values given for those two keys here are only
# placeholders.
config_template = {
    # Optimizer, schedule and logging settings.
    "train": {
        "log_interval": 200,
        "eval_interval": 200,
        "seed": 1234,
        "epochs": 10000,
        "learning_rate": 2e-4,
        "betas": [0.8, 0.99],
        "eps": 1e-9,
        "batch_size": 16,
        "fp16_run": False,
        "lr_decay": 0.999875,
        "segment_size": 17920,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "use_sr": True,
        "max_speclen": 384,
        "port": "8001",
    },
    # File lists and audio / spectrogram parameters.
    "data": {
        "training_files": "filelists/train.txt",
        "validation_files": "filelists/val.txt",
        "max_wav_value": 32768.0,
        "sampling_rate": 48000,
        "filter_length": 1280,
        "hop_length": 320,
        "win_length": 1280,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": None,
    },
    # Network hyper-parameters.
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        # Three independent copies of the same dilation pattern.
        "resblock_dilation_sizes": [[1, 3, 5] for _ in range(3)],
        "upsample_rates": [10, 8, 2, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "n_layers_q": 3,
        "use_spectral_norm": False,
        "gin_channels": 256,
        "ssl_dim": 256,
        # Placeholder; replaced by the script before the config is written.
        "n_speakers": 0,
    },
    # Speaker name -> integer id. Placeholder mapping; replaced by the script
    # with the speakers actually found in the dataset directory.
    "spk": {
        "nen": 0,
        "paimon": 1,
        "yunhao": 2,
    },
}
65+
66+
67+
def split_speaker_wavs(wavs, n_val=2, n_test=10):
    """Split one speaker's clip list into (train, val, test) without overlap.

    The first ``n_val`` items become the validation split, the last
    ``n_test`` of the remaining items become the test split, and whatever is
    left in between is the training split. When ``len(wavs) >= n_val +
    n_test`` this equals ``wavs[n_val:-n_test]``, ``wavs[:n_val]`` and
    ``wavs[-n_test:]``. For shorter lists the test split shrinks instead of
    re-using validation clips.

    Args:
        wavs: Ordered list of clip paths (the caller shuffles beforehand).
        n_val: Number of clips reserved for validation.
        n_test: Maximum number of clips reserved for testing.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    val = wavs[:n_val]
    rest = wavs[n_val:]
    # Clamp at 0 so short lists give an empty train split, not a bad slice.
    cut = max(len(rest) - n_test, 0)
    return rest[:cut], val, rest[cut:]


def write_filelist(path, wav_paths):
    """Write ``wav_paths`` to ``path``, one per line, creating its directory."""
    print("Writing", path)
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, "w") as f:
        for wav_path in tqdm(wav_paths):
            f.write(wav_path + "\n")


def main():
    """Scan the dataset directory, write the file lists and the config.

    Every sub-directory of ``--source_dir`` is treated as one speaker. Each
    speaker's clips are shuffled and split, the merged splits are shuffled
    again and written to the train/val/test list files, and
    ``configs/config.json`` is generated from ``config_template`` with the
    detected speaker map.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
    parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
    parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list")
    parser.add_argument("--source_dir", type=str, default="./dataset/48k", help="path to source dir")
    args = parser.parse_args()

    train = []
    val = []
    test = []
    spk_dict = {}
    for speaker in tqdm(os.listdir(args.source_dir)):
        speaker_dir = os.path.join(args.source_dir, speaker)
        # Stray files next to the speaker folders are not speakers; listing
        # them as directories would raise.
        if not os.path.isdir(speaker_dir):
            continue
        # Ids are assigned in directory-listing order, starting at 0.
        spk_dict[speaker] = len(spk_dict)
        wavs = [
            os.path.join(speaker_dir, name)
            for name in os.listdir(speaker_dir)
            if name.endswith("wav")
        ]
        shuffle(wavs)
        spk_train, spk_val, spk_test = split_speaker_wavs(wavs)
        if not spk_train:
            print("Warning: speaker", speaker, "has only", len(wavs),
                  "wav files; no files left for the training split")
        train += spk_train
        val += spk_val
        test += spk_test

    # The embedding size written to the config is twice the number of
    # speakers found (presumably headroom for adding speakers later -- confirm).
    n_speakers = len(spk_dict) * 2
    shuffle(train)
    shuffle(val)
    shuffle(test)

    write_filelist(args.train_list, train)
    write_filelist(args.val_list, val)
    write_filelist(args.test_list, test)

    config_template["model"]["n_speakers"] = n_speakers
    config_template["spk"] = spk_dict
    print("Writing configs/config.json")
    os.makedirs("configs", exist_ok=True)
    with open("configs/config.json", "w") as f:
        json.dump(config_template, f, indent=2)


if __name__ == "__main__":
    main()

preprocess_hubert_f0.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def process(filename):
108108
c = torch.load(save_name)
109109
f0path = filename+".f0.npy"
110110
if not os.path.exists(f0path):
111-
cf0, f0 = compute_f0(filename, c.shape[-1] * 3)
111+
cf0, f0 = get_f0(filename, c.shape[-1] * 3)
112112
np.save(f0path, f0)
113113

114114

File renamed without changes.

0 commit comments

Comments
 (0)