-
Notifications
You must be signed in to change notification settings - Fork 24
训练模型出现以下报错怎么办 #178
Description
Active code page: 65001
即将开始训练...
INFO:44k:{'train': {'log_interval': 200, 'eval_interval': 800, 'seed': 1234, 'epochs': 10000, 'learning_rate': 0.0001, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 6, 'fp16_run': False, 'lr_decay': 0.999875, 'segment_size': 10240, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0, 'use_sr': True, 'max_speclen': 512, 'port': '8001', 'keep_ckpts': 3}, 'data': {'training_files': 'filelists/train.txt', 'validation_files': 'filelists/val.txt', 'max_wav_value': 32768.0, 'sampling_rate': 44100, 'filter_length': 2048, 'hop_length': 512, 'win_length': 2048, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': 22050}, 'model': {'inter_channels': 192, 'hidden_channels': 192, 'filter_channels': 768, 'n_heads': 2, 'n_layers': 6, 'kernel_size': 3, 'p_dropout': 0.1, 'resblock': '1', 'resblock_kernel_sizes': [3, 7, 11], 'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'upsample_rates': [8, 8, 2, 2, 2], 'upsample_initial_channel': 512, 'upsample_kernel_sizes': [16, 16, 4, 4, 4], 'n_layers_q': 3, 'use_spectral_norm': False, 'gin_channels': 256, 'ssl_dim': 256, 'n_speakers': 200}, 'spk': {'specter': 0}, 'model_dir': './logs\44k'}
WARNING:44k:C:\Users\32785\Desktop\ai\so-vits-svc-4.0 is not a git repository, therefore hash value comparison will be ignored.
INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 0
INFO:torch.distributed.distributed_c10d:Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
./logs\44k\G_0.pth
error, emb_g.weight is not in the checkpoint
INFO:44k:emb_g.weight is not in the checkpoint
load
INFO:44k:Loaded checkpoint './logs\44k\G_0.pth' (iteration 0)
./logs\44k\D_0.pth
load
INFO:44k:Loaded checkpoint './logs\44k\D_0.pth' (iteration 0)
Traceback (most recent call last):
File "train.py", line 313, in <module>
main()
File "train.py", line 54, in main
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\multiprocessing\spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\multiprocessing\spawn.py", line 198, in start_processes
while not context.join():
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\multiprocessing\spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\multiprocessing\spawn.py", line 69, in _wrap
fn(i, *args)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\train.py", line 122, in run
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\train.py", line 162, in train_and_evaluate
(z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\parallel\distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\parallel\distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\models.py", line 385, in forward
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\modules\conv.py", line 313, in forward
return self._conv_forward(input, self.weight, self.bias)
File "C:\Users\32785\Desktop\ai\so-vits-svc-4.0\python38\lib\site-packages\torch\nn\modules\conv.py", line 309, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
import torch
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([6, 256, 1, 761], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(256, 192, kernel_size=[1, 5], padding=[0, 2], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()
ConvolutionParams
memory_format = Contiguous
data_type = CUDNN_DATA_FLOAT
padding = [0, 2, 0]
stride = [1, 1, 0]
dilation = [1, 1, 0]
groups = 1
deterministic = false
allow_tf32 = true
input: TensorDescriptor 0000018F559CC7C0
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 6, 256, 1, 761,
strideA = 194816, 761, 761, 1,
output: TensorDescriptor 0000018F559CC3D0
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 6, 192, 1, 761,
strideA = 146112, 761, 761, 1,
weight: FilterDescriptor 0000018F5EDCC120
type = CUDNN_DATA_FLOAT
tensor_format = CUDNN_TENSOR_NCHW
nbDims = 4
dimA = 192, 256, 1, 5,
Pointer addresses:
input: 0000000712370000
output: 0000000737CC5800
weight: 0000000706432000
Press any key to continue . . .