innnky
diff --git a/.gitignore b/.gitignore
Lines changed: 0 additions & 3 deletions
diff --git a/LICENSE b/LICENSE
Lines changed: 1 addition & 1 deletion
diff --git a/app.py b/app.py
Lines changed: 0 additions & 103 deletions
diff --git a/commons.py b/commons.py
Lines changed: 27 additions & 0 deletions
diff --git a/configs/nyarumul.json b/configs/config.json
configs/nyarumul.json renamed to configs/config.json
Lines changed: 23 additions & 18 deletions
diff --git a/configs/nyarusing.json b/configs/nyarusing.json
Lines changed: 0 additions & 52 deletions
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Jaehyeon Kim
+Copyright (c) 2021 Jingyi Li
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -4,6 +4,23 @@
 from torch import nn
 from torch.nn import functional as F
 
+def slice_pitch_segments(x, ids_str, segment_size=4):
+  """Cut one fixed-length window per batch item out of `x` along dim 1.
+
+  Args:
+    x: tensor indexed as x[batch, position]; presumably a (batch, time)
+      pitch track -- TODO confirm against callers.
+    ids_str: per-item start positions, indexable by batch index.
+    segment_size: number of positions kept for each item.
+
+  Returns:
+    A tensor shaped like x[:, :segment_size] whose entry i holds
+    x[i, ids_str[i]:ids_str[i] + segment_size].
+  """
+  # The output buffer is derived from x via zeros_like on the first window.
+  ret = torch.zeros_like(x[:, :segment_size])
+  for i in range(x.size(0)):
+    idx_str = ids_str[i]
+    idx_end = idx_str + segment_size
+    # NOTE(review): a window running past the end of x yields fewer than
+    # segment_size values; presumably callers keep windows in bounds -- confirm.
+    ret[i] = x[i, idx_str:idx_end]
+  return ret
+
+def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
+  """Draw random start indices and slice `x` and `pitch` at the same offsets.
+
+  Args:
+    x: 3-D tensor; its size is unpacked as (b, d, t).
+    pitch: tensor sliced by slice_pitch_segments with the same start indices.
+    x_lengths: length(s) used to bound the start index; defaults to t.
+      Presumably an int or a per-item tensor -- TODO confirm against callers.
+    segment_size: window length passed to both slicing helpers.
+
+  Returns:
+    (ret, ret_pitch, ids_str): the window cut from x by slice_segments, the
+    window cut from pitch, and the start indices used for both.
+  """
+  # d is unpacked but not used below.
+  b, d, t = x.size()
+  if x_lengths is None:
+    x_lengths = t
+  # The "+ 1" lets the last start, x_lengths - segment_size, be drawn as well.
+  ids_str_max = x_lengths - segment_size + 1
+  # One uniform draw per batch item, scaled, then cast to long for indexing.
+  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+  # slice_segments is defined elsewhere in this module (not shown here).
+  ret = slice_segments(x, ids_str, segment_size)
+  ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
+  return ret, ret_pitch, ids_str
 
 def init_weights(m, mean=0.0, std=0.01):
   classname = m.__class__.__name__
@@ -64,6 +81,16 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4):
   return ret, ids_str
 
 
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+  """Draw random start indices and slice a window of `x` at each of them.
+
+  Args:
+    x: 3-D tensor; its size is unpacked as (b, d, t).
+    x_lengths: length(s) used to bound the start index; defaults to t.
+      Presumably an int or a per-item tensor -- TODO confirm against callers.
+    segment_size: window length passed to slice_segments.
+
+  Returns:
+    (ret, ids_str): the window cut from x by slice_segments and the start
+    indices that were drawn.
+  """
+  # d is unpacked but not used below.
+  b, d, t = x.size()
+  if x_lengths is None:
+    x_lengths = t
+  # NOTE(review): unlike rand_slice_segments_with_pitch there is no "+ 1" here,
+  # so the start x_lengths - segment_size is never drawn (except when the
+  # bound is 0, which yields start 0) -- confirm this is intentional.
+  ids_str_max = x_lengths - segment_size
+  # One uniform draw per batch item, scaled, then cast to long for indexing.
+  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+  # slice_segments is defined elsewhere in this module (not shown here).
+  ret = slice_segments(x, ids_str, segment_size)
+  return ret, ids_str
+
+
 def get_timing_signal_1d(
     length, channels, min_timescale=1.0, max_timescale=1.0e4):
   position = torch.arange(length, dtype=torch.float)
 
@@ -1,40 +1,39 @@
 {
   "train": {
     "log_interval": 200,
-    "eval_interval": 2000,
+    "eval_interval": 200,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 2e-4,
     "betas": [0.8, 0.99],
     "eps": 1e-9,
     "batch_size": 16,
-    "fp16_run": true,
+    "fp16_run": false,
     "lr_decay": 0.999875,
-    "segment_size": 8192,
+    "segment_size": 17920,
     "init_lr_ratio": 1,
     "warmup_epochs": 0,
     "c_mel": 45,
-    "c_kl": 1.0
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 384,
+    "port": "8001"
   },
   "data": {
-    "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt",
-    "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt",
-    "text_cleaners":["english_cleaners2"],
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
     "max_wav_value": 32768.0,
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
+    "sampling_rate": 48000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
     "n_mel_channels": 80,
     "mel_fmin": 0.0,
-    "mel_fmax": null,
-    "add_blank": true,
-    "n_speakers": 3,
-    "cleaned_text": true
+    "mel_fmax": null
   },
   "model": {
     "inter_channels": 192,
-    "hidden_channels": 256,
+    "hidden_channels": 192,
     "filter_channels": 768,
     "n_heads": 2,
     "n_layers": 6,
@@ -43,11 +42,17 @@
     "resblock": "1",
     "resblock_kernel_sizes": [3,7,11],
     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [8,8,2,2],
+    "upsample_rates": [10,8,2,2],
     "upsample_initial_channel": 512,
     "upsample_kernel_sizes": [16,16,4,4],
     "n_layers_q": 3,
     "use_spectral_norm": false,
-    "gin_channels": 256
+    "gin_channels": 256,
+    "ssl_dim": 256
+  },
+  "spk":{
+    "nen": 0,
+    "paimon": 1,
+    "yunhao": 2
   }
 }