@@ -46,7 +46,7 @@ def __init__(self, common_config, arch_config):
4646 # - For Non-MDX23C models: You can choose between 0.001-0.999
4747 self .overlap = arch_config .get ("overlap" , 0.25 )
4848
49- # Ensure overlap is within the range [0.001, 0.99 ]
49+ # Ensure overlap is within the range [0.001, 0.999 ]
5050 if self .overlap < 0.001 :
5151 self .logger .warning (f"overlap { self .overlap } is less than the minimum allowed value of 0.001. Setting overlap to 0.001." )
5252 self .overlap = 0.001
@@ -184,34 +184,33 @@ def separate(self, audio_file_path, custom_output_names=None):
184184 mix = self .prepare_mix (self .audio_file_path )
185185
186186 self .logger .debug ("Normalizing mix before demixing..." )
187+ peak = np .abs (mix ).max ()
187188 mix = spec_utils .normalize (wave = mix , max_peak = self .normalization_threshold , min_peak = self .amplification_threshold )
188189
189190 # Start the demixing process
190- source = self .demix (mix )
191+ source = self .demix (mix ) * peak
191192 self .logger .debug ("Demixing completed." )
192193
194+ if not isinstance (self .primary_source , np .ndarray ):
195+ self .primary_source = source .T
196+
193197 # In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet
194198
195199 # Initialize the list for output files
196200 output_files = []
197201 self .logger .debug ("Processing output files..." )
198202
199- # Normalize and transpose the primary source if it's not already an array
200- if not isinstance (self .primary_source , np .ndarray ):
201- self .logger .debug ("Normalizing primary source..." )
202- self .primary_source = spec_utils .normalize (wave = source , max_peak = self .normalization_threshold , min_peak = self .amplification_threshold ).T
203-
204203 # Process the secondary source if not already an array
205204 if not isinstance (self .secondary_source , np .ndarray ):
206205 self .logger .debug ("Producing secondary source: demixing in match_mix mode" )
207206 raw_mix = self .demix (mix , is_match_mix = True )
208207
209208 if self .invert_using_spec :
210209 self .logger .debug ("Inverting secondary stem using spectogram as invert_using_spec is set to True" )
211- self .secondary_source = spec_utils .invert_stem (raw_mix , source )
210+ self .secondary_source = spec_utils .invert_stem (raw_mix , self . primary_source * self . compensate )
212211 else :
213212 self .logger .debug ("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix" )
214- self .secondary_source = mix . T - source .T
213+ self .secondary_source = ( - self . primary_source * self . compensate ) + mix .T
215214
216215 # Save and process the secondary stem if needed
217216 if not self .output_single_stem or self .output_single_stem .lower () == self .secondary_stem_name .lower ():
@@ -224,10 +223,6 @@ def separate(self, audio_file_path, custom_output_names=None):
224223 # Save and process the primary stem if needed
225224 if not self .output_single_stem or self .output_single_stem .lower () == self .primary_stem_name .lower ():
226225 self .primary_stem_output_path = self .get_stem_output_path (self .primary_stem_name , custom_output_names )
227-
228- if not isinstance (self .primary_source , np .ndarray ):
229- self .primary_source = source .T
230-
231226 self .logger .info (f"Saving { self .primary_stem_name } stem to { self .primary_stem_output_path } ..." )
232227 self .final_process (self .primary_stem_output_path , self .primary_source , self .primary_stem_name )
233228 output_files .append (self .primary_stem_output_path )
@@ -284,7 +279,15 @@ def initialize_mix(self, mix, is_ckpt=False):
284279 pad = self .gen_size + self .trim - (mix .shape [- 1 ] % self .gen_size )
285280 self .logger .debug (f"Padding calculated: { pad } " )
286281 # Add padding at the beginning and the end of the mix
287- mixture = np .concatenate ((np .zeros ((2 , self .trim ), dtype = "float32" ), mix , np .zeros ((2 , pad ), dtype = "float32" )), 1 )
282+ mixture = np .concatenate (
283+ (
284+ np .zeros ((2 , self .trim ), dtype = "float32" ), # Pad at the start
285+ mix ,
286+ np .zeros ((2 , pad ), dtype = "float32" ), # Pad in the middle (to match chunk size)
287+ np .zeros ((2 , self .trim ), dtype = "float32" ), # Pad at the end
288+ ),
289+ 1
290+ )
288291 # Determine the number of chunks based on the mixture's length
289292 num_chunks = mixture .shape [- 1 ] // self .gen_size
290293 self .logger .debug (f"Mixture shape after padding: { mixture .shape } , Number of chunks: { num_chunks } " )
@@ -431,11 +434,6 @@ def demix(self, mix, is_match_mix=False):
431434
432435 # TODO: In UVR, pitch changing happens here. Consider implementing this as a feature.
433436
434- # Compensates the source if not matching the mix.
435- if not is_match_mix :
436- source *= self .compensate
437- self .logger .debug ("Match mix mode; compensate multiplier applied." )
438-
439437 # TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature.
440438
441439 self .logger .debug ("Demixing process completed." )
0 commit comments