diff --git a/README.md b/README.md index 0fcfa3d..4f5abc8 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ python train.py --dataset path/to/dataset --reduction_rate 0.5 --mixup_rate 0.5 ``` ## References -- [1] Jansson et al., "Singing Voice Separation with Deep U-Net Convolutional Networks", https://ismir2017.smcnus.org/wp-content/uploads/2017/10/171_Paper.pdf +- [1] Jansson et al., "Singing Voice Separation with Deep U-Net Convolutional Networks", https://ejhumphrey.com/assets/pdf/jansson2017singing.pdf - [2] Takahashi et al., "Multi-scale Multi-band DenseNets for Audio Source Separation", https://arxiv.org/pdf/1706.09588.pdf - [3] Takahashi et al., "MMDENSELSTM: AN EFFICIENT COMBINATION OF CONVOLUTIONAL AND RECURRENT NEURAL NETWORKS FOR AUDIO SOURCE SEPARATION", https://arxiv.org/pdf/1805.02410.pdf - [4] Liutkus et al., "The 2016 Signal Separation Evaluation Campaign", Latent Variable Analysis and Signal Separation - 12th International Conference diff --git a/augment.py b/augment.py index c6059f7..dd5f886 100644 --- a/augment.py +++ b/augment.py @@ -47,9 +47,9 @@ continue X, _ = librosa.load( - mix_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast') + mix_path, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast') y, _ = librosa.load( - inst_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast') + inst_path, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast') X, y = spec_utils.align_wave_head_and_tail(X, y, args.sr) v = X - y @@ -60,9 +60,9 @@ subprocess.call(cmd_v, stderr=subprocess.DEVNULL) y, _ = librosa.load( - output_i, args.sr, False, dtype=np.float32, res_type='kaiser_fast') + output_i, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast') v, _ = librosa.load( - output_v, args.sr, False, dtype=np.float32, res_type='kaiser_fast') + output_v, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast') X = y + v diff --git a/inference.py b/inference.py index e264933..f4fe2b6 100644 --- a/inference.py +++ b/inference.py @@ -125,14 +125,18 @@ def main(): device = torch.device('cpu') model = nets.CascadedNet(args.n_fft, 32, 128) model.load_state_dict(torch.load(args.pretrained_model, map_location=device)) - if torch.cuda.is_available() and args.gpu >= 0: - device = torch.device('cuda:{}'.format(args.gpu)) - model.to(device) + if args.gpu >= 0: + if torch.cuda.is_available(): + device = torch.device('cuda:{}'.format(args.gpu)) + model.to(device) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + device = torch.device('mps') + model.to(device) print('done') print('loading wave source...', end=' ') X, sr = librosa.load( - args.input, args.sr, False, dtype=np.float32, res_type='kaiser_fast') + args.input, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast') basename = os.path.splitext(os.path.basename(args.input))[0] print('done') diff --git a/lib/spec_utils.py b/lib/spec_utils.py index af34f70..7d7a043 100644 --- a/lib/spec_utils.py +++ b/lib/spec_utils.py @@ -27,8 +27,8 @@ def wave_to_spectrogram(wave, hop_length, n_fft): wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) spec = np.asfortranarray([spec_left, spec_right]) return spec @@ -152,9 +152,9 @@ def cache_or_load(mix_path, inst_path, sr, hop_length, n_fft): y = np.load(inst_cache_path) else: X, _ = librosa.load( - mix_path, sr, False, dtype=np.float32, res_type='kaiser_fast') + mix_path, sr=sr, mono=False, dtype=np.float32, res_type='kaiser_fast') y, _ = librosa.load( - inst_path, sr, False, dtype=np.float32, res_type='kaiser_fast') + inst_path, sr=sr, mono=False, dtype=np.float32, res_type='kaiser_fast') X, y = align_wave_head_and_tail(X, y, sr) @@ -196,9 +196,9 @@ def spectrogram_to_wave(spec, hop_length=1024): ], axis=0) * 0.2 X, _ = librosa.load( - sys.argv[1], 44100, False, dtype=np.float32, res_type='kaiser_fast') + sys.argv[1], sr=44100, mono=False, dtype=np.float32, res_type='kaiser_fast') y, _ = librosa.load( - sys.argv[2], 44100, False, dtype=np.float32, res_type='kaiser_fast') + sys.argv[2], sr=44100, mono=False, dtype=np.float32, res_type='kaiser_fast') X, y = align_wave_head_and_tail(X, y, 44100) X_spec = wave_to_spectrogram(X, 1024, 2048) diff --git a/requirements.txt b/requirements.txt index d075e04..f89cf54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # install from https://pytorch.org/get-started/locally/ -# torch>=1.5.1 +# torch>=1.12.0 # torchvision>=0.6.1 tqdm>=4.30 librosa>=0.6.3,<0.9