diff --git a/datasets/speech_commands_dataset.py b/datasets/speech_commands_dataset.py
index 990d54d..716998f 100644
--- a/datasets/speech_commands_dataset.py
+++ b/datasets/speech_commands_dataset.py
@@ -80,7 +80,7 @@ def __init__(self, folder, transform=None, sample_rate=16000, sample_length=1):
         samples = []
         for f in audio_files:
             path = os.path.join(folder, f)
-            s, sr = librosa.load(path, sample_rate)
+            s, sr = librosa.load(path, sr=sample_rate)
             samples.append(s)
 
         samples = np.hstack(samples)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4af02db
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+librosa==0.10.1
+matplotlib==3.8.2
+tensorboardX==2.6.2.2
+torch==2.1.2
+torchaudio==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+torchnet==0.0.4
diff --git a/test_speech_commands.py b/test_speech_commands.py
index 139ea1b..14f3b9b 100755
--- a/test_speech_commands.py
+++ b/test_speech_commands.py
@@ -89,12 +89,12 @@ def test():
         if args.multi_crop:
             inputs = multi_crop(inputs)
 
-        inputs = Variable(inputs, volatile = True)
+        inputs = Variable(inputs, requires_grad=False)
         targets = Variable(targets, requires_grad=False)
 
         if use_gpu:
             inputs = inputs.cuda()
-            targets = targets.cuda(async=True)
+            targets = targets.cuda()
 
         # forward
         outputs = model(inputs)
@@ -111,7 +111,7 @@ def test():
         pred = outputs.data.max(1, keepdim=True)[1]
         correct += pred.eq(targets.data.view_as(pred)).sum()
         total += targets.size(0)
-        confusion_matrix.add(pred, targets.data)
+        confusion_matrix.add(pred.squeeze(), targets.data)
 
         filenames = batch['path']
         for j in range(len(pred)):
diff --git a/train_speech_commands.py b/train_speech_commands.py
index 02f8efa..8e615b3 100755
--- a/train_speech_commands.py
+++ b/train_speech_commands.py
@@ -152,7 +152,7 @@ def train(epoch):
 
         if use_gpu:
             inputs = inputs.cuda()
-            targets = targets.cuda(async=True)
+            targets = targets.cuda()
 
         # forward/backward
         outputs = model(inputs)
@@ -167,15 +167,15 @@ def train(epoch):
         # statistics
         it += 1
         global_step += 1
-        running_loss += loss.data[0]
+        running_loss += loss.item()
         pred = outputs.data.max(1, keepdim=True)[1]
         if args.mixup:
             targets = batch['target']
-            targets = Variable(targets, requires_grad=False).cuda(async=True)
+            targets = Variable(targets, requires_grad=False).cuda()
         correct += pred.eq(targets.data.view_as(pred)).sum()
         total += targets.size(0)
 
-        writer.add_scalar('%s/loss' % phase, loss.data[0], global_step)
+        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)
 
         # update the progress bar
         pbar.set_postfix({
@@ -210,7 +210,7 @@ def valid(epoch):
 
         if use_gpu:
             inputs = inputs.cuda()
-            targets = targets.cuda(async=True)
+            targets = targets.cuda()
 
         # forward
         outputs = model(inputs)
@@ -219,12 +219,12 @@ def valid(epoch):
         # statistics
         it += 1
         global_step += 1
-        running_loss += loss.data[0]
+        running_loss += loss.item()
         pred = outputs.data.max(1, keepdim=True)[1]
         correct += pred.eq(targets.data.view_as(pred)).sum()
         total += targets.size(0)
 
-        writer.add_scalar('%s/loss' % phase, loss.data[0], global_step)
+        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)
 
         # update the progress bar
         pbar.set_postfix({
diff --git a/transforms/transforms_stft.py b/transforms/transforms_stft.py
index e6cab64..55d1f66 100644
--- a/transforms/transforms_stft.py
+++ b/transforms/transforms_stft.py
@@ -41,7 +41,7 @@ def __call__(self, data):
         sample_rate = data['sample_rate']
         hop_length = data['hop_length']
         scale = random.uniform(-self.max_scale, self.max_scale)
-        stft_stretch = librosa.core.phase_vocoder(stft, 1+scale, hop_length=hop_length)
+        stft_stretch = librosa.core.phase_vocoder(stft, rate=1+scale, hop_length=hop_length)
         data['stft'] = stft_stretch
         return data
 
@@ -108,7 +108,7 @@ def __call__(self, data):
         stft = data['stft']
         sample_rate = data['sample_rate']
         n_fft = data['n_fft']
-        mel_basis = librosa.filters.mel(sample_rate, n_fft, self.n_mels)
+        mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels)
         s = np.dot(mel_basis, np.abs(stft)**2.0)
         data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
         return data
diff --git a/transforms/transforms_wav.py b/transforms/transforms_wav.py
index f4ed1c0..5b2b1de 100644
--- a/transforms/transforms_wav.py
+++ b/transforms/transforms_wav.py
@@ -22,7 +22,7 @@ def __init__(self, sample_rate=16000):
     def __call__(self, data):
         path = data['path']
         if path:
-            samples, sample_rate = librosa.load(path, self.sample_rate)
+            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
         else:
             # silence
             sample_rate = self.sample_rate
@@ -137,7 +137,7 @@ def __init__(self, n_mels=32):
     def __call__(self, data):
         samples = data['samples']
         sample_rate = data['sample_rate']
-        s = librosa.feature.melspectrogram(samples, sr=sample_rate, n_mels=self.n_mels)
+        s = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=self.n_mels)
         data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
         return data
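
Reviewer note (not part of the patch): a minimal sketch of the keyword-only librosa calls and the scalar-loss accessor that the hunks above migrate to. The file name 'noise.wav' and the n_fft value are placeholders used purely for illustration.

    import numpy as np
    import torch
    import librosa

    # librosa >= 0.10 makes sr / y / n_fft / n_mels keyword-only arguments.
    samples, sample_rate = librosa.load('noise.wav', sr=16000)
    s = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=32)
    mel = librosa.power_to_db(s, ref=np.max)
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=2048, n_mels=32)  # n_fft is illustrative

    # Modern PyTorch reads a 0-dim loss tensor with .item(); loss.data[0] no longer works.
    loss = torch.nn.functional.cross_entropy(torch.randn(4, 12), torch.zeros(4, dtype=torch.long))
    running_loss = loss.item()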