1.建立自己的音频定义数据集
Urbansounddataste.py
创建自定义数据集
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
class UrbanSoundDataset(Dataset):
def __init__(self,
annotations_file,
audio_dir,
transformation,
target_sample_rate,
num_samples,
device):
self.annotations = pd.read_csv(annotations_file)
self.audio_dir = audio_dir
self.device = device
self.transformation = transformation.to(self.device)
self.target_sample_rate = target_sample_rate
self.num_samples = num_samples
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
audio_sample_path = self._get_audio_sample_path(index)
label = self._get_audio_sample_label(index)
signal, sr = torchaudio.load(audio_sample_path)
# 声音可能存在多通道
# signal-> (num_channels, samples) -> (2,16000) -> (1,16000)
signal = self._resample_if_necessary(signal, sr)
signal = signal.to(self.device)
signal = self._mix_down_if_necessary(signal)
signal = self._cut_if_necessary(signal)
signal = self._right_pad_if_necessary(signal)
signal = self.transformation(signal)
return signal, label
def _cut_if_necessary(self, signal):
if signal.shape[1] > self.num_samples:
signal =signal[:, :self.num_samples] #取第1s
return signal
def _right_pad_if_necessary(self, signal):
length_signal = signal.shape[1]
if length_signal < self.num_samples:
num_missing_samples = self.num_samples - length_signal
last_dim_padding = (0, num_missing_samples) # 两个数字分别表示左右填充0的个数
# (1,1,2,1) 1,1表示在最后一个维度的左右填充,2,2表示在倒数第二个维度上填充
signal = torch.nn.functional.pad(signal, last_dim_padding)
return signal
def _resample_if_necessary(self, signal, sr):
if sr != self.target_sample_rate:
resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
signal = resampler(signal)
return signal
def _mix_down_if_necessary(self, signal):
# 聚合多个通道,并将其混合到一个通道
if signal.shape[0] > 1: # (2, 16000)
signal = torch.mean(signal, dim=0, keepdim=True) # 使用最小值的维度
return signal
def _get_audio_sample_path(self, index):
# 获得数据的路径
fold = f"fold{self.annotations.iloc[index, 5]}"
path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
return path
def _get_audio_sample_label(self, index):
return self.annotations.iloc[index, 6]
if __name__ == "__main__":
ANNOTATIONS_FILE = "UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = "audio"
SAMPLE_RATE = 16000 # 采样率
NUM_SAMPLES = 22050 # 样本数量
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
print(f"Using devie {device}")
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
sample_rate=SAMPLE_RATE,
n_fft=1024,
hop_length=512,
n_mels=64
) # 将mel频谱图传递给usd
# ms = mel_spectrogram(signal)
usd = UrbanSoundDataset(ANNOTATIONS_FILE,
AUDIO_DIR,
mel_spectrogram,
SAMPLE_RATE,
NUM_SAMPLES,
device)
print(f"There are {len(usd)} samples in the dataset.")
signal, label = usd[0]
Urbansound8k.csv文件
- 建立神经网咯模型cnn.py
from torch import nn
from torchsummary import summary
VGG分类器
class CNNNetwork(nn.Module):
def __init__(self):
super().__init__()
# 4 conv blocks / flatten / linear / softmax
self.conv1 = nn.Sequential(
nn.Conv2d(
in_channels=1,
out_channels=16,
kernel_size=3,
stride=1,
padding=2
),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)
self.conv2 = nn.Sequential(
nn.Conv2d(
in_channels=16,
out_channels=32,
kernel_size=3,
stride=1,
padding=2
),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)
self.conv3 = nn.Sequential(
nn.Conv2d(
in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
padding=2
),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)
self.conv4 = nn.Sequential(
nn.Conv2d(
in_channels=64,
out_channels=128,
kernel_size=3,
stride=1,
padding=2
),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)
self.flatten = nn.Flatten()
# 对于torch.nn.Flatten(),因为其被用在神经网络中,
# 输入为一批数据,第一维为batch,通常要把一个数据拉成一维,而不是将一
# 批数据拉为一维。所以torch.nn.Flatten()默认从第二维开始平坦化。
self.linear = nn.Linear(128*5*4, 10) # 128:最后一个卷积层的输出,5:
self.softmax = nn.Softmax(dim=1) # 对linear层输出的结果进行归一化
def forward(self, input_data):
x = self.conv1(input_data)
x = self.conv2(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.flatten(x)
logits = self.linear(x)
prediction = self.softmax(logits)
return prediction
if __name__ == "__main__":
cnn = CNNNetwork()
summary(cnn.cuda(), (1, 64, 44))
# 1:输入的通道数量,64是频率轴, 44是时间轴
下面看一下网络层具体是怎么样的
Input size (MB): 0.01
Forward/backward pass size (MB): 1.83
Params size (MB): 0.47
Estimated Total Size (MB): 2.31
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 16, 66, 46] 160
ReLU-2 [-1, 16, 66, 46] 0
MaxPool2d-3 [-1, 16, 33, 23] 0
Conv2d-4 [-1, 32, 35, 25] 4,640
ReLU-5 [-1, 32, 35, 25] 0
MaxPool2d-6 [-1, 32, 17, 12] 0
Conv2d-7 [-1, 64, 19, 14] 18,496
ReLU-8 [-1, 64, 19, 14] 0
MaxPool2d-9 [-1, 64, 9, 7] 0
Conv2d-10 [-1, 128, 11, 9] 73,856
ReLU-11 [-1, 128, 11, 9] 0
MaxPool2d-12 [-1, 128, 5, 4] 0
Flatten-13 [-1, 2560] 0
Linear-14 [-1, 10] 25,610
Softmax-15 [-1, 10] 0
================================================================
Total params: 122,762
Trainable params: 122,762
Non-trainable params: 0
网路层的输出是四维的,以Conv2d-1为例:
-1->batchsize的大小,这里还没有设置
16->输出通道的数量
66*46->输入图片的大小,第一层没有改变大小
输入图片大小 W×W
卷积核大小 F×F
步长 S
padding的像素数 P
于是我们可以得出计算公式为:
N = (W − F + 2P )/S+1
由公式可以得到每一层的输出大小
3.训练神经网络train.py
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader
from UrbanSoundDataset import UrbanSoundDataset
from cnn import CNNNetwork
BATCH_SIZE = 128
Epochs = 10
LEARNING_RATE = 0.001
ANNOTATIONS_FILE = "UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = "audio"
SAMPLE_RATE = 22050 # 采样率
NUM_SAMPLES = 22050 # 样本数量
def create_data_loader(train_data, batch_size):
train_dataloader = DataLoader(train_data, batch_size=batch_size)
return train_dataloader
def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
for input, targets in data_loader:
input, targets = input.to(device), targets.to(device)
# calculate loss #每一个batch计算loss
# 使用当前模型获得预测
predictions = model(input)
loss = loss_fn(predictions, targets)
# backpropagate loss and update weights
optimiser.zero_grad() # 在每个batch中让梯度重新为0
loss.backward() # 反向传播
optimiser.step()
print(f"Loss: {loss.item()}") # 打印最后的batch的loss
def train(model, data_loader, loss_fn, optimiser, device, epochs):
for i in range(epochs):
print(f"Epoch {i+1}")
train_single_epoch(model, data_loader, loss_fn, optimiser, device)
print("---------------------")
print("Training is down.")
if __name__ == "__main__":
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
print(f"Using {device} device")
# instantiating our dataset object and create data loader
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
sample_rate=SAMPLE_RATE,
n_fft=1024,
hop_length=512,
n_mels=64
) # 将mel频谱图传递给usd
# ms = mel_spectrogram(signal)
usd = UrbanSoundDataset(ANNOTATIONS_FILE,
AUDIO_DIR,
mel_spectrogram,
SAMPLE_RATE,
NUM_SAMPLES,
device)
train_dataloader = create_data_loader(usd, BATCH_SIZE)
# build model
cnn = CNNNetwork().to(device)
print(cnn)
# instantiate loss function + opptimiser
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(),
lr=LEARNING_RATE)
# train model
train(cnn, train_dataloader, loss_fn, optimiser, device, Epochs)
torch.save(cnn.state_dict(), "feedforwardnet.pth")
print("Model trained and stored at feedforwardnet.pth")
4.使用网络模型进行判别
inference.py
import torch
import torchaudio
from cnn import CNNNetwork
from UrbanSoundDataset import UrbanSoundDataset
from train import ANNOTATIONS_FILE, AUDIO_DIR, NUM_SAMPLES, SAMPLE_RATE
class_mapping =[
"air_conditioner",
"car_horn",
"children_playing",
"dog_bark",
"drilling",
"engine_idling",
"gun_shot",
"jackhammer",
"siren",
"street_music"
]
def predict(model, input, target, class_mapping):
model.eval()
with torch.no_grad():
predictions = model(input)
# Tensor (1, 10) ->[[0.1, 0.01, ... ,0.6]] #概率最大的即为所选
predicted_index = predictions[0].argmax(0)
predicted = class_mapping[predicted_index]
expected = class_mapping[target]
return predicted, expected
if __name__ == "__main__":
# load back the model
cnn = CNNNetwork()
state_dict = torch.load("feedforwardnet.pth")
cnn.load_state_dict(state_dict)
# load urban sound dataset
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
sample_rate=SAMPLE_RATE,
n_fft=1024,
hop_length=512,
n_mels=64
) # 将mel频谱图传递给usd
# ms = mel_spectrogram(signal)
usd = UrbanSoundDataset(ANNOTATIONS_FILE,
AUDIO_DIR,
mel_spectrogram,
SAMPLE_RATE,
NUM_SAMPLES,
"cpu")
# get a sample from urban sound dataset for inference
input, target = usd[0][0], usd[0][1] # [batch size, num_channels, fr, time]
input.unsqueeze_(0)
# make an inference
predicted, expected = predict(cnn, input, target, class_mapping)
print(f"Predicted:'{predicted}', expected:'{expected}'")
Original: https://blog.csdn.net/weixin_48983346/article/details/125644008
Author: 2020的小跟班
Title: 第二章 Urbansound8k 音频分类
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/663929/
转载文章受原作者版权保护。转载请注明原作者出处!