Build and train a basic automatic speech recognition (ASR) model with deep learning to recognize eight different words. The dataset is a portion of the Speech Commands dataset (Warden, 2018), which contains short (one second or less) audio clips of commands such as "down", "go", "left", "no", "right", "stop", "up", and "yes".
Requirements:
1. Waveforms and spectrograms: pick any 9 different audio files from the dataset and plot the waveform and spectrogram of each.
2. Feature extraction: extract MFCC features.
3. Deep-learning-based speech recognition: build a neural network model, train it, plot the training and validation loss curves, and evaluate model performance.
The code is based on the tutorial at https://tensorflow.google.cn/tutorials/audio/simple_audio?hl=zh-cn#%E5%B0%86%E6%B3%A2%E5%BD%A2%E8%BD%AC%E6%8D%A2%E4%B8%BA%E9%A2%91%E8%B0%B1%E5%9B%BE, with some changes to fit the requirements above. Adding the MFCC feature extraction raises this error:
ValueError: The padded shape () is not compatible with the shape (None,) of the corresponding input component.
(SOS) Which part needs to be changed? I haven't been able to fix it.
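The problem appears to be that tf.keras.utils.audio_dataset_from_directory already returns batched elements (batch_size=64), so by the time padded_batch runs, the label component still has shape (None,), which cannot be matched to the scalar padded shape [] in padded_shapes=([13, None, 1], []). Unbatching before the MFCC map (or loading with batch_size=None) resolves it; a minimal sketch of the change, applied inside make_spec_ds in the code below:

# Before: elements are whole batches -> audio (64, 16000), labels (64,)
# After unbatch: audio (16000,), label () -> compatible with padded_shapes=([13, None, 1], [])
return ds.unbatch().map(get_mfcc_wrapper, num_parallel_calls=tf.data.AUTOTUNE)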
import os
import pathlib
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
# Load the dataset
DATASET_PATH = 'D:/noice/Text/mini_speech_commands'
words = ["down", "go", "left", "no", "right", "stop", "up", "yes"]
data_dir = pathlib.Path(DATASET_PATH)
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print('Commands:', commands)
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=64,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset='both')
label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)
train_ds.element_spec
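# At this point each element is a batch: audio has shape (64, 16000, 1), labels (64,).
# The clips are mono, so drop the trailing channel axis.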
def squeeze(audio, labels):
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels
train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)
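# Split the validation set into two halves: one for testing, one for validation.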
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)
for example_audio, example_labels in train_ds.take(1):
    print(example_audio.shape)
    print(example_labels.shape)
label_names[[1,1,3,0]]
# Plot waveforms and spectrograms
def get_spectrogram(waveform):
    spectrogram = tf.signal.stft(
        waveform, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram
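# Note: with frame_length=255, frame_step=128 and the default fft_length=256,
# a 16000-sample clip yields a spectrogram of shape (124, 129, 1).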
def plot_spectrogram(spectrogram, ax):
    if len(spectrogram.shape) > 2:
        assert len(spectrogram.shape) == 3
        spectrogram = np.squeeze(spectrogram, axis=-1)
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height = log_spec.shape[0]
    width = log_spec.shape[1]
    X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
    Y = range(height)
    ax.pcolormesh(X, Y, log_spec)
# Randomly pick 9 different audio clips from the training set
random_examples = []
for audio, label in train_ds.unbatch().shuffle(buffer_size=10000).take(9):
    random_examples.append((audio, label))
# Plot the waveforms
plt.figure(figsize=(16, 10))
rows = 3
cols = 3
n = rows * cols
for i in range(n):
    plt.subplot(rows, cols, i+1)
    audio_signal, label = random_examples[i]
    plt.plot(audio_signal)
    plt.title(label_names[label])
    plt.yticks(np.arange(-1.2, 1.2, 0.2))
    plt.ylim([-1.1, 1.1])
plt.show()
# Plot the spectrograms
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r = i // cols
    c = i % cols
    ax = axes[r][c]
    waveform, label = random_examples[i]
    spectrogram = get_spectrogram(waveform)
    plot_spectrogram(spectrogram.numpy(), ax)
    ax.set_title(label_names[label])
plt.show()
# Extract MFCC features
def get_mfcc(waveform, sample_rate=16000, n_mfcc=13):
    mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.astype(np.float32)  # match the tf.float32 dtype declared in tf.numpy_function below
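# With librosa's defaults (n_fft=2048, hop_length=512, center=True),
# a 16000-sample clip yields an MFCC matrix of shape (13, 32).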
# Plot the MFCC features
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r = i // cols
    c = i % cols
    ax = axes[r][c]
    waveform, label = random_examples[i]
    mfcc = get_mfcc(waveform.numpy())
    librosa.display.specshow(mfcc, sr=16000, ax=ax, x_axis='time')
    ax.set_title(label_names[label])
plt.show()
# Build MFCC datasets for model training
def make_spec_ds(ds):
    def get_mfcc_wrapper(waveform, label):
        mfcc = tf.numpy_function(get_mfcc, [waveform], tf.float32)
        mfcc.set_shape([13, None])  # keep the time axis variable-length
        mfcc = tf.expand_dims(mfcc, -1)
        return mfcc, label
    # Fix for the ValueError: the dataset is already batched (batch_size=64),
    # so unbatch first. Otherwise the labels keep shape (None,), which is
    # incompatible with the scalar padded shape [] used in padded_batch below.
    return ds.unbatch().map(get_mfcc_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
train_mfcc_ds = make_spec_ds(train_ds)
val_mfcc_ds = make_spec_ds(val_ds)
test_mfcc_ds = make_spec_ds(test_ds)
# Pad the MFCC features to a fixed length and re-batch
def preprocess_dataset(ds, batch_size=64):
    return ds.padded_batch(batch_size, padded_shapes=([13, None, 1], []), padding_values=(0.0, -1))
train_mfcc_ds = preprocess_dataset(train_mfcc_ds)
val_mfcc_ds = preprocess_dataset(val_mfcc_ds)
test_mfcc_ds = preprocess_dataset(test_mfcc_ds)
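# Optional, following the original tutorial: cache and prefetch to reduce input latency.
train_mfcc_ds = train_mfcc_ds.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)
val_mfcc_ds = val_mfcc_ds.cache().prefetch(tf.data.AUTOTUNE)
test_mfcc_ds = test_mfcc_ds.cache().prefetch(tf.data.AUTOTUNE)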
# Check the shape of an example MFCC batch
for example_mfccs, example_mfcc_labels in train_mfcc_ds.take(1):
    print(example_mfccs.shape)
# Build and train the model
input_shape = example_mfccs.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)
norm_layer = layers.Normalization()
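# adapt() computes the mean and variance of the MFCC features, which the
# Normalization layer then uses to standardize its inputs.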
norm_layer.adapt(data=train_mfcc_ds.map(map_func=lambda spec, label: spec))
model = tf.keras.Sequential([
    layers.Input(shape=input_shape),
    # Normalize.
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # Output raw logits; the loss below uses from_logits=True, so no softmax here.
    layers.Dense(num_labels)
])
model.summary()
# Compile with the Adam optimizer
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# Train for up to 10 epochs, stopping early if validation loss plateaus
EPOCHS = 10
history = model.fit(
    train_mfcc_ds,
    validation_data=val_mfcc_ds,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
# Plot the training and validation curves
metrics = history.history
plt.figure(figsize=(16,6))
# Loss
plt.subplot(1,2,1)
plt.plot(history.epoch, metrics['loss'], label='loss')
plt.plot(history.epoch, metrics['val_loss'], label='val_loss')
plt.legend()
plt.ylim([0, max(max(metrics['loss']), max(metrics['val_loss']))])
plt.xlabel('Epoch')
plt.ylabel('Loss [CrossEntropy]')
# Accuracy
plt.subplot(1,2,2)
plt.plot(history.epoch, 100*np.array(metrics['accuracy']), label='accuracy')
plt.plot(history.epoch, 100*np.array(metrics['val_accuracy']), label='val_accuracy')
plt.legend()
plt.ylim([0, 100])
plt.xlabel('Epoch')
plt.ylabel('Accuracy [%]')
plt.show()
# Evaluate model performance on the test set
model.evaluate(test_mfcc_ds, return_dict=True)
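# Optional: a confusion matrix over the test set shows per-class performance
# (the tutorial draws this with seaborn; plain matplotlib is used here to
# avoid an extra dependency).
y_pred = tf.argmax(model.predict(test_mfcc_ds), axis=1)
y_true = tf.concat([labels for _, labels in test_mfcc_ds], axis=0)
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_mtx, cmap='Blues')
plt.xticks(range(len(label_names)), label_names, rotation=45)
plt.yticks(range(len(label_names)), label_names)
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.colorbar()
plt.show()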