Build and train a basic automatic speech recognition (ASR) model with deep learning to recognize eight different words. The dataset is a portion of the Speech Commands dataset (Warden, 2018), which contains short (one second or less) audio clips of commands such as "down", "go", "left", "no", "right", "stop", "up", and "yes".
Requirements:
1. Waveforms and spectrograms: pick any 9 different audio files from the dataset and plot the waveform and spectrogram of each.
2. Feature extraction: extract MFCC features.
3. Deep-learning-based speech recognition: build a neural network model, train it, plot the training and validation loss curves, and evaluate model performance.
The code is based on the tutorial at https://tensorflow.google.cn/tutorials/audio/simple_audio?hl=zh-cn#%E5%B0%86%E6%B3%A2%E5%BD%A2%E8%BD%AC%E6%8D%A2%E4%B8%BA%E9%A2%91%E8%B0%B1%E5%9B%BE, with some changes to fit the requirements above. Adding the MFCC feature extraction raises this error:
ValueError: The padded shape () is not compatible with the shape (None,) of the corresponding input component.
(SOS) Which part needs to be changed? I haven't been able to fix it.
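The problem appears to be that tf.keras.utils.audio_dataset_from_directory already returns batched elements (batch_size=64), so by the time padded_batch runs, the label component still has shape (None,), which cannot be matched to the scalar padded shape [] in padded_shapes=([13, None, 1], []). Unbatching before the MFCC map (or loading with batch_size=None) resolves it; a minimal sketch of the change, applied inside make_spec_ds in the code below:

# Before: elements are whole batches -> audio (64, 16000), labels (64,)
# After unbatch: audio (16000,), label () -> compatible with padded_shapes=([13, None, 1], [])
return ds.unbatch().map(get_mfcc_wrapper, num_parallel_calls=tf.data.AUTOTUNE)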
import os
import pathlib
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
# Load the dataset
DATASET_PATH = 'D:/noice/Text/mini_speech_commands'
words = ["down", "go", "left", "no", "right", "stop", "up", "yes"]
data_dir = pathlib.Path(DATASET_PATH)
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print('Commands:', commands)
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=64,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset='both')
label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)
train_ds.element_spec
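# At this point each element is a batch: audio has shape (64, 16000, 1), labels (64,).
# The clips are mono, so drop the trailing channel axis.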
def squeeze(audio, labels):
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels
train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)
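# Split the validation set into two halves: one for testing, one for validation.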
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)
for example_audio, example_labels in train_ds.take(1):
    print(example_audio.shape)
    print(example_labels.shape)
label_names[[1,1,3,0]]
# Plot waveforms and spectrograms
def get_spectrogram(waveform):
    spectrogram = tf.signal.stft(
        waveform, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram
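# Note: with frame_length=255, frame_step=128 and the default fft_length=256,
# a 16000-sample clip yields a spectrogram of shape (124, 129, 1).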
def plot_spectrogram(spectrogram, ax):
    if len(spectrogram.shape) > 2:
        assert len(spectrogram.shape) == 3
        spectrogram = np.squeeze(spectrogram, axis=-1)
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height = log_spec.shape[0]
    width = log_spec.shape[1]
    X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
    Y = range(height)
    ax.pcolormesh(X, Y, log_spec)
# Randomly pick 9 different audio clips from the training set
random_examples = []
for audio, label in train_ds.unbatch().shuffle(buffer_size=10000).take(9):
    random_examples.append((audio, label))
# Plot the waveforms
plt.figure(figsize=(16, 10))
rows = 3
cols = 3
n = rows * cols
for i in range(n):
    plt.subplot(rows, cols, i+1)
    audio_signal, label = random_examples[i]
    plt.plot(audio_signal)
    plt.title(label_names[label])
    plt.yticks(np.arange(-1.2, 1.2, 0.2))
    plt.ylim([-1.1, 1.1])
plt.show()
# Plot the spectrograms
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r = i // cols
    c = i % cols
    ax = axes[r][c]
    waveform, label = random_examples[i]
    spectrogram = get_spectrogram(waveform)
    plot_spectrogram(spectrogram.numpy(), ax)
    ax.set_title(label_names[label])
plt.show()
# Extract MFCC features
def get_mfcc(waveform, sample_rate=16000, n_mfcc=13):
    mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.astype(np.float32)  # match the tf.float32 dtype declared in tf.numpy_function below
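# With librosa's defaults (n_fft=2048, hop_length=512, center=True),
# a 16000-sample clip yields an MFCC matrix of shape (13, 32).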
# Plot the MFCC features
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r = i // cols
    c = i % cols
    ax = axes[r][c]
    waveform, label = random_examples[i]
    mfcc = get_mfcc(waveform.numpy())
    librosa.display.specshow(mfcc, sr=16000, ax=ax, x_axis='time')
    ax.set_title(label_names[label])
plt.show()
# Build MFCC datasets for model training
def make_spec_ds(ds):
    def get_mfcc_wrapper(waveform, label):
        mfcc = tf.numpy_function(get_mfcc, [waveform], tf.float32)
        mfcc.set_shape([13, None])  # keep the time axis variable-length
        mfcc = tf.expand_dims(mfcc, -1)
        return mfcc, label
    # Fix for the ValueError: the dataset is already batched (batch_size=64),
    # so unbatch first. Otherwise the labels keep shape (None,), which is
    # incompatible with the scalar padded shape [] used in padded_batch below.
    return ds.unbatch().map(get_mfcc_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
train_mfcc_ds = make_spec_ds(train_ds)
val_mfcc_ds = make_spec_ds(val_ds)
test_mfcc_ds = make_spec_ds(test_ds)
# Pad the MFCC features to a fixed length and re-batch
def preprocess_dataset(ds, batch_size=64):
    return ds.padded_batch(batch_size, padded_shapes=([13, None, 1], []), padding_values=(0.0, -1))
train_mfcc_ds = preprocess_dataset(train_mfcc_ds)
val_mfcc_ds = preprocess_dataset(val_mfcc_ds)
test_mfcc_ds = preprocess_dataset(test_mfcc_ds)
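# Optional, following the original tutorial: cache and prefetch to reduce input latency.
train_mfcc_ds = train_mfcc_ds.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)
val_mfcc_ds = val_mfcc_ds.cache().prefetch(tf.data.AUTOTUNE)
test_mfcc_ds = test_mfcc_ds.cache().prefetch(tf.data.AUTOTUNE)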
# Check the shape of an example MFCC batch
for example_mfccs, example_mfcc_labels in train_mfcc_ds.take(1):
    print(example_mfccs.shape)
# Build and train the model
input_shape = example_mfccs.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)
norm_layer = layers.Normalization()
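# adapt() computes the mean and variance of the MFCC features, which the
# Normalization layer then uses to standardize its inputs.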
norm_layer.adapt(data=train_mfcc_ds.map(map_func=lambda spec, label: spec))
model = tf.keras.Sequential([
    layers.Input(shape=input_shape),
    # Normalize.
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # Output raw logits; the loss below uses from_logits=True, so no softmax here.
    layers.Dense(num_labels)
])
model.summary()
# Compile with the Adam optimizer
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# Train for up to 10 epochs, stopping early if validation loss plateaus
EPOCHS = 10
history = model.fit(
    train_mfcc_ds,
    validation_data=val_mfcc_ds,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
# Plot the training and validation curves
metrics = history.history
plt.figure(figsize=(16,6))
# Loss
plt.subplot(1,2,1)
plt.plot(history.epoch, metrics['loss'], label='loss')
plt.plot(history.epoch, metrics['val_loss'], label='val_loss')
plt.legend()
plt.ylim([0, max(max(metrics['loss']), max(metrics['val_loss']))])
plt.xlabel('Epoch')
plt.ylabel('Loss [CrossEntropy]')
# Accuracy
plt.subplot(1,2,2)
plt.plot(history.epoch, 100*np.array(metrics['accuracy']), label='accuracy')
plt.plot(history.epoch, 100*np.array(metrics['val_accuracy']), label='val_accuracy')
plt.legend()
plt.ylim([0, 100])
plt.xlabel('Epoch')
plt.ylabel('Accuracy [%]')
plt.show()
# Evaluate model performance on the test set
model.evaluate(test_mfcc_ds, return_dict=True)
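# Optional: a confusion matrix over the test set shows per-class performance
# (the tutorial draws this with seaborn; plain matplotlib is used here to
# avoid an extra dependency).
y_pred = tf.argmax(model.predict(test_mfcc_ds), axis=1)
y_true = tf.concat([labels for _, labels in test_mfcc_ds], axis=0)
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_mtx, cmap='Blues')
plt.xticks(range(len(label_names)), label_names, rotation=45)
plt.yticks(range(len(label_names)), label_names)
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.colorbar()
plt.show()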