Reading the Two Channels of an Audio File and Plotting the Waveform and Spectrogram

Reading a dual-channel waveform and plotting it

import wave
import matplotlib.pyplot as plt
import numpy as np
"""读取双通道波形并绘制波形图"""

f = wave.open("./audio/audio.wav", "rb")

params = f.getparams()
nchannels, sampwidth, framerate, nframes = params[:4]
print("声道数---", nchannels)
print("量化位数---", sampwidth)
print("采样频率---", framerate)
print("采样点数---", nframes)

# readframes() returns raw bytes; convert to 16-bit samples below.
str_data = f.readframes(nframes)
f.close()

wave_data = np.frombuffer(str_data, dtype=np.short)

# Normalize to [-1, 1].
wave_data = wave_data * 1.0 / np.max(np.abs(wave_data))

# De-interleave: one row per frame, one column per channel.
wave_data = np.reshape(wave_data, [nframes, nchannels])

time = np.arange(0, nframes) * (1.0 / framerate)

plt.figure()

plt.subplot(3, 1, 1)
plt.plot(time, wave_data[:, 0])
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.grid()

plt.subplot(3, 1, 3)

plt.plot(time, wave_data[:, 1], c="g")
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.title("right channel")
plt.grid()

plt.show()

This method reads raw bytes, so the data must be converted with np.frombuffer(str_data, dtype=np.short) (np.fromstring is deprecated).
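As a quick sanity check, the byte count returned by readframes() matches the header fields read earlier:

# Each frame holds nchannels samples of sampwidth bytes each.
assert len(str_data) == nframes * nchannels * sampwidth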

A second way to read a WAV file:

from scipy.io import wavfile

sampling_freq, audio = wavfile.read("./audio/audio.wav")

Here audio is returned directly as a NumPy array.
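For a 16-bit stereo file, the returned array looks like this:

from scipy.io import wavfile

sampling_freq, audio = wavfile.read("./audio/audio.wav")
print(audio.dtype)   # int16 for a 16-bit file
print(audio.shape)   # (nframes, nchannels) for stereo, (nframes,) for mono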

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
"""Read a dual-channel waveform with scipy and plot it."""
framerate, wave_data = wavfile.read("./audio/audio.wav")
nframes, nchannels = wave_data.shape   # already (nframes, nchannels) for stereo

time = np.arange(0, nframes) * (1.0 / framerate)

plt.figure()

plt.subplot(3, 1, 1)
plt.plot(time, wave_data[:, 0])
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.grid()

plt.subplot(3, 1, 3)

plt.plot(time, wave_data[:, 1], c="g")
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.title("right channel")
plt.grid()
plt.show()

The result is as follows:

[Figure: left- and right-channel waveform plots]

Short-time frequency-domain processing of audio signals

In speech signal processing, analyzing the signal in the frequency domain (or another transform domain) is very important: features that the signal cannot show in the time domain often become obvious in the frequency domain, and the character of an audio signal is largely determined by its frequency content. The standard tool for this is the short-time Fourier transform.

import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt

sampling_freq, audio = wavfile.read("./audio/audio.wav")

# Normalize; abs() handles negative peaks too.
audio = audio / np.max(np.abs(audio))

# FFT along the time axis (axis 0), one spectrum per channel.
fft_signal = np.fft.fft(audio, axis=0)
print(fft_signal)

# Keep the magnitude only.
fft_signal = abs(fft_signal)
print(fft_signal)

# Map bin index to frequency in Hz.
Freq = np.arange(0, len(fft_signal)) * sampling_freq / len(fft_signal)

plt.figure()
plt.plot(Freq, fft_signal, color='blue')
plt.xlabel('Freq (Hz)')
plt.ylabel('Amplitude')
plt.show()

[Figure: magnitude spectrum of the audio signal]
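The code above transforms the whole signal at once. For the short-time Fourier transform mentioned in the heading, here is a minimal sketch with scipy.signal.stft (the window length and overlap are illustrative values, not taken from the original post):

import numpy as np
from scipy.io import wavfile
from scipy import signal

sampling_freq, audio = wavfile.read("./audio/audio.wav")
if audio.ndim > 1:
    audio = audio[:, 0]                  # analyze one channel
audio = audio / np.max(np.abs(audio))

# 512-sample windows with 75% overlap (illustrative values).
f, t, Zxx = signal.stft(audio, fs=sampling_freq, nperseg=512, noverlap=384)
print(Zxx.shape)                         # (frequency bins, time frames)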

Displaying the dual-channel speech signal together with its spectrum:

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
"""Read a dual-channel waveform and plot it together with its spectrum."""
sampling_freq, f = wavfile.read("./audio/audio.wav")

nframes, nchannels = f.shape   # (frames, channels) for a stereo file
framerate = sampling_freq
wave_data = f

time = np.arange(0, nframes) * (1.0 / framerate)

# Normalize, then FFT along the time axis (one spectrum per channel).
f = f / np.max(np.abs(f))

f = np.fft.fft(f, axis=0)
f = abs(f)

# Map bin index to frequency in Hz.
freq = np.arange(0, len(f)) * framerate / len(f)

plt.figure()

plt.subplot(3, 1, 1)
plt.plot(time, wave_data[:, 0])
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.grid()

plt.subplot(3, 1, 2)
plt.plot(time, wave_data[:, 1], c="g")
plt.xlabel("time (seconds)")
plt.ylabel("Amplitude")
plt.title("Left channel")
plt.title("right channel")
plt.grid()

plt.subplot(3, 1, 3)
plt.plot(freq, f, color='blue')
plt.xlabel('Freq (Hz)')
plt.ylabel('Amplitude')
plt.grid()

plt.show()

[Figure: left channel, right channel, and magnitude spectrum]

Writing a WAV file (synthesizing a WAV file)

First read an existing audio file, then write its data to a new file.

import wave
import numpy as np
import struct

f = wave.open("./audio/audio.wav", "rb")
params = f.getparams()

nchannels_old, sampwidth_old, framerate_old, nframes_old = params[:4]
strData = f.readframes(nframes_old)

waveData = np.frombuffer(strData, dtype=np.int16)
f.close()
waveData = waveData * 1.0 / np.max(np.abs(waveData))

outData = waveData
outwave = wave.open("./audio/compose_test.wav", 'wb')

nchannels = nchannels_old
sampwidth = sampwidth_old
framerate = framerate_old
nframes = nframes_old

comptype = "NONE"
compname = "not compressed"
outwave.setparams((nchannels, sampwidth, framerate, nframes,
                   comptype, compname))

# Scale back to the int16 range and write one sample at a time.
for i in outData:
    outwave.writeframes(struct.pack('h', int(i * 64000 / 2)))

outwave.close()
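Writing one sample at a time with struct.pack is slow for long files. As a sketch of an equivalent, vectorized alternative (assuming outData is normalized to [-1, 1]), the whole loop can be replaced by a single call placed before outwave.close():

# Scale to the int16 range and write all samples in one call.
outwave.writeframes((outData * 32767).astype(np.int16).tobytes())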

Recording

Using SAMPLING_RATE as the sampling rate, read one block of NUM_SAMPLES samples at a time. When at least COUNT_NUM samples in a block exceed LEVEL, start saving the data to a WAV file; once saving has started, save at least SAVE_LENGTH blocks. The WAV file is named after the moment at which it is saved.

The data read from the sound card is binary, just like data read from a WAV file. Since the samples are recorded in paInt16 format (16-bit short), they are converted to a NumPy array with dtype np.short.

'''
Record from the sound card: read NUM_SAMPLES-sample blocks at SAMPLING_RATE;
when at least COUNT_NUM samples in a block exceed LEVEL, save the data to a
WAV file, keeping at least SAVE_LENGTH blocks once saving has started.
See the explanation above for details.
'''

from pyaudio import PyAudio, paInt16
from datetime import datetime
import numpy as np
import wave

def save_wave_file(filename, data):
    wf = wave.open(filename, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLING_RATE)
    wf.writeframes(b"".join(data))
    wf.close()

NUM_SAMPLES = 2000
SAMPLING_RATE = 8000
LEVEL = 1500
COUNT_NUM = 20
SAVE_LENGTH = 8

pa = PyAudio()
stream = pa.open(format=paInt16, channels=1, rate=SAMPLING_RATE, input=True,
                frames_per_buffer=NUM_SAMPLES)

save_count = 0
save_buffer = []

while True:

    string_audio_data = stream.read(NUM_SAMPLES)

    audio_data = np.frombuffer(string_audio_data, dtype=np.short)

    large_sample_count = np.sum( audio_data > LEVEL )
    print(np.max(audio_data))

    if large_sample_count > COUNT_NUM:
        save_count = SAVE_LENGTH
    else:
        save_count -= 1

    if save_count < 0:
        save_count = 0

    if save_count > 0:

        save_buffer.append( string_audio_data )
    else:

        if len(save_buffer) > 0:
            # Name the file after the moment it is saved, as described above.
            filename = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + ".wav"
            save_wave_file(filename, save_buffer)
            print(filename, "saved")
            break

stream.stop_stream()
stream.close()
pa.terminate()

MFCC feature extraction

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank

sampling_freq, audio = wavfile.read("./audio/audio.wav")
if audio.ndim > 1:
    audio = audio[:, 0]   # the MFCC routines expect a mono signal

mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

print('\nMFCC:\nnumber of windows =', mfcc_features.shape[0])
print('length of each feature =', mfcc_features.shape[1])
print('\nFilter bank:\nnumber of windows =', filterbank_features.shape[0])
print('length of each feature =', filterbank_features.shape[1])

mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()

The result is as follows (the input audio here is fairly long, so the plot is stretched; in typical audio processing the input is short-time audio and the plot looks rectangular. If you are interested, try different inputs):

[Figure: MFCC and filter-bank feature maps]
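python_speech_features also lets you set the analysis window explicitly; the values below are the library defaults, spelled out only to make the framing visible:

# Same call as above, with the default windowing made explicit.
mfcc_features = mfcc(audio, sampling_freq,
                     winlen=0.025,   # 25 ms analysis window
                     winstep=0.01,   # 10 ms hop between windows
                     numcep=13)      # 13 cepstral coefficients per window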

Spectrogram

import wave
import matplotlib.pyplot as plt
import numpy as np

f = wave.open('./audio/audio.wav', 'rb')
params = f.getparams()
nchannels, sampwidth, framerate, nframes = params[:4]
strdata = f.readframes(nframes)
wavedata = np.frombuffer(strdata, dtype=np.int16)
wavedata = wavedata * 1.0 / np.max(np.abs(wavedata))
wavedata = np.reshape(wavedata, [nframes, nchannels]).T
f.close()

plt.specgram(wavedata[0], Fs=framerate, scale_by_freq=True, sides='default')
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')
plt.show()

[Figure: spectrogram of the left channel]
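plt.specgram uses NFFT=256 and noverlap=128 by default; making them explicit lets you trade time resolution against frequency resolution. A variant of the call above with a longer window:

plt.specgram(wavedata[0], Fs=framerate, NFFT=512, noverlap=256)
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')
plt.show()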

Speech recognition

The following is a speech-recognition example that trains one Gaussian HMM per word class on MFCC features:

import os
import argparse
import numpy as np
from scipy.io import wavfile
from hmmlearn import hmm
from python_speech_features import mfcc

def build_arg_parser():
    parser = argparse.ArgumentParser(description='Trains the HMM classifier')
    parser.add_argument( "--input-folder",dest="input_folder", required=True,
            help="Input folder containing the audio files in subfolders")
    return parser

class HMMTrainer(object):
    '''Wrapper around a Gaussian hidden Markov model.
    n_components: number of hidden states
    cov_type: covariance type of the Gaussian emissions
    n_iter: number of training iterations
    '''
    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        self.models = []

        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(n_components=self.n_components,
                    covariance_type=self.cov_type, n_iter=self.n_iter)
        else:
            raise TypeError('Invalid model type')

    def train(self, X):
        np.seterr(all='ignore')
        self.models.append(self.model.fit(X))

    def get_score(self, input_data):
        return self.model.score(input_data)

if __name__=='__main__':

    # The argument list is hard-coded here for demonstration.
    args = build_arg_parser().parse_args(['--input-folder', 'input_folder'])
    input_folder = args.input_folder

    hmm_models = []

    for dirname in os.listdir(input_folder):

        subfolder = os.path.join(input_folder, dirname)
        if not os.path.isdir(subfolder):
            continue

        label = os.path.basename(subfolder)

        X = np.array([])
        y_words = []

        # Keep the last .wav file of each class for testing.
        for filename in [x for x in os.listdir(subfolder) if x.endswith('.wav')][:-1]:

            filepath = os.path.join(subfolder, filename)
            sampling_freq, audio = wavfile.read(filepath)

            mfcc_features = mfcc(audio, sampling_freq)

            if len(X) == 0:
                X = mfcc_features
            else:
                X = np.append(X, mfcc_features, axis=0)

            y_words.append(label)

        print('X.shape =', X.shape)

        hmm_trainer = HMMTrainer()
        hmm_trainer.train(X)
        hmm_models.append((hmm_trainer, label))

        hmm_trainer = None

    input_files = [
            'data/pineapple/pineapple15.wav',
            'data/orange/orange15.wav',
            'data/apple/apple15.wav',
            'data/kiwi/kiwi15.wav'
            ]

    for input_file in input_files:

        sampling_freq, audio = wavfile.read(input_file)

        mfcc_features = mfcc(audio, sampling_freq)

        # Pick the model with the highest log-likelihood. Scores are log
        # probabilities (usually negative), so start from -inf, not 0.
        max_score = -float('inf')
        output_label = None

        for hmm_model, label in hmm_models:
            score = hmm_model.get_score(mfcc_features)
            if score > max_score:
                max_score = score
                output_label = label

        print("\nTrue:", input_file[input_file.find('/')+1:input_file.rfind('/')])
        print("Predicted:", output_label)

[Figure: true vs. predicted labels for the test files]
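Note that the script hard-codes its argument list for demonstration. To run it from a shell, parse the real command line instead (the script name in the comment is hypothetical):

# e.g. invoked as: python speech_recognizer.py --input-folder data
args = build_arg_parser().parse_args()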

Signal framing

[Figure: framing with frame length wlen, frame shift inc, and the overlap between frames]

Here wlen is the frame length, inc is the frame shift (hop), and the overlap between consecutive frames is overlap = wlen - inc.

The number of frames is fn = (N - overlap) / inc = (N - wlen) / inc + 1,

where N is the length of the speech signal.
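As a quick numeric check of the formula (the values are illustrative):

import numpy as np

N, wlen, inc = 12000, 512, 128            # signal length, frame length, hop
overlap = wlen - inc                       # 384 samples shared between frames
fn = int(np.ceil((N - wlen) / inc)) + 1    # frame count, padding the last frame
print(overlap, fn)                         # 384 91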

Windowing

Signals are usually truncated and split into frames, and each frame should be windowed: truncation leaks energy in the frequency domain, and a window function reduces the impact of the truncation. Window functions live in the scipy.signal signal-processing toolbox, as shown below.
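For example, scipy.signal exposes the common windows by name:

from scipy import signal

wlen = 512
hann = signal.get_window('hann', wlen)    # periodic (DFT-even) by default
hamming = signal.windows.hamming(wlen)    # symmetric by default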

Short-time energy

Short-time energy is mainly used to distinguish voiced from unvoiced segments.

The following code plots an unwindowed frame, a windowed frame, and the squared samples of the windowed frame:

import numpy as np
import wave
import matplotlib.pyplot as plt
import scipy.signal as signal

wlen = 512
inc = 128
f = wave.open('./data/apple/apple01.wav', 'rb')
params = f.getparams()
nchannels, sampwidth, framerate, nframes = params[:4]
str_data = f.readframes(nframes)
f.close()
wave_data = np.frombuffer(str_data, dtype=np.short)
wave_data = wave_data * 1.0 / np.max(np.abs(wave_data))

time = np.arange(0, wlen) * (1.0 / framerate)

signal_length = len(wave_data)

# One frame if the signal is shorter than a frame; otherwise pad the
# tail so that the last partial frame is kept.
if signal_length <= wlen:
    nf = 1
else:
    nf = int(np.ceil((1.0 * signal_length - wlen + inc) / inc))

pad_length = int((nf - 1) * inc + wlen)
zeros = np.zeros((pad_length - signal_length,))
pad_signal = np.concatenate((wave_data, zeros))

# Row i of `indices` holds the sample indices of frame i.
indices = np.tile(np.arange(0, wlen), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (wlen, 1)).T

indices = np.array(indices, dtype=np.int32)
frames = pad_signal[indices]

a = frames[10:11]           # take the 11th frame (index 10)

windown = np.hanning(wlen)  # Hanning window
b = a[0] * windown          # windowed frame
c = np.square(b)            # squared samples (per-sample power)

plt.figure(figsize=(10, 4))

plt.subplot(3, 1, 1)
plt.plot(time, a[0], c='b')
plt.xlabel("no-window")
plt.grid()

plt.subplot(3, 1, 2)
plt.plot(time, b, c='g')
plt.xlabel("hanning-window")
plt.grid()

plt.subplot(3, 1, 3)
plt.plot(time, c, c='r')
plt.xlabel("short-power")
plt.grid()
plt.show()

[Figure: unwindowed frame, Hanning-windowed frame, and squared samples]

Take the 11th frame (index 10):

a = frames[10:11]

Apply a Hanning window:

windown = np.hanning(wlen)
b = a[0] * windown
c = np.square(b)

plt.figure(figsize=(10, 4))

Plot the unwindowed frame:

plt.subplot(3, 1, 1)
plt.plot(time, a[0], c='b')
plt.xlabel("no-window")
plt.grid()

Plot the Hanning-windowed frame:

plt.subplot(3, 1, 2)
plt.plot(time, b, c='g')
plt.xlabel("hanning-window")
plt.grid()

Plot the squared (windowed) samples:

plt.subplot(3, 1, 3)
plt.plot(time, c, c='r')
plt.xlabel("short-power")
plt.grid()
plt.show()
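The third subplot shows the squared samples of one windowed frame; the short-time energy proper is the sum of those squares per frame. A minimal sketch over all frames, reusing frames, windown, nf, inc and framerate from the code above:

# Short-time energy: sum of squared, windowed samples in each frame.
energy = np.sum(np.square(frames * windown), axis=1)
frame_time = np.arange(nf) * inc / framerate   # start time of each frame

plt.plot(frame_time, energy)
plt.xlabel("time (seconds)")
plt.ylabel("short-time energy")
plt.show()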


Original: https://blog.csdn.net/qq_40703471/article/details/109162885
Author: 大鱼不做程序猿
Title: Reading the Two Channels of an Audio File and Plotting the Waveform and Spectrogram
