精华内容
下载资源
问答
  • MFCC python plot

    千次阅读 2018-06-20 11:33:29
    #!/usr/bin/env python ...from python_speech_features import mfcc from python_speech_features import delta from python_speech_features import logfbank import scipy.io.wavfile as wav import pickl...
    #!/usr/bin/env python
    import os
    from python_speech_features import mfcc
    from python_speech_features import delta
    from python_speech_features import logfbank
    import scipy.io.wavfile as wav
    import pickle
    import numpy as np
    import matplotlib.pyplot as plt

    rootdir = '...'  # directory containing the .wav files

    def extract_and_plot(rootdir):
        """Extract features for every .wav file directly inside `rootdir`.

        For each file: compute mean-normalised MFCCs, show a heat map of the
        first 25 frames, and compute log mel filter-bank features.

        Returns a dict mapping file name -> log filter-bank feature matrix.
        """
        features = {}
        # NOTE: the original bound the listing to the name `list`, shadowing
        # the builtin; iterate the entries directly instead of by index.
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            if not os.path.isfile(path):
                continue
            print(entry)
            (rate, sig) = wav.read(path)
            mfcc_feat = mfcc(sig, rate)  # pass nfft=551 here if the frame length exceeds the default NFFT
            # Mean-normalise each cepstral coefficient (epsilon avoids exact zeros).
            mfcc_feat -= (np.mean(mfcc_feat, axis=0) + 1e-8)
            # Keep only the first 25 frames so trailing silence does not blank out the plot.
            mfcc_feat2 = mfcc_feat[0:25, :]

            fig, ax = plt.subplots()
            mfcc_data = np.swapaxes(mfcc_feat2, 0, 1)  # coefficients on the y axis
            plt.imshow(mfcc_data, cmap=plt.cm.jet,
                       extent=[0, mfcc_data.shape[1], 0, mfcc_data.shape[0]],
                       aspect='auto')
            ax.set_title('MFCC')
            plt.show()

            d_mfcc_feat = delta(mfcc_feat, 2)  # delta features (computed but unused here)
            fbank_feat = logfbank(sig, rate, nfft=551)
            print(len(fbank_feat))
            features[entry] = fbank_feat
        return features

    if __name__ == '__main__':
        dic = extract_and_plot(rootdir)
        # BUG FIX: the pickle file handle was never closed; `with` guarantees it.
        with open(os.path.join(rootdir, 'data.pkl'), 'wb') as output:
            pickle.dump(dic, output)

    重点想记录的是MFCC的画图,热力图的重点一句是plt.imshow(mfcc_data, cmap=plt.cm.jet, extent=[0, mfcc_data.shape[1], 0, mfcc_data.shape[0]], aspect='auto'),如果音频文件本身有空白,可能出现如下情况
    这里写图片描述
    加了截断的代码mfcc_feat2 = mfcc_feat[0:25,:],不显示零值(空白)部分,让整体更好看
    这里写图片描述

    参考资料:
    stackoverflow
    Speech Processing for Machine Learning
    python 实现MFCC

    展开全文
  • 提取语音的 mfcc 特征,根据别人分享的代码,自己做了开发,提取基本特征 ,一级差分,二级差分
  • MFCC特征提取Python实现

    2018-08-02 15:11:55
    语音特征提取之MFCC特征提取的Python实现,包括一阶差分和二阶差分系数
  • python实现mfcc

    千次阅读 2019-09-18 15:57:02
    1、利用python_speech_features库编写MFCC特征提取,生成40维的mfcc向量 import scipy.io.wavfile as wavfile from python_speech_features import mfcc, delta def read_wave_data(filename): """获取语音文件...

    1、利用python_speech_features库编写MFCC特征提取,生成40维的mfcc向量

    import numpy as np
    import scipy.io.wavfile as wavfile
    from python_speech_features import mfcc, delta
        
    def read_wave_data(filename):
        """Read a wave file; return (sample_rate, signal) as given by scipy.

        BUG FIX: the original docstring line was tab-indented while the body
        used spaces, which is a TabError under Python 3.
        """
        fs, wavsignal = wavfile.read(filename)  # signal as a numpy array
        return fs, wavsignal
    
    def extract_mfcc(wav_arr, sample_rate):
        """Return a (39, frames) MFCC matrix: 13 static + delta + delta-delta.

        BUG FIXES vs. the original:
        - np.stack(mfcc_feat, energy) passed `energy` as the *axis* argument
          and raised; np.hstack with a tuple is the correct call.
        - energy = np.sqrt(wav_arr) took the square root of raw (possibly
          negative) samples and its length did not match the frame count.
          Frame log-energy is already carried in the first cepstral
          coefficient (python_speech_features appends it by default), so no
          separate energy column is needed.
        """
        mfcc_feat = mfcc(wav_arr, sample_rate)  # (frames, 13); c0 = log energy
        delta1 = delta(mfcc_feat, 1)            # first-order differences
        delta2 = delta(delta1, 1)               # second-order differences
        feats = np.hstack((mfcc_feat, delta1, delta2))  # (frames, 39)
        return feats.T
    

    2、利用librosa库编写mfcc特征提取

    import librosa

    def extract_mfcc_librosa(filename):
        """Load `filename` with librosa and return its (n_mfcc, frames) MFCC matrix.

        BUG FIX: the original fragment had a bare `return` outside any
        function (SyntaxError); it is wrapped in a function here. The call
        also uses the keyword-only `y=` form required by modern librosa.
        """
        wavsignal, fs = librosa.load(filename)  # NOTE: librosa resamples to 22050 Hz by default
        mfccs = librosa.feature.mfcc(y=wavsignal, sr=fs)
        return mfccs
    

    3、python代码直接编写mfcc特征提取模块

    import numpy
    import scipy.io.wavfile
    from scipy.fftpack import dct

    def compute_mfcc(signal, sample_rate, pre_emphasis=0.97, frame_size=0.025,
                     frame_stride=0.1, NFFT=512, nfilt=40, num_ceps=12,
                     cep_lifter=22):
        """Compute MFCCs and log mel filter-bank features for a 1-D signal.

        Parameters keep the original script's values as defaults.
        NOTE(review): frame_stride=0.1 (100 ms hop) is kept from the original;
        the conventional value is 0.01 — confirm which was intended.

        Returns
        -------
        mfcc_coeffs : (num_frames, num_ceps) liftered, mean-normalised cepstra
        filter_banks : (num_frames, nfilt) log filter-bank energies in dB
        """
        # Pre-emphasis boosts high frequencies: y[n] = x[n] - a*x[n-1].
        emphasized_signal = numpy.append(signal[0],
                                         signal[1:] - pre_emphasis * signal[:-1])

        # Split into overlapping frames, zero-padding the tail.
        frame_length = int(round(frame_size * sample_rate))
        frame_step = int(round(frame_stride * sample_rate))
        signal_length = len(emphasized_signal)
        num_frames = int(numpy.ceil(
            float(numpy.abs(signal_length - frame_length)) / frame_step))

        pad_signal_length = num_frames * frame_step + frame_length
        z = numpy.zeros((pad_signal_length - signal_length))
        pad_signal = numpy.append(emphasized_signal, z)

        indices = (numpy.tile(numpy.arange(0, frame_length), (num_frames, 1))
                   + numpy.tile(numpy.arange(0, num_frames * frame_step, frame_step),
                                (frame_length, 1)).T)
        # BUG FIX: the numpy.mat() wrapper is deprecated (removed in NumPy 2)
        # and unnecessary — plain fancy indexing yields the same frames.
        frames = pad_signal[indices.astype(numpy.int32, copy=False)]

        # Hamming window reduces spectral leakage.
        frames *= numpy.hamming(frame_length)

        # Magnitude FFT and power spectrum.
        mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))
        pow_frames = (1.0 / NFFT) * (mag_frames ** 2)

        # Mel filter bank: nfilt triangular filters equally spaced in mel.
        low_freq_mel = 0
        high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 2) / 700)
        mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
        hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
        # FFT bin of each filter edge ('bin' shadowed the builtin before).
        bin_edges = numpy.floor((NFFT + 1) * hz_points / sample_rate)

        fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
        for m in range(1, nfilt + 1):
            f_m_minus = int(bin_edges[m - 1])   # left edge
            f_m = int(bin_edges[m])             # centre
            f_m_plus = int(bin_edges[m + 1])    # right edge
            for k in range(f_m_minus, f_m):
                fbank[m - 1, k] = (k - bin_edges[m - 1]) / (bin_edges[m] - bin_edges[m - 1])
            for k in range(f_m, f_m_plus):
                fbank[m - 1, k] = (bin_edges[m + 1] - k) / (bin_edges[m + 1] - bin_edges[m])

        filter_banks = numpy.dot(pow_frames, fbank.T)
        filter_banks = numpy.where(filter_banks == 0,
                                   numpy.finfo(float).eps, filter_banks)  # numerical stability
        filter_banks = 20 * numpy.log10(filter_banks)  # dB

        # Keep cepstral coefficients 1..num_ceps (c0 discarded), then lifter.
        mfcc_coeffs = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1:(num_ceps + 1)]
        (nframes, ncoeff) = mfcc_coeffs.shape
        n = numpy.arange(ncoeff)
        lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
        mfcc_coeffs *= lift  # sinusoidal liftering de-emphasises higher cepstra

        # Mean normalisation per coefficient.
        mfcc_coeffs -= (numpy.mean(mfcc_coeffs, axis=0) + 1e-8)
        return mfcc_coeffs, filter_banks

    if __name__ == '__main__':
        # matplotlib is only needed for plotting, so import it lazily here.
        from matplotlib import pyplot as plt
        sample_rate, signal = scipy.io.wavfile.read('OSR_us_000_0010_8k.wav')
        print(sample_rate, len(signal))
        signal = signal[0:int(3.5 * sample_rate)]  # first 3.5 s only
        print(signal)
        mfcc_feat, filter_banks = compute_mfcc(signal, sample_rate)
        print(mfcc_feat.shape)
        plt.plot(filter_banks)
        plt.show()
    
    原文链接:https://blog.csdn.net/TH_NUM/article/details/80597495
    
    展开全文
  • python提取MFCC

    2018-04-02 10:16:48
    包括提取MFCC的完整步骤,测试过可直接用,分享给大家
  • MFCC提取的Python代码

    2019-03-25 16:27:52
    Python语言写的语音的MFCC特征提取代码,用于机器学习、深度学习等音频分类,语音识别,语音情感识别等领域
  • python提取mfcc特征

    千次阅读 2019-03-26 16:23:14
    理论部分请看:...这是我认为最完整的、讲解最清楚的python提取mfcc特征的教程。 用到的OSR_us_000_0010_8k.wav数据在这里下载: http://www.voiptroubleshooter.com/open_speec...

    理论部分请看:https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
    这是我认为最完整的、讲解最清楚的python提取mfcc特征的教程。
    用到的OSR_us_000_0010_8k.wav数据在这里下载:
    http://www.voiptroubleshooter.com/open_speech/american.html
    完整Python代码如下:

    import numpy as np
    import scipy.io.wavfile

    def compute_filter_banks(signal, sample_rate, pre_emphasis=0.97,
                             frame_size=0.025, frame_stride=0.01,
                             NFFT=512, nfilt=40):
        """Return (num_frames, nfilt) log mel filter-bank energies in dB.

        Parameters keep the original script's values as defaults.
        """
        # Pre-emphasis: y[n] = x[n] - a*x[n-1].
        emphasized = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

        frame_length = int(round(frame_size * sample_rate))
        # BUG FIX: frame_step was never converted to int, producing float
        # frame indices downstream.
        frame_step = int(round(frame_stride * sample_rate))
        signal_length = len(emphasized)
        num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
        pad_signal_length = num_frames * frame_step + frame_length
        pad_signal = np.append(emphasized, np.zeros(int(pad_signal_length - signal_length)))

        # Frame index matrix: one row of sample indices per frame.
        indices = (np.tile(np.arange(0, frame_length), (num_frames, 1))
                   + np.tile(np.arange(0, num_frames * frame_step, frame_step),
                             (frame_length, 1)).T)
        frames = pad_signal[indices.astype(np.int32, copy=False)]
        frames *= np.hamming(frame_length)  # window against spectral leakage

        mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
        pow_frames = (1.0 / NFFT) * (mag_frames ** 2)  # power spectrum

        # Triangular mel filter bank.
        low_freq_mel = 0
        high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)
        mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
        hz_points = 700 * (10 ** (mel_points / 2595) - 1)
        # FFT bin of each filter edge ('bin' shadowed the builtin before).
        bin_edges = np.floor((NFFT + 1) * hz_points / sample_rate)
        fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
        for m in range(1, nfilt + 1):
            f_m_minus = int(bin_edges[m - 1])   # left edge
            f_m = int(bin_edges[m])             # centre
            f_m_plus = int(bin_edges[m + 1])    # right edge

            for k in range(f_m_minus, f_m):
                fbank[m - 1, k] = (k - bin_edges[m - 1]) / (bin_edges[m] - bin_edges[m - 1])
            for k in range(f_m, f_m_plus):
                fbank[m - 1, k] = (bin_edges[m + 1] - k) / (bin_edges[m + 1] - bin_edges[m])

        filter_banks = np.dot(pow_frames, fbank.T)
        filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
        return 20 * np.log10(filter_banks)  # dB

    if __name__ == '__main__':
        # matplotlib is only needed for plotting, so import it lazily here.
        import matplotlib.pylab as plt
        sample_rate, signal = scipy.io.wavfile.read("OSR_us_000_0010_8k.wav")
        signal = signal[0: int(3.5 * sample_rate)]
        filter_banks = compute_filter_banks(signal, sample_rate)
        # BUG FIX: imshow's removed `shape=` kwarg and plt.axis("normal")
        # raise in modern matplotlib; use extent/aspect and axis("auto").
        plt.imshow(np.flipud(filter_banks.T), cmap=plt.cm.jet, aspect=0.2,
                   extent=[0, filter_banks.shape[1], 0, filter_banks.shape[0]])
        plt.axis("auto")
        plt.savefig('./test2.png')
        plt.show()
    

    运行的结果如下:
    在这里插入图片描述

    展开全文
  • 1.比较代码 import librosa import os ...from python_speech_features import mfcc #读取某文件夹下的所有.wav文件,并返回文件全称 def file_name(file_dir): L = [] for root, dirs, files in

    1.比较代码

    import librosa
    import os
    import time
    import numpy as np
    import scipy.io.wavfile as wav
    import python_speech_features
    from python_speech_features import mfcc
    
    # Collect the full paths of every .wav file under file_dir.
    def file_name(file_dir):
        """Return the paths of all .wav files below `file_dir`, recursively.

        BUG FIX: the original `return` sat inside the os.walk loop, so only
        the first (top-level) directory was ever scanned even though the
        intent was "all .wav files under the folder".
        """
        wav_paths = []
        for root, dirs, files in os.walk(file_dir):
            for file in files:
                if os.path.splitext(file)[1] == '.wav':
                    wav_paths.append(os.path.join(root, file))
        return wav_paths
    '''
    调用python_speech_features包
    '''
    def mfcc_1(filename):
        """39-dim MFCC via python_speech_features.

        Returns an array of shape (frames, 39): 13 static coefficients plus
        first- and second-order deltas. (`nfilt` below is the filter-bank
        size; the number of returned cepstra is controlled by `numcep`,
        which stays at its default of 13.)
        """
        sample_rate, samples = wav.read(filename)
        # 64 ms windows with a 32 ms hop; nfft=1024 covers the window length.
        static = mfcc(samples, sample_rate, winlen=0.064, winstep=0.032,
                      nfilt=13, nfft=1024)
        first_order = python_speech_features.base.delta(static, 1)   # delta
        second_order = python_speech_features.base.delta(static, 2)  # delta-delta
        return np.hstack((static, first_order, second_order))
    '''
    调用librosa包
    '''
    def mfcc_2(filename):
        """39-dim MFCC via librosa; returns an array of shape (frames, 39)."""
        samples, sample_rate = librosa.load(filename, sr=None)  # sr=None keeps the native rate
        coeffs = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=39)
        coeffs = np.array(coeffs)  # ensure a plain ndarray
        # librosa returns (n_mfcc, frames); transpose to (frames, n_mfcc).
        return coeffs.T
    
    def main():
        """Time MFCC extraction over every .wav file under ./data.

        BUG FIX: the original mixed tab- and space-indented lines inside this
        function, which is a TabError under Python 3.
        """
        # Paths.
        wav_dir = "./data"
        filenames = file_name(wav_dir)
        # Start the clock.
        start_time = time.time()
        # Extract MFCCs for every file (python_speech_features version).
        for filename in filenames:
            mfccs = mfcc_1(filename)
        # Stop the clock and report.
        end_time = time.time()
        print("程序运行时长", str(end_time - start_time))

    if __name__ == '__main__':
        main()
    
    

    2.结果

    本次测试使用的是1200条语音,分别调用python_speech_features和librosa两个包求取39维MFCC参数,由于在后续的训练中需要提取大量语音的mfcc参数,这里比对了两种方法的运行时间:

    2.1 调用python_speech_features包生成mfccs参数

    程序运行时长 6.4495463371276855 s
    

    2.2 调用librosa包生成mfccs参数

    程序运行时长 9.830938816070557 s
    

    在进行的速度方面,使用python_speech_features包生成的速度较快一些。

    3. mfccs波形比对

    在这里插入图片描述

    4. 后续训练差异

    这一块的内容,后续会补上。

    展开全文
  • 文章目录准备工作1 使用python_speech_features进行mfcc1 在导入包的时候直接将mfcc,logfbank(dct之前的参数),delta(差分)导入2 在导入包的时候只导入包,不导入具体函数2 使用librosa包进行mfcc 准备工作 首先需要...
  • MFCC算法-Python

    2019-06-13 05:21:40
    Python示例代码 import numpy, numpy.fft def mel(f): return 2595. * numpy.log10(1. + f / 700.) def melinv(m): return 700. * (numpy.power(10., m / 2595.) - 1.) class MFCC(object): def __init...
  • import numpy import scipy.io.wavfile from matplotlib import pyplot as plt from scipy.fftpack import dct sample_rate,signal=scipy.io.wavfile.read('OSR_us_000_0010_8k.wav') print(sample_rate,len...
  • 虽然源码中有介绍,但是为了方便理解和使用。我加入自己的理解。顺便把里面的英语翻译下 # calculate filterbank features. Provides e.g. fbank and mfcc features for ...from python_speech_features import sigproc
  • 语音识别系统的第一步是进行特征提取,mfcc是描述短时功率谱包络的一种特征,在语音识别系统中被广泛应用。 一、mel滤波器 每一段语音信号被分为多帧,每帧信号都对应一个频谱(通过FFT变换实现),频谱表示频率与...
  • 如下所示: import scipy.io.wavfile as wav from python_speech_features import mfcc fs, audio = wav.read("abc.wav") feature_mfcc = mfcc(audio, samplerate=fs) ...以上这篇利用python提取wav文件的mfcc方法就是小
  • 1、Librosa import librosa filepath = "/Users/birenjianmo/Desktop/learn/librosa/mp3/in.wav" y,sr = librosa.load(filepath) ...from python_speech_features import mfcc as pmfcc filepath = "/Users/bire

空空如也

空空如也

1 2 3 4 5 ... 20
收藏数 1,633
精华内容 653
关键字:

mfccpython

python 订阅