diff --git a/VoiceActivityDetection.py b/VoiceActivityDetection.py
new file mode 100644
index 0000000..12352c7
--- /dev/null
+++ b/VoiceActivityDetection.py
@@ -0,0 +1,35 @@
+__author__ = 'Varun Nayyar'
+__doc__ = \
+    """
+    This file is meant to be modified by users to provide their own Voice Activity Detection (VAD) functions.
+    Not every frame contains speech, and in many situations it is common to discard the silent frames.
+
+    These functions can be used in most base functions by passing VAD=myVADfunction, where
+    myVADfunction follows the template provided below.
+    """
+import numpy as np
+
+def templateVAD(frames, sig):
+    """
+    :param frames: numpy array of [NumFrames][SamplesPerFrame] holding all the speech frames
+    :param sig: the entire signal [sigLen]
+    :return: the subset of frames in which voice activity is detected
+    """
+    raise NotImplementedError
+
+
+def simpleVAD(frames, sig, threshold=0.01):
+    """
+    :param frames: numpy array of [NumFrames][SamplesPerFrame] holding all the speech frames
+    :param sig: the entire signal [sigLen]
+    :param threshold: fraction of the signal's average power a frame must exceed to be considered active
+    :return: the subset of frames in which voice activity is detected
+
+    Note that the variance of a frame/signal represents its average power,
+    so this is a power-threshold activity detector applied frame by frame.
+    """
+
+
+    frameVars = np.var(frames, 1)
+    reducedFrames = frames[np.where(frameVars > sig.var() * threshold)]
+    return reducedFrames
diff --git a/example.py b/example.py
index 094a82c..04bbb18 100644
--- a/example.py
+++ b/example.py
@@ -6,4 +6,13 @@
 mfcc_feat = mfcc(sig,rate)
 fbank_feat = logfbank(sig,rate)
 
-print fbank_feat[1:3,:]
+# print fbank_feat[1:3,:]
+print mfcc_feat[1:3,:]
+
+# Voice Activity Detection example
+from VoiceActivityDetection import simpleVAD
+
+print mfcc_feat.shape
+mfcc_feat = mfcc(sig,rate, VAD=simpleVAD)
+print mfcc_feat.shape
+
diff --git a/features/base.py b/features/base.py
index 0e82537..e67a738 100644
--- a/features/base.py
+++ b/features/base.py
@@ -5,7 +5,7 @@
 from scipy.fftpack import dct
 
 def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
-         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
+         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, VAD=None):
     """Compute MFCC features from an audio signal.
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -20,9 +20,10 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
     :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
     :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
     :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
""" - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD) feat = numpy.log(feat) feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] feat = lifter(feat,ceplifter) @@ -30,7 +31,7 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, return feat def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None): """Compute Mel-filterbank energy features from an audio signal. :param signal: the audio signal from which to compute features. Should be an N*1 array @@ -41,13 +42,14 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, :param nfft: the FFT size. Default is 512. :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param VAD: Voice Activity Detection function, see VoiceActivityDetection.py :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed) """ highfreq= highfreq or samplerate/2 signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD) pspec = sigproc.powspec(frames,nfft) energy = numpy.sum(pspec,1) # this stores the total energy in each frame @@ -56,7 +58,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, return feat,energy def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None): """Compute log Mel-filterbank energy features from an audio signal. :param signal: the audio signal from which to compute features. Should be an N*1 array @@ -67,14 +69,15 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, :param nfft: the FFT size. Default is 512. :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param VAD: Voice Activity Detection function, see VoiceActivityDetection.py :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD) return numpy.log(feat) def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None): """Compute Spectral Subband Centroid features from an audio signal. :param signal: the audio signal from which to compute features. 
@@ -85,12 +88,13 @@ def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
     :param nfft: the FFT size. Default is 512.
     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
     :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
     """
     highfreq= highfreq or samplerate/2
     signal = sigproc.preemphasis(signal,preemph)
-    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
+    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD)
     pspec = sigproc.powspec(frames,nfft)
     fb = get_filterbanks(nfilt,nfft,samplerate)
diff --git a/features/sigproc.py b/features/sigproc.py
index ea16ad0..accd863 100644
--- a/features/sigproc.py
+++ b/features/sigproc.py
@@ -4,13 +4,14 @@
 import numpy
 import math
 
-def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
+def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x)), VAD=None):
     """Frame a signal into overlapping frames.
 
     :param sig: the audio signal to frame.
     :param frame_len: length of each frame measured in samples.
     :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
     :returns: an array of frames. Size is NUMFRAMES by frame_len.
     """
     slen = len(sig)
@@ -29,12 +30,18 @@ def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
     indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T
     indices = numpy.array(indices,dtype=numpy.int32)
     frames = padsignal[indices]
-    win = numpy.tile(winfunc(frame_len),(numframes,1))
+
+    if VAD is not None:
+        frames = VAD(frames, sig)
+
+    win = numpy.tile(winfunc(frame_len), (frames.shape[0], 1))
+
     return frames*win
 
 def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
-    """Does overlap-add procedure to undo the action of framesig.
+    """Does overlap-add procedure to undo the action of framesig.
+    Not applicable if Voice Activity Detection has been used in framesig.
 
     :param frames: the array of frames.
     :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
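
As a companion to the template in VoiceActivityDetection.py, here is a minimal sketch of a custom detector that follows the templateVAD signature and plugs in the same way as simpleVAD, e.g. mfcc(sig, rate, VAD=hangoverVAD). The name hangoverVAD, the 12 dB margin, and the 3-frame hangover are illustrative assumptions and are not part of this patch.

import numpy as np

def hangoverVAD(frames, sig, margin_db=12.0, hangover=3):
    """Illustrative VAD sketch: dB power threshold plus a short hangover.

    :param frames: numpy array of [NumFrames][SamplesPerFrame]
    :param sig: the entire signal, used as the power reference
    :return: the subset of frames judged to contain activity
    """
    # per-frame power relative to the whole-signal average power, in dB
    frame_power = np.var(frames, axis=1)
    ref_power = sig.var()
    active = 10 * np.log10(frame_power / (ref_power + 1e-12) + 1e-12) > -margin_db
    # keep `hangover` extra frames after each active region so word
    # endings are not clipped off
    keep = active.copy()
    for k in range(1, hangover + 1):
        keep[k:] |= active[:-k]
    return frames[keep]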