📄 fe_vus.cpp
字号:
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
#include "StdAfx.h"
#include "FE_feature.h"
/* Tuning constants for the frame-level noise / zero-crossing trackers used by Fe::vus_basic(). */

/* Number of initial frames during which the running averages use a plain 1/(n+1) mean
   instead of a fixed forgetting factor. */
static const float EPD_NB_FRAME_THRESHOLD_LTE = static_cast<float>(10);
/* Forgetting factor of the noise-energy average once the warm-up frames are over. */
static const float EPD_LAMBDA_LTE = static_cast<float>(0.97);
/* Forgetting factor of the mean zero-crossing average once the warm-up frames are over (old=0.99). */
static const float EPD_LAMBDA_ZCR = static_cast<float>(0.97);
/* The noise estimate is updated when the frame energy exceeds it by less than this amount. */
static const float EPD_SNR_THRESHOLD_UPD_LTE = static_cast<float>(4);
/* The mean zero-crossing count is updated when the frame count exceeds it by less than this amount. */
static const float EPD_ZCR_THRESHOLD_UPD_LTE = static_cast<float>(10);
/* Lower clamp applied to both the frame energy and the noise-energy estimate. */
static const float EPD_NOISE_ENERGY_FLOOR = static_cast<float>(14.5);
/* Frames whose energy is below this value always contribute to the noise estimate. */
static const float EPD_SPEECH_ENERGY_FLOOR = static_cast<float>(37.5);
/* Slower forgetting factor used when a frame that is louder than the current
   noise estimate is still allowed to update it. */
static const float lambdaLTEhigherE = static_cast<float>(0.99);
/* Number of initial frames that unconditionally update the noise / zero-crossing estimates. */
static const int EPD_MIN_FRAME = 10;
int Fe::vus_basic(short *sample, int sampleN, int frameSize, vector<EVusType>& vusA)
{
int n,i;
vector<float> energyA;
vector<float> zcrA;
vector<float> noiseEnA;
vector<float> meanZcrA;
float noiseEn=0;
float meanZcr=0;
int zcrFrameN=0;
#ifdef _DEBUG
#define TEST_FRAME_LEN 300
float tmp1A[TEST_FRAME_LEN];
float tmp2A[TEST_FRAME_LEN];
EVusType vusTmpA[TEST_FRAME_LEN];
float meanEnergyTmpA[TEST_FRAME_LEN];
float meanZcrTmpA[TEST_FRAME_LEN];
#endif
int shiftSize=GetShiftSize();
int frameN=(int)((sampleN-(frameSize-shiftSize))/shiftSize);
energyA.resize(frameN);
zcrA.resize(frameN);
noiseEnA.resize(frameN);
meanZcrA.resize(frameN);
vusA.resize(frameN);
float VUS_ENERGY_TH_VOICED=30;
float VUS_ENERGY_TH_UNVOICED=60;
float VUS_ENERGY_TH_START=20;
float VUS_ENERGY_TH_INSPEECH=15; // old=6
int VUS_ZCR_TH_MAX=50;
int VUS_ZCR_TH_MIN=5;
int VUS_VOICE_ZCR_TH_MIN=0;
int VUS_ZCR_UPD_TH=50;
vector<float> x(frameSize);
float vus_energy_th=VUS_ENERGY_TH_START;
for(n=0;n<frameN;n++){
float lambdaLTE, lambdaZcr;
if(n<EPD_NB_FRAME_THRESHOLD_LTE)
lambdaLTE=1-1/(float)(n+1);
else
lambdaLTE=EPD_LAMBDA_LTE;
if(zcrFrameN<EPD_NB_FRAME_THRESHOLD_LTE)
lambdaZcr=1-1/(float)(zcrFrameN+1);
else
lambdaZcr=EPD_LAMBDA_ZCR;
int begX=n*shiftSize;
{
/* DC offset removal, H(z)=(1-z^(-1))/(1-a*z^(-1)) */
float a=1-1/(float)1024;
x[0]=sample[begX];
for(i=1;i<frameSize;i++){
x[i]=(sample[begX+i]-sample[begX+i-1]+a*x[i-1]);
}
/* low-pass filter, x=filter([1 2 1]/4, [1], x); */
vector<float> xorg=x;
x[0]=(xorg[0]+2*xorg[0]+xorg[1])/4;
for(i=1;i<frameSize-1;i++){
x[i]=(xorg[i-1]+2*xorg[i]+xorg[i+1])/4;
}
x[frameSize-1]=(xorg[frameSize-2]+2*xorg[frameSize-1]+xorg[frameSize-1])/4;
}
/* Normalize energy to frame length */
float sum=0;
for(i=0;i<frameSize;i++){
sum += (x[i]*x[i]);
}
energyA[n] = (0.5+10*LOG10(1+sum/frameSize)); /* dB scale */
if(energyA[n]<EPD_NOISE_ENERGY_FLOOR) energyA[n]=EPD_NOISE_ENERGY_FLOOR;
/* Estimate noise */
float noiseLevel=0;
if((energyA[n]-noiseEn)<EPD_SNR_THRESHOLD_UPD_LTE || n<EPD_MIN_FRAME || (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
if((energyA[n]<noiseEn) | (n<EPD_MIN_FRAME) | (energyA[n] < EPD_SPEECH_ENERGY_FLOOR)){
noiseEn=noiseEn+(1-lambdaLTE)*(energyA[n]-noiseEn);
}
else{
noiseEn=noiseEn+(1-lambdaLTEhigherE)*(energyA[n]-noiseEn);
}
/* noise level should be computed from the original noise energy */
noiseLevel=(2*sqrt(exp(log(10)/10*(noiseEn-0.5))-1));
if(noiseEn<EPD_NOISE_ENERGY_FLOOR) noiseEn=EPD_NOISE_ENERGY_FLOOR;
}
noiseEnA[n]=noiseEn;
/* compute zero crossing rate (changed) */
float prev = x[0]-noiseLevel;
int zcr=0;
for(i=1;i<frameSize;i++){
float val = x[i]-noiseLevel;
float ztmp=val*prev;
if(ztmp<0) zcr = zcr+1;
prev=val;
}
zcrA[n]=zcr;
if(zcr>0 || zcrFrameN>0){
zcrFrameN++;
}
if(zcrA[n]<VUS_ZCR_UPD_TH && ((zcrA[n]-meanZcr)<EPD_ZCR_THRESHOLD_UPD_LTE || zcrFrameN<EPD_MIN_FRAME)){
meanZcr=meanZcr+(1-lambdaZcr)*(zcrA[n]-meanZcr);
}
meanZcrA[n]=meanZcr;
float deltaEnergy=energyA[n]-noiseEn;
float deltaZcr=zcrA[n]-meanZcr;
if(deltaEnergy<VUS_ENERGY_TH_UNVOICED && deltaEnergy<vus_energy_th && deltaZcr<VUS_VOICE_ZCR_TH_MIN){
vusA[n]=FRM_SILENCE;
vus_energy_th=VUS_ENERGY_TH_START;
}
else if(deltaZcr>VUS_ZCR_TH_MAX){
vus_energy_th=VUS_ENERGY_TH_START;
vusA[n]=FRM_UNVOICED;
}
else if(deltaZcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
vus_energy_th=VUS_ENERGY_TH_START;
vusA[n]=FRM_UNVOICED;
}
else if(zcr>VUS_ZCR_TH_MIN && deltaEnergy<vus_energy_th){
vus_energy_th=VUS_ENERGY_TH_START;
vusA[n]=FRM_UNVOICED;
}
else if(deltaEnergy>VUS_ENERGY_TH_VOICED){
vus_energy_th=VUS_ENERGY_TH_INSPEECH;
vusA[n]=FRM_VOICED;
}
else if(deltaEnergy>vus_energy_th && (deltaZcr>VUS_VOICE_ZCR_TH_MIN || zcr>VUS_ZCR_TH_MIN)){
vus_energy_th=VUS_ENERGY_TH_INSPEECH;
vusA[n]=FRM_VOICED;
}
else{
vusA[n]=FRM_SILENCE;
vus_energy_th=VUS_ENERGY_TH_START;
}
/* TRACE("%d %f %f %f %f %d\n",n,energyA[n],zcrA[n],deltaEnergy,deltaZcr,(int)vusA[n]); */
}
#ifdef _DEBUG
int k;
for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
tmp1A[k]=energyA[k];
tmp2A[k]=zcrA[k];
meanEnergyTmpA[k]=noiseEnA[k];
meanZcrTmpA[k]=meanZcrA[k];
vusTmpA[k]=vusA[k];
}
#endif
/* median filtering */
vus_median_filter(vusA);
#ifdef _DEBUG
for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
vusTmpA[k]=vusA[k];
}
#endif
/* remove short segments */
vus_remove_short_segments(vusA);
return frameN;
}
/*
 * Smooth the frame labels with two successive passes of a length-5 median
 * filter. Each pass reads from a snapshot of the labels taken at its start,
 * so a pass never sees its own output. The first two and last two frames are
 * left as they are. Returns the number of frames in vusA.
 */
int Fe::vus_median_filter(vector<EVusType>& vusA)
{
	const int total=vusA.size();
	float window[5];
	for(int pass=0;pass<2;pass++){
		/* input of this pass: the labels as they stand right now */
		const vector<EVusType> snapshot=vusA;
		for(int center=2;center<total-2;center++){
			/* gather the five labels centred on this frame */
			for(int off=0;off<5;off++){
				window[off]=(float)(snapshot[center-2+off]);
			}
			/* round the median back to the nearest label value */
			vusA[center]=(EVusType)((int)(GetMedian(window,5)+0.5));
		}
	}
	return total;
}
/*
 * Clean up the label sequence by relabelling short runs, one label type at a
 * time: voiced runs first, then silent runs, then unvoiced runs. Each pass
 * works on the result of the previous one. Always returns 1.
 */
int Fe::vus_remove_short_segments(vector<EVusType>& vusA)
{
	/* label type handled by each pass, in execution order */
	const EVusType passType[3]={FRM_VOICED, FRM_SILENCE, FRM_UNVOICED};
	/* longest run (in frames) that each pass still removes; old values were 7, 2, 4 */
	const int passMinDur[3]={5, 2, 3};
	for(int p=0;p<3;p++){
		vus_remove_short_segments_sub(vusA, passType[p], passMinDur[p]);
	}
	return 1;
}
/*
 * Remove short segments with duration less than or equal to minDur.
 *
 * Every run of consecutive frames labelled `type` whose length is at most
 * minDur is overwritten with the label of the frame just before the run.
 *
 * vusA   - frame labels, modified in place
 * type   - label whose short runs are removed
 * minDur - longest run length (in frames) that is still removed
 *
 * Returns the number of runs that met the removal condition; 0 for an empty
 * vusA. A run that begins at frame 0 has no preceding frame to copy from, so
 * its labels are left unchanged, although it can still be counted in the
 * return value.
 */
int Fe::vus_remove_short_segments_sub(vector<EVusType>& vusA, EVusType type, int minDur)
{
	/* nothing to scan; also keeps the vusA[n-1] read after the loop in range */
	if(vusA.empty()) return 0;
	const int frameN=(int)vusA.size();
	int startX=1; /* first frame of the current run; never 0, so vusA[startX-1] is always valid */
	int n,i,k=0;
	for(n=1;n<frameN;n++){
		if(vusA[n] == type && vusA[n-1] != type){
			/* a run of `type` begins here */
			startX=n;
		}
		else if(vusA[n] != type && vusA[n-1] == type){
			/* a run of `type` ended at n-1 */
			if(n-startX<=minDur){
				for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
				k++;
			}
			startX=n;
		}
	}
	/* a run that extends to the last frame is closed here (n == frameN) */
	if(n-startX<=minDur && vusA[n-1]==type){
		for(i=startX;i<n;i++) vusA[i]=vusA[startX-1];
		k++;
	}
	return k;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -