// File: fe_feature.cpp
// (code-viewer residue, not part of the program: "Font size:")
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
/*************************************************************************
* Feature extraction
* sampleN = Number of input speech samples
* frameN  = Number of output parameter frames, computed as
*           int((sampleN-(frameSize-shiftSize))/shiftSize)
*************************************************************************/
#include "StdAfx.h"
#include "FE_feature.h"
/*
 * Printable names of the feature kinds.
 * The order of the entries must be consistent with the declaration of
 * FeatKind. The table is terminated by a null pointer.
 */
const char *FE_featNameA[] = {
	/*  0 */ "LPC",
	/*  1 */ "LPCC",
	/*  2 */ "PLCC",
	/*  3 */ "MFCC",
	/*  4 */ "FTCC",
	/*  5 */ "FBANK",
	/*  6 */ "LPC_D",
	/*  7 */ "LPCC_D",
	/*  8 */ "PLCC_D",
	/*  9 */ "MFCC_D",
	/* 10 */ "FTCC_D",
	/* 11 */ "FBANK_D",
	/* 12 */ "FFT_SPEC",
	/* 13 */ "LPC_SPEC",
	/* 14 */ "LPCC_SPEC",
	/* 15 */ "MFCC_SPEC",
	/* 16 */ "FTCC_SPEC",
	/* 17 */ "LPCCOV",
	/* 18 */ "LAR",
	/* 19 */ "LSF",
	/* 20 */ "PARCOR",
	/* 21 */ "FORMANT",
	/* 22 */ "ZCR",
	/* 23 */ "ENERGY",
	/* 24 */ "PITCH",
	/* 25 */ "VUS",
	/* 26 */ "ENDPOINT",
	/* 27 */ "EPOCH",
	/* 28 */ "GLOFLOW",
	/* 29 */ "GLOPULSE",
	/* 30 */ "LPCRES",
	/* 31 */ "FFTCEP",
	/* 32 */ "FILE",
	/* end of table */ 0
};
#ifdef WIN32
/* Prototype of a file-local (static) strtok_r; it is declared only when
   WIN32 is defined. The definition is not in this part of the file. */
static char* strtok_r(char *s1, const char *s2, char **savept);
#endif
/************************************************************************
* Analysis Condition (Default)
* ---- Don't Change !!
*************************************************************************/
/* Default constructor: loads the default analysis condition into the members. */
Fe::Fe()
{
	/* ---- ADC data: core parameters ---- */
	m_sampleRate = DEFAULT_SAMPLING_RATE;
	m_emphFac    = (float)FE_PRE_EMPH_FAC;

	/* ---- analysis: core parameters ---- */
	m_shiftSizeMs = DEFAULT_SHIFT_SIZE_IN_MS;
	m_winSizeMs   = DEFAULT_WINDOW_SIZE_IN_MS;
	m_lpcOrder    = FE_LPC_ORDER;
	m_cepOrder    = FE_CEP_ORDER;
	m_fbOrder     = FE_NUM_CHANNELS;
	m_fftSize     = DEFAULT_FFT_SIZE;
	m_deltaSize   = FE_DELTA;
	m_lifter      = LIFT_SIN;

	/* ---- extra parameters ---- */
	m_dither         = FALSE; /* set to TRUE in order to prevent underflow in taking log energy */
	m_covShiftSizeMs = m_shiftSizeMs/5;
	m_covWinSizeMs   = m_winSizeMs/6;
	m_cepSmooth      = FALSE;

	/* ---- byte order of the running machine ---- */
	if ( !little_endian() ) {
		m_byteOrder = MY_BIG_ENDIAN;
		m_swapByte  = 1;
	}
	else {
		m_byteOrder = MY_LITTLE_ENDIAN;
		m_swapByte  = 0;
	}

	/* ---- PLP ---- */
	init_plp();

	/* ---- MFCC ---- */
	m_MelFBfftSize   = 0;
	m_logEnergyFloor = FE_MIN_LOG_ENERGY;
	m_energyFloor    = (float)exp(m_logEnergyFloor); /* linear floor derived from the log floor */

	/* ---- control parameters: both pointers start out unset ---- */
	m_pProgress = NULL;
	m_pCancel   = NULL;
}
/* Destructor: the body is empty; no explicit cleanup is performed here. */
Fe::~Fe() {}
/*
 * Reconcile the analysis settings of this object with the header fields
 * of the input data before a feature is computed.
 *
 * fk       Feature kind. FE_PLCC switches the window length to
 *          PLP_WINDOW_SIZE_IN_MS.
 * adcData  Input data. sampleFreqN and shiftN are read; when one of them
 *          is not positive it is filled in from m_sampleRate or
 *          m_shiftSizeMs respectively.
 */
void Fe::Init(FeatKind fk, CFeature& adcData)
{
	if(fk==FE_PLCC) m_winSizeMs = PLP_WINDOW_SIZE_IN_MS;

	/* A positive sampling rate stored in the data takes precedence;
	   otherwise the data inherits the rate of this object. */
	if(adcData.sampleFreqN>0) m_sampleRate = adcData.sampleFreqN;
	else if(m_sampleRate>0) adcData.sampleFreqN = m_sampleRate;

	/* The LPC order is adapted for the two handled sampling rates;
	   any other rate keeps the current order. */
	if(m_sampleRate==8000){
		m_lpcOrder = FE_LPC_ORDER_1;
	}
	else if(m_sampleRate==16000){
		m_lpcOrder = FE_LPC_ORDER_2;
	}

	if(adcData.shiftN>0){
		/* Samples -> milliseconds, rounded to nearest. The division is done
		   in floating point so that the +0.5 really rounds (an integer
		   division would truncate first). It is skipped when the sampling
		   rate is unknown to avoid dividing by zero. */
		if(adcData.sampleFreqN>0)
			m_shiftSizeMs = (int)((adcData.shiftN*1000.0)/adcData.sampleFreqN+0.5);
	}
	else if(m_shiftSizeMs>0){
		/* Milliseconds -> samples. */
		adcData.shiftN=(int)(m_shiftSizeMs*adcData.sampleFreqN/1000);
	}

	/* Keep the covariance-analysis shift in step with the frame shift. */
	m_covShiftSizeMs = m_shiftSizeMs/5;
}
/*
 * File-to-file feature extraction.
 *
 * fk        Feature kind to compute.
 * infile    Input file, read in binary mode as a sequence of short samples.
 * outfile   Output file, written in binary mode.
 * parafile  Optional parameter file passed to ReadParaFile(); may be NULL.
 * tag       Optional tag string stored in m_tag; may be NULL.
 *
 * Returns 1 on success. Returns 0 when a file cannot be opened (and
 * err_fopen() returns), or when the input holds no complete sample.
 */
int Fe::FeatureMain(FeatKind fk, const char *infile, const char *outfile, const char *parafile, const char* tag)
{
	if(parafile){
		ReadParaFile(parafile);
	}
	if(tag){
		m_tag=tag;
	}

	/* Open both files. Never continue with a NULL handle, and do not leak
	   the input handle when the output cannot be created. */
	FILE *fi = fopen(infile, "rb");
	if(fi == NULL){
		err_fopen(infile);
		return 0;
	}
	FILE *fo = fopen(outfile, "wb");
	if(fo == NULL){
		err_fopen(outfile);
		fclose(fi);
		return 0;
	}

	/* Determine the number of short samples in the input file. */
	fseek(fi,0L,SEEK_END);
	long fsize = ftell(fi);
	rewind(fi);
	size_t totalN = (fsize>0) ? (size_t)fsize/sizeof(short) : 0;
	if(totalN==0){
		/* Empty or unreadable input: taking &vec[0] of an empty vector
		   would be undefined, so stop here. */
		fclose(fi);
		fclose(fo);
		return 0;
	}

	CFeature adcData;
	adcData.vec.resize(totalN);
	int sampleN = ad_read(fi,&adcData.vec[0],totalN);
	fclose(fi); /* the input is not needed any more */
	if(sampleN<=0){
		fclose(fo);
		return 0;
	}

	CFeature feature;
	FeatureMain(fk, adcData, 0, sampleN, feature, -1, -1);

	/* Keep at least one element so that &pTag[0] is always valid. */
	int dimN = GetDim(fk);
	vector<int> pTag(dimN>0 ? dimN : 1);
	ReadTag(m_tag.c_str(), &pTag[0], dimN);
	string featname = GetFeatName(fk);

	switch(fk){
	case FE_LPCRES:
	case FE_EPOCH:
	case FE_GLOFLOW:
		/* these kinds produce one short value per input sample */
		ad_write(fo, &feature.vec[0], sampleN);
		break;
	default:
		write_feature_vectors(fo, feature.mat, &pTag[0], featname.c_str());
		break;
	}
	fclose(fo);
	return 1;
}
/*
 * Compute the feature fk from samples [beginX, endX) of adcData.
 *
 * fk         Feature kind.
 * adcData    Input samples. Its shiftN/sampleFreqN may be overwritten from
 *            feature when they disagree, and by Init().
 *            NOTE(review): enhance_basic() receives a non-const pointer
 *            into adcData.vec; presumably it filters in place - confirm.
 * beginX     Index of the first sample to use.
 * endX       Index one past the last sample to use.
 * feature    Receives the result (mat for FE_PITCH, label for FE_ENDPOINT,
 *            otherwise whatever FeatureExtract() fills in).
 * outBeginX  First output row; negative selects the default range.
 * outEndX    One past the last output row; negative selects the default.
 *
 * Returns the number of frames produced; 0 for FE_ENDPOINT.
 */
int Fe::FeatureMain(FeatKind fk, CFeature& adcData, int beginX, int endX, CFeature& feature, int outBeginX, int outEndX)
{
	if(adcData.shiftN != feature.shiftN){
		FE_ERROR("FeatureMain: The shift sizes of adaData and feature are different.\n");
		if(feature.shiftN>0) adcData.shiftN=feature.shiftN;
		assert(0);
	}
	if(adcData.sampleFreqN != feature.sampleFreqN){
		FE_ERROR("FeatureMain: The sampling rates of adaData and feature are different.\n");
		if(feature.sampleFreqN>0) adcData.sampleFreqN=feature.sampleFreqN;
	}
	Init(fk, adcData);

	/* Only these kinds are preceded by noise reduction. Each term has to
	   compare against fk: a bare enumerator is a non-zero constant and
	   would make the whole condition always true. */
	if(fk==FE_VUS || fk==FE_FORMANT || fk==FE_PITCH || fk==FE_ENDPOINT){
		/* First reduce noise by Wiener filtering */
		enhance_basic(&adcData.vec[beginX],(endX-beginX),m_sampleRate,1);
	}

	int i;
	int frameN = 0; /* stays 0 on paths that produce no frames (FE_ENDPOINT) */
	int sampleN=(endX-beginX);
	vector<float> pitchA;
	switch(fk){
	case FE_PITCH:
		/* vus_basic() is called to fill m_vusA, which pitch_basic() consumes;
		   the frame count reported is the one from pitch_basic(). */
		vus_basic(&adcData.vec[beginX], sampleN, GetFrameSize(), m_vusA);
		frameN = pitch_basic(&adcData.vec[beginX], sampleN, m_sampleRate, GetShiftSize(), m_vusA, pitchA);
		feature.frameN = frameN; feature.dimN = 1; feature.byteN = 4;
		feature.mat.Resize(frameN,1);
		if(outBeginX<0 || outEndX<0){
			outBeginX=0;
			outEndX=frameN;
		}
		/* Stay inside both the frameN-row matrix and pitchA, so that a
		   positive outBeginX cannot write past the end of the matrix. */
		for(i=0; i<frameN && outBeginX+i<frameN && i<(int)pitchA.size(); i++)
			feature.mat[outBeginX+i][0] = pitchA[i];
		break;
	case FE_ENDPOINT:
		epd_basic(&adcData.vec[beginX], sampleN, m_sampleRate, feature.label);
		break;
	default:
		frameN = FeatureExtract(fk, adcData, beginX, endX, feature, outBeginX, outEndX);
		break;
	}
	return frameN;
}
/*
 * Extract the feature fk from samples [beginX, endX) of adcData and store
 * the result in feature.
 *
 * fk         Feature kind.
 * adcData    Input samples (adcData.vec).
 * beginX     Index of the first sample to use.
 * endX       Index one past the last sample to use.
 * feature    Output. FE_LPCRES / FE_EPOCH / FE_GLOFLOW fill feature.vec with
 *            one short per input sample; every other kind fills feature.mat.
 * outBeginX  First output row in feature.mat; negative means 0.
 * outEndX    One past the last output row; negative means frameN.
 *
 * Returns the number of frames computed (the number of samples for the
 * 1-dim kinds), or 0 when the sample range is empty.
 */
int Fe::FeatureExtract(FeatKind fk, CFeature& adcData, int beginX, int endX, CFeature& feature, int outBeginX, int outEndX)
{
	int n,k;
	int sampleN = endX-beginX;
	/* Nothing to analyze. Returning early also avoids taking the address
	   of element 0 of an empty vector further down. */
	if(sampleN<=0) return 0;

	vector<short> wave1d;
	FeMatrix<float> feature2d;
	short *sample = &adcData.vec[beginX];
	int frameN = 0;
	int dimN = GetDim(fk);
	int frameSize = GetFrameSize();
	/* kinds whose result is a 1-dim short vector instead of a float matrix */
	const bool is1d = (fk==FE_LPCRES || fk==FE_EPOCH || fk==FE_GLOFLOW);

	/* V/U/S classification works for utterance only */
	if(fk==FE_VUS){
		frameN = vus_basic(sample, sampleN, frameSize, m_vusA);
		if(outBeginX<0) outBeginX=0;
		if(outEndX<0) outEndX=frameN;
		feature.frameN = frameN; feature.dimN = dimN; feature.byteN = 4;
		feature.mat.Resize(outEndX,1);
		for(n=0; n<frameN && n+outBeginX<outEndX; n++) feature.mat[outBeginX+n][0] = m_vusA[n];
		return frameN;
	}

	/* formant tracking requires pitch information to remove non-voice parts;
	   the calls are made for their side effect of filling m_vusA/m_pitchA */
	if(fk==FE_FORMANT){
		vus_basic(sample, sampleN, frameSize, m_vusA);
		pitch_basic(sample, sampleN, m_sampleRate, GetShiftSize(), m_vusA, m_pitchA);
	}

	/* Preprocessing */
	vector<float> preprocessedA(sampleN);
	if(fk==FE_ZCR || fk==FE_ENERGY || fk==FE_EPOCH || fk==FE_GLOFLOW || fk==FE_GLOPULSE){
		/* just convert to float */
		for(n=0;n<sampleN;n++) preprocessedA[n] = (float)sample[n];
	}
	else{
		preprocessing(sample, sampleN, &preprocessedA[0]);
	}

	/* Feature extraction */
	if(is1d){
		/* result is a 1-dim short vector */
		frameN = compute_feature_1d(fk,&preprocessedA[0],sampleN,wave1d);
	}
	else{
		/* result is a 2-dim float matrix */
		frameN = compute_feature_2d(fk,&preprocessedA[0],sampleN,feature2d);
	}

	/* Reorder cepstrum coefficients to follow the convention [c1, c2, c3, ..., c12, c0] */
	FeatKind bk=GetBaseFeatKind(fk);
	if(bk==FE_LPCC || bk==FE_PLCC || bk==FE_MFCC || bk==FE_FTCC){
		for(n=0;n<frameN;n++){
			float c0=feature2d[n][0];
			for(k=0;k<m_cepOrder;k++) feature2d[n][k]=feature2d[n][k+1];
			feature2d[n][m_cepOrder]=c0;
		}
	}

	/* Compute delta coefficients (delta_compute reads a copy and writes feature2d) */
	if(HasDeltaFeat(fk)){
		FeMatrix<float> tmp2d=feature2d;
		delta_compute(tmp2d, m_deltaSize, feature2d);
	}

	/* Copy the resulting features */
	if(outBeginX<0) outBeginX=0;
	if(outEndX<0 || outEndX>frameN) outEndX=frameN;
	if(is1d){
		assert(sampleN==frameN);
		feature.frameN = sampleN; feature.dimN = 1; feature.byteN = 2;
		feature.vec.resize(sampleN);
		for(n=0; n<sampleN; n++) feature.vec[n] = wave1d[n];
	}
	else{
		feature.frameN = frameN; feature.dimN = dimN; feature.byteN = 4;
		feature.mat.Resize(outEndX,dimN);
		for(n=0; n<frameN && n+outBeginX<outEndX; n++){
			for(k=0; k<dimN; k++) feature.mat[outBeginX+n][k] = feature2d[n][k];
		}
	}
	return frameN;
}
/*
 * Compute a feature whose result is one short value per input sample
 * (FE_EPOCH, FE_LPCRES or FE_GLOFLOW).
 *
 * fk       Feature kind; any kind other than the three above triggers
 *          assert(0) and a return value of 0 once a frame is processed.
 * sample   Input samples (already converted to float by the caller).
 * sampleN  Number of input samples.
 * featA    Output; resized to sampleN elements.
 *
 * Returns the value of calc_epoch() for FE_EPOCH, otherwise sampleN
 * (also when the loop is left early because CheckWinMessage() failed).
 */
int Fe::compute_feature_1d(FeatKind fk, float *sample, int sampleN, vector<short>& featA)
{
	int shiftSize, frameSize;
	if(fk==FE_GLOFLOW){
		/* covariance analysis uses its own, even-sized shift and window */
		shiftSize = (int)(m_covShiftSizeMs*m_sampleRate/1000.0+0.5); if(shiftSize%2) shiftSize += 1;
		frameSize = (int)(m_covWinSizeMs*m_sampleRate/1000.0+0.5); if(frameSize%2) frameSize += 1;
		if(frameSize < 2*m_lpcOrder) frameSize = 2*m_lpcOrder;
	}
	else{
		shiftSize = GetShiftSize();
		frameSize = GetFrameSize();
	}
	/* A non-positive shift would never advance the frame loop below (and
	   would divide by zero when the frame count is computed), so fall back
	   to the smallest usable shift; it is kept even for FE_GLOFLOW. */
	if(shiftSize<1) shiftSize = (fk==FE_GLOFLOW) ? 2 : 1;

	featA.resize(sampleN);
	if(fk==FE_EPOCH) return calc_epoch(sample, sampleN, featA, NULL);

	/* frameN is only used to scale the progress display */
	int frameN = (int)((float)(sampleN-(frameSize-shiftSize))/shiftSize);
	vector<float> frameA(frameSize);
	vector<float> acf(m_lpcOrder+1);
	vector<float> residual_data(frameSize);
	FeMatrix<float> cov(m_lpcOrder+1, m_lpcOrder+1);

	for (int n=0; ; n++){
		int i;
		int begX=n*shiftSize;
		if(!CheckWinMessage()) break;

		/* frameN is 0 or negative for input shorter than one frame; do not
		   divide by it, and never report more than 100 percent. */
		int percent = (frameN>0) ? (int)((n*100)/frameN) : 100;
		if(percent>100) percent=100;
		ShowProgress(percent);

		/* the last frames may be shorter than frameSize; pad them with zeros */
		int remainN=my_min(sampleN-begX,frameSize);
		if(remainN<=0) break;
		for(i=0;i<remainN;i++) frameA[i]=sample[begX+i];
		for(i=remainN;i<frameSize;i++) frameA[i]=0;
		m_window.Windowing(&frameA[0], remainN, WIN_HAMMING);

		switch(fk){
		case FE_LPCRES:
			{
				int k;
				float G;
				/* a short trailing frame reuses the coefficients of the last full frame */
				if(remainN == frameSize)
					_lpc_basic(&frameA[0], frameSize, &acf[0], m_lpcOrder, &G);
				_lpc_error_basic(&frameA[0], remainN, &acf[0], m_lpcOrder, &residual_data[0]);
				/* Copy at most one shift of residual. Limiting the count to
				   remainN keeps the index inside residual_data and, because
				   begX+remainN <= sampleN, inside featA as well. */
				const int copyN = my_min(shiftSize, remainN);
				for(k=0; k<copyN; k++) featA[begX+k] = (short)(residual_data[k]);
			}
			break;
		case FE_GLOFLOW:
			{
				float G;
				/* a short trailing frame reuses the coefficients of the last full frame */
				if(remainN == frameSize)
					lpc_cov(&frameA[0], frameSize, cov, &acf[0], m_lpcOrder, &G);
				lpc_cov_error(&frameA[0], remainN, &acf[0], m_lpcOrder, &residual_data[0]);
				// differentiated glottal flow
				Integrate(&residual_data[0], remainN, &featA[begX]);
				// glottal flow
				//Integrate(&((*featA)[i]), remainN, featA+i);
			}
			break;
		default:
			assert(0);
			return 0;
		}
	}
	return sampleN;
}
int Fe::compute_feature_2d(FeatKind fk, float *sample, int sampleN, FeMatrix<float>& featA)
{
int n;
int shiftSize = GetShiftSize();
int frameSize = GetFrameSize();
int frameN = (int)((float)(sampleN-(frameSize-shiftSize))/shiftSize);
int dimN = GetDim(fk);
int fft_size; for (fft_size=1; fft_size<frameSize; fft_size*=2) ;
vector<float> frameA(frameSize);
vector<float> acf(m_lpcOrder+1);
vector<float> kcf(m_lpcOrder+1);
featA.Resize(frameN,dimN);
FeMatrix<float> featTmpA;
if(fk==FE_FORMANT) featTmpA.Resize(frameN, m_lpcOrder/2+1);
fk=GetBaseFeatKind(fk);
int dc_bias=0, level=0;
float mean=0;
if(fk==FE_ENERGY || fk==FE_ZCR){
int i;
for(i=0;i<sampleN;i++) mean += sample[i]; mean = mean/sampleN;
dc_bias = (int)((mean >= 0) ? mean+0.5 : mean-0.5);
if(frameN>10){
/* Assume that there are at least 3 frames of silence at the beginning */
float v=0; int m=3*shiftSize;
for (i=0; i<m;i++) v+=(sample[i]-dc_bias)*(sample[i]-dc_bias);
v=(float)sqrt(v/m);
level=(int)(2*v); /* Set ZCR threshold to 2*noiseLevel */
}
}
for (n=0; n<frameN; n++){
float G=0;
int begX=n*shiftSize;
int i, k;
float energy;
if(!CheckWinMessage()) break;
ShowProgress((int)((n*100)/frameN));
if(!(fk==FE_ENERGY || fk==FE_ZCR)){
for(i=0;i<frameSize;i++) frameA[i]=sample[begX+i];
m_window.Windowing(&frameA[0], frameSize, WIN_HAMMING);
}
switch(fk){
case FE_ENERGY:
energy=compute_energy(sample+begX, frameSize, mean);
featA[n][0] = 10*LOG10((energy>FE_MIN_ENERGY ? energy : FE_MIN_ENERGY));
break;
case FE_ZCR:
featA[n][0] = (float)compute_zero_cross_rate(sample+begX, frameSize, level, dc_bias);
break;
case FE_LAR:
_lpc_parcor_basic(&frameA[0], frameSize, &acf[0], &kcf[0], m_lpcOrder, &G);
featA[n][0] = 0;
for(k=1; k<=m_lpcOrder; k++) featA[n][k] = (float)log((1-kcf[k])/(1+kcf[k]));
break;
case FE_LSF:
{
vector<CComplex> oldRootsP;
vector<CComplex> oldRootsQ;
_lpc_basic(&frameA[0], frameSize, &acf[0], m_lpcOrder, &G);
lpc_to_lsf(&acf[0],m_lpcOrder,&featA[n][0], oldRootsP, oldRootsQ);
}
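/* Code-viewer residue (not part of the program): keyboard shortcut help
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */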