📄 fe_pitch.cpp
字号:
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
#include "StdAfx.h"
#include "FE_feature.h"
struct FePitchSeg /* Pitch detection parameters */
{
EVusType v;
float f;
};
int Fe::pitch_basic(short *sample, int sampleN, int sampleRate, int shiftSize, vector<EVusType>& vusA, vector<float>& pitchA)
{
int i,n;
int maxFrameSize = 2*sampleRate/MIN_PITCH_FREQ; /* 40 ms */
int minFrameSize = 3*sampleRate/MAX_PITCH_FREQ; /* 12 ms */
int num_frames = sampleN/shiftSize;
int lpfLen = (sampleRate/(2*MAX_PITCH_FREQ)); if(lpfLen%2 == 0) lpfLen += 1;
float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); /* 50 Hz for 10 ms */
#ifdef _DEBUG
#define TEST_FRAME_LEN 300
EVusType vusTmpA[TEST_FRAME_LEN];
int k;
#endif
pitchA.resize(num_frames+1);
for(i=0;i<=num_frames;i++) pitchA[i]=0;
if(!(sampleN > 0 && sampleRate > 0)) return false;
if(sampleN<maxFrameSize) return false;
vector<float> amdfA(maxFrameSize);
vector<float> frameA(maxFrameSize);
vector<float> p(num_frames+1+PITCH_BUF_SIZE/2);
FePitchSeg bufA[PITCH_BUF_SIZE];
FePitchSeg curr;
if(vusA.size()==0) vus_basic(sample, sampleN, GetFrameSize(), vusA);
pitch_low_pass_filter(sample, sampleN, lpfLen);
for(i=0;i<PITCH_BUF_SIZE;i++) bufA[i].v = FRM_SILENCE;
int nCurrPos = (PITCH_BUF_SIZE-1)/2;
int PITCH_BUF_SIZE2 = (PITCH_BUF_SIZE-1)/2;
m_meanPitch = 0;
m_pitchFrameN = 0;
for(n=0;n < PITCH_BUF_SIZE/2; n++){
curr.v=vusA[n];
if(curr.v == FRM_VOICED){
for(i=0;i<maxFrameSize;i++) frameA[i]=sample[n*shiftSize+i];
m_window.Windowing(&frameA[0],maxFrameSize,WIN_HANNING);
curr.f = pitch_find(&frameA[0], maxFrameSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
}
else{
curr.f = 0;
}
p[n] = curr.f;
for(i=0;i<PITCH_BUF_SIZE-1;i++){
bufA[i] = bufA[i+1];
}
bufA[PITCH_BUF_SIZE-1] = curr;
}
int frameN=0;
int blockSize = maxFrameSize;
for(;n*shiftSize+blockSize < sampleN; n++){
if(!CheckWinMessage()) break;
ShowProgress((int)((n*shiftSize*100.0)/sampleN));
curr.v=vusA[n];
if(curr.v == FRM_VOICED){
for(i=0;i<blockSize;i++) frameA[i]=sample[n*shiftSize+i];
m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
curr.f = pitch_find(&frameA[0], blockSize, &p[0], n, &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
if(curr.f == 0){
//printf("Correcting pitch to zero\n");
curr.v = FRM_UNVOICED;
}
}
else{
curr.f = 0;
}
p[n] = curr.f;
for(i=0;i<PITCH_BUF_SIZE-1;i++){
bufA[i] = bufA[i+1];
}
bufA[PITCH_BUF_SIZE-1] = curr;
int sum;
for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
if(sum>=PITCH_BUF_SIZE-3 && (bufA[nCurrPos].v == FRM_UNVOICED || bufA[nCurrPos].v == FRM_SILENCE)) {
if(bufA[nCurrPos].v == FRM_UNVOICED){
//printf("Correcting FRM_UNVOICED->FRM_VOICED\n");
}
else{
//printf("Correcting FRM_SILENCE->FRM_VOICED\n");
}
bufA[nCurrPos].v = FRM_VOICED;
bufA[nCurrPos].f = (bufA[nCurrPos-1].f+bufA[nCurrPos+1].f)/2;
p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
}
for(i=0,sum=0;i<PITCH_BUF_SIZE;i++) sum+=((bufA[i].v == FRM_VOICED)?1:0);
if(sum==PITCH_BUF_SIZE){
int peak[PITCH_BUF_SIZE];
int lmin = -1;
int lmax = PITCH_BUF_SIZE;
for(i=1;i<PITCH_BUF_SIZE;i++){
float delta = (float)fabs(bufA[i].f - bufA[i-1].f);
if(delta>maxDeltaPitch){
peak[i] = 1;
if(lmin < 0) lmin = i;
lmax = i;
}
else{
peak[i] = 0;
}
}
if(lmax-lmin <= 2 && lmin >= 3 && lmax < PITCH_BUF_SIZE){
//printf("Correcting pitch\n");
for(i=lmin; i<lmax;i++){
bufA[i].f = (bufA[lmin-1].f+bufA[lmax].f)/2;
p[n-(PITCH_BUF_SIZE-1-i)] = bufA[i].f;
}
}
}
for(i=1,sum=0;i<PITCH_BUF_SIZE-1;i++) sum+=((bufA[i].v == FRM_UNVOICED || bufA[i].v == FRM_SILENCE)?1:0);
if(sum>=PITCH_BUF_SIZE-3 && bufA[nCurrPos].v == FRM_VOICED) {
//printf("Correcting FRM_VOICED->FRM_UNVOICED\n");
bufA[nCurrPos].v = FRM_UNVOICED;
bufA[nCurrPos].f = 0;
p[n-(PITCH_BUF_SIZE-1-nCurrPos)] = bufA[nCurrPos].f;
//printf("Correcting Pitch history\n");
for(i=nCurrPos+1;i<PITCH_BUF_SIZE;i++){
curr.v=vusA[n];
if(curr.v == FRM_VOICED){
int k;
for(k=0;k<blockSize;k++) frameA[k]=sample[n*shiftSize+(i-3)*shiftSize+k];
m_window.Windowing(&frameA[0],blockSize,WIN_HANNING);
curr.f = pitch_find(&frameA[0], blockSize, &p[0], n-(PITCH_BUF_SIZE-1-i), &amdfA[0], (int)(shiftSize/(float)sampleRate), sampleRate);
if(curr.f == 0){
//printf("Correcting pitch to zero\n");
curr.v = FRM_UNVOICED;
}
}
else{
curr.f = 0;
}
p[n-(PITCH_BUF_SIZE-1-i)] = curr.f;
bufA[i] = curr;
}
}
if(n < num_frames){
pitchA[frameN] = bufA[nCurrPos].f;
frameN++;
}
/* use variable block size */
if(bufA[nCurrPos].f > 0)
blockSize = my_max(minFrameSize, my_min(maxFrameSize,(int)(3*sampleRate/(bufA[nCurrPos].f/3)) ));
else
blockSize = maxFrameSize;
}
#ifdef _DEBUG
for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
vusTmpA[k]=vusA[k];
}
#endif
vector<EVusType> vusOrgA(num_frames+1);
for(i=0;i<=num_frames;i++) vusOrgA[i] = FRM_SILENCE;
frameN = 0;
for(n=0;n*shiftSize+blockSize < sampleN; n++){
EVusType v=vusA[n];
if(n < num_frames){
if(v == FRM_VOICED && pitchA[n]==0)
vusOrgA[frameN] = FRM_UNVOICED;
else
vusOrgA[frameN] = v;
frameN++;
}
vusA[n] = vusOrgA[n];
}
#ifdef _DEBUG
for(k=0;k<my_min(TEST_FRAME_LEN,vusA.size());k++){
vusTmpA[k]=vusA[k];
}
#endif
/* correct isolated pitch */
for(i=0;i<frameN;i++){
if(vusA[i]==FRM_VOICED && pitchA[i]==0) vusA[i]=FRM_UNVOICED;
}
vus_remove_short_segments(vusA); /* remove short segments */
for(i=0;i<frameN;i++){
if(vusA[i]==FRM_UNVOICED || vusA[i]==FRM_SILENCE) pitchA[i]=0;
}
/* we have to shift a few frames because a longer block size is used for pitch estimation */
int feFrameSize=GetFrameSize();
int corrFrameN=(int)(0.5*(maxFrameSize-feFrameSize)/(float)feFrameSize);
for(i=frameN-1;i>=corrFrameN;i--){
pitchA[i]=pitchA[i-corrFrameN];
}
/* spike removal */
pitch_remove_spike(pitchA);
/* low-pass filter, x=filter([1 2 1]/4, [1], x); */
pitch_linear_filter(pitchA);
return frameN;
}
bool Fe::pitch_amdf(float *sample, float *amdfA, int blockSize, int pmin, int pmax)
{
int s,n;
for(s=pmin;s<=pmax;s++){
amdfA[s] = 0;
for(n=0;n<blockSize;n++){
if(n+s < blockSize){
amdfA[s] += my_abs(sample[n]-sample[n+s]);
}
else{
amdfA[s] += my_abs(sample[n]);
}
}
}
return true;
}
float Fe::pitch_find(float *sample, int blockSize, float *pitchA, int t, float *amdfA, int shiftSize, int sampleRate)
{
int pmin = sampleRate/MAX_PITCH_FREQ;
int pmax = sampleRate/MIN_PITCH_FREQ;
int i;
float lastPitch = (float)0;
int lastTime = 0;
for(i=0;i<t;i++){
if(pitchA[i]>0) {
lastPitch = pitchA[i];
lastTime = i;
}
}
int startSegTime=t;
for(i=t-1;i>=0;i--){
if(pitchA[i]==0) {
startSegTime=i+1;
break;
}
}
if(t>=167 && t<180){
int aaa=1;
aaa++;
}
float lowF = (float)MIN_PITCH_FREQ;
float highF = (float)MAX_PITCH_FREQ;
float maxDeltaPitch = (MAX_PITCH_CHANGE*MAX_PITCH_FREQ); // 40Hz for 10ms
if(lastPitch > 0){
if(t >= 3 && lastTime == t-1 && (pitchA[t-2] == (float)0 || pitchA[t-3] == (float)0)){
// Start of voice
maxDeltaPitch = (2*MAX_PITCH_CHANGE*lastPitch);
}
else{
// Middle of voice
maxDeltaPitch = (MAX_PITCH_CHANGE*lastPitch);
}
}
float deltaPitch = maxDeltaPitch;
if(lastPitch > 0){
int dur = (t-lastTime);
deltaPitch = dur*maxDeltaPitch;
lowF = my_max(MIN_PITCH_FREQ,lastPitch - deltaPitch);
highF = my_min(MAX_PITCH_FREQ,lastPitch + deltaPitch);
pmin = (int)(sampleRate/highF);
pmax = (int)(sampleRate/lowF);
}
int smin;
int smax;
pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
float avgAmdf=0;
for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
avgAmdf /= (pmax-pmin+1);
float amdf=amdfA[smin];
if(t>=167 && t<180){
int aaa=1;
aaa++;
}
float currPitch;
bool reliable=false;
if(pmin < smin && smin < pmax && amdf<avgAmdf*F0_SHARP_TH_1){
currPitch = 1/(float)smin * sampleRate;
if(amdf<avgAmdf*0.9) reliable=true;
}
else{
// try once more with increased range
if(smin == pmax){
lowF = my_max(MIN_PITCH_FREQ,lowF-deltaPitch);
}
if(smin == pmin){
highF = my_min(MAX_PITCH_FREQ,highF+deltaPitch);
}
pmin = (int)(sampleRate/highF);
pmax = (int)(sampleRate/lowF);
pitch_amdf(sample, amdfA, blockSize, pmin, pmax);
pitch_find_minmax(amdfA, pmin, pmax, &smin, &smax);
avgAmdf=0;
for(i=pmin;i<=pmax;i++) avgAmdf += amdfA[i];
avgAmdf /= (pmax-pmin+1);
amdf=amdfA[smin];
if((pmin < smin && smin < pmax) && amdf<avgAmdf*F0_SHARP_TH_2){
currPitch = 1/(float)smin * sampleRate;
}
else{
currPitch = (float)0;
}
if(amdf<avgAmdf*0.5) reliable=true;
}
int gender=0;
if(m_pitchFrameN>20){
if(m_meanPitch>200)
gender=1;
else if(m_meanPitch<150)
gender=-1;
}
/* Correct F0 doubling/halving error */
float orgMag=pitch_fft_one_freq(sample,blockSize,smin);
bool checkDoubling = ((currPitch>F0_DOUBLE_FREQ_TH) || (currPitch>1.6*m_meanPitch));
bool checkHalving = ((currPitch>0) && (currPitch<F0_HALVE_FREQ_TH || currPitch<0.6*m_meanPitch));
bool checkGender = ((gender==1 && currPitch<150) || (gender== -1 && currPitch>200));
float f0_double_th = (float)F0_DOUBLE_TH;
float f0_halve_th = (float)F0_HALVE_TH;
if(checkGender != 0){
if(checkDoubling){
f0_double_th *= 1;
f0_halve_th *= (float)0.2;
}
else if(checkHalving){
f0_double_th *= (float)0.5;
f0_halve_th *= 1;
}
}
if(t>=252){
int aaa=1;
aaa++;
}
/* Apply correction when the estimated F0 is very different at the start of segment */
/* and magnitude of the frequency is large enough */
if(t-startSegTime <= 5 && orgMag > 0.5*blockSize && (checkDoubling || checkHalving || checkGender)){
float hypoDouble, hypoHalve;
hypoDouble=pitch_fft_one_freq(sample,blockSize,smin/2);
hypoHalve=pitch_fft_one_freq(sample,blockSize,smin*2);
if(checkGender==0 && orgMag>hypoDouble*F0_NO_DOUBLE_TH && orgMag>hypoHalve*F0_NO_HALVE_TH){
;
}
else if(checkDoubling){
if(orgMag > f0_double_th*hypoDouble && hypoHalve > f0_halve_th*orgMag)
currPitch /= 2;
}
else if(checkHalving){
if(orgMag < f0_double_th*hypoDouble)
currPitch *= 2;
}
}
/* update the mean pitch frequency */
if(reliable){
float lambda;
if(m_pitchFrameN<10){
lambda = 1-1/(float)(m_pitchFrameN+1);
}
else{
lambda = (float)lambdaPitchTrack;
}
m_meanPitch = lambda*m_meanPitch + (1-lambda)*currPitch;
m_pitchFrameN++;
}
return currPitch;
}
float Fe::pitch_fft_one_freq(float *sample, int blockSize, int p)
{
int i;
float xre=0, xim=0;
for(i=0;i<blockSize;i++){
xre += sample[i]*cos(2*M_PI*i/(float)p);
xim += sample[i]*sin(2*M_PI*i/(float)p);
}
return (float)sqrt(xre*xre + xim*xim);
}
bool Fe::pitch_find_minmax(float *amdfA, int pmin, int pmax, int* smin, int* smax)
{
int s;
*smin = pmin;
*smax = pmin;
float amin = amdfA[pmin];
float amax = amdfA[pmin];
for(s=pmin;s<=pmax;s++){
if(amdfA[s] < amin){
amin = amdfA[s];
*smin = s;
}
else if(amdfA[s] > amax){
amax = amdfA[s];
*smax = s;
}
}
return true;
}
bool Fe::pitch_low_pass_filter(short *sample, int sampleN, int lpfLen)
{
int i,n,sum,kk;
vector<int> tmp(sampleN);
for(n=0;n<lpfLen/2;n++){
sum=0;
kk=0;
for(i=-lpfLen/2;i<=lpfLen/2;i++){
sum += (((n+i) >= 0) ? sample[n+i] : 0);
kk += (((n+i) >= 0) ? 1 : 0);
}
tmp[n] = sum/kk;
}
for(n=lpfLen/2;n<sampleN-lpfLen/2;n++){
sum = 0;
for(i=-lpfLen/2;i<=lpfLen/2;i++) sum += sample[n+i];
tmp[n] = sum/lpfLen;
}
for(n=sampleN-lpfLen/2;n<sampleN;n++){
sum=0;
kk=0;
for(i=-lpfLen/2;i<=lpfLen/2;i++){
sum += (((n+i) < sampleN) ? sample[n+i] : 0);
kk += (((n+i) >= 0) ? 1 : 0);
}
tmp[n] = sum/kk;
}
/* clipping to 16 bit integer */
for(n=0;n<sampleN;n++){
sample[n] = my_min(my_max(SHRT_MIN, tmp[n]), SHRT_MAX);
}
return true;
}
int Fe::pitch_remove_spike(vector<float>& pitchA)
{
int i;
float maxChange=(float)MAX_PITCH_CHANGE_POSTPROC;
int frameN=pitchA.size();
for(i=2;i<frameN-2;i++){
if(pitchA[i-1]==0 && pitchA[i+1]==0){
pitchA[i]=0;
}
else if(pitchA[i-1]>0 && pitchA[i+1]>0){
float ftmp=GetMedian(&pitchA[i-2],5);
if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
pitchA[i]=(pitchA[i-1]+pitchA[i+1])/2;
}
}
else if(i>=5 && pitchA[i]>0 && pitchA[i-1]>0 && pitchA[i-2]>0 && pitchA[i+1]==0){
float ftmp=GetMedian(&pitchA[i-5],5);
if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
pitchA[i]=my_max(-pitchA[i-2]+my_min(2*pitchA[i-1],MAX_PITCH_FREQ), 0);
}
}
else if(i<frameN-5 && pitchA[i]>0 && pitchA[i+1]>0 && pitchA[i+2]>0 && pitchA[i-1]==0){
float ftmp=GetMedian(&pitchA[i],5);
if(fabs(ftmp-pitchA[i])>maxChange*pitchA[i]){
pitchA[i]=my_max(my_min(2*pitchA[i+1],MAX_PITCH_FREQ)-pitchA[i+2], 0);
}
}
}
return frameN;
}
int Fe::pitch_linear_filter(vector<float>& pitchA)
{
int i;
int frameN=pitchA.size();
vector<float> x;
x.resize(frameN);
for(i=0;i<frameN;i++) x[i]=pitchA[i];
for(i=2;i<frameN-2;i++){
if(x[i-2]>0 && x[i-1]>0 && x[i+1]>0 && x[i+2]>0){
pitchA[i]=(x[i-1]+2*x[i]+x[i+1])/4;
}
}
return frameN;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -