📄 speechvoicescore.txt
字号:
#if RUN4PC && PRINT && 0
printf("optimal path: %d - %d (%d) %f %f \n",x,y,grid_size, pscore/1024.0, CalcPitchScore(mult(x,skip1),mult(y,skip2))/1024.0) ;
#endif
switch (step) {
case FROM_LEFT :
x-=2;
y-=1;
break;
case FROM_MID :
x--;
y--;
break;
case FROM_RIGHT :
x--;
y-=2;
break;
case FROM_SELF :
x--;
break;
default :
x=0;
break;
}
#if CPA_F
COUNT.addition += 13 ;
COUNT.multy += 4 ;
#endif
}
return (FLOAT)(PWEI*pscore/path_len) ;
}
/****************************************************/
/** Module to extract pitch contour **/
/****************************************************/
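/* calc correlation - samples are in vec filt; the summation is performed over
len samples, advancing step samples each time, between the signal and the same
signal delayed by the pitch value period. (sq, a vec of the squares of filt,
and enrg, the average energy in the checked part, are not parameters of
calc_corr as written here; the squared terms are accumulated inside the loop.) */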
/*
 * calc_corr - signed, squared, normalised correlation at one lag.
 *
 *   len    : length of the window; the loop index runs over 0 <= i < len and
 *            every product is divided by len before being accumulated
 *   period : lag, i.e. offset of the second sample; filt[period + i] is read
 *   step   : increment of the loop index (must be > 0)
 *   filt   : sample buffer; indices up to period + len - 1 are read
 *
 * Returns mult(ss/x, ss/y) in fixed point, negated when the cross term ss is
 * negative, where x and y are the accumulated energies of the two windows.
 * Returns FLT_0_0 when either energy is not positive, or when a divisor
 * would be zero (the original only guarded x, so y == 0 divided by zero).
 */
FLOAT calc_corr(INT len, INT period, INT step, SHORT *filt)
{
    INT ss, x, y;
    INT i;
    INT xi, xj;
    INT dx, dy;
    FLOAT cor;

    ss = x = y = 0;
    for (i = 0; i < len; i += step) {
        xi = (INT) filt[i];
        xj = (INT) filt[period + i];
        ss += xi * xj / len;
        x  += xi * xi / len;
        y  += xj * xj / len;
#if CPA_F
        COUNT.addition += 9;
        COUNT.multy += 6;
#endif
    }

    /* both energies are used as divisors below */
    if (x < 1 || y < 1)
        return FLT_0_0;

    if (ss >= (INT)0x1fffff || ss <= (INT)0xffe00000) {
        /* large cross term: scale the divisors down instead of scaling ss up */
        dx = x >> FPN;
        dy = y >> FPN;
        if (dx == 0 || dy == 0)
            return FLT_0_0;
        cor = mult((FLOAT)(ss / dx), (FLOAT)(ss / dy));
    } else {
        /* multiply instead of "ss << FPN": ss may be negative here and
           left-shifting a negative value is undefined */
        cor = mult((FLOAT)((ss * ((INT)1 << FPN)) / x),
                   (FLOAT)((ss * ((INT)1 << FPN)) / y));
    }
    /** now cor becomes 1.5.10 FP format */
    if (ss < 0)
        cor = -cor;
    return cor;
}
/** find pitch - first pass of pitch finding.
first - check if energy and variance are more than a threshold value. if not -
decide unvoiced.
then check there is a zero crossing inside the allowed pitch period. if not - uv.
Set range of allowed pitch values to full range, or to a smaller range around the
last pitch value (if one was found).
Calculate the correlation (low resolution) for the allowed pitch values. if a peak
is found, choose it.
check correlation values near the peak with high resolution, and choose the pitch
which gives the best correlation value.
check that the correlation value is more than a threshold.
Allow another check - to find if a doubling or halving occurred (this is needed
because otherwise in the next step we will look only around the found pitch value,
and errors could be recovered from only in the second pass).
last - prepare for a check - the correlation values for several multipliers.
**/
/*
 * find_pitch - estimate the pitch lag of one frame and update the tracker state.
 *
 *   frm_num     : not used by this function
 *   filt        : samples of the frame; passed to calc_corr with lag == window
 *   thresh      : in/out voicing threshold compared with the best correlation
 *   last_voiced : in: voicing decision of the previous call; out: this one
 *   last_pitch  : in: lag returned by the previous call; out: this one
 *   limit_range : in/out counter; when non-zero (and *last_pitch > 0) the
 *                 search window is narrowed to 2/3 .. 3/2 of *last_pitch
 *
 * Returns the chosen lag, or 0 when no candidate was found.  The value is
 * returned even if the frame is judged unvoiced; callers have to look at
 * *last_voiced as well.
 */
INT find_pitch(INT frm_num, SHORT *filt, FLOAT *thresh, INT *last_voiced ,INT *last_pitch, INT *limit_range)
{
    INT lag, crossings;
    INT lo, hi;
    INT chosen_pitch = -1;
    INT voiced;
    INT found;
    FLOAT corr, best_corr;

    /* lo and hi are the min and max candidate lags; the window shrinks
       around the previous pitch when limit_range is active */
    lo = MIN_F0;
    hi = MAX_F0;
    if (*limit_range && *last_pitch > 0) {
        lo = *last_pitch * 2 / 3;
        if (lo < MIN_F0)
            lo = MIN_F0;
        hi = *last_pitch * 3 / 2;
        if (hi > MAX_F0)
            hi = MAX_F0;
    }

    /* there must be at least two zero crossings in a period: advance until
       two sign changes have been seen and raise the lower bound to there */
    crossings = 0;
    for (lag = 1; lag < hi && crossings < 2; lag++) {
        if ((INT)filt[lag-1]*filt[lag+1] < 0)
            crossings++;
    }
    if (lag > lo)
        lo = lag;
    found = (lo < hi);

    if (found) {
        /* coarse pass (STEP1): the correlation is computed for every lag,
           but the three lags at each end of the window cannot be chosen */
        best_corr = -FLT_2_0;
        for (lag = lo; lag < hi; lag++) {
            corr = calc_corr(lag, lag, STEP1, filt);
            if (lag < lo + 3 || lag >= hi - 3)
                continue;
            if (corr > FLT_0_0 && corr > best_corr) {
                chosen_pitch = lag;
                best_corr = corr;
            }
        }
        found = (chosen_pitch != -1);
    }

    if (found) {
        /* fine pass: re-check around the best candidate with step 1 */
        lo = chosen_pitch - 4;
        if (lo < MIN_F0)
            lo = MIN_F0;
        hi = chosen_pitch + 4;
        if (hi > MAX_F0)
            hi = MAX_F0;
        best_corr = -FLT_2_0;
        chosen_pitch = 0;
        for (lag = lo; lag < hi; lag++) {
            corr = calc_corr(lag, lag, 1, filt);
            if (corr > best_corr) {
                chosen_pitch = lag;
                best_corr = corr;
            }
        }
    } else {
        best_corr = FLT_0_0;
    }

    /* there used to be a moving threshold, but now this is disabled
       (TMIN=TMAX=THIGH), as we use here a stricter threshold, and later use
       the extending proc with a lower thresh */
    if (*last_voiced) {
        if (best_corr < *thresh) {
            voiced = 0;
            *thresh = TMIN;
        } else {
            voiced = 1;
            *thresh = mult(TMAX, best_corr);
            if (*thresh < TMIN)
                *thresh = TMIN;
        }
    } else if (best_corr > *thresh) {
        voiced = 1;
        *thresh = TMIN;
    } else {
        voiced = 0;
        *thresh = THIGH;
    }

    /* the counter cycles modulo SEQ while voiced and resets when unvoiced */
    *limit_range = (*limit_range + 1) % SEQ;
    if (!voiced)
        *limit_range = 0;

    *last_voiced = voiced;
    if (chosen_pitch < 0)
        chosen_pitch = 0;
    *last_pitch = chosen_pitch;
    return chosen_pitch;
}
/** simple algorithm to remove multiples (doubling and halving) in the pitch contour **/
/** Implemented as a simple three-state method **/
#define ABS(x) ((x>0)? (x) : -(x))
void remove_double(SHORT *freq, char *voicedv, INT from, INT to)
{
INT cur,i;
INT pools[3];
INT count[3] ;
SHORT last ;
FLOAT y, s[3] ;
pools[0] = pools[1] = pools[2] = 0 ;
count[0] = count[1] = count[2] = 0 ;
cur = 1 ;
last = freq[from] ;
for (i=from; i<=to; i++) {
if (freq[i]==0) freq[i] = last ;
y = divi(freq[i],last);
if( y > FLT_1_35 ) { /* counted as double */
cur ++ ;
if(cur>2) cur = 2 ;
} else if ( y < FLT_0_75 ) { /*counted as half */
cur -- ;
if(cur<0) cur = 0 ;
}
pools[cur] += (INT)freq[i] ;
count[cur] ++ ;
if (freq[i]>0) last = freq[i];
}
for(i=0;i<3;i++) {
if (count[i]) pools[i] /= count[i] ;
else pools[i] = 10000 ; /** 10000 magic number, do we have better one **/
}
if(count[2]>count[1] && count[2]>count[0]) {
s[0] = FLT_4_0 ; s[1] = FLT_2_0; s[2] = FLT_1_0;
y = divi(pools[2],pools[1]) ;
if(ABS(y-FLT_2_0)>ABS(y-FLT_3_0)) s[1] = FLT_3_0 ;
} else if (count[1]>count[0] && count[1]>count[2]) {
s[0] = FLT_2_0; s[1] = FLT_1_0; s[2] = FLT_0_5;
y = divi(pools[1],pools[2]) ;
if(ABS(y-FLT_0_33)<ABS(y-FLT_0_5)) s[2]=FLT_0_33 ;
y = divi(pools[1],pools[0]) ;
if(ABS(y-FLT_3_0) < ABS(y-FLT_2_0) ) s[0] = FLT_3_0 ;
} else {
s[0] = FLT_1_0 ; s[1] = FLT_0_5 ; s[2] = FLT_0_25 ;
y = divi(pools[0],pools[1]) ;
if(ABS(y-FLT_0_33)<ABS(y-FLT_0_5)) s[1] = FLT_0_33 ;
}
last = freq[from] ;
cur = 1 ;
for (i=from; i<=to; i++) {
y = divi(freq[i],last);
if(y>FLT_1_35) { /* counted as double */
cur ++ ;
if(cur>2) cur = 2 ;
} else if (y<FLT_0_75) { /*counted as half */
cur -- ;
if(cur<0) cur = 0 ;
}
if (freq[i]>0) last = freq[i];
freq[i] = mult(freq[i],s[cur]) ;
}
}
/*
 * interpolate_unvoiced - fill the unvoiced stretches of freq[0..len-1].
 *
 *   freq    : contour, modified in place
 *   voicedv : per-frame voicing flags (non-zero = voiced), read only
 *   len     : number of frames
 *   av      : value used for an end point that is not a voiced frame
 *
 * Each unvoiced stretch is replaced by a linear interpolation between the
 * voiced frame before it and the voiced frame after it.  A stretch that
 * starts at frame 0 is filled flat with the right-hand value, and a stretch
 * whose right end is at or beyond len-2 is filled flat with the left-hand
 * value.
 *
 * Fixes over the original: when the stretch runs to the end of the contour
 * the index reaches len, and freq[len] / voicedv[len] were read; av is now
 * used for that end point instead.  The indices are INT so that they cannot
 * overflow when compared with len.
 */
void interpolate_unvoiced(SHORT *freq, char * voicedv, INT len, INT /*FLOAT*/ av)
{
    INT i, j, n;
    SHORT l, m;

    /* for unvoiced parts, set freq to be linear interpolation between
       surrounding voiced */
    i = j = 0;
    while (i < len-1) {
        /* skip to the next voiced frame (or to len) */
        while (i < len && !voicedv[i])
            i++;
        if (i > j) {
            /* left end point */
            l = freq[j];
            if (!voicedv[j])
                l = av;
            /* right end point: i < len implies voicedv[i] is set here */
            if (i < len)
                m = freq[i];
            else
                m = av;
            if (j == 0)
                l = m;
            if (i >= len-2)
                m = l;
            for (n = j; n < i; n++)
                freq[n] = (SHORT) (l + (n-j)*(m-l)/(i-j));
#if CPA_F
            COUNT.addition += 15;
            COUNT.multy += 2;
#endif
        }
        /* skip the voiced run; j becomes its last frame */
        while (i < len && voicedv[i])
            i++;
        j = i-1;
    }
}
/* remove short and isolated voiced or unvoiced parts, calc average file freq,
call the smoothing and remove doubling */
/*
 * rect_f0 - clean up the voicing flags and fix doubling/halving per segment.
 *
 *   freq     : contour; modified by remove_double for each voiced segment
 *   v        : per-frame voicing flags (0 / 1), modified in place
 *   len      : number of frames
 *   filt_all : not used by this function
 *   av       : out: mean of freq over the voiced frames 1..len-1, computed
 *              before any clean-up; set to 100 when no frame is voiced, in
 *              which case the function returns without further processing
 *
 * Fix over the original: the scan for the start of a voiced segment tested
 * v[j] before j < len and therefore read v[len].
 */
void rect_f0(SHORT *freq, char *v, INT len, SHORT *filt_all, INT *av)
{
    INT i, j, l, k;

    /* calc first estimate of file freq avrg */
    *av = 0;
    j = 0;
    for (i = 1; i < len; i++)
        if (v[i]) {
            *av += (INT) freq[i];
            j++;
        }
    if (j == 0) {
        *av = 100;
        return;
    }
    *av /= j;
#if RUN4PC && PRINT && 0
    if (PRINT) printf("av %d : %d\n", j, *av);
#endif

    /* remove short uv parts (1 or 2 or 3) inside v parts */
    for (i = 2; i < len-2; i++)
        if (v[i]==0 && v[i+1]==1 && v[i-1]==1 && (v[i-2]==1 || v[i+2]==1))
            v[i] = 1;
    /* at each voiced -> unvoiced edge: bridge a gap of one or two frames when
       enough of v[i-2..i+4] is voiced, otherwise switch frame i off */
    for (i = 2; i < len-5; i++)
        if (v[i]==1 && v[i+1]==0) {
            j = 0;
            l = 0;
            for (k = i-2; k <= i+4; k++)
                j += v[k];
            if (v[i+2]==1 && (j>=3 && v[i+3]==1)) {
                v[i+1] = 1;
                l = 1;
            } else if (v[i+3]==1 && (j>3 && v[i+4]==1)) {
                v[i+1] = 1;
                v[i+2] = 1;
                l = 1;
            }
            if (!l)
                v[i] = 0;
        }

    /* remove isolated voiced - if only one frame in 5 is voiced - turn it off */
    for (i = 3; i < len-4; i++)
        if (v[i]) {
            j = 0;
            for (k = -2; k < 3; k++)
                j += v[i+k];
            if (j < 2)
                v[i] = 0;
        }

    /* for each voiced part - remove doubling and halving */
    j = 0;
    while (j < len) {
        /* bound check first, so that v[len] is never read */
        while (j < len && v[j]==0)
            j++;
        k = j;
        while (k < len && v[k]==1)
            k++;
        /* a segment that runs up to the last frame is left untouched */
        if (k < len) {
            remove_double(freq, v, j, k-1);
        }
        j = k;
    }

    /* again - remove short uv parts (1 or 2) inside v parts */
    for (i = 0; i < len-4; i++)
        if (v[i]==1 && v[i+1]==0) {
            if (v[i+2]==1)
                v[i+1] = 1;
            else if (v[i+3]==1) {
                v[i+1] = 1;
                v[i+2] = 1;
            }
        }
}
/* fulp(): main function to extract the pitch contour of a speech utterance */
/** input: speech PCM in smp; len is the number of frames **/
/* smp --> beginning of speech */
/*
 * fulp - extract the pitch contour of an utterance.
 *
 *   smp     : PCM samples; frame i starts at smp[i*FRAME_SHIFT]
 *   len     : number of frames
 *   voicedv : out: per-frame voicing flag
 *   freq    : out: per-frame value (SAMPRATE<<2)/k for a voiced frame with
 *             pitch lag k, 0 for other frames before post-processing;
 *             then processed by rect_f0 and interpolate_unvoiced
 *   av      : out: average set by rect_f0
 *
 * The first frame and the last frame are always marked unvoiced.
 *
 * Returns 0.  The original was declared INT but fell off the end without a
 * return statement, so no caller could rely on a value.  Other fixes:
 * last_pitch is initialised before its address is handed to find_pitch, and
 * the first/last-frame writes are skipped when len <= 0 (the original wrote
 * voicedv[len-1] unconditionally).
 */
INT fulp(SHORT *smp, INT len, char *voicedv, SHORT *freq, INT *av )
{
    INT i, k;
    FLOAT thresh = TMIN;
    INT last_voiced = 0, last_pitch = 0, limit_range = 0;

    /* find pitch */
    for (i = 0; i < len; i++) {
        k = find_pitch(i, &(smp[i*FRAME_SHIFT]), &thresh, &last_voiced, &last_pitch, &limit_range);
        voicedv[i] = last_voiced;
        if (!last_voiced || k == 0) {
            freq[i] = 0;
            voicedv[i] = 0;
        } else {
            freq[i] = (SHORT)((SAMPRATE<<2)/k);
        }
    }

    if (len > 0) {
        freq[0] = 0;
        voicedv[0] = 0;
        voicedv[len-1] = 0;
    }

    rect_f0(freq, voicedv, len, smp, av);
    interpolate_unvoiced(freq, voicedv, len, *av);
#if RUN4PC && PRINT && 0
    for (i = 0; i < len; i++)
        printf("Freq[%d]=%f (%d)\r\n",i,freq[i]/4.0,voicedv[i]) ;
#endif
    return 0;
}
/****************************************/
/* main function to calculate the score */
/****************************************/
/*
 * main - score a student utterance against a teacher utterance.
 *
 * With RUN4PC the two recordings are read with readwav() from argv[1]
 * (teacher) and argv[2] (student).  The steps are: endpointing, pitch
 * extraction and normalisation, feature extraction (HCode), alignment
 * (align) and the pitch score computed by backtrack.
 *
 * Returns 0 on completion, 1 when the two file arguments are missing.
 *
 * Fixes over the original: argv[1] and argv[2] were used without checking
 * argc, main had no return statement, and TeaLen / StdLen were left
 * indeterminate in the non-RUN4PC path.
 */
int main(int argc, char **argv)
{
    INT TeaLen, TeaStart, TeaEnd ;
    INT StdLen, StdStart, StdEnd ;
    INT grid_size;
    FLOAT align_score, p_score;
#if CPA_F
    INT duration ;
#endif

#if RUN4PC
    if (argc < 3) {
        printf("usage: %s <teacher wav> <student wav>\n", argc > 0 ? argv[0] : "score");
        return 1;
    }
    TeaLen = readwav(argv[1],TeacherSpeech) ;
    StdLen = readwav(argv[2],StudentSpeech) ;
#else
    /* for running in DSP chips */
    /* initialize TeacherSpeech to teacher's PCM */
    /* initialize StudentSpeech to student's PCM */
    /* ADD HERE */
    /* placeholder values so the lengths are never read uninitialised;
       the code added above must set the real sample counts */
    (void)argc;
    (void)argv;
    TeaLen = 0;
    StdLen = 0;
#endif

#if CPA_F
    duration = (StdLen>TeaLen?StdLen:TeaLen)/SAMPRATE ;
#endif

    /* Endpointing */
    TeaLen = EndPointer(TeacherSpeech,TeaLen/FRAME_SHIFT+1, &TeaStart, &TeaEnd) ;
    StdLen = EndPointer(StudentSpeech,StdLen/FRAME_SHIFT+1, &StdStart, &StdEnd) ;
#if CPA_F && 0
    PrintCOUNT("Endpointing", duration ) ;
#endif

    /* set grid size to be minimum of file length and MAX_GRID */
    if (StdLen<TeaLen) grid_size = StdLen;
    else grid_size = TeaLen;
    if (grid_size>MAX_GRID) grid_size = MAX_GRID;

    /** Pitch extraction **/
    fulp(&TeacherSpeech[TeaStart*FRAME_SHIFT], TeaLen, voicedv1, freq1, &av1) ;
    fulp(&StudentSpeech[StdStart*FRAME_SHIFT], StdLen, voicedv2, freq2, &av2) ;
    Normalize_Pitch(freq1, voicedv1, TeaLen) ;
    Normalize_Pitch(freq2, voicedv2, StdLen) ;
#if CPA_F && 0
    PrintCOUNT("Pitch Extraction", duration ) ;
#endif

    /** MFCC extraction **/
    HCode(&TeacherSpeech[TeaStart*FRAME_SHIFT], TeaLen, TeacherFeature, grid_size) ;
    HCode(&StudentSpeech[StdStart*FRAME_SHIFT], StdLen, StudentFeature, grid_size) ;
#if CPA_F && 0
    PrintCOUNT("MFCC", duration ) ;
#endif

    /** DTW **/
    align_score = align(grid_size, &TeaStart, &StdStart);

    /** Calculate pitch score during backtrack **/
    p_score = backtrack(TeaStart, StdStart, grid_size, TeaLen, StdLen);

#if RUN4PC && PRINT
    printf("\nscore1 = %.2f score2 = %.2f\n", align_score/1024.0, p_score/1024.0 ) ;
    printf("\nTotal score = %.2f (the smaller the better)\n\n", (align_score+p_score)/1024.0 ) ;
    printf(" < 3.0 --> excellent\n" );
    printf(" 3.0-4.0 --> very good\n") ;
    printf(" 4.0-5.5 --> good\n") ;
    printf(" 5.5-7.0 --> poor\n") ;
    printf(" > 7.0 --> very bad\n\n") ;
#endif
#if CPA_F
    PrintCOUNT("Totally", duration ) ;
#endif
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -