📄 speechvoicescore.txt

📁 语音评分算法的实现,主要可以实现对一段语音信号进行判别并进行打分功能.
💻 TXT
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
#if RUN4PC && PRINT && 0
    printf("optimal path: %d - %d (%d) %f %f \n",x,y,grid_size, pscore/1024.0, CalcPitchScore(mult(x,skip1),mult(y,skip2))/1024.0) ;
#endif
    
    switch (step) {
		case FROM_LEFT : 
			x-=2;
			y-=1;
			break;
		case FROM_MID :
			x--;
			y--;
			break;
		case FROM_RIGHT :
			x--;
			y-=2;
			break;
		case FROM_SELF :
			x--;
			break;
		default :
			x=0;
		break;
    }
#if CPA_F
COUNT.addition += 13 ;
COUNT.multy += 4 ;
#endif 
  }
  return (FLOAT)(PWEI*pscore/path_len) ;
}

/****************************************************/
/** Module to extract pitch contour                **/
/****************************************************/

/* calc_corr - normalised correlation between a window of filt and the same
   signal delayed by `period` samples.

   len    - window length in samples; every product is also divided by len,
            so the accumulators hold averages rather than raw sums
   period - lag between the two windows: filt[i] is paired with
            filt[period+i], so filt must hold at least period+len samples
   step   - stride used to walk the window (1 = every sample; must be > 0)
   filt   - input samples

   Returns (ss/x)*(ss/y) carrying the sign of ss, where ss is the averaged
   cross product and x, y are the averaged energies of the two windows,
   i.e. the signed square of the normalised correlation.
   FLT_0_0 is returned when either window has no energy. */
FLOAT calc_corr(INT len, INT period, INT step, SHORT *filt)
{
  INT ss,x,y ;
  INT i;
  INT xi, xj ;
  INT xd, yd ;
  FLOAT cor ;
  
  ss = x = y = 0;
  
  for (i=0; i<len; i+=step) {
	xi = (INT) filt[i] ;
	xj = (INT) filt[period+i] ;
    ss += xi*xj/len ;
    x  += xi*xi/len ;
    y  += xj*xj/len ;
#if CPA_F
COUNT.addition += 9 ;
COUNT.multy += 6 ;
#endif 
  }
  
  /* both energies are used as divisors below: an empty or silent window on
     either side means "no correlation", not a division by zero */
  if (x<1 || y<1) 
    return FLT_0_0 ;
   
  if(ss>=(INT)0x1fffff || ss<=(INT)0xffe00000 ) {
	  /* |ss| is too large to be scaled up by FPN bits, so the
	     denominators are scaled down instead */
	  xd = x>>FPN ;
	  yd = y>>FPN ;
	  if (xd<1 || yd<1)
	    return FLT_0_0 ;      /* scaled-down energy vanished: avoid /0 */
	  cor = mult((FLOAT)(ss/xd),(FLOAT)(ss/yd)) ;
  } else {
	  /* NOTE: for negative ss this relies on the target compiler's
	     left-shift behaviour for negative values, as the original did */
	  cor = mult((FLOAT)((ss<<FPN)/x),(FLOAT)((ss<<FPN)/y)) ;
  }
                  /** now cor becomes 1.5.10 FP format */	  	
                  	  
  /* the product of the two ratios is non-negative; restore the sign of ss */
  if(ss<0)  cor = -cor ;
	
  return cor ;
}


/** find_pitch - first-pass pitch estimate for one frame.

   Steps, in order:
   1. Choose the lag search window [lo, hi): the full MIN_F0..MAX_F0 range,
      or - when *limit_range is set and a previous lag is known - 2/3 to 3/2
      of *last_pitch, clamped to the full range.
   2. Scan the start of the frame for two sign changes (filt[j-1] against
      filt[j+1]); the lower bound is raised to the position where the scan
      stopped.  If that leaves an empty window, no candidate exists.
   3. Coarse search: correlation with stride STEP1 for every lag in the
      window; the best positive value that is at least 3 lags away from
      either window edge becomes the candidate.
   4. Fine search: correlation with stride 1 for the lags in
      [candidate-4, candidate+4), clamped to MIN_F0..MAX_F0.
   5. Voicing decision: the best correlation (FLT_0_0 if no candidate) is
      compared against *thresh, and *thresh is updated depending on whether
      the previous frame was voiced.

   frm_num     - frame index (not used by this function)
   filt        - samples of the frame; read well past the frame start
                 (lag + window length), so the buffer must extend that far
   thresh      - in/out voicing threshold
   last_voiced - in/out voicing decision of the previous / current frame
   last_pitch  - in/out lag chosen for the previous / current frame
   limit_range - in/out counter; non-zero restricts the search window

   Returns the chosen lag (0 when no candidate was found).  The caller must
   consult *last_voiced: a non-zero lag may be returned for an unvoiced frame.
   **/
INT find_pitch(INT frm_num, SHORT *filt, FLOAT *thresh, INT *last_voiced ,INT *last_pitch, INT *limit_range)
{
  INT   lag, crossings, lo, hi ;
  INT   pitch = -1 ;
  INT   is_voiced ;
  INT   searching = 1 ;
  FLOAT corr, peak = FLT_0_0 ;

  /* search window: full range unless narrowed around the previous lag */
  lo = MIN_F0 ;
  hi = MAX_F0 ;
  if (*limit_range && *last_pitch>0) {
    lo = *last_pitch * 2 / 3 ;
    if (lo<MIN_F0) lo = MIN_F0 ;
    hi = *last_pitch * 3 / 2 ;
    if (hi>MAX_F0) hi = MAX_F0 ;
  }

  /* a period must contain at least two zero crossings */
  crossings = 0 ;
  for (lag=1; lag<hi && crossings<2; lag++) {
    if ((INT)filt[lag-1]*filt[lag+1]<0) crossings++ ;
  }
  if (lag>lo) lo = lag ;
  if (lo>=hi) searching = 0 ;

  /* coarse pass over the whole window */
  if (searching) {
    peak = -FLT_2_0 ;
    for (lag=lo; lag<hi; lag++) {
      corr = calc_corr(lag, lag, STEP1, filt) ;
      if (lag<lo+3 || lag>=hi-3)
        continue ;                     /* too close to a window edge */
      if (corr>FLT_0_0 && corr>peak) {
        pitch = lag ;
        peak  = corr ;
      }
    }
    if (pitch==-1) searching = 0 ;
  }

  /* fine pass around the coarse candidate, using best resolution */
  if (searching) {
    lo = pitch-4 ;
    if (lo<MIN_F0) lo = MIN_F0 ;
    hi = pitch+4 ;
    if (hi>MAX_F0) hi = MAX_F0 ;
    peak  = -FLT_2_0 ;
    pitch = 0 ;
    for (lag=lo; lag<hi; lag++) {
      corr = calc_corr(lag, lag, 1, filt) ;
      if (corr>peak) {
        pitch = lag ;
        peak  = corr ;
      }
    }
  }
  if (!searching) peak = FLT_0_0 ;

  /* there used to be a moving threshold, but now this is disabled
     (TMIN=TMAX=THIGH), as a stricter threshold is used here and the
     extending procedure later uses a lower one */
  if (*last_voiced) {
    is_voiced = !(peak < *thresh) ;
    if (is_voiced) {
      *thresh = mult(TMAX,peak) ;
      if (*thresh<TMIN) *thresh = TMIN ;
    } else {
      *thresh = TMIN ;
    }
  } else {
    is_voiced = (peak > *thresh) ;
    if (is_voiced) *thresh = TMIN ;
    else           *thresh = THIGH ;
  }

  *limit_range = (*limit_range+1)%SEQ ;
  if (!is_voiced) *limit_range = 0 ;
  *last_voiced = is_voiced ;

  if (pitch<0) pitch = 0 ;
  *last_pitch = pitch ;
  return(pitch) ;
}


/** Simple algorithm to remove pitch multiples (doubling / halving errors) from the pitch contour. **/
/** Implemented as a small three-state tracker: half, normal and double. **/

/* Absolute value.  The argument is fully parenthesised so that expressions
   built from low-precedence operators are handled correctly.  Note that the
   argument is evaluated twice: do not pass expressions with side effects. */
#define ABS(x) (((x)>0) ? (x) : -(x))

void remove_double(SHORT *freq, char *voicedv, INT from, INT to)
{
  INT cur,i;
  INT pools[3];
  INT count[3] ;
  SHORT last ;
  FLOAT y, s[3] ;

  pools[0] = pools[1] = pools[2] = 0 ;
  count[0] = count[1] = count[2] = 0 ;
  cur = 1 ;
  last = freq[from] ;	
  
  for (i=from; i<=to; i++) { 
	  if (freq[i]==0) freq[i] = last ;
	  y = divi(freq[i],last);
	  if( y > FLT_1_35 ) {  /* counted as double */
		  cur ++ ;
		  if(cur>2) cur = 2 ;
	  } else if ( y < FLT_0_75 ) { /*counted as half */
		  cur -- ;
		  if(cur<0) cur = 0 ;
	  } 
	  
	  pools[cur] += (INT)freq[i] ;
	  count[cur] ++ ;

      if (freq[i]>0) last = freq[i];
  }

  for(i=0;i<3;i++) {
	  if (count[i]) pools[i] /= count[i] ;
	    else pools[i] = 10000 ;   /** 10000 magic number, do we have better one **/
  }

  if(count[2]>count[1] && count[2]>count[0]) { 
	  s[0] = FLT_4_0 ; s[1] = FLT_2_0; s[2] = FLT_1_0; 
	  y = divi(pools[2],pools[1]) ;
	  if(ABS(y-FLT_2_0)>ABS(y-FLT_3_0))  s[1] = FLT_3_0 ;		
   } else if (count[1]>count[0] && count[1]>count[2]) { 
	  s[0] = FLT_2_0; s[1] = FLT_1_0; s[2] = FLT_0_5; 
	  y = divi(pools[1],pools[2]) ;
      if(ABS(y-FLT_0_33)<ABS(y-FLT_0_5)) s[2]=FLT_0_33 ;
	  y = divi(pools[1],pools[0]) ;
	  if(ABS(y-FLT_3_0) < ABS(y-FLT_2_0) ) s[0] = FLT_3_0 ;
	} else { 
		s[0] = FLT_1_0 ; s[1] = FLT_0_5 ; s[2] = FLT_0_25 ; 
	    y = divi(pools[0],pools[1]) ;
		if(ABS(y-FLT_0_33)<ABS(y-FLT_0_5)) s[1] = FLT_0_33 ;
	}

	last = freq[from] ;	
	cur = 1 ;
	for (i=from; i<=to; i++) { 
	  y = divi(freq[i],last);
	  
	  if(y>FLT_1_35) {  /* counted as double */
		cur ++ ;
		if(cur>2) cur = 2 ;
	  } else if (y<FLT_0_75) { /*counted as half */
		 cur -- ;
		 if(cur<0) cur = 0 ;
	  } 
  	  if (freq[i]>0) last = freq[i];
	  freq[i] = mult(freq[i],s[cur]) ;
   }
}

/* interpolate_unvoiced - fill the unvoiced stretches of a pitch contour.

   freq    - in/out contour, len entries
   voicedv - per-frame voicing flags (non-zero = voiced), len entries
   len     - number of frames
   av      - value used in place of a missing (unvoiced) anchor

   Every unvoiced run is replaced by a straight line between the voiced frame
   before it (index j) and the voiced frame after it (index i).  A run that
   starts at frame 0 is filled with the value of its right anchor; a run whose
   right anchor lies at or beyond len-2 is filled with the value of its left
   anchor.  When no voiced frame follows the run, av is used as right anchor
   instead of reading past the end of the arrays. */
void interpolate_unvoiced(SHORT *freq, char * voicedv, INT len, INT /*FLOAT*/ av)
{
  INT     i,j,n;     /* INT, like len, so long contours cannot wrap the index */
  SHORT   l,m;
  
  /* for unvoiced parts, set freq to be linear interpolation between 
     surrounding voiced */
  i = j = 0;
  while (i<len-1) {
    /* skip to the next voiced frame; i == len means there is none */
    while (i<len && !voicedv[i]) i++;
    if (i>j) {
      /* left anchor */
      l = freq[j];
      if (!voicedv[j]) l=(SHORT)av;
      /* right anchor: only touch freq[i]/voicedv[i] when i is in range */
      if (i<len && voicedv[i]) m = freq[i];
      else m = (SHORT)av;
      if (j==0) l=m;            /* leading run: flat at the right anchor */
      if (i>=len-2) m = l;      /* trailing run: flat at the left anchor */
      for (n=j; n<i; n++) 
	      freq[n] = (SHORT) (l + (n-j)*(m-l)/(i-j)) ;
#if CPA_F
COUNT.addition += 15 ;
COUNT.multy += 2 ;
#endif 
    }
    /* skip the voiced run; its last frame is the next left anchor */
    while (i<len && voicedv[i]) i++;
    j = i-1;
  }
}

/* rect_f0 - clean up the voicing decisions and fix pitch multiples.

   freq     - in/out pitch contour, len entries
   v        - in/out voicing flags (0 / 1), len entries
   len      - number of frames
   filt_all - not used by this function
   av       - out: mean of freq over the voiced frames 1..len-1, computed
              before any clean-up; set to 100 when no frame is voiced (in
              which case nothing else is done)

   Steps: fill short unvoiced gaps inside voiced stretches, drop isolated
   voiced frames, run remove_double() on every voiced stretch that ends
   before the last frame, then fill short gaps once more. */
void rect_f0(SHORT *freq, char *v, INT len, SHORT *filt_all, INT  *av)
{
    INT i,j,l,k ;
    
    /* calc first estimate of file freq avrg */
    *av=0;  j = 0;
    for(i=1; i<len; i++) 
     if (v[i]) {
      *av += (INT) freq[i];
      j++;
    } 
    if (j==0) {
        *av = 100;      /* no voiced frame at all: fixed fallback value */
        return;
    }
    *av /= j;

#if RUN4PC && PRINT && 0 
     if (PRINT) printf("av %d : %d\n", j, *av); 
#endif
   
    /* remove short uv parts (1 or 2 or 3) inside v parts */    
    /* a single unvoiced frame with voiced neighbours on both sides, and a
       voiced frame two steps away on at least one side, becomes voiced */
    for (i=2; i<len-2; i++) 
      if (v[i]==0 && v[i+1]==1 && v[i-1]==1 && (v[i-2]==1 || v[i+2]==1)) v[i]=1;
    
    /* at every voiced->unvoiced edge: j counts the voiced frames in
       [i-2, i+4]; a gap of one or two frames is filled when enough voiced
       frames surround it, otherwise the trailing voiced frame i is dropped */
    for (i=2; i<len-5; i++) if (v[i]==1 && v[i+1]==0) {
      j=0;   l=0;
      for (k=i-2; k<=i+4; k++) j+=v[k];
      if (v[i+2]==1 && (j>=3 && v[i+3]==1) ) { v[i+1]=1; l=1;}
      else  if (v[i+3]==1 && (j>3 && v[i+4]==1)) { v[i+1]=1; v[i+2]=1; l=1;}
      if (!l) v[i]=0;
    }
    
    /* remove isolated voiced - if only one frame in 5 is voiced - turn it off */
    for (i=3; i<len-4; i++) 
      if (v[i]) {
        j=0;
        for (k=-2; k<3; k++) j+= v[i+k];
        if (j<2) v[i] =0;
      }
        
    /* for each voiced part - remove doubling and halving */
    j = 0;
    while (j<len) {
      /* bounds check first, so v[len] is never read */
      while (j<len && v[j]==0) j++;
      k = j;
      while (k<len && v[k]==1) k++;
      /* a stretch that runs up to the very last frame is left untouched */
      if (k<len) {
        remove_double(freq, v, j,k-1);
      }
      j = k;
    }
    
    /* again - remove short uv parts (1 or 2) inside v parts */    
    for (i=0; i<len-4; i++) if (v[i]==1 && v[i+1]==0) {
      if (v[i+2]==1) v[i+1]=1;
      else  if (v[i+3]==1) 
      	{ v[i+1]=1; v[i+2]=1;}
    }
}

/* fulp(): main function to extract the pitch contour of a speech utterance.

   smp     - speech PCM, pointing at the beginning of speech; frame i starts
             at smp[i*FRAME_SHIFT] and find_pitch() reads ahead from there,
             so the buffer must extend beyond the last frame start
   len     - number of frames
   voicedv - out: per-frame voicing flag (0 / 1), len entries
   freq    - out: per-frame pitch value, (SAMPRATE<<2)/lag for voiced frames,
             i.e. scaled by 4; unvoiced frames are filled in by
             interpolate_unvoiced()
   av      - out: average pitch value, as computed by rect_f0()

   The first frame and the last frame are always forced to unvoiced.
   Returns 0; no caller visible in this file uses the return value. */
INT fulp(SHORT *smp, INT len, char *voicedv, SHORT *freq, INT *av )
{
	INT     i, k ;
	FLOAT   thresh = TMIN ;
	/* last_pitch is initialised so find_pitch() never sees garbage */
	INT     last_voiced=0,last_pitch=0,limit_range=0; 
				
	  /* find pitch */
  	for (i=0; i<len; i++) {	
      k = find_pitch(i, &(smp[i*FRAME_SHIFT]), &thresh, &last_voiced, &last_pitch, &limit_range);

      voicedv[i]=last_voiced;
      /* a lag of 0 means "no candidate": treat the frame as unvoiced */
      if (!last_voiced || k==0) { freq[i]=0; voicedv[i]=0;}
      else freq[i]=(SHORT)((SAMPRATE<<2)/k);

  }
  
  /* force both ends to unvoiced; skipped for an empty utterance so that
     voicedv[-1] is never written */
  if (len > 0) {
    freq[0]=0; voicedv[0]=0; 
    voicedv[len-1]=0; 
  }

  rect_f0(freq, voicedv, len, smp, av);
 
  interpolate_unvoiced(freq, voicedv, len, *av);
    
#if RUN4PC && PRINT && 0
  for(i=0; i<len; i++)
    printf("Freq[%d]=%f (%d)\r\n",i,freq[i]/4.0,voicedv[i]) ;
#endif

  return 0 ;
}

/****************************************/
/* main function to calculate the score */
/****************************************/

/* Pipeline: load the teacher and student utterances, endpoint both, extract
   and normalise their pitch contours, extract features with HCode(), align
   the two utterances with align() and compute the pitch score in
   backtrack().  With RUN4PC the two utterances are read from the wav files
   named by argv[1] (teacher) and argv[2] (student).

   Returns 0 on completion, 1 when the two file names are missing (RUN4PC
   builds only). */
int main(int argc, char **argv)
{
  INT TeaLen, TeaStart, TeaEnd ;
  INT StdLen, StdStart, StdEnd ;
  INT     grid_size;
  FLOAT   align_score, p_score;
#if CPA_F
  INT duration ;
#endif  
  
#if RUN4PC
  /* both file names are required; do not touch argv[1]/argv[2] otherwise */
  if (argc < 3) {
#if PRINT
    printf("usage: <program> <teacher wav> <student wav>\n") ;
#endif
    return 1 ;
  }
  TeaLen = readwav(argv[1],TeacherSpeech) ;
  StdLen = readwav(argv[2],StudentSpeech) ;
#else
  /* for running in DSP chips */
  /* initialize TeacherSpeech to teacher's PCM */
  /* initialize StudentSpeech to student's PCM */
  /* NOTE: TeaLen and StdLen are not assigned on this path; whoever fills in
     the section below must set them to the two sample counts */

  /* ADD HERE */ 
  
#endif

#if CPA_F
duration = (StdLen>TeaLen?StdLen:TeaLen)/SAMPRATE ;
#endif

  /* Endpointing */
  TeaLen = EndPointer(TeacherSpeech,TeaLen/FRAME_SHIFT+1, &TeaStart, &TeaEnd) ;
  StdLen = EndPointer(StudentSpeech,StdLen/FRAME_SHIFT+1, &StdStart, &StdEnd) ;
 
#if CPA_F && 0
PrintCOUNT("Endpointing", duration ) ;
#endif

     /* set grid size to be the minimum of both utterance lengths and MAX_GRID */
  if (StdLen<TeaLen)     grid_size = StdLen;
  else grid_size = TeaLen;
  if (grid_size>MAX_GRID)      grid_size = MAX_GRID;
  
   /** Pitch extraction **/
  fulp(&TeacherSpeech[TeaStart*FRAME_SHIFT], TeaLen, voicedv1, freq1, &av1) ;
  fulp(&StudentSpeech[StdStart*FRAME_SHIFT], StdLen, voicedv2, freq2, &av2) ;

  Normalize_Pitch(freq1, voicedv1, TeaLen) ;
  Normalize_Pitch(freq2, voicedv2, StdLen) ;
  
#if CPA_F && 0
PrintCOUNT("Pitch Extraction", duration ) ;
#endif
   
  /** MFCC extraction **/
  HCode(&TeacherSpeech[TeaStart*FRAME_SHIFT], TeaLen, TeacherFeature, grid_size) ;
  HCode(&StudentSpeech[StdStart*FRAME_SHIFT], StdLen, StudentFeature, grid_size) ;

#if CPA_F && 0
PrintCOUNT("MFCC", duration ) ;
#endif
   
   
  /**  DTW  **/
  align_score = align(grid_size, &TeaStart, &StdStart);

  /** Calculate pitch score during backtrack **/
   p_score = backtrack(TeaStart, StdStart, grid_size, TeaLen, StdLen);
   
#if RUN4PC && PRINT
  printf("\nscore1 = %.2f   score2 = %.2f\n", align_score/1024.0, p_score/1024.0 ) ;

  printf("\nTotal score = %.2f      (the smaller the better)\n\n", (align_score+p_score)/1024.0 ) ;
  printf("     < 3.0      --> excellent\n" );
  printf("     3.0-4.0    --> very good\n") ;
  printf("     4.0-5.5    --> good\n") ;
  printf("     5.5-7.0    --> poor\n") ;
  printf("     > 7.0      --> very bad\n\n") ;
#endif

#if CPA_F
PrintCOUNT("Totally", duration ) ;
#endif

  return 0 ;
}
上一页 1 23
💿 文件大小 12 K
👤 上传用户 bling
📂 所属分类嵌入式/单片机编程
📄 代码行数 1,426 行
💻 语言类型 TXT
🏷️ 相关标签

#分 #语音 #算法 #判别
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -