dp_f0.c

来自「speech signal process tools」· C语言代码 · 共 919 行 · 第 1/2 页
919 行
/* * This material contains unpublished, proprietary software of  * Entropic Research Laboratory, Inc. Any reproduction, distribution,  * or publication of this work must be authorized in writing by Entropic  * Research Laboratory, Inc., and must bear the notice:  * *    "Copyright (c) 1990-1996 Entropic Research Laboratory, Inc.  *                   All rights reserved" * * The copyright notice above does not evidence any actual or intended  * publication of this source code.      * * Written by:  David Talkin * Checked by: * Revised by:  Derek Lin, David Talkin * * Brief description:  Estimate speech fundamental frequency. * */static char *sccs_id = "@(#)dp_f0.c	1.14	10/21/96	ERL";/* A fundamental frequency estimation algorithm using the normalized   cross correlation function and dynamic programming.  The algorithm   implemented here is similar to that presented by B. Secrest and   G. Doddington, "An integrated pitch tracking algorithm for speech   systems", Proc. ICASSP-83, pp.1352-1355.  It is fully described   by D. Talkin, "A robust algorithm for ptich tracking (RAPT)", in   W. B. Kleijn & K. K. Paliwal (eds.) Speech Coding and Synthesis,   (New York: Elsevier, 1995). *//* For each frame, up to par->n_cands cross correlation peaks are   considered as F0 intervals.  Each is scored according to its within-   frame properties (relative amplitude, relative location), and   according to its connectivity with each of the candidates in the   previous frame.  An unvoiced hypothesis is also generated at each   frame and is considered in the light of voicing state change cost,   the quality of the cross correlation peak, and frequency continuity. *//* At each frame, each candidate has associated with it the following   items:	its peak value	its peak value modified by its within-frame properties	its location	the candidate # in the previous frame yielding the min. err.		(this is the optimum path pointer!)	its cumulative cost: (local cost + connectivity cost +		cumulative cost of its best-previous-frame-match). *//* Dynamic programming is then used to pick the best F0 trajectory and voicing   state given the local and transition costs for the entire utterance. *//* To avoid the necessity of computing the full crosscorrelation at   the input sample rate, the signal is downsampled; a full ccf is   computed at the lower frequency; interpolation is used to estimate the   location of the peaks at the higher sample rate; and the fine-grained   ccf is computed only in the vicinity of these estimated peak   locations. */#include <math.h>#include <malloc.h>#include <esps/esps.h>#include "f0.h"#include "f0_structs.h"extern int  debug_level;extern char *ProgName;  /* * READ_SIZE: length of input data frame in sec to read * DP_CIRCULAR: determines the initial size of DP circular buffer in sec * DP_HIST: stored frame history in second before checking for common path  *      DP_CIRCULAR > READ_SIZE, DP_CIRCULAR at least 2 times of DP_HIST  * DP_LIMIT: in case no convergence is found, DP frames of DP_LIMIT secs *      are kept before output is forced by simply picking the lowest cost *      path */#define READ_SIZE 0.2#define DP_CIRCULAR 1.5#define DP_HIST 0.5#define DP_LIMIT 1.0/*  * stationarity parameters - * STAT_WSIZE: window size in sec used in measuring frame energy/stationarity * STAT_AINT: analysis interval in sec in measuring frame energy/stationarity */#define STAT_WSIZE 0.030#define STAT_AINT 0.020/* * headF points to current frame in the circular buffer,  * tailF points to the frame where tracks start * cmpthF points to starting frame of converged path to backtrack */static Frame *headF = NULL, *tailF = NULL, *cmpthF = NULL;static  int *pcands = NULL;	/* array for backtracking in convergence check */static int cir_buff_growth_count = 0;static int size_cir_buffer,	/* # of frames in circular DP buffer */           size_frame_hist,	/* # of frames required before convergence test */           size_frame_out,	/* # of frames before forcing output */           num_active_frames,	/* # of frames from tailF to headF */           output_buf_size;	/* # of frames allocated to output buffers *//*  * DP parameters */static float tcost, tfact_a, tfact_s, frame_int, vbias, fdouble, wdur, ln2,             freqwt, lagwt;static int step, size, nlags, start, stop, ncomp, *locs = NULL;static short maxpeaks;static int wReuse = 0;  /* number of windows seen before resued */static Windstat *windstat;static float *f0p = NULL, *vuvp = NULL, *rms_speech = NULL,              *acpkp = NULL, *peaks = NULL;static int first_time = 1, pad;/*--------------------------------------------------------------------*/intget_Nframes(buffsize, pad, step)    long    buffsize;    int     pad, step;{  if (buffsize < pad)    return (0);  else    return ((buffsize - pad)/step);}/*--------------------------------------------------------------------*/intinit_dp_f0(freq, par, buffsize, sdstep)    double	freq;    F0_params	*par;    long	*buffsize, *sdstep;{  int nframes;  int i;  int stat_wsize, agap, ind, downpatch;/* * reassigning some constants  */  tcost = par->trans_cost;  tfact_a = par->trans_amp;  tfact_s = par->trans_spec;  vbias = par->voice_bias;  fdouble = par->double_cost;  frame_int = par->frame_step;    step = round(frame_int * freq);  size = round(par->wind_dur * freq);  frame_int = ((float)step)/freq;  wdur = ((float)size)/freq;  start = round(freq / par->max_f0);  stop = round(freq / par->min_f0);  nlags = stop - start + 1;  ncomp = size + stop + 1; /* # of samples required by xcorr			      comp. per fr. */  maxpeaks = 2 + (nlags/2);	/* maximum number of "peaks" findable in ccf */  ln2 = log(2.0);  size_frame_hist = (int) (DP_HIST / frame_int);  size_frame_out = (int) (DP_LIMIT / frame_int);/* * SET UP THE D.P. WEIGHTING FACTORS: *      The intent is to make the effectiveness of the various fudge factors *      independent of frame rate or sampling frequency.                 */    /* Lag-dependent weighting factor to emphasize early peaks (higher freqs)*/  lagwt = par->lag_weight/stop;    /* Penalty for a frequency skip in F0 per frame */  freqwt = par->freq_weight/frame_int;    i = (int) (READ_SIZE *freq);  if(ncomp >= step) nframes = ((i-ncomp)/step ) + 1;  else nframes = i / step;  /* *buffsize is the number of samples needed to make F0 computation     of nframes DP frames possible.  The last DP frame is patched with     enough points so that F0 computation on it can be carried.  F0     computaion on each frame needs enough points to do     1) xcross or cross correlation measure:           enough points to do xcross - ncomp     2) stationarity measure:           enough to make 30 msec windowing possible - ind     3) downsampling:           enough to make filtering possible -- downpatch      So there are nframes whole DP frames, padded with pad points     to make the last frame F0 computation ok.  */  /* last point in data frame needs points of 1/2 downsampler filter length      long, 0.005 is the filter length used in downsampler */  downpatch = (((int) (freq * 0.005))+1) / 2;  stat_wsize = (int) (STAT_WSIZE * freq);  agap = (int) (STAT_AINT * freq);  ind = ( agap - stat_wsize ) / 2;  i = stat_wsize + ind;  pad = downpatch + ((i>ncomp) ? i:ncomp);  *buffsize = nframes * step + pad;  *sdstep = nframes * step;    /* Allocate space for the DP storage circularly linked data structure */  size_cir_buffer = (int) (DP_CIRCULAR / frame_int);  /* creating circularly linked data structures */  tailF = alloc_frame(nlags, par->n_cands);  headF = tailF;  /* link them up */  for(i=1; i<size_cir_buffer; i++){    headF->next = alloc_frame(nlags, par->n_cands);    headF->next->prev = headF;    headF = headF->next;  }  headF->next = tailF;  tailF->prev = headF;  headF = tailF;  /* Allocate sscratch array to use during backtrack convergence test. */  if( ! pcands ) {    pcands = (int *) malloc( par->n_cands * sizeof(int));    spsassert(pcands,"can't allocate pathcands");  }  /* Allocate arrays to return F0 and related signals. */  /* Note: remember to compare *vecsize with size_frame_out, because     size_cir_buffer is not constant */  output_buf_size = size_cir_buffer;  rms_speech = (float*)malloc(sizeof(float) * output_buf_size);  spsassert(rms_speech,"rms_speech malloc failed");  f0p = (float*)malloc(sizeof(float) * output_buf_size);  spsassert(f0p,"f0p malloc failed");  vuvp = (float*)malloc(sizeof(float)* output_buf_size);  spsassert(vuvp,"vuvp malloc failed");  acpkp = (float*)malloc(sizeof(float) * output_buf_size);  spsassert(acpkp,"acpkp malloc failed");  /* Allocate space for peak location and amplitude scratch arrays. */  peaks = (float*)malloc(sizeof(float) * maxpeaks);  spsassert(peaks,"peaks malloc failed");  locs = (int*)malloc(sizeof(int) * maxpeaks);  spsassert(locs, "locs malloc failed");    /* Initialise the retrieval/saving scheme of window statistic measures */  wReuse = agap / step;  if (wReuse){      windstat = (Windstat *) malloc( wReuse * sizeof(Windstat));      spsassert(windstat, "windstat malloc failed");      for(i=0; i<wReuse; i++){	  windstat[i].err = 0;	  windstat[i].rms = 0;      }  }  if(debug_level){    Fprintf(stderr, "%s: done with initialization:\n", ProgName);    Fprintf(stderr,	    " size_cir_buffer:%d  xcorr frame size:%d start lag:%d nlags:%d\n",	    size_cir_buffer, size, start, nlags);  }  num_active_frames = 0;  first_time = 1;  return(0);}  /*--------------------------------------------------------------------*/intdp_f0(fdata, buff_size, sdstep, freq,      par, f0p_pt, vuvp_pt, rms_speech_pt, acpkp_pt, vecsize, last_time)    float	*fdata;    int		buff_size, sdstep;    double	freq;    F0_params	*par;		/* analysis control parameters */    float	**f0p_pt, **vuvp_pt, **rms_speech_pt, **acpkp_pt;    int		*vecsize, last_time;{  float  maxval, engref, *sta, *rms_ratio, *dsdata, *downsample();  register float ttemp, ftemp, ft1, ferr, err, errmin;  register int  i, j, k, loc1, loc2;  int   nframes, maxloc, ncand, ncandp, minloc,        decimate, samsds;  Stat *stat = NULL;  Stat *get_stationarity();  nframes = get_Nframes((long) buff_size, pad, step); /* # of whole frames */  if(debug_level)    Fprintf(stderr,	    "%s: ******* Computing %d dp frames ******** from %d points\n",	    ProgName, nframes, buff_size);  /* Now downsample the signal for coarse peak estimates. */  decimate = freq/2000.0;	/* downsample to about 2kHz */  if (decimate <= 1)    dsdata = fdata;  else {    samsds = ((nframes-1) * step + ncomp) / decimate;    dsdata = downsample(fdata, buff_size, sdstep, freq, &samsds, decimate, 			first_time, last_time);    if (!dsdata) {      Fprintf(stderr, "%s: can't get downsampled data.\n", ProgName);      return 1;    }  }  /* Get a function of the "stationarity" of the speech signal. */  stat = get_stationarity(fdata, freq, buff_size, nframes, step, first_time);  if (!stat) {     Fprintf(stderr, "%s: can't get stationarity\n", ProgName);    return(1);  }  sta = stat->stat;  rms_ratio = stat->rms_ratio;  /***********************************************************************/  /* MAIN FUNDAMENTAL FREQUENCY ESTIMATION LOOP */  /***********************************************************************/  if(!first_time && nframes > 0) headF = headF->next;  for(i = 0; i < nframes; i++) {     /* NOTE: This buffer growth provision is probably not necessary.       It was put in (with errors) by Derek Lin and apparently never       tested.  My tests and analysis suggest it is completely       superfluous. DT 9/5/96 */    /* Dynamically allocating more space for the circular buffer */    if(headF == tailF->prev){      Frame *frm;      if(cir_buff_growth_count > 5){	Fprintf(stderr,		"%s: too many requests (%d) for dynamically allocating space.\n   There may be a problem in finding converged path.\n",		ProgName, cir_buff_growth_count);	return(1);      }      if(debug_level) 	Fprintf(stderr, "%s: allocating %d more frames for DP circ. buffer.\n",		ProgName, size_cir_buffer);      frm = alloc_frame(nlags, par->n_cands);      headF->next = frm;      frm->prev = headF;      for(k=1; k<size_cir_buffer; k++){	frm->next = alloc_frame(nlags, par->n_cands);	frm->next->prev = frm;	frm = frm->next;      }      frm->next = tailF;      tailF->prev = frm;      cir_buff_growth_count++;    }    headF->rms = stat->rms[i];    get_fast_cands(fdata, dsdata, i, step, size, decimate, start,		   nlags, &engref, &maxloc,		   &maxval, headF->cp, peaks, locs, &ncand, par);        /*    Move the peak value and location arrays into the dp structure */    {      register float *ftp1, *ftp2;      register short *sp1;      register int *sp2;            for(ftp1 = headF->dp->pvals, ftp2 = peaks,	  sp1 = headF->dp->locs, sp2 = locs, j=ncand; j--; ) {	*ftp1++ = *ftp2++;	*sp1++ = *sp2++;      }      *sp1 = -1;		/* distinguish the UNVOICED candidate */      *ftp1 = maxval;      headF->dp->mpvals[ncand] = vbias+maxval; /* (high cost if cor. is high)*/    }    /* Apply a lag-dependent weight to the peaks to encourage the selection       of the first major peak.  Translate the modified peak values into       costs (high peak ==> low cost). */    for(j=0; j < ncand; j++){      ftemp = 1.0 - ((float)locs[j] * lagwt);      headF->dp->mpvals[j] = 1.0 - (peaks[j] * ftemp);    }    ncand++;			/* include the unvoiced candidate */    headF->dp->ncands = ncand;    /*********************************************************************/    /*    COMPUTE THE DISTANCE MEASURES AND ACCUMULATE THE COSTS.       */    /*********************************************************************/    ncandp = headF->prev->dp->ncands;    for(k=0; k<ncand; k++){	/* for each of the current candidates... */      minloc = 0;      errmin = FLT_MAX;      if((loc2 = headF->dp->locs[k]) > 0) { /* current cand. is voiced */	for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */	  /*    Get cost due to inter-frame period change. */	  loc1 = headF->prev->dp->locs[j];	  if (loc1 > 0) { /* prev. was voiced */	    ftemp = log(((double) loc2) / loc1);	    ttemp = fabs(ftemp);	    ft1 = fdouble + fabs(ftemp + ln2);	    if (ttemp > ft1)	      ttemp = ft1;	    ft1 = fdouble + fabs(ftemp - ln2);	    if (ttemp > ft1)	      ttemp = ft1;	    ferr = ttemp * freqwt;	  } else {		/* prev. was unvoiced */	    ferr = tcost + (tfact_s * sta[i]) + (tfact_a / rms_ratio[i]);	  }	  /*    Add in cumulative cost associated with previous peak. */	  err = ferr + headF->prev->dp->dpvals[j];	  if(err < errmin){	/* find min. cost */	    errmin = err;	    minloc = j;	  }	}      } else {			/* this is the unvoiced candidate */	for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */	  	  /*    Get voicing transition cost. */	  if (headF->prev->dp->locs[j] > 0) { /* previous was voiced */	    ferr = tcost + (tfact_s * sta[i]) + (tfact_a * rms_ratio[i]);	  }	  else	    ferr = 0.0;	  /*    Add in cumulative cost associated with previous peak. */	  err = ferr + headF->prev->dp->dpvals[j];	  if(err < errmin){	/* find min. cost */	    errmin = err;	    minloc = j;	  }	}      }
dp_f0.c - 源码说明

本页面展示了「speech signal process tools」中的 dp_f0.c 源码文件，采用 C语言编程语言编写，共 919 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与process相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?