dp_f0.c
来自「speech signal process tools」· C语言 代码 · 共 919 行 · 第 1/2 页
C
919 行
/*
 * This material contains unpublished, proprietary software of
 * Entropic Research Laboratory, Inc. Any reproduction, distribution,
 * or publication of this work must be authorized in writing by Entropic
 * Research Laboratory, Inc., and must bear the notice:
 *
 *    "Copyright (c) 1990-1996 Entropic Research Laboratory, Inc.
 *                   All rights reserved"
 *
 * The copyright notice above does not evidence any actual or intended
 * publication of this source code.
 *
 * Written by:  David Talkin
 * Checked by:
 * Revised by:  Derek Lin, David Talkin
 *
 * Brief description:  Estimate speech fundamental frequency.
 *
 */

/* Source-control identification string for this file. */
static char *sccs_id = "@(#)dp_f0.c 1.14 10/21/96 ERL";

/* A fundamental frequency estimation algorithm using the normalized
   cross correlation function and dynamic programming.  The algorithm
   implemented here is similar to that presented by B. Secrest and
   G. Doddington, "An integrated pitch tracking algorithm for speech
   systems", Proc. ICASSP-83, pp.1352-1355.  It is fully described
   by D. Talkin, "A robust algorithm for pitch tracking (RAPT)", in
   W. B. Kleijn & K. K. Paliwal (eds.) Speech Coding and Synthesis,
   (New York: Elsevier, 1995). */

/* For each frame, up to par->n_cands cross correlation peaks are
   considered as F0 intervals.  Each is scored according to its within-
   frame properties (relative amplitude, relative location), and
   according to its connectivity with each of the candidates in the
   previous frame.  An unvoiced hypothesis is also generated at each
   frame and is considered in the light of voicing state change cost,
   the quality of the cross correlation peak, and frequency continuity. */

/* At each frame, each candidate has associated with it the following
   items:
	its peak value
	its peak value modified by its within-frame properties
	its location
	the candidate # in the previous frame yielding the min. err.
		(this is the optimum path pointer!)
	its cumulative cost: (local cost + connectivity cost +
		cumulative cost of its best-previous-frame-match).
*/

/* Dynamic programming is then used to pick the best F0 trajectory and
   voicing state given the local and transition costs for the entire
   utterance. */

/* To avoid the necessity of computing the full crosscorrelation at
   the input sample rate, the signal is downsampled; a full ccf is
   computed at the lower frequency; interpolation is used to estimate the
   location of the peaks at the higher sample rate; and the fine-grained
   ccf is computed only in the vicinity of these estimated peak
   locations. */

#include <math.h>
#include <malloc.h>
#include <esps/esps.h>
#include "f0.h"
#include "f0_structs.h"

/* Defined outside this file; used here for diagnostics on stderr. */
extern int debug_level;
extern char *ProgName;

/*
 * READ_SIZE: length of input data frame in sec to read
 * DP_CIRCULAR: determines the initial size of DP circular buffer in sec
 * DP_HIST: stored frame history in seconds before checking for common path
 *      DP_CIRCULAR > READ_SIZE, DP_CIRCULAR at least 2 times of DP_HIST
 * DP_LIMIT: in case no convergence is found, DP frames of DP_LIMIT secs
 *      are kept before output is forced by simply picking the lowest cost
 *      path
 */
#define READ_SIZE 0.2
#define DP_CIRCULAR 1.5
#define DP_HIST 0.5
#define DP_LIMIT 1.0

/*
 * stationarity parameters -
 * STAT_WSIZE: window size in sec used in measuring frame energy/stationarity
 * STAT_AINT: analysis interval in sec in measuring frame energy/stationarity
 */
#define STAT_WSIZE 0.030
#define STAT_AINT 0.020

/*
 * headF points to current frame in the circular buffer,
 * tailF points to the frame where tracks start
 * cmpthF points to starting frame of converged path to backtrack
 */
static Frame *headF = NULL, *tailF = NULL, *cmpthF = NULL;

static int *pcands = NULL;	/* array for backtracking in convergence check */
static int cir_buff_growth_count = 0;

static int size_cir_buffer,	/* # of frames in circular DP buffer */
	   size_frame_hist,	/* # of frames required before convergence test */
	   size_frame_out,	/* # of frames before forcing output */
	   num_active_frames,	/* # of frames from tailF to headF */
	   output_buf_size;	/* # of
frames allocated to output buffers *//* * DP parameters */static float tcost, tfact_a, tfact_s, frame_int, vbias, fdouble, wdur, ln2, freqwt, lagwt;static int step, size, nlags, start, stop, ncomp, *locs = NULL;static short maxpeaks;static int wReuse = 0; /* number of windows seen before resued */static Windstat *windstat;static float *f0p = NULL, *vuvp = NULL, *rms_speech = NULL, *acpkp = NULL, *peaks = NULL;static int first_time = 1, pad;/*--------------------------------------------------------------------*/intget_Nframes(buffsize, pad, step) long buffsize; int pad, step;{ if (buffsize < pad) return (0); else return ((buffsize - pad)/step);}/*--------------------------------------------------------------------*/intinit_dp_f0(freq, par, buffsize, sdstep) double freq; F0_params *par; long *buffsize, *sdstep;{ int nframes; int i; int stat_wsize, agap, ind, downpatch;/* * reassigning some constants */ tcost = par->trans_cost; tfact_a = par->trans_amp; tfact_s = par->trans_spec; vbias = par->voice_bias; fdouble = par->double_cost; frame_int = par->frame_step; step = round(frame_int * freq); size = round(par->wind_dur * freq); frame_int = ((float)step)/freq; wdur = ((float)size)/freq; start = round(freq / par->max_f0); stop = round(freq / par->min_f0); nlags = stop - start + 1; ncomp = size + stop + 1; /* # of samples required by xcorr comp. per fr. */ maxpeaks = 2 + (nlags/2); /* maximum number of "peaks" findable in ccf */ ln2 = log(2.0); size_frame_hist = (int) (DP_HIST / frame_int); size_frame_out = (int) (DP_LIMIT / frame_int);/* * SET UP THE D.P. WEIGHTING FACTORS: * The intent is to make the effectiveness of the various fudge factors * independent of frame rate or sampling frequency. 
*/ /* Lag-dependent weighting factor to emphasize early peaks (higher freqs)*/ lagwt = par->lag_weight/stop; /* Penalty for a frequency skip in F0 per frame */ freqwt = par->freq_weight/frame_int; i = (int) (READ_SIZE *freq); if(ncomp >= step) nframes = ((i-ncomp)/step ) + 1; else nframes = i / step; /* *buffsize is the number of samples needed to make F0 computation of nframes DP frames possible. The last DP frame is patched with enough points so that F0 computation on it can be carried. F0 computaion on each frame needs enough points to do 1) xcross or cross correlation measure: enough points to do xcross - ncomp 2) stationarity measure: enough to make 30 msec windowing possible - ind 3) downsampling: enough to make filtering possible -- downpatch So there are nframes whole DP frames, padded with pad points to make the last frame F0 computation ok. */ /* last point in data frame needs points of 1/2 downsampler filter length long, 0.005 is the filter length used in downsampler */ downpatch = (((int) (freq * 0.005))+1) / 2; stat_wsize = (int) (STAT_WSIZE * freq); agap = (int) (STAT_AINT * freq); ind = ( agap - stat_wsize ) / 2; i = stat_wsize + ind; pad = downpatch + ((i>ncomp) ? i:ncomp); *buffsize = nframes * step + pad; *sdstep = nframes * step; /* Allocate space for the DP storage circularly linked data structure */ size_cir_buffer = (int) (DP_CIRCULAR / frame_int); /* creating circularly linked data structures */ tailF = alloc_frame(nlags, par->n_cands); headF = tailF; /* link them up */ for(i=1; i<size_cir_buffer; i++){ headF->next = alloc_frame(nlags, par->n_cands); headF->next->prev = headF; headF = headF->next; } headF->next = tailF; tailF->prev = headF; headF = tailF; /* Allocate sscratch array to use during backtrack convergence test. */ if( ! pcands ) { pcands = (int *) malloc( par->n_cands * sizeof(int)); spsassert(pcands,"can't allocate pathcands"); } /* Allocate arrays to return F0 and related signals. 
*/ /* Note: remember to compare *vecsize with size_frame_out, because size_cir_buffer is not constant */ output_buf_size = size_cir_buffer; rms_speech = (float*)malloc(sizeof(float) * output_buf_size); spsassert(rms_speech,"rms_speech malloc failed"); f0p = (float*)malloc(sizeof(float) * output_buf_size); spsassert(f0p,"f0p malloc failed"); vuvp = (float*)malloc(sizeof(float)* output_buf_size); spsassert(vuvp,"vuvp malloc failed"); acpkp = (float*)malloc(sizeof(float) * output_buf_size); spsassert(acpkp,"acpkp malloc failed"); /* Allocate space for peak location and amplitude scratch arrays. */ peaks = (float*)malloc(sizeof(float) * maxpeaks); spsassert(peaks,"peaks malloc failed"); locs = (int*)malloc(sizeof(int) * maxpeaks); spsassert(locs, "locs malloc failed"); /* Initialise the retrieval/saving scheme of window statistic measures */ wReuse = agap / step; if (wReuse){ windstat = (Windstat *) malloc( wReuse * sizeof(Windstat)); spsassert(windstat, "windstat malloc failed"); for(i=0; i<wReuse; i++){ windstat[i].err = 0; windstat[i].rms = 0; } } if(debug_level){ Fprintf(stderr, "%s: done with initialization:\n", ProgName); Fprintf(stderr, " size_cir_buffer:%d xcorr frame size:%d start lag:%d nlags:%d\n", size_cir_buffer, size, start, nlags); } num_active_frames = 0; first_time = 1; return(0);} /*--------------------------------------------------------------------*/intdp_f0(fdata, buff_size, sdstep, freq, par, f0p_pt, vuvp_pt, rms_speech_pt, acpkp_pt, vecsize, last_time) float *fdata; int buff_size, sdstep; double freq; F0_params *par; /* analysis control parameters */ float **f0p_pt, **vuvp_pt, **rms_speech_pt, **acpkp_pt; int *vecsize, last_time;{ float maxval, engref, *sta, *rms_ratio, *dsdata, *downsample(); register float ttemp, ftemp, ft1, ferr, err, errmin; register int i, j, k, loc1, loc2; int nframes, maxloc, ncand, ncandp, minloc, decimate, samsds; Stat *stat = NULL; Stat *get_stationarity(); nframes = get_Nframes((long) buff_size, pad, step); /* # of 
whole frames */ if(debug_level) Fprintf(stderr, "%s: ******* Computing %d dp frames ******** from %d points\n", ProgName, nframes, buff_size); /* Now downsample the signal for coarse peak estimates. */ decimate = freq/2000.0; /* downsample to about 2kHz */ if (decimate <= 1) dsdata = fdata; else { samsds = ((nframes-1) * step + ncomp) / decimate; dsdata = downsample(fdata, buff_size, sdstep, freq, &samsds, decimate, first_time, last_time); if (!dsdata) { Fprintf(stderr, "%s: can't get downsampled data.\n", ProgName); return 1; } } /* Get a function of the "stationarity" of the speech signal. */ stat = get_stationarity(fdata, freq, buff_size, nframes, step, first_time); if (!stat) { Fprintf(stderr, "%s: can't get stationarity\n", ProgName); return(1); } sta = stat->stat; rms_ratio = stat->rms_ratio; /***********************************************************************/ /* MAIN FUNDAMENTAL FREQUENCY ESTIMATION LOOP */ /***********************************************************************/ if(!first_time && nframes > 0) headF = headF->next; for(i = 0; i < nframes; i++) { /* NOTE: This buffer growth provision is probably not necessary. It was put in (with errors) by Derek Lin and apparently never tested. My tests and analysis suggest it is completely superfluous. DT 9/5/96 */ /* Dynamically allocating more space for the circular buffer */ if(headF == tailF->prev){ Frame *frm; if(cir_buff_growth_count > 5){ Fprintf(stderr, "%s: too many requests (%d) for dynamically allocating space.\n There may be a problem in finding converged path.\n", ProgName, cir_buff_growth_count); return(1); } if(debug_level) Fprintf(stderr, "%s: allocating %d more frames for DP circ. 
buffer.\n", ProgName, size_cir_buffer); frm = alloc_frame(nlags, par->n_cands); headF->next = frm; frm->prev = headF; for(k=1; k<size_cir_buffer; k++){ frm->next = alloc_frame(nlags, par->n_cands); frm->next->prev = frm; frm = frm->next; } frm->next = tailF; tailF->prev = frm; cir_buff_growth_count++; } headF->rms = stat->rms[i]; get_fast_cands(fdata, dsdata, i, step, size, decimate, start, nlags, &engref, &maxloc, &maxval, headF->cp, peaks, locs, &ncand, par); /* Move the peak value and location arrays into the dp structure */ { register float *ftp1, *ftp2; register short *sp1; register int *sp2; for(ftp1 = headF->dp->pvals, ftp2 = peaks, sp1 = headF->dp->locs, sp2 = locs, j=ncand; j--; ) { *ftp1++ = *ftp2++; *sp1++ = *sp2++; } *sp1 = -1; /* distinguish the UNVOICED candidate */ *ftp1 = maxval; headF->dp->mpvals[ncand] = vbias+maxval; /* (high cost if cor. is high)*/ } /* Apply a lag-dependent weight to the peaks to encourage the selection of the first major peak. Translate the modified peak values into costs (high peak ==> low cost). */ for(j=0; j < ncand; j++){ ftemp = 1.0 - ((float)locs[j] * lagwt); headF->dp->mpvals[j] = 1.0 - (peaks[j] * ftemp); } ncand++; /* include the unvoiced candidate */ headF->dp->ncands = ncand; /*********************************************************************/ /* COMPUTE THE DISTANCE MEASURES AND ACCUMULATE THE COSTS. */ /*********************************************************************/ ncandp = headF->prev->dp->ncands; for(k=0; k<ncand; k++){ /* for each of the current candidates... */ minloc = 0; errmin = FLT_MAX; if((loc2 = headF->dp->locs[k]) > 0) { /* current cand. is voiced */ for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */ /* Get cost due to inter-frame period change. */ loc1 = headF->prev->dp->locs[j]; if (loc1 > 0) { /* prev. 
was voiced */ ftemp = log(((double) loc2) / loc1); ttemp = fabs(ftemp); ft1 = fdouble + fabs(ftemp + ln2); if (ttemp > ft1) ttemp = ft1; ft1 = fdouble + fabs(ftemp - ln2); if (ttemp > ft1) ttemp = ft1; ferr = ttemp * freqwt; } else { /* prev. was unvoiced */ ferr = tcost + (tfact_s * sta[i]) + (tfact_a / rms_ratio[i]); } /* Add in cumulative cost associated with previous peak. */ err = ferr + headF->prev->dp->dpvals[j]; if(err < errmin){ /* find min. cost */ errmin = err; minloc = j; } } } else { /* this is the unvoiced candidate */ for(j=0; j<ncandp; j++){ /* for each PREVIOUS candidate... */ /* Get voicing transition cost. */ if (headF->prev->dp->locs[j] > 0) { /* previous was voiced */ ferr = tcost + (tfact_s * sta[i]) + (tfact_a * rms_ratio[i]); } else ferr = 0.0; /* Add in cumulative cost associated with previous peak. */ err = ferr + headF->prev->dp->dpvals[j]; if(err < errmin){ /* find min. cost */ errmin = err; minloc = j; } } }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?