📄 voice.c
字号:
/*************************************************************************** ** VOICIN Version 52****************************************************************************** Voicing Detection (VOICIN) makes voicing decisions for each half* frame of input speech. Tentative voicing decisions are made two frames* in the future (2F) for each half frame. These decisions are carried* through one frame in the future (1F) to the present (P) frame where* they are examined and smoothed, resulting in the final voicing* decisions for each half frame. * The voicing parameter (signal measurement) column vector (VALUE)* is based on a rectangular window of speech samples determined by the* window placement algorithm. The voicing parameter vector contains the* AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy* measures, reflection coefficients, and prediction gains. The voicing* window is placed to avoid contamination of the voicing parameter vector* with speech onsets. * The input signal is then classified as unvoiced (including* silence) or voiced. This decision is made by a linear discriminant* function consisting of a dot product of the voicing decision* coefficient (VDC) row vector with the measurement column vector* (VALUE). The VDC vector is 2-dimensional, each row vector is optimized* for a particular signal-to-noise ratio (SNR). So, before the dot* product is performed, the SNR is estimated to select the appropriate* VDC vector. * The smoothing algorithm is a modified median smoother. The* voicing discriminant function is used by the smoother to determine how* strongly voiced or unvoiced a signal is. The smoothing is further* modified if a speech onset and a voicing decision transition occur* within one half frame. In this case, the voicing decision transition* is extended to the speech onset. For transmission purposes, there are* constraints on the duration and transition of voicing decisions. The* smoother takes these constraints into account. * Finally, the energy estimates are updated along with the dither* threshold used to calculate the zero crossing rate (ZC).** Inputs:* VWIN - Voicing window limits* INBUF - Input speech buffer* LPBUF - Low-pass filtered speech buffer* BUFLIM - INBUF and LPBUF limits* HALF - Present analysis half frame number* MINAMD - Minimum value of the AMDF* MAXAMD - Maximum value of the AMDF* MINTAU - Pointer to the lag of the minimum AMDF value* IVRC(2) - Inverse filter's RC's* OBOUND - Onset boundary descriptions* AF - The analysis frame number* Output:* VOIBUF(2,0:AF) - Buffer of voicing decisions* Internal:* QS - Ratio of preemphasized to full-band energies* RC1 - First reflection coefficient* AR_B - Product of the causal forward and reverse pitch prediction gains* AR_F - Product of the noncausal forward and rev. pitch prediction gains* ZC - Zero crossing rate* DITHER - Zero crossing threshold level* MAXMIN - AMDF's 1 octave windowed maximum-to-minimum ratio* MINPTR - Location of minimum AMDF value* NVDC - Number of elements in each VDC vector* NVDCL - Number of VDC vectors* VDCL - SNR values corresponding to the set of VDC's* VDC - 2-D voicing decision coefficient vector* VALUE(9) - Voicing Parameters* VOICE(2,3)- History of LDA results* LBE - Ratio of low-band instantaneous to average energies* FBE - Ratio of full-band instantaneous to average energies* LBVE - Low band voiced energy* LBUE - Low band unvoiced energy* FBVE - Full band voiced energy* FBUE - Full band unvoiced energy* OFBUE - Previous full-band unvoiced energy* OLBUE - Previous low-band unvoiced energy* REF - Reference energy for initialization and DITHER threshold* SNR - Estimate of signal-to-noise ratio* SNR2 - Estimate of low-band signal-to-noise ratio* SNRL - SNR level number* OT - Onset transition present* VSTATE - Decimal interpretation of binary voicing classifications* FIRST - First call flag*/#include <stdio.h>#include "vcomm.ch"#include "contrl.ch"#include "lpcdefs.h"#include <math.h>voicin( vwin, inbuf, lpbuf, half, minamd, maxamd, mintau, ivrc, obound, voibuf)int vwin[2][AF], half, mintau;float minamd, maxamd, ivrc[2];float *inbuf;float *lpbuf;int *obound, voibuf[2][AF+1];{int zc, lbe, fbe;int i, snrl;static int vstate=0;static float dither=20;static float snr;float snr2;static float maxmin;float qs, rc1, ar_b;float ar_f;static float voice[2][3];float value[9];short ot=0;/* Declare and initialize filters: */static int lbve, lbue, fbve, fbue, ofbue, olbue;static int sfbue, slbue=0;int ref= 3000;static short first=1;if (first) { lbve = ref; fbve = ref; fbue = ref/16; ofbue = ref/16; lbue = ref/32; olbue = ref/32; snr = 64*(fbve/fbue); first = 0; vdcl[0] = 600; vdcl[1] = 450; vdcl[2] = 300; vdcl[3] = 200; vdcl[4] = 6*0; for(i=0;i<3;i++) { voice[1][i] = 0.0; voice[0][i] = 0.0; } }/* The VOICE array contains the result of the linear discriminant function * (analog values). The VOIBUF array contains the hard-limited binary * voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN * memory allocation, are addressed as:** (half-frame number, future-frame number)** | Past | Present | Future1 | Future2 |* | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 | ---> time** Update linear discriminant function history each frame: */if (half == 1) { voice[0][0]=voice[0][1]; voice[1][0]=voice[1][1]; voice[0][1]=voice[0][2]; voice[1][1]=voice[1][2]; maxmin = maxamd/mmax(minamd,1.);}/* Calculate voicing parameters twice per frame: */vparms( vwin, inbuf, lpbuf, half, &dither, mintau, &zc, &lbe, &fbe, &qs, &rc1, &ar_b, &ar_f );/* Estimate signal-to-noise ratio to select the appropriate VDC vector.* The SNR is estimated as the running average of the ratio of the* running average full-band voiced energy to the running average* full-band unvoiced energy. SNR filter has gain of 63. */snr = nint( 63*( snr + fbve/(float)(mmax(fbue,1)) )/64.);snr2 = (snr*fbue)/mmax(lbue,1);/* Quantize SNR to SNRL according to VDCL thresholds.*//*DO SNRL = 1, NVDCL-1 */for (snrl=1;snrl<nvdcl;snrl++) { if (snr2 > vdcl[snrl-1]) break;}/* (Note: SNRL = NVDCL Here) *//* Linear discriminant voicing parameters: */value[0] = maxmin;value[1] = (float)(lbe)/mmax(lbve,1);value[2] = zc;value[3] = rc1;value[4] = qs;value[5] = ivrc[2];value[6] = ar_b;value[7] = ar_f;value[8] = 0.0;/* Evaluation of linear discriminant function: */voice[half-1][2] = vdc[9][snrl-1];for(i=1;i<10;i++) { voice[half-1][2] += vdc[i-1][snrl-1]*value[i-1];}/* Classify as voiced if discriminant > 0, otherwise unvoiced* Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */if (voice[half-1][2] > 0.0) voibuf[half-1][3]=1;else voibuf[half-1][3]=0;/* Skip voicing decision smoothing in first half-frame: */if (half != 1) {/* Voicing decision smoothing rules (override of linear combination):** Unvoiced half-frames: At least two in a row.* --------------------** Voiced half-frames: At least two in a row in one frame.* ------------------- Otherwise at least three in a row.* (Due to the way transition frames are encoded)** In many cases, the discriminant function determines how to smooth.* In the following chart, the decisions marked with a * may be overridden.** Voicing override of transitions at onsets:* If a V/UV or UV/V voicing decision transition occurs within one-half* frame of an onset bounding a voicing window, then the transition is* moved to occur at the onset.** P 1F* ----- -----* 0 0 0 0* 0 0 0* 1 (If there is an onset there)* 0 0 1* 0* (Based on 2F and discriminant distance)* 0 0 1 1* 0 1* 0 0 (Always)* 0 1* 0* 1 (Based on discriminant distance)* 0* 1 1 0* (Based on past, 2F, and discriminant distance)* 0 1* 1 1 (If there is an onset there)* 1 0* 0 0 (If there is an onset there)* 1 0 0 1* 1 0* 1* 0 (Based on discriminant distance)* 1 0* 1 1 (Always)* 1 1 0 0* 1 1 0* 1* (Based on 2F and discriminant distance)* 1 1 1* 0 (If there is an onset there)* 1 1 1 1** Determine if there is an onset transition between P and 1F.* OT (Onset Transition) is true if there is an onset between * P and 1F but not after 1F.*//*OT = (AND(OBOUND(1), 2) .NE. 0 .OR. OBOUND(2) .EQ. 1) .AND. AND(OBOUND(3), 1) .EQ. 0 */ot = ((obound[1] & 2) != 0 || obound[2] == 1) && (obound[3] & 1) == 0;/* Multi-way dispatch on voicing decision history: */vstate = voibuf[0][1]*8 + voibuf[1][1]*4 + voibuf[0][2]*2 + voibuf[1][2];/* GOTO (99,1,2,99,4,5,6,7,8,99,10,11,99,13,14,99) VSTATE+1 *//*if(count==9) printf("vstate = %d\n",vstate);*/switch(vstate+1) { case 1: break; case 2: if (ot && voibuf[0][3] == 1) voibuf[0][2] = 1; break; case 3: if (voibuf[0][3] == 0 || voice[0][1] < -voice[1][1]) voibuf[0][2] = 0; else voibuf[1][2] = 1; break; case 4: break; case 5: voibuf[1][1] = 0; break; case 6: if (voice[1][0] < -voice[0][1]) voibuf[1][1] = 0; else voibuf[0][2] = 1; break; case 7: /* VOIBUF(2,0) must be 0 */ if (voibuf[0][0] == 1 || voibuf[0][3] == 1 || voice[1][1] > voice[0][0]) voibuf[1][2] = 1; else voibuf[0][1] = 1; break; case 8: if (ot) voibuf[1][1] = 0; break; case 9: if (ot) voibuf[1][1] = 1; break; case 10: break; case 11: if (voice[0][1] < -voice[1][0]) voibuf[0][2] = 0; else voibuf[1][1] = 1; break; case 12: voibuf[1][1] = 1; break; case 13: break; case 14: if ((voibuf[0][3] == 0) && (voice[1][1] < -voice[0][1]) ) voibuf[1][2] = 0; else voibuf[0][2] = 1; break; case 15: if (ot && voibuf[0][3] == 0) voibuf[0][2] = 0; break; default: break;}} /* (99)*//* Now update parameters:* ----------------------** During unvoiced half-frames, update the low band and full band unvoiced* energy estimates (LBUE and FBUE) and also the zero crossing* threshold (DITHER). (The input to the unvoiced energy filters is* restricted to be less than 10dB above the previous inputs of the* filters.)* During voiced half-frames, update the low-pass (LBVE) and all-pass * (FBVE) voiced energy estimates. */if (voibuf[half-1][3] == 0) { sfbue = nint(( 63*sfbue + 8*mmin(fbe,3*ofbue) )/64.); fbue = sfbue/8; ofbue = fbe; slbue = nint(( 63*slbue + 8*mmin(lbe,3*olbue) )/64.); lbue = slbue/8; olbue = lbe;}else{ lbve = nint(( 63*lbve + lbe )/64.); fbve = nint(( 63*fbve + fbe )/64.);}/* Set dither threshold to yield proper zero crossing rates in the* presence of low frequency noise and low level signal input.* NOTE: The divisor is a function of REF, the expected energies. */dither = mmin(mmax( 64*sqrt((float)(lbue*lbve)) / ref,1.),20.);/* Voicing decisions are returned in VOIBUF. */}#ifdef _TMS320C30int nint(anum)int anum;{ return(round(anum));}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -