📄 g723_codec.c
字号:
frame to the next. Large inter-frame changes (>3 quantizer indices) imply significant changes in background noise energy; the CNG module responds by inserting a SID frame; small inter-frame changes allow the current frame to remain a candidate for non-transmission, provided the inter-frame spectral changes are also small. */ pDstEncoderState->dtxState.qSidGainIndex = 0; /* Initialize the SID prediction residual energy history, the elements of which are interpreted as estimates of the energy associated with each CNG frame excitation. Depending on the number of Up to 3 subframes are considered for any given average computation (see below). */ pDstEncoderState->dtxState.residualEnergy[0] = 0; pDstEncoderState->dtxState.residualEnergy[1] = 0; pDstEncoderState->dtxState.residualEnergy[2] = 0; /* Initialize silence frame counter; this counter, incremented up to a maximum of value of three, tracks the number of consecutive silence frames following a speech interval. The counter is set to 1 at the start of an "inactive zone", i.e., during the first silence frame following a period of speech activity, and is incremented during each consecutive SID or non-TX frame up to a maximum value of 3. This counter is used to determine the number of frame energy estimates over which to compute an average during SID gain quantization. */ pDstEncoderState->dtxState.sumFrame = 0; return(1);} /* EncoderInit_G723 *//******************************************************************************************//// Name:// Encode_G723_16s8u//// Description:// Compress one 30 ms input speech frame (240 samples, 16-bit // linear PCM) into a 189/158/32/8-bit compressed bitstream. // Represent the compressed bitstream using 24/20/4/1 bytes, in accordance // with ITU Recommendations G.723.1, and G.723.1 Annex A//// Rates and stream size with VAD disabled (G.723.1):// - 5.3 kbps bit rate generates a 160-bit/20 byte output bitstream// - 6.3 kbps bit rate generates a 192-bit/24 byte output bitstream//// Rates and stream size with VAD enabled (G.723.1 Annex A):// - SID (silence interval description) generates a 32-bit/4 byte output bitstream// - NONTX (untransmitted VAD==0) generates a 8-bit/1 byte output bitstream//// Input Arguments: // pSrcSpeech - Pointer to input speech vector, of length 240 samples// bitRate - Bit rate specifier, IPP_SPCHBR_xx, where xx=53 or 63// denotes, respectively, 5.3 and 6.3 kbps// enableVad - VAD/DTX control (Annex A); 1=enable, 0=disable// enableHighpassFilter - Preprocessor HPF control; 1=enable, 0=disable; switch// provided for test vector compliance verificaiton// pEncoderState - Pointer to encoder state//// Output Arguments: // pDstBitstream - Compressed bitstream pointer, of length 20/24 bytes// pEncoderState - Pointer to updated encoder state//// Returns: // 1 - No Error.// 0 - Bad Arguments.//********************************************************************************************/IppStatus Encode_G723_16s8u(IppPcmStream *pSrcSpeech, IppBitstream *pDstBitstream, IppSpchBitRate bitRate, int enableVad, int enableHighpassFilter, IppG723EncoderState *pEncoderState){ int i,j; /* general purpose loop indices */ Ipp16s SpchAnalysisBuf[LPCWIN+FLEN-SFLEN]; /* multi-purpose speech and excitation analysis buffer */ Ipp16s PrcptWghtSpchBuf[FLEN+MAXLAG]; /* perceptually weighted speech buffer */ Ipp16s OLPSAnalysisBuf[FLEN+MAXLAG]; /* open-loop pitch search analysis buffer */ Ipp16s autoCorrelation[(LPC1+1)*SFNUM]; /* subframe autocorrelation vectors (4xLPC1) for AC summation computation */ Ipp16s Lpc[SFNUM][LPC]; /* LPC coefficient vector */ Ipp16s Lsf[LPC], InterpLsf[LPC]; /* LSF and interpolated LSF vectors */ Ipp32s QLsfIndex; /* LSF quantization index */ Ipp16s InvalidFrame; /* LSF quantization flag */ Ipp16s QLpc[SFNUM][LPC]; /* quantized interpolated LPC vectors */ Ipp16s PerceptLpc[SFNUM<<1][LPC]; /* perceptually weighted LPC vector */ Ipp16s OpenLoopPitchLag[SFNUM>>1]; /* open-loop pitch estimates */ Ipp16s HNSLag[SFNUM]; /* harmonic noise shaping lags */ Ipp16s HNSGain[SFNUM]; /* harmonic noise shaping gains */ Ipp16s ImpulseResp[SFLEN]; /* combined filter impulse responses */ Ipp16s AdaptGainIndex[SFNUM]; /* adaptive codebook gain indices */ Ipp16s EstimatedPitchLag[SFNUM]; /* closed-loop pitch search neighboorhood (OLPS-based) */ Ipp16s ClosedLoopPitchLagOffset[SFNUM]; /* closed-loop pitch search results - offsets to OLPS */ Ipp16s AdaptCBVect[SFLEN]; /* adaptive codebook vector */ Ipp16s FixedCBVect[SFLEN]; /* fixed codebook vector */ Ipp16s FixedCBGrid[SFNUM]; /* fixed codebook search grid */ Ipp16s DiracTrainEnable[SFNUM]; /* MP-MLQ pulse train usage flag: 0=off, 1=on */ Ipp16s ImpRespCovarMatrix[COVMATDIM]; /* Toepliz covariance matrix for the impulse response of Si(z) */ Ipp16s CrossCorrACELP[SFLEN]; /* ACELP codebook search cross correlations (target x h(n)) */ Ipp16s PulseSignsACELP[SFNUM][4]; /* unpacked ACELP codevector pulse signs */ Ipp16s PulsePosACELP[SFNUM][4]; /* unpacked ACELP codevector pulse positions */ Ipp16s FixedCBQGain[SFNUM]; /* quantized fixed codebook gains */ Ipp16s FixedCBQGainIndex[SFNUM]; /* quantized fixed codebook gain indices */ Ipp16s FixedCBPulseSign[SFNUM]; /* packed/encoded fixed CB pulse signs */ Ipp32s FixedCBPulsePos[SFNUM]; /* packed/encoded fixed CB pulse positions */ Ipp16s PitchSyncIndex, PitchSyncGain; /* ACELP pitch synchronous filter parameters */ Ipp16s ACELPInnerLoopMaxEntry; /* ACELP codebook search control parameter */ Ipp16s SubframeOffset; /* speech analysis buffer subframe base address */ Ipp16s residualEnergy; /* Levinson-Durbin residual energy parameter (unused) */ Ipp16s vad=1; /* VAD result; 1=voice present; 0=voice absent */ Ipp16s frameType = IPP_G723_FRAMETYPE_VOICE; /* frame type: 0=nonTX, 1=active speech (VAD==1), 2=SID (VAD==0) */ /* Apply highpass filter to the input speech; eliminate any DC offset. The preprocessing highpass filter is given in [1], Eq. 1, p. 3. The HPF prevents artificical increases in R(0) during autocorrelation analysis. After filtering, prepare an input speech buffer for LPC autocorrelation analysis (Levinson-Durbin). Upon return, the buffer SpchAnalysisBuf contains highpass filtered input speech suitable for autocorrelation analysis. The HPF filter memory is maintained in the encoder state variable. A switch is provided to disable the highpass filter during test vector compliance procedures as required in [1]. */ appsPreprocess_G723_I((Ipp16s *)pSrcSpeech->pBuf, SpchAnalysisBuf, enableHighpassFilter, &(pEncoderState->highpassFilterZfir), &(pEncoderState->highpassFilterZiir), pEncoderState->prevSpch); /* Perform autocorrelation analysis, estimate LPC parameters using Levinson-Durbin */ for ( i=j=0; i<SFNUM; i++, j+=(LPC1+1) ) { ippsAutoCorr_G723_16s(SpchAnalysisBuf+i*SFLEN, autoCorrelation+j+LPC1, autoCorrelation+j); ippsLevinsonDurbin_G723_16s(autoCorrelation+j, &(pEncoderState->sineDtct), &residualEnergy, Lpc[i]); } /* Compute summation autocorrelation over 4 subframes, then update summation autocorrelation history in the encoder state (4 frame history), as well as the summation autocorrelation scaling history (exp) */ appsAutoCorrSum_G723_16s(autoCorrelation, pEncoderState->frameAutoCorr, pEncoderState->frameAutoCorrExp); /* Detect voice activity using the algorithm described in [3], section A.2, pp. 2-4. Upon return from the VAD analysis, VAD==1 indicates voice present, and VAD==0 indicates voice absent */ if (enableVad) appsVAD_G723_16s(pEncoderState->sineDtct, pEncoderState->openLoopPitchLag, pEncoderState->vadLpc, SpchAnalysisBuf+(SFLEN<<1), &vad, &(pEncoderState->vadState)); /* Update sine detector */ appsSinDetect_G723_I(&(pEncoderState->sineDtct)); /* LPC quantization 1. LPC->LSF transformation on subframe 3 (last subframe) 2. Quantize LSFs on subframe 3 */ /* Convert LPCs to LSFs */ ippsLPCToLSF_G723_16s(Lpc[3], pEncoderState->prevLsf, Lsf); /* Quantize LSFs */ ippsLSFQuant_G723_16s32s(Lsf, pEncoderState->prevLsf, &QLsfIndex); /* Update speech analysis buffer */ for ( i = 0; i < LPCWIN-SFLEN; i ++ ) pEncoderState->prevSpch[i] = SpchAnalysisBuf[FLEN+i]; for ( i = 0; i < FLEN; i ++ ) SpchAnalysisBuf[i] = SpchAnalysisBuf[((LPCWIN-SFLEN)>>1)+i]; /* Construct perceptual weighting filter (Eq. 11 of [1], sec. 2.8, p. 7). Using the scaling property of the Z-transform, shift the poles and zeros radially inwards towards the center of the unit circle, to affect prediction residual (excitation) matching emphasis in the most audible regions during the codebook search procedures. After constructing the filter, apply perceptual weighting to the input speech, i.e., compute f(n) ([1], p. 7) Upon return from the PWF function, PrcptWghtSpchBuf contains the sequence f(n). */ appsPerceptualWeightingFilter_G723_16s(SpchAnalysisBuf, Lpc, PerceptLpc, PrcptWghtSpchBuf, pEncoderState->perceptualWeightFilterZfir, pEncoderState->perceptualWeightFilterZiir, pEncoderState->prevWgtSpch); /* Prepare an analysis buffer for the open loop pitch search as follows: 1. Load last MAXLAG (145) weighted samples from the previous frame into the first MAXLAG samples of the current frame's OLPS analysis buffer. 2. The remaining 240 samples of the OLPS analysis buffer were already generated by the PWF, above. 3. Identify the element of largest magnitude in the entire OLPS buffer to perform normalization 4. For VAD==1, maintain a history of the perceptually weighted speech in the encoder state to be used for the OLPS during the next frame (see analysis buffer construction) 5. For VAD==0, perceptually weighted speech history is */ appsOpenLoopPitchSearchPreprocess_G723_16s(PrcptWghtSpchBuf, OLPSAnalysisBuf, pEncoderState->prevWgtSpch); /* Perform open-loop pitch search The OLPS primitive computes the cross-correlation criterion (Eq. 12 of [1], sec. 2.9, p. 7) and performs the maximization search described in [1]. The maximizing index, j, is returned for each of two half-frames (two subframes in each half-frame). */ for ( i = 0; i < SFNUM>>1; i ++ ) { /* Perform open-loop pitch search */ ippsOpenLoopPitchSearch_G723_16s(OLPSAnalysisBuf+MAXLAG+(i*SFLEN<<1), &OpenLoopPitchLag[i]); /* Update OLPS history used in VAD processing */ pEncoderState->openLoopPitchLag[i] = pEncoderState->openLoopPitchLag[i+2]; pEncoderState->openLoopPitchLag[i+2] = OpenLoopPitchLag[i]; } /* Process VAD==0 frame; i.e., silence or voice activity absent. DTX decision processing classifies the frame using one of two categories: 1) SID (silence interval description) -- silence "reference" frames are parameterized in terms of LPCs and a gain. 2) non-transmitted silence (NonTX) -- uses previous SID parameters. */ if (vad==0) { /* Reset quantized LSF index and frame type indicator */ QLsfIndex = 0; frameType = IPP_G723_FRAMETYPE_NONTX; /* Classify the frame as SID or NonTX by analyzing input speech according to the DTX procedure described in [3], sections A3 and A4, pp. 4-9. */ appsDTXDecision_G723_16s(pEncoderState->prevDTXFrameType, pEncoderState->prevLsf, pEncoderState->frameAutoCorr, pEncoderState->frameAutoCorrExp, pEncoderState->vadLpc, FixedCBQGainIndex, &(pEncoderState->sidGain), &QLsfIndex, pEncoderState->sidLsf, &frameType, &(pEncoderState->targetExcitationGain), &(pEncoderState->vadState), &(pEncoderState->dtxState)); /* Synthesize CNG excitation using CNG procedure described in [3], section A4.5, pp. 10-11. */ appsGenerateCNGExcitation_G723_16s(pEncoderState->targetExcitationGain, pEncoderState->prevExcitation, EstimatedPitchLag, ClosedLoopPitchLagOffset, AdaptGainIndex, PrcptWghtSpchBuf+MAXLAG, &(pEncoderState->randomSeedCNG), bitRate);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -