📄 enc_dtx.c
字号:
* E_DTX_hangover_addition
*
* Parameters:
* st I/O: State struct
* low_power I: flag power of the input frame
* hang_len I: hangover length
* burst_len I: minimum burst length for hangover addition
*
* Function:
* Add hangover after speech bursts.
*
* Returns:
* VAD_flag indicating final VAD decision
*/
static Word16 E_DTX_hangover_addition(E_DTX_Vad_State *st, Word16 low_power,
                                      Word16 hang_len, Word16 burst_len)
{
    /* Final decision handed back to the caller: 1 = speech, 0 = noise. */
    Word16 vad_flag = 0;

    if (low_power != 0)
    {
        /*
         * Fast exit: the caller flagged the frame as low power, so both
         * counters are cleared and the decision stays 0.
         */
        st->mem_burst_count = 0;
        st->mem_hang_count = 0;
    }
    else if ((st->mem_vadreg & 0x4000) == 0)
    {
        /*
         * Bit 0x4000 of the decision register is clear: the burst is over.
         * Keep reporting speech while hangover frames remain.
         */
        st->mem_burst_count = 0;
        if (st->mem_hang_count > 0)
        {
            st->mem_hang_count--;
            vad_flag = 1;
        }
    }
    else
    {
        /*
         * Bit 0x4000 is set: extend the running burst, and once it has
         * lasted burst_len frames (re)load the hangover counter.
         */
        st->mem_burst_count++;
        if (st->mem_burst_count >= burst_len)
        {
            st->mem_hang_count = hang_len;
        }
        vad_flag = 1;
    }

    return vad_flag;
}
/*
 * E_DTX_noise_estimate_update
 *
 * Parameters:
 *    st           I/O: State struct
 *    level        I:   sub-band levels of the input frame
 *
 * Function:
 *    Moves the per-band background noise estimate (mem_bckr_est) towards
 *    the levels stored in mem_level, then stores the new levels there.
 *    Because mem_level is only overwritten at the very end, the estimate
 *    is driven by the levels saved on the previous call, not by "level".
 *
 * Returns:
 *    void
 */
static void E_DTX_noise_estimate_update(E_DTX_Vad_State *st, Float32 level[])
{
    Float32 rate_up, rate_down, step_up, diff;
    Word32 band;

    /* Let the update control logic run first; the state it maintains is read below. */
    E_DTX_update_cntrl(st, level);

    /*
     * Pick the adaptation speed. Bits 0x7800 of mem_vadreg are the four
     * most recently shifted-in intermediate VAD decisions.
     */
    if ((0x7800 & st->mem_vadreg) == 0)
    {
        step_up = 2.0;
        rate_up = ALPHA_UP1;
        rate_down = ALPHA_DOWN1;
    }
    else if (st->mem_stat_count == 0)
    {
        step_up = 2.0;
        rate_up = ALPHA_UP2;
        rate_down = ALPHA_DOWN2;
    }
    else
    {
        /* upward adaptation is switched off completely in this case */
        step_up = 0.0;
        rate_up = 0.0;
        rate_down = ALPHA3;
    }

    for (band = 0; band < COMPLEN; band++)
    {
        diff = st->mem_level[band] - st->mem_bckr_est[band];

        if (diff < 0.0)
        {
            /* stored level is below the estimate: move down, floor at NOISE_MIN */
            st->mem_bckr_est[band] += -2 + (rate_down * diff);

            if (st->mem_bckr_est[band] < NOISE_MIN)
            {
                st->mem_bckr_est[band] = NOISE_MIN;
            }
        }
        else
        {
            /* stored level is at or above the estimate: move up, cap at NOISE_MAX */
            st->mem_bckr_est[band] += step_up + (rate_up * diff);

            if (st->mem_bckr_est[band] > NOISE_MAX)
            {
                st->mem_bckr_est[band] = NOISE_MAX;
            }
        }
    }

    /* Remember the current levels for the next call. */
    memcpy(st->mem_level, level, COMPLEN * sizeof(Float32));
}
/*
 * E_DTX_decision
 *
 * Parameters:
 * st I/O: State struct
 * level I: sub-band levels of the input frame
 * pow_sum I: power of the input frame
 *
 * Function:
 * Calculates VAD_flag. An adaptive threshold is derived from the
 * background noise estimate and the speech level, the summed squared
 * band SNR is compared against it, the result is pushed into the
 * decision register (mem_vadreg), the noise estimate is updated and
 * hangover is finally added.
 *
 * Returns:
 * VAD_flag as returned by E_DTX_hangover_addition (1 or 0)
 */
static Word16 E_DTX_decision(E_DTX_Vad_State *st, Float32 level[COMPLEN], Float64 pow_sum)
{
Float64 snr_sum;
Float32 vad_thr, temp, noise_level;
Float32 ilog2_speech_level, ilog2_noise_level;
Float32 temp2;
Word32 i;
Word16 low_power_flag;
Word16 hang_len,burst_len;
/*
 * Calculate squared sum of the input levels (level)
 * divided by the background noise components (bckr_est).
 * The per-band ratio is held in Float32, the sum in Float64.
 */
snr_sum = 0.0;
for (i = 0; i < COMPLEN; i++)
{
temp = level[i] / st->mem_bckr_est[i];
snr_sum += temp * temp;
}
/*
 * Calculate the level of the estimated background noise:
 * sum of bands 1..COMPLEN-1 scaled by 1/16.
 */
temp = 0.0;
for (i = 1; i < COMPLEN; i++) /* ignore lowest band */
{
temp += st->mem_bckr_est[i];
}
noise_level = (Float32)(temp * 0.0625);
/*
 * If the speech level is not above noise_level * MIN_SPEECH_SNR * 8,
 * raise the speech level to that floor.
 */
temp = noise_level * MIN_SPEECH_SNR * 8;
if (st->mem_speech_level <= temp)
{
st->mem_speech_level = temp;
/*
 * Intended to keep (mem_speech_level - temp) above zero so that the
 * log10 below does not receive 0.
 * NOTE(review): temp is a Float32; if its magnitude is large compared
 * with 1E-8 the subtraction may be lost to rounding and the difference
 * can still be exactly 0 - confirm against the value range of temp.
 */
temp -= 1E-8F;
}
/* -1024 * log2(noise_level / 2^31) */
ilog2_noise_level = (Float32)(-1024.0F * log10(noise_level / 2147483648.0F) / log10(2.0F));
/*
 * If SNR is very poor, speech_level is probably corrupted by noise level. This
 * is corrected by subtracting temp (MIN_SPEECH_SNR * noise_level * 8, less the
 * small offset applied above) from the speech level before taking
 * -1024 * log2(x / 2^31).
 */
ilog2_speech_level = (Float32)(-1024.0F * log10((st->mem_speech_level - temp) / 2147483648.0F) / log10(2.0F));
/* Noise dependent part of the threshold (linear in ilog2_noise_level) */
temp = NO_SLOPE * (ilog2_noise_level- NO_P1) + THR_HIGH;
/* Speech level dependent part, limited to [SP_CH_MIN, SP_CH_MAX] */
temp2 = SP_CH_MIN + SP_SLOPE * (ilog2_speech_level - SP_P1);
if (temp2 < SP_CH_MIN)
{
temp2 = SP_CH_MIN;
}
if (temp2 > SP_CH_MAX)
{
temp2 = SP_CH_MAX;
}
/* Final threshold is the sum of both parts, never below THR_MIN */
vad_thr = temp + temp2;
if (vad_thr < THR_MIN)
{
vad_thr = THR_MIN;
}
/* Shift VAD decision register: older decisions move towards bit 0 */
st->mem_vadreg = (Word16)(st->mem_vadreg >> 1);
/* Make intermediate VAD decision and store it in bit 0x4000 */
if (snr_sum > (vad_thr * (Float32)COMPLEN / 128.0F))
{
st->mem_vadreg = (Word16)(st->mem_vadreg | 0x4000);
}
/* primary vad decision made */
/* check if the input power (pow_sum) is lower than a threshold */
if (pow_sum < VAD_POW_LOW)
{
low_power_flag = 1;
}
else
{
low_power_flag = 0;
}
/*
 * Update speech subband background noise estimates. This is done after
 * the register update above because the noise update reads mem_vadreg.
 */
E_DTX_noise_estimate_update(st, level);
/*
 * Hangover length: linear function of vad_thr, converted to Word16
 * (the conversion truncates towards zero) and limited below by HANG_LOW.
 */
hang_len = (Word16)((HANG_SLOPE * (vad_thr - HANG_P1) - 0.5) + HANG_HIGH);
if (hang_len < HANG_LOW)
{
hang_len = HANG_LOW;
}
/* Minimum burst length: same form of mapping, no limit applied here */
burst_len = (Word16)((BURST_SLOPE * (vad_thr - BURST_P1) - 0.5) + BURST_HIGH);
/* Add hangover; its result is the final VAD flag */
return(E_DTX_hangover_addition(st, low_power_flag, hang_len,burst_len));
}
/*
 * E_DTX_speech_estimate
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_level     I:   level of the input frame
 *
 * Function:
 *    Estimate speech level.
 *
 *    The maximum level seen on qualifying frames is tracked in mem_sp_max.
 *    SP_ACTIVITY_COUNT qualifying frames have to be collected within a
 *    window of SP_EST_COUNT frames; when that can no longer be achieved
 *    the window is restarted. Isolated frames that merely look active
 *    therefore do not move the estimated speech level.
 *
 * Returns:
 *    void
 */
static void E_DTX_speech_estimate(E_DTX_Vad_State *st, Float32 in_level)
{
    Float32 candidate, weight;

    /*
     * Frames left in the window plus frames already counted fall short of
     * the required activity count: start a new window.
     */
    if (SP_ACTIVITY_COUNT > (SP_EST_COUNT - st->mem_sp_est_cnt + st->mem_sp_max_cnt))
    {
        st->mem_sp_est_cnt = 0;
        st->mem_sp_max = 0.0;
        st->mem_sp_max_cnt = 0;
    }
    st->mem_sp_est_cnt++;

    /* The frame must exceed the absolute minimum level ... */
    if (!(in_level > MIN_SPEECH_LEVEL1))
    {
        return;
    }
    /* ... and be either flagged by the VAD or louder than the current estimate. */
    if (!((st->mem_vadreg & 0x4000) || (in_level > st->mem_speech_level)))
    {
        return;
    }

    /* Qualifying frame: track the peak and count it. */
    if (in_level > st->mem_sp_max)
    {
        st->mem_sp_max = in_level;
    }
    st->mem_sp_max_cnt++;

    if (st->mem_sp_max_cnt < SP_ACTIVITY_COUNT)
    {
        return;
    }

    /* Enough activity collected: half of the peak serves as the "average" level. */
    candidate = st->mem_sp_max / 2.0F;
    weight = (candidate > st->mem_speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;

    /* Candidates at or below MIN_SPEECH_LEVEL2 leave the estimate untouched. */
    if (candidate > MIN_SPEECH_LEVEL2)
    {
        st->mem_speech_level += weight * (candidate - st->mem_speech_level);
    }

    /* Restart the measurement window. */
    st->mem_sp_max = 0.0;
    st->mem_sp_max_cnt = 0;
    st->mem_sp_est_cnt = 0;
}
/*
 * E_DTX_vad_reset
 *
 * Parameters:
 *    state        I/O: State struct
 *
 * Function:
 *    Initialises state memory: flag registers and counters are cleared,
 *    the filter bank memories are zeroed, the per-band noise/level
 *    estimates are set to NOISE_INIT and the speech level to
 *    SPEECH_LEVEL_INIT.
 *
 * Returns:
 *    -1 if state is NULL, 0 otherwise
 */
Word32 E_DTX_vad_reset (E_DTX_Vad_State *state)
{
    Word32 i;

    if (state == (E_DTX_Vad_State *) NULL)
    {
        return -1;
    }

    /* Clear the tone flag register, the VAD decision register and the
     * hangover/burst counters (each member is assigned exactly once). */
    state->mem_pitch_tone = 0;
    state->mem_vadreg = 0;
    state->mem_hang_count = 0;
    state->mem_burst_count = 0;

    /* initialize memory used by the filter bank */
    memset(state->mem_a_data5, 0, F_5TH_CNT * 2 * sizeof(Float32));
    memset(state->mem_a_data3, 0, F_3TH_CNT * sizeof(Float32));

    /* per-band estimates start from NOISE_INIT, sub levels from zero */
    for (i = 0; i < COMPLEN; i++)
    {
        state->mem_bckr_est[i] = NOISE_INIT;
        state->mem_level[i] = NOISE_INIT;
        state->mem_ave_level[i] = NOISE_INIT;
        state->mem_sub_level[i] = 0;
    }

    /* speech level estimation */
    state->mem_sp_est_cnt = 0;
    state->mem_sp_max = 0;
    state->mem_sp_max_cnt = 0;
    state->mem_speech_level = SPEECH_LEVEL_INIT;

    /* power of the previous frame and stationarity counter */
    state->mem_pow_sum = 0;
    state->mem_stat_count = 0;

    return 0;
}
/*
 * E_DTX_vad_init
 *
 * Parameters:
 *    state        O: receives the newly allocated State struct
 *
 * Function:
 *    Allocates state memory and initializes it with E_DTX_vad_reset.
 *    Whenever state itself is non-NULL, *state is NULL on failure.
 *    The memory is released with E_DTX_vad_exit.
 *
 * Returns:
 *    non-zero with error (NULL argument or allocation failure), zero for ok
 */
Word32 E_DTX_vad_init (E_DTX_Vad_State **state)
{
    E_DTX_Vad_State *vad;

    if (state == NULL)
    {
        return -1;
    }

    /* make sure the caller never sees a stale pointer on failure */
    *state = NULL;

    /* allocate memory */
    vad = (E_DTX_Vad_State *) malloc(sizeof(E_DTX_Vad_State));
    if (vad == NULL)
    {
        return -1;
    }

    /* reset only reports an error for a NULL pointer, which is excluded here */
    E_DTX_vad_reset(vad);

    *state = vad;
    return 0;
}
/*
 * E_DTX_vad_exit
 *
 * Parameters:
 *    state        I/O: address of the State struct pointer
 *
 * Function:
 *    The memory used for state memory is freed and the caller's pointer
 *    is set to NULL. Nothing happens if state or *state is NULL.
 *
 * Returns:
 *    void
 */
void E_DTX_vad_exit (E_DTX_Vad_State **state)
{
    if ((state != NULL) && (*state != NULL))
    {
        /* deallocate memory and clear the caller's pointer */
        free(*state);
        *state = NULL;
    }
}
/*
 * E_DTX_pitch_tone_detection
 *
 * Parameters:
 *    st           I/O: State struct
 *    p_gain       I:   pitch gain
 *
 * Function:
 *    Set tone flag if pitch gain is high. This is used to detect
 *    signaling tones and other signals with high pitch gain.
 *    mem_pitch_tone works as a shift register: earlier flags move one
 *    bit to the right and the newest flag is stored in bit 0x4000.
 *
 * Returns:
 *    void
 */
void E_DTX_pitch_tone_detection (E_DTX_Vad_State *st, Float32 p_gain)
{
    /* age the flags stored so far by one position */
    Word16 flags = (Word16)(st->mem_pitch_tone >> 1);

    /* a pitch gain above TONE_THR marks the newest slot */
    if (p_gain > TONE_THR)
    {
        flags = (Word16)(flags | 0x4000);
    }

    st->mem_pitch_tone = flags;
}
/*
 * E_DTX_vad
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_buf       I:   samples of the input frame (FRAME_LEN values are read)
 *
 * Function:
 *    Main program for Voice Activity Detection (VAD): measures the frame
 *    power, runs the filter bank, makes the VAD decision and updates the
 *    speech level estimate.
 *
 * Returns:
 *    VAD Decision, 1 = speech, 0 = noise
 */
Word16 E_DTX_vad(E_DTX_Vad_State *st, Float32 in_buf[])
{
    Float64 acc, pow_sum;
    Float32 level[COMPLEN];
    Float32 in_level;
    Word32 k;
    Word16 decision;

    /* Power of the input frame: twice the sum of the squared samples. */
    acc = 0.0;
    for (k = 0; k < FRAME_LEN; k++)
    {
        acc += in_buf[k] * in_buf[k];
    }
    acc *= 2.0;

    /* pow_sum covers the current and the previous frame. */
    pow_sum = acc + st->mem_pow_sum;

    /* keep the power of this frame for the next call */
    st->mem_pow_sum = acc;

    /*
     * Very low input power: drop the most recent tone flags
     * (only bits 0x1fff of the tone register survive).
     */
    if (pow_sum < POW_PITCH_TONE_THR)
    {
        st->mem_pitch_tone = (Word16)(st->mem_pitch_tone & 0x1fff);
    }

    /* Split the frame into sub-bands and measure the level of each band. */
    E_DTX_filter_bank(st, in_buf, level);

    /* VAD decision including hangover */
    decision = E_DTX_decision(st, level, pow_sum);

    /* Input level: sum over all bands except the lowest one, divided by 16. */
    acc = 0.0;
    for (k = 1; k < COMPLEN; k++)
    {
        acc += level[k];
    }
    in_level = (Float32)(acc / 16.0F);

    /* Estimate speech level */
    E_DTX_speech_estimate(st, in_level);

    return decision;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -