📄 wb_vad.c
字号:
}
}
/* Update noise estimate (bckr_est) */
for (i = 0; i < COMPLEN; i++)
{
temp = st->old_level[i] - st->bckr_est[i];
if (temp < 0.0)
{ /* update downwards*/
st->bckr_est[i] += -2 + (alpha_down * temp);
/* limit minimum value of the noise estimate to NOISE_MIN */
if (st->bckr_est[i] < NOISE_MIN)
{
st->bckr_est[i] = NOISE_MIN;
}
}
else
{ /* update upwards */
st->bckr_est[i] += bckr_add +(alpha_up * temp);
/* limit maximum value of the noise estimate to NOISE_MAX */
if (st->bckr_est[i] > NOISE_MAX)
{
st->bckr_est[i] = NOISE_MAX;
}
}
}
/* Update signal levels of the previous frame (old_level) */
for(i = 0; i < COMPLEN; i++)
{
st->old_level[i] = level[i];
}
}
/******************************************************************************
*
* Function : vad_decision
* Purpose : Calculates VAD_flag
*
* Steps performed, in order:
* 1. Sum over all bands of (level / bckr_est) squared.
* 2. Noise level = sum of bckr_est over bands 1..COMPLEN-1, divided by 16.
* 3. st->speech_level is raised to at least noise_level*MIN_SPEECH_SNR*8.
* 4. A decision threshold is built from the log-domain noise and speech
*    levels and limited from below by THR_MIN.
* 5. st->vadreg is shifted right by one; bit 0x4000 is set when the sum from
*    step 1 exceeds the scaled threshold.
* 6. noise_estimate_update() is run, hang/burst lengths are derived from the
*    threshold, and the result of hangover_addition() is returned.
*
*******************************************************************************
*/
static Word16 vad_decision(     /* return value : VAD_flag */
    VadVars *st,                /* i/o : State structure */
    float level[COMPLEN],       /* i : sub-band levels of the input frame */
    double pow_sum              /* i : power of the input frame */
)
{
    Word16 band;
    double snr_acc;             /* sum of squared level/noise ratios */
    double noise_acc;           /* sum of noise estimates, lowest band excluded */
    float ratio;
    float noise_level;
    float speech_floor;         /* lower limit applied to st->speech_level */
    float ilog2_noise, ilog2_speech;
    float noise_term, speech_term;
    float threshold;
    Word16 low_power;
    Word16 hang_len, burst_len;

    /*
     * One pass over the bands: accumulate the squared level-to-noise ratio
     * for every band, and the noise estimate for every band but the lowest.
     */
    snr_acc = 0.0;
    noise_acc = 0.0;
    for (band = 0; band < COMPLEN; band++)
    {
        ratio = level[band] / st->bckr_est[band];
        snr_acc += ratio * ratio;

        if (band >= 1) /* ignore lowest band */
        {
            noise_acc += st->bckr_est[band];
        }
    }
    noise_level = (float)(noise_acc / 16.0f);

    /* Keep speech_level at or above MIN_SPEECH_SNR*8 times the noise level */
    speech_floor = noise_level * MIN_SPEECH_SNR * 8;
    if (st->speech_level < speech_floor)
    {
        st->speech_level = speech_floor;
    }

    /* -1024 * log2(x / 2^31) of the noise level */
    ilog2_noise = (float)(-1024.0f*log10(noise_level / 2147483648.0f)/log10(2.0f));

    /*
     * If SNR is very poor, speech_level is probably corrupted by the noise
     * level. This is corrected by subtracting the floor computed above from
     * the speech level before taking the logarithm.
     */
    ilog2_speech = (float)(-1024.0f*log10((st->speech_level - speech_floor) / 2147483648.0f)/log10(2.0f));

    /* Noise-dependent part of the threshold */
    noise_term = NO_SLOPE * (ilog2_noise - NO_P1) + THR_HIGH;

    /* Speech-dependent part, limited to [SP_CH_MIN, SP_CH_MAX] */
    speech_term = SP_CH_MIN + SP_SLOPE * (ilog2_speech - SP_P1);
    if (speech_term < SP_CH_MIN)
    {
        speech_term = SP_CH_MIN;
    }
    if (speech_term > SP_CH_MAX)
    {
        speech_term = SP_CH_MAX;
    }

    threshold = noise_term + speech_term;
    if (threshold < THR_MIN)
    {
        threshold = THR_MIN;
    }

    /* Shift VAD decision register, then record the intermediate decision */
    st->vadreg = (short)(st->vadreg >> 1);
    if (snr_acc > (threshold * (float)COMPLEN / 128.0f))
    {
        st->vadreg = (Word16)(st->vadreg | 0x4000);
    }
    /* primary VAD decision made */

    /* Flag frames whose input power (pow_sum) is below VAD_POW_LOW */
    low_power = (pow_sum < VAD_POW_LOW) ? 1 : 0;

    /* Update speech subband background noise estimates */
    noise_estimate_update(st, level);

    /* Hangover and burst lengths follow the threshold; hang_len has a floor */
    hang_len = (Word16)((Word16)(HANG_SLOPE * (threshold - HANG_P1) - 0.5) + HANG_HIGH);
    if (hang_len < HANG_LOW)
    {
        hang_len = HANG_LOW;
    }
    burst_len = (Word16)((Word16)(BURST_SLOPE * (threshold - BURST_P1) - 0.5) + BURST_HIGH);

    return hangover_addition(st, low_power, hang_len, burst_len);
}
/******************************************************************************
*
* Estimate_Speech()
* Purpose : Estimate speech level
*
* The maximum input level seen is tracked in sp_max. Frames are counted in
* sp_est_cnt and qualifying (active) frames in sp_max_cnt. Once
* SP_ACTIVITY_COUNT active frames have been collected, speech_level is moved
* towards sp_max/2 and the counters restart. If SP_ACTIVITY_COUNT active
* frames can no longer be reached within SP_EST_COUNT frames, the counters
* are reset without updating speech_level, so noisy frames with occasional
* VAD = "1" decisions do not affect the estimate.
*
*******************************************************************************
*/
static void Estimate_Speech(
    VadVars *st,        /* i/o : State structure */
    float in_level      /* i : level of the input frame */
)
{
    float target;       /* candidate speech level (half of the tracked maximum) */
    float step;         /* adaptation factor, depends on direction */

    /* if the required activity count cannot be achieved, reset counters */
    if ((SP_EST_COUNT - st->sp_est_cnt + st->sp_max_cnt) < SP_ACTIVITY_COUNT)
    {
        st->sp_est_cnt = 0;
        st->sp_max = 0.0;
        st->sp_max_cnt = 0;
    }
    st->sp_est_cnt++;

    /* The frame must exceed the absolute minimum level to be considered */
    if (!(in_level > MIN_SPEECH_LEVEL1))
    {
        return;
    }

    /* ...and be flagged in vadreg or be louder than the current estimate */
    if (!((st->vadreg & 0x4000) || (in_level > st->speech_level)))
    {
        return;
    }

    /* Track the maximum and count this frame as active */
    if (in_level > st->sp_max)
    {
        st->sp_max = in_level;
    }
    st->sp_max_cnt++;

    if (st->sp_max_cnt < SP_ACTIVITY_COUNT)
    {
        return;
    }

    /* Enough active frames: adapt speech_level and restart the window */
    target = st->sp_max / 2.0f; /* scale to get "average" speech level */
    step = (target > st->speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;
    if (target > MIN_SPEECH_LEVEL2)
    {
        st->speech_level += step * (target - st->speech_level);
    }

    st->sp_max = 0.0;
    st->sp_max_cnt = 0;
    st->sp_est_cnt = 0;
}
/******************************************************************************
* PUBLIC PROGRAM CODE
******************************************************************************/
/******************************************************************************
*
* Function: wb_vad_init
* Purpose: Allocates a VadVars state structure with malloc(), initializes it
*          through wb_vad_reset() and hands it back through *state.
*          On any failure *state is left NULL (when state itself is valid),
*          a message is written to stderr and -1 is returned.
*          The structure is released with wb_vad_exit().
*
*******************************************************************************
*/
int wb_vad_init (               /* return: non-zero with error, zero for ok. */
    VadVars **state             /* i/o : State structure */
)
{
    VadVars *s;

    if (state == NULL)
    {
        fprintf(stderr, "vad_init: invalid parameter\n");
        return -1;
    }

    /* Make sure the caller never sees a stale pointer on failure */
    *state = NULL;

    /* allocate memory */
    s = (VadVars *) malloc(sizeof *s);
    if (s == NULL)
    {
        fprintf(stderr, "vad_init: can not malloc state structure\n");
        return -1;
    }

    /* s is non-NULL here, so the reset cannot report an invalid parameter */
    wb_vad_reset(s);

    *state = s;
    return 0;
}
/******************************************************************************
*
* Function: wb_vad_reset
* Purpose: Puts the state memory into its initial condition.
*          Flags, counters and filter memories are cleared; bckr_est,
*          old_level and ave_level start at NOISE_INIT and speech_level
*          starts at SPEECH_LEVEL_INIT.
*          Returns -1 (after writing a message to stderr) when state is NULL,
*          0 otherwise.
*
*******************************************************************************
*/
int wb_vad_reset (              /* return: non-zero with error, zero for ok. */
    VadVars *state              /* i/o : State structure */
)
{
    Word16 i, j;

    if (state == (VadVars *) NULL)
    {
        fprintf(stderr, "vad_reset: invalid parameter\n");
        return -1;
    }

    /* Clear the flag registers and the hangover/burst counters.
     * NOTE(review): hang_count used to be assigned twice in this group; the
     * redundant store is dropped. If a different counter was meant to be
     * cleared by the second assignment, confirm against the VadVars
     * declaration and add it here. */
    state->pitch_tone = 0;
    state->vadreg = 0;
    state->hang_count = 0;
    state->burst_count = 0;

    /* initialize memory used by the filter bank */
    for (i = 0; i < F_5TH_CNT; i++)
    {
        for (j = 0; j < 2; j++)
        {
            state->a_data5[i][j] = 0.0;
        }
    }
    for (i = 0; i < F_3TH_CNT; i++)
    {
        state->a_data3[i] = 0.0;
    }

    /* per-band memories: noise-related levels start at NOISE_INIT,
     * the remaining per-band values start at zero */
    for (i = 0; i < COMPLEN; i++)
    {
        state->bckr_est[i] = NOISE_INIT;
        state->old_level[i] = NOISE_INIT;
        state->ave_level[i] = NOISE_INIT;
        state->sub_level[i] = 0;
        state->level[i] = 0.0;
        state->prevLevel[i] = 0.0;
    }

    /* speech level estimation and frame power memory */
    state->sp_est_cnt = 0;
    state->sp_max = 0;
    state->sp_max_cnt = 0;
    state->speech_level = SPEECH_LEVEL_INIT;
    state->prev_pow_sum = 0;

    return 0;
}
/******************************************************************************
*
* Function: wb_vad_exit
* Purpose: Frees the state structure referenced by *state and sets *state
*          to NULL. Does nothing when state or *state is NULL.
*
*******************************************************************************
*/
void wb_vad_exit (
    VadVars **state             /* i/o : State structure */
)
{
    if (state != NULL && *state != NULL)
    {
        /* deallocate memory and leave no dangling pointer behind */
        free(*state);
        *state = NULL;
    }
}
/******************************************************************************
*
* Function : wb_vad_pitch_tone_detection
* Purpose : Set tone flag if pitch gain is high. This is used to detect
*           signaling tones and other signals with high pitch gain.
*
*           st->pitch_tone is shifted right by one position on every call;
*           bit 0x4000 is then set when p_gain exceeds TONE_THR.
*
*******************************************************************************
*/
void wb_vad_pitch_tone_detection (
    VadVars *st,                /* i/o : State struct */
    float p_gain                /* i : pitch gain */
)
{
    /* age the existing flags by one frame */
    Word16 flags = (Word16)(st->pitch_tone >> 1);

    /* high pitch gain: mark the current frame */
    if (p_gain > TONE_THR)
    {
        flags = (Word16)(flags | 0x4000);
    }

    st->pitch_tone = flags;
}
/******************************************************************************
*
* Function : wb_vad
* Purpose : Main program for Voice Activity Detection (VAD) for AMR
*
*           Per call: saves the previous band levels, computes the power of
*           the current plus the previous frame, runs the filter bank,
*           obtains the VAD decision and finally updates the speech level
*           estimate from the new band levels.
*
*******************************************************************************
*/
Word16 wb_vad(                  /* Return value : VAD Decision, 1 = speech, 0 = noise */
    VadVars *st,                /* i/o : State structure */
    float in_buf[]              /* i : samples of the input frame */
)
{
    Word16 n;
    Word16 vad_flag;
    float in_level;
    double frame_pow;           /* doubled energy of the current frame */
    double pow_sum;             /* frame_pow of this frame plus the previous one */
    double level_acc;

    /* Keep the band levels of the previous frame before they are overwritten */
    for (n = 0; n < COMPLEN; n++)
    {
        st->prevLevel[n] = st->level[n];
    }

    /* Calculate power of the input frame. */
    frame_pow = 0.0;
    for (n = 0; n < FRAME_LEN; n++)
    {
        frame_pow += in_buf[n] * in_buf[n];
    }
    frame_pow *= 2.0;

    /* pow_sum = power of current frame and previous frame */
    pow_sum = frame_pow + st->prev_pow_sum;

    /* save power of current frame for next call */
    st->prev_pow_sum = frame_pow;

    /* If input power is very low, keep only the low 13 bits of pitch_tone,
     * which drops the most recently set tone flags */
    if (pow_sum < POW_PITCH_TONE_THR)
    {
        st->pitch_tone = (Word16)(st->pitch_tone & 0x1fff);
    }

    /* Run the filter bank and calculate signal levels at each band */
    filter_bank(st, in_buf, st->level);

    /* compute VAD decision */
    vad_flag = vad_decision(st, st->level, pow_sum);

    /* Calculate input level: band levels summed (lowest band ignored) and
     * divided by 16 */
    level_acc = 0.0;
    for (n = 1; n < COMPLEN; n++)
    {
        level_acc += st->level[n];
    }
    in_level = (float)(level_acc / 16.0f);

    /* Estimate speech level */
    Estimate_Speech(st, in_level);

    return vad_flag;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -