📄 enc_dtx.c

📁 ffmpeg源码分析
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
 * E_DTX_hangover_addition
 *
 * Parameters:
 *    st              I/O: State struct
 *    low_power         I: flag power of the input frame
 *    hang_len          I: hangover length
 *    burst_len         I: minimum burst length for hangover addition
 *
 * Function:
 *    Add hangover after speech bursts.
 *
 * Returns:
 *    VAD_flag indicating final VAD decision
 */
static Word16 E_DTX_hangover_addition(E_DTX_Vad_State *st, Word16 low_power,
                                      Word16 hang_len, Word16 burst_len)
{
   /*
    * Fast exit: a frame flagged as low power resets both counters and
    * is always reported as non-speech.
    */
   if (low_power != 0)
   {
      st->mem_hang_count = 0;
      st->mem_burst_count = 0;
      return 0;
   }

   /*
    * Bit 0x4000 of the decision register carries the intermediate
    * decision of the current frame.
    */
   if ((st->mem_vadreg & 0x4000) == 0)
   {
      /* No speech in this frame: the running burst is broken. */
      st->mem_burst_count = 0;

      /* Still inside a hangover period: spend one frame of it. */
      if (st->mem_hang_count > 0)
      {
         st->mem_hang_count--;
         return 1;
      }

      return 0;
   }

   /*
    * Speech frame: extend the burst and, once it has lasted at least
    * burst_len frames, (re)load the hangover counter.
    */
   st->mem_burst_count++;

   if (st->mem_burst_count >= burst_len)
   {
      st->mem_hang_count = hang_len;
   }

   return 1;
}

/*
 * E_DTX_noise_estimate_update
 *
 * Parameters:
 *    st           I/O: State struct
 *    level          I: sub-band levels of the input frame
 *
 * Function:
 *    Update of background noise estimate. Each band of mem_bckr_est is
 *    moved towards the stored level of the previous frame (mem_level);
 *    afterwards the levels of the current frame are stored in mem_level
 *    for the next call.
 *
 * Returns:
 *    void
 */
static void E_DTX_noise_estimate_update(E_DTX_Vad_State *st, Float32 level[])
{
   Float32 up_rate, down_rate, up_bias, delta;
   Word32 band;

   /* Control update of bckr_est[] */
   E_DTX_update_cntrl(st, level);

   /*
    * Choose update speed.
    *
    * 0x7800 selects the 0x4000 bit of the decision register and the three
    * bits below it. E_DTX_decision shifts the register right once per
    * frame and sets 0x4000 on a speech decision, so these bits hold the
    * four most recent intermediate decisions.
    */
   if ((st->mem_vadreg & 0x7800) == 0)
   {
      up_rate = ALPHA_UP1;
      down_rate = ALPHA_DOWN1;
      up_bias = 2.0;
   }
   else if (st->mem_stat_count == 0)
   {
      up_rate = ALPHA_UP2;
      down_rate = ALPHA_DOWN2;
      up_bias = 2.0;
   }
   else
   {
      /* upward updates are frozen in this case, downward ones are not */
      up_rate = 0.0;
      down_rate = ALPHA3;
      up_bias = 0.0;
   }

   /* Update noise estimate (bckr_est) band by band */
   for (band = 0; band < COMPLEN; band++)
   {
      delta = st->mem_level[band] - st->mem_bckr_est[band];

      if (delta < 0.0)
      {
         /* estimate is above the level: move down, never below NOISE_MIN */
         st->mem_bckr_est[band] += -2 + (down_rate * delta);

         if (st->mem_bckr_est[band] < NOISE_MIN)
         {
            st->mem_bckr_est[band] = NOISE_MIN;
         }
      }
      else
      {
         /* estimate is at or below the level: move up, never above NOISE_MAX */
         st->mem_bckr_est[band] += up_bias + (up_rate * delta);

         if (st->mem_bckr_est[band] > NOISE_MAX)
         {
            st->mem_bckr_est[band] = NOISE_MAX;
         }
      }
   }

   /* Keep the levels of this frame as "previous frame" levels for the next call */
   memcpy(st->mem_level, level, COMPLEN * sizeof(Float32));
}

/*
 * E_DTX_decision
 *
 * Parameters:
 *    st           I/O: State struct
 *    level          I: sub-band levels of the input frame
 *    pow_sum        I: power of the input frame
 *
 * Function:
 *    Calculates VAD_flag.
 *
 *    The sum of squared level / noise-estimate ratios over all bands is
 *    compared against an adaptive threshold (vad_thr) derived from the
 *    average background noise estimate and from the stored speech level.
 *    The result is shifted into the decision register (mem_vadreg), the
 *    background noise estimate is updated, and the final flag is produced
 *    by E_DTX_hangover_addition.
 *
 * Returns:
 *    VAD_flag
 */
static Word16 E_DTX_decision(E_DTX_Vad_State *st, Float32 level[COMPLEN], Float64 pow_sum)
{
   Float64 snr_sum;
   Float32 vad_thr, temp, noise_level;
   Float32 ilog2_speech_level, ilog2_noise_level;
   Float32 temp2;
   Word32 i;
   Word16 low_power_flag;
   Word16 hang_len,burst_len;

   /*
    * Calculate squared sum of the input levels (level)
    * divided by the background noise components (bckr_est).
    */
   snr_sum = 0.0;

   for (i = 0; i < COMPLEN; i++)
   {
      temp = level[i] / st->mem_bckr_est[i];
      snr_sum += temp * temp;
   }

   /* Calculate average level of estimated background noise */
   temp = 0.0;

   for (i = 1; i < COMPLEN; i++) /* ignore lowest band */
   {
      temp += st->mem_bckr_est[i];
   }

   /* the band sum is scaled by 0.0625 (= 1/16) */
   noise_level = (Float32)(temp * 0.0625);
   /*
    * If the stored speech level is not above
    * noise_level * MIN_SPEECH_SNR * 8, raise it to that value.
    */
   temp = noise_level * MIN_SPEECH_SNR * 8;

   if (st->mem_speech_level <= temp)
   {
      st->mem_speech_level = temp;

      /*
       * avoid log10 error: speech_level - temp would otherwise be zero
       * in the logarithm below.
       * NOTE(review): if Float32 is a single-precision float, subtracting
       * 1E-8F may leave temp unchanged for all but very small values, in
       * which case the log10 argument below is still 0 - confirm this is
       * harmless (the result is clamped to [SP_CH_MIN, SP_CH_MAX] later).
       */
      temp -= 1E-8F;
   }
   
   /* -1024 * log2(noise_level / 2^31) */
   ilog2_noise_level = (Float32)(-1024.0F * log10(noise_level / 2147483648.0F) / log10(2.0F));
  
   /*
    * If SNR is very poor, speech_level is probably corrupted by noise level.
    * This is corrected by subtracting temp (noise_level * MIN_SPEECH_SNR * 8,
    * possibly reduced by 1E-8F above) from the speech level before taking
    * the logarithm: -1024 * log2((speech_level - temp) / 2^31)
    */
   ilog2_speech_level = (Float32)(-1024.0F * log10((st->mem_speech_level - temp) / 2147483648.0F) / log10(2.0F));

   /* noise-dependent part of the threshold */
   temp = NO_SLOPE * (ilog2_noise_level- NO_P1) + THR_HIGH;

   /* speech-level-dependent part, limited to [SP_CH_MIN, SP_CH_MAX] */
   temp2 = SP_CH_MIN + SP_SLOPE * (ilog2_speech_level - SP_P1);

   if (temp2 < SP_CH_MIN)
   {
      temp2 = SP_CH_MIN;
   }

   if (temp2 > SP_CH_MAX)
   {
      temp2 = SP_CH_MAX;
   }

   /* final threshold, never below THR_MIN */
   vad_thr = temp + temp2;

   if (vad_thr < THR_MIN)
   {
      vad_thr = THR_MIN;
   }

   /* Shift VAD decision register: older decisions move towards bit 0 */
   st->mem_vadreg = (Word16)(st->mem_vadreg >> 1);

   /* Make intermediate VAD decision: bit 0x4000 marks the current frame */

   if (snr_sum > (vad_thr * (Float32)COMPLEN / 128.0F))
   {
      st->mem_vadreg = (Word16)(st->mem_vadreg | 0x4000);
   }
   /* primary vad decision made */

   /* check if the input power (pow_sum) is lower than a threshold */

   if (pow_sum < VAD_POW_LOW)
   {
      low_power_flag = 1;
   }
   else
   {
      low_power_flag = 0;
   }

   /*
    * Update speech subband background noise estimates. This runs after
    * snr_sum and the intermediate decision have been computed, so the
    * update reads the register value set above and does not affect the
    * decision of the current frame.
    */
   E_DTX_noise_estimate_update(st, level);

   /* hangover length: linear in vad_thr, never below HANG_LOW */
   hang_len = (Word16)((HANG_SLOPE * (vad_thr - HANG_P1) - 0.5) + HANG_HIGH);

   if (hang_len < HANG_LOW)
   {
      hang_len = HANG_LOW;
   }

   /* minimum burst length: linear in vad_thr (no lower limit applied here) */
   burst_len = (Word16)((BURST_SLOPE * (vad_thr - BURST_P1) - 0.5) + BURST_HIGH);

   /* add hangover and return the final decision */
   return(E_DTX_hangover_addition(st, low_power_flag, hang_len,burst_len));
}

/*
 * E_DTX_speech_estimate
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_level       I: level of the input frame
 *
 * Function:
 *    Estimate speech level
 *
 *    The maximum signal level is searched and stored in mem_sp_max.
 *    SP_ACTIVITY_COUNT qualifying frames must be found within a window of
 *    SP_EST_COUNT frames for the maximum to be used. Thus, noisy frames
 *    having occasional VAD = "1" decisions will not affect the estimated
 *    speech level (mem_speech_level).
 *
 * Returns:
 *    void
 */
static void E_DTX_speech_estimate(E_DTX_Vad_State *st, Float32 in_level)
{
   Float32 rate, avg_level;

   /*
    * Frames left in the window plus the qualifying frames found so far
    * can no longer reach the required activity count: restart the search.
    */
   if ((SP_EST_COUNT - st->mem_sp_est_cnt + st->mem_sp_max_cnt) < SP_ACTIVITY_COUNT)
   {
      st->mem_sp_max_cnt = 0;
      st->mem_sp_max = 0.0;
      st->mem_sp_est_cnt = 0;
   }

   st->mem_sp_est_cnt++;

   /* The frame must be louder than MIN_SPEECH_LEVEL1 to qualify ... */
   if (!(in_level > MIN_SPEECH_LEVEL1))
   {
      return;
   }

   /* ... and either be flagged as speech or exceed the current estimate. */
   if (!(st->mem_vadreg & 0x4000) && !(in_level > st->mem_speech_level))
   {
      return;
   }

   /* Track the maximum level among the qualifying frames */
   if (in_level > st->mem_sp_max)
   {
      st->mem_sp_max = in_level;
   }

   st->mem_sp_max_cnt++;

   if (st->mem_sp_max_cnt < SP_ACTIVITY_COUNT)
   {
      return;
   }

   /* Enough activity collected: halve the maximum to get an "average" level */
   avg_level = st->mem_sp_max / 2.0F;

   /* separate adaptation speeds for rising and falling estimates */
   rate = (avg_level > st->mem_speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;

   /* levels at or below MIN_SPEECH_LEVEL2 do not move the estimate */
   if (avg_level > MIN_SPEECH_LEVEL2)
   {
      st->mem_speech_level += rate * (avg_level - st->mem_speech_level);
   }

   /* start a new search window */
   st->mem_sp_max = 0.0;
   st->mem_sp_max_cnt = 0;
   st->mem_sp_est_cnt = 0;
}

/*
 * E_DTX_vad_reset
 *
 * Parameters:
 *    state        I/O: State struct
 *
 * Function:
 *    Initialises state memory: flag registers and counters are cleared,
 *    the filter bank memories are zeroed, the per-band noise and level
 *    estimates are set to NOISE_INIT and the speech level is set to
 *    SPEECH_LEVEL_INIT.
 *
 * Returns:
 *    non-zero with error (state is NULL), zero for ok
 */
Word32 E_DTX_vad_reset (E_DTX_Vad_State *state)
{
   Word32 i;

   if (state == (E_DTX_Vad_State *) NULL)
   {
      return -1;
   }

   /* Clear the tone and VAD decision registers and the hangover counters */
   state->mem_pitch_tone = 0;
   state->mem_vadreg = 0;
   state->mem_burst_count = 0;
   state->mem_hang_count = 0;

   /* initialize memory used by the filter bank */
   memset(state->mem_a_data5, 0, F_5TH_CNT * 2 * sizeof(Float32));
   memset(state->mem_a_data3, 0, F_3TH_CNT * sizeof(Float32));

   /* initialize the per-band estimates */
   for (i = 0; i < COMPLEN; i++)
   {
      state->mem_bckr_est[i] = NOISE_INIT;
      state->mem_level[i] = NOISE_INIT;
      state->mem_ave_level[i] = NOISE_INIT;
      state->mem_sub_level[i] = 0;
   }

   /* speech level estimation, frame power memory and stationarity counter */
   state->mem_sp_est_cnt = 0;
   state->mem_sp_max = 0;
   state->mem_sp_max_cnt = 0;
   state->mem_speech_level = SPEECH_LEVEL_INIT;
   state->mem_pow_sum = 0;
   state->mem_stat_count = 0;

   return 0;
}

/*
 * E_DTX_vad_init
 *
 * Parameters:
 *    state        I/O: address of the pointer that receives the new state
 *
 * Function:
 *    Allocates state memory and initializes it with E_DTX_vad_reset.
 *    *state is set to NULL if the allocation fails. The memory is
 *    released with E_DTX_vad_exit.
 *
 * Returns:
 *    non-zero with error, zero for ok
 */
Word32 E_DTX_vad_init (E_DTX_Vad_State **state)
{
   E_DTX_Vad_State *vad;

   if (state == NULL)
   {
      return -1;
   }

   /* make sure the caller never sees a stale pointer on failure */
   *state = NULL;

   /* allocate memory */
   vad = (E_DTX_Vad_State *) malloc(sizeof *vad);

   if (vad == NULL)
   {
      return -1;
   }

   /* cannot fail here: vad is known to be non-NULL */
   E_DTX_vad_reset(vad);

   *state = vad;

   return 0;
}

/*
 * E_DTX_vad_exit
 *
 * Parameters:
 *    state        I/O: address of the state pointer
 *
 * Function:
 *    The memory used for state memory is freed and *state is set to NULL.
 *    Nothing is done if state or *state is NULL.
 *
 * Returns:
 *    void
 */
void E_DTX_vad_exit (E_DTX_Vad_State **state)
{
   if (state != NULL && *state != NULL)
   {
      /* deallocate memory and clear the caller's pointer */
      free(*state);
      *state = NULL;
   }
}

/*
 * E_DTX_pitch_tone_detection
 *
 * Parameters:
 *    st           I/O: State struct
 *    p_gain         I: pitch gain
 *
 * Function:
 *    Set tone flag if pitch gain is high. This is used to detect
 *    signaling tones and other signals with high pitch gain.
 *    The flags are kept as a shift register in mem_pitch_tone: each call
 *    shifts it right by one and records the new flag in bit 0x4000.
 *
 * Returns:
 *    void
 */
void E_DTX_pitch_tone_detection (E_DTX_Vad_State *st, Float32 p_gain)
{
   /* age the existing flags by one position */
   Word16 flags = (Word16)(st->mem_pitch_tone >> 1);

   /* pitch gain above TONE_THR: set the tone flag for this call */
   if (p_gain > TONE_THR)
   {
      flags = (Word16)(flags | 0x4000);
   }

   st->mem_pitch_tone = flags;
}

/*
 * E_DTX_vad
 *
 * Parameters:
 *    st           I/O: State struct
 *    in_buf         I: samples of the input frame (FRAME_LEN samples are read)
 *
 * Function:
 *    Main program for Voice Activity Detection (VAD): computes the frame
 *    power, runs the filter bank, makes the VAD decision and updates the
 *    speech level estimate.
 *
 * Returns:
 *    VAD Decision, 1 = speech, 0 = noise
 */
Word16 E_DTX_vad(E_DTX_Vad_State *st, Float32 in_buf[])
{
   Float32 level[COMPLEN];
   Float64 frame_pow, pow_sum, level_sum;
   Float32 in_level;
   Word16 VAD_flag;
   Word32 n;

   /* Power of the input frame: twice the sum of squared samples */
   frame_pow = 0.0;

   for (n = 0; n < FRAME_LEN; n++)
   {
      frame_pow += in_buf[n] * in_buf[n];
   }

   frame_pow *= 2.0;

   /* pow_sum = power of current frame and previous frame */
   pow_sum = frame_pow + st->mem_pow_sum;

   /* remember the power of this frame for the next call */
   st->mem_pow_sum = frame_pow;

   /*
    * If input power is very low, clear tone flag: only the bits in 0x1fff
    * are kept, which drops the two most recently recorded flags
    * (bits 0x4000 and 0x2000).
    */
   if (pow_sum < POW_PITCH_TONE_THR)
   {
      st->mem_pitch_tone = (Word16)(st->mem_pitch_tone & 0x1fff);
   }

   /* Run the filter bank and calculate signal levels at each band */
   E_DTX_filter_bank(st, in_buf, level);

   /* compute VAD decision */
   VAD_flag = E_DTX_decision(st, level, pow_sum);

   /* Input level: sum of the band levels, lowest band excluded, over 16 */
   level_sum = 0.0;

   for (n = 1; n < COMPLEN; n++)
   {
      level_sum += level[n];
   }

   in_level = (Float32)(level_sum / 16.0F);

   /* Estimate speech level */
   E_DTX_speech_estimate(st, in_level);

   return(VAD_flag);
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -