// fe_endpoint.h
// (Web code-viewer residue, not part of the original header: "Font size:" control label.)
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////
/* Include guard. The two directives must be on separate lines: when they are
   fused on one line the guard macro is never defined, so a second inclusion
   of this header is not skipped. The matching #endif is the last directive
   of the header. */
#ifndef _FE_ENDPOINT_H_
#define _FE_ENDPOINT_H_
#include "FE_common.h"
/*---------------------------------------------------*/
/* Recording and framing limits for endpoint detector */
/*---------------------------------------------------*/

/* Maximum recording time, in seconds. */
#define EPD_MAX_RECORD_TIME   10

/* Number of samples in EPD_MAX_RECORD_TIME seconds of 16 kHz mono audio. */
#define EPD_WAV_BUF_SIZE      (EPD_MAX_RECORD_TIME * 16000)

/* Frame shift, in milliseconds (10 ms). */
#define EPD_FRAME_SHIFT_IN_MS 10

/* Maximum window size. */
#define EPD_MAX_WIN_SIZE      512
/*--------------------------------------*/
/* Long-pause limit for endpoint detector */
/*--------------------------------------*/

/* Maximum duration of a long pause, in milliseconds (must stay <= 2000 ms). */
#define EPD_MAX_PAUSE_IN_MS 2000
/*-----------------*/
/* Type definition */
/*-----------------*/
/* Detector status values; stored in Epd::m_localStatus.
   NOTE(review): the per-value meanings below are inferred from the
   enumerator names only - confirm against the Epd implementation. */
typedef enum {
    EPD_STATUS_WAITING  = 0, /* presumably: waiting for speech to begin */
    EPD_STATUS_SPEECH   = 1, /* presumably: inside a speech segment */
    EPD_STATUS_PAUSE    = 2, /* presumably: pause after speech */
    EPD_STATUS_ENDPOINT = 3  /* presumably: utterance end-point reached */
} EpdLocalStatus;
/* Kind assigned to a single frame: silence or speech. */
typedef enum {
    EPD_FK_SILENCE = 0,
    EPD_FK_SPEECH  = 1
} EpdFrameKind;
/*----------------*/
/* Data structure */
/*----------------*/
/* Endpoint-detector parameters (held by Epd as m_config).
   Members whose names end in "FrameN" appear to be frame counts;
   NOTE(review): their exact roles are not visible in this header -
   confirm against the code that fills this structure. */
typedef struct {
    int m_sampleRate;

    /* Shift size in samples (10 ms). */
    int m_shiftSize;

    /* Window size in samples, same as feature extraction. */
    int m_winSize;

    int m_threshFrameN;
    int m_startFrameN;
    int m_endFrameN;
    int m_startSilenceFrameN;
    int m_endSilenceFrameN;

    /* Used to detect the utterance end-point. */
    int m_longPauseFrameN;
} EpdParm;
/*
 * Buffer sizes.
 *
 * With _DEBUG defined the buffers are sized for the whole maximum recording
 * time; otherwise they are sized for the maximum long pause only, to save
 * memory.
 *
 * Frame counts are derived from EPD_FRAME_SHIFT_IN_MS rather than a
 * hard-coded 10 ms, so the buffers stay correctly sized if the frame shift
 * is ever changed. With the current 10 ms shift the values are unchanged.
 */
#ifdef _DEBUG
#define EPD_SPEECH_BUF_SIZE EPD_WAV_BUF_SIZE
#define EPD_FRAME_BUF_SIZE (EPD_MAX_RECORD_TIME*1000/EPD_FRAME_SHIFT_IN_MS) /* frames in the maximum recording time */
#else
#define EPD_SPEECH_BUF_SIZE ((EPD_MAX_PAUSE_IN_MS/EPD_FRAME_SHIFT_IN_MS+1)*EPD_MAX_WIN_SIZE) /* to save memory */
#define EPD_FRAME_BUF_SIZE (EPD_MAX_PAUSE_IN_MS/EPD_FRAME_SHIFT_IN_MS+1) /* to save memory */
#endif
/* NOTE(review): EPD_BUF_SIZE is not referenced anywhere in this header and
   its purpose is not documented here - confirm at its use sites. */
#define EPD_BUF_SIZE 7
/*
 * Epd - end-point detector (EPD) state and interface.
 *
 * All data members are public. The sizes of m_isSpeechA, m_zcrA and
 * m_epdSpeech come from EPD_FRAME_BUF_SIZE / EPD_SPEECH_BUF_SIZE, which
 * depend on _DEBUG, and SaveOutput() is declared only when _DEBUG is
 * defined; the class definition therefore differs between builds with and
 * without _DEBUG.
 *
 * NOTE(review): the behaviour of the member functions is not visible in
 * this header; descriptions below that go beyond the declarations are
 * assumptions to confirm against the implementation.
 */
class Epd {
public:
    /* ---- fixed part ---- */
    int     m_isActive;
    int     m_isAudio;
    EpdParm m_config;

    /* ---- adaptive part ---- */
    long m_uttBeginX;   /* start sample point of speech */
    long m_uttEndX;     /* end sample point of speech */
    long m_localFrameX; /* local time frame index */

    /* ---- adaptive noise estimation ---- */
    float        m_lambdaLTE;
    float        m_noiseEn;
    float        m_lambdaLTEhigherE;
    int          m_nbSpeechFrame;
    int          m_nbFrameEpd;
    EpdFrameKind m_flagVAD;

    /* ---- ZCR tracking ---- */
    float m_noiseLevel;
    float m_lambdaZcr;
    float m_meanZcr;

    /* ---- SNR tracking ---- */
    float m_signalEn;
    float m_lambdaSignalE;
    long  m_absTimeX;
    float m_lastSnr;

    /* ---- working variables ---- */
    EpdLocalStatus m_localStatus;                    /* The status of EPD must be hidden to other modules */
    EpdFrameKind   m_isSpeechA[EPD_FRAME_BUF_SIZE];
    float          m_zcrA[EPD_FRAME_BUF_SIZE];       /* to remove breath noise */
    long           m_sampleEndX;                     /* end sample point to input audio */
    short          m_epdSpeech[EPD_SPEECH_BUF_SIZE];
    int            m_speechSegN;                     /* number of detected speech segments */

    /*------------------*/
    /* Member functions */
    /*------------------*/
    Epd();
    virtual ~Epd();

    /* Initialization from the sampling rate and three integer flags. */
    int Init(int samplingRate, int isAudio, int isActive, int isDenoised);

    /* NOTE(review): presumably resets per-utterance state - confirm. */
    int InitNewUtterance();

    /* Frame-level entry point: takes input/output sample pointers, a frame
       index, the window size and an incoming status code; reports the frame
       kind through frameKind. */
    FeReturnCode OneFrame(float *in, float *out, int frameX, int winSize,
                          FeReturnCode inStatus, EpdFrameKind *frameKind);

    /* NOTE(review): presumably writes at most maxSampleN samples to
       sampleA - confirm. */
    int GetOutput(short *sampleA, int maxSampleN);

    void Close();

    /* Takes a pause duration in milliseconds. */
    void SetMaxPause(int msec);

#ifdef _DEBUG
    /* Declared only when _DEBUG is defined. */
    int SaveOutput(const char *fname, int offsetX);
#endif

private:
    int ParmInit(EpdParm *epdParm, int samplingRate, int isDenoised);
    int PutSample(float *sampleA, int sampleN);
    int FindBeginPoint(int startX);
    int FindEndPoint(int startX, int reqSilN);
    int SaveAudio(const char *fname, int begX, int endX);

    /* Private overload: returns the frame kind for the samples at s. */
    EpdFrameKind OneFrame(const float *s);
};
#endif /* _FE_ENDPOINT_H_ */
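/*
 * Web code-viewer residue (not part of the original header):
 * keyboard shortcut help.
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */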