📄 activlev.m

📁 this is speech recognition file
💻 M
字号:
function [lev,af,fso]=activlev(sp,fs,mode)
%ACTIVLEV Measure active speech level as in ITU-T P.56 [LEV,AF,FSO]=(SP,FS,MODE)
%
%Inputs: SP is the speech signal (with better than 20dB SNR)
%        FS is the sample frequency in Hz (see also FSO below)
%        MODE is a combination of the following:
%            r - raw omit input filters (default is 200 Hz to 5.5 kHz)
%            0 - no high pass filter (i.e. include DC)
%            4 - high pass filter at 40 Hz (but allows mains hum to pass)
%            1 - use cheybyshev 1 filter
%            2 - use chebyshev 2 filter (default)
%            e - use elliptic filter
%            h - omit low pass filter at 5.5 kHz
%            d - give outputs in dB rather than power

%Outputs:
%        LEV gives the speech level in units of power (or dB if mode='d')
%            if mode='l' is specified, LEV is a row vector with the "long term
%            level" as its second element (this is just the mean power)
%        AF  is the activity factor (or duty cycle) in the range 0 to 1 
%        FSO is a column vector of intermediate information that allows
%            you to process a speech signal in chunks. Thus:
%
%            fso=fs; for i=1:inc:nsamp, [lev,fso]=activlev(sp(i:i+inc-1),fso,mode); end
%
%            is equivalent to:                lev=activlev(sp(1:nsamp),fs,mode)
%
%            but is much slower. The two methods will not give identical results
%            because they will use slightly different threshods.

%For completeness we list here the contents of the FSO structure:
%
%   ffs : sample frequency
%   fmd : mode string
%    nh : hangover time in samples
%    ae : smoothing filter coefs
%    bl : 200Hz HP filter numerator
%    al : 200Hz HP filter denominator
%    bh : 5.5kHz LP filter numerator
%    ah : 5.5kHz LP filter denominator
%    ze : smoothing filter state
%    zl : 200Hz HP filter state
%    zh : 5.5kHz LP filter state
%    zx : hangover max filter state
%  emax : maximum envelope exponent + 1
%   ssq : signal sum of squares
%    ns : number of signal samples
%    ss : sum of speech samples (not actually used here)
%    kc : cumulative occupancy counts

%      Copyright (C) Mike Brookes 2008
%      Version: $Id: activlev.m,v 1.7 2008/04/04 08:19:21 dmb Exp $
%
%   VOICEBOX is a MATLAB toolbox for speech processing.
%   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   http://www.gnu.org/copyleft/gpl.html or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

persistent nbin thresh c25zp c15zp e5zp
if isempty(nbin)
    nbin=20;    % 60 dB range at 3dB per bin
    thresh=15.9;    % threshold in dB
    % High pass s-domain zeros and poles of filters with passband ripple<0.25dB, stopband<-50dB, w0=1
    %    w0=fzero(@ch2,0.5); [c2z,c2p,k]=cheby2(5,50,w0,'high','s');
    %    function v=ch2(w); [c2z,c2p,k]=cheby2(5,50,w,'high','s'); v= 20*log10(prod(abs(1i-c2z))/prod(abs(1i-c2p)))+0.25;
    c25zp=[0.37843443673309i 0.23388534441447i; -0.20640255179496+0.73942185906851i -0.54036889596392+0.45698784092898i];
    c25zp=[[0; -0.66793268833792] c25zp conj(c25zp)];
    %       [c1z,c1p,c1k] = cheby1(5,0.25,1,'high','s');
    c15zp=[-0.659002835294875+1.195798636925079i -0.123261821596263+0.947463030958881i];
    c15zp=[zeros(1,5); -2.288586431066945 c15zp conj(c15zp)];
    %      [ez,ep,ek] = ellip(5,0.25,50,1,'high','s')
    e5zp=[0.406667680649209i 0.613849362744881i; -0.538736390607201+1.130245082677107i -0.092723126159100+0.958193646330194i];
    e5zp=[[0; -1.964538608244084]  e5zp conj(e5zp)];
    %    w=linspace(0.2,2,100);
    %    figure(1); plot(w,20*log10(abs(freqs(real(poly(c15zp(1,:))),real(poly(c15zp(2,:))),w)))); title('Chebyshev 1');
    %    figure(2); plot(w,20*log10(abs(freqs(real(poly(c25zp(1,:))),real(poly(c25zp(2,:))),w)))); title('Chebyshev 2');
    %    figure(3); plot(w,20*log10(abs(freqs(real(poly(e5zp(1,:))),real(poly(e5zp(2,:))),w)))); title('Elliptic');
end

if ~isstruct(fs)                        % no state vector given
    if nargin<3
        mode=' ';
    end
    fso.ffs=fs;                       	% sample frequency
    if any(mode=='r')                   % included for backward compatibility
        mode=['0h' mode];               % abolish both filters
    elseif fs<14000
        mode=['h' mode];               % abolish lowpass filter at low sample rates
    end
    fso.fmd=mode;                      % save mode flags
    ti=1/fs;
    g=exp(-ti/0.03);                    % pole position for envelope filter
    fso.ae=[1 -2*g g^2]/(1-g)^2;        % envelope filter coefficients (DC gain = 1)
    fso.ze=zeros(2,1);
    fso.nh=ceil(0.2/ti)+1;              % hangover time in samples
    fso.zx=-Inf;                        % initial value for maxfilt()
    fso.emax=-Inf;                      % maximum exponent
    fso.ns=0;
    fso.ssq=0;
    fso.ss=0;
    fso.kc=zeros(nbin,1);               % cumulative occupancy counts
    % s-plane zeros and poles of high pass 5'th order filter -0.25dB at w=1 and -50dB stopband
    if any(mode=='1')
        szp=c15zp;            % Chebyshev 1
    elseif any(mode=='e')
        szp=e5zp;             % Elliptic
    else
        szp=c25zp;            % Chebyshev 2
    end
    if all(mode~='0')
        if any(mode=='4')
            fl=40;               % 40 Hz cutoff
        else
            fl=200;              % 200 Hz cutoff
        end
        zl=2./(1-szp*tan(fl*pi/fs))-1;      % 200 Hz LF limit
        al=real(poly(zl(2,:)));              % high pass filter
        bl=real(poly(zl(1,:)));
        sw=1-2*rem(0:5,2).';
        fso.bl=bl*(al*sw)/(bl*sw);                  % scale to give HF gain of 1
        fso.al=al;
        fso.zl=zeros(5,1);                   % LF filter state
    end
    if all(mode~='h')
        zh=2./(szp/tan(5500*pi/fs)-1)+1;
        ah=real(poly(zh(2,:)));
        bh=real(poly(zh(1,:)));
        fso.bh=bh*sum(ah)/sum(bh);
        fso.ah=ah;
        fso.zh=zeros(5,1);
    end
else
    fso=fs;             % use existing structure
end
md=fso.fmd;
ns=length(sp);
if ns                       % process this speech chunk
    % apply the input filters to the speech
    if all(md~='0')
        [sp,fso.zl]=filter(fso.bl,fso.al,sp(:),fso.zl);     % highpass filter
    end
    if all(md~='h')
        [sp,fso.zh]=filter(fso.bh,fso.ah,sp(:),fso.zh);     % lowpass filter
    end
    fso.ns=fso.ns+ns;      % count the number of speech samples
    fso.ss=fso.ss+sum(sp);  % sum of speech samples
    fso.ssq=fso.ssq+sum(sp.*sp);    % sum of squared speech samples
    [s,fso.ze]=filter(1,fso.ae,abs(sp(:)),fso.ze);     % envelope filter
    [qf,qe]=log2(s.*s);         % take efficient log2 function, 2^qe is upper limit of bin
    qe(qf==0)=-Inf;           % fix zero values
    [qe,qk,fso.zx]=maxfilt(qe,1,fso.nh,1,fso.zx);       % apply the 0.2 second hangover
    oemax=fso.emax;
    fso.emax=max(oemax,max(qe)+1);
    if fso.emax==-Inf
        fso.kc(1)=fso.kc(1)+ns;
    else
        qe=min(fso.emax-qe,nbin);   % force in the range 1:nbin
        wqe=ones(length(qe),1);
        % below: could use kc=cumsum(accumarray(qe,wqe,nbin)) but unsure about backwards compatibility
        kc=cumsum(full(sparse(qe,wqe,wqe,nbin,1)));    % cumulative occupancy counts
        esh=fso.emax-oemax;         % amount to shift down previous bin counts
        if esh<nbin-1
            kc(esh+1:nbin-1)=kc(esh+1:nbin-1)+fso.kc(1:nbin-esh-1);
            kc(nbin)=kc(nbin)+sum(fso.kc(nbin-esh:nbin));
        else
            kc(nbin)=kc(nbin)+sum(fso.kc);
        end
        fso.kc=kc;
    end
end
if fso.ns                       % now calculate the output values
    if fso.ssq>0
        aj=10*log10(fso.ssq*(fso.kc).^(-1));
        % equivalent to cj=20*log10(sqrt(2).^(fso.emax-(1:nbin)-1));
        cj=10*log10(2)*(fso.emax-(1:nbin)-1);   % lower limit of bin j in dB
        mj=aj'-cj-thresh;
        jj=find(mj*sign(mj(1))<=0); % Find threshold
        if isempty(jj)
            jj=length(mj)-1;
            jf=1;
        else
            jj=max(jj(1)-1,1);                   % integer part of j (>=1 in case mj(1)=0)
            jf=1/(1-mj(jj+1)/mj(jj));   % fractional part of j using linear interpolation
        end
        lev=aj(jj)+jf*(aj(jj+1)-aj(jj)); % active level in decibels
        lp=10.^(lev/10);
        if all(md~='d')
            lev=lp;
        end
        af=fso.ssq/(fso.ns*lp);
    else
        af=0;
        if all(md~='d')
            lev=0;
        else
            lev=-200;
        end
    end
end
if ~nargout
    lev=10*log10(lp);
    subplot(211);
    plot((1:ns)/fso.ffs,[sp s (qe<=jj)*sqrt(lp)]);
    xlabel('Time (s)');
    title(sprintf('Active Level = %.2g dB, Activity = %.0f%% (ITU-T P.56)',lev,100*af));
    ylabel('Amplitude');
    legend('Signal','Smoothed envelope','Active Level','Location','SouthEast');
    subplot(212);
    plot(cj,repmat(lev,nbin,1),'k:',cj,[aj(:) cj(:)+thresh cj(:)]);
    xlabel('Threshold (dB)');
    ylabel('Active Level (dB)');
    legend('Active Level','Speech Level',sprintf('Threshold+%.1f dB',thresh),'Threshold','Location','SouthEast');
end
💿 文件大小 268 K
👤 上传用户 zyz5925629
📂 所属分类系统设计方案
🏷️ 相关标签

#recognition #speech #this #file
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -