📄 fxrapt.m

📁 voicebox ,很不错的matlab源程序
💻 M
📖 第 1 页 / 共 2 页
字号:
12 下一页
function [fx,tt]=fxrapt(s,fs,mode);
%FXRAPT RAPT pitch tracker [FX,TT]=FXRAPT(S,FS,MODE)
%
% Input:   s(ns)      Speech signal
%          fs         Sample frequency (Hz)
%          mode       'g' will plot a graph [default if no output arguments]
%
% Outputs: fx(nframe)     Larynx frequency for each frame (or NaN for silent/unvoiced)
%          tt(nframe,3)  Start and end samples of each frame
%
% Plots a graph if no outputs are specified showing lag candidates and selected path
%

% Bugs/Suggestions:
%   (1) Include backward DP pass and output the true cost for each candidate.
%   (2) Add an extra state to distinguish between voiceless and silent
%   (3) N-best DP to allow longer term penalties (e.g. for frequent pitch doubling/halving)

% The algorithm is taken from [1] with the following differences:
%
%      (a)  The factor AFACT, which in the Talkin algorithm corresponds roughly
%           to the absolute level of harmonic noise in the correlation window,
%           is here calculated as the maximum of three figures:
%                   (i) an absolute floor set by PP.rapt_absnoise
%                  (ii) a multiple of the peak signal set by PP.rapt_signoise
%                 (iii) a multiple of the noise floor set by PP.rapt_relnoise
%      (b) The LPC used in calculating the Itakura distance uses a Hamming window rather than
%          a Hanning window.
%
% A C implementation of this algorithm by Derek Lin and David Talkin is included as  "get_f0.c"
% in the esps.zip package available from http://www.speech.kth.se/esps/esps.zip under the BSD
% license.
%
% Refs:
%      [1]   D. Talkin, "A Robust Algorithm for Pitch Tracking (RAPT)"
%            in "Speech Coding & Synthesis", W B Kleijn, K K Paliwal eds,
%            Elsevier ISBN 0444821694, 1995

%      Copyright (C) Mike Brookes 2006
%      Version: $Id: fxrapt.m,v 1.2 2006/07/28 07:41:25 dmb Exp $
%
%   VOICEBOX is a MATLAB toolbox for speech processing.
%   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0 or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

s=s(:); % force s to be a column
if nargin<3
    mode=' ';
end
doback=0;   % don't do backwards DP for now

% read in parameters

PP=voicebox;
f0min=PP.rapt_f0min;            % Min F0 (Hz)                               [50]
f0max=PP.rapt_f0max;            % Max F0 (Hz)                               [500]
tframe=PP.rapt_tframe;          % frame size (s)                            [0.01]
tlpw=PP.rapt_tlpw;              % low pass filter window size (s)           [0.005]
tcorw=PP.rapt_tcorw;            % correlation window size (s)               [0.0075]
candtr=PP.rapt_candtr;          % minimum peak in NCCF                      [0.3]
lagwt=PP.rapt_lagwt;            % linear lag taper factor                   [0.3]
freqwt=PP.rapt_freqwt;          % cost factor for F0 change                 [0.02]
vtranc=PP.rapt_vtranc;          % fixed voice-state transition cost         [0.005]
vtrac=PP.rapt_vtrac;            % delta amplitude modulated transition cost [0.5]
vtrsc=PP.rapt_vtrsc;            % delta spectrum modulated transition cost  [0.5]
vobias=PP.rapt_vobias;          % bias to encourage voiced hypotheses       [0.0]
doublec=PP.rapt_doublec;        % cost of exact doubling or halving         [0.35]
absnoise=PP.rapt_absnoise;      % absolute rms noise level                  [0]
relnoise=PP.rapt_relnoise;      % rms noise level relative to noise floor   [2.0]
signoise=PP.rapt_signoise;      % ratio of peak signal rms to noise floor   [0.001]
ncands=PP.rapt_ncands;          % max hypotheses at each frame              [20]
trms=PP.rapt_trms;              % window length for rms measurement         [0.03]
dtrms=PP.rapt_dtrms;            % window spacing for rms measurement        [0.02]
preemph=PP.rapt_preemph;        % s-plane position of preemphasis zero      [-7000]
nfullag=PP.rapt_nfullag;        % number of full lags to try (must be odd)  [7]

% derived parameters (mostly dependent on sample rate fs)

krms=round(trms*fs);            % window length for rms measurement
kdrms=round(dtrms*fs);          % window spacing for rms measurement
rmswin=hanning(krms).^2;
kdsmp=round(0.25*fs/f0max);
hlpw=round(tlpw*fs/2);          % force window to be an odd length
blp=sinc((-hlpw:hlpw)/kdsmp).*hamming(2*hlpw+1).';
fsd=fs/kdsmp;
kframed=round(fsd*tframe);      % downsampled frame length
kframe=kframed*kdsmp;           % frame increment at full rate
rmsix=(1:krms)+floor((kdrms-kframe)/2); % rms index according to Talkin; better=(1:krms)+floor((kdrms-krms+1)/2)
minlag=ceil(fsd/f0max);
maxlag=round(fsd/f0min);        % use round() only because that is what Talkin does
kcorwd=round(fsd*tcorw);        % downsampled correlation window
kcorw=kcorwd*kdsmp;             % full rate correlation window
spoff=max(hlpw-floor(kdsmp/2),1+kdrms-rmsix(1)-kframe);  % offset for first speech frame at full rate
sfoff=spoff-hlpw+floor(kdsmp/2); % offset for downsampling filter
sfi=1:kcorwd;                   % initial decimated correlation window index array
sfhi=1:kcorw;                   % initial correlation window index array
sfj=1:kcorwd+maxlag;
sfmi=repmat((minlag:maxlag)',1,kcorwd)+repmat(sfi,maxlag-minlag+1,1);
lagoff=(minlag-1)*kdsmp;        % lag offset when converting to high sample rate
beta=lagwt*f0min/fs;            % bias towards low lags
log2=log(2);
lpcord=2+round(fs/1000);        % lpc order for itakura distance
hnfullag=floor(nfullag/2);
jumprat=exp((doublec+log2)/2);  % lag ratio at which octave jump cost is lowest
ssq=s.^2;
csssq=cumsum(ssq);
sqrt(min(csssq(kcorw+1:end)-csssq(1:end-kcorw))/kcorw);
afact=max([absnoise^2,max(ssq)*signoise^2,min(csssq(kcorw+1:end)-csssq(1:end-kcorw))*(relnoise/kcorw)^2])^2*kcorw^2;

% downsample signal to approx 2 kHz to speed up autocorrelation calculation
% kdsmp is the downsample factor

sf=filter(blp/sum(blp),1,s(sfoff+1:end));
sp=filter([1 exp(preemph/fs)],1,s); % preemphasised speech for LPC calculation
sf(1:length(blp)-1)=[];         % remove startup transient
sf=sf(1:kdsmp:end);             % downsample to =~2kHz
nsf=length(sf);                 % length of downsampled speech
ns=length(s);                   % length of full rate speech

% Calculate the frame limit to ensure we don't run off the end of the speech or decimated speech:
%   (a) For decimated autocorrelation when calculating sff():  (nframe-1)*kframed+kcorwd+maxlag <= nsf
%   (b) For full rate autocorrelation when calculating sfh():  max(fho)+kcorw+maxlag*kdsmp+hnfullag <= ns
%   (c) For rms ratio window when calculating rr            :  max(fho)+rmsix(end) <= ns
% where max(fho) = (nframe-1)*kframe + spoff
% NOTE(review): constraint (b) adds maxlag*kdsmp+hnfullag to kcorw, but the nframe
% expression below uses kcorw-maxlag*kdsmp-hnfullag; confirm which sign is intended.

nframe=floor(1+min((nsf-kcorwd-maxlag)/kframed,(ns-spoff-max(kcorw-maxlag*kdsmp-hnfullag,rmsix(end)))/kframe));

% now search for autocorrelation peaks in the downsampled signal

cost=zeros(nframe,ncands);      % cumulative cost
prev=zeros(nframe,ncands);      % traceback pointer
mcands=zeros(nframe,1);         % number of actual candidates excluding voiceless
lagval=repmat(NaN,nframe,ncands-1);    % lag of each voiced candidate
tv=zeros(nframe,3);             % diagnostics: 1=voiceless cost, 2=min voiced cost, 3:cumulative voiceless-min voiced
if doback
    costms=cell(nframe,1);
end

% Main processing loop for each 10 ms frame

for iframe=1:nframe       % loop for each frame (~10 ms)
    
    % Find peaks in the normalized autocorrelation of subsampled (2 kHz) speech
    % only keep peaks that are > 30% of highest peak
    
    sff=sf((iframe-1)*kframed+sfj);
    sffdc=mean(sff(sfi));       % mean of initial correlation window length
    sff=sff-sffdc;              % subtract off the mean
    nccfd=normxcor(sff(1:kcorwd),sff(minlag+1:end));
    [ipkd,vpkd]=findpeaks(nccfd,'q');
    
    % Debugging: execute the line below to plot the autocorrelation peaks.
    % findpeaks(nccfd,'q'); xlabel(sprintf('Lag = (x+%d)*%g ms',minlag-1,1000*kdsmp/fs)); ylabel('Normalized Cross Correlation'); title (sprintf('Frame %d/%d',iframe,nframe));
    
    vipkd=[vpkd ipkd];
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -