fbsyn.m

来自「这是一个用于语音信号处理的工具箱」· M 代码 · 共 377 行

M
377
字号
% Function: perform Formant Based Linear Prediction Speech Synthesis.
% 

function [syns,exc12]=fbsyn(voicetype,gci,cofa,FF,FB,gm,gpcof,ncidx,ncgm,basic)
%FBSYN Formant-based linear-prediction speech synthesis.
%
%   [syns,exc12] = fbsyn(voicetype,gci,cofa,FF,FB,gm,gpcof,ncidx,ncgm,basic)
%
%   Inputs (described by how this function uses them):
%     voicetype - per-frame voicing flag; 1 selects the voiced (glottal
%                 pulse) synthesis, 0 the noise-codebook synthesis
%     gci       - sample indices of the glottal closure instants; one pitch
%                 period is synthesized per entry
%     cofa      - per-frame filter coefficients; columns 1:Order+1 of row kf
%                 form the denominator used for the noise-excited part
%     FF, FB    - per-frame formant frequencies / bandwidths, one row per
%                 frame (the scaling by pi/5000 and pi/10000 suggests Hz at
%                 a 10 kHz sampling rate -- TODO confirm)
%     gm        - gain per pitch period, indexed like gci
%     gpcof     - per-frame glottal source parameters, one row per frame
%     ncidx     - codebook row index, row = frame, column = sub-frame (1..4)
%     ncgm      - noise gain, row = frame, column = sub-frame (1..4)
%     basic     - specification vector: basic(3) = glottal source model,
%                 basic(4) = LP order, basic(5) = frame length,
%                 basic(6) = frame overlap
%
%   Outputs:
%     syns      - synthetic speech (voiced and noise parts combined, then
%                 passed through the two first-order filters at the end)
%     exc12     - synthetic excitation signal
%
%   External helpers used but not defined in this file: gen_dgf1, lfpuls,
%   filt, polystab, and the data file 'nc' (expected to provide 'nc0').

%retrieve the basic specification
F_len=basic(5);
O_lap=basic(6);
Order=basic(4);
M_len=F_len-O_lap;   % frame hop = frame length minus overlap

nframe=size(FF,1);
ntotal=M_len*nframe+Order;

% construct the coffa matrix: one all-pole polynomial per frame, built from
% the formant frequencies (pole angles) and bandwidths (pole radii)
nfmt=size(FF,2);
coffa=zeros(nframe,2*nfmt+1);
for kf=1:nframe
   theta=FF(kf,:)*pi/5000;
   tmpfb=cos( FB(kf,:)*pi/10000 );
   rdis=zeros(1,length(tmpfb));
   for k=1:length(tmpfb)
      % pole radius = the root of r^2+(2*cos(.)-4)*r+1 lying inside (-1,1)
      tr=roots([1 2*tmpfb(k)-4 1]);
      tr=tr( tr<1 & tr>-1 );
      if length(tr)~=1
         % no unique root in range: fall back to the exponential formula
         tr=exp(-1.0*FB(kf,k)*pi/10000);
      end
      rdis(k)=tr;
   end
   frts=rdis.*exp(1i*theta);
   frts=[frts conj(frts)];     % complex-conjugate pole pairs
   fpoly=real(poly(frts));
   coffa(kf,:)=fpoly;
end

%--- Select Glottal Source model
%---- Smodel==1 --> 6 order polynomial model (sounds better)
%---- Smodel==2 --> LF model 
Smodel=basic(3);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%                             %%
%%%%%%%  Synthesis (voiced part)    %%
%%%%%%%                             %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Function : synthesize speech by interplaying the analysis results
% under the structure of the proposed LP speech production model.

disp('Formant Synthesis starts...');

signal1=zeros(1,ntotal); % synthetic voiced speech
exc12=zeros(1,ntotal); % 'exc12' = synthetic excitation.

gcof=zeros(1,7); % 'gcof' = smoothed polynomial.
pcofa=coffa(1,:); % 'pcofa' = LP coefficients of the previous frame.
gp=[0 -1 0 0 0 0 0 0 0 0];
numgci=length(gci);
if numgci>=2
   % Two GCIs are enough to measure the first pitch period (the original
   % test 'numgci>2' left lenss undefined for exactly two GCIs).
   lenss=gci(2)-gci(1);
else
   % No period can be measured from fewer than two GCIs; fall back to one
   % frame hop so that a single pulse can still be generated.
   lenss=M_len;
end
olens=lenss;
Newstart=1;

for nk=1:numgci
   startp=gci(nk)+1;  % starting point of the pitch period
   
   kf=fix( (startp-Order-1)/M_len )+1; % frame number
   % Neighbouring frames, clamped so that pulses falling in the first or
   % last frame do not index outside the per-frame tables.
   kprev=max(kf-1,1);
   knext=min(kf+1,nframe);
   
   % last gci
   if nk==numgci
      endp=startp+lenss-1; % ending point of the pitch period
   else
      endp=gci(nk+1);  % ending point of the pitch period
      olens=lenss;
      lenss=gci(nk+1)-gci(nk);       
   end
   
   % voiced-to-unvoiced transition: a gap longer than a frame is not a
   % pitch period, so reuse the previous period length instead
   if lenss > F_len
      if voicetype(kf)==1 && voicetype(knext)==0
         lenss=olens; 
         endp=startp+lenss-1;
      end
   end
   
   if voicetype(kf)==1 % construct the voiced excitation wave
      
      % 0.0 take care of the unvoiced to voiced transition
      %---------------------------------------------------
      
      % previous frame unvoiced and nothing synthesized just before this
      % pulse: restart the smoothing state
      if ( voicetype(kprev)+signal1(startp-2) )==0
         pcofa=coffa(kprev,:);
         gcof=zeros(1,7);
         Newstart=1;
      end
      
      % 1.0 Interpolate LP coefficients; for details, please see 
      %     Section 4.2. of Dr. Hu's dissertation.
      %---------------------------------------------------------
      
      cf0=coffa(kprev,:); % previous frame \
      cf1=coffa(kf,:);    % current  frame  >  LP coefficients
      cf2=coffa(knext,:); % next     frame /
      
      %ff0=FF(kf-1,:); % previous frame \
      %ff1=FF(kf,:);   % current  frame  >  Formant Frequency
      %ff2=FF(kf+1,:); % next     frame /
      
      %fb0=FB(kf-1,:); % previous frame \
      %fb1=FB(kf,:);   % current  frame  >  Formant Bandwidth
      %fb2=FB(kf+1,:); % next     frame /
      
      % Weights are inverse-square distances between the pitch period
      % (pt1..pt2, relative to the current frame) and each frame centre.
      pt1=startp-(kf-1)*M_len-Order;
      pt2=endp-(kf-1)*M_len-Order;
      rw0=1/(abs(pt1+M_len-F_len/2+1)+abs(pt2+M_len-F_len/2+1))^2;
      rw1=1/(abs(pt1-F_len/2)+abs(pt2-F_len/2))^2;
      rw2=1/(abs(pt1-F_len/2-M_len)+abs(pt2-F_len/2-M_len))^2;
      rwall=rw0+rw1+rw2;
      w0=rw0/rwall;
      w1=rw1/rwall;
      w2=rw2/rwall;
      ocofa=pcofa;   % keep the filter used for the previous pitch period
      
      %ff=w0*ff0+w1*ff1+w2*ff2;
      %fb=w0*fb0+w1*fb1+w2*fb2;
      %pcofa=real(poly(cv_frt([ff;fb],-1) ));
      pcofa=polystab(w0*cf0+w1*cf1+w2*cf2);
      
      % 2.0 create the glottal pulse
      %-----------------------------
      
      % interpolate the glottal phase
      % NOTE(review): the second test uses fix(endp/M_len) without the
      % 'Order' offset that the frame-number formula above applies --
      % confirm that this is intended.
      if gcof(1)==0
         gcof=0.5*gpcof(kf,:)+0.5*mean(gpcof);
      elseif fix(endp/M_len)==kf % ending point in next frame
         gcof=0.5*gcof+0.5*gpcof(knext,:);
      else
         gcof=0.5*gcof+0.5*gpcof(kf,:);
      end
      
      % generate the pulse by source model
      o_gp=gp;  % previous excitation signal
      if Smodel==1
         gp=gen_dgf1(gcof,lenss,1);
         % gen_dgf1 <--> polym1
      elseif Smodel==2
         gp=lfpuls(gcof(1),gcof(2),gcof(3),gcof(4),gcof(5),lenss,0.5);
         % lfpuls <--> lfmodel
      end
      if isempty(gp)
         % the model produced no pulse: stretch the previous pulse to the
         % new period length instead
         len1=length(o_gp);
         gp=interp1(0:1/(len1-1):1,o_gp, 0:1/(lenss-1):1 );
         gp=gp(:)';
      end
      
      % simulate the source-tract interaction
      %dcof2=conv(pcofa.*((0.8).^(0:length(ocofa)-1)),[1 -.7]);
      %ncof2=conv(pcofa.*((0.7).^(0:length(pcofa)-1)),[1 -.8]);
      %ncof1=conv(pcofa.*((0.7).^(0:length(pcofa)-1)),[1 -.8]);
      %dcof1=conv(ocofa.*((0.8).^(0:length(pcofa)-1)),[1 -.7]);
      
      %len1=floor(lenss/2)-1;
      %gp=cshift(gp,len1);
      %gp1=gp.*hanning(lenss)';
      %gp(len1+1:lenss)=gp(len1+1:lenss)-gp1(len1+1:lenss);
      %gp=cshift(gp,-len1);
      %gp1=[zeros(1,lenss-len1) gp1 zeros(1,len1)];
      %gp2=filter(ncof2,dcof2,gp1);
      %gp3=rev(filter(ncof1,dcof1,rev(gp1)));
      %gp=gp+gp2(lenss+1:2*lenss);	
      %gp=gp+gp2(lenss+1:2*lenss)+gp3(1:lenss);  
      %gp1=[gp(lenss) gp];
      %gp1=filter([1 -autoc2(gp1)],1,gp1);
      %gp=gp1(2:lenss+1);
      %gp=gp-mean(gp);
      
      % 3.0 filter the pulse and determine the excitation gain 
      %-------------------------------------------------------
      
      filt_len=length(pcofa)-1;
      % most recent samples, newest first, passed to 'filt' as its state
      Ziy=signal1(startp:-1:startp-filt_len+1);
      
      Smethod=1;  %%%% Smethod==1 sounds better!!
      
      if Smethod==1
         %*****************************
         % Smethod==1 Dr.Hu's algorithm
         
         if Newstart==1
            % First pulse of a voiced run: divide the spectra of the pulse
            % and of the zero-padded filter polynomial (length lenss).
            cofa1=pcofa;
            cofa1(lenss)=0;
            sscon=real(ifft(fft(gp)./fft(cofa1)));
            amp=sqrt(gm(nk)*lenss/(sscon*sscon'));
            signal1(startp:endp)=sscon*amp;
            % Record the excitation here as well; the original left exc12
            % at zero for the first pulse of every voiced run.
            exc12(startp:endp)=gp*amp;
            pwfsm=0;
            Newstart=0;
         else
            Ziydc=mean(Ziy);
            fxac=filt(1,ocofa,zeros(size(gp)),Ziy-Ziydc);
            % 'fxac' = deviation signal.
            fxdc=filt(1,ocofa,zeros(size(gp)),ones(size(Ziy)));
            % 'fxdc' = mean value signal.
            fx=fxac+Ziydc*fxdc;	
            gx=filter(1,pcofa,gp);
            gxtemp=gx;
            ziyeng=Ziy*Ziy';   % loop-invariant energy of the old state
            for ncomp=1:6      % six refinement passes
               Ziy1=gxtemp(lenss:-1:lenss-filt_len+1);
               dcamp=mean(Ziy1);
               Ziy1=Ziy1-dcamp;
               if ziyeng~=0
                  tamp=sqrt((Ziy1*Ziy1')/ziyeng);
               else
                  tamp=0;
               end
               gxtemp=gx+tamp*fxac+dcamp*fxdc;
            end
            pwf=sqrt((gx*gx')/(gxtemp*gxtemp'));
            if pwfsm==0
               pwfsm=pwf;
            end
            smf=.7;   % smoothing factor for the power correction
            pwfsm=(1-smf)*pwf+smf*pwfsm;
            amp=sqrt(gm(nk)*lenss/(gx*gx'))*pwfsm;
            signal1(startp:endp)=amp*gx+fx;
            exc12(startp:endp)=gp*amp;
         end
         
         %****************************************************
         % Smethod=2. append zeros in the end of the excitation pulse
         
      elseif Smethod==2
         
         gp=[gp zeros(1,lenss)];
         gx=filter(1, pcofa, gp);
         
         sig_x=gx(lenss+1:2*lenss); % gpulse running into next segment
         sig_r=gx(1:lenss); % synthetic speech of current segment 
         sig_o=signal1(startp:endp); % energy from previous pulse
         
         Pr=gm(nk)*lenss; % gain 
         Pr1=Pr-sum(sig_o.^2);
         loop=1;
         % attenuate the carried-over tail until it fits the target energy
         while Pr1<0 && loop<100
            sig_o=(0.98:-0.08/(lenss-1):0.9).*sig_o;
            signal1(startp:endp)=sig_o;
            Pr1=Pr-sum(sig_o.^2);
            loop=loop+1;
         end
         if Pr1<0
            amp=0.01;
         else
            amp=sqrt( Pr1/(sig_r*sig_r') );
         end
         sig_r=sig_r*amp;
         sig_x=sig_x*amp;

         signal1(startp:endp)=sig_r+sig_o;
         signal1(endp+1:endp+lenss)=sig_x+signal1(endp+1:endp+lenss);
         exc12(startp:endp)=amp*gp(1:lenss);
         
         
      end %% if Smethod==1
   end  % voicetype(kf)==1
end  %% for nk=1:numgci

%disp('Voiced segments --> ok'); toc;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%                               %%
%%%%%%%  Synthesis (unvoiced part)    %%
%%%%%%%                               %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Each sub-frame is synthesized from a zero-padded codebook vector; the
% filter tail that runs past the sub-frame is stored in the next sub-frame
% and picked up there as 'sig_o'.

load nc; % Load the stochastic codebook (expected to define 'nc0')

Xlen=M_len/4;   % four sub-frames per frame hop
signal2=zeros(1,ntotal);
for kf=1:nframe
   ocofa=(cofa(kf,1:Order+1));
   %	ocofa=polystab(cofa(kf,1:Order+1)); % Stablized the filter if necessary.
   
   % Noise synthesis is needed for the first and last frame and for every
   % frame that is not voiced together with both of its neighbours.
   usenoise = kf==1 || kf==nframe ...
      || sum(voicetype(kf-1:kf))<2 || sum(voicetype(kf:kf+1))<2;
   if usenoise
      for kk=1:4
         startp=(kf-1)*M_len+Order+(kk-1)*Xlen+1;
         endp=startp+Xlen-1;
         
         noise=nc0(ncidx(kf,kk),:);
         
         % Resample the codebook vector whenever its length differs from
         % the sub-frame length (the original compared Xlen against a
         % hard-coded 50).
         if length(noise)~=Xlen
            noise=interpft(noise,Xlen);
         end
         
         noise=[noise zeros(1,Xlen)];
         ss2=filt(1,ocofa,noise);
         sig_x=ss2(Xlen+1:2*Xlen); %the next segment synthesis
         sig_r=ss2(1:Xlen); %the current segment synthesis
         sig_o=signal2(startp:endp); %previous synthesis
         
         engtarget=ncgm(kf,kk)*Xlen; % original energy
         eng2=sum(sig_r.^2); % energy of current excitation 
         eng1=engtarget-sum(sig_o.^2);
         loop=1;
         % Attenuate the carried-over tail until it fits into the target
         % energy.  The remaining energy is recomputed from the target on
         % every pass; the original subtracted cumulatively, so eng1 could
         % never become non-negative once it had gone negative.
         while eng1<0 && loop<100
            sig_o=sig_o.*(0.98:-0.08/(Xlen-1):0.9);
            eng1=engtarget-sum(sig_o.^2);
            loop=loop+1;
         end
         if eng1<0
            amp=0.01;
         else
            amp=sqrt(eng1/eng2);
         end
         exc12(startp:endp)=amp*noise(1:Xlen);
         signal2(startp:endp)=amp*sig_r+sig_o;
         % The tail belongs to the same filter response as sig_r, so it
         % gets the same gain (the original stored it unscaled).
         signal2(endp+1:endp+Xlen)=amp*sig_x;
      end  %%for kk=1:4
      
   end %%if usenoise
end

%disp('Unvoiced/Silent segments --> ok'); toc;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%                                         %%
%%%%%%%  Combine voiced with unvoiced speech    %%
%%%%%%%                                         %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Combine 'signal1' and 'signal2' into 'syns.'
% Note : Voicing transition (Section 4.3.3) is taken care here.

syns=zeros(1,ntotal);
for kf=1:nframe
   startp=M_len*(kf-1)+Order;
   frng=startp+(1:M_len);   % samples belonging to frame kf
   if kf==1 || kf==nframe || ~voicetype(kf)
      syns(frng)=signal2(frng);
   elseif voicetype(kf) && ~voicetype(kf-1)
      % onset: noise up to the first GCI, then cross-fade noise -> voiced
      pt1=(kf-1)*M_len+Order;
      f_gci=min( gci(gci>pt1) );  % first gci after transition
      range1=(f_gci+1:startp+M_len);
      lnn=length(range1);
      ss1=signal1(range1);
      ss2=signal2(range1);
      syns(frng)=signal2(frng);
      syns(range1)=(ss2.*(lnn:-1:1)+ss1.*(1:lnn))/(lnn+1);
   elseif voicetype(kf) && ~voicetype(kf+1)
      % offset: cross-fade voiced -> noise up to the last GCI in the frame
      pt1=kf*M_len+Order;
      x_gci=max( gci(gci<pt1) ); % last gci before the end of this frame
      range1=(startp+1:x_gci);
      lnn=length(range1);
      ss1=signal1(range1);
      ss2=signal2(range1);
      syns(frng)=signal2(frng);
      syns(range1)=(ss1.*(lnn:-1:1)+ss2.*(1:lnn))/(lnn+1);
   else
      syns(frng)=signal1(frng);
   end
end

% 'syns' = synthetic speech.
syns=filter([1 -1],[1 -.99],syns); % Remove low-frequency drift.
syns=filter([1 1],[1 .99],syns); % Remove high-frequency noise

%disp(''); toc;
%disp('Synthesis is OK! Job is done (finally).');
%disp('');

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?