📄 fastmcd.m
字号:
end data=(data-repmat(med,n,1))./repmat(mad,n,1);% The standardized classical estimates are now computed.clmean=mean(data);clcov=cov(data);% The univariate non-classical case is now handled.if p==1 & h~=n if ~lts res.method=sprintf('\nUnivariate location and scale estimation.'); end [raw.center,raw.cov]=mcduni(data,n,h,n-h+1,alpha); scale=raw.cov./sqrt(rawconsfactor(h,n,p)*rawcorfactor(p,n,alpha)); sor=sort((data-raw.center).^2); raw.objective=1/(h-1)*sum(sor(1:h))*prod(mad)^2; %m=2*n/asvardiag(h,n,p); %quantile=qf(0.975,p,m-p+1); quantile=chi2q(p);
%weights=((data-raw.center)/scale).^2*(m-p+1)/(m*p)<quantile; weights=((data-raw.center)/scale).^2<quantile;
raw.wt=weights; [res.center,res.cov]=weightmecov(data,weights,n,p); factor=rewconsfactor(weights,n,p); factor=factor*rewcorfactor(p,n,alpha); res.cov=factor*res.cov; mah=(data-res.center).^2/res.cov; mah_raw=(data-raw.center).^2/raw.cov; res.flag= mah <= chi2q(1); [raw.cov,raw.center]=trafo(raw.cov,raw.center,med,mad,p); [res.cov,res.center]=trafo(res.cov,res.center,med,mad,p); res.mahalanobis=abs(data'-clmean)/sqrt(clcov); raw.robdist=sqrt(mah_raw'); res.robdist=sqrt(mah'); if ~lts disp(res.method); end spec.ask=1; if ~lts plotmcd(res,spec); end returnendif det(clascov) < exp(-50*p) % all observations lie on a hyperplane. [z, eigvl]=eigs(clcov,1,0,struct('disp',0)); res.plane=z; weights(1:n)=1; if cor correl=clcov./(sqrt(diag(clcov))*sqrt(diag(clcov))'); end [clcov,clmean]=trafo(clcov,clmean,med,mad,p); [raw.center,raw.cov,res.center,res.cov,raw.objective,raw.wt,res.flag,... res.method]=displ(1,n,weights,n,p,clmean,clcov,res.method,z./mad',ok,... raw.wt,res.flag,file,fid,cor,correl); if cor [res.cor,raw.cor]=deal(correl); end returnend% The classical case is now handled.if h==n if ~lts msg=sprintf('The MCD estimates based on %g observations are equal to the classical estimates.\n',h); res.method=strvcat(res.method,msg); end raw.center=clmean; raw.cov=clcov; raw.objective=det(clcov); mah=mahalanobis(data,clmean,clcov,n,p); res.mahalanobis=sqrt(mah); raw.robdist=res.mahalanobis; weights=mah <= chi2q(p); raw.wt=weights; [res.center,res.cov]=weightmecov(data,weights,n,p) if cor raw.cor=raw.cov./(sqrt(diag(raw.cov))*sqrt(diag(raw.cov))'); res.cor=res.cov./(sqrt(diag(res.cov))*sqrt(diag(res.cov))'); end if det(res.cov) < exp(-50*p) [center,covar,z,correl,plane,count]=fit(data,NaN,med,mad,p,z,cor,res.center,res.cov,n); res.plane=z; if cor correl=covar./(sqrt(diag(covar))*sqrt(diag(covar))'); end res.method=displrw(count,n,p,center,covar,res.method,file,z,fid,cor,correl); [raw.cov,raw.center]=trafo(raw.cov,raw.center,med,mad,p); [res.cov,res.center]=trafo(res.cov,res.center,med,mad,p); res.robdist=raw.robdist; else mah=mahalanobis(data,res.center,res.cov,n,p); weights=mah <= chi2q(p); [raw.cov,raw.center]=trafo(raw.cov,raw.center,med,mad,p); [res.cov,res.center]=trafo(res.cov,res.center,med,mad,p); res.robdist=sqrt(mah); end raw.objective=raw.objective*prod(mad)^2; res.flag=weights; if ~lts disp(res.method) end spec.ask=1; if ~lts plotmcd(res,spec); end returnendpercent=h/n;% If n >= 2*nmini the dataset will be divided into subdatasets. For n < 2*nmini the set % will be treated as a whole. if n >= 2*nmini maxobs=maxgroup*nmini; if n >= maxobs ngroup=maxgroup; group(1:maxgroup)=nmini; else ngroup=floor(n/nmini); minquan=floor(n/ngroup); group(1)=minquan; for s=2:ngroup group(s)=minquan+double(rem(n,ngroup)>=s-1); end end part=1; adjh=floor(group(1)*percent); nsamp=floor(ntrial/ngroup); minigr=sum(group); obsingroup=fillgroup(n,group,ngroup,seed,fid); % obsingroup : i-th row contains the observations of the i-th group. % The last row (ngroup+1-th) contains the observations for the 2nd stage % of the algorithm.else [part,group,ngroup,adjh,minigr,obsingroup]=deal(0,n,1,h,n,n); replow=[50,22,17,15,14,zeros(1,45)]; if n < replow(p) % All (p+1)-subsets will be considered. allsubset=1; perm=[1:p,p]; nsamp=nchoosek(n,p+1); else allsubset=0; nsamp=ntrial; end end % some further initialisations. csteps=csteps1;inplane=NaN;% tottimes : the total number of iteration steps.% fine : becomes 1 when the subdatasets are merged. % final : becomes 1 for the final stage of the algorithm. [tottimes,fine,final,prevdet]=deal(0);if part % bmean1 : contains, for the first stage of the algorithm, the means of the ngroup*10 % best estimates. % bcov1 : analogous to bmean1, but now for the covariance matrices. % bobj1 : analogous to bmean1, but now for the objective values. % coeff1 : if in the k-th subdataset there are at least adjh observations that lie on % a hyperplane then the coefficients of this plane will be stored in the % k-th column of coeff1. coeff1=repmat(NaN,p,ngroup); bobj1=repmat(inf,ngroup,10); bmean1=cell(ngroup,10); bcov1=cell(ngroup,10); [bmean1{:}]=deal(NaN); [bcov1{:}]=deal(NaN);end% bmean : contains the means of the ten best estimates obtained in the second stage of the% algorithm. % bcov : analogous to bmean, but now for the covariance matrices.% bobj : analogous to bmean, but now for the objective values.% coeff : analogous to coeff1, but now for the merged subdataset. % If the data is not split up, the 10 best estimates obtained after csteps1 iterations % will be stored in bmean, bcov and bobj.coeff=repmat(NaN,p,1);bobj=repmat(inf,1,10);bmean=cell(1,10);bcov=cell(1,10);[bmean{:}]=deal(NaN);[bcov{:}]=deal(NaN); seed=0;while final~=2 if fine | (~part & final) nsamp=10; if final adjh=h; ngroup=1; if n*p <= 1e+5 csteps=csteps3; elseif n*p <=1e+6 csteps=10-(ceil(n*p/1e+5)-2); else csteps=1; end if n > 5000 nsamp=1; end else adjh=floor(minigr*percent); csteps=csteps2; end end % found : becomes 1 if we have a singular intermediate MCD estimate. found=0; for k=1:ngroup if ~fine found=0; end for i=1:nsamp tottimes=tottimes+1; % ns becomes 1 if we have a singular trial subsample and if there are at % least adjh observations in the subdataset that lie on the concerning hyperplane. % In that case we don't have to take C-steps. The determinant is zero which is % already the lowest possible value. If ns=1, no C-steps will be taken and we % start with the next sample. If we, for the considered subdataset, haven't % already found a singular MCD estimate, then the results must be first stored in % bmean, bcov, bobj or in bmean1, bcov1 and bobj1. If we, however, already found % a singular result for that subdataset, then the results won't be stored % (the hyperplane we just found is probably the same as the one we found earlier. % We then let adj be zero. This will guarantee us that the results won't be % stored) and we start immediately with the next sample. adj=1; ns=0; % For the second and final stage of the algorithm the array sortdist(1:adjh) % contains the indices of the observations corresponding to the adjh observations % with minimal relative distances with respect to the best estimates of the % previous stage. An exception to this, is when the estimate of the previous % stage is singular. For the second stage we then distinguish two cases : % % 1. There aren't adjh observations in the merged set that lie on the hyperplane. % The observations on the hyperplane are then extended to adjh observations by % adding the observations of the merged set with smallest orthogonal distances % to that hyperplane. % 2. There are adjh or more observations in the merged set that lie on the % hyperplane. We distinguish two cases. We haven't or have already found such % a hyperplane. In the first case we start with a new sample. But first, we % store the results in bmean1, bcov1 and bobj1. In the second case we % immediately start with a new sample. % % For the final stage we do the same as 1. above (if we had h or more observations % on the hyperplane we would already have found it). if final if ~isinf(bobj(i)) meanvct=bmean{i}; covmat=bcov{i}; if bobj(i)==0 [dis,sortdist]=sort(abs(sum((data-repmat(meanvct,n,1))'.*repmat(coeff,1,n)))); else sortdist=mahal(data,meanvct,covmat,part,fine,final,k,obsingroup,group,... minigr,n,p); end else break; end elseif fine if ~isinf(bobj1(k,i)) meanvct=bmean1{k,i}; covmat=bcov1{k,i}; if bobj1(k,i)==0 [dis,ind]=sort(abs(sum((data(obsingroup{end},:)-repmat(meanvct,minigr,1))'.*repmat(coeff1(:,k),1,minigr)))); sortdist=obsingroup{end}(ind); if dis(adjh) < 1e-8 if found==0 obj=0; coeff=coeff1(:,k); found=1; else adj=0; end ns=1; end else sortdist=mahal(data,meanvct,covmat,part,fine,final,k,obsingroup,group,... minigr,n,p); end else break; end else % The first stage of the algorithm. % index : contains trial subsample. if ~part if allsubset k=p+1; perm(k)=perm(k)+1; while ~(k==1 |perm(k) <=(n-(p+1-k))) k=k-1; perm(k)=perm(k)+1; for j=(k+1):p+1 perm(j)=perm(j-1)+1; end end index=perm; else [index,seed]=randomset(n,p+1,seed); end else [index,seed]=randomset(group(k),p+1,seed); index=obsingroup{k}(index); end meanvct=mean(data(index,:)); covmat=cov(data(index,:)); if det(covmat) < exp(-50*p) % The trial subsample is singular. % We distinguish two cases : % % 1. There are adjh or more observations in the subdataset that lie % on the hyperplane. If the data is not split up, we have adjh=h and thus % an exact fit. If the data is split up we distinguish two cases. % We haven't or have already found such a hyperplane. In the first case % we check if there are more than h observations in the entire set % that lie on the hyperplane. If so, we have an exact fit situation. % If not, we start with a new trial subsample. But first, the % results must be stored bmean1, bcov1 and bobj1. In the second case % we immediately start with a new trial subsample. % % 2. There aren't adjh observations in the subdataset that lie on the % hyperplane. We then extend the trial subsample until it isn't singular % anymore. % eigvct : contains the coefficients of the hyperplane. [eigvct, eigvl]=eigs(covmat,1,0,struct('disp',0)); if ~part dist=abs(sum((data-repmat(meanvct,n,1))'.*repmat(eigvct,1,n))); else dist=abs(sum((data(obsingroup{k},:)-repmat(meanvct,group(k),1))'.*repmat(eigvct,1,group(k)))); end obsinplane=find(dist < 1e-8); % count : number of observations that lie on the hyperplane. count=length(obsinplane);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -