📄 chemometrics.py
字号:
Martens,H; Naes,T: Multivariate Calibration, Wiley: New York, 1989
"""
X = copy.deepcopy(myarray)
if type == 'covar':
myarray = meancent(myarray)
elif type == 'corr':
myarray = autoscale(myarray)
arr_size = myarray.shape
tt, pp, i = scipy.zeros((arr_size[0],comps),'d'), scipy.zeros((comps,arr_size[1]),'d'), 0
while i < comps:
std = scipy.stats.std(myarray,axis=0)
st2 = scipy.argsort(std)
ind = st2[arr_size[1]-1,]
t0 = myarray[:,ind]
c = 0
while c == 0: #NIPALS
p0 = scipy.dot(scipy.transpose(t0),myarray)
p1 = p0/scipy.sqrt(scipy.sum(p0**2))
t1 = scipy.dot(myarray,scipy.transpose(p1))
if scipy.sqrt(scipy.sum(t1**2)) - scipy.sqrt(scipy.sum(t0**2)) < 5*10**-5:
tt[:,i] = t1
pp[i,:] = p1
c = 1
t0 = t1
myarray = myarray - scipy.dot(scipy.resize(t1,(arr_size[0],1)),scipy.resize(p1,(1,arr_size[1])))
i += 1
## print 'PC ',i
#report progress to status bar
if stb is not None:
stb.SetStatusText(string.join(('Principal component',str(i)),' '),0)
# work out percentage explained variance
X = meancent(X)
s0, s = scipy.sum(scipy.sum(X**2)), []
for n in scipy.arange(1,comps+1,1):
E = X - scipy.dot(tt[:,0:n],pp[0:n,:])
s.append(scipy.sum(scipy.sum(E**2)))
pr = (1-((scipy.asarray(s)/s0)))*100
pr = scipy.reshape(pr,(1,len(pr)))
pr = scipy.concatenate((scipy.array([[0.0]]),pr),1)
pr = scipy.reshape(pr,(pr.shape[1],))
eigs = scipy.array(s)
if stb is not None:
stb.SetStatusText('Status',0)
return tt,pp,pr[:,scipy.NewAxis],eigs[:,scipy.NewAxis]
def DFA(X,group,nofac):
"""Discriminant function analysis
Manly, B.F.J. Multivariate Statistical Methods: A Primer,
2nd Ed, Chapman & Hall: New York, 1986
"""
# generate T and W matrices
T,W = __TW__(X,group)
B = T-W
P = scipy.dot(scipy.linalg.inv(W),B)
eigval,eigvec = scipy.linalg.eig(P)
eigval = eigval.real
eigvec = eigvec
## print eigvec
## print
# Sort eigenvectors w.r.t eigenvalues
order = __flip__(scipy.argsort(eigval))
eigval = __flip__(scipy.sort(eigval))
# V - canonical variates directions
V = scipy.take(eigvec,order[0:nofac].tolist(),1)
## # scale eigenvalues to unit variance
## v,vi = sort(transpose(V),0), transpose(argsort(transpose(V),0))
## Vv = transpose(V)
## for x in range(nofac):
## v[:,x] = Vv[__flip__(vi[:,x]),x]
## vs = transpose(diagonal(dot(dot(v,W),transpose(v))))
## vs = vs/(X.shape[0]-__max__(group))
## V = V/resize(sqrt(vs),(V.shape))
#DF scores
## print V.shape
## print
## print V
## print
U = scipy.dot(scipy.dot(X,V),__diag__(eigval[0:nofac])).real
return U,V.real,eigval[0:nofac]
def PLS(xdata,ydata,mask,names,factors,stb=None):
"""PLS1 for modelling a single Y-variable and
PLS2 for several Y-variables
Martens,H; Naes,T: Multivariate Calibration, Wiley: New York, 1989
"""
x1,x2,x3,y1,y2,y3,n1,n2,n3 = __split__(xdata,ydata,names,mask) # raw data
Xm,Xmv,Xmt = __mean__(x1),__mean__(x2),__mean__(x3) # get column means
ym,ymv,ymt = __mean__(y1),__mean__(y2),__mean__(y3)
x, y = meancent(xdata), meancent(ydata) # centre the data
# split into training, cross-validation & test
train_x,cval_x,test_x,train_y,cval_y,test_y,tr_nm,cv_nm,ts_nm=__split__(x,y,names,mask)
X,Xv,Xt = train_x,cval_x,test_x
y,yv,yt = train_y,cval_y,test_y
rmsec,rmsepc,bout = [],[],[]
NoY = ydata.shape[1]
u = y
for x in range(0,factors,1):
t0,opt = 0,0
if NoY > 1 and x == 0: #PLS2
u = scipy.reshape(y[:,scipy.argsort(scipy.sum(y1**2))[0]],(y.shape[0],1))
while opt == 0:
# for training
c = scipy.dot(scipy.dot(scipy.dot(scipy.transpose(u),X),scipy.transpose(X)),u)**-0.5 #scaling factor
w = c*scipy.dot(scipy.transpose(X),u) #vector of loading weights, w'w = 1
t = scipy.dot(X,w) # spectral scores
p = scipy.dot(scipy.transpose(X),t)*scipy.linalg.inv(scipy.dot(scipy.transpose(t),t)) # spectral loadings
q = scipy.dot(scipy.transpose(u),t)*scipy.linalg.inv(scipy.dot(scipy.transpose(t),t)) # chemical loading
if NoY == 1: #PLS1
opt = 1
elif float(scipy.reshape(scipy.sum(abs(t-t0)),())) > 5*10**-5: #PLS2 - check for convergence
u = scipy.dot(scipy.dot(u,q),scipy.linalg.inv(scipy.dot(scipy.transpose(q),q)))
t0 = t
else:
opt = 1
X = X - scipy.dot(t,scipy.transpose(p)) # compute residuals of X which are also X for next iteration
u = u - scipy.dot(t,q) # compute residuals of y which are also y for next iteration
if x == 0:
W,T,P,Q = w,t,p,q
else:
W = scipy.concatenate((W,w),1)
T = scipy.concatenate((T,t),1)
P = scipy.concatenate((P,p),1)
Q = scipy.concatenate((Q,q),0)
b=scipy.dot(scipy.dot(W,scipy.linalg.inv(scipy.dot(scipy.transpose(P),W))),Q) # 882*1
# rms for training data - rmsec
if NoY == 1:
b0 = ym - scipy.dot(Xm,b)
predy = b0 + scipy.dot(x1,b)
rmsec.append(float(__rms__(predy,y1)))
elif NoY > 1:
predy = scipy.zeros(y1.shape)
avrmsec = 0
for eachy in range(0,NoY,1):
b0 = __mean__(y1[:,eachy]) - scipy.dot(Xm,b)
predy[:,eachy] = scipy.reshape(b0 + scipy.dot(x1,b),(y1.shape[0],))
avrmsec = avrmsec + float(__rms__(predy[:,eachy],y1[:,eachy]))
rmsec.append(avrmsec/NoY)
# cross validation prediction
if NoY == 1:
b1 = ymv - scipy.dot(Xmv,b)
predyv = b1 + scipy.dot(x2,b)
rmsepc.append(float(__rms__(predyv,y2)))
elif NoY > 1:
predyv = scipy.zeros(y2.shape)
avrmsec = 0
for eachy in range(NoY):
b1 = __mean__(y2[:,eachy]) - scipy.dot(Xmv,b)
predyv[:,eachy] = scipy.reshape(b1 + scipy.dot(x2,b),(y2.shape[0],))
avrmsec = avrmsec + float(__rms__(predyv[:,eachy],y2[:,eachy]))
rmsepc.append(avrmsec/NoY)
#report progress to status bar
if stb is not None:
stb.SetStatusText(string.join(('Extracting factor...',str(x+1)),' '),0)
# work out number of factors to use by finding the min of
# the rmsep cross validation - exception for use in GA
facs = ind = scipy.argsort(rmsepc)[0]
# return final rms values
RMSEC,RMSEPC = rmsec[ind],rmsepc[ind]
b=scipy.dot(scipy.dot(W[:,0:ind+1],scipy.linalg.inv(scipy.dot(scipy.transpose(P[:,0:ind+1]),W[:,0:ind+1]))),Q[0:ind+1])
if NoY == 1:
b0 = ym - scipy.dot(Xm,b)
predy = b0 + scipy.dot(x1,b)
b1 = ymv - scipy.dot(Xmv,b)
predyv = b1 + scipy.dot(x2,b)
b2 = ymt - scipy.dot(Xmt,b)
predyt = b2 + scipy.dot(x3,b)
RMSEPT = float(__rms__(predyt,y3))
elif NoY > 1:
predyt = scipy.zeros(y3.shape)
avrmsec = 0
for eachy in range(0,NoY,1):
b2 = __mean__(y3[:,eachy]) - scipy.dot(Xmt,b)
predyt[:,eachy] = scipy.reshape(b2 + scipy.dot(x3,b),(y3.shape[0],))
avrmsec = avrmsec + float(__rms__(predyt[:,eachy],y3[:,eachy]))
RMSEPT = avrmsec/NoY
if stb is not None:
stb.SetStatusText('Status',0)
return W,T,P,Q,facs,predy,predyv,predyt,RMSEC,RMSEPC,rmsec,rmsepc,RMSEPT
def DFA_XVALRAW(X,group,names,mask,nofac):
"""Perform DFA with full cross validation
"""
if max(mask) > 1:
x1,x2,x3,y1,y2,y3,n1,n2,n3=__split__(X,group,names,mask)
elif max(mask) < 2:
x1,x2,y1,y2,n1,n2=__split__(X,group,names,mask)
#train
trscores,loads,eigs = DFA(x1,y1,nofac)
#cross validation
cvscores = scipy.dot(scipy.dot(x2,loads),__diag__(eigs))
#independent test
if max(mask) > 1:
tstscores = scipy.dot(scipy.dot(x3,loads),__diag__(eigs))
names = scipy.concatenate((n1,n2,n3),0)
groups = scipy.concatenate((y1,y2,y3),0)
scores = scipy.concatenate((trscores,cvscores,tstscores),0)
mask = scipy.concatenate((scipy.zeros((len(y1),1)),scipy.ones((len(y2),1)),(scipy.ones((len(y3),1))*2)),0)
else:
names = scipy.concatenate((n1,n2),0)
groups = scipy.concatenate((y1,y2),0)
scores = scipy.concatenate((trscores,cvscores),0)
mask = scipy.concatenate((scipy.zeros((len(y1),1)),scipy.ones((len(y2),1))),0)
return scores,groups,names,mask,loads,eigs
def DFA_XVAL(RawX,pca,noloads,group,names,mask,nofac,ptype='covar'):
"""Perform DFA with full cross validation
"""
if max(mask) > 1:
rx1,rx2,rx3,ry1,ry2,ry3,rn1,rn2,rn3=__split__(RawX,group[:,scipy.NewAxis],names,mask[:,scipy.NewAxis])
elif max(mask) < 2:
rx1,rx2,ry1,ry2,rn1,rn2=__split__(RawX,group[:,scipy.NewAxis],names,mask[:,scipy.NewAxis])
if pca == 'SVD':
pcscores,pp,pr,pceigs = PCA_SVD(rx1,type=ptype)
elif pca == 'NIPALS':
pcscores,pp,pr,pceigs = PCA_NIPALS(rx1,noloads,type=ptype)
#get indices
idxn = scipy.arange(RawX.shape[0])[:,scipy.NewAxis]
tr_idx = scipy.take(idxn,_index(mask,0),0)
cv_idx = scipy.take(idxn,_index(mask,1),0)
#train
trscores,loads,eigs = DFA(pcscores[:,0:noloads],ry1,nofac)
#cross validation
#Get projected pc scores
rx2 = rx2-scipy.resize(scipy.stats.mean(rx2,0),(len(rx2),rx1.shape[1]))
pcscores = scipy.dot(rx2,scipy.transpose(pp))
cvscores = scipy.dot(scipy.dot(pcscores[:,0:noloads],loads),__diag__(eigs)).real
#independent test
if max(mask) > 1:
ts_idx = scipy.take(idxn,_index(mask,2),0)
rx3 = rx3-scipy.resize(scipy.stats.mean(rx3,0),(len(rx3),rx1.shape[1]))
pcscores = scipy.dot(rx3,scipy.transpose(pp))
tstscores = scipy.dot(scipy.dot(pcscores[:,0:noloads],loads),__diag__(eigs)).real
scores = scipy.zeros((RawX.shape[0],nofac),'d')
tr_idx = scipy.reshape(tr_idx,(len(tr_idx),)).tolist()
cv_idx = scipy.reshape(cv_idx,(len(cv_idx),)).tolist()
ts_idx = scipy.reshape(ts_idx,(len(ts_idx),)).tolist()
_put(scores,tr_idx,trscores)
_put(scores,cv_idx,cvscores)
_put(scores,ts_idx,tstscores)
else:
scores = scipy.concatenate((trscores,cvscores),0)
tr_idx = scipy.reshape(tr_idx,(len(tr_idx),)).tolist()
cv_idx = scipy.reshape(cv_idx,(len(cv_idx),)).tolist()
_put(scores,tr_idx,trscores)
_put(scores,cv_idx,cvscores)
return scores,loads,eigs
def OLS(act,pred):
"""Ordinary least squares regression"""
act = scipy.reshape(act,(len(act),1))
gradient = scipy.sum((act-__mean__(act))*(pred-__mean__(pred)))/(sum((act-__mean__(act))**2))
yintercept = __mean__(act)-(gradient*__mean__(act))
mserr = pred-__mean__(pred)
rmserr = __rms__(act,pred)
gerr = scipy.sqrt(rmserr**2/scipy.sum((act-__mean__(act))**2))
ierr = scipy.sqrt((rmserr**2)*((1/len(act))+((__mean__(act)**2)/scipy.sum((act-__mean__(act))**2))))
return gradient,yintercept,mserr,rmserr,gerr,ierr
if __name__=="__main__":
import chemometrics,doctest
doctest.testmod(chemometrics,verbose=True)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -