📄 chemometrics.py
字号:
"""Chemometrics toolbox for Python
Includes:
-Partial least squares regression (PLS1 and PLS2)
-Principal components analysis
-Discriminant function analysis (DFA)
$Id: chemometrics.py Copyright (C) 2005 Roger Jarvis
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
import string, scipy, copy
from process import meancent
from process import autoscale
def __fdot__(a,b):
"""Dot product for large arrays, faster than numarrays dot
depending on the spec of the computer being used"""
product = scipy.zeros((a.shape[0],b.shape[1]),'d')
r = 0
while r < a.shape[0]:
product[r,:] = scipy.sum(scipy.reshape(a[r,:],(a.shape[1],1))*b,0)
r = r+1
return product
def __mean__(a, axis=0):
"""Find the mean of 2D array along axis = 0 or 1
default axis is 0
"""
return scipy.sum(a,axis)/a.shape[axis]
def __std__(a):
"""Find the standard deviation of 2D array
along axis = 0
"""
m = __mean__(a,0)
m = scipy.resize(m,(a.shape[0],a.shape[1]))
return scipy.sqrt(scipy.sum((a-m)**2,0)/(a.shape[0]-1))
def __diag__(a):
"""Transform vector to diagonal matrix
"""
d = scipy.zeros((len(a),len(a)),'d')
for i in range(len(a)):
d[i,i] = a[i]
return d
def __flip__(a, axis=0):
"""Reverse order of array elements along axis 0 or 1
"""
if axis == 0:
axa,axb = 0,1
elif axis == 1:
axa,axb = 1,0
ind = []
for x in range(0,a.shape[axa],1):
if x == 0:
ind.append(a.shape[axa]-1)
else:
ind.append(a.shape[axa]-1-x)
b = scipy.zeros((a.shape),'d')
for x in range(a.shape[axa]):
if axis == 0:
b[x] = a[ind[x]]
elif axis == 1:
b[:,x] = a[:,ind[x]]
return b
def __rms__(pred,act):
"""Calculate the root mean squared error of prediction
"""
return scipy.reshape(scipy.sqrt(scipy.sum((act-pred)**2)/act.shape[0]),())
def __min__(x, axis=0):
"""find min of 2d array x along axis 0 or 1
"""
s = scipy.sort(x,axis)
return scipy.reshape(s[0],())
def __max__(x, axis=0):
"""find min of 2d array x along axis 0 or 1
"""
s = scipy.sort(x,axis)
return scipy.reshape(s[x.shape[0]-1],())
def __slice__(x,index,axis=0):
"""for slicing arrays
"""
if axis == 0:
slice = scipy.reshape(x[:,int(index[0])],(x.shape[0],1))
for n in range(1,len(index),1):
slice = scipy.concatenate((slice,scipy.reshape(x[:,int(index[n])],(x.shape[0],1))),1)
elif axis == 1:
slice = scipy.reshape(x[int(index[0]),:],(1,x.shape[1]))
for n in range(1,len(index),1):
slice = scipy.concatenate((slice,scipy.reshape(x[int(index[n]),:],(1,x.shape[1]))),0)
return slice
def __split__(xdata, ydata, names, mask):
"""Splits x and y inputs into training, cross validation (and
independent test groups) for use with modelling algorithms.
If max(mask)==2 return x1,x2,x3,y1,y2,y3,n1,n2,n3 else if max(mask)==1
return x1,x2,y1,y2,n1,n2
"""
x1 = scipy.take(xdata,_index(mask,0),0)
x2 = scipy.take(xdata,_index(mask,1),0)
y1 = scipy.take(ydata,_index(mask,0),0)
y2 = scipy.take(ydata,_index(mask,1),0)
n1,n2 = [],[]
for i in range(len(names)):
if mask[i] == 0:
n1.append(names[i])
elif mask[i] == 1:
n2.append(names[i])
if max(mask) == 1:
return x1,x2,y1,y2,n1,n2
elif max(mask) == 2:
x3 = scipy.take(xdata,_index(mask,2),0)
y3 = scipy.take(ydata,_index(mask,2),0)
n3 = []
for i in range(len(names)):
if mask[i] == 2:
n3.append(names[i])
return x1,x2,x3,y1,y2,y3,n1,n2,n3
def __TW__(X,group):
"""Generate T and W matrices for DFA
"""
T,W = scipy.zeros((X.shape[1],X.shape[1]),'d'),scipy.zeros((X.shape[1],X.shape[1]),'d')
mx = scipy.stats.mean(X,0)[scipy.NewAxis,:]
## group = scipy.reshape(group,(group.shape[0],))
for x in range(1,max(group)+1):
idx = _index(group,int(x))
meani = scipy.stats.mean(scipy.take(X,idx,0),0)[scipy.NewAxis,:]
meani = scipy.resize(meani,(len(idx),X.shape[1]))
mxi = scipy.resize(mx,(len(idx),X.shape[1]))
A = scipy.take(X,idx,0) - mxi
C = scipy.take(X,idx,0) - meani
T = T + scipy.dot(scipy.transpose(A),A)
W = W + scipy.dot(scipy.transpose(C),C)
return T,W
def __adj__(a):
"""Adjoint of a"""
div,mod = divmod(a.shape[1], 2)
adj = scipy.zeros((a.shape),'d')
for p in range(0,a.shape[0],1):
for q in range(0,a.shape[1],1):
if mod == 0:
if p < a.shape[1]/2:
if q < a.shape[1]/2:
adj[p,q] = (a[p+1,q+1]*a[p+2,q+2])-(a[p+1,q+2]*a[p+2,q+1])
if q >= a.shape[1]/2:
adj[p,q] = (a[p+1,q-2]*a[p+2,q-1])-(a[p+1,q-1]*a[p+2,q-2])
if p >= a.shape[1]/2:
if q < a.shape[1]/2:
adj[p,q] = (a[p-2,q+1]*a[p-1,q+2])-(a[p-2,q+2]*a[p-1,q+1])
if q >= a.shape[1]/2:
adj[p,q] = (a[p-2,q-2]*a[p-1,q-1])-(a[p-2,q-1]*a[p-1,q-2])
if mod != 0:
if p < a.shape[1]/2:
if q < a.shape[1]/2:
adj[p,q] = (a[p+1,q+1]*a[p+2,q+2])-(a[p+1,q+2]*a[p+2,q+1])
if q > a.shape[1]/2:
adj[p,q] = (a[p+1,q-2]*a[p+2,q-1])-(a[p+1,q-1]*a[p+2,q-2])
if q == a.shape[1]/2:
adj[p,q] = (a[p+1,q-1]*a[p+2,q+1])-(a[p+1,q+1]*a[p+2,q-1])
if p > a.shape[1]/2:
if q < a.shape[1]/2:
adj[p,q] = (a[p-2,q+1]*a[p-1,q+2])-(a[p-2,q+2]*a[p-1,q+1])
if q > a.shape[1]/2:
adj[p,q] = (a[p-2,q-2]*a[p-1,q-1])-(a[p-2,q-1]*a[p-1,q-2])
if q == a.shape[1]/2:
adj[p,q] = (a[p-2,q-1]*a[p-1,q+1])-(a[p-2,q+1]*a[p-1,q-1])
if p == a.shape[1]/2:
if q < a.shape[1]/2:
adj[p,q] = (a[p-1,q+1]*a[p+1,q+2])-(a[p-1,q+2]*a[p+1,q+1])
if q > a.shape[1]/2:
adj[p,q] = (a[p-1,q-2]*a[p+1,q-1])-(a[p-1,q-1]*a[p+1,q-2])
if q == a.shape[1]/2:
adj[p,q] = (a[p-1,q-1]*a[p+1,q+1])-(a[p-1,q+1]*a[p+1,q-1])
for m in range(1,adj.shape[0]+1,1):
for n in range(1,adj.shape[1]+1,1):
if divmod(m,2)[1] != 0:
if divmod(n,2)[1] == 0:
adj[m-1,n-1] = adj[m-1,n-1]*-1
elif divmod(m,2)[1] == 0:
if divmod(n,2)[1] != 0:
adj[m-1,n-1] = adj[m-1,n-1]*-1
return scipy.transpose(adj)
def __inverse__(a):
"""Inverse of a"""
d = scipy.linalg.det(a)
if d == 0:
d = 0.001
return __adj__(a)/d
def _index(y,num):
"""use this to get tuple index for take"""
idx = []
for i in range(y.shape[0]):
if y[i] == num:
idx.append(i)
return tuple(idx)
def _put(a,ind,v):
"""a pvt put function"""
c = 0
for each in ind:
a[each,:] = v[c,:]
c += 1
return a
def _sample(x,N):
"""randomly select N samples from x"""
select = []
while len(select) < N:
a = int(scipy.rand(1)*float(x))
if a not in select:
select.append(a)
return select
def PCA_SVD(myarray,type='covar'):
"""Run principal components analysis (PCA) by singular
value decomposition (SVD)
Formerly SP_PCA_SVD
>>> a = scipy.array([[1,2,3],[0,1,1.5],[-1,-6,34],[8,15,2]])
>>> a
array([[ 1. , 2. , 3. ],
[ 0. , 1. , 1.5],
[ -1. , -6. , 34. ],
[ 8. , 15. , 2. ]])
>>> # There are four samples, with three variables each
>>> tt,pp,pr,eigs = SP_PCA_SVD(a)
>>> tt
array([[ 5.86463567e+00, -4.28370443e+00, 1.46798845e-01],
[ 6.65979784e+00, -6.16620433e+00, -1.25067331e-01],
[ -2.56257861e+01, 1.82610701e+00, -6.62877855e-03],
[ 1.31013526e+01, 8.62380175e+00, -1.51027354e-02]])
>>> pp
array([[ 0.15026487, 0.40643255, -0.90123973],
[ 0.46898935, 0.77318935, 0.4268808 ],
[ 0.87032721, -0.48681703, -0.07442934]])
>>> # This is the 'rotation matrix' - you can imagine colm names
>>> # of PC1, PC2, PC3 and row names of variable1, variable2, variable3.
>>> pr
array([[ 97.1073744 ],
[ 98.88788958],
[ 99.98141011]])
>>> eigs
array([[ 30.11765617],
[ 11.57915467],
[ 0.1935556 ]])
>>> a
array([[ 1. , 2. , 3. ],
[ 0. , 1. , 1.5],
[ -1. , -6. , 34. ],
[ 8. , 15. , 2. ]])
"""
newarray = copy.deepcopy(myarray)
if type=='covar':
newarray = meancent(newarray)
elif type=='corr':
newarray = autoscale(newarray)
else:
raise KeyError, "'type' must be one of 'covar or 'corr'"
# I think this may run faster if myarray is converted to a matrix first.
# (This should be tested - anyone got a large dataset?)
# mymat = scipy.mat(myarray)
u,s,v = scipy.linalg.svd(newarray)
tt = scipy.dot(newarray,scipy.transpose(v))
pp = v
pr = (1-(s/scipy.sum(scipy.sum(newarray**2))))*100
pr = scipy.reshape(pr,(1,len(pr)))
pr = scipy.concatenate((scipy.array([[0.0]]),pr),1)
pr = scipy.reshape(pr,(pr.shape[1],))
eigs = s
return tt,pp,pr[:,scipy.NewAxis],eigs[:,scipy.NewAxis]
def PCA_NIPALS(myarray,comps,type='covar',stb=None):
"""Run principal components analysis (PCA) using NIPALS
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -