📄 stat.hpp

📁 orange源码数据挖掘技术
💻 HPP
📖 第 1 页 / 共 3 页
字号:
               vector<int> &counts, T &min, T &binsize, int &extrapoints,
               int numbins=10)
{ histogram(flist, counts, min, binsize, extrapoints, numbins);
  for(int i=1; i<numbins; counts[i]+=counts[i-1], i++);
}


template<class T>
void cumfreq  (const vector<T> &flist,
               vector<int> &counts, T &rmin, T &binsize, int &extrapoints,
               const T &min, const T &max, int numbins=10)
{ histogram(flist, counts, rmin, binsize, extrapoints, min, max, numbins);
  for(int i=1; i<numbins; counts[i]+=counts[i-1], i++);
}



template<class T>
void relfreq  (const vector<T> &flist,
               vector<double> &counts, T &min, T &binsize, int &extrapoints,
               int numbins=10)
{ DEFINE_TYPENAME
  vector<int> hcounts;
  histogram(flist, hcounts, min, binsize, extrapoints, numbins);
  counts.clear();
  double ls=flist.size();
  ITERATE(vector<int>, ii, hcounts)
    counts.push_back(*ii/ls);
}

template<class T>
void relfreq  (const vector<T> &flist,
               vector<double> &counts, T &rmin, T &binsize, int &extrapoints,
               const T &min, const T &max, int numbins=10)
{ DEFINE_TYPENAME
  vector<int> hcounts;
  histogram(flist, hcounts, rmin, binsize, extrapoints, min, max, numbins);
  counts.clear();
  double ls=flist.size();
  ITERATE(vector<int>, ii, hcounts)
    counts.push_back(*ii/ls);
}


/* *********** VARIABILITY ************/

template<class T>
T samplevar(const vector<T> &flist)
{ DEFINE_TYPENAME
  if (!flist.size())
    throw StatException("samplevar: empty list");

  T me=mean(flist);
  T sum=0.0;
  const_SITERATE(fi, flist)
    sum+=sqr(*fi-me);
  return sum/flist.size();
}


template<class T>
T samplestdev(const vector<T> &flist)
{ return sqrt(samplevar(flist)); }


template<class T>
T var(const vector<T> &flist)
{ DEFINE_TYPENAME
  if (flist.size()<2)
    throw StatException("samplevar: empty or one-element list");

  T me=mean(flist);
  T sum=0.0;
  const_SITERATE(fi, flist)
    sum+=sqr(*fi-me);
  return sum/(flist.size()-1);
}


template<class T>
T stdev(const vector<T> &flist)
{ return sqrt(var(flist)); }


template<class T>
T sterr(const vector<T> &flist)
{ return stdev(flist)/sqrt(T(flist.size())); }


template<class T>
T z(const vector<T> &flist, const T &x)
{ return (x-mean(flist))/samplestdev(flist); }


template<class T>
bool zs(const vector<T> &flist, vector<T> &zss)
{ DEFINE_TYPENAME
  T me=mean(flist), ss=samplestdev(flist);
  zss=vector<T>(flist.size());
  iterator zi(zss.begin());
  const_SITERATE(fi, flist)
    *(zi++)=(*fi-me)/ss;
  return true;
}



/* *********** TRIMMING FNCS ************/

template<class T>
void trimboth (const vector<T> &flist, double proportion, vector<T> &clist)
{ int tocut=int(flist.size()*proportion);
  if (tocut*2>flist.size())
    throw StatException("trim proportion too large");

  clist=vector<T>(flist.begin()+tocut, flist.end()-tocut);
}


template<class T>
void trim1 (const vector<T> &flist, double proportion, vector<T> &clist, bool right=true)
{ int tocut=int(flist.size()*proportion);
  if (tocut>flist.size())
    throw StatException("trim proportion too large");

  if (right)
    clist=vector<T>(flist.begin(), flist.end()-tocut);
  else
    clist=vector<T>(flist.begin()+tocut, flist.end());
}



/* *********** PROBABILITY CALCS ************/

template<class T>
T gammln(const T &xx)
{
    static T cof[6] = {76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, -0.536382e-5};

    T x=xx, y=xx, tmp=x+5.5;
    tmp-=(x+0.5)*log(tmp);
    T ser=1.000000000190015;
    for(int j=0; j<6; j++)
     ser+=cof[j]/++y;
    return -tmp + log(T(2.5066282746310005)*ser/x);
}


template<class T>
T gammcf(const T &a, const T &x, T &gln)
{ const T FPMIN(1.0e-30);
  const int ITMAX= 100;
  const T EPS=3.0e-7;

  gln = gammln(a);
  T b=x+1.0-a;
  T c=T(1.0)/FPMIN;
  T d=T(1.0)/b;
  T h=d;
  for(int i=1; i<=ITMAX; i++) {
    T an=(a-double(i))*i;
    b += 2.0;
    d=an*d+b;
    if (abs(d) < FPMIN) d=FPMIN;
    c=b+an/c;
    if (abs(c) < FPMIN) c=FPMIN;
    d=T(1.0)/d;
    T del=d*c;
    h *= del;
    if (abs(del-1.0) < EPS)
      return exp(-x+a*log(x)-gln)*h;
  }

  throw StatException("gcf: a too large, ITMAX too small");
}


template<class T>
T gammser(const T &a, const T &x, T &gln)
{ const int ITMAX=100;
  const T EPS=3.0e-7;
  gln=gammln(a);
  if (x<=0.0)
    throw StatException("gser: negative x");

  T ap=a;
  T sum=T(1.0)/a;
  T del=sum;
  for(int n=1; n<=ITMAX; n++) {
    ++ap;
    del *= x/ap;
    sum += del;
    if (abs(del) < fabs(sum)*EPS)
      return sum*exp(-x+a*log(x)-gln);
  }
      
  throw StatException("gcf: a too large, ITMAX too small");
}
   

template<class T>
T gammp(const T &a, const T &x)
{ if ((x<0.0) || (a<=0.0))
    throw StatException("gammp: invalid arguments");

  T gln;
  return (x<(a+1.0)) ? gammser(a, x, gln) : -gammcf(a, x, gln)+1.0;
}


template<class T>
T gammq(const T &a, const T &x)
{ if ((x<0.0) || (a<=0.0))
    throw StatException("gammp: invalid arguments");

  T gln;
  return (x<(a+1.0)) ? -gammser(a, x, gln)+1.0 : gammcf(a, x, gln);
}

      
template<class T>
T erf(const T &x)
{ return (x<0.0) ? -gammp(T(0.5), x*x) : gammp(T(0.5), x*x); }


template<class T>
T erfc(const T &x)
{ return (x<0.0) ? T(1.0) + gammp(T(0.5), x*x) : gammq(T(0.5), x*x); }


template<class T>
T chisqprob(const T &x, const T &df)
{ return gammq(df*0.5, x*0.5);
}



template<class T>
T betacf(const T &a, const T &b, const T &x)
{   const int ITMAX = 200;
    const double EPS = 3.0e-7;

    T bm=1.0, az=1.0, am=1.0;

    T qab=a+b;
    T qap=a+1.0;
    T qam=a-1.0;
    T bz= -qab*x/qap + 1.0;
    for(int i=0; i<=ITMAX; i++) {
	    T em = i+1;
	    T tem = em*2;
	    T d = em*(b-em)*x/((qam+tem)*(a+tem));
	    T ap = az + d*am;
	    T bp = bz+d*bm;
	    d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem));
	    T app = ap+d*az;
	    T bpp = bp+d*bz;
	    T aold = az;
	    am = ap/bpp;
	    bm = bp/bpp;
	    az = app/bpp;
	    bz = 1.0;
	    if (abs(az-aold)<(fabs(az)*EPS))
	      return az;
    }

    throw StatException("betacf: a or b too big, or ITMAX too small.");
    return -1.0;
}

template<class T>
T betai(const T &a, const T &b, const T &x)
{ if ((x<0.0) || (x>1.0))
    throw StatException("betai: bad x");

  T bt= ((x==0.0) || (x==1.0)) ? 0.0 : exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(- x + 1.0));
  return (x < (a+1.0)/(a+b+2.0)) ? bt*betacf(a,b,x)/a : -bt*betacf(b,a, -x+1.0)/b + 1.0;
}


template<class T>
T zprob(const T &z)
{ const T Z_MAX = 6.0;
  
  T x;

  if (z == 0.0)
    x = 0.0;

  else {
    T y = abs(z) * 0.5;
    if (y >= Z_MAX*0.5)
      x = 1.0;
    else if (y < 1.0) {
      T w = sqr(y);
	    x = ((((((((w * 0.000124818987
                  -0.001075204047) * w +0.005198775019) * w
                -0.019198292004) * w +0.059054035642) * w
              -0.151968751364) * w +0.319152932694) * w
            -0.531923007300) * w +0.797884560593) * y * 2.0;
    }
	  else {
	    y -= 2.0;
	    x = (((((((((((((y * -0.000045255659
                       +0.000152529290) * y -0.000019538132) * y
                     -0.000676904986) * y +0.001390604284) * y
                   -0.000794620820) * y -0.002034254874) * y
                 +0.006549791214) * y -0.010557625006) * y
               +0.011630447319) * y -0.009279453341) * y
             +0.005353579108) * y -0.002141268741) * y
           +0.000535310849) * y +0.999936657524;
    }
  }

  return (z > 0.0) ? (x+1.0)*0.5 : (-x+1.0)*0.5;
}


inline double fprob(const int &dfnum, const int &dfden, const double &F)
{ return betai(dfden*0.5, dfnum*0.5, dfden/(dfden+dfnum*F)); }


template<class T>
T erfcc(const T& x)
{ T z = abs(x);
  T t = T(1.0) / (z*0.5+1.0);
  T ans = t * exp(-z*z-1.26551223 + t*(t*(t*(t*(t*(t*(t*(t*(t*0.17087277-0.82215223)+1.48851587)-1.13520398)+0.27886807)-0.18628806)+0.9678418)+0.37409196)+1.00002368));
  return (x >= 0.0) ?  ans : -ans +2.0;
}


/* *********** RANDOM NUMBERS ***************/

template<class T>
T gasdev(const T &mean, const T &dev)
{ float r, v1, v2;
  do {
    v1=float(rand())/RAND_MAX*2-1;
    v2=float(rand())/RAND_MAX*2-1;
    r=v1*v1+v2*v2;
  } while ((r>1.0) || (r<0.0));

  return mean + dev * v1 * sqrt(T(-2.0*log(r)/r));
}

template<class T, class RF>
T gasdev(const T &mean, const T &dev, RF &randfunc)
{ float r, v1, v2;
  do {
    v1=randfunc(-1.0, 1.0);
    v2=randfunc(-1.0, 1.0);
    r=v1*v1+v2*v2;
  } while ((r>1.0) || (r<0.0));

  return mean + dev * v1 * sqrt(-2.0*log(r)/r);
}


/* *********** CORRELATION FNCS ************/

template<class T>
T pearsonr(const vector<T> flist1, const vector<T> &flist2, T &probrs)
{ DEFINE_TYPENAME

  const T TINY=T(1.0e-30);

  if (flist1.size() != flist2.size())
    throw StatException("pearsonr: lists of different sizes");

  double n=flist1.size();
  T sumx=0.0, sumx2=0.0, sumy=0.0, sumy2=0.0, summult=0.0;
  for(const_iterator i1(flist1.begin()), i2(flist2.begin()), e1(flist1.end()); i1!=e1; i1++, i2++) {
    sumx += *i1; sumx2 += sqr(*i1);
    sumy += *i2; sumy2 += sqr(*i2);
    summult += *i1 * *i2;
  }
  T r_num = summult * n - sumx * sumy;
  T r_den = sqrt( (sumx2*n - sqr(sumx) ) * (sumy2*n - sqr(sumy)) );
  T r = r_num / r_den;
  T df=n-2;
  T t = r * sqrt( df / ( (-r+1.0+TINY)*(r+1.0+TINY) ) );
  probrs=betai(df*0.5, T(0.5), df/(df+t*t));
  return r;
}


template<class T>
double spearmanr(const vector<T> &flist1, const vector<T> &flist2, double &probrs)
{ if (flist1.size() != flist2.size())
    throw StatException("spearmanr: lists of different sizes");

  double n=flist1.size();
  vector<double> ranks1, ranks2;
  rankdata(flist1, ranks1);
  rankdata(flist2, ranks2);
  double dsq=sumdiffsquared(ranks1, ranks2);
  double rs=1-6*dsq / (n*(sqr(n)-1));
  double df=n-2;
  double t = rs * sqrt(df / ((rs+1.0)*(1.0-rs)));
  probrs=betai(0.5*df,0.5,df/(df+t*t));
  return rs;
}


template<class T>
double pointbiserialr(const vector<T> &flist1, const vector<T> &flist2, double &probrs)
{ throw StatException("pointbiserialr: not implemented"); }


template<class T>
double kendalltau(const vector<T> &x, const vector<T> &y, double &probrs)
{ if (x.size() != y.size())
    throw StatException("kendaltau: lists of different sizes");
  if (!x.size())
    throw StatException("kendaltau: empty lists");

  int n=x.size();
  int n1=0, n2=0, iss=0;
  for (int j=0; j<n-1; j++)
    for (int k=j+1; k<n; k++) {
      int a1=compare(x[j], x[k]);
      int a2=compare(y[j], x[k]);
      int aa=a1*a2;
      if (aa) {
        n1++; n2++;
        if (aa>0) iss++;
        else iss--;
      }
      else {
        if (a1) n1++;
        if (a2) n2++;
      }
    }
  double tau = double(iss) / sqrt (double(n1)*double(n2));
  double nf=n;
  double svar=(4*nf+10)/(9*nf*(nf-1));
  double z= tau / sqrt(svar);
  probrs = erfcc(fabs(z)/1.4142136);
  return tau;
}
        

template<class T>
void linregress(const vector<T> flist1, const vector<T> &flist2, 
                T &slope, T &intercepr, T &r, T &probrs, T &sterrest)
{ DEFINE_TYPENAME

  const T TINY=T(1.0e-30);

  if (flist1.size() != flist2.size())
    throw StatException("pearsonr: lists of different sizes");

  double n=flist1.size();
  T sumx=0.0, sumx2=0.0, sumy=0.0, sumy2=0.0, summult=0.0;
  for(const_iterator i1(flist1.begin()), i2(flist2.begin()), e1(flist1.end()); i1!=e1; i1++, i2++) {
    sumx += *i1; sumx2 += sqr(*i1);
    sumy += *i2; sumy2 += sqr(*i2);
    summult += (*i1 * *i2);
  }
  T meanx = sumx/n;
  T meany = sumy/n;

  T r_num = summult * n - sumx * sumy;
  T r_den = sqrt( (sumx2*n - sqr(sumx) ) * (sumy2*n - sqr(sumy)) );
  r = r_num / r_den;
  T df=n-2;
  T t = r * sqrt( df / ( (-r+1.0+TINY)*(r+1.0+TINY) ) );
  probrs=betai(df*0.5, T(0.5), df/(df+t*t));

//  T z = log((r+1.0+TINY)/(-r+1.0+TINY));

  slope = r_num / (sumx2*n - sqr(sumx));
  intercepr = meany - slope*meanx;
  sterrest = sqrt(-sqr(r)+1.0) * samplestdev(flist2);
}



/* *********** INFERENTIAL STATS ************/

template<class T>
T ttest_1samp(const vector<T> &flist, const T &popmean, T &prob)
{
  T n=flist.size(), df=n-1.0;
  T t= (mean(flist) - popmean) / sqrt(var(flist)/n);
  prob=betai(df*0.5, T(0.5), df/(df+t*t));
  return t;
}


template<class T>
T ttest_ind(const vector<T> &x, const vector<T> &y, T &prob)
{ T n1=x.size(), n2=y.size(), df=n1+n2-2.0;
  T svar= ( sqr(stdev(x))*(n1-1.0) + sqr(stdev(y))*(n2-1.0) )  /  df;
  T t= ( mean(x) - mean(y) ) / sqrt(svar * ( (n1+n2)/(n1*n2) ) );
  prob=betai(df*0.5, T(0.5), df/(df+t*t));
  return t;
} 


template<class T>
T ttest_rel(const vector<T> &x, const vector<T> &y, T &prob)
{ if (x.size() != y.size())
    throw StatException("ttest_rel: lists of different sizes");

  T meanx=mean(x);
  T meany=mean(y);

  T n=x.size();
  T cov=0.0;
  for(int i=0; i<x.size(); i++)
    cov += (x[i]-meanx) * (y[i]-meany);
  T df=n-1.0;
  cov /= df;
  T sd = sqrt((var(x)+var(y) - cov*2.0)/n);
  if (sd==0.0)
    throw StatException("ttest_rel: sd==0, can't divide");
  T t = (mean(x)-mean(y))/sd;
  prob=betai(df*0.5, T(0.5), df/(df+t*t));
  return t;
}


template<class T>
T chisquare(const vector<T> &x, const vector<T> *exp, T &prob)
{ int n=x.size();

  T chisq=0.0;
  if (exp) {
    if (exp->size()!=n)
      throw StatException("chi_square: lists of different sizes");
    for(int i=0; i<n; i++)
      chisq+=sqr(x[i]-(*exp)[i]) / (*exp)[i];
  }
  else {
    T invted=sum(x)/T(n);
    for(int i=0; i<n; i++)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -