📄 stat.hpp

📁 orange源码数据挖掘技术
💻 HPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
      chisq+=sqr(x[i]-invted) / invted;
  }
    
  prob=chisqprob(chisq, T(n-1));

  return chisq;
}


template<class T>
T min_el(const T &x, const T &y)
{ return x<y ? x : y; }

template<class T>
T max_el(const T &x, const T &y)
{ return x>y ? x : y; }

template<class T>
T chisquare2d(const vector<vector<T> > &cont,
              int &df, T &prob, T &cramerV, T &contingency_coeff)
{ if (!cont.size())
    throw StatException("chisquare2d: invalid contingency table");

  const T TINY=1.0e-30;
  int ni=cont.size();
  int nj=cont[0].size();
  if (!nj)
    throw StatException("chisquare2d: invalid contingency table");

  vector<T> sumi(ni), sumj(nj);
  int ci, cj;
  for(ci=0; ci<ni; ci++)
    sumi.push_back(T(0.0));
  for(cj=0; cj<nj; cj++)
    sumj.push_back(T(0.0));

  for(ci=0; ci<ni; ci++) {
    if (cont[ci].size()!=nj)
      throw StatException("chisquare2d: invalid contingency table");
    for(cj=0; cj<nj; cj++) {
      sumi[ci]+=cont[ci][cj];
      sumj[cj]+=cont[ci][cj];
    }
  }

  T sum=0.0;
  int nnj=0;
  for(cj=0; cj<nj; cj++)
    if (sumj[cj]>0.0) {
      sum+=sumj[cj];
      nnj++;
    }

  int nni=0;
  T chisq=0.0;
  for(ci=0; ci<ni; ci++) {
    if (sumi[ci]>0.0)
      nni++;
    for(cj=0; cj<nj; cj++) {
      T expctd=sumj[cj]*sumi[ci] / sum;
      chisq += sqr(cont[ci][cj]-expctd) / (expctd+TINY);
    }
  }

  df=(nni-1)*(nnj-1);

  prob=chisqprob(chisq, T(df));
  cramerV = sqrt (chisq/(sum*(min_el(nni, nnj)-1.0)));
  contingency_coeff=sqrt(chisq/(chisq+sum));
  return chisq;
}


template<class T>
T anova_rel(const vector<vector<T> > &cont, int &df_bt, int &df_err, T &prob)
{ 
  DEFINE_TYPENAME
  
  int k = cont.size();
  int n = cont[0].size();
  if ((n<2) || (k<2))
    throw StatException("anova_rel: invalid contingency table");

  int N = k*n;
  T G = T(0.0), SS_wt = T(0.0), SS_total = T(0.0), SS_bt = T(0.0), SS_bs = T(0.0), SS_err;
  vector<T> Ps(n, T(0.0));
  iterator Psi, Pse(Ps.end());
  for(typename vector<vector<T> >::const_iterator conti(cont.begin()), conte(cont.end()); conti!=conte; conti++) {
    if ((*conti).size() != n)
      throw StatException("anova_rel: number of subject is not the same in all groups");

    T t = T(0.0), tt = T(0.0);
    Psi = Ps.begin();
    for(typename vector<T>::const_iterator contii((*conti).begin()), contie((*conti).end()); contii!=contie; contii++, Psi++) {
      t += (*contii);
      *Psi += (*contii);
      tt += (*contii) * (*contii);
    }
    G += t;
    SS_total += tt;
    SS_wt += tt - t*t/n;
    SS_bt += t*t;
  }

  for(Psi = Ps.begin(); Psi != Pse; Psi++)
    SS_bs += *Psi * *Psi;

  const T GG_N = G*G/N;
  SS_total -= GG_N;
  SS_bt = SS_bt/n - GG_N;
  SS_bs = SS_bs/k - GG_N;
  SS_err = SS_wt - SS_bs;

  df_bt = (k-1);
  df_err = (N-k) - (n-1);

  if (SS_err < 1e-20) {
    prob = 0.0;
    return T(-1.0);
  }

  T MS_bt = SS_bt / df_bt;
  T MS_err = SS_err / df_err;
  T F = MS_bt / MS_err;
  prob = fprob(df_bt, df_err, F);
  return F;
}


template<class T>
T friedmanf(const vector<vector<T> > &cont, double &chi2, int &dfnum, int &dfden, double &prob)
{
  DEFINE_TYPENAME
  
  int k = cont.size();
  int N = cont[0].size();
  if ((N<2) || (k<2))
    throw StatException("friedmanf: invalid contingency table");

  vector<vector<T> > transposed;
  int line;
  for(line = 0; line < N; line++)
    transposed.push_back(vector<T>());

  typename vector<vector<T> >::iterator trani;
  const typename vector<vector<T> >::iterator tranb(transposed.begin()), trane(transposed.end());
  for(typename vector<vector<T> >::const_iterator conti(cont.begin()), conte(cont.end()); conti != conte; conti++) {
    if ((*conti).size() != N)
      throw StatException("friedmanf: number of subject is not the same in all groups");

    trani = tranb;
    for(const_iterator contii((*conti).begin()); trani!=trane; trani++, contii++)
      (*trani).push_back(*contii);
  }

  vector<double> R(k, 0.0), tranks;
  typename vector<double>::iterator Ri, Rb(R.begin()), Re(R.end()), tri;
  for(trani = tranb; trani != trane; trani++) {
    rankdata(*trani, tranks);
    for(Ri = Rb, tri = tranks.begin(); Ri != Re; Ri++, tri++)
      *Ri += *tri;
  }

  double RR = 0.0;
  for(Ri = Rb; Ri != Re; Ri++)
    RR += *Ri * *Ri;

  chi2 = 12 * N / float(k*(k+1)) *  (RR/N/N - k*(k+1)*(k+1) / 4.0);
  double F = (N-1) * chi2 / (N*(k-1) - chi2);
  prob = fprob(k-1, (k-1)*(N-1), F);
  return F;
}


template<class T>
double mannwhitneyu(const vector<T> &x, const vector<T> &y, double &prob)
{ vector<T> both(x);
  both.insert(both.end(), y.begin(), y.end());
  vector<double> ranks;
  rankdata(both, ranks);

  int n1=x.size(), n2=y.size();
  double u1=n1*n2 + (n1*(n1+1))/2.0;
  double u2=n1*n2 + (n2*(n2+1))/2.0;
  vector<double>::iterator ri;
  for(ri=ranks.begin(); n1--; u1 -= *(ri++));
  while(ri!=ranks.end())
    u2 -= *(ri++);
  double bigu=max_el(u1, u2);
  double smallu=min_el(u1, u2);
  double sd=sqrt(n1*n2*(n1+n2+1)/12.0);
  if (sd==0)
    throw StatException("mannwhitneyu: empty group");
  double z=abs((bigu-n1*n2/2.0) / sd);
  prob = 1.0 - zprob(z);
  return smallu;
}


template<class T, class G, class C>
double mannwhitneyu(const vector<T> &x, double &prob, const G &group, const C &compare)
{ DEFINE_TYPENAME
  vector<double> ranks;
  rankdata(x, ranks, compare);
  double u1=0.0, u2=0.0;
  int n1=0, n2=0;
  vector<double>::iterator ri(ranks.begin());
  const_SITERATE(xi, x)
    if (group(*xi)) {
      n1++;
      u1-= *(ri++);
    }
    else {
      n2++;
      u2-= *(ri++);
    }
  u1+=n1*n2 + (n1*(n1+1))/2.0;
  u2+=n1*n2 + (n2*(n2+1))/2.0;
  double bigu=max_el(u1, u2);
  double smallu=min_el(u1, u2);
  double sd=sqrt(n1*n2*(n1+n2+1)/12.0);
  if (sd==0)
    throw StatException("mannwhitneyu: empty group");
  double z=abs((bigu-n1*n2/2.0) / sd);
  prob = 1.0 - zprob(z);
  return smallu;
}


template<class T>
double ranksums(const vector<T> &x, const vector<T> &y, double &prob)
{ vector<T> both(x);
  both.insert(both.end(), y.begin(), y.end());
  vector<double> ranks;
  rankdata(both, ranks);

  int n1c=x.size();
  double sum=0.0;
  for(vector<double>::iterator ri(ranks.begin()); n1c--; sum += *(ri++));

  double n1=x.size(), n2=y.size();
  double expected = n1*(n1+n2+1) / 2.0;
  double z = (sum-expected) / sqrt(n1*n2*(n1+n2+1)/12.0);
  prob=zprob(z);
  return z;
}


template<class T, class G, class C>
double ranksums(const vector<T> &x, double &prob, const G &group, const C &compare)
{ DEFINE_TYPENAME
  vector <double> ranks;
  rankdata(x, ranks, compare);
  vector<double>::iterator ri(ranks.begin());
  double sum=0.0;
  int obs=0;
  const_SITERATE(xi, x)
    if (group(*xi)) {
      sum += *(ri++);
      obs++;
    }
    else
      ri++;

  double n1=obs, n2=x.size()-n1;
  double expected = n1*(n1+n2+1) / 2.0;
  double z = (sum-expected) / sqrt(n1*n2*(n1+n2+1)/12.0);
  prob=zprob(z);
  return z;
}
 

template<class T>
double wilcoxont(const vector<T> &x, const vector<T> &y, double &prob)
{ DEFINE_TYPENAME
  
  if (x.size() != y.size())
    throw StatException("ttest_rel: lists of different sizes");

  vector<T> d, absd;
  for(const_iterator xi(x.begin()), xe(x.end()), yi(y.begin());
      xi!=xe;
      xi++, yi++)
    if (*xi!=*yi) {
      d.push_back(*xi-*yi);
      absd.push_back(abs(d.back()));
    }

  if (!d.size()) {
    prob = 1.0;
    return 0.0;
  }

  vector<double> absdranks;
  rankdata(absd, absdranks);
  double r_plus=0.0, r_minus=0.0;
  for(int i=0; i<d.size(); *( (d[i]<0.0) ? &r_minus : &r_plus ) += absdranks[i], i++);
  double N=d.size();
  double se = sqrt(N*(N+1)*(2*N+1)/24.0);
  double wt = min_el(r_plus, r_minus);
  double z  = fabs(wt-N*(N+1)*0.25)/se;
  prob=1.0-zprob(z);
  return wt;
}



// This follows http://www-2.cs.cmu.edu/afs/cs/project/jair/pub/volume4/cohn96a-html/node7.html

template<class T, class U>
T loess_y(const T &refx, map<T, U> points, const float &windowProp)
{ typedef typename map<T, U>::const_iterator mapiterator;
  mapiterator from, to;

  /* Find the window */

  mapiterator lowedge = points.begin();
  mapiterator highedge = points.end();

  int needpoints = int(ceil(points.size() * windowProp));
  
  if ((needpoints<=1) || (needpoints>=points.size())) {
    from = lowedge;
    to = highedge;
    return ((*--highedge).first - (*lowedge).first)/2.0;
  }
    
  from = points.lower_bound(refx);
  to = points.upper_bound(refx);
  if (from==to)
    if (to != highedge)
      to++;
    else
      from --;


  /* Extend the interval; we set from to highedge when it would go beyond lowedge, to indicate that only to can be modified now */
  while (needpoints--) {
    if ((to == highedge) || ((from != highedge) && (refx - (*from).first < (*to).first - refx))) {
      if (from == lowedge)
        from = highedge;
      else
        from--;
    }
    else
      to++;
  }

  if (from == highedge)
    from = lowedge;
  else
    from++;

  mapiterator tt = to;
  --tt;
  
  T h = (refx - (*from).first);
  if ((*tt).first - refx  >  h)
    h = ((*tt).first - refx);

  h *= T(1.1);

  /* Iterate through the window */

  T Sx = 0.0, Sy = 0.0, Sxx = 0.0, Sxy = 0.0;

  T n = 0.0;

  for (; from != to; from++) {
    const T &x = (*from).first;
    const T &y = (*from).second;

    T w = abs(refx - x) / h;
    w = 1 - w*w*w;
    w = w*w*w;

    n   += w;
    Sx  += w * x;
    Sxx += w * x * x;
    Sy  += w * y;
    Sxy += w * x * y;
  }

  if (n==0)
    return Sy;

  T div = Sxx - Sx * Sx / n;
  if (!div)
    return Sy;

  return Sy/n + (Sxy - Sx*Sy/n) / div * (refx - Sx/n);
}


enum { DISTRIBUTE_MINIMAL, DISTRIBUTE_FACTOR, DISTRIBUTE_FIXED, DISTRIBUTE_UNIFORM };

template<class T, class U>
void distributePoints(const map<T, U> points, int nPoints, vector<T> &result, int method = DISTRIBUTE_MINIMAL)
{ typedef typename map<T, U>::const_iterator mapiterator;

  if ((method == DISTRIBUTE_MINIMAL) && (nPoints<0)) {
    nPoints = -nPoints;
    method = DISTRIBUTE_FACTOR;
  }

  result.clear();

  switch (method) {
    case DISTRIBUTE_FACTOR: {
      for (mapiterator pi(points.begin()), pe(points.end());;) {
        T ax = (*pi).first;
        result.push_back(ax);

        if (++pi==pe)
          break;

        // We could write this faster, but we don't want to run into problems with rounding floats
        T div = ((*pi).first - ax) / nPoints;
        for (int i=1; i < nPoints; i++)
          result.push_back(ax + i*div);
      }
      return;
    }


    case DISTRIBUTE_MINIMAL: {
      if (nPoints<=points.size()) {
        for (mapiterator pi(points.begin()), pe(points.end()); pi != pe; pi++)
          result.push_back((*pi).first);
      }
      else {
        T ineach = float(nPoints - points.size()) / float(points.size()-1);
        T inthis = T(0.0);
    
        for (mapiterator pi(points.begin()), pe(points.end());;) {
          T ax = (*pi).first;
          result.push_back(ax);

          if (++pi==pe)
            break;

          inthis += ineach;
          if (inthis>=T(0.5)) {
            T dif = ((*pi).first - ax) / (int(floor(inthis))+1);
            while (inthis>T(0.5)) {
              result.push_back(ax += dif);
              inthis -= T(1.0);
            }
          }
        }
      }
      return;
    }


    case DISTRIBUTE_FIXED: {
      float step = points.size()/(nPoints-2);
      float up = 1.5;

      T x1 = T(0.0);
      T dx = T(0.0);
      for(mapiterator pi(points.begin()), pe(points.end());;) {
        do {
          x1 = (*pi).first;
          if (++pi==pe)
            break;
          T dx = (*pi).first - x1;
          up -= 1.0;
        } while (up>1.0);
        if (pi==pe)
          break;

        for (; up<1.0; up += step)
          result.push_back(x1 + dx*up);
      }
      result.push_back(x1);
      return;
    }


    case DISTRIBUTE_UNIFORM: {
      T fi = (*points.begin()).first;
      mapiterator pe(points.end());
      pe--;
      T rg = ((*pe).first-fi) / (nPoints-1);
      for (int i = 0; i<nPoints; i++)
        result.push_back(fi + i*rg);
      return;
    }
  }
}

int nUniquePoints(const vector<double> &points);

void samplingFactor (const vector<double> &points,      int nPoints, vector<double> &result);
void samplingFactor (const map<double, double> &points, int nPoints, vector<double> &result);
void samplingMinimal(const vector<double> &points,      int nPoints, vector<double> &result);
void samplingMinimal(const map<double, double> &points, int nPoints, vector<double> &result);
void samplingFixed  (const vector<double> &points,      int nPoints, vector<double> &result);
void samplingFixed  (const map<double, double> &points, int nPoints, vector<double> &result);
void samplingUniform(const vector<double> &points,      int nPoints, vector<double> &result);
void samplingUniform(const map<double, double> &points, int nPoints, vector<double> &result);

template<class T, class U>
void loess(const U &points, int nPoints, const float &windowProp, map<T, T> &loess_curve, int distributionMethod = DISTRIBUTE_MINIMAL)
{ DEFINE_TYPENAME
  vector<T> xpoints;
  distributePoints(points, nPoints, xpoints, distributionMethod);
  for (const_iterator xi(xpoints.begin()), xe(xpoints.end()); xi!=xe; xi++) 
    loess_curve[*xi] = loess_y(*xi, points, windowProp);
}

class TXYW {
public:
  double x, y, w;

  TXYW(const double &ax, const double &ay, const double &aw = 1.0)
  : x(ax), y(ay), w(aw)
  {}

  TXYW(const TXYW &o)
  : x(o.x), y(o.y), w(o.w)
  {}
};


// refpoints and points should be sorted
void loess(const vector<double> &refpoints, const vector<TXYW> &points, const float &windowProp, vector<pair<double, double> > &result);
void lwr(const vector<double> &refpoints, const vector<TXYW> &points, const float &smoothFactor, vector<pair<double, double> > &result);

void loess(const vector<double> &refpoints, const vector<pair<double, double> > &points, const float &windowProp, vector<pair<double, double> > &result);
void loess(const vector<double> &refpoints, const map<double, double> &points, const float &windowProp, vector<pair<double, double> > &result);

void lwr(const vector<double> &refpoints, const vector<pair<double, double> > &points, const float &smoothFactor, vector<pair<double, double> > &result);
void lwr(const vector<double> &refpoints, const map<double, double> &points, const float &smoothFactor, vector<pair<double, double> > &result);

#endif
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -