📄 cluster.cpp

📁 一个google的OCR源码
💻 CPP
📖 第 1 页 / 共 5 页
字号:
        if (*m < ParamDesc->Min)          *m += ParamDesc->Range;      }      else if ((*m1 - *m2) > ParamDesc->HalfRange) {        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;        if (*m < ParamDesc->Min)          *m += ParamDesc->Range;      }      else        *m = (n1 * *m1 + n2 * *m2) / n;    }    else      *m = (n1 * *m1 + n2 * *m2) / n;  }  return (n);}                                // MergeClusters/** ComputePrototypes *******************************************************Parameters:	Clusterer	data structure holding cluster tree      Config		parameters used to control prototype generationGlobals:	NoneOperation:	This routine decides which clusters in the cluster tree      should be represented by prototypes, forms a list of these      prototypes, and places the list in the Clusterer data      structure.Return:		NoneExceptions:	NoneHistory:	5/30/89, DSJ, Created.*******************************************************************************/void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {  LIST ClusterStack = NIL;  CLUSTER *Cluster;  PROTOTYPE *Prototype;  // use a stack to keep track of clusters waiting to be processed  // initially the only cluster on the stack is the root cluster  if (Clusterer->Root != NULL)    ClusterStack = push (NIL, Clusterer->Root);  // loop until we have analyzed all clusters which are potential prototypes  while (ClusterStack != NIL) {    // remove the next cluster to be analyzed from the stack    // try to make a prototype from the cluster    // if successful, put it on the proto list, else split the cluster    Cluster = (CLUSTER *) first_node (ClusterStack);    ClusterStack = pop (ClusterStack);    Prototype = MakePrototype (Clusterer, Config, Cluster);    if (Prototype != NULL) {      Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);    }    else {      ClusterStack = push (ClusterStack, Cluster->Right);      ClusterStack = push (ClusterStack, Cluster->Left);    }  }}                                // ComputePrototypes/** MakePrototype ***********************************************************Parameters:	Clusterer	data structure holding cluster tree      Config		parameters used to control prototype generation      Cluster		cluster to be made into a prototypeGlobals:	NoneOperation:	This routine attempts to create a prototype from the      specified cluster that conforms to the distribution      specified in Config.  If there are too few samples in the      cluster to perform a statistical analysis, then a prototype      is generated but labelled as insignificant.  If the      dimensions of the cluster are not independent, no prototype      is generated and NULL is returned.  If a prototype can be      found that matches the desired distribution then a pointer      to it is returned, otherwise NULL is returned.Return:		Pointer to new prototype or NULLExceptions:	NoneHistory:	6/19/89, DSJ, Created.*******************************************************************************/PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,                         CLUSTERCONFIG *Config,                         CLUSTER *Cluster) {  STATISTICS *Statistics;  PROTOTYPE *Proto;  BUCKETS *Buckets;  // filter out clusters which contain samples from the same character  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))    return (NULL);  // compute the covariance matrix and ranges for the cluster  Statistics =    ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);  // check for degenerate clusters which need not be analyzed further  // note that the MinSamples test assumes that all clusters with multiple  // character samples have been removed (as above)  Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,    Config->ProtoStyle,    (inT32) (Config->MinSamples *    Clusterer->NumChar));  if (Proto != NULL) {    FreeStatistics(Statistics);    return (Proto);  }  // check to ensure that all dimensions are independent  if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,  Statistics->CoVariance, Config->Independence)) {    FreeStatistics(Statistics);    return (NULL);  }  if (HOTELLING && Config->ProtoStyle == elliptical) {    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);    if (Proto != NULL) {      FreeStatistics(Statistics);      return Proto;    }  }  // create a histogram data structure used to evaluate distributions  Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);  // create a prototype based on the statistics and test it  switch (Config->ProtoStyle) {    case spherical:      Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);      break;    case elliptical:      Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);      break;    case mixed:      Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,        Config->Confidence);      break;    case automatic:      Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);      if (Proto != NULL)        break;      Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);      if (Proto != NULL)        break;      Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,        Config->Confidence);      break;  }  FreeBuckets(Buckets);  FreeStatistics(Statistics);  return (Proto);}                                // MakePrototype/** MakeDegenerateProto ******************************************************Parameters:	N		number of dimensions      Cluster		cluster being analyzed      Statistics	statistical info about cluster      Style		type of prototype to be generated      MinSamples	minimum number of samples in a clusterGlobals:	NoneOperation:	This routine checks for clusters which are degenerate and      therefore cannot be analyzed in a statistically valid way.      A cluster is defined as degenerate if it does not have at      least MINSAMPLESNEEDED samples in it.  If the cluster is      found to be degenerate, a prototype of the specified style      is generated and marked as insignificant.  A cluster is      also degenerate if it does not have at least MinSamples      samples in it.      If the cluster is not degenerate, NULL is returned.Return:		Pointer to degenerate prototype or NULL.Exceptions:	NoneHistory:	6/20/89, DSJ, Created.      7/12/89, DSJ, Changed name and added check for 0 stddev.      8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).********************************************************************************/PROTOTYPE *MakeDegenerateProto(  //this was MinSample                               uinT16 N,                               CLUSTER *Cluster,                               STATISTICS *Statistics,                               PROTOSTYLE Style,                               inT32 MinSamples) {  PROTOTYPE *Proto = NULL;  if (MinSamples < MINSAMPLESNEEDED)    MinSamples = MINSAMPLESNEEDED;  if (Cluster->SampleCount < MinSamples) {    switch (Style) {      case spherical:        Proto = NewSphericalProto (N, Cluster, Statistics);        break;      case elliptical:      case automatic:        Proto = NewEllipticalProto (N, Cluster, Statistics);        break;      case mixed:        Proto = NewMixedProto (N, Cluster, Statistics);        break;    }    Proto->Significant = FALSE;  }  return (Proto);}                                // MakeDegenerateProto/** TestEllipticalProto ****************************************************Parameters:	Clusterer	data struct containing samples being clustered      Config provides the magic number of samples that make a good cluster      Cluster		cluster to be made into an elliptical prototype      Statistics	statistical info about clusterGlobals:	NoneOperation:	This routine tests the specified cluster to see if ***     there is a statistically significant difference between*     the sub-clusters that would be made if the cluster were to*     be split. If not, then a new prototype is formed and*     returned to the caller. If there is, then NULL is returned*     to the caller.Return:		Pointer to new elliptical prototype or NULL.****************************************************************************/PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,                               CLUSTERCONFIG *Config,                               CLUSTER *Cluster,                               STATISTICS *Statistics) {  // Fraction of the number of samples used as a range around 1 within  // which a cluster has the magic size that allows a boost to the  // FTable by kFTableBoostMargin, thus allowing clusters near the  // magic size (equal to the number of sample characters) to be more  // likely to stay together.  const double kMagicSampleMargin = 0.0625;  const double kFTableBoostMargin = 2.0;  int N = Clusterer->SampleSize;  CLUSTER* Left = Cluster->Left;  CLUSTER* Right = Cluster->Right;  if (Left == NULL || Right == NULL)    return NULL;  int TotalDims = Left->SampleCount + Right->SampleCount;  if (TotalDims < N + 1 || TotalDims < 2)    return NULL;  const int kMatrixSize = N * N * sizeof(FLOAT32);  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));  // Compute a new covariance matrix that only uses essential features.  for (int i = 0; i < N; ++i) {    int row_offset = i * N;    if (!Clusterer->ParamDesc[i].NonEssential) {      for (int j = 0; j < N; ++j) {        if (!Clusterer->ParamDesc[j].NonEssential)          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];        else          Covariance[j + row_offset] = 0.0f;      }    } else {      for (int j = 0; j < N; ++j) {        if (i == j)          Covariance[j + row_offset] = 1.0f;        else          Covariance[j + row_offset] = 0.0f;      }    }  }  double err = InvertMatrix(Covariance, N, Inverse);  if (err > 1) {    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);  }  int EssentialN = 0;  for (int dim = 0; dim < N; ++dim) {    if (!Clusterer->ParamDesc[dim].NonEssential) {      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];      ++EssentialN;    } else {      Delta[dim] = 0.0f;    }  }  // Compute Hotelling's T-squared.  double Tsq = 0.0;  for (int x = 0; x < N; ++x) {    double temp = 0.0;    for (int y = 0; y < N; ++y) {      temp += Inverse[y + N*x] * Delta[y];    }    Tsq += Delta[x] * temp;  }  memfree(Covariance);  memfree(Inverse);  memfree(Delta);  // Changed this function to match the formula in  // Statistical Methods in Medical Research p 473  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);  int Fx = EssentialN;  if (Fx > FTABLE_X)    Fx = FTABLE_X;  --Fx;  int Fy = TotalDims - EssentialN - 1;  if (Fy > FTABLE_Y)    Fy = FTABLE_Y;  --Fy;  double FTarget = FTable[Fy][Fx];  if (Config->MagicSamples > 0 &&      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {    // Give magic-sized clusters a magic FTable boost.    FTarget += kFTableBoostMargin;  }  if (F < FTarget) {    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);  }  return NULL;}/* MakeSphericalProto *******************************************************
💿 文件大小 1051 K
👤 上传用户 jackjinke
📂 所属分类多国语言处理
🏷️ 相关标签

#google #OCR #源码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -