📄 cluster.cpp
字号:
if (*m < ParamDesc->Min) *m += ParamDesc->Range; } else if ((*m1 - *m2) > ParamDesc->HalfRange) { *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; if (*m < ParamDesc->Min) *m += ParamDesc->Range; } else *m = (n1 * *m1 + n2 * *m2) / n; } else *m = (n1 * *m1 + n2 * *m2) / n; } return (n);} // MergeClusters/** ComputePrototypes *******************************************************Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generationGlobals: NoneOperation: This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.Return: NoneExceptions: NoneHistory: 5/30/89, DSJ, Created.*******************************************************************************/void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { LIST ClusterStack = NIL; CLUSTER *Cluster; PROTOTYPE *Prototype; // use a stack to keep track of clusters waiting to be processed // initially the only cluster on the stack is the root cluster if (Clusterer->Root != NULL) ClusterStack = push (NIL, Clusterer->Root); // loop until we have analyzed all clusters which are potential prototypes while (ClusterStack != NIL) { // remove the next cluster to be analyzed from the stack // try to make a prototype from the cluster // if successful, put it on the proto list, else split the cluster Cluster = (CLUSTER *) first_node (ClusterStack); ClusterStack = pop (ClusterStack); Prototype = MakePrototype (Clusterer, Config, Cluster); if (Prototype != NULL) { Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype); } else { ClusterStack = push (ClusterStack, Cluster->Right); ClusterStack = push (ClusterStack, Cluster->Left); } }} // ComputePrototypes/** MakePrototype ***********************************************************Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generation Cluster cluster to be made into a prototypeGlobals: NoneOperation: This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned.Return: Pointer to new prototype or NULLExceptions: NoneHistory: 6/19/89, DSJ, Created.*******************************************************************************/PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) { STATISTICS *Statistics; PROTOTYPE *Proto; BUCKETS *Buckets; // filter out clusters which contain samples from the same character if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal)) return (NULL); // compute the covariance matrix and ranges for the cluster Statistics = ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster); // check for degenerate clusters which need not be analyzed further // note that the MinSamples test assumes that all clusters with multiple // character samples have been removed (as above) Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle, (inT32) (Config->MinSamples * Clusterer->NumChar)); if (Proto != NULL) { FreeStatistics(Statistics); return (Proto); } // check to ensure that all dimensions are independent if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize, Statistics->CoVariance, Config->Independence)) { FreeStatistics(Statistics); return (NULL); } if (HOTELLING && Config->ProtoStyle == elliptical) { Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics); if (Proto != NULL) { FreeStatistics(Statistics); return Proto; } } // create a histogram data structure used to evaluate distributions Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence); // create a prototype based on the statistics and test it switch (Config->ProtoStyle) { case spherical: Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets); break; case elliptical: Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets); break; case mixed: Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets, Config->Confidence); break; case automatic: Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets); if (Proto != NULL) break; Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets); if (Proto != NULL) break; Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets, Config->Confidence); break; } FreeBuckets(Buckets); FreeStatistics(Statistics); return (Proto);} // MakePrototype/** MakeDegenerateProto ******************************************************Parameters: N number of dimensions Cluster cluster being analyzed Statistics statistical info about cluster Style type of prototype to be generated MinSamples minimum number of samples in a clusterGlobals: NoneOperation: This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it. If the cluster is not degenerate, NULL is returned.Return: Pointer to degenerate prototype or NULL.Exceptions: NoneHistory: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).********************************************************************************/PROTOTYPE *MakeDegenerateProto( //this was MinSample uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples) { PROTOTYPE *Proto = NULL; if (MinSamples < MINSAMPLESNEEDED) MinSamples = MINSAMPLESNEEDED; if (Cluster->SampleCount < MinSamples) { switch (Style) { case spherical: Proto = NewSphericalProto (N, Cluster, Statistics); break; case elliptical: case automatic: Proto = NewEllipticalProto (N, Cluster, Statistics); break; case mixed: Proto = NewMixedProto (N, Cluster, Statistics); break; } Proto->Significant = FALSE; } return (Proto);} // MakeDegenerateProto/** TestEllipticalProto ****************************************************Parameters: Clusterer data struct containing samples being clustered Config provides the magic number of samples that make a good cluster Cluster cluster to be made into an elliptical prototype Statistics statistical info about clusterGlobals: NoneOperation: This routine tests the specified cluster to see if *** there is a statistically significant difference between* the sub-clusters that would be made if the cluster were to* be split. If not, then a new prototype is formed and* returned to the caller. If there is, then NULL is returned* to the caller.Return: Pointer to new elliptical prototype or NULL.****************************************************************************/PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics) { // Fraction of the number of samples used as a range around 1 within // which a cluster has the magic size that allows a boost to the // FTable by kFTableBoostMargin, thus allowing clusters near the // magic size (equal to the number of sample characters) to be more // likely to stay together. const double kMagicSampleMargin = 0.0625; const double kFTableBoostMargin = 2.0; int N = Clusterer->SampleSize; CLUSTER* Left = Cluster->Left; CLUSTER* Right = Cluster->Right; if (Left == NULL || Right == NULL) return NULL; int TotalDims = Left->SampleCount + Right->SampleCount; if (TotalDims < N + 1 || TotalDims < 2) return NULL; const int kMatrixSize = N * N * sizeof(FLOAT32); FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32))); // Compute a new covariance matrix that only uses essential features. for (int i = 0; i < N; ++i) { int row_offset = i * N; if (!Clusterer->ParamDesc[i].NonEssential) { for (int j = 0; j < N; ++j) { if (!Clusterer->ParamDesc[j].NonEssential) Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset]; else Covariance[j + row_offset] = 0.0f; } } else { for (int j = 0; j < N; ++j) { if (i == j) Covariance[j + row_offset] = 1.0f; else Covariance[j + row_offset] = 0.0f; } } } double err = InvertMatrix(Covariance, N, Inverse); if (err > 1) { tprintf("Clustering error: Matrix inverse failed with error %g\n", err); } int EssentialN = 0; for (int dim = 0; dim < N; ++dim) { if (!Clusterer->ParamDesc[dim].NonEssential) { Delta[dim] = Left->Mean[dim] - Right->Mean[dim]; ++EssentialN; } else { Delta[dim] = 0.0f; } } // Compute Hotelling's T-squared. double Tsq = 0.0; for (int x = 0; x < N; ++x) { double temp = 0.0; for (int y = 0; y < N; ++y) { temp += Inverse[y + N*x] * Delta[y]; } Tsq += Delta[x] * temp; } memfree(Covariance); memfree(Inverse); memfree(Delta); // Changed this function to match the formula in // Statistical Methods in Medical Research p 473 // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews. // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims; double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN); int Fx = EssentialN; if (Fx > FTABLE_X) Fx = FTABLE_X; --Fx; int Fy = TotalDims - EssentialN - 1; if (Fy > FTABLE_Y) Fy = FTABLE_Y; --Fy; double FTarget = FTable[Fy][Fx]; if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) && TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) { // Give magic-sized clusters a magic FTable boost. FTarget += kFTableBoostMargin; } if (F < FTarget) { return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); } return NULL;}/* MakeSphericalProto *******************************************************
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -