📄 kmeans.cpp
字号:
#include "KMeans.h"#include <math.h>#include <stdlib.h>#include <iostream.h>#include <fstream.h>#include <time.h>#define EPSILON 0.000001//#define TESTNOWKMeans::KMeans(double** buf, int num, int dim){ number = num; dimension = dim; buffer = buf; naiveIterateTime = 0; boundIterateTime = 0; stopTime = 0;}KMeans::~KMeans(){}double KMeans::GetDistSqu(double* p1, double* p2){ int i; double distSqu = 0; for(i = 0; i < dimension; i++) { distSqu += (p1[i] - p2[i]) * (p1[i] - p2[i]); } return distSqu;}double KMeans::GetCost(double** centers, int k){ int i,j;// int closeCenter; double minDistSqu; double distSquSum = 0; for(i = 0; i < number; i++) { //closeCenter = 0; minDistSqu = GetDistSqu(buffer[i], centers[0]); for(j = 1; j < k; j++) { double temp = GetDistSqu(buffer[i], centers[j]); if(temp < minDistSqu) { //closeCenter = j; minDistSqu = temp; } } distSquSum += minDistSqu; } return distSquSum;}void KMeans::AccCluster(double** optSolution, int k, int randomNum, int time){ int i, m, n; clusterTime = 0; iterateTime = 0; belongings = new int[number]; ranks = new int[2*number]; differences = new double[2*number]; quadPara = new double[2*number]; linearPara = new double[2*number]; constPara = new double[2*number]; double optCost = number*dimension; double** tempSolution = new double*[k]; for(i = 0; i < k; i++) tempSolution[i] = new double[dimension]; srand(randomNum); for(i = 0; i < time; i++) { //srand(randomNum+i); //invoke AccOneCluster here double temp = AccOneCluster(tempSolution, k, optCost); if(temp < optCost) { optCost = temp; for(m = 0; m < k; m++) { for(n = 0; n < dimension; n++) { optSolution[m][n] = tempSolution[m][n]; } } }#ifdef TESTNOW cout << "Run " << clusterTime << " Finished" << endl;#endif } cout << "Naive Iterate time : " << naiveIterateTime << endl; cout << "Bound Iterate time : " << boundIterateTime << endl; cout << "Stop Iterate time : " << stopTime << endl; cout << "Best Solution Cost : " << optCost << endl; //OutputSolution(tempSolution, k); for(i = 0; i < k; i++) delete [] tempSolution[i]; delete [] tempSolution; delete [] belongings; delete [] ranks; delete [] differences; delete [] quadPara; delete [] linearPara; delete [] constPara;}void KMeans::Cluster(double** optSolution, int k, int randomNum, int time){ int i, m, n; double optCost = number*dimension; double** tempSolution = new double*[k]; for(i = 0; i < k; i++) tempSolution[i] = new double[dimension]; srand(randomNum); for(i = 0; i < time; i++) { //srand(randomNum+i); double temp; if((temp = OneCluster(tempSolution, k)) < optCost) { //If the temp solution is better, replace it with the optimal solution optCost = temp; for(m = 0; m < k; m++) { for(n = 0; n < dimension; n++) { optSolution[m][n] = tempSolution[m][n]; } } } } cout << "Naive Iterate time : " << naiveIterateTime << endl; cout << "Bound Iterate time : " << boundIterateTime << endl; cout << "Best Solution Cost : " << optCost << endl; //OutputSolution(tempSolution, k); for(i = 0; i < k; i++) delete [] tempSolution[i]; delete [] tempSolution;}void KMeans::OutputSolution(double** solution, int k){ int i,j; for(i = 0; i < k; i++) { for(j = 0; j < dimension; j++) cout << solution[i][j] << " "; cout << endl; }}double KMeans::AccOneCluster(double** tempSolution, int k, double optCost){ int i,j; clusterTime++; iterateTime = 0; for(i = 0; i < k; i++) { //Choose a point in the data set randomly int pos = rand() % number; for(j = 0; j < dimension; j++) { tempSolution[i][j] = buffer[pos][j]; } } //OutputSolution(tempSolution, k); //int iterateTime = 1; for(i = 0; i < number; i++) belongings[i] = 0; newSmallCluster = 0; bool noBound = false; do{ //cout << iterateTime++ << " : " << endl; //invoke AccIterate here int temp; if(noBound) temp = OneIterate(tempSolution, k); else temp = AccIterate(tempSolution, k, optCost); if(temp == 0) break; if(temp == 1) { stopTime++;
return dimension * number; } if(temp == 3) noBound = true; }while(true); return GetCost(tempSolution, k);}double KMeans::OneCluster(double** tempSolution, int k){ int i,j; for(i = 0; i < k; i++) { //Choose a point in the data set randomly int pos = rand() % number; for(j = 0; j < dimension; j++) { tempSolution[i][j] = buffer[pos][j]; } } //OutputSolution(tempSolution, k); //cout << GetCost(tempSolution, k) << endl; //while(!OneIterate(tempSolution, k)); //int iterateTime = 1; do{ //cout << iterateTime++ << " : " << endl; }while(OneIterate(tempSolution, k)>0); //OutputSolution(tempSolution, k); return GetCost(tempSolution, k);}void KMeans::swap(int* array, int left, int right){ int temp = array[left]; array[left] = array[right]; array[right] = temp;}void KMeans::QuickSort(int left, int right){ //cout << left << " " << right << endl; if (left >= right) return; int newleft = rand()%(right-left+1) + left; swap(ranks, newleft, left); int last = left, i; //swap(a,left,left); for (i = left + 1; i <= right; i++) { if (differences[ranks[i]] < differences[ranks[left]]) swap(ranks,++last,i); } swap(ranks,left,last); //cout << last << endl; QuickSort(left,last-1); QuickSort(last+1,right);}double* diff;int compare( const void *arg1, const void *arg2 ){ int p1 = *((int*)arg1); int p2 = *((int*)arg2); if(diff[p1] < diff[p2]) return -1; if(diff[p1] > diff[p2]) return 1; return 0;}double KMeans::LowerBound(double currentCost, double lowerDelta, double upperDelta){ int i; double quad = number; double linear = 0; double constant = 0; //cout << upperDelta << endl; int searchNum = 0; for(i = 0; i < 2*number; i++) { if(differences[i] < lowerDelta) { quad += quadPara[i]; linear += linearPara[i]; constant += constPara[i]; }
#ifdef TESTNOW
else
#endif#ifndef TESTNOW else if(differences[i] <= upperDelta) {#endif ranks[searchNum++] = i;#ifndef TESTNOW }#endif }#ifdef TESTNOW// cout << "Number of Delta : " << searchNum << endl;// cout << "Lower Delta : " << lowerDelta << endl;// cout << "Upper Delta : " << upperDelta << endl; int lasttime = time(NULL);#endif// if(clusterTime == 3 && iterateTime == 3)// {/* for(i = 0; i < searchNum; i++) { if(differences[ranks[i]] < lowerDelta || differences[ranks[i]] > upperDelta) cout << "wrong" << endl; }*/// } //QuickSort(0, searchNum-1); diff = differences; qsort(ranks, searchNum, sizeof(int), compare);#ifdef TESTNOW// cout << "Sorting Finished in " << time(NULL) - lasttime << // "seconds" << endl;#endif double delta = 0; double DiffSum; for(i = 0; i < searchNum; i++) { delta = differences[ranks[i]]; //cout << delta << endl; //DiffSum = delta * oldSmallCluster - hardDiffSum - 2 * delta * zeroPos + softDiffSum; DiffSum = delta*delta*quad - 2*delta*linear - constant; if(DiffSum > 0) { #ifdef TESTNOW cout << "Delta : " << delta << endl; cout << "Lower Bound : " << currentCost - delta * delta* number << endl; #endif return currentCost - delta * delta * number; } quad += quadPara[ranks[i]]; linear += linearPara[ranks[i]]; constant += constPara[ranks[i]]; } return 0;}int KMeans::AccIterate(double** tempSolution, int k, double optCost){ int i,j; int closeCenter; double minDistSqu; double oldMinDistSqu; double secMinDistSqu; double distSquSum = 0; //double oldCost = 0; double hardDiffSum = 0; //double hardSqrtSum = 0; double hardDistSum = 0; boundIterateTime++; iterateTime++; //cout << iterateTime << endl; int* clusterSize = new int[k]; double** sumArray = new double*[k]; for(i = 0; i < k; i++) { clusterSize[i] = 0; sumArray[i] = new double[dimension]; for(j = 0; j < dimension; j++) sumArray[i][j] = 0; } for(i = 0; i < number; i++) { //ranks[i] = i; //ranks[number + i] = number + i; closeCenter = belongings[i]; minDistSqu = GetDistSqu(buffer[i], tempSolution[closeCenter]); oldMinDistSqu = minDistSqu; secMinDistSqu = dimension; //oldCost += minDistSqu; for(j = 0; j < k; j++) { if(j == belongings[i]) continue; double temp = GetDistSqu(buffer[i], tempSolution[j]); if(temp < minDistSqu) { closeCenter = j; secMinDistSqu = minDistSqu; minDistSqu = temp; } else if(temp < secMinDistSqu) { secMinDistSqu = temp; } } belongings[i] = closeCenter; clusterSize[closeCenter]++; distSquSum += minDistSqu; for(j = 0; j < dimension; j++) sumArray[closeCenter][j] += buffer[i][j]; if(minDistSqu < oldMinDistSqu) { double t1 = sqrt(oldMinDistSqu); double t2 = sqrt(minDistSqu); differences[i] = 0; differences[number + i] = t2; hardDiffSum += oldMinDistSqu - minDistSqu; //hardSqrtSum += t1 + t2; quadPara[i] = 0; linearPara[i] = t1 + t2; constPara[i] = 0; quadPara[number + i] = -1; linearPara[number + i] = -t2; constPara[number + i] = minDistSqu; hardDistSum += t1; } else { double t1 = sqrt(minDistSqu); double t3 = sqrt(secMinDistSqu); differences[i] = (t3 - t1)/2; differences[number + i] = t3; quadPara[i] = 0; quadPara[number + i] = -1; linearPara[i] = t1 + t3; linearPara[number + i] = -t3; constPara[i] = minDistSqu - secMinDistSqu; constPara[number + i] = secMinDistSqu; } } #ifdef TESTNOW //cout << iterateTime << endl; cout << clusterTime << "." << iterateTime << " Current Cost : " << distSquSum << endl; //cout << "Hard Diff Sum : " << hardDiffSum << endl; //cout << "Hard Sqrt Sum : " << hardSqrtSum << endl; #endif //bool isConverge = true; for(i = 0; i < k; i++) { if(clusterSize[i] == 0) continue; for(j = 0; j < dimension; j++) { double temp = sumArray[i][j] / clusterSize[i]; /* if(fabs(temp - tempSolution[i][j]) > EPSILON) { isConverge = false; } */ tempSolution[i][j] = temp; } } oldSmallCluster = newSmallCluster; newSmallCluster = number; for(i = 0; i < k; i++) if(clusterSize[i] < newSmallCluster) newSmallCluster = clusterSize[i]; delete [] clusterSize; for(i = 0; i < k; i++) delete [] sumArray[i]; delete [] sumArray; if(hardDiffSum == 0) return 0; if(distSquSum < optCost) return 3; double lowerDelta = 2*hardDistSum / number; double upperDelta = sqrt((distSquSum - optCost + 1) / number); //double upperDelta = sqrt(distSquSum / number); if(lowerDelta > upperDelta) return 2; double bound = LowerBound(distSquSum, lowerDelta, upperDelta); //double bound = optCost; if(bound > optCost - 1) { //cout << bound << " " << optCost << endl; return 1; } return 2;}int KMeans::OneIterate(double** tempSolution, int k){ int i,j; naiveIterateTime++; iterateTime++; //cout << iterateTime << endl; int* clusterSize = new int[k]; double** sumArray = new double*[k]; for(i = 0; i < k; i++) { clusterSize[i] = 0; sumArray[i] = new double[dimension]; for(j = 0; j < dimension; j++) sumArray[i][j] = 0; } int closeCenter; double minDistSqu; double distSquSum = 0; for(i = 0; i < number; i++) { closeCenter = 0; minDistSqu = GetDistSqu(buffer[i], tempSolution[0]); for(j = 1; j < k; j++) { double temp = GetDistSqu(buffer[i], tempSolution[j]); if(temp < minDistSqu) { closeCenter = j; minDistSqu = temp; } } clusterSize[closeCenter]++; distSquSum += minDistSqu; for(j = 0; j < dimension; j++) { sumArray[closeCenter][j] += buffer[i][j]; } } //OutputSolution(tempSolution, k); #ifdef TESTNOW //cout << iterateTime << endl; cout << clusterTime << "." << iterateTime << " Current Cost : " << distSquSum << endl; #endif bool isConverge = true; for(i = 0; i < k; i++) { if(clusterSize[i] == 0) continue; for(j = 0; j < dimension; j++) { double temp = sumArray[i][j] / clusterSize[i]; if(fabs(temp - tempSolution[i][j]) > 0) { isConverge = false; } tempSolution[i][j] = temp; } } delete [] clusterSize; for(i = 0; i < k; i++) delete [] sumArray[i]; delete [] sumArray; if(isConverge) return 0; else return 2;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -