// File: cmorderedtreeminer.cpp
// CMOrderedTreeMiner.cpp : Defines the entry point for the console application.
#pragma warning (disable: 4083 4786)
#include "CMRmisc.h"
#include "TextTree.h"
#include "OccList.h"
#include "OccLongList.h"
#include "CHACCluster.h"
// File-scope state for the miner. main() in this file reads and updates it;
// presumably other translation units (e.g. OccLongList) use it as well -- verify.

// Offset subtracted from a node label to form the key into the occurrence map
// and the index into the isFrequent table. main() never changes it, so it
// remains -1 there.
short MIN_VERTEX = -1;
// Largest node label seen so far; main() raises it while reading the database.
// NOTE(review): the original remark claims valid labels are 0--30000 -- confirm.
short MAX_VERTEX = -1; //therefore, the range for valid node label is 0--30000
// Not referenced in the visible part of this file; presumably used elsewhere -- verify.
short currentPathNumber;
vector<short> currentPath;// records the current frequent path (main() pushes a label before explore() and removes it afterwards)
// Map from int to bool; not used in the visible part of this file.
typedef map<int,bool> MAP_FREQUENT;
/**
 * Entry point: mines frequent paths from a database of trees, then clusters
 * the trees using the maximal frequent paths as features.
 *
 * Pipeline:
 *   1. Open the output and input files and read every TextTree record.
 *   2. Build one occurrence list per node label and discard the labels whose
 *      support is below the threshold.
 *   3. Call OccLongList::explore() on every remaining label (presumably this
 *      fills `frequency`, `maximal` and `maximalocclonglist` -- verify in
 *      OccLongList).
 *   4. Reduce each maximal path's occurrence list to the list of document ids.
 *   5. Run CHACCluster on those id lists and write its result file.
 *   6. Write the frequent-path summary to the output file.
 *
 * @param argc  unused in this build (command-line parsing is disabled).
 * @param argv  unused in this build.
 * @return 0 on success, 1 if the output or input file cannot be opened.
 */
int main(int argc, char* argv[])
{
    // Command-line handling is disabled; the values below are hard-coded.
    // Intended usage (kept for reference):
    //   CMOrderedTreeMiner support input_file output_file
    //   support = integer parsed from argv[1], input = argv[2], output = argv[3]
    int support = 20;
    string inputFile = "input\\outputtree.txt";
    string outputFile = "output\\frequentpath.txt";
    string clusteroutputFile = "output\\clusterResult.txt";

    map<int, vector<vector<short> > > frequency; // <frequent path length, frequent paths>
    map<int, vector<vector<short> > >::iterator pos1;
    vector< vector<short> > maximal;
    vector< vector<pathocc > > maximalocclonglist;
    vector< vector<int> > maximalocc;
    clock_t start_time;
    clock_t stop_time;

    /******************************************************************
    step1: read in the database, and find MAX_VERTEX
    ******************************************************************/
    ofstream outFile(outputFile.c_str());
    if (!outFile) {
        cerr << "cannot open OUTPUT file!" << endl;
        return 1;
    }
    ifstream inFile(inputFile.c_str());
    if (!inFile) {
        cerr << "cannot open INPUT file!" << endl;
        return 1;
    }

    vector<int > class_no_tree;
    vector<TextTree> database;
    int myTid = 0;
    for (;;) {
        TextTree tt;
        inFile >> tt;
        // Test the stream AFTER the extraction: eof() only becomes true once a
        // read has already failed, so checking it before reading would store
        // one phantom tree at the end of the file.
        // Assumes operator>>(istream&, TextTree&) leaves the stream in a good
        // (or eof-only) state after a successful read -- TODO confirm.
        if (!inFile) break;

        class_no_tree.push_back(tt.classid);
        tt.doctid = myTid++;
        for (short v = 0; v < tt.vNumber; v++) {
            if (tt.vLabel[v] > MAX_VERTEX) MAX_VERTEX = tt.vLabel[v];
        }
        database.push_back(tt);
    }
    inFile.close();

    // Debug aid: dump every path of every tree.
    /*for (int d = 0; d < (int)database.size(); d++)
    {
        cout<<endl;
        cout<<endl;
        cout<<d<<endl;
        for (int p = 0; p < (int)database[d].path.size(); p++)
        {
            cout<<database[d].path[p].first<<" ";
            for (int q = 0; q < (int)database[d].path[p].second.size(); q++)
                cout<<database[d].path[p].second[q]<<" ";
            cout<<endl;
        }
    }*/

    cout<<"begin get the feature space...."<<endl;

    /******************************************************************
    step2: collect the occurrences of every node label over all paths and
           keep only the frequent labels
    ******************************************************************/
    start_time = clock();
    // One flag per key (label - MIN_VERTEX).
    vector<bool> isFrequent(MAX_VERTEX - MIN_VERTEX + 1, false);
    map<short,OccLongList> occLongList;
    map<short,OccLongList>::iterator pos2;

    // Index variables get distinct names per loop so the code compiles both
    // with standard for-scope and with legacy compilers that leak the loop
    // variable into the enclosing scope.
    for (int docIdx = 0; docIdx < static_cast<int>(database.size()); docIdx++) {
        for (int pathIdx = 0; pathIdx < static_cast<int>(database[docIdx].path.size()); pathIdx++) {
            for (int nodeIdx = 0; nodeIdx < static_cast<int>(database[docIdx].path[pathIdx].second.size()); nodeIdx++)
                occLongList[database[docIdx].path[pathIdx].second[nodeIdx] - MIN_VERTEX].insert(docIdx, pathIdx, nodeIdx);
        }
    }

    // Flag frequent labels and erase infrequent ones; erase() hands back the
    // next valid iterator, so the loop only advances manually when it keeps
    // the entry.
    pos2 = occLongList.begin();
    while (pos2 != occLongList.end())
    {
        if (pos2->second.mySupport >= support)
        {
            isFrequent[pos2->first] = true;
            ++pos2;
        }
        else
            pos2 = occLongList.erase(pos2);
    }

    // Debug aid: dump the surviving occurrence lists.
    /*for (pos2 = occLongList.begin(); pos2 != occLongList.end(); ++pos2)
    {
        cout<<pos2->first+MIN_VERTEX<<endl;
        cout<<pos2->second.mySupport<<endl;
        for (int o = 0; o < (int)pos2->second.occurrenceLong.size(); o++)
            cout<<pos2->second.occurrenceLong[o].dococ<<" "<<pos2->second.occurrenceLong[o].pathoc<<" "<<pos2->second.occurrenceLong[o].pos<<endl;
        cout<<endl;
    }*/

    /******************************************************************
    step2.3: explore each frequent item (prefixSpan)
    ******************************************************************/
    for (pos2 = occLongList.begin(); pos2 != occLongList.end(); ++pos2) {
        if (pos2->second.mySupport >= support) {
            // currentPath is the global prefix; the key is converted back to
            // the original label before it is pushed.
            currentPath.push_back(pos2->first + MIN_VERTEX);
            pos2->second.explore(isFrequent,database,support,frequency,maximal,maximalocclonglist);
            currentPath.pop_back();
        }
    }

    // For every maximal path, collapse its occurrence list into document ids,
    // skipping an id when it equals the one immediately before it.
    for (int maxIdx = 0; maxIdx < static_cast<int>(maximalocclonglist.size()); maxIdx++)
    {
        const vector<pathocc>& occurrences = maximalocclonglist[maxIdx];
        int currentid = -1;
        vector<int> treeid;
        for (int occIdx = 0; occIdx < static_cast<int>(occurrences.size()); occIdx++)
        {
            if (occurrences[occIdx].dococ != currentid)
            {
                treeid.push_back(occurrences[occIdx].dococ);
                currentid = occurrences[occIdx].dococ;
            }
        }
        maximalocc.push_back(treeid);
    }

    // Debug aid: dump the document ids of every maximal path.
    /*for (int m = 0; m < (int)maximalocc.size(); m++){
        cout<<endl;
        for (int t = 0; t < (int)maximalocc[m].size(); t++)
            cout<<maximalocc[m][t]<<" ";
    }*/

    stop_time = clock();
    cout<<"done!"<<endl;
    double feature_time = (stop_time-start_time)/(double)CLOCKS_PER_SEC;
    int feature_num = static_cast<int>(maximalocc.size());
    cout<<"("<<"time:"<<feature_time<<" feature number:"<<feature_num<<")"<<endl;

    /******************************************************************
    step2.35: hierarchical clustering on the maximal-path features
    ******************************************************************/
    cout<<"begin HAC clustering...."<<endl;
    int sample_num = static_cast<int>(database.size());
    int class_num = 3; // hard-coded number of classes handed to the clusterer
    CHACCluster rockcluster;
    rockcluster.ReadData(maximalocc,class_no_tree,sample_num,feature_num,class_num);
    rockcluster.HACCluster();
    rockcluster.GetResult(clusteroutputFile,feature_time);

    /******************************************************************
    step2.4: output the results
    ******************************************************************/
    int frequencynum = 0;
    int maximalnum = 0;

    // Maximal frequent paths, one per line, labels separated by spaces.
    outFile<<"Maximal frequent path:"<<endl;
    for (int outIdx = 0; outIdx < static_cast<int>(maximal.size()); outIdx++) {
        maximalnum++;
        for (int labelIdx = 0; labelIdx < static_cast<int>(maximal[outIdx].size()); labelIdx++)
            outFile<<maximal[outIdx][labelIdx]<<" ";
        outFile<<endl;
    }

    // Debug aid: dump the full occurrence list of every maximal path.
    /*for (int m = 0; m < (int)maximalocclonglist.size(); m++){
        outFile<<endl;
        for (int o = 0; o < (int)maximalocclonglist[m].size(); o++)
            outFile<<maximalocclonglist[m][o].dococ<<" "<<maximalocclonglist[m][o].pathoc<<" "<<maximalocclonglist[m][o].pos<<endl;
    }*/

    outFile << "Total maximal frequent path: " << maximalnum<<endl;
    outFile <<endl;

    // Number of frequent paths per path length, followed by the grand total.
    outFile<<"Frequent path:"<<endl;
    for (pos1 = frequency.begin(); pos1 != frequency.end(); ++pos1)
    {
        outFile << "frequent path("<<pos1->first<<"):"<<pos1->second.size()<<endl;
        frequencynum += static_cast<int>(pos1->second.size());
    }
    outFile << "Total frequent path: " << frequencynum<<endl;
    outFile << "Total Running Time: " << feature_time << endl;
    outFile.close();
    return 0;
}
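// --- Web code-viewer residue (not part of the program) ---
// Keyboard shortcuts: copy code Ctrl + C; search code Ctrl + F; full screen F11;
// toggle theme Ctrl + Shift + D; show shortcuts ?; increase font size Ctrl + =;
// decrease font size Ctrl + -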