⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cvfdt.c

📁 数据挖掘方面的源码
💻 C
📖 第 1 页 / 共 4 页
字号:
      }      // If we didn't replace the tree, switch and reset counts      if ( currentNode->altSubtreeInfos != 0 ) {	if (oldInfo->exampleCount == gAltTestNum) {	  oldInfo->exampleCount = 0;	  newInfo->exampleCount = 0;	  oldInfo->numTested = 0;	  newInfo->numTested = 0;	  oldInfo->numErrors = 0;	  newInfo->numErrors = 0;	}	oldInfo->status = (oldInfo->status + 1) % 2;      }    }  }  /* add the information */  ExampleGroupStatsAddExample(egs, ce->example);  if ( currentNode->nodeId > ce->oldestNode )    ce->oldestNode = currentNode->nodeId;	  currentNode->examplesSeen++;  currentNode->hasChanged = 1;  gd->seenSinceLastProcess++;  // Do depth first search to check for questionable nodes  if (( root == currentNode ) && (count % gCheckSize == 0)) {    _CVFDTCheckForQuestionableNodes( root, root );  }  if(gd->seenSinceLastProcess >= root->vfdt->processChunkSize) {    gd->seenSinceLastProcess = 0;        if ( !DecisionTreeIsNodeGrowing( currentNode->dtreeNode ))      return;        if(!ExampleGroupStatsIsPure(egs)) {      _CheckSplitEntropy(root, currentNode );    }  }}static void _RemoveExampleFromNode(CVFDTPtr root, 				   CVFDTPtr currentNode,				   CVFDTExamplePtr ce) {  VFDTGrowingDataPtr gd     = (VFDTGrowingDataPtr)DecisionTreeGetGrowingData(currentNode->dtreeNode);  ExampleGroupStatsPtr egs = gd->egs;    // Check for alternate trees  if (currentNode->altSubtreeInfos != 0) {    //CVFDTAltInfoPtr oldInfo = VALIndex(currentNode->altSubtreeInfos, 0);    CVFDTAltInfoPtr newInfo = VALIndex(currentNode->altSubtreeInfos, 1);    // HERE Geoff took this out, I think it means that you will forget     //  different examples than you learned on depending on the test period     // instead now we test on 10% of examples but we also use them for     // training    //if ( oldInfo->status == 1 )    //  return;    CVFDTForgetExample(newInfo->subtree, ce);  }  /* remove the information */  currentNode->hasChanged = 1;  if ( currentNode->examplesSeen > 0 ) {    ExampleGroupStatsRemoveExample(egs, ce->example);    currentNode->examplesSeen--;  }}void CVFDTForgetExample( CVFDTPtr cvfdt, CVFDTExamplePtr ce ) {  int foundNode = 0;  CVFDTPtr currentNode = cvfdt;  CVFDTPtr parentNode = 0;  int lastRemoved;    while(!foundNode && currentNode != 0) {    if ( ce->oldestNode < currentNode->nodeId )      return;    lastRemoved = 0;    if ( currentNode->examplesSeen == 1 )      lastRemoved = 1;    _RemoveExampleFromNode(cvfdt, currentNode, ce );        if(DecisionTreeIsLeaf(currentNode->dtreeNode ))       foundNode = 1;    else if(DecisionTreeIsNodeGrowing(currentNode->dtreeNode ))       foundNode = 1;        if(!foundNode) {      parentNode = currentNode;      currentNode = CVFDTOneStepClassify(currentNode, ce->example );    }  }}void _CVFDTProcessExamples(CVFDTPtr cvfdt, FILE *input) {   ExamplePtr e = ExampleRead(input, cvfdt->vfdt->spec);   CVFDTExamplePtr ce = _CVFDTExampleNew( e );   //int replaceHere = 0;   int count = 0;   long allocation;   DecisionTreePtr dt;   //int cacheCount = 0;   char* fileName;   long learnTime = 0;   int numQExamples = 0;   long cacheTime = 0;   long seen = 0;   struct tms starttime;   struct tms endtime;   times(&starttime);   gCache = VALNew();   while(e != 0 ) {     int i = 0;     cvfdt->vfdt->examplesSeen++;     seen++;     ce->inQNode = 0;     if(gIncrementalReporting) {       times(&endtime);       learnTime += endtime.tms_utime - starttime.tms_utime;          _doIncrementalTest(cvfdt->vfdt, cvfdt->vfdt->spec, e);       if(seen % 1000 == 0) {	 _doIncrementalReport();       }       times(&starttime);     }     _ProcessExample(cvfdt, ce, count);     if ( ce->inQNode == 1 )       numQExamples++;     times(&endtime);     learnTime += endtime.tms_utime - starttime.tms_utime;     times(&starttime);     VALInsert( gCache, ce, gCurCachePos );     gCurCachePos++;     count++;     if (( gCurCachePos == gCacheSize ) || ( count % gWindowSize == 0 )) {       CVFDTExamplePtr curCe;       FILE* curFile;       fileName = VALIndex( gCacheFiles, gCurCacheIndex );       curFile = fopen( fileName, "w" );       fprintf( curFile, "%d ", VALLength( gCache ));       for ( i = 0; i < VALLength( gCache ); i++ ) {	 curCe = (CVFDTExamplePtr)VALIndex( gCache, i );	 _CVFDTExampleWrite( curFile, curCe );	 _CVFDTExampleFree( curCe );       }       VALFree( gCache );       gCache = VALNew();       fclose( curFile );       if ( count < gWindowSize )	 gLastFilledCacheIndex = gCurCacheIndex;       if ( ++gCurCacheIndex == VALLength( gCacheFiles ))	 gCurCacheIndex = 0;       gCurCachePos = 0;     }     times(&endtime);     cacheTime += endtime.tms_utime - starttime.tms_utime;     times(&starttime);     /*     if(count % 10000 == 0){       printf("processed %ld examples\n", count);     }     */     if ( count >=  gWindowSize ) {       if ( gCurCachePos == 0 ) {	 CVFDTExamplePtr curCe;	 FILE* curFile;	 int numExamples;	 int returnVal;	 times(&endtime);	 learnTime += endtime.tms_utime - starttime.tms_utime;	 times(&starttime);	 	 fileName = VALIndex( gCacheFiles, gCurCacheIndex );	 curFile = fopen( fileName, "r" );	 returnVal = fscanf( curFile, "%d ", &numExamples );	 for ( i = 0; i < numExamples; i++ ) {	   curCe = _CVFDTExampleRead( curFile, cvfdt->vfdt->spec );	   VALAppend( gCache, curCe );	 }	 	 fclose( curFile );	 times(&endtime);	 cacheTime += endtime.tms_utime - starttime.tms_utime;	 times(&starttime);	        }       ce = VALIndex( gCache, gCurCachePos );       CVFDTForgetExample( cvfdt, ce );       VALRemove( gCache, gCurCachePos );       _CVFDTExampleFree( ce );     }     e = ExampleRead(input, cvfdt->vfdt->spec);     ce = _CVFDTExampleNew( e );          if (gUseSchedule && count == gScheduleCount) {       gScheduleCount *= gScheduleMult;       times(&endtime);       learnTime += endtime.tms_utime - starttime.tms_utime;       allocation = MGetTotalAllocation();       dt = CVFDTGetLearnedTree( cvfdt );       DebugMessage(1, 2, "doing tests...\n");       _doTests( cvfdt->vfdt->spec, dt, cvfdt->vfdt->numGrowing, 		 count, learnTime, cacheTime, 		 allocation, 0, numQExamples,		 gNumSwitches, gNumPrunes );       numQExamples = 0;       gNumSwitches = 0;       gNumPrunes = 0;       gNumNewAlts = 0;       DecisionTreeFree(dt);       times(&starttime);     }   }      times(&endtime);   learnTime += endtime.tms_utime - starttime.tms_utime;   if(gIncrementalReporting) {      _doIncrementalReport();   }   if(gMessageLevel >= 1) {     printf("finished with all the examples, there are %ld growing nodes\n", 	    cvfdt->vfdt->numGrowing);     printf("time %.2lfs\n", ((double)learnTime) / 100);   }   }void CVFDTProcessExamples( CVFDTPtr cvfdt, FILE *input ) {  _CVFDTProcessExamples(cvfdt, input);}static void _CVFDTPrintSpaces(FILE *out, int num) {  int i;    for(i = 0 ; i < num ; i++) {    fprintf(out, " ");  }}static void _CVFDTPrintHelp( CVFDTPtr cvfdt, FILE *out, int indent ) {  int i;  _CVFDTPrintSpaces(out, indent);  if(cvfdt->dtreeNode->nodeType == dtnLeaf) {    fprintf(out, "%ld (%d): (leaf: %s)\n", 	    cvfdt->uid, cvfdt->examplesSeen, 	    ExampleSpecGetClassValueName(cvfdt->dtreeNode->spec, 					 cvfdt->dtreeNode->myclass));  }   else if(cvfdt->dtreeNode->nodeType == dtnDiscrete) {    fprintf(out, "%ld (%d): (split on %s:\n", cvfdt->uid, cvfdt->examplesSeen, 	    ExampleSpecGetAttributeName(cvfdt->dtreeNode->spec, 					cvfdt->dtreeNode->splitAttribute) );    for(i = 0 ; i < VALLength(cvfdt->children) ; i++) {      _CVFDTPrintSpaces(out, indent + 1);      fprintf(out, "Value %s\n", 	      ExampleSpecGetAttributeValueName(cvfdt->dtreeNode->spec, 					       cvfdt->dtreeNode->splitAttribute, i));      _CVFDTPrintHelp(VALIndex(cvfdt->children, i), out, indent + 2);    }    _CVFDTPrintSpaces(out, indent);    fprintf(out, ")\n");    if (cvfdt->altSubtreeInfos != 0) {      CVFDTAltInfoPtr altTreeInfo;      _CVFDTPrintSpaces(out, indent + 2);      fprintf(out, "***************\n");      altTreeInfo = VALIndex(cvfdt->altSubtreeInfos, 1);      _CVFDTPrintHelp(altTreeInfo->subtree, out, indent + 2);      _CVFDTPrintSpaces(out, indent + 2);      fprintf(out, "***************\n");    }  }   else if(cvfdt->dtreeNode->nodeType == dtnContinuous) {    fprintf(out, "%ld: (split on %s threshold %f: \n",	    cvfdt->uid, ExampleSpecGetAttributeName(cvfdt->dtreeNode->spec, 						    cvfdt->dtreeNode->splitAttribute),	    cvfdt->dtreeNode->splitThreshold);    _CVFDTPrintSpaces(out, indent + 1);        /* left child */    fprintf(out, "< %f\n", cvfdt->dtreeNode->splitThreshold);    _CVFDTPrintHelp(VALIndex(cvfdt->children, 0), out, indent + 2);        /* right child */    fprintf(out, ">= %f\n", cvfdt->dtreeNode->splitThreshold);    _CVFDTPrintHelp(VALIndex(cvfdt->children, 1), out, indent + 2);        _CVFDTPrintSpaces(out, indent);    fprintf(out, ")\n");  }  else if ( cvfdt->dtreeNode->nodeType == dtnGrowing )    fprintf( out, "%ld (%d): (growing)\n", cvfdt->uid, cvfdt->examplesSeen );}void CVFDTPrint( CVFDTPtr cvfdt, FILE *out) {  _CVFDTPrintHelp(cvfdt, out, 0);}CVFDTPtr CVFDTNew( VFDTPtr vfdt,		   DecisionTreePtr dt,		   long uid ) {  CVFDTPtr cvfdt = MNewPtr(sizeof(CVFDT));  cvfdt->vfdt = vfdt;  cvfdt->dtreeNode = dt;  cvfdt->children = 0;  cvfdt->examplesSeen = 0;  cvfdt->uid = uid;  cvfdt->nodeId = gNodeId++;  cvfdt->hasChanged = 0;  cvfdt->altSubtreeInfos = 0;  return cvfdt;}CVFDTPtr CVFDTCreateRoot(ExampleSpecPtr spec, float splitConfidence, float tieConfidence) {  CVFDTPtr cvfdt = MNewPtr(sizeof(CVFDT));  cvfdt->vfdt = VFDTNew( spec, splitConfidence, tieConfidence );  cvfdt->dtreeNode = cvfdt->vfdt->dtree;  cvfdt->children = 0;  cvfdt->examplesSeen = 0;  cvfdt->uid = 1;  cvfdt->nodeId = gNodeId++;  cvfdt->hasChanged = 0;  cvfdt->altSubtreeInfos = 0;  return cvfdt;}void CVFDTAddChild(CVFDTPtr parent, DecisionTreePtr newChild ){  CVFDTPtr child = MNewPtr(sizeof(CVFDT));  child->vfdt = parent->vfdt;  child->dtreeNode = newChild;  child->children = 0;  child->hasChanged = 0;  child->examplesSeen = 0;  child->nodeId = gNodeId++;  if ( parent->children == 0 )    parent->children = VALNew();  VALAppend( parent->children, child );  child->uid = parent->uid * 10 + VALLength( parent->children );  child->altSubtreeInfos = 0;}CVFDTPtr CVFDTOneStepClassify(CVFDTPtr cvfdt, ExamplePtr e) {  if(cvfdt->dtreeNode->nodeType == dtnLeaf      || cvfdt->dtreeNode->nodeType == dtnGrowing) {    return cvfdt;  }  if(cvfdt->dtreeNode->nodeType == dtnDiscrete) {    if(ExampleIsAttributeUnknown(e, cvfdt->dtreeNode->splitAttribute)) {      /* HERE do something smarter */      return VALIndex(cvfdt->children, 0);    } else {      return VALIndex(cvfdt->children, 		      ExampleGetDiscreteAttributeValue(e, cvfdt->dtreeNode->splitAttribute));    }  } else if(cvfdt->dtreeNode->nodeType == dtnContinuous) {    if(ExampleIsAttributeUnknown(e, cvfdt->dtreeNode->splitAttribute)) {      /* HERE do something smarter */      return VALIndex(cvfdt->children, 0);    } else if(ExampleGetContinuousAttributeValue(e, cvfdt->dtreeNode->splitAttribute) >=	      cvfdt->dtreeNode->splitThreshold) {      return VALIndex(cvfdt->children, 1);    } else { /* the value is < the threshold */      return VALIndex(cvfdt->children, 0);    }        }    return cvfdt;}DecisionTreePtr CVFDTGetLearnedTree(CVFDTPtr cvfdt) {  int i;  VoidAListPtr list = VALNew();  ExampleGroupStatsPtr egs;  DecisionTreePtr growNode;  DecisionTreePtr finalTree = DecisionTreeClone(cvfdt->dtreeNode);    DecisionTreeGatherGrowingNodes(finalTree, list);    for(i = VALLength(list) - 1 ; i >= 0 ; i--) {    growNode = VALIndex(list, i);    DecisionTreeSetTypeLeaf(growNode);        egs = ((VFDTGrowingDataPtr)DecisionTreeGetGrowingData(growNode))->egs;    DecisionTreeSetClass(growNode, ExampleGroupStatsGetMostCommonClass(egs));  }  VALFree(list);    return finalTree;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -