⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 c45dt.cpp

📁 这是一个改进的C4.5决策树算法C++类
💻 CPP
📖 第 1 页 / 共 5 页
字号:
/**************************************************************************
*** All rights reserved.
***文件名称:C45DT.cpp
***文件标识:见配置管理计划书
***摘要:本程序是一个C4.5决策树类。
***
*** 当前版本:1.1
*** 改进者:wuxing,km_wx@yahoo.com.cn
*** 完成日期:2004年9月11日
**************************************************************************/
//********************************************************************************
//*									       *
//*  Copyright J.R. Quinlan, 1987, 1988, 1989, 1990, 1991, 1992.  This software  *
//*  may not be distributed in any form without permission of the copyright      *
//*  holder.								       *
//*									       *
//********************************************************************************
#include "C45DT.h"
#include "RoughSet.h"
#include "SampleSet.h"
#include "wxdebuglog.h"

#include <stdlib.h>
#include <math.h> 
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <fstream.h>

#define C45DT_DEBUG
//#define PRUNETREE
//#define USE_RULES
//#define DEBUG_RST

WXDebugLog<char,float,int> objLogF_C45DT;

C45DT::C45DT()
{
	/* Open the log file and write the report banner plus a timestamp
	   before any settings are assigned. */
	objLogF_C45DT.OpenLogFile("Log_C45DT.txt",ios::out);
	objLogF_C45DT.WriteT1("*** C45DT - <SetParas Report> ***\n\n");
	objLogF_C45DT.WriteMilliSecond();

	/* Members that start out as DTTrue */
	BATCH     = DTTrue;
	GAINRATIO = DTTrue;
	UNSEENS   = DTTrue;
	AllKnown  = DTTrue;

	/* Members that start out as DTFalse */
	PROBTHRESH = DTFalse;
	SUBSET     = DTFalse;

	/* Numeric defaults */
	VERBOSITY   = 0;
	WINDOW      = 0;
	INCREMENT   = 0;
	TRIALS      = 10;
	MINOBJS     = 2;
	MaxDiscrVal = 2;
	CF          = 0.25;

	/* getopt() starts scanning at Argv[optind] */
	optind = 1;

	/* File stem; GetNames() appends ".names" to it */
	strcpy(FileName,"Log_C45DT");

	/* Reset every working pointer and TRf */
	initializeParas();

	/* Tree pointers are not touched by initializeParas() */
	DecisionTree = Nil;
	Raw          = Nil;
	Pruned       = Nil;
}

/* Destructor: runs the two clear routines in this order.
   NOTE(review): ClearLearnDT/ClearTestDT are defined outside this view;
   presumably they release learning-phase and test-phase storage -- confirm. */
C45DT::~C45DT()
{
	ClearLearnDT();
	ClearTestDT();
}

/* Put every working pointer member into the "not allocated" state (Nil)
   and zero TRf.  Nothing is freed here; the members are only overwritten. */
void C45DT::initializeParas()
{
	/* Names read from the .names file, and the per-attribute tables
	   that GetNames() allocates alongside them */
	ClassName     = Nil;
	AttName       = Nil;
	AttValName    = Nil;
	MaxAttVal     = Nil;
	SpecialStatus = Nil;

	/* Case storage */
	Item = Nil;

	/* Group marked "build.c" in the original */
	Weight    = Nil;
	Freq      = Nil;
	ValFreq   = Nil;
	ClassFreq = Nil;

	/* Group marked "trees.c" in the original */
	Tested = Nil;

	Gain        = Nil;
	Info        = Nil;
	Bar         = Nil;
	UnknownRate = Nil;
	fTreeSwap   = Nil;

	/* Group marked "consult.c" in the original */
	LowClassSum = Nil;
	ClassSum    = Nil;	/* accumulated central estimates */

	/* Remaining work arrays */
	PossibleValues = Nil;
	Subset         = Nil;

	LHSErr = Nil;
	RHSErr = Nil;

	CVals     = Nil;
	SplitGain = Nil;
	SplitInfo = Nil;
	Subsets   = Nil;	/* Subsets[a] = no. subsets for att a */

	TargetClassFreq = Nil;
	ThreshErrs      = Nil;

	Slice1 = Nil;
	Slice2 = Nil;

	TRf = 0;
}

/* Print the program banner to stdout:
 *
 *     <blank line>
 *     C4.5 [release R] Title<TAB><current date and time>
 *     ---------------------- (one dash per character of the title part)
 *
 * Title - text appended after the release tag; may be of any length.
 *
 * The title is written straight to stdout rather than being formatted
 * into a fixed 80-byte buffer first, so a long Title can no longer
 * overrun the stack.
 */
void C45DT::PrintHeader(char *Title)
{
    time_t Now;
    const char *Stamp;
    int Width;

    time(&Now);
    Stamp = ctime(&Now);	/* ends with '\n'; may be null on failure */

    /*  printf returns the number of characters written; subtracting the
	leading newline gives the width of the title text  */

    Width = printf("\nC4.5 [release %s] %s", RELEASE, Title) - 1;

    /*  Keep the line structure even when no timestamp is available  */

    printf("\t%s", Stamp ? Stamp : "\n");

    /*  A failed printf yields a negative width, so no dashes are drawn  */

    while ( Width-- > 0 ) putchar('-');
    putchar('\n');
}


/*************************************************************************/
/*									 */
/*  This file is included in case your version of Unix doesn't include   */
/*  the getopt utility.  If it does, discard this file and amend the     */
/*  Makefile accordingly.						 */
/*									 */
/*  There is no copyright on this file.					 */
/*	getopt.c								 */
/*************************************************************************/
/* Minimal option scanner.
 *
 * Argc, Argv - argument vector to scan; scanning resumes at Argv[optind].
 * Str        - list of accepted option letters; a letter followed by ':'
 *              takes an argument.
 *
 * Returns EOF when optind has reached Argc, '?' when the next argument
 * does not start with '-', names a letter not in Str, or lacks its
 * required argument; otherwise the option letter.  For options that take
 * an argument, optarg is set to the rest of the same word if non-empty,
 * else to the following word.
 */
int C45DT::getopt(int Argc,char **Argv,char *Str)
{
    /*  Nothing left to scan  */

    if ( optind >= Argc ) return EOF;

    char *Word = Argv[optind];
    optind++;

    /*  Every option word must start with a dash  */

    if ( Word[0] != '-' ) return '?';

    int Letter = Word[1];
    char *Rest = Word + 2;	/* text after the letter; only read on a match */

    /*  Look the letter up in the list of accepted options  */

    char *Spec;
    for ( Spec = Str ; *Spec ; Spec++ )
    {
	if ( *Spec == Letter ) break;
    }

    if ( *Spec == '\0' ) return '?';

    /*  A ':' after the letter means the option takes an argument  */

    if ( Spec[1] == ':' )
    {
	if ( *Rest )
	{
	    optarg = Rest;
	}
	else if ( optind < Argc )
	{
	    optarg = Argv[optind];
	    optind++;
	}
	else
	{
	    Letter = '?';
	}
    }

    return Letter;
}

/*************************************************************************/
/*									 */
/*	Get names of classes, attributes and attribute values		 */
/*	-----------------------------------------------------		 */
/*	getnames.c								 */
/************************************************************************/
/*************************************************************************/
/*									 */
/*  Read a name from file f into C45_String s, setting Delimiter.		 */
/*									 */
/*  - Embedded periods are permitted, but periods followed by space	 */
/*    characters act as delimiters.					 */
/*  - Embedded spaces are permitted, but multiple spaces are replaced	 */
/*    by a single space.						 */
/*  - Any character can be escaped by '\'.				 */
/*  - The remainder of a line following '|' is ignored.			 */
/*									 */
/*************************************************************************/

/* Read one name from f into s and record the character that ended it in
 * the member Delimiter.
 *
 * f - open input stream.
 * s - destination buffer.  No length is passed in, so no bound is
 *     enforced; the caller must supply a buffer large enough for the
 *     longest name in the file.
 *
 * Returns DTFalse (with Delimiter = EOF) if end of file is reached before
 * any name starts, DTTrue otherwise.  The returned name may be empty when
 * the first significant character is itself a delimiter.
 *
 * NOTE(review): Space and SkipComment are macros defined outside this
 * view.  SkipComment is used as a bare statement, so it presumably refers
 * to the locals `c` and `f` by name -- do not rename them without checking.
 */
Boolean C45DT::ReadName(FILE *f, C45_String s)
{
    char *Sp=s;
    int c;

    /*  Skip to first non-space character, discarding comments  */

    while ( ( c = getc(f) ) == '|' || Space(c) )
    {
	if ( c == '|' ) SkipComment;
    }

    /*  Return DTFalse if no names to read  */

    if ( c == EOF )
    {
	Delimiter = EOF;
	return DTFalse;
    }

    /*  Read in characters up to the next delimiter  */

    while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF )
    {
	if ( c == '.' )
	{
	    /*  A period followed by space, comment or end of file ends
		the name; otherwise it is an ordinary embedded character.
		Stopping at EOF keeps the EOF value out of the name  */

	    c = getc(f);
	    if ( c == '|' || Space(c) || c == EOF ) break;
	    *Sp++ = '.';
	}

	if ( c == '\\' )
	{
	    /*  Escape: take the next character literally.  A backslash
		that is the last character of the file is dropped  */

	    c = getc(f);
	    if ( c == EOF ) break;
	}

	*Sp++ = c;

	if ( c == ' ' )
	{
	    /*  Collapse a run of spaces into the one just stored  */

	    while ( ( c = getc(f) ) == ' ' )
		;
	}
	else
	{
	    c = getc(f);
	}
    }

    if ( c == '|' ) SkipComment;
    Delimiter = c;

    /*  Strip trailing spaces; the Sp > s guard stops the scan from
	reading in front of the buffer when the name is empty  */

    while ( Sp > s && Space(*(Sp-1)) ) Sp--;

    *Sp++ = '\0';
    return DTTrue;
}



/*************************************************************************/
/*									 */
/*  Read the names of classes, attributes and legal attribute values.	 */
/*  On completion, these names are stored in:				 */
/*	ClassName	-  class names					 */
/*	AttName		-  attribute names				 */
/*	AttValName	-  attribute value names			 */
/*  with:								 */
/*	MaxAttVal	-  number of values for each attribute		 */
/*									 */
/*  Other global variables set are:					 */
/*	MaxAtt		-  maximum attribute number			 */
/*	MaxClass	-  maximum class number				 */
/*	MaxDiscrVal	-  maximum discrete values for any attribute	 */
/*									 */
/*  Note:  until the number of attributes is known, the name		 */
/*	   information is assembled in local arrays			 */

/*	modified by wuxing ,2003-9-3								 */
/*************************************************************************/

/* Read "<FileName>.names" and fill in ClassName, AttName, AttValName,
 * MaxAttVal, SpecialStatus, MaxClass, MaxAtt and MaxDiscrVal.
 *
 * File layout as consumed here: first a comma-separated list of class
 * names, then one entry per attribute of the form
 *     name: value, value, ...
 * An attribute with a single value must be one of the keywords
 * "continuous", "discrete N" (N >= 2) or "ignore".
 *
 * Attribute values are stored from index 1 upwards (MaxAttVal is
 * incremented before each store).  For a "discrete N" attribute, slot 0
 * holds N cast to a pointer.
 *
 * Failure to open the file is fatal (Quinlan_Error code 0 exits).  Format
 * errors 1-3 are reported through Quinlan_Error and parsing continues.
 */
void C45DT::GetNames()
/*  ---------  */
{
    FILE *Nf;
    char Fn[100], Buffer[1000] = "";	/* Buffer starts empty so that an
					   empty names file cannot cause
					   uninitialised bytes to be copied */
    DiscrValue v;
    int AttCeiling=100, ClassCeiling=100, ValCeiling;

    /*  Open names file; refuse a stem that would not fit in Fn together
	with the ".names" suffix instead of overrunning the buffer  */

    if ( strlen(FileName) + strlen(".names") >= sizeof(Fn) )
    {
	Quinlan_Error(0, FileName, ".names");	/* code 0 exits */
	return;
    }

    strcpy(Fn, FileName);
    strcat(Fn, ".names");
    if ( ! ( Nf = fopen(Fn, "r") ) ) Quinlan_Error(0, Fn, "");

    /*  Get class names from names file  */

    ClassName = (C45_String *) calloc(ClassCeiling, sizeof(C45_String));
    MaxClass = -1;	/* incremented before each store, so the first
			   class lands at index 0 */
    do
    {
	ReadName(Nf, Buffer);

	/*  Grow the table by 100 entries when it is full  */

	if ( ++MaxClass >= ClassCeiling)
	{
	    ClassCeiling += 100;
	    ClassName = (C45_String *) realloc(ClassName, ClassCeiling*sizeof(C45_String));
	}
	ClassName[MaxClass] = CopyString(Buffer);
    }
    while ( Delimiter == ',' );	/* a comma means another class follows */

    /*  Get attribute and attribute value names from names file  */

    AttName = (C45_String *) calloc(AttCeiling, sizeof(C45_String));
    MaxAttVal = (DiscrValue *) calloc(AttCeiling, sizeof(DiscrValue));
    AttValName = (C45_String **) calloc(AttCeiling, sizeof(C45_String *));
    SpecialStatus = (char *) malloc(AttCeiling);

    MaxAtt = -1;
    while ( ReadName(Nf, Buffer) )
    {
	/*  An attribute name must be followed by a colon  */

	if ( Delimiter != ':' ) Quinlan_Error(1, Buffer, "");

	/*  Grow all per-attribute tables together when they are full  */

	if ( ++MaxAtt >= AttCeiling )
	{
	    AttCeiling += 100;
	    AttName = (C45_String *) realloc(AttName, AttCeiling*sizeof(C45_String));
	    MaxAttVal = (DiscrValue *) realloc(MaxAttVal, AttCeiling*sizeof(DiscrValue));
	    AttValName = (C45_String **) realloc(AttValName, AttCeiling*sizeof(C45_String *));
	    SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling);
	}

	/*  Record the attribute name and start an empty value list  */

	AttName[MaxAtt] = CopyString(Buffer);
	SpecialStatus[MaxAtt] = Nil;
	MaxAttVal[MaxAtt] = 0;
	ValCeiling = 100;
	AttValName[MaxAtt] =Nil;
	AttValName[MaxAtt] = (C45_String *) calloc(ValCeiling, sizeof(C45_String));

	/*  Read all values of this attribute  */

	do
	{
	    /*  Unexpected eof while reading attribute  */

	    if ( ! ( ReadName(Nf, Buffer) ) ) Quinlan_Error(2, AttName[MaxAtt], "");

	    if ( ++MaxAttVal[MaxAtt] >= ValCeiling )
	    {
		ValCeiling += 100;
		AttValName[MaxAtt] =
		    (C45_String *) realloc(AttValName[MaxAtt], ValCeiling*sizeof(C45_String));
	    }

	    AttValName[MaxAtt][MaxAttVal[MaxAtt]] = CopyString(Buffer);
	}
	while ( Delimiter == ',' );

	if ( MaxAttVal[MaxAtt] == 1 )
	{
	    /*  Exactly one value: check for special treatment  */

	    if ( ! strcmp(Buffer, "continuous") )
	    {
		/*  Continuous attribute: nothing more to record  */
	    }
	    else if ( ! strncmp(Buffer, "discrete", 8) )
	    {
		/*  Value starts with "discrete": the rest is the maximum
		    number of values.  strncmp stops at the terminator, so
		    a shorter Buffer is never read past its end  */

		SpecialStatus[MaxAtt] = DISCRETE;

		/*  Read max values, reserve space and check MaxDiscrVal  */

		v = atoi(&Buffer[8]);
		if ( v < 2 )
		{
		    printf("** %s: illegal number of discrete values\n",
			   AttName[MaxAtt]);
		    exit(1);
		}

		AttValName[MaxAtt] =
		    (C45_String *) realloc(AttValName[MaxAtt], (v+2)*sizeof(C45_String));

		/*  Slot 0 carries the count itself, not a string  */

		AttValName[MaxAtt][0] = (char *) v;
		if ( v > MaxDiscrVal ) MaxDiscrVal = v;
	    }
	    else if ( ! strcmp(Buffer, "ignore") )
	    {
		SpecialStatus[MaxAtt] = IGNORE;
	    }
	    else
	    {
		/*  Cannot have only one discrete value for an attribute  */

		Quinlan_Error(3, AttName[MaxAtt], "");
	    }

	    /*  Single-value (special) attributes report zero listed values  */

	    MaxAttVal[MaxAtt] = 0;
	}
	else if ( MaxAttVal[MaxAtt] > MaxDiscrVal )
	{
	    /*  Track the largest value count over all attributes  */

	    MaxDiscrVal = MaxAttVal[MaxAtt];
	}
    }

    fclose(Nf);
}

/*************************************************************************/
/*									 */
/*	Locate value Val in List[First] to List[Last]			 */
/*									 */
/*************************************************************************/

/*在每个属性的值域中查找当前值的位置编号*/
/* Linear search for Val in List[First..Last] (inclusive bounds).
 * Returns the index of the first entry equal to Val, or First-1 when
 * no entry matches.
 */
int C45DT::Which(C45_String Val,C45_String List[],short First,short Last)
{
    short Pos;

    for ( Pos = First ; Pos <= Last ; Pos++ )
    {
	if ( strcmp(Val, List[Pos]) == 0 ) return Pos;
    }

    /*  Not found: one below the start of the searched range  */

    return First-1;
}

/*************************************************************************/
/*									 */
/*	Allocate space then copy C45_String into it				 */
/*									 */
/*************************************************************************/

/* Return a freshly allocated copy of the NUL-terminated string x.
 * The copy comes from malloc, so release it with free().
 * If the allocation fails, an error is printed and the program exits,
 * rather than copying into a null pointer.
 */
C45_String C45DT::CopyString(C45_String x)
{
    size_t Bytes = strlen(x) + 1;	/* text plus terminator */
    char *s = (char *) malloc(Bytes);

    if ( ! s )
    {
	printf("\nERROR:  out of memory while copying a string\n");
	exit(1);
    }

    memcpy(s, x, Bytes);
    return s;
}

/*************************************************************************/
/*									 */
/*			Error messages					 */
/*									 */
/*************************************************************************/

/* Print the error message selected by n to stdout.
 *
 * n      - message number 0..5; any other value prints only the
 *          "ERROR:" prefix.
 * s1, s2 - strings substituted into the message; which of them is used
 *          depends on n.
 *
 * Message 0 (file cannot be opened) terminates the program at once.
 * Every other call is counted in a function-local static, and the
 * program terminates once more than 10 have been counted.
 */
void C45DT::Quinlan_Error(short n,C45_String s1,C45_String s2)
{
    static char Reported=0;

    printf("\nERROR:  ");

    if ( n == 0 )
    {
	printf("cannot open file %s%s\n", s1, s2);
	exit(1);
    }
    else if ( n == 1 )
    {
	printf("colon expected after attribute name %s\n", s1);
    }
    else if ( n == 2 )
    {
	printf("unexpected eof while reading attribute %s\n", s1);
    }
    else if ( n == 3 )
    {
	printf("attribute %s has only one value\n", s1);
    }
    else if ( n == 4 )
    {
	printf("case %d's value of '%s' for attribute %s is illegal\n",
	       MaxItem+1, s2, s1);
    }
    else if ( n == 5 )
    {
	printf("case %d's class of '%s' is illegal\n", MaxItem+1, s2);
    }

    /*  Give up once too many errors have been reported  */

    Reported++;
    if ( Reported > 10 )
    {
	printf("Error limit exceeded\n");
	exit(1);
    }
}

/*************************************************************************/
/*									 */
/*	Get case descriptions from data file				 */
/*	--------------------------------------				 */
/*	getdata.c								 */
/*************************************************************************/
/*************************************************************************/
/*									 */
/*  Read raw case descriptions from file with given extension.		 */
/*									 */
/*  On completion, cases are stored in array Item in the form		 */
/*  of Descriptions (i.e. arrays of attribute values), and		 */
/*  MaxItem is set to the number of data items.				 */
/*									 */
/*************************************************************************/

void C45DT::GetData(C45_String Extension)
{
    FILE *Df;
    char Fn[100];
    ItemNo i=0,ItemSpace=0;

    /*  Open data file  */

    
    if ( ! ( Df = fopen(Fn, "r") ) ) Quinlan_Error(0, Fn, "");

    do
    {
		MaxItem = i;

		/*  Make sure there is room for another item  */

		if ( i >= ItemSpace )/*第一次成立*/
		{
			if ( ItemSpace )/*第一次ItemSpace=0不成立*/
			{
			ItemSpace += Inc;/*一次加2048*/
			Item = (Description *)/*重新分配Description结构体内存空间*/
				realloc(Item, ItemSpace*sizeof(Description));
			}
			else/*第一次分配内存空间*/
			{
			Item = (Description *)
				malloc((ItemSpace=Inc)*sizeof(Description));
			}
		}

		Item[i] = GetDescription(Df);/*返回属性指针*/

    } while ( Item[i] != Nil && ++i );

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -