📄 c45dt.cpp
字号:
/**************************************************************************
*** All rights reserved.
*** File name: C45DT.cpp
*** File identifier: see the configuration management plan
*** Summary: this program is a C4.5 decision tree class.
***
*** Current version: 1.1
*** Modified by: wuxing, km_wx@yahoo.com.cn
*** Completion date: 11 September 2004
**************************************************************************/
//********************************************************************************
//* *
//* Copyright J.R. Quinlan, 1987, 1988, 1989, 1990, 1991, 1992. This software *
//* may not be distributed in any form without permission of the copyright *
//* holder. *
//* *
//********************************************************************************
#include "C45DT.h"
#include "RoughSet.h"
#include "SampleSet.h"
#include "wxdebuglog.h"
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <fstream.h>
// Compile-time switches. Only C45DT_DEBUG is defined; PRUNETREE, USE_RULES and
// DEBUG_RST are left commented out. None of the four is tested in this portion
// of the file.
#define C45DT_DEBUG
//#define PRUNETREE
//#define USE_RULES
//#define DEBUG_RST
// File-scope log object. The C45DT constructor opens "Log_C45DT.txt" through it.
WXDebugLog<char,float,int> objLogF_C45DT;
// Default constructor.
// Opens the log file and writes the report banner, loads the default option
// values, then clears every pointer member (working arrays through
// initializeParas(), the three tree pointers directly).
C45DT::C45DT()
{
    // --- log file -------------------------------------------------------
    objLogF_C45DT.OpenLogFile("Log_C45DT.txt", ios::out);
    objLogF_C45DT.WriteT1("*** C45DT - <SetParas Report> ***\n\n");
    objLogF_C45DT.WriteMilliSecond();

    // --- boolean options ------------------------------------------------
    BATCH      = DTTrue;
    GAINRATIO  = DTTrue;
    UNSEENS    = DTTrue;
    AllKnown   = DTTrue;
    PROBTHRESH = DTFalse;
    SUBSET     = DTFalse;

    // --- numeric options ------------------------------------------------
    VERBOSITY   = 0;
    TRIALS      = 10;
    WINDOW      = 0;
    INCREMENT   = 0;
    MINOBJS     = 2;
    CF          = 0.25;
    MaxDiscrVal = 2;

    // --- option parsing state and file stem -----------------------------
    optind = 1;
    strcpy(FileName, "Log_C45DT");

    // --- pointer members start out empty --------------------------------
    initializeParas();
    DecisionTree = Nil;
    Raw          = Nil;
    Pruned       = Nil;
}
// Destructor: hands cleanup over to ClearLearnDT() and then ClearTestDT().
// NOTE(review): both helpers are defined outside this part of the file;
// presumably they release the training and test structures - confirm there.
C45DT::~C45DT()
{
ClearLearnDT();
ClearTestDT();
}
// Resets the working-array pointer members to Nil and TRf to 0.
// Nothing is freed here: the previous values are simply overwritten.
void C45DT::initializeParas()
{
    PossibleValues  = Nil;
    Subset          = Nil;
    LHSErr          = Nil;
    RHSErr          = Nil;
    SpecialStatus   = Nil;
    CVals           = Nil;
    SplitGain       = Nil;
    SplitInfo       = Nil;
    Subsets         = Nil;      /* Subsets[a] = no. subsets for att a */
    TargetClassFreq = Nil;
    ThreshErrs      = Nil;
    ClassName       = Nil;
    AttName         = Nil;
    AttValName      = Nil;
    Slice1          = Nil;
    Slice2          = Nil;

    /* build.c */
    Weight    = Nil;
    Freq      = Nil;
    ValFreq   = Nil;
    ClassFreq = Nil;
    MaxAttVal = Nil;
    Item      = Nil;

    /* trees.c */
    Tested      = Nil;
    Gain        = Nil;
    Info        = Nil;
    Bar         = Nil;
    UnknownRate = Nil;
    fTreeSwap   = Nil;

    /* consult.c */
    LowClassSum = Nil;
    ClassSum    = Nil;          /* accumulated central estimates */
    TRf         = 0;
}
// Prints the program banner to stdout:
//   a blank line, "C4.5 [release <RELEASE>] <Title>", a tab, the current time
//   as formatted by ctime() (which ends in a newline), then a row of '-' as
//   long as the title text, then a newline.
//
// Title - NUL-terminated text appended to the banner; any length is accepted.
//
// The title is written straight to stdout rather than being formatted into a
// fixed-size scratch buffer first, so a long Title can no longer overflow the
// stack. The underline length comes from printf's return value.
void C45DT::PrintHeader(char *Title)
{
    time_t clock;
    int Written, Underline;

    time(&clock);

    /* printf reports how many characters it wrote; one of them is the
       leading '\n', which is not part of the underlined text. A negative
       value means an output error, in which case no underline is drawn. */
    Written = printf("\nC4.5 [release %s] %s", RELEASE, Title);
    printf("\t%s", ctime(&clock));

    for ( Underline = Written - 1; Underline > 0; Underline-- )
    {
        putchar('-');
    }
    putchar('\n');
}
/*************************************************************************/
/* */
/* This file is included in case your version of Unix doesn't include */
/* the getopt utility. If it does, discard this file and amend the */
/* Makefile accordingly. */
/* */
/* There is no copyright on this file. */
/* getopt.c */
/*************************************************************************/
// Minimal getopt replacement working on the members optind and optarg.
//
// Argc, Argv - argument vector being scanned; Argv[optind] is examined.
// Str        - list of accepted option letters; a letter followed by ':'
//              takes an argument.
//
// Returns EOF when optind has reached Argc, '?' when the argument does not
// start with '-', names a letter that is not in Str, or lacks its required
// argument; otherwise the option letter. For options that take an argument,
// optarg is pointed at it (either the rest of the same word or the next word).
int C45DT::getopt(int Argc,char **Argv,char *Str)
{
    char *Word;
    char *Spec;
    int Letter;

    /* Nothing left to scan */
    if ( optind >= Argc )
    {
        return EOF;
    }

    Word = Argv[optind];
    optind++;

    /* Every option must be introduced by '-' */
    if ( Word[0] != '-' )
    {
        return '?';
    }
    Letter = Word[1];

    /* Look the letter up in the list of accepted options */
    for ( Spec = Str; *Spec != '\0' && *Spec != Letter; Spec++ )
    {
    }
    if ( *Spec == '\0' )
    {
        return '?';
    }

    /* Plain flag: nothing more to do */
    if ( Spec[1] != ':' )
    {
        return Letter;
    }

    /* Option with argument: attached ("-xVALUE") or the following word */
    if ( Word[2] != '\0' )
    {
        optarg = Word + 2;
    }
    else if ( optind < Argc )
    {
        optarg = Argv[optind];
        optind++;
    }
    else
    {
        Letter = '?';
    }

    return Letter;
}
/*************************************************************************/
/* */
/* Get names of classes, attributes and attribute values */
/* ----------------------------------------------------- */
/* getnames.c */
/************************************************************************/
/*************************************************************************/
/* */
/* Read a name from file f into C45_String s, setting Delimiter. */
/* */
/* - Embedded periods are permitted, but periods followed by space */
/* characters act as delimiters. */
/* - Embedded spaces are permitted, but multiple spaces are replaced */
/* by a single space. */
/* - Any character can be escaped by '\'. */
/* - The remainder of a line following '|' is ignored. */
/* */
/*************************************************************************/
// Reads one name from f into s and records the character that ended it in
// the member Delimiter.
//
// f - open input stream.
// s - destination buffer. No length is passed in, so the caller must supply
//     a buffer large enough for the longest name (GetNames uses 1000 chars).
//
// Returns DTFalse (with Delimiter == EOF) when the stream is exhausted before
// any name starts, DTTrue otherwise. The name may come back empty, e.g. when
// the first significant character is already a delimiter.
//
// NOTE(review): Space and SkipComment are macros defined outside this part of
// the file; SkipComment appears to operate on the locals named c and f, so
// those two names must not be changed.
Boolean C45DT::ReadName(FILE *f, C45_String s)
{
    char *Sp=s;
    int c;

    /* Skip to first non-space character, discarding '|' comments */
    while ( ( c = getc(f) ) == '|' || Space(c) )
    {
        if ( c == '|' ) SkipComment;
    }

    /* Return DTFalse if no names to read */
    if ( c == EOF )
    {
        Delimiter = EOF;
        return DTFalse;
    }

    /* Read in characters up to the next delimiter */
    while ( c != ':' && c != ',' && c != '\n' && c != '|' && c != EOF )
    {
        if ( c == '.' )
        {
            /* A period ends the name when followed by white space, a comment
               or the end of the file; otherwise it is an ordinary character */
            c = getc(f);
            if ( c == '|' || Space(c) || c == EOF ) break;
            *Sp++ = '.';
        }

        if ( c == '\\' )
        {
            /* Escaped character: take the next one literally. A backslash
               that is the last byte of the file escapes nothing. */
            c = getc(f);
            if ( c == EOF ) break;
        }

        *Sp++ = c;

        if ( c == ' ' )
        {
            /* Collapse a run of spaces into the single one just stored */
            while ( ( c = getc(f) ) == ' ' )
                ;
        }
        else
        {
            c = getc(f);
        }
    }

    if ( c == '|' ) SkipComment;
    Delimiter = c;

    /* Strip trailing spaces, never stepping back past the start of s
       (nothing has been stored when the name is empty) */
    while ( Sp > s && Space(*(Sp-1)) ) Sp--;
    *Sp = '\0';

    return DTTrue;
}
/*************************************************************************/
/* */
/* Read the names of classes, attributes and legal attribute values. */
/* On completion, these names are stored in: */
/* ClassName - class names */
/* AttName - attribute names */
/* AttValName - attribute value names */
/* with: */
/* MaxAttVal - number of values for each attribute */
/* */
/* Other global variables set are: */
/* MaxAtt - maximum attribute number */
/* MaxClass - maximum class number */
/* MaxDiscrVal - maximum discrete values for any attribute */
/* */
/* Note: until the number of attributes is known, the name */
/* information is assembled in local arrays */
/* modified by wuxing ,2003-9-3 */
/*************************************************************************/
// Reads "<FileName>.names" and fills ClassName, AttName, AttValName,
// MaxAttVal and SpecialStatus, setting MaxClass, MaxAtt and MaxDiscrVal.
//
// Errors are reported through Quinlan_Error; a file that cannot be opened
// (or whose name does not fit the local path buffer) and an illegal
// "discrete N" count both terminate the program.
void C45DT::GetNames()
{
    FILE *Nf;
    char Fn[100], Buffer[1000];
    DiscrValue v;
    int AttCeiling=100, ClassCeiling=100, ValCeiling;

    /* ReadName leaves the buffer untouched when it hits end of file, so
       start from an empty string rather than uninitialised memory */
    Buffer[0] = '\0';

    /* Open names file. Refuse a stem that would overflow Fn; error 0
       prints "cannot open file ..." and exits. */
    if ( strlen(FileName) + strlen(".names") >= sizeof(Fn) )
    {
        Quinlan_Error(0, FileName, ".names");
    }
    strcpy(Fn, FileName);
    strcat(Fn, ".names");
    if ( ! ( Nf = fopen(Fn, "r") ) ) Quinlan_Error(0, Fn, "");

    /* Get class names from names file */
    ClassName = (C45_String *) calloc(ClassCeiling, sizeof(C45_String));
    MaxClass = -1;
    do
    {
        ReadName(Nf, Buffer);

        /* Grow the table in steps of 100 entries */
        if ( ++MaxClass >= ClassCeiling)
        {
            ClassCeiling += 100;
            ClassName = (C45_String *) realloc(ClassName, ClassCeiling*sizeof(C45_String));
        }
        ClassName[MaxClass] = CopyString(Buffer);
    }while ( Delimiter == ',' );    /* a comma means another class name follows */

    /* Get attribute and attribute value names from names file */
    AttName = (C45_String *) calloc(AttCeiling, sizeof(C45_String));
    MaxAttVal = (DiscrValue *) calloc(AttCeiling, sizeof(DiscrValue));
    AttValName = (C45_String **) calloc(AttCeiling, sizeof(C45_String *));
    SpecialStatus = (char *) malloc(AttCeiling);
    MaxAtt = -1;
    while ( ReadName(Nf, Buffer) )
    {
        /* An attribute name must be followed by a colon */
        if ( Delimiter != ':' ) Quinlan_Error(1, Buffer, "");

        /* Grow the per-attribute tables in steps of 100 entries */
        if ( ++MaxAtt >= AttCeiling )
        {
            AttCeiling += 100;
            AttName = (C45_String *) realloc(AttName, AttCeiling*sizeof(C45_String));
            MaxAttVal = (DiscrValue *) realloc(MaxAttVal, AttCeiling*sizeof(DiscrValue));
            AttValName = (C45_String **) realloc(AttValName, AttCeiling*sizeof(C45_String *));
            SpecialStatus = (char *) realloc(SpecialStatus, AttCeiling);
        }

        /* Record the attribute name and start an empty value list */
        AttName[MaxAtt] = CopyString(Buffer);
        SpecialStatus[MaxAtt] = Nil;
        MaxAttVal[MaxAtt] = 0;
        ValCeiling = 100;
        AttValName[MaxAtt] =Nil;
        AttValName[MaxAtt] = (C45_String *) calloc(ValCeiling, sizeof(C45_String));

        /* Read all values of this attribute; values are stored from index 1 */
        do
        {
            /* Error 2: unexpected eof while reading attribute */
            if ( ! ( ReadName(Nf, Buffer) ) ) Quinlan_Error(2, AttName[MaxAtt], "");

            if ( ++MaxAttVal[MaxAtt] >= ValCeiling )
            {
                ValCeiling += 100;
                AttValName[MaxAtt] =
                    (C45_String *) realloc(AttValName[MaxAtt], ValCeiling*sizeof(C45_String));
            }
            AttValName[MaxAtt][MaxAttVal[MaxAtt]] = CopyString(Buffer);
        }while ( Delimiter == ',' );

        if ( MaxAttVal[MaxAtt] == 1 )
        {
            /* A single "value" is really a type keyword: check for special treatment */
            if ( ! strcmp(Buffer, "continuous") )
            {}  /* continuous attribute: SpecialStatus stays Nil */
            else if ( ! memcmp(Buffer, "discrete", 8) )
            {
                /* "discrete N": values are not listed, only their maximum count */
                SpecialStatus[MaxAtt] = DISCRETE;

                /* Read max values, reserve space and check MaxDiscrVal */
                v = atoi(&Buffer[8]);
                if ( v < 2 )
                {
                    printf("** %s: illegal number of discrete values\n",
                           AttName[MaxAtt]);
                    exit(1);
                }
                AttValName[MaxAtt] =
                    (C45_String *) realloc(AttValName[MaxAtt], (v+2)*sizeof(C45_String));

                /* Slot 0 holds the count itself, stored in a pointer-typed element */
                AttValName[MaxAtt][0] = (char *) v;
                if ( v > MaxDiscrVal ) MaxDiscrVal = v;
            }
            else if ( ! strcmp(Buffer, "ignore") )
            {
                SpecialStatus[MaxAtt] = IGNORE;
            }
            else
            {
                /* Cannot have only one discrete value for an attribute */
                Quinlan_Error(3, AttName[MaxAtt], "");
            }

            /* Attributes declared by keyword have no listed values */
            MaxAttVal[MaxAtt] = 0;
        }
        else if ( MaxAttVal[MaxAtt] > MaxDiscrVal )
        {
            /* Track the largest number of listed values over all attributes */
            MaxDiscrVal = MaxAttVal[MaxAtt];
        }
    }
    fclose(Nf);
}
/*************************************************************************/
/* */
/* Locate value Val in List[First] to List[Last] */
/* */
/*************************************************************************/
/* Look up the position (index) of a value within an attribute's list of values */
// Linear search for Val among List[First] .. List[Last] (both inclusive),
// compared with strcmp.
//
// Returns the index of the first match, or First-1 when Val is not present
// (or when the range is empty).
int C45DT::Which(C45_String Val,C45_String List[],short First,short Last)
{
    short Pos;

    for ( Pos = First; Pos <= Last; Pos++ )
    {
        if ( strcmp(Val, List[Pos]) == 0 )
        {
            return Pos;
        }
    }

    /* Not found: one below the first valid index */
    return First-1;
}
/*************************************************************************/
/* */
/* Allocate space then copy C45_String into it */
/* */
/*************************************************************************/
// Returns a freshly allocated copy of the NUL-terminated string x.
//
// The copy is obtained from malloc, so it is released with free().
// Returns a null pointer when x is null or when the allocation fails,
// instead of writing through an unchecked pointer.
C45_String C45DT::CopyString(C45_String x)
{
    char *s;
    size_t Size;

    if ( x == NULL ) return NULL;

    /* Length is measured once and includes the terminating NUL */
    Size = strlen(x) + 1;

    s = (char *) malloc(Size);
    if ( s != NULL )
    {
        memcpy(s, x, Size);
    }
    return s;
}
/*************************************************************************/
/* */
/* Error messages */
/* */
/*************************************************************************/
// Prints the error message selected by n to stdout.
//
// n  - message number 0..5; any other value prints only the "ERROR: " prefix.
// s1 - first text inserted into the message (file or attribute name).
// s2 - second text inserted into messages 0, 4 and 5.
//
// Message 0 (file cannot be opened) terminates the program immediately.
// All other calls are counted, and the program terminates once more than
// ten of them have been made.
void C45DT::Quinlan_Error(short n,C45_String s1,C45_String s2)
{
    static char Reported=0;     /* calls that returned to the caller so far */

    printf("\nERROR: ");

    if ( n == 0 )
    {
        printf("cannot open file %s%s\n", s1, s2);
        exit(1);
    }
    else if ( n == 1 )
    {
        printf("colon expected after attribute name %s\n", s1);
    }
    else if ( n == 2 )
    {
        printf("unexpected eof while reading attribute %s\n", s1);
    }
    else if ( n == 3 )
    {
        printf("attribute %s has only one value\n", s1);
    }
    else if ( n == 4 )
    {
        printf("case %d's value of '%s' for attribute %s is illegal\n",
               MaxItem+1, s2, s1);
    }
    else if ( n == 5 )
    {
        printf("case %d's class of '%s' is illegal\n", MaxItem+1, s2);
    }

    Reported++;
    if ( Reported > 10 )
    {
        printf("Error limit exceeded\n");
        exit(1);
    }
}
/*************************************************************************/
/* */
/* Get case descriptions from data file */
/* -------------------------------------- */
/* getdata.c */
/*************************************************************************/
/*************************************************************************/
/* */
/* Read raw case descriptions from file with given extension. */
/* */
/* On completion, cases are stored in array Item in the form */
/* of Descriptions (i.e. arrays of attribute values), and */
/* MaxItem is set to the number of data items. */
/* */
/*************************************************************************/
void C45DT::GetData(C45_String Extension)
{
FILE *Df;
char Fn[100];
ItemNo i=0,ItemSpace=0;
/* Open data file */
if ( ! ( Df = fopen(Fn, "r") ) ) Quinlan_Error(0, Fn, "");
do
{
MaxItem = i;
/* Make sure there is room for another item */
if ( i >= ItemSpace )/*第一次成立*/
{
if ( ItemSpace )/*第一次ItemSpace=0不成立*/
{
ItemSpace += Inc;/*一次加2048*/
Item = (Description *)/*重新分配Description结构体内存空间*/
realloc(Item, ItemSpace*sizeof(Description));
}
else/*第一次分配内存空间*/
{
Item = (Description *)
malloc((ItemSpace=Inc)*sizeof(Description));
}
}
Item[i] = GetDescription(Df);/*返回属性指针*/
} while ( Item[i] != Nil && ++i );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -