// clsfydlg.cpp
// (code-viewer "font size" control label omitted; it is not part of the source)
// ClsfyDlg.cpp : implementation file
//
#include "stdafx.h"
#include "sim_tc.h"
#include "ClsfyDlg.h"
#include "FileItem.h"
#include "BayesDlg.h"
#include "SvmDlg.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
// Debug-build boilerplate: `new` is redefined to DEBUG_NEW and THIS_FILE holds
// this file's name (__FILE__).
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
/////////////////////////////////////////////////////////////////////////////
// CClsfyDlg
// Declares CClsfyDlg as dynamically creatable with CFormView as its base class.
IMPLEMENT_DYNCREATE(CClsfyDlg, CFormView)
// Constructs the form view from the CClsfyDlg::IDD template and puts the
// classification state into its initial configuration.
CClsfyDlg::CClsfyDlg()
    : CFormView(CClsfyDlg::IDD)
{
    //{{AFX_DATA_INIT(CClsfyDlg)
    //}}AFX_DATA_INIT

    // Classifier settings: nothing has been configured by the user yet.
    m_isSet = FALSE;
    m_isMaxProb = TRUE;
    m_ClsModel = 0;

    // Start with an empty, heap-allocated list of files to classify.
    m_ClsFileArray = new CFileArray;
    m_ClsFileNum = 0;
}
// Releases every CFileItem still queued for classification and the array that
// holds them.
//
// The previous loop used the condition `i<0`, which is never true when
// starting from n-1 >= 0, so no item was ever deleted; the array object
// allocated in the constructor was not freed either.
CClsfyDlg::~CClsfyDlg()
{
    if (m_ClsFileArray != NULL) {
        // The array stores raw owning pointers, so each element has to be
        // deleted explicitly before the array itself is emptied.
        for (int i = m_ClsFileArray->GetSize() - 1; i >= 0; i--) {
            CFileItem* pa = m_ClsFileArray->GetAt(i);
            delete pa;
        }
        m_ClsFileArray->RemoveAll();

        delete m_ClsFileArray;
        m_ClsFileArray = NULL;
    }
}
// Dialog data exchange: after the base-class exchange, associates each dialog
// control ID with its member control object via DDX_Control.
// pDX - data-exchange context passed on to CFormView and to every DDX call.
// The region between the AFX_DATA_MAP markers is kept exactly as generated.
void CClsfyDlg::DoDataExchange(CDataExchange* pDX)
{
CFormView::DoDataExchange(pDX);
//{{AFX_DATA_MAP(CClsfyDlg)
DDX_Control(pDX, IDC_CLS_TRAINSET, m_TrainSet);
DDX_Control(pDX, IDC_CLS_START, m_Start);
DDX_Control(pDX, IDC_CLS_PARA, m_SetPara);
DDX_Control(pDX, IDC_CLS_FILELIST, m_ClsFileList);
DDX_Control(pDX, IDC_CLS_CHOOSEFILE, m_ChooseFile);
DDX_Control(pDX, IDC_CLS_ALG, m_ClsAlgrithm);
DDX_Control(pDX, IDC_CLS_RESULTGRID, m_ResultGrid);
//}}AFX_DATA_MAP
}
// Message map: button clicks on "choose file", "parameters" and "start", a
// double-click in the file list, and the training-set combo box gaining focus
// are routed to the corresponding On... handlers of this class.
BEGIN_MESSAGE_MAP(CClsfyDlg, CFormView)
//{{AFX_MSG_MAP(CClsfyDlg)
ON_BN_CLICKED(IDC_CLS_CHOOSEFILE, OnClsChoosefile)
ON_BN_CLICKED(IDC_CLS_PARA, OnClsPara)
ON_BN_CLICKED(IDC_CLS_START, OnClsStart)
ON_LBN_DBLCLK(IDC_CLS_FILELIST, OnDblclkClsFilelist)
ON_CBN_SETFOCUS(IDC_CLS_TRAINSET, OnSetfocusClsTrainset)
//}}AFX_MSG_MAP
END_MESSAGE_MAP()
/////////////////////////////////////////////////////////////////////////////
// CClsfyDlg diagnostics
#ifdef _DEBUG
// Debug-only validity check; performs no checks of its own and simply
// forwards to CFormView::AssertValid().
void CClsfyDlg::AssertValid() const
{
CFormView::AssertValid();
}
// Debug-only dump; writes nothing specific to this class and simply forwards
// to CFormView::Dump().
// dc - dump context handed on to the base class.
void CClsfyDlg::Dump(CDumpContext& dc) const
{
CFormView::Dump(dc);
}
#endif //_DEBUG
/////////////////////////////////////////////////////////////////////////////
// CClsfyDlg message handlers
void CClsfyDlg::OnClsChoosefile()
{
CFileDialog AddFileDlg(TRUE,"txt","*.txt",NULL,NULL,this);
if(AddFileDlg.DoModal() == IDOK){
CFileItem* pFileItem = new CFileItem;
//pFileItem->m_FileNumber = m_TrainDocNum;
//AfxMessageBox(m_TrainDocNum);
pFileItem->m_FileName = AddFileDlg.GetFileName();
pFileItem->m_FilePath = AddFileDlg.GetPathName();
m_ClsFileList.InsertString(m_ClsFileNum, pFileItem->m_FileName);
m_ClsFileNum = m_ClsFileNum + 1;
m_ClsFileArray->Add(pFileItem);
}
}
// "Parameters" button handler: opens the settings dialog of the algorithm
// currently selected in the algorithm combo box.
//
// Only entry 0 (Bayes) has a settings dialog; entries 1-3 report that the
// feature is not implemented. Nothing happens when no algorithm is selected.
//
// Changes from the previous version:
//  - m_isSet is raised only after the user confirmed the dialog with OK.
//    Before, it was raised on every click, even with no selection or after
//    Cancel, which suppressed the "default settings" notice in OnClsStart
//    although nothing had been configured.
//  - The dialog values are copied unconditionally. Before, m_isMaxProb was
//    only ever copied when it was 0 (so it could never change back), and
//    m_ProbLimit was skipped in exactly that case.
//    NOTE(review): the meaning of m_isMaxProb / m_ProbLimit is defined by
//    CBayesDlg, which is not visible here; this handler merely mirrors them.
void CClsfyDlg::OnClsPara()
{
    const int nIndex = m_ClsAlgrithm.GetCurSel();
    if (nIndex == CB_ERR) {
        return;
    }

    switch (nIndex) {
    case 0:
        {
            // Bayes: show the parameter dialog and take over its settings.
            CBayesDlg BayesDlg;
            if (BayesDlg.DoModal() == IDOK) {
                m_isMaxProb = BayesDlg.m_isMaxProb;
                m_ProbLimit = BayesDlg.m_ProbLimit;
                m_ClsModel = BayesDlg.m_ClsModel;
                m_isSet = TRUE;
            }
            break;
        }
    case 1:
        AfxMessageBox("功能还没有实现!");
        break;
    case 2:
        AfxMessageBox("功能还没有实现");
        break;
    case 3:
        AfxMessageBox("功能还没有实现");
        break;
    default:
        break;
    }
}
void CClsfyDlg::OnClsStart()
{
CString AlgrithmName;
CString CurTrainSet;
_variant_t RecordsAffected;
_variant_t vIndex = (long)0;
int nIndexAlgorithm,nIndexTrainSet,nIndexClsFile;
nIndexAlgorithm = m_ClsAlgrithm.GetCurSel();
nIndexTrainSet = m_TrainSet.GetCurSel();
nIndexClsFile = m_ClsFileList.GetCount();
if (nIndexAlgorithm == CB_ERR){
AfxMessageBox("请选择算法!");
}
else if (nIndexTrainSet == CB_ERR){
AfxMessageBox("请选择训练集!");
}
else if (nIndexClsFile == 0){
AfxMessageBox("请添加待分类的文档!");
}
else {
if (m_isSet == FALSE)
AfxMessageBox("若您没有进行参数设置,将采用系统默认设置进行分类!");
if (nIndexAlgorithm != CB_ERR){
switch (nIndexAlgorithm){
case 0:
{
if (m_ClsModel == 1){
m_TrainSet.GetLBText(nIndexTrainSet,CurTrainSet);
MultinomalBayesTrain(CurTrainSet);
MultinomalBayesClassification(m_ClsFileArray, CurTrainSet, m_ClsFileNum);
}
else{
}
break;
}
case 1:
AfxMessageBox("功能还没有实现!");
break;
case 2:
AfxMessageBox("功能还没有实现");
break;
case 3:
AfxMessageBox("功能还没有实现");
break;
default:
break;
}
}
}
}
// Runs the base-class initial update and then fills the training-set combo
// box by calling DisplayTrainSet().
void CClsfyDlg::OnInitialUpdate()
{
CFormView::OnInitialUpdate();
DisplayTrainSet();
}
void CClsfyDlg::DisplayTrainSet()
{
_variant_t RecordsAffected;
_variant_t vDatasetName;
CString strSQL;
CString strDatasetName;
_variant_t vIndex = (long)0;
CComboBox* pComboBox = (CComboBox*) GetDlgItem(IDC_CLS_TRAINSET);
ASSERT(pComboBox != NULL);
pComboBox->ResetContent();
strSQL = "select name from sysobjects where xtype='u' and name<>'dtproperties'";
CSim_tcApp::AdoDBObject.m_pRecordset = CSim_tcApp::AdoDBObject.m_pConnection->Execute((_bstr_t)strSQL,&RecordsAffected,adCmdText);
if (!CSim_tcApp::AdoDBObject.m_bIsConnected){
AfxMessageBox("数据库还没有连上,请关闭程序重试!");
return;
}
while(!CSim_tcApp::AdoDBObject.m_pRecordset->adoEOF){
vDatasetName = CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect("name");///取得用户自定义的表的名字
strDatasetName = vDatasetName.bstrVal;
pComboBox->AddString( strDatasetName );
CSim_tcApp::AdoDBObject.m_pRecordset->MoveNext();///移到下一条记录
}
CSim_tcApp::AdoDBObject.m_pRecordset->Close();
Invalidate(FALSE);
}
// Double-click handler of the file list: removes the selected file from the
// list box and from m_ClsFileArray, frees its CFileItem and decrements the
// file counter.
//
// The selection is validated BEFORE the array is accessed. The previous code
// called GetAt(nIndex) first, i.e. with LB_ERR (-1) whenever nothing was
// selected.
void CClsfyDlg::OnDblclkClsFilelist()
{
    const int nIndex = m_ClsFileList.GetCurSel();
    if (nIndex == LB_ERR || nIndex >= m_ClsFileArray->GetSize()) {
        return; // nothing selected, or no matching array entry
    }

    CFileItem* pa = m_ClsFileArray->GetAt(nIndex);
    m_ClsFileList.DeleteString(nIndex);
    m_ClsFileArray->RemoveAt(nIndex);
    delete pa;
    m_ClsFileNum = m_ClsFileNum - 1;
}
// Placeholder: the body is empty, so calling this currently has no effect.
void CClsfyDlg::DisplayClsAlgrithm()
{
}
// Focus handler of the training-set combo box: reloads its entries by calling
// DisplayTrainSet() each time the control receives focus.
void CClsfyDlg::OnSetfocusClsTrainset()
{
// TODO: Add your control notification handler code here
DisplayTrainSet();
}
// Classifies every document vector found in the queued files with the stored
// multinomial Bayes tables of a training set and writes the normalised
// per-class scores plus the winning class label into m_ResultGrid.
//
// pFileHead    array of CFileItem*; entries 0..ClsFileNum-1 are opened through
//              their m_FilePath. Every text line of a file is parsed as one
//              vector of integer counts separated by ' ' or ','.
// CurTrainSet  name of the training table. It is concatenated into the SQL
//              text; the per-class columns "class_<label>" are read from the
//              table "<CurTrainSet>_MultinomalResult".
// ClsFileNum   number of entries of pFileHead to process.
//
// Grid layout: row 0 holds the class labels in columns 1..ClassNum and
// "Result" in column ClassNum+1; each classified line gets the next row, with
// its running number in column 0.
//
// Database and file-open failures are not handled here.
// NOTE(review): the ADO wrappers are expected to throw _com_error and the
// CStdioFile constructor CFileException -- confirm the callers handle both.
//
// Fixes compared with the previous version:
//  - `total` and `MaxIndex` are reset for every document line; they used to
//    carry over, so from the second line on the normalised scores were
//    divided by an accumulated sum and a stale winner could be reported.
//  - Probabilities are accumulated in double instead of float (the values are
//    read as double and scaled by 1e30, which float cannot hold reliably).
//  - A zero score sum no longer causes a division by zero.
//  - Tokens beyond the vector dimension are ignored instead of being written
//    past the end of the vector; the last token is parsed with atoi like all
//    the others (it used atof).
//  - Reading class labels stops at ClassNum entries; ClsFileNum is clamped to
//    the array size; nothing is done when there are no classes.
void CClsfyDlg::MultinomalBayesClassification(CFileArray* pFileHead, CString CurTrainSet, int ClsFileNum)
{
    _variant_t RecordsAffected;
    _variant_t vAttrNum;
    _variant_t vClassNum;
    _variant_t vClassLabel;
    _variant_t vIndex = (long)0;
    StringArray ClassLabelArray;   // class labels, in query order
    CString SqlGetClassLabel = "";
    CString ClassLabel = "";
    int DocNum = 0;                // running number of the classified line (grid row)
    int i = 0;

    // Query the number of columns of the training table.
    CString SqlCheckTrainSetDim = "";
    int TrainFileDim = 0;
    SqlCheckTrainSetDim = "select count(name) from syscolumns where syscolumns.id in (select id from sysobjects where name = '" + CurTrainSet + "')";
    CSim_tcApp::AdoDBObject.m_pRecordset = CSim_tcApp::AdoDBObject.m_pConnection->Execute((_bstr_t)SqlCheckTrainSetDim,&RecordsAffected,adCmdText);
    vAttrNum = CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect(vIndex);
    TrainFileDim = vAttrNum.lVal;
    // Two columns are not counted as vector attributes
    // (presumably an id column and the class label -- TODO confirm).
    TrainFileDim = TrainFileDim - 2;
    CSim_tcApp::AdoDBObject.m_pRecordset->Close();

    // Magnification factor: keeps the products from becoming too small to compare.
    const double MagnifyFactor = 1.0E30;

    // Query the number of distinct classes.
    SqlGetClassLabel = "select count(distinct classlabel) from " + CurTrainSet;
    CSim_tcApp::AdoDBObject.m_pRecordset = CSim_tcApp::AdoDBObject.m_pConnection->Execute((_bstr_t)SqlGetClassLabel,&RecordsAffected,adCmdText);
    vClassNum = CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect(vIndex);
    const int ClassNum = vClassNum.lVal;
    CSim_tcApp::AdoDBObject.m_pRecordset->Close();
    if (ClassNum <= 0) {
        return; // no classes: nothing can be scored or labelled
    }
    ClassLabelArray.SetSize(ClassNum,5);
    DoubleArray ClassificationResult; // one score per class for the current line
    ClassificationResult.SetSize(ClassNum,5);

    // Query the class names.
    SqlGetClassLabel = "select distinct classlabel from " + CurTrainSet;
    CSim_tcApp::AdoDBObject.m_pRecordset = CSim_tcApp::AdoDBObject.m_pConnection->Execute((_bstr_t)SqlGetClassLabel,&RecordsAffected,adCmdText);
    i = 0;
    // Bounded by ClassNum so an extra row can never be written past the array.
    while ((!CSim_tcApp::AdoDBObject.m_pRecordset->adoEOF) && (i < ClassNum)) {
        vClassLabel = (_bstr_t)CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect(vIndex);
        ClassLabel = vClassLabel.bstrVal;
        ClassLabel.TrimLeft();
        ClassLabel.TrimRight();
        ClassLabelArray.SetAt(i,ClassLabel);
        i++;
        CSim_tcApp::AdoDBObject.m_pRecordset->MoveNext();
    }
    CSim_tcApp::AdoDBObject.m_pRecordset->Close();
    ClassLabelArray.FreeExtra();

    // Fill the header row of the result grid.
    CString temp; // scratch: class label / column name
    for (i = 1; i <= ClassNum; i++) {
        temp = ClassLabelArray.GetAt(i-1);
        m_ResultGrid.SetTextMatrix(0,i,temp);
    }
    m_ResultGrid.SetTextMatrix(0,ClassNum+1,"Result");
    UpdateData(FALSE);

    CString ATextLine = "";
    CFileItem* pFileItem;
    BOOL NotAtEnd;
    double total = 0.0;           // sum of the class scores of one line
    double FirstClassProb = 0.0;  // best normalised score seen so far
    double LatterClassProb = 0.0; // normalised score of the class being examined
    int MaxIndex = 0;             // index of the best class

    // Never walk past the end of the array, whatever count the caller passes.
    int FileCount = ClsFileNum;
    if (FileCount > pFileHead->GetSize()) {
        FileCount = pFileHead->GetSize();
    }

    // Read every file line by line; each line is the count vector of one document.
    for (int indexFile = 0; indexFile < FileCount; indexFile++) {
        pFileItem = pFileHead->GetAt(indexFile);
        CStdioFile ClsDocItem(pFileItem->m_FilePath,CFile::modeRead);
        NotAtEnd = ClsDocItem.ReadString(ATextLine); // fetch the first line

        // The first line determines the vector dimension for the whole file.
        int DocVectorDim;
        DocVectorDim = GetVectorDim(ATextLine);
        if (TrainFileDim != DocVectorDim) {
            // Only a warning: processing continues as before.
            AfxMessageBox("分类文档与训练集的维数不相等!");
        }

        int TextLineLength; // length of the line read from the file
        while (NotAtEnd) {
            // Parse the line into the count vector DocVectorArray.
            IntArray DocVectorArray;
            DocVectorArray.SetSize(DocVectorDim,50);
            ATextLine.TrimLeft();
            ATextLine.TrimRight();
            int j = 0; // position inside ATextLine, later the attribute index
            TextLineLength = ATextLine.GetLength();
            // NOTE(review): an empty line is not parsed but is still scored
            // below with the freshly sized vector -- confirm this is intended.
            if (TextLineLength != 0) {
                CString strVectorAttr = ""; // characters of the current token
                char GetChar;
                int iVectorAttr = 0;
                int IndexVector = 0;        // index of the current token
                for (j = 0; j < TextLineLength; j++) {
                    GetChar = ATextLine.GetAt(j);
                    if ((GetChar == ' ') || (GetChar =='\0') || (GetChar ==',')) {
                        // Separator: the collected token is complete.
                        iVectorAttr = atoi(strVectorAttr);
                        if (IndexVector < DocVectorDim) {
                            DocVectorArray.SetAt(IndexVector,iVectorAttr);
                        }
                        IndexVector++;
                        strVectorAttr = "";
                    }
                    else {
                        strVectorAttr = strVectorAttr + GetChar;
                    }
                }
                // The last token has no trailing separator.
                iVectorAttr = atoi(strVectorAttr);
                if (IndexVector < DocVectorDim) {
                    DocVectorArray.SetAt(IndexVector,iVectorAttr);
                }
                DocVectorArray.FreeExtra();
            }

            // Start classifying this line.
            DocNum++;
            char buffer[20];
            _itoa( DocNum, buffer, 10 );
            m_ResultGrid.SetTextMatrix(DocNum,0,buffer);

            CString SqlGetAColumn = "";
            _variant_t vColumnProb;
            double fColumnProb;          // first row of the class column, magnified
            double mutipleResult = 1.0;  // running product for the class
            int Exponent = 0;
            double Base;
            for (i = 0; i < ClassNum; i++) {
                temp = "class_" + ClassLabelArray.GetAt(i);
                SqlGetAColumn = "Select " + temp + " from ";
                SqlGetAColumn = SqlGetAColumn + CurTrainSet +"_MultinomalResult";
                CSim_tcApp::AdoDBObject.m_pRecordset = CSim_tcApp::AdoDBObject.m_pConnClsDB->Execute((_bstr_t)SqlGetAColumn,&RecordsAffected,adCmdText);

                // The first row is applied once as a factor of the class score
                // (presumably the class prior -- TODO confirm).
                vColumnProb = CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect(_bstr_t(temp));
                fColumnProb = vColumnProb.dblVal;
                fColumnProb = fColumnProb * MagnifyFactor;
                j = 0;
                CSim_tcApp::AdoDBObject.m_pRecordset->MoveNext();

                // Each following row is raised to the power of the matching count.
                mutipleResult = 1.0;
                while ((!CSim_tcApp::AdoDBObject.m_pRecordset->adoEOF) && (j<DocVectorDim)) {
                    vColumnProb = CSim_tcApp::AdoDBObject.m_pRecordset->GetCollect(_bstr_t(temp));
                    Base = vColumnProb.dblVal;
                    Exponent = DocVectorArray.GetAt(j);
                    mutipleResult = mutipleResult * pow(Base,(double)Exponent);
                    CSim_tcApp::AdoDBObject.m_pRecordset->MoveNext();
                    j++;
                }
                mutipleResult = mutipleResult * fColumnProb;
                ClassificationResult.SetAt(i,mutipleResult);
                CSim_tcApp::AdoDBObject.m_pRecordset->Close();
            }

            // Normalise the scores of THIS line and pick the largest one.
            total = 0.0;
            for (i = 0; i < ClassNum; i++) {
                total = total + ClassificationResult.GetAt(i);
            }
            FirstClassProb = 1.0E-10; // scores at or below this never win
            MaxIndex = 0;             // fallback when no score exceeds the threshold
            for (i = 0; i < ClassNum; i++) {
                LatterClassProb = ClassificationResult.GetAt(i);
                LatterClassProb = (total > 0.0) ? (LatterClassProb/total) : 0.0;
                if (LatterClassProb > FirstClassProb) {
                    FirstClassProb = LatterClassProb;
                    MaxIndex = i;
                }
                ClassificationResult.SetAt(i, LatterClassProb);
                _gcvt(LatterClassProb, 7, buffer);
                m_ResultGrid.SetTextMatrix(DocNum,i+1,buffer);
            }
            temp = ClassLabelArray.GetAt(MaxIndex);
            m_ResultGrid.SetTextMatrix(DocNum,ClassNum+1,temp);

            // This line is done; read the next one.
            NotAtEnd = ClsDocItem.ReadString(ATextLine);
        }
        ClsDocItem.Close();
    }
    // Notes kept from the original author (not implemented in this function):
    //  - compute the results from the existing training data and store them
    //    in a temporary file temp.txt in the current directory;
    //  - also write each file's per-class probabilities into an array
    //    TextPostProbArray to help determine the class;
    //  - record the class of every file in an array FileClassArray.
}
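// --- Code-viewer page residue below (not part of the program) ---
// Keyboard shortcuts: copy code: Ctrl + C; search code: Ctrl + F;
// full-screen mode: F11; toggle theme: Ctrl + Shift + D; show shortcuts: ?;
// increase font size: Ctrl + =; decrease font size: Ctrl + -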