⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cataloglist.cpp

📁 良好的代码实现
💻 CPP
📖 第 1 页 / 共 2 页
字号:
// CatalogList.cpp: implementation of the CCatalogList class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "CatalogList.h"
#include "wordlist.h"
#include "direct.h"
#include "wordsegment.h"
#include "classifierparam.h"
#include "stemmer.h"
#include "classifier.h"
#include <math.h>
#include <memory.h> 
#include ".\\Utility\\Utility.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

// Static save-mode flag: assigned by DumpToFile and stored/loaded by CCatalogList::Serialize.
int CCatalogList::m_nSaveMode;
// Scratch buffer in which ScanChinese/ScanEnglish build the full path of a document file.
char pTempStr[MAX_PATH];
// Small scratch buffer; not referenced in the visible part of this file.
char pWordTag[10];
sWeightNode *CDocNode::m_pTemp=NULL;                 // temporary memory block used while generating document vectors (newly added)
// Number of sWeightNode elements currently allocated in m_pTemp (maintained by AllocTempBuffer/DeallocTempBuffer).
int CDocNode::m_nAllocTempLen=0;
char CDocNode::m_pSentence[MAX_PATH*10];        // holds a sentence after spaces, line breaks and similar characters have been removed
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
//文档结点类
// Copy constructor.
// Every scalar member is given a defined value before delegating to
// operator=, so that the assignment operator sees an empty, consistent
// object (no buffers) and no member is left indeterminate afterwards.
// The category id is taken from the source here because a copy is expected
// to describe the same document as the original.
CDocNode::CDocNode(const CDocNode& x)
	:m_nAllocLen(0),m_sWeightSet(NULL),m_pResults(NULL),m_nClassNum(0),
	 m_idxDoc(x.m_idxDoc),m_nCataID(x.m_nCataID)
{
	*this=x;
}
// Copy assignment: deep-copies the document name, ids, weight vector and
// classification results from x.
//
// When x owns no weight vector (or no results), the buffers currently held
// by this object are released through DeallocBuffer()/DeallocResultsBuffer()
// instead of having their pointers overwritten, which previously leaked them.
// Returns *this.
const CDocNode& CDocNode::operator=(const CDocNode& x)
{
	if(this==&x) return *this;
	m_strDocName=x.m_strDocName;
	m_idxDoc=x.m_idxDoc;
	m_nCataID=x.m_nCataID;

	// Weight vector: AllocBuffer ignores lengths <= 0, so only copy when the
	// source really has elements; otherwise make this node empty as well.
	if(x.m_sWeightSet!=NULL && x.m_nAllocLen>0)
	{
		AllocBuffer(x.m_nAllocLen);
		memcpy(m_sWeightSet,x.m_sWeightSet,m_nAllocLen*sizeof(sWeightNode));
	}
	else
	{
		DeallocBuffer();
	}

	// Classification results: same policy as for the weight vector.
	if(x.m_pResults!=NULL && x.m_nClassNum>0)
	{
		AllocResultsBuffer(x.m_nClassNum);
		memcpy(m_pResults,x.m_pResults,m_nClassNum*sizeof(double));
	}
	else
	{
		DeallocResultsBuffer();
	}

	return *this;
}

// Default constructor: an empty node that owns no buffers and has no
// document index or category assigned yet (both are -1).
CDocNode::CDocNode()
	:m_sWeightSet(NULL),
	 m_pResults(NULL),
	 m_nAllocLen(0),
	 m_nClassNum(0),
	 m_idxDoc(-1),
	 m_nCataID(-1)
{
}

// Destructor: frees the classification-result buffer and the weight vector.
CDocNode::~CDocNode()
{
	DeallocResultsBuffer();
	DeallocBuffer();
}

// Resets this catalog node.
//   nMode <= 0 : delete all document information (the document list is emptied)
//   nMode  > 0 : only release the memory held by the document vectors;
//                the documents themselves stay in the list
// The total word count is reset to zero in both modes.
//
// The list used to be emptied for nMode > 0, which is the opposite of the
// contract above and of what CCatalogList::InitCatalogList relies on when it
// passes 1 to keep the documents.
void CCatalogNode::InitCatalogNode(int nMode)
{
	m_lTotalWordNum = 0;

	// Release per-document buffers explicitly so both modes free the vectors.
	POSITION pos_doc=m_lstDocList.GetHeadPosition();
	while(pos_doc!=NULL)
	{
		CDocNode& docnode=m_lstDocList.GetNext(pos_doc);
		docnode.DeallocBuffer();
		docnode.DeallocResultsBuffer();
	}

	if(nMode<=0) m_lstDocList.RemoveAll();
}

// Copy constructor.
// operator= does not touch m_lCurDocID, so it is initialised here from the
// source; otherwise ScanDirectory would later increment an indeterminate
// value. The remaining scalars get defined defaults before the assignment
// overwrites them.
CCatalogNode::CCatalogNode(const CCatalogNode& x)
	:m_idxCata(-1),m_lCurDocID(x.m_lCurDocID),m_lTotalWordNum(0)
{
	*this=x;
}

// Catalog node assignment: copies the index, names and word count of x and
// replaces this node's document list with copies of x's documents.
const CCatalogNode& CCatalogNode::operator = (const CCatalogNode& x)
{
	if(&x!=this)
	{
		m_idxCata=x.m_idxCata;
		m_strDirName=x.m_strDirName;
		m_strCatalogName=x.m_strCatalogName;
		m_lTotalWordNum=x.m_lTotalWordNum;

		// Start from an empty list, then append every source document in order.
		m_lstDocList.RemoveAll();
		for(POSITION posSrc=x.m_lstDocList.GetHeadPosition();posSrc!=NULL;)
		{
			CDocNode& srcDoc=x.m_lstDocList.GetNext(posSrc);
			m_lstDocList.AddTail(srcDoc);
		}
	}
	return *this;
}

// Merges x into this node: the word counts are added, the index and names
// are taken from x, and copies of x's documents are appended to this list.
// Appending a node to itself is a no-op.
const CCatalogNode& CCatalogNode::operator += (const CCatalogNode& x)
{
	if(&x!=this)
	{
		m_idxCata=x.m_idxCata;
		m_strDirName=x.m_strDirName;
		m_strCatalogName=x.m_strCatalogName;
		m_lTotalWordNum+=x.m_lTotalWordNum;

		for(POSITION posSrc=x.m_lstDocList.GetHeadPosition();posSrc!=NULL;)
		{
			CDocNode& srcDoc=x.m_lstDocList.GetNext(posSrc);
			m_lstDocList.AddTail(srcDoc);
		}
	}
	return *this;
}

// Default constructor: no catalog index yet (-1), document ids start at 0,
// then the node is put through the regular reset routine.
CCatalogNode::CCatalogNode()
	:m_idxCata(-1),
	 m_lCurDocID(0),
	 m_lTotalWordNum(0)
{
	InitCatalogNode();
}

// Destructor: runs the reset routine with its default mode so that the
// per-document buffers are released.
CCatalogNode::~CCatalogNode()
{
	this->InitCatalogNode();
}


void CCatalogNode::SetStartDocID(long lDocID)
{
	m_lCurDocID=lDocID;
}

// Changes the current directory to strPath and adds one CDocNode for every
// regular file found there (sub-directories, including "." and "..", are
// skipped). Each document receives the next id from m_lCurDocID.
//
// Returns the next unused document id, or -1 (after showing a message box)
// when the directory cannot be entered.
//
// Fixes over the previous version:
//  - the entry returned by FindFirstFile itself is now processed instead of
//    being silently skipped;
//  - the search handle is closed with FindClose (it used to leak);
//  - a failed FindFirstFile is detected explicitly;
//  - WIN32_FIND_DATA lives on the stack, so nothing needs to be deleted.
long CCatalogNode::ScanDirectory(CString strPath)
{
	if(_chdir(strPath))  // the directory cannot be entered
	{
		CString	csTmp = "目录";
		csTmp+=strPath;
		csTmp+="不存在!";
		AfxMessageBox(csTmp);
		return -1;
	}

	WIN32_FIND_DATA findData;
	HANDLE hFinder = ::FindFirstFile("*.*",&findData);
	if(hFinder==INVALID_HANDLE_VALUE)
		return m_lCurDocID;   // nothing matched: no documents added

	do
	{
		// "." and ".." carry the directory attribute, so this test covers them too.
		if(findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
			continue;

		// A default-constructed node already has empty buffers.
		CDocNode docnode;
		docnode.m_strDocName=findData.cFileName;
		docnode.m_idxDoc=m_lCurDocID++;
		AddDoc(docnode);
	}
	while(::FindNextFile(hFinder,&findData));

	::FindClose(hFinder);
	return m_lCurDocID;
}

// Constructor: resets the class-wide save mode to 0.
CCatalogList::CCatalogList()
{
	CCatalogList::m_nSaveMode = 0;
}

// Destructor: no explicit work; members clean up after themselves.
CCatalogList::~CCatalogList() {}


// Writes one text line "<catalog index> <catalog name>" per catalog to
// strFileName (created or truncated). Shows a message box and returns if the
// file cannot be opened.
void CCatalogList::DumpCataList(CString strFileName)
{
	FILE *stream;
	if( (stream  = fopen( strFileName, "w+" )) == NULL )
	{
		AfxMessageBox("无法创建文件"+strFileName+"!");
		return;
	}

	POSITION pos=GetFirstPosition();
	while(pos!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos);
		// A CString object must not be passed through "..." as-is: cast it to
		// LPCTSTR so that %s receives a real character pointer. The index is
		// cast so that %d always matches the argument type.
		fprintf(stream,"%d %s\n",(int)catanode.m_idxCata,(LPCTSTR)catanode.m_strCatalogName);
	}
	fclose(stream);
}

// Writes every document that has a weight vector as one text line:
//   <catalog index + 1> <word index + 1>:<weight> ...
// Only weights whose absolute value exceeds dZero are written.
// Shows a message box and returns if the file cannot be opened.
void CCatalogList::DumpDocList(CString strFileName)
{
	FILE *fpOut=fopen( strFileName, "w+" );
	if(fpOut==NULL)
	{
		AfxMessageBox("无法创建文件"+strFileName+"!");
		return;
	}

	for(POSITION posCata=GetFirstPosition();posCata!=NULL;)
	{
		CCatalogNode& cata=GetNext(posCata);
		for(POSITION posDoc=cata.GetFirstPosition();posDoc!=NULL;)
		{
			CDocNode& doc=cata.GetNext(posDoc);
			if(doc.m_nAllocLen<=0)
				continue;   // no vector stored for this document

			fprintf(fpOut,"%d",cata.m_idxCata+1);
			for(int k=0;k<doc.m_nAllocLen;k++)
			{
				if(fabs(doc.m_sWeightSet[k].s_dWeight) > dZero)
				{
					fprintf(fpOut," %d:%f",doc.m_sWeightSet[k].s_idxWord+1,doc.m_sWeightSet[k].s_dWeight);
				}
			}
			fprintf(fpOut,"\n");
		}
	}
	fclose(fpOut);
}

// Serializes the whole catalog list into the binary file strFileName.
//   nSaveMode <= 0 : document vectors are saved
//   nSaveMode  > 0 : document vectors are not saved
// Shows a message box and returns if the file cannot be created.
void CCatalogList::DumpToFile(CString strFileName, int nSaveMode)
{
	CFile fileOut;
	const BOOL bOpened=fileOut.Open(strFileName,CFile::modeWrite | CFile::modeCreate);
	if(!bOpened)
	{
		AfxMessageBox("无法创建文件"+strFileName+"!");
		return;
	}

	// The mode is published through the static member before Serialize runs.
	CCatalogList::m_nSaveMode=nSaveMode;

	CArchive ar(&fileOut,CArchive::store);
	Serialize(ar);
	ar.Close();

	fileOut.Close();
}

// Loads the catalog list from the binary file strFileName.
// Returns TRUE on success; shows a message box and returns FALSE if the file
// cannot be opened.
BOOL CCatalogList::GetFromFile(CString strFileName)
{
	CFile fileIn;
	const BOOL bOpened=fileIn.Open(strFileName,CFile::modeRead);
	if(!bOpened)
	{
		AfxMessageBox("无法打开文件"+strFileName+"!");
		return FALSE;
	}

	{
		CArchive ar(&fileIn,CArchive::load);
		Serialize(ar);
		ar.Close();
	}

	fileIn.Close();
	return TRUE;
}
// Processes the documents of a document library: resets the catalog list
// (default mode) and rescans strDirName. Returns whatever ScanDirectory returns.
long CCatalogList::BuildLib(CString strDirName)
{
	InitCatalogList();
	const long lScanResult=ScanDirectory(strDirName);
	return lScanResult;
}

// Stores or loads the list: the save mode goes first so that the nodes
// serialized afterwards can consult it, then the catalog list itself.
void CCatalogList::Serialize(CArchive &ar)
{
	if(!ar.IsStoring())
	{
		ar>>m_nSaveMode;
	}
	else
	{
		ar<<m_nSaveMode;
	}
	m_lstCatalogList.Serialize(ar);
}

// MFC collection helper: serializes nCount consecutive CCatalogNode elements
// starting at pElements.
// Previously only the first element was serialized regardless of nCount, and
// the pointer was dereferenced even for nCount == 0.
void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount)
{
	ASSERT(nCount==0||
		AfxIsValidAddress(pElements,nCount*sizeof(CCatalogNode)));
	for(int i=0;i<nCount;i++)
		pElements[i].Serialize(ar);
}

// MFC collection helper: serializes nCount consecutive CDocNode elements
// starting at pElements.
// Previously only the first element was serialized regardless of nCount, and
// the pointer was dereferenced even for nCount == 0.
void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount)
{
	ASSERT(nCount==0||
		AfxIsValidAddress(pElements,nCount*sizeof(CDocNode)));
	for(int i=0;i<nCount;i++)
		pElements[i].Serialize(ar);
}

// Stores or loads one document node.
// Layout: document index, document name and, only when the save mode
// reported by CCatalogList::GetSaveMode() is <= 0, the vector length
// followed by the raw sWeightNode array.
//
// On load, any buffers this node already owns are released first. This
// avoids leaking them and keeps a stored length <= 0 from leaving a stale
// vector behind (AllocBuffer ignores such lengths, and the old code then
// read bytes that were never written). Classification results are never
// stored, so they are always cleared on load.
void CDocNode::Serialize(CArchive &ar)
{
	if(ar.IsStoring())
	{
		ar<<m_idxDoc;
		ar<<m_strDocName;
		if(CCatalogList::GetSaveMode()<=0)
		{
			ar<<m_nAllocLen;
			if(m_nAllocLen>0)
				ar.Write((void*)m_sWeightSet,m_nAllocLen*sizeof(sWeightNode));
		}
		return;
	}

	ar>>m_idxDoc;
	ar>>m_strDocName;

	DeallocBuffer();            // start from an empty vector
	if(CCatalogList::GetSaveMode()<=0)
	{
		int nLen=0;
		ar>>nLen;
		if(nLen>0)
		{
			AllocBuffer(nLen);
			ar.Read((void*)m_sWeightSet,m_nAllocLen*sizeof(sWeightNode));
		}
	}
	DeallocResultsBuffer();     // results are not part of the archive
}

// Stores or loads one catalog node: index, directory name, total word count
// and catalog name (in that order), followed by its document list.
void CCatalogNode::Serialize(CArchive &ar)
{
	if(!ar.IsStoring())
	{
		// loading
		ar>>m_idxCata;
		ar>>m_strDirName;
		ar>>m_lTotalWordNum;
		ar>>m_strCatalogName;
	}
	else
	{
		// storing
		ar<<m_idxCata;
		ar<<m_strDirName;
		ar<<m_lTotalWordNum;
		ar<<m_strCatalogName;
	}

	m_lstDocList.Serialize(ar);
}

// Allocates the shared temporary weight buffer (m_pTemp) with nLen elements.
// Does nothing when nLen <= 0 or when the buffer already has that length;
// otherwise the old buffer is discarded and a new, uninitialised one is
// allocated.
void CDocNode::AllocTempBuffer(int nLen)
{
	const bool bNeedsRealloc=(nLen>0)&&(nLen!=m_nAllocTempLen);
	if(!bNeedsRealloc) return;

	delete []m_pTemp;        // deleting a NULL pointer is a no-op
	m_pTemp=NULL;

	m_pTemp=new sWeightNode[nLen];
	m_nAllocTempLen=nLen;
}


// Releases the shared temporary weight buffer and resets its recorded length.
void CDocNode::DeallocTempBuffer()
{
	delete []m_pTemp;        // deleting a NULL pointer is a no-op
	m_pTemp=NULL;
	m_nAllocTempLen=0;
}


// Allocates this node's weight vector (m_sWeightSet) with nLen elements.
// Does nothing when nLen <= 0 or when the vector already has that length;
// otherwise the old vector is discarded and a new, uninitialised one is
// allocated.
void CDocNode::AllocBuffer(int nLen)
{
	const bool bNeedsRealloc=(nLen>0)&&(nLen!=m_nAllocLen);
	if(!bNeedsRealloc) return;

	delete []m_sWeightSet;   // deleting a NULL pointer is a no-op
	m_sWeightSet=NULL;

	m_nAllocLen=nLen;
	m_sWeightSet=new sWeightNode[m_nAllocLen];
}


// Releases this node's weight vector and resets its length to 0.
void CDocNode::DeallocBuffer()
{
	delete []m_sWeightSet;   // deleting a NULL pointer is a no-op
	m_sWeightSet=NULL;
	m_nAllocLen=0;
}

// Allocates the classification-result array (m_pResults) with nLen doubles.
// Does nothing when nLen <= 0 or when the array already has that length;
// otherwise the old array is discarded and a new, uninitialised one is
// allocated.
void CDocNode::AllocResultsBuffer(short nLen)
{
	const bool bNeedsRealloc=(nLen>0)&&(m_nClassNum!=nLen);
	if(!bNeedsRealloc) return;

	delete []m_pResults;     // deleting a NULL pointer is a no-op
	m_pResults=NULL;

	m_nClassNum=nLen;
	m_pResults=new double[m_nClassNum];
}


// Releases the classification-result array and resets the class count to 0.
void CDocNode::DeallocResultsBuffer()
{
	delete []m_pResults;     // deleting a NULL pointer is a no-op
	m_pResults=NULL;
	m_nClassNum=0;
}

int CDocNode::ScanChinese(char * pPath,CWordList& wordList,int nCataNum, short idxCata)
{
	CFile fin;
	char *buffer;
	strcpy(pTempStr,pPath);
	strcat(pTempStr,"\\");
	strcat(pTempStr,m_strDocName.GetBuffer(0));
	if(!fin.Open(pTempStr,CFile::modeRead))
		return -1;

	unsigned int flen=fin.GetLength();
	buffer=new char[flen+1];
	flen=fin.ReadHuge(buffer,flen);
	buffer[flen]='\0';
	fin.Close();
	int num=ScanChineseString(buffer,wordList,nCataNum,m_idxDoc,idxCata);
	delete[] buffer;
	return num;
}

// Scans NUL-terminated text and adds its words to wordList.
//   pPath     despite the name, this is the text buffer to scan
//   wordList  word list that receives the words
//   nCataNum, idxDoc, idxCata  forwarded unchanged to wordList.Add
// Returns the number of words added (stop words are not counted).
//
// The text is consumed piece by piece through ParseFile; the code below reads
// each piece from m_pSentence (presumably filled by ParseFile, which is
// defined elsewhere - confirm).
//
// Fix: the stop-word flag used to be reset only once per sentence, so after
// the first stop word every remaining word of that sentence was dropped too.
// Each word is now tested on its own.
int CDocNode::ScanChineseString(char * pPath,CWordList& wordList,int nCataNum, long idxDoc, short idxCata)
{
	char *buffer=pPath;
	// nStart is the position in buffer where the next sentence begins
	int nStart=0,nNewStart=0;
	// realcnt is the number of words left after removing stop words
	int realcnt=0;

	while(buffer[nStart]!='\0')
	{
		int nSentenceLen=ParseFile(buffer,nStart,nNewStart);
		nStart=nNewStart;
		if(nSentenceLen==0) continue;

		if(m_pSentence[0]>0) // a positive first byte: an English word
		{
			// keep it only if it has at least 2 characters and does not start with a digit
			if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
			{
				wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum);
				realcnt++;
			}
			continue;
		}

		// otherwise a string of Chinese characters, two bytes each
		if(nSentenceLen%2!=0) continue;
		if(nSentenceLen==2) // a single Chinese character
		{
			wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum);
			realcnt++;
			continue;
		}

		g_wordSeg.Segment(m_pSentence);
		for(int i=0;i<g_wordSeg.GetSegmentCount();i++)
		{
			PWORD_RESULT pItem=g_wordSeg.GetWordSeg(i);

			// the result array ends at the first entry with an empty word
			int sum=0;
			while(pItem[sum].sWord[0]!=0) sum++;

			// the first and last entries are skipped, as in the original code
			for(int j=1;j<sum-1;j++)
			{
				char *w=pItem[j].sWord;
				if(w[0]=='\0'||g_wordSeg.isInStopWords(w))
					continue;   // skip only this word
				wordList.Add(w,idxCata,idxDoc,nCataNum);
				realcnt++;
			}
		}
	}
	return realcnt;
}

int CDocNode::ScanEnglish(char *pPath, CWordList &wordList, int nCataNum, short idxCata, bool bStem)
{
	CFile fin;
	char *buffer;
	strcpy(pTempStr,pPath);
	strcat(pTempStr,"\\");
	strcat(pTempStr,m_strDocName.GetBuffer(0));
	if(!fin.Open(pTempStr,CFile::modeRead))
		return -1;

	int flen=fin.GetLength();
	buffer=new char[flen+1];
	flen=fin.ReadHuge(buffer,flen);
	buffer[flen]='\0';
	fin.Close();

	int num=ScanEnglishString(buffer,wordList,nCataNum,m_idxDoc,idxCata,bStem);
	delete[] buffer;
	return num;
}

// Splits NUL-terminated English text into words and adds them to wordList.
//   pPath     despite the name, this is the text buffer; it is modified in
//             place (lower-cased, delimiters overwritten with NUL)
//   wordList  word list that receives the words
//   nCataNum, idxDoc, idxCata  forwarded unchanged to wordList.Add
//   bStem     when true, each word is passed to theStemmer.stem first
// Returns the number of words added. Only words longer than 2 characters
// are kept.
//
// Delimiters are space, TAB, CR, LF and the ASCII punctuation ranges
// 33-47, 58-64, 91-96 and 123-127; digits are not delimiters.
//
// Fixes: the last word is no longer lost when the text does not end with a
// delimiter (the terminating NUL now closes the current word), and TAB is
// treated as a separator instead of gluing two words together.
//
// NOTE(review): the return value of theStemmer.stem is ignored here, as
// before - confirm that stem() terminates the stemmed word in place.
int CDocNode::ScanEnglishString(char *pPath, CWordList &wordList, int nCataNum, long idxDoc, short idxCata, bool bStem)
{
	char *buffer=pPath;
	_strlwr(buffer);

	int nFilePos=0;
	int realcnt=0;
	char *p=buffer;   // start of the word currently being collected
	for(;;)
	{
		const char c=buffer[nFilePos];
		const bool bEnd=(c=='\0');
		const bool bDelim=(c==' '||c=='\t'||c=='\r'||c=='\n'||
			(c>32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127));
		if(bEnd||bDelim)
		{
			buffer[nFilePos]='\0';   // terminate the current word in place
			const int wordLen=(int)(buffer+nFilePos-p);
			if(wordLen>2)
			{
				if(bStem) theStemmer.stem(p,0,wordLen-1);
				wordList.Add(p,idxCata,idxDoc,nCataNum);
				realcnt++;
			}
			p=buffer+nFilePos+1;
		}
		if(bEnd) break;
		nFilePos++;
	}
	return realcnt;
}


// Returns TRUE when every weight in the vector is exactly 0, FALSE as soon
// as a non-zero weight is found. Debug builds assert that a vector exists.
BOOL CDocNode::IsZero()
{
	ASSERT(m_sWeightSet!=NULL);
	int idx=0;
	while(idx<m_nAllocLen)
	{
		if(m_sWeightSet[idx].s_dWeight!=0)
		{
			return FALSE;
		}
		++idx;
	}
	return TRUE;
}

// Assignment: replaces this list's catalogs with copies of x's catalogs.
const CCatalogList& CCatalogList::operator = (const CCatalogList& x)
{
	if(&x!=this)
	{
		m_lstCatalogList.RemoveAll();
		for(POSITION posSrc=x.m_lstCatalogList.GetHeadPosition();posSrc!=NULL;)
		{
			CCatalogNode& srcCata=x.m_lstCatalogList.GetNext(posSrc);
			m_lstCatalogList.AddTail(srcCata);
		}
	}
	return *this;
}

// Appends copies of all catalogs of x to this list. Appending a list to
// itself is a no-op.
const CCatalogList& CCatalogList::operator += (const CCatalogList& x)
{
	if(&x!=this)
	{
		for(POSITION posSrc=x.m_lstCatalogList.GetHeadPosition();posSrc!=NULL;)
		{
			CCatalogNode& srcCata=x.m_lstCatalogList.GetNext(posSrc);
			m_lstCatalogList.AddTail(srcCata);
		}
	}
	return *this;
}

//nMode<=0 删除所有信息
//nMode=1  删除所有文档,但保留类别(节点)
//nMode=2  只删除文档向量所占用的内存
void CCatalogList::InitCatalogList(int nMode)
{
	POSITION pos=GetFirstPosition();
	while(pos!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos);
		if(nMode==2) catanode.InitCatalogNode(1);
		else catanode.InitCatalogNode(0);
	}
	if(nMode<=1) m_lstCatalogList.RemoveAll();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -