📄 cataloglist.cpp
字号:
// CatalogList.cpp: implementation of the CCatalogList class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "CatalogList.h"
#include "wordlist.h"
#include "direct.h"
#include "wordsegment.h"
#include "classifierparam.h"
#include "stemmer.h"
#include "classifier.h"
#include <math.h>
#include <memory.h>
#include ".\\Utility\\Utility.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
// Save mode shared by every CCatalogList: set by the constructor and by DumpToFile(),
// and stored/loaded by CCatalogList::Serialize().
int CCatalogList::m_nSaveMode;
// File-scope scratch buffer in which ScanChinese()/ScanEnglish() build "<dir>\<file>" paths.
char pTempStr[MAX_PATH];
// NOTE(review): not referenced in the visible part of this file - confirm it is still needed.
char pWordTag[10];
sWeightNode *CDocNode::m_pTemp=NULL; // temporary block of memory needed while building a document vector ---- newly added
int CDocNode::m_nAllocTempLen=0;
char CDocNode::m_pSentence[MAX_PATH*10]; // holds a sentence after spaces, line breaks and similar characters have been removed
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Document node class.
//
// Copy constructor. The buffer pointers and lengths start out empty so the
// assignment operator never inspects indeterminate values; the deep copy
// itself is delegated to operator=.
CDocNode::CDocNode(const CDocNode& x):m_nAllocLen(0),m_sWeightSet(NULL),m_pResults(NULL),m_nClassNum(0)
{
    // Set the category id explicitly so that a copied node never carries an
    // indeterminate m_nCataID.
    m_nCataID=x.m_nCataID;
    *this=x;
}
// Copy assignment (deep copy).
//
// Copies the document name, document index and category id, then duplicates
// the weight vector and the classification-result array of x. When x has no
// weight vector / result array, the corresponding buffer of this node is
// released (not merely forgotten), so repeated assignment does not leak.
// Returns *this.
const CDocNode& CDocNode::operator=(const CDocNode& x)
{
    if(this==&x) return *this;

    m_strDocName=x.m_strDocName;
    m_idxDoc=x.m_idxDoc;
    m_nCataID=x.m_nCataID;

    if((x.m_sWeightSet!=NULL)&&(x.m_nAllocLen>0))
    {
        AllocBuffer(x.m_nAllocLen);
        // Copy exactly as many entries as the source owns.
        memcpy(m_sWeightSet,x.m_sWeightSet,x.m_nAllocLen*sizeof(sWeightNode));
    }
    else
    {
        // Frees any buffer this node already owns and resets the length to 0.
        DeallocBuffer();
    }

    if((x.m_pResults!=NULL)&&(x.m_nClassNum>0))
    {
        AllocResultsBuffer(x.m_nClassNum);
        memcpy(m_pResults,x.m_pResults,x.m_nClassNum*sizeof(double));
    }
    else
    {
        DeallocResultsBuffer();
    }
    return *this;
}
// Default constructor: an empty node that owns no buffers and whose document
// index and category id are both -1.
CDocNode::CDocNode()
    :m_nAllocLen(0),m_sWeightSet(NULL),m_pResults(NULL),m_nClassNum(0)
{
    m_nCataID=-1;
    m_idxDoc=-1;
}
// Destructor: releases the two heap buffers owned by the node.
CDocNode::~CDocNode()
{
    DeallocResultsBuffer();   // classification results (m_pResults)
    DeallocBuffer();          // weight vector (m_sWeightSet)
}
// Resets this category node.
//   nMode<=0 : delete all document information (the document list is emptied)
//   nMode>0  : only release the memory occupied by the document vectors;
//              the documents themselves stay in the list
// In both modes the total word count is reset to 0 and every document's
// weight vector and result buffer are freed.
void CCatalogNode::InitCatalogNode(int nMode)
{
    m_lTotalWordNum = 0;
    for(POSITION pos_doc=m_lstDocList.GetHeadPosition(); pos_doc!=NULL; )
    {
        CDocNode& docnode=m_lstDocList.GetNext(pos_doc);
        docnode.DeallocBuffer();
        docnode.DeallocResultsBuffer();
    }
    // The test used to be (nMode>0), which emptied the list in exactly the
    // mode documented as "vectors only" and kept it in the "delete all" mode.
    if(nMode<=0) m_lstDocList.RemoveAll();
}
// Copy constructor: delegates the member-wise / list copy to operator=.
CCatalogNode::CCatalogNode(const CCatalogNode& x)
{
    // Give the document-id counter a defined value up front so the copy never
    // holds an indeterminate m_lCurDocID.
    m_lCurDocID=x.m_lCurDocID;
    *this=x;
}
// Category node assignment.
// Copies the word total, the category and directory names and the category
// index, then replaces this node's document list with copies of x's documents.
// Self-assignment is a no-op. Returns *this.
const CCatalogNode& CCatalogNode::operator = (const CCatalogNode& x)
{
    if(this!=&x)
    {
        m_idxCata=x.m_idxCata;
        m_strDirName=x.m_strDirName;
        m_strCatalogName=x.m_strCatalogName;
        m_lTotalWordNum = x.m_lTotalWordNum ;

        m_lstDocList.RemoveAll();
        for(POSITION posSrc = x.m_lstDocList.GetHeadPosition(); posSrc!=NULL; )
        {
            CDocNode& srcDoc=x.m_lstDocList.GetNext(posSrc);
            m_lstDocList.AddTail(srcDoc);
        }
    }
    return *this;
}
// Merges x into this node: adds x's word total, takes over x's names and
// category index, and appends copies of x's documents to this node's list.
// Adding a node to itself is a no-op. Returns *this.
const CCatalogNode& CCatalogNode::operator += (const CCatalogNode& x)
{
    if(this!=&x)
    {
        m_lTotalWordNum += x.m_lTotalWordNum ;
        m_idxCata=x.m_idxCata;
        m_strDirName=x.m_strDirName;
        m_strCatalogName=x.m_strCatalogName;

        for(POSITION posSrc = x.m_lstDocList.GetHeadPosition(); posSrc!=NULL; )
        {
            CDocNode& srcDoc=x.m_lstDocList.GetNext(posSrc);
            m_lstDocList.AddTail(srcDoc);
        }
    }
    return *this;
}
// Default constructor: category index -1, document-id counter and word total
// at 0, followed by InitCatalogNode() with its default mode.
CCatalogNode::CCatalogNode()
{
    m_lTotalWordNum=0;
    m_lCurDocID=0;
    m_idxCata=-1;
    InitCatalogNode();
}
// Destructor: runs InitCatalogNode() with its default mode, which frees the
// buffers held by the documents of this category.
CCatalogNode::~CCatalogNode()
{
    InitCatalogNode();
}
void CCatalogNode::SetStartDocID(long lDocID)
{
m_lCurDocID=lDocID;
}
// Registers every regular file found directly inside strPath as a document of
// this category.
//
// strPath : directory to scan. The process working directory is changed to it
//           (via _chdir) and is not restored afterwards.
// Returns -1 (after showing a message box) when the directory cannot be
// entered; otherwise the value of the document-id counter after the scan.
//
// Each file gets a fresh CDocNode whose name is the bare file name and whose
// index is taken from m_lCurDocID, which is then incremented.
long CCatalogNode::ScanDirectory(CString strPath)
{
    if(_chdir(strPath)) // if can't find the dir
    {
        CString csTmp = "目录";
        csTmp+=strPath;
        csTmp+="不存在!";
        AfxMessageBox(csTmp);
        return -1;
    }

    WIN32_FIND_DATA findData;
    HANDLE hFinder = ::FindFirstFile("*.*",&findData);
    if(hFinder==INVALID_HANDLE_VALUE)
        return m_lCurDocID; // nothing matched

    // do/while so that the entry returned by FindFirstFile itself is examined
    // too; it used to be skipped unconditionally.
    do
    {
        // Sub-directories are ignored; this also covers "." and "..".
        if(findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
            continue;

        // A default-constructed node already has empty buffers.
        CDocNode docnode;
        docnode.m_strDocName=findData.cFileName;
        docnode.m_idxDoc=m_lCurDocID++;
        AddDoc(docnode);
    }
    while(::FindNextFile(hFinder,&findData));

    ::FindClose(hFinder); // the search handle used to be leaked
    return m_lCurDocID;
}
// Constructor. Note that m_nSaveMode is a static member, so constructing any
// CCatalogList resets the save mode shared by all instances to 0.
CCatalogList::CCatalogList()
{
    CCatalogList::m_nSaveMode = 0;
}
// Destructor: performs no explicit work.
CCatalogList::~CCatalogList()
{
    // intentionally empty
}
// Writes one text line per category to strFileName in the form
// "<category index> <category name>". The file is created or truncated.
// Shows a message box and returns without writing if the file cannot be opened.
void CCatalogList::DumpCataList(CString strFileName)
{
    FILE *stream;
    if( (stream = fopen( strFileName, "w+" )) == NULL )
    {
        AfxMessageBox("无法创建文件"+strFileName+"!");
        return;
    }
    POSITION pos=GetFirstPosition();
    while(pos!=NULL)
    {
        CCatalogNode& catanode=GetNext(pos);
        // Variadic functions receive no implicit conversions: pass the index
        // as an int and the name as a plain character pointer rather than as
        // a CString object.
        fprintf(stream,"%d %s\n",(int)catanode.m_idxCata,(LPCTSTR)catanode.m_strCatalogName);
    }
    fclose(stream);
}
// Writes the document vectors of all categories to strFileName as text.
// Each document that has a weight vector produces one line:
//   <category index + 1> <word index + 1>:<weight> <word index + 1>:<weight> ...
// Only weights whose absolute value exceeds dZero are listed.
// Shows a message box and returns if the file cannot be created.
void CCatalogList::DumpDocList(CString strFileName)
{
    FILE *fpOut=fopen( strFileName, "w+" );
    if(fpOut==NULL)
    {
        AfxMessageBox("无法创建文件"+strFileName+"!");
        return;
    }

    for(POSITION posCata=GetFirstPosition(); posCata!=NULL; )
    {
        CCatalogNode& cata=GetNext(posCata);
        for(POSITION posDoc=cata.GetFirstPosition(); posDoc!=NULL; )
        {
            CDocNode& doc=cata.GetNext(posDoc);
            if(!(doc.m_nAllocLen>0)) continue;   // no vector for this document

            fprintf(fpOut,"%d",cata.m_idxCata+1);
            for(int k=0;k<doc.m_nAllocLen;k++)
            {
                // Skip entries that are not strictly above the zero threshold.
                if(!(fabs(doc.m_sWeightSet[k].s_dWeight) > dZero)) continue;
                fprintf(fpOut," %d:%f",doc.m_sWeightSet[k].s_idxWord+1,doc.m_sWeightSet[k].s_dWeight);
            }
            fprintf(fpOut,"\n");
        }
    }
    fclose(fpOut);
}
// Serializes the whole catalog list into the binary file strFileName.
//   nSaveMode<=0 : the document vectors are saved
//   nSaveMode>0  : the document vectors are not saved
// The mode is stored in the static m_nSaveMode before serializing.
// Shows a message box and returns if the file cannot be created.
void CCatalogList::DumpToFile(CString strFileName, int nSaveMode)
{
    CFile fBinOut;
    const BOOL bOpened=fBinOut.Open(strFileName,CFile::modeWrite | CFile::modeCreate);
    if(!bOpened)
    {
        AfxMessageBox("无法创建文件"+strFileName+"!");
        return;
    }

    CArchive ar(&fBinOut,CArchive::store);
    CCatalogList::m_nSaveMode=nSaveMode;
    Serialize(ar);

    ar.Close();
    fBinOut.Close();
}
// Loads the catalog list from the binary file strFileName through Serialize().
// Returns FALSE (after showing a message box) if the file cannot be opened,
// TRUE otherwise.
BOOL CCatalogList::GetFromFile(CString strFileName)
{
    CFile fBinIn;
    const BOOL bOpened=fBinIn.Open(strFileName,CFile::modeRead);
    if(!bOpened)
    {
        AfxMessageBox("无法打开文件"+strFileName+"!");
        return FALSE;
    }

    CArchive ar(&fBinIn,CArchive::load);
    Serialize(ar);

    ar.Close();
    fBinIn.Close();
    return TRUE;
}
// Processes the documents of a document library: resets the catalog list with
// InitCatalogList() (default mode) and then scans strDirName.
// Returns whatever ScanDirectory() reports for that directory.
long CCatalogList::BuildLib(CString strDirName)
{
    InitCatalogList();
    const long lScanResult=ScanDirectory(strDirName);
    return lScanResult;
}
// Stores or loads the catalog list. The static save mode travels first in the
// archive, followed by the category list.
void CCatalogList::Serialize(CArchive &ar)
{
    if(!ar.IsStoring())
    {
        ar>>m_nSaveMode;
    }
    else
    {
        ar<<m_nSaveMode;
    }
    m_lstCatalogList.Serialize(ar);
}
// MFC collection helper: serializes nCount consecutive CCatalogNode elements
// starting at pElements by calling Serialize() on each of them.
// Previously only the first element was handled regardless of nCount, and
// pElements was dereferenced even when nCount was 0.
void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount)
{
    ASSERT(nCount==0||
        AfxIsValidAddress(pElements,nCount*sizeof(CCatalogNode)));
    for(int i=0;i<nCount;i++,pElements++)
        pElements->Serialize(ar);
}
// MFC collection helper: serializes nCount consecutive CDocNode elements
// starting at pElements by calling Serialize() on each of them.
// Previously only the first element was handled regardless of nCount, and
// pElements was dereferenced even when nCount was 0.
void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount)
{
    ASSERT(nCount==0||
        AfxIsValidAddress(pElements,nCount*sizeof(CDocNode)));
    for(int i=0;i<nCount;i++,pElements++)
        pElements->Serialize(ar);
}
// Stores or loads one document node.
//
// Archive layout: document index, document name and - only when
// CCatalogList::GetSaveMode()<=0 - the vector length followed by that many
// raw sWeightNode records.
//
// Loading releases any buffers the node already owns instead of overwriting
// the pointers, and reads exactly as many records as the stored length says,
// so a stored length of 0 no longer consumes bytes belonging to the next
// element. Classification results are never stored; after loading the node
// has none.
void CDocNode::Serialize(CArchive &ar)
{
    if(ar.IsStoring())
    {
        ar<<m_idxDoc;
        ar<<m_strDocName;
        if(CCatalogList::GetSaveMode()<=0)
        {
            ar<<m_nAllocLen;
            if(m_nAllocLen>0)
                ar.Write((void*)m_sWeightSet,m_nAllocLen*sizeof(sWeightNode));
        }
        return;
    }

    ar>>m_idxDoc;
    ar>>m_strDocName;

    // Start from an empty vector whatever the node held before.
    DeallocBuffer();
    if(CCatalogList::GetSaveMode()<=0)
    {
        int nLen=0;
        ar>>nLen;
        if(nLen>0)
        {
            AllocBuffer(nLen);
            ar.Read((void*)m_sWeightSet,nLen*sizeof(sWeightNode));
        }
    }

    // Frees m_pResults (if any) and resets m_nClassNum to 0.
    DeallocResultsBuffer();
}
// Stores or loads one category node: category index, directory name, total
// word count and category name (in that order), followed by the document list.
void CCatalogNode::Serialize(CArchive &ar)
{
    const BOOL bStoring=ar.IsStoring();
    if(!bStoring)
    {
        ar>>m_idxCata;
        ar>>m_strDirName;
        ar>>m_lTotalWordNum;
        ar>>m_strCatalogName;
    }
    else
    {
        ar<<m_idxCata;
        ar<<m_strDirName;
        ar<<m_lTotalWordNum;
        ar<<m_strCatalogName;
    }
    m_lstDocList.Serialize(ar);
}
// Allocates the shared temporary weight buffer (static m_pTemp) with room for
// nLen entries.
// Does nothing when nLen<=0 or when a buffer of exactly that length already
// exists. Otherwise the old buffer is freed first; its contents are not kept.
// The recorded length is cleared before allocating, so if `new` throws the
// state stays consistent (no buffer, length 0).
void CDocNode::AllocTempBuffer(int nLen)
{
    if(nLen<=0) return;
    if((m_nAllocTempLen==nLen)&&(m_pTemp!=NULL)) return;

    delete []m_pTemp;
    m_pTemp=NULL;
    m_nAllocTempLen=0;

    m_pTemp=new sWeightNode[nLen];
    m_nAllocTempLen=nLen;
}
// Releases the temporary space allocated for the weights (static m_pTemp) and
// resets its recorded length.
void CDocNode::DeallocTempBuffer()
{
    delete []m_pTemp;   // deleting a null pointer is a no-op
    m_pTemp=NULL;
    m_nAllocTempLen=0;
}
// Allocates this document's weight vector (m_sWeightSet) with room for nLen
// entries.
// Does nothing when nLen<=0 or when a buffer of exactly that length already
// exists. Otherwise the old buffer is freed first; its contents are not kept.
// The length is committed only after the allocation succeeded, so if `new`
// throws the node is left empty rather than with a non-zero length and a
// null pointer.
void CDocNode::AllocBuffer(int nLen)
{
    if(nLen<=0) return;
    if((m_nAllocLen==nLen)&&(m_sWeightSet!=NULL)) return;

    delete []m_sWeightSet;
    m_sWeightSet=NULL;
    m_nAllocLen=0;

    m_sWeightSet=new sWeightNode[nLen];
    m_nAllocLen=nLen;
}
// Releases the space allocated for the weights (m_sWeightSet) and resets the
// recorded length to 0.
void CDocNode::DeallocBuffer()
{
    delete []m_sWeightSet;   // deleting a null pointer is a no-op
    m_sWeightSet=NULL;
    m_nAllocLen=0;
}
// Allocates space for the classification results (m_pResults): one double per
// class, nLen classes.
// Does nothing when nLen<=0 or when a buffer of exactly that size already
// exists. Otherwise the old buffer is freed first; its contents are not kept.
// The class count is committed only after the allocation succeeded, so if
// `new` throws the node is left without results rather than with a non-zero
// count and a null pointer.
void CDocNode::AllocResultsBuffer(short nLen)
{
    if(nLen<=0) return;
    if((m_nClassNum==nLen)&&(m_pResults!=NULL)) return;

    delete []m_pResults;
    m_pResults=NULL;
    m_nClassNum=0;

    m_pResults=new double[nLen];
    m_nClassNum=nLen;
}
// Releases the space allocated for the classification results (m_pResults)
// and resets the class count to 0.
void CDocNode::DeallocResultsBuffer()
{
    delete []m_pResults;   // deleting a null pointer is a no-op
    m_pResults=NULL;
    m_nClassNum=0;
}
int CDocNode::ScanChinese(char * pPath,CWordList& wordList,int nCataNum, short idxCata)
{
CFile fin;
char *buffer;
strcpy(pTempStr,pPath);
strcat(pTempStr,"\\");
strcat(pTempStr,m_strDocName.GetBuffer(0));
if(!fin.Open(pTempStr,CFile::modeRead))
return -1;
unsigned int flen=fin.GetLength();
buffer=new char[flen+1];
flen=fin.ReadHuge(buffer,flen);
buffer[flen]='\0';
fin.Close();
int num=ScanChineseString(buffer,wordList,nCataNum,m_idxDoc,idxCata);
delete[] buffer;
return num;
}
// Extracts words from a NUL-terminated text and adds them to wordList.
//
// pPath    : despite its name, this is the text to scan, not a path
// wordList : receives every accepted word via Add(word,idxCata,idxDoc,nCataNum)
// Returns the number of words added, i.e. the word count of the text after
// stop words have been removed.
//
// The text is consumed piece by piece through ParseFile(); each piece is then
// read from m_pSentence (presumably filled by ParseFile - confirm there).
//  - A piece whose first byte is a positive char is treated as an English word
//    and kept when it has at least 2 characters and does not start with a digit.
//  - Otherwise it is treated as a run of Chinese characters (2 bytes each):
//    odd lengths are skipped, a single character is added as is, longer runs
//    are segmented with g_wordSeg and every word that is not a stop word is
//    added.
int CDocNode::ScanChineseString(char * pPath,CWordList& wordList,int nCataNum, long idxDoc, short idxCata)
{
    char *buffer=pPath;
    int i,j,sum;
    char *w;
    // realcnt: total number of words left after stop-word removal
    // nStart : start of the current sentence inside buffer
    int nStart=0,nNewStart=0;
    int nSentenceLen=0;
    int realcnt=0;
    while(buffer[nStart]!='\0')
    {
        nSentenceLen=ParseFile(buffer,nStart,nNewStart);
        nStart=nNewStart;
        if(nSentenceLen==0) continue;
        // Explicit signed comparison so the test does not depend on whether
        // plain char is signed for this build.
        if((signed char)m_pSentence[0]>0) // an English word
        {
            // keep it if it has at least 2 characters and is not a number
            if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
            {
                wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum);
                realcnt++;
            }
        }
        else // a run of Chinese characters
        {
            if(nSentenceLen%2!=0) continue;
            if(nSentenceLen==2) // a single Chinese character
            {
                wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum);
                realcnt++;
            }
            else
            {
                g_wordSeg.Segment(m_pSentence);
                for(i=0;i<g_wordSeg.GetSegmentCount();i++)
                {
                    sum=0;
                    PWORD_RESULT pItem=g_wordSeg.GetWordSeg(i);
                    // count the entries up to the empty terminator
                    while(pItem[sum].sWord[0]!=0) sum++;
                    // the first and the last entry are not used
                    for(j=1;j<sum-1;j++)
                    {
                        w=pItem[j].sWord;
                        // Skip just this word. A sticky flag used to make one
                        // stop word discard every later word of the sentence.
                        if(w[0]=='\0'||g_wordSeg.isInStopWords(w)) continue;
                        wordList.Add(w,idxCata,idxDoc,nCataNum);
                        realcnt++;
                    }
                }
            }
        }
    }
    return realcnt;
}
int CDocNode::ScanEnglish(char *pPath, CWordList &wordList, int nCataNum, short idxCata, bool bStem)
{
CFile fin;
char *buffer;
strcpy(pTempStr,pPath);
strcat(pTempStr,"\\");
strcat(pTempStr,m_strDocName.GetBuffer(0));
if(!fin.Open(pTempStr,CFile::modeRead))
return -1;
int flen=fin.GetLength();
buffer=new char[flen+1];
flen=fin.ReadHuge(buffer,flen);
buffer[flen]='\0';
fin.Close();
int num=ScanEnglishString(buffer,wordList,nCataNum,m_idxDoc,idxCata,bStem);
delete[] buffer;
return num;
}
// Splits an English text into words and adds them to wordList.
//
// pPath    : despite its name, this is the NUL-terminated text to scan. It is
//            modified in place: lower-cased, and every delimiter is
//            overwritten with '\0'.
// wordList : receives every accepted word via Add(word,idxCata,idxDoc,nCataNum)
// bStem    : when true, theStemmer.stem() is applied to a word before adding
// Returns the number of words added.
//
// Delimiters are space, CR, LF and the ASCII punctuation ranges 33-47, 58-64,
// 91-96 and 123-127. Only words longer than 2 characters are kept. The end of
// the text also terminates a word, so a final word without a trailing
// delimiter is no longer lost.
int CDocNode::ScanEnglishString(char *pPath, CWordList &wordList, int nCataNum, long idxDoc, short idxCata, bool bStem)
{
    char *buffer=pPath;
    _strlwr(buffer);

    int realcnt=0;
    int nFilePos=0;
    char *p=buffer;   // start of the word currently being collected
    for(;;)
    {
        const char c=buffer[nFilePos];
        const bool bAtEnd=(c=='\0');
        const bool bDelimiter=(c==' '||c=='\r'||c=='\n'||
            (c>32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127));
        if(bAtEnd||bDelimiter)
        {
            buffer[nFilePos]='\0';
            const int wordLen=(int)(buffer+nFilePos-p);
            if(wordLen>2)
            {
                if(bStem) theStemmer.stem(p,0,wordLen-1);
                wordList.Add(p,idxCata,idxDoc,nCataNum);
                realcnt++;
            }
            p=buffer+nFilePos+1;
        }
        if(bAtEnd) break;
        nFilePos++;
    }
    return realcnt;
}
// Returns TRUE when every weight in the document vector is exactly 0,
// FALSE as soon as one non-zero weight is found.
// The vector must have been allocated (asserted in debug builds).
BOOL CDocNode::IsZero()
{
    ASSERT(m_sWeightSet!=NULL);
    int idx=0;
    while(idx<m_nAllocLen)
    {
        if(m_sWeightSet[idx].s_dWeight!=0) return FALSE;
        ++idx;
    }
    return TRUE;
}
// Assignment: replaces this list's categories with copies of x's categories.
// Self-assignment is a no-op. Returns *this.
const CCatalogList& CCatalogList::operator = (const CCatalogList& x)
{
    if(this!=&x)
    {
        m_lstCatalogList.RemoveAll();
        for(POSITION posSrc = x.m_lstCatalogList.GetHeadPosition(); posSrc!=NULL; )
        {
            CCatalogNode& srcCata=x.m_lstCatalogList.GetNext(posSrc);
            m_lstCatalogList.AddTail(srcCata);
        }
    }
    return *this;
}
// Appends copies of all categories of x to this list.
// Adding a list to itself is a no-op. Returns *this.
const CCatalogList& CCatalogList::operator += (const CCatalogList& x)
{
    if(this!=&x)
    {
        for(POSITION posSrc = x.m_lstCatalogList.GetHeadPosition(); posSrc!=NULL; )
        {
            CCatalogNode& srcCata=x.m_lstCatalogList.GetNext(posSrc);
            m_lstCatalogList.AddTail(srcCata);
        }
    }
    return *this;
}
//nMode<=0 删除所有信息
//nMode=1 删除所有文档,但保留类别(节点)
//nMode=2 只删除文档向量所占用的内存
void CCatalogList::InitCatalogList(int nMode)
{
POSITION pos=GetFirstPosition();
while(pos!=NULL)
{
CCatalogNode& catanode=GetNext(pos);
if(nMode==2) catanode.InitCatalogNode(1);
else catanode.InitCatalogNode(0);
}
if(nMode<=1) m_lstCatalogList.RemoveAll();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -