📄 cataloglist.cpp

📁 良好的代码实现
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
}

long CCatalogList::GetDocNum()
{
	long i=0;
	POSITION pos=GetFirstPosition();
	while(pos!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos);
		i+=catanode.GetDocNum();
	}
	return i;
}

// Returns the number of catalog nodes stored in m_lstCatalogList.
int CCatalogList::GetCataNum()
{
	const int nCataCount=m_lstCatalogList.GetCount();
	return nCataCount;
}


// Appends docnode to the tail of this catalog's document list and returns
// the POSITION the list reports for the new element.
POSITION CCatalogNode::AddDoc(CDocNode &docnode)
{
	POSITION posNew=m_lstDocList.AddTail(docnode);
	return posNew;
}

bool CCatalogList::GetCataName(short idxCata,CString& strCataName)
{
	POSITION pos=GetFirstPosition();
	while(pos!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos);
		if(idxCata==catanode.m_idxCata)
		{
			strCataName=catanode.m_strCatalogName;
			return true;
		}
	}
	return false;
}

bool CCatalogList::GetDocName(short idxCata, long idxDoc, CString &strDocName)
{
	POSITION pos_cata=GetFirstPosition();
	while(pos_cata!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos_cata);
		if(idxCata==catanode.m_idxCata)
		{
			POSITION pos_doc=catanode.GetFirstPosition();
			while(pos_doc!=NULL)
			{
				CDocNode& docnode=catanode.GetNext(pos_doc);
				if(idxDoc==docnode.m_idxDoc)
				{
					strDocName=docnode.m_strDocName;
					return true;
				}
			}
			return false;
		}
	}
	return false;
}

// Appends catanode to the tail of the catalog list and returns the POSITION
// the list reports for the new element.
POSITION CCatalogList::AddCata(CCatalogNode &catanode)
{
	POSITION posNew=m_lstCatalogList.AddTail(catanode);
	return posNew;
}

// Returns the head POSITION of the catalog list. The loops in this file
// treat a NULL result as "no more elements".
POSITION CCatalogList::GetFirstPosition()
{
	POSITION posHead=m_lstCatalogList.GetHeadPosition();
	return posHead;
}

// Returns a reference to the catalog node at rPos and lets the underlying
// list update rPos for the next call. Callers in this file loop until rPos
// becomes NULL.
CCatalogNode& CCatalogList::GetNext(POSITION &rPos)
{
	CCatalogNode& node=m_lstCatalogList.GetNext(rPos);
	return node;
}

// Returns the head POSITION of this catalog's document list.
POSITION CCatalogNode::GetFirstPosition()
{
	POSITION posHead=m_lstDocList.GetHeadPosition();
	return posHead;
}

// Returns a reference to the document node at rPos and lets the underlying
// list update rPos for the next call.
CDocNode& CCatalogNode::GetNext(POSITION &rPos)
{
	CDocNode& doc=m_lstDocList.GetNext(rPos);
	return doc;
}

// Returns the number of documents stored in this catalog's document list.
UINT CCatalogNode::GetDocNum()
{
	const UINT nDocs=m_lstDocList.GetCount();
	return nDocs;
}

// Scans a directory and builds the catalog and document nodes.
//
// Changes the process working directory to strPath, then creates one
// CCatalogNode per sub-directory found there (the "." and ".." entries and
// plain files are ignored). Each new node gets a sequential m_idxCata, is
// appended to this list, and is then asked to scan its own directory.
//
// Returns -1 (after showing a message box) when strPath cannot be entered;
// otherwise the value returned by the last CCatalogNode::ScanDirectory call,
// or 0 when no sub-directory exists.
long CCatalogList::ScanDirectory(CString strPath)
{
	if(_chdir(strPath))  // non-zero: the directory could not be entered
	{
		CString	csTmp = "目录";
		csTmp+=strPath;
		csTmp+="不存在!";
		AfxMessageBox(csTmp);
		return -1;
	}

	long docNum=0;
	short idxCurCata=0;
	WIN32_FIND_DATA findData;

	// The pattern is relative: it enumerates the directory entered above.
	HANDLE hFinder = ::FindFirstFile("*.*",&findData);
	if(hFinder==INVALID_HANDLE_VALUE)
		return docNum;

	// do/while so that the entry returned by FindFirstFile is examined too.
	do
	{
		if( !strcmp(findData.cFileName,".") || !strcmp(findData.cFileName,"..") )
			continue;

		if((findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
		{
			CCatalogNode catalognode;
			catalognode.m_strCatalogName=findData.cFileName;
			catalognode.m_strDirName=strPath+"\\"+catalognode.m_strCatalogName;
			catalognode.m_idxCata=idxCurCata++;
			// Operate on the node stored in the list, not on the local one.
			POSITION posTemp=AddCata(catalognode);
			CCatalogNode& cataTemp=GetAt(posTemp);
			cataTemp.SetStartDocID(docNum);
			// NOTE(review): the result replaces docNum instead of being added
			// to it; presumably CCatalogNode::ScanDirectory returns the running
			// total that starts from SetStartDocID - confirm.
			docNum=cataTemp.ScanDirectory(catalognode.m_strDirName);
		}
	}
	while(::FindNextFile(hFinder,&findData));

	::FindClose(hFinder);  // release the search handle
	return docNum;
}

// Const overload: returns, by value, the catalog node stored at pos.
CCatalogNode CCatalogList::GetAt(POSITION pos) const
{
	const CCatalogNode& node=m_lstCatalogList.GetAt(pos);
	return node;
}

// Non-const overload: returns a reference to the catalog node stored at pos,
// so the caller can modify the element in place.
CCatalogNode& CCatalogList::GetAt(POSITION pos)
{
	CCatalogNode& node=m_lstCatalogList.GetAt(pos);
	return node;
}
// Gets a catalog node by its identifier.
// Returns a pointer to the first node whose m_idxCata equals idxCata, or
// NULL when there is none. The pointer refers to the element stored in the
// list.
CCatalogNode* CCatalogList::GetCata(short idxCata)
{
	for(POSITION posCata=GetFirstPosition();posCata!=NULL;)
	{
		CCatalogNode* pNode=&GetNext(posCata);
		if(pNode->m_idxCata==idxCata)
			return pNode;
	}
	return NULL;
}

// Gets a document by its name.
// Searches every catalog in list order and returns a pointer to the first
// document whose m_strDocName equals strDocName, or NULL when none matches.
CDocNode* CCatalogList::GetDocByName(CString strDocName)
{
	for(POSITION posCata=GetFirstPosition();posCata!=NULL;)
	{
		CCatalogNode& cata=GetNext(posCata);
		for(POSITION posDoc=cata.GetFirstPosition();posDoc!=NULL;)
		{
			CDocNode* pDoc=&cata.GetNext(posDoc);
			if(pDoc->m_strDocName==strDocName)
				return pDoc;
		}
	}
	return NULL;
}
// Gets a catalog by its name.
// Returns a pointer to the first node whose m_strCatalogName equals
// strCataName, or NULL when no catalog carries that name.
CCatalogNode* CCatalogList::GetCataByName(CString strCataName)
{
	for(POSITION posCata=GetFirstPosition();posCata!=NULL;)
	{
		CCatalogNode* pNode=&GetNext(posCata);
		if(pNode->m_strCatalogName==strCataName)
			return pNode;
	}
	return NULL;
}

// Gets a catalog's ID by its name.
// Returns the m_idxCata of the first node whose m_strCatalogName equals
// strCataName, or -1 when no catalog carries that name.
short CCatalogList::GetCataIDByName(CString strCataName)
{
	POSITION posCata=GetFirstPosition();
	while(posCata!=NULL)
	{
		CCatalogNode& node=GetNext(posCata);
		if(!(node.m_strCatalogName==strCataName))
			continue;
		return node.m_idxCata;
	}
	return -1;
}

// Returns the sum of the s_tfi fields of the first m_nAllocLen entries of
// m_sWeightSet.
int CDocNode::GetWordNum()
{
	int nTotal=0;
	int idx=0;
	while(idx<m_nAllocLen)
	{
		nTotal+=m_sWeightSet[idx].s_tfi;
		++idx;
	}
	return nTotal;
}

// Copies the m_pTemp entries whose weight is greater than dZero into
// m_sWeightSet, keeping their order.
// AllocBuffer() is called with the number of such entries before the copy
// (presumably it sizes m_sWeightSet - confirm).
// Returns the number of entries copied.
int CDocNode::GenDocVector()
{
	int idx;

	// Pass 1: count the entries that will be kept.
	int nKept=0;
	for(idx=0;idx<m_nAllocTempLen;++idx)
	{
		if(m_pTemp[idx].s_dWeight>dZero)
			++nKept;
	}
	AllocBuffer(nKept);

	// Pass 2: copy them in order.
	int nOut=0;
	for(idx=0;idx<m_nAllocTempLen;++idx)
	{
		if(!(m_pTemp[idx].s_dWeight>dZero))
			continue;
		m_sWeightSet[nOut].s_idxWord=m_pTemp[idx].s_idxWord;
		m_sWeightSet[nOut].s_tfi=m_pTemp[idx].s_tfi;
		m_sWeightSet[nOut].s_dWeight=m_pTemp[idx].s_dWeight;
		++nOut;
	}
	return nOut;
}


// Builds the sparse word array of doc from the m_pTemp scratch vector.
//
// Every m_pTemp entry with a weight greater than 0 becomes one SVM_WORD
// whose wnum is the entry's index + 1 (feature ids in DOC start at 1) and
// whose weight is the entry's s_dWeight. The array ends with an element
// whose wnum is 0, and doc.docnum is set to -1.
//
// doc.words is allocated with malloc(); the caller owns it. Any previous
// value of doc.words is overwritten without being freed.
//
// Returns the number of words written, or -1 when m_pTemp is not available
// or the allocation fails (doc.words is then NULL).
int CDocNode::GenDocVector(DOC &doc)
{
	if(m_pTemp==NULL||m_nAllocTempLen<=0) return -1;

	int i,nSum=0;
	for(i=0;i<CDocNode::m_nAllocTempLen;i++)
	{
		if(CDocNode::m_pTemp[i].s_dWeight>0) nSum++;
	}

	// Over-allocated by 12 elements as in the original code; at least one
	// extra element is needed for the terminating entry written below.
	doc.words=(SVM_WORD *)malloc(sizeof(SVM_WORD)*(nSum+12));
	if(doc.words==NULL) return -1;

	nSum=0;
	for(i=0;i<CDocNode::m_nAllocTempLen;i++)
	{
		if(CDocNode::m_pTemp[i].s_dWeight>0)
		{
			// Feature ids in DOC start at 1.
			(doc.words[nSum]).wnum=i+1;
			(doc.words[nSum]).weight=CDocNode::m_pTemp[i].s_dWeight;
			nSum++;
		}
	}
	(doc.words[nSum]).wnum=0;  // terminator entry
	doc.docnum=-1;
	return nSum;
}

// Segments the NUL-terminated text in pPath (despite its name, the argument
// is the text itself, not a path) and fills the m_pTemp scratch vector.
//
// The text is split into runs with ParseFile(). A single-byte (English) run
// is looked up as a whole when it has at least 2 bytes and does not start
// with a digit. A double-byte (Chinese) run is segmented from its end
// towards its start: at each position the longest candidate (up to 10 bytes,
// i.e. 5 double-byte characters) is tried first and shortened by one
// character until wordList contains it; when nothing matches, one character
// is skipped.
//
// For every dictionary hit the entry m_pTemp[word id] receives the word id,
// an incremented s_tfi and the word's m_dWeight. Finally each weight is
// multiplied by its s_tfi and the vector is scaled to unit Euclidean length.
//
// Returns the number of dictionary hits.
int CDocNode::ScanChineseStringWithDict(char *pPath,CWordList& wordList)
{
	char *buffer=pPath;
	int i,j,k;
	short l,n;
	char gram[12];
	// realcnt: number of dictionary hits (the original comment describes it
	// as the word count left after stop-word removal).
	// nStart: offset in buffer where the current run starts.
	int realcnt=0,nStart=0,nNewStart=0;
	// Length in bytes of the current run.
	int nSentenceLen=0;
	memset(m_pTemp,0,sizeof(sWeightNode)*m_nAllocTempLen);
	CWordNode wordNode;
	while(buffer[nStart]!='\0')
	{
		nSentenceLen=ParseFile(buffer,nStart,nNewStart);
		nStart=nNewStart;
		if(nSentenceLen==0) continue;
		if(m_pSentence[0]>0) // the run is an English word
		{
			// At least 2 bytes long and not starting with a digit.
			if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
			{
				if(wordList.Lookup(m_pSentence,wordNode))
				{
					m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
					m_pTemp[wordNode.m_nWordID].s_tfi+=1;
					m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
					realcnt++;
				}
			}
		}
		else // the run is a string of double-byte characters
		{
			// An odd byte count cannot be a whole number of characters.
			if(nSentenceLen%2!=0) continue;
			// Scan backwards; per the original author this segments more
			// accurately.
			i=nSentenceLen;
			while(i>0)
			{
				// Candidates are at most 5 characters (10 bytes) long.
				if(i>10) k=10;
				else k=i;
				for(j=k;j>0;j=j-2)
				{
					// Copy the j bytes ending at offset i into gram.
					n=0;
					for(l=j;l>0;l--) gram[n++]=m_pSentence[i-l];
					gram[n]='\0';
					if(wordList.Lookup(gram,wordNode))
					{
						m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
						m_pTemp[wordNode.m_nWordID].s_tfi+=1;
						m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
						realcnt++;
						// Together with the "i=i-2" below this consumes
						// exactly the j matched bytes.
						i=i-j+2;
						break;
					}
				}
				i=i-2;
			}
		}
	}
	// Weight every dimension of the document vector, then normalise it.
	if(realcnt>0)
	{
		double sum=0;
		for(i=0;i<m_nAllocTempLen;i++)
		{
			m_pTemp[i].s_dWeight*=m_pTemp[i].s_tfi;
			sum+=(m_pTemp[i].s_dWeight*m_pTemp[i].s_dWeight);
		}
		sum=sqrt(sum);
		// If every matched word has weight 0 the norm is 0; dividing would
		// turn all weights into NaN, so the vector is left as all zeros.
		if(sum>0)
		{
			for(i=0;i<m_nAllocTempLen;i++)
				m_pTemp[i].s_dWeight/=sum;
		}
	}
	return realcnt;
}

int CDocNode::ScanChineseWithDict(char *pPath,CWordList& wordList)
{
	CFile fin;
	char *buffer;
	strcpy(pTempStr,pPath);
	strcat(pTempStr,"\\");
	strcat(pTempStr,m_strDocName.GetBuffer(0));
	if(!fin.Open(pTempStr,CFile::modeRead))
		return -1;

	int flen=fin.GetLength();
	buffer=new char[flen+1];
	flen=fin.ReadHuge(buffer,flen);
	buffer[flen]='\0';
	fin.Close();
	int num=ScanChineseStringWithDict(buffer,wordList);
	delete[] buffer;
	return num;
}

// Tokenises the NUL-terminated English text in pPath (despite its name, the
// argument is the text itself, not a path) and fills the m_pTemp scratch
// vector.
//
// The buffer is modified in place: it is lower-cased and every delimiter is
// overwritten with '\0'. Delimiters are space, CR, LF and the ASCII
// punctuation ranges 33-47, 58-64, 91-96 and 123-127. The end of the text
// also terminates a token, so the last word is processed even when no
// delimiter follows it.
//
// Tokens longer than 2 bytes are optionally stemmed (bStem) and looked up
// in wordList. For every hit the entry m_pTemp[word id] receives the word
// id, an incremented s_tfi and the word's m_dWeight. Finally each weight is
// multiplied by its s_tfi and the vector is scaled to unit Euclidean length.
//
// Returns the number of dictionary hits.
int CDocNode::ScanEnglishStringWithDict(char *pPath, CWordList &wordList, bool bStem)
{
	char *buffer=pPath;
	_strlwr(buffer);

	int nFilePos=0;
	int realcnt=0,wordLen=0;
	char c, *p;
	p=buffer;  // start of the token currently being collected
	memset(m_pTemp,0,sizeof(sWeightNode)*m_nAllocTempLen);
	CWordNode wordNode;
	for(;;)
	{
		c=buffer[nFilePos];
		// The terminator is treated like a delimiter so that the final
		// token is flushed before the loop ends.
		const bool bEnd=(c=='\0');
		if(bEnd||c==' '||c=='\r'||c=='\n'||
			(c>32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127))
		{
			buffer[nFilePos]='\0';
			wordLen=static_cast<int>(buffer+nFilePos-p);
			if(wordLen>2)
			{
				// NOTE(review): the return value of stem() is ignored;
				// confirm that the stemmer terminates the shortened word
				// itself.
				if(bStem) theStemmer.stem(p,0,wordLen-1);
				if(wordList.Lookup(p,wordNode))
				{
					m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
					m_pTemp[wordNode.m_nWordID].s_tfi+=1;
					m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
					realcnt++;
				}
			}
			p=buffer+nFilePos+1;
		}
		if(bEnd) break;
		nFilePos++;
	}

	// Weight every dimension of the document vector, then normalise it.
	int i;
	if(realcnt>0)
	{
		double sum=0;
		for(i=0;i<m_nAllocTempLen;i++)
		{
			m_pTemp[i].s_dWeight*=m_pTemp[i].s_tfi;
			sum+=(m_pTemp[i].s_dWeight*m_pTemp[i].s_dWeight);
		}
		sum=sqrt(sum);
		// If every matched word has weight 0 the norm is 0; dividing would
		// turn all weights into NaN, so the vector is left as all zeros.
		if(sum>0)
		{
			for(i=0;i<m_nAllocTempLen;i++)
				m_pTemp[i].s_dWeight/=sum;
		}
	}
	return realcnt;
}

int CDocNode::ScanEnglishWithDict(char *pPath, CWordList &wordList, bool bStem)
{
	CFile fin;
	char *buffer;
	strcpy(pTempStr,pPath);
	strcat(pTempStr,"\\");
	strcat(pTempStr,m_strDocName.GetBuffer(0));
	if(!fin.Open(pTempStr,CFile::modeRead))
		return -1;

	int flen=fin.GetLength();
	buffer=new char[flen+1];
	flen=fin.ReadHuge(buffer,flen);
	buffer[flen]='\0';
	fin.Close();
	int num=ScanEnglishStringWithDict(buffer,wordList,bStem);
	delete[] buffer;
	return num;
}

// Returns the dot product between this document's stored vector
// (m_sWeightSet, m_nAllocLen entries) and the m_pTemp scratch vector, where
// each stored entry is paired with the m_pTemp entry at its s_idxWord.
// If both vectors have unit length this is presumably used as a cosine
// similarity - confirm against the callers.
double CDocNode::ComputeSimilarityRatio()
{
	double dDot=0.0;
	int k=0;
	while(k<m_nAllocLen)
	{
		dDot+=m_sWeightSet[k].s_dWeight*m_pTemp[m_sWeightSet[k].s_idxWord].s_dWeight;
		++k;
	}
	return dDot;
}


// Extracts the next run of text from pBuffer, starting at offset nStart.
//
// The run is copied, without spaces and line breaks, into m_pSentence
// (NUL-terminated). nEnd receives the offset at which the next call should
// resume. The return value is the number of bytes stored in m_pSentence.
//
// Leading ASCII space/CR/LF and full-width spaces (0xA1 0xA1) are skipped.
// The first remaining byte decides the kind of run: a positive char starts
// a single-byte ("English") run, anything else a double-byte ("Chinese")
// run. The byte values used suggest GB2312/GBK text - confirm.
//
// A Chinese run ends at the character 0xB5 0xC4 (per the original comment,
// the character "de"), at any of the listed double-byte punctuation/symbol
// ranges, at ASCII punctuation (all of these are consumed), or in front of
// any other ASCII character (not consumed). ASCII whitespace and full-width
// spaces inside the run are skipped, and a character whose second byte is
// below 0x80 is dropped as invalid.
//
// An English run ends at whitespace or ASCII punctuation (consumed) or in
// front of a byte above 127 (not consumed).
int CDocNode::ParseFile(char *pBuffer, int nStart, int &nEnd)
{
	// nSum: number of bytes stored in m_pSentence so far.
	int nCurrent,nSum=0;
	byte bChar[2];
	// Whether the run is a Chinese (double-byte) one.
	bool bChinese=true;
	nCurrent=nStart;
	// Skip leading half-width and full-width spaces, then decide whether the
	// run starts with a Chinese or an English character.
	while(pBuffer[nCurrent]!='\0')
	{
		bChar[0]=pBuffer[nCurrent];
		if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
			nCurrent++;
		else if(bChar[0]==0xA1)
		{
			// At worst this reads the terminating NUL, which is in bounds.
			bChar[1]=pBuffer[nCurrent+1];
			if(bChar[1]==0xA1) 
				nCurrent+=2;
			else
			{
				bChinese=true;
				break;
			}
		}
		else if(pBuffer[nCurrent]>0)
		{
			bChinese=false;
			break;
		}
		else
		{
			bChinese=true;
			break;
		}
	}

	while(pBuffer[nCurrent]!='\0')
	{
		bChar[0]=pBuffer[nCurrent];
		if(bChar[0]>127)
		{	
			// A double-byte lead byte ends an English run (not consumed).
			if(!bChinese) break;
			nCurrent++;
			bChar[1]=pBuffer[nCurrent];
			// A lead byte directly followed by the terminator is a truncated
			// character: stop on the NUL so the scan never moves past the
			// end of the buffer.
			if(bChar[1]=='\0') break;
			// 0xA1A1 is the full-width space; it is skipped.
			if((bChar[0]!=0xA1)||(bChar[1]!=0xA1))
			{
				// The character 0xB5C4 ("de"), punctuation or other
				// full-width symbols end the run and are consumed.
				if(((bChar[0]==0xB5)&&(bChar[1]==0xC4))||
					((bChar[0]==0xA1)&&(bChar[1]>0xA1)&&(bChar[1]<=0xFE))||
					((bChar[0]==0xA2)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFC))||
					((bChar[0]==0xA3)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFE))||
					((bChar[0]==0xA4)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF3))||
					((bChar[0]==0xA5)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF6))||
					((bChar[0]==0xA6)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF5))||
					((bChar[0]==0xA7)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF1))||
					((bChar[0]==0xA8)&&(bChar[1]>=0x40)&&(bChar[1]<=0xE9))||
					((bChar[0]==0xA9)&&(bChar[1]>=0x40)&&(bChar[1]<=0xEF))||
					((bChar[0]==0xAA)&&(bChar[1]==0xA5)))
				{
					nCurrent++;
					break;
				}
				// A second byte below 0x80 marks an invalid character, which
				// is dropped; valid characters are stored as two bytes.
				else if(bChar[1]>=0x80)
				{
					m_pSentence[nSum]=pBuffer[nCurrent-1];
					nSum++;
					m_pSentence[nSum]=pBuffer[nCurrent];
					nSum++;
				}
			}
		}
		else
		{
			if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
			{
				// Whitespace ends an English run; inside a Chinese run it
				// is simply skipped.
				if(!bChinese)
				{
					nCurrent++;
					break;
				}
			}
			else
			{
				//if(bChar[0]=='!'||bChar[0]=='?'||bChar[0]==':'||
				//	bChar[0]==';'||bChar[0]=='.')
				// ASCII punctuation ends either kind of run and is consumed.
				if((bChar[0]>32&&bChar[0]<=47)||(bChar[0]>=58&&bChar[0]<=64)||
					(bChar[0]>=91&&bChar[0]<=96)||(bChar[0]>=123&&bChar[0]<=127))
				{
					nCurrent++;
					break;
				}
				// Any other ASCII character ends a Chinese run (not consumed).
				else if(bChinese) break;
				else
				{
					m_pSentence[nSum]=pBuffer[nCurrent];
					nSum++;
				}
			}
		}
		nCurrent++;
	}
	m_pSentence[nSum]='\0';
	nEnd=nCurrent;
	return nSum;
}

// Re-numbers the catalogs of this list using the ids of catalogList.
// For every node, the id registered in catalogList under the same
// m_strCatalogName is stored into the node's m_idxCata.
// Returns false as soon as a name is unknown to catalogList (nodes visited
// before that point keep their new ids); returns true otherwise.
bool CCatalogList::BuildCatalogID(CCatalogList & catalogList)
{
	for(POSITION posCata=GetFirstPosition();posCata!=NULL;)
	{
		CCatalogNode& node=GetNext(posCata);
		const short idFound=catalogList.GetCataIDByName(node.m_strCatalogName);
		if(idFound<0)
			return false;
		node.m_idxCata=idFound;
	}
	return true;
}

// Returns true when the NUL-terminated string p consists only of the
// characters '0'..'9'. An empty string also yields true.
bool CDocNode::IsNumber(char *p)
{
	const char *pCur=p;
	// Skip the leading run of digits; the terminator is not a digit, so the
	// scan stops there at the latest.
	while(*pCur>='0'&&*pCur<='9')
		++pCur;
	return *pCur=='\0';
}

// Converts a category string in "smart" format into an array of category
// ids.
//
// line is read as a sequence of "<name> <number>" entries separated by ';'.
// Parsing stops at the first CR or at the end of the string. aryCataID is
// cleared first, then receives the id of every entry whose name is known
// to GetCataIDByName(). The number of each entry is parsed but not used.
//
// Returns the number of entries whose category name was not recognised.
short CCatalogList::GetCataIDArrayFromString(char * line, CArray<short, short> &aryCataID)
{
	int pos=0,d;
	short result=0;
	char type[MAX_PATH];

	aryCataID.RemoveAll();
	// The width 259 keeps the name inside type[MAX_PATH] (MAX_PATH is 260).
	while(sscanf(line+pos,"%259s %d",type,&d) != EOF)
	{
		const short id=GetCataIDByName(type);
		if(id>=0) aryCataID.Add(id);
		else result++;

		// Move to the end of the current entry whether or not it was
		// recognised; otherwise an unknown name would be parsed forever.
		while(line[pos]!=';'&&line[pos]!='\r'&&line[pos]!='\0') 
			pos++;
		// CR or end of string: nothing more to parse.
		if(line[pos]!=';') break;
		// Step over the ';' only; sscanf skips any blanks that follow, and
		// the scan can never move past the terminator.
		pos++;
	}
	return result;
}

// Returns a reference to the document node stored at position in this
// catalog's document list.
CDocNode& CCatalogNode::GetAt(POSITION position)
{
	CDocNode& doc=m_lstDocList.GetAt(position);
	return doc;
}

// Returns the current value of CCatalogList::m_nSaveMode.
int CCatalogList::GetSaveMode()
{
	const int nMode=CCatalogList::m_nSaveMode;
	return nMode;
}


// Currently a stub: ignores both parameters and returns m_nAllocLen
// converted to double. The summation over per-category word weights that
// the original author disabled is kept below for reference.
double CDocNode::ComputeProbability(CWordList &wordlist,int n)
{
	// Disabled implementation:
	//
	// double sum=0.0;
	// for(int i=0;i<m_nAllocTempLen;i++)
	// {
	// 	int j=m_pTemp[i].s_idxWord;
	// 	CString str = wordlist.GetWordByID(i);
	// 	CWordNode &wordnode = wordlist.m_lstWordList[str];
	// 	sum+= wordnode.m_pCataWeightPro[n];
	// }
	// return sum;
	const double dResult=m_nAllocLen;
	return dResult;
}
上一页 12
💿 文件大小 3508 K
👤 上传用户 Kunlun_mrpii
📂 所属分类文章/文档
🏷️ 相关标签

#代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -