⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segtool.cpp

📁 用字典进行语料的切词
💻 CPP
字号:
// SegTool.cpp: implementation of the CSegTool class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "SegTool.h"


//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Default constructor. No explicit initialisation is performed here; the
// word list is filled later by InitSegTool().
CSegTool::CSegTool()
{
}

// Destructor. Empties the in-memory dictionary before the object goes away.
CSegTool::~CSegTool()
{
	this->m_WordList.clear();
}

// Loads the dictionary file into m_WordList, one word per line.
//
// psWordDict : path of the dictionary text file.
// Returns    : true when the file was opened and read, false when the path
//              is NULL or the file cannot be opened.
//
// Words are appended to m_WordList in file order; nothing is sorted here.
// The lookup routines of this class binary-search m_WordList using strcmp()
// ordering, so the file is expected to be sorted that way already.
// Lines longer than 255 bytes are split by fgets() into several entries.
bool CSegTool::InitSegTool(char *psWordDict){
	if( psWordDict == NULL ) return false;

	FILE *fpWordFile = fopen(psWordDict,"r");
	if( fpWordFile == NULL ) return false;

	char szWord[256];
	while( fgets(szWord,static_cast<int>(sizeof(szWord)),fpWordFile) != NULL ){
		// Strip the line terminator first ("\n" or "\r\n"), so that the
		// empty-entry test below sees the real content of the line.
		size_t nLen = strlen(szWord);
		while( nLen > 0 && (szWord[nLen - 1] == '\n' || szWord[nLen - 1] == '\r') ){
			szWord[--nLen] = 0;
		}
		// Blank lines carry no word; do not store them as empty entries.
		if( nLen == 0 ) continue;
		m_WordList.push_back(szWord);
	}
	fclose(fpWordFile);

	return true;
}

// Segments a whole text file and writes the result to another file.
//
// psInputFile  : path of the text file to read.
// psOutputFile : path of the file to create/overwrite with the segmented
//                text (each word is followed by '/').
// Returns      : 1 on success, 0 when a path is NULL or a file cannot be
//                opened.
//
// The input is processed line by line. After each line, SegSentence() is
// asked to leave up to 40 unsegmented bytes at the end of the buffer; that
// tail is carried over and prepended to the next line so a word that spans
// a read boundary is not cut. A final call with a reserve of 0 flushes the
// remaining tail.
int CSegTool::SegFile( char *psInputFile,char *psOutputFile ){
	if( psInputFile == NULL || psOutputFile == NULL ) return 0;

	FILE *fpInput = fopen(psInputFile,"r");
	if( fpInput == NULL ){
		return 0;
	}
	FILE *fpOutput = fopen(psOutputFile,"w");
	if( fpOutput == NULL ){
		fclose(fpInput);	// do not leak the already opened input file
		return 0;
	}

	char szLine[1024];
	// Carried-over tail (at most the 40 byte reserve) plus one line.
	char szInput[1096];
	// Worst case every input byte becomes its own word followed by '/',
	// so the output may be twice as long as the input.
	char szOutput[2 * sizeof(szInput) + 1];
	char *psRes = NULL;
	szInput[0] = 0;

	while( fgets(szLine,static_cast<int>(sizeof(szLine)),fpInput ) != NULL ){
		if( psRes != NULL ){
			// psRes points into szInput itself, so the regions overlap:
			// memmove is required, strcpy would be undefined behaviour.
			memmove(szInput,psRes,strlen(psRes) + 1);
		}
		strcat(szInput,szLine);
		SegSentence( 40 , szInput, szOutput, psRes );
		fprintf(fpOutput, "%s" , szOutput);
	}

	// Flush whatever was held back after the last line.
	if( psRes != NULL ){
		memmove(szInput,psRes,strlen(psRes) + 1);
	}
	SegSentence( 0 , szInput, szOutput, psRes );
	fprintf(fpOutput, "%s" , szOutput);

	fclose(fpInput );
	fclose(fpOutput );
	return 1;
}

// Segments a NUL-terminated string into words, leaving a tail unsegmented.
//
// nResNum     : number of bytes to hold back at the end of the input;
//               segmentation stops once fewer than this many bytes remain.
//               Negative values are treated as 0.
// psInputSen  : text to segment.
// psOutputSen : receives the segmented text, each word followed by '/'.
//               The caller must provide room for up to twice the input
//               length plus the terminator.
// psRes       : set to the first unsegmented byte inside psInputSen.
// Returns     : number of bytes of psInputSen left unsegmented.
int CSegTool::SegSentence( int nResNum ,char *psInputSen , char *psOutputSen ,char *&psRes){
	if( psOutputSen != NULL ) *psOutputSen = 0;
	if( psInputSen == NULL || psOutputSen == NULL ){
		psRes = psInputSen;
		return 0;
	}

	// Work with signed lengths throughout to avoid int/size_t comparisons.
	const int nInputLen = static_cast<int>(strlen(psInputSen));
	if( nResNum < 0 ) nResNum = 0;

	// Input not longer than the reserve: everything is held back.
	if( nResNum >= nInputLen ) {
		psRes = psInputSen;
		return nInputLen;
	}

	int nSegNum = 0;		// bytes of the input consumed so far
	char szWord[256];
	char *psOut = psOutputSen;	// append position, avoids rescanning with strcat
	while( nSegNum + nResNum <= nInputLen ){
		if( GetOneWord(psInputSen + nSegNum ,szWord) == false ) break;
		const size_t nWordLen = strlen(szWord);
		if( nWordLen == 0 ) break;	// no progress possible; avoid looping forever
		memcpy(psOut,szWord,nWordLen);
		psOut += nWordLen;
		*psOut++ = '/';
		*psOut = 0;
		nSegNum += static_cast<int>(nWordLen);
	}
	psRes = psInputSen + nSegNum ;
	return nInputLen - nSegNum ;
}

// Extracts the next word from the start of psInput.
//
// psInput  : NUL-terminated text to read from.
// psOutput : receives the word; the result is at most 255 bytes plus the
//            terminator.
// Returns  : false when a pointer is NULL or psInput is empty, true
//            otherwise.
//
// A byte with the high bit set is taken as the first byte of a two-byte
// character (presumably a GB2312/GBK-style encoding -- confirm against the
// dictionary file). The candidate starts as the first character and is
// extended one character at a time while it is still the head of some
// dictionary word; the last candidate that matched a dictionary word exactly
// is returned. If none matched, the first character alone is returned.
bool CSegTool::GetOneWord(char *psInput ,char *psOutput){
	if( psInput == NULL || psOutput == NULL ) return false;
	if( *psInput == 0 ) return false;

	char szSearchWord[256];		// current candidate
	char szWordSeg[256];		// best result so far
	const int nMaxLen = static_cast<int>(sizeof(szSearchWord)) - 1;
	const int nInputLen = static_cast<int>(strlen(psInput));
	int nHead = 0;
	int nTail = m_WordList.size();

	// Start with the first character; it is the fallback result.
	int nPos = ( *psInput & 0x80 ) ? 2 : 1;
	memcpy(szSearchWord,psInput,nPos);
	szSearchWord[nPos] = 0;
	strcpy(szWordSeg,szSearchWord);

	while( nPos <= nInputLen ){
		// The search narrows [nHead, nTail] to the entries that start with
		// the candidate, so the next, longer candidate scans a smaller range.
		const int nFlag = SearchWordHeadRange(szSearchWord,nHead ,nTail ,nHead,nTail);
		if( nFlag == -1 ) break;		// nothing starts with the candidate
		if( nFlag == 2 ){
			// The candidate itself is a dictionary word: remember it.
			strcpy(szWordSeg,szSearchWord);
			if( nHead == nTail ) break;	// and no longer word can follow
			nHead++;			// skip the exact entry
		}

		// Extend the candidate by one character, but never beyond the buffer.
		const int nNext = nPos + ( ( psInput[nPos] & 0x80 ) ? 2 : 1 );
		if( nNext > nMaxLen ) break;
		nPos = nNext;
		memcpy(szSearchWord,psInput,nPos);
		szSearchWord[nPos] = 0;
	}
	strcpy(psOutput,szWordSeg);
	return true;
}

// Copies the dictionary word stored at nIndex into psWord.
//
// nIndex : zero-based position in m_WordList.
// psWord : destination buffer; the caller must make it large enough for the
//          word and its terminator.
// Returns: true on success, false when psWord is NULL or nIndex is outside
//          [0, m_WordList.size()).
bool CSegTool::GetWord( int nIndex, char *psWord ){
	if( psWord == NULL ) return false;
	// size() itself is one past the last valid index, so it must be rejected.
	if( nIndex < 0 || nIndex >= static_cast<int>(m_WordList.size()) ) return false;
	strcpy(psWord,m_WordList[nIndex].begin());
	return true;
}

// Looks psKey up in the whole dictionary.
// Returns whatever BSearch() reports for the range 0 .. m_WordList.size():
// the index of the word, or -1 when it is not found.
int CSegTool::SearchOneWord( char *psKey ){
	const int nWordCount = m_WordList.size();
	return BSearch(psKey, 0, nWordCount);
}


// Finds the run of dictionary entries that start with psKey, searching
// between nHead and nTail.
//
// nReHead : receives the result of SearchWordHeadTop() (-1 when nothing
//           starts with psKey).
// nReTail : receives the result of SearchWordHeadDown(); left untouched when
//           nothing starts with psKey.
// Returns : -1 when no entry starts with psKey,
//            2 when the entry at nReHead is exactly psKey,
//            1 otherwise.
//
// nHead/nTail are taken by value, so callers may pass the same variables for
// the input range and for the two output references.
int CSegTool::SearchWordHeadRange( char *psKey ,int nHead,int nTail,int &nReHead, int &nReTail){
	const int nTop = SearchWordHeadTop(psKey, nHead, nTail);
	nReHead = nTop;
	if( nTop == -1 ){
		return -1;
	}

	nReTail = SearchWordHeadDown(psKey, nHead, nTail);

	const bool bExact = ( strcmp(m_WordList[nReHead].begin(), psKey) == 0 );
	return bExact ? 2 : 1;
}

// Binary-searches for the FIRST dictionary entry that starts with psKey.
//
// psKey        : prefix to look for.
// nHead, nTail : index range to search; nTail may be the last index or
//                m_WordList.size(). The range is clamped to valid indices.
// Returns      : the lowest index in the range whose word starts with psKey,
//                or -1 when there is none (or psKey is NULL / list is empty).
//
// Requires m_WordList to be sorted in strcmp() order. In such a list the
// entries starting with psKey are contiguous, and every other entry is
// either smaller than psKey (left of the run) or greater (right of it).
int CSegTool::SearchWordHeadTop( char *psKey ,int nHead, int nTail){
	if( psKey == NULL || m_WordList.size() == 0 ) return -1;

	const int nLast = static_cast<int>(m_WordList.size()) - 1;
	int nLow = ( nHead < 0 ) ? 0 : nHead;
	int nHigh = ( nTail > nLast ) ? nLast : nTail;
	const size_t nKeyLen = strlen(psKey);

	int nFound = -1;
	while( nLow <= nHigh ){
		const int nMiddle = nLow + (nHigh - nLow) / 2;
		const char *psWord = m_WordList[nMiddle].begin();
		if( strncmp(psWord, psKey, nKeyLen) == 0 ){
			// A match; an earlier one may still exist to the left.
			nFound = nMiddle;
			nHigh = nMiddle - 1;
		}else if( strcmp(psWord, psKey) > 0 ){
			nHigh = nMiddle - 1;
		}else{
			nLow = nMiddle + 1;
		}
	}
	return nFound;
}

// Binary-searches for the LAST dictionary entry that starts with psKey.
//
// psKey        : prefix to look for.
// nHead, nTail : index range to search; nTail may be the last index or
//                m_WordList.size(). The range is clamped to valid indices.
// Returns      : the highest index in the range whose word starts with
//                psKey, or -1 when there is none (or psKey is NULL / list is
//                empty).
//
// Requires m_WordList to be sorted in strcmp() order. In such a list the
// entries starting with psKey are contiguous, and every other entry is
// either smaller than psKey (left of the run) or greater (right of it).
int CSegTool::SearchWordHeadDown( char *psKey ,int nHead, int nTail){
	if( psKey == NULL || m_WordList.size() == 0 ) return -1;

	const int nLast = static_cast<int>(m_WordList.size()) - 1;
	int nLow = ( nHead < 0 ) ? 0 : nHead;
	int nHigh = ( nTail > nLast ) ? nLast : nTail;
	const size_t nKeyLen = strlen(psKey);

	int nFound = -1;
	while( nLow <= nHigh ){
		const int nMiddle = nLow + (nHigh - nLow) / 2;
		const char *psWord = m_WordList[nMiddle].begin();
		if( strncmp(psWord, psKey, nKeyLen) == 0 ){
			// A match; a later one may still exist to the right.
			nFound = nMiddle;
			nLow = nMiddle + 1;
		}else if( strcmp(psWord, psKey) > 0 ){
			nHigh = nMiddle - 1;
		}else{
			nLow = nMiddle + 1;
		}
	}
	return nFound;
}

// Binary-searches m_WordList for an entry equal to psKey.
//
// psKey        : word to look for.
// nHead, nTail : index range to search; nTail may be the last index or
//                m_WordList.size(). The range is clamped to valid indices,
//                so both calling conventions search the intended entries.
// Returns      : index of the matching entry, or -1 when it is not found
//                (or psKey is NULL / the list is empty).
//
// Requires m_WordList to be sorted in strcmp() order.
int CSegTool::BSearch( char *psKey ,int nHead , int nTail ){
	if( psKey == NULL || m_WordList.size() == 0 ) return -1;

	const int nLast = static_cast<int>(m_WordList.size()) - 1;
	int nLow = ( nHead < 0 ) ? 0 : nHead;
	int nHigh = ( nTail > nLast ) ? nLast : nTail;

	while( nLow <= nHigh ){
		const int nMiddle = nLow + (nHigh - nLow) / 2;
		const int nCmp = strcmp(m_WordList[nMiddle].begin(), psKey);
		if( nCmp == 0 ){
			return nMiddle;
		}else if( nCmp > 0 ){
			nHigh = nMiddle - 1;
		}else{
			nLow = nMiddle + 1;
		}
	}
	return -1;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -