⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 diction.cpp

📁 计算机英汉机器翻译系统中的英语词性标注方法实现
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	int nEngHuffLen;
	int nEngOrgLen;
	int nWordsNum;
	long lDataSite;
	int nWordInfoLen;
	BOOL bIsEven = FALSE;

	BOOL bResult;
	do {
		bResult = ReadALineFromEngHuff(fpEnglishHuff,szEngHuff,nEngHuffLen,
							nEngOrgLen,nWordsNum,lDataSite,nWordInfoLen);
			if ( bResult == FALSE )
			break;

		bIsEven = !bIsEven;
		if ( bIsEven ) continue;

		// 第0字节为词典中以该单词为首词的词组中的单词的最大个数
		pszWordInfoBuff[0] = (UCHAR)nWordsNum;
		nWordInfoLen ++;
		
		ReadIndexData(fpTempDat,pszWordInfoBuff+1,lDataSite,nWordInfoLen);
		
		long lNewDataSite = WriteIndexData(fpIndexDat,pszWordInfoBuff,
						nWordInfoLen);
		CompressIndexOffsetInfo(lNewDataSite,nWordInfoLen,
						obDiction.m_pszOffset);
		
		if ( nMaxKeyLen < nEngHuffLen )
			nMaxKeyLen = nEngHuffLen;  // 字典中的最大的关键字的长度

		if ( nEngHuffLen > DIC_WORD_LEN ) {// 将超过最大词长的关键字的超过部分截断
			nEngHuffLen = DIC_WORD_LEN;
			nExceedMaxNum ++; // 词典中超过允许最大关键字的词的个数
		}

		memset(obDiction.m_pszWord,0x0,DIC_WORD_LEN);
		memcpy(obDiction.m_pszWord,szEngHuff,nEngHuffLen);

		bResult = obDiction.insert();
//		ASSERT(bResult == IM_OK); // 词典中有重复的词组
	} while ( TRUE );

	bIsEven = TRUE;
	fseek(fpEnglishHuff,0L,SEEK_SET);
	
	do {
		bResult = ReadALineFromEngHuff(fpEnglishHuff,szEngHuff,nEngHuffLen,
							nEngOrgLen,nWordsNum,lDataSite,nWordInfoLen);
		if ( bResult == FALSE )
			break;

		bIsEven = !bIsEven;
		if ( bIsEven ) continue;

		// 第0字节为词典中以该单词为首词的词组中的单词的最大个数
		pszWordInfoBuff[0] = (UCHAR)nWordsNum;
		nWordInfoLen ++;
		
		ReadIndexData(fpTempDat,pszWordInfoBuff+1,lDataSite,nWordInfoLen);
		
		long lNewDataSite = WriteIndexData(fpIndexDat,pszWordInfoBuff,
						nWordInfoLen);
		CompressIndexOffsetInfo(lNewDataSite,nWordInfoLen,
						obDiction.m_pszOffset);

		if ( nEngHuffLen > DIC_WORD_LEN ) {// 将超过最大词长的关键字的超过部分截断
			if ( nMaxKeyLen < nEngHuffLen )
				nMaxKeyLen = nEngHuffLen;  // 字典中的最大的关键字的长度

			nEngHuffLen = DIC_WORD_LEN;
			nExceedMaxNum ++; // 词典中超过允许最大关键字的词的个数
		}

		memset(obDiction.m_pszWord,0x0,DIC_WORD_LEN);
		memcpy(obDiction.m_pszWord,szEngHuff,nEngHuffLen);

		obDiction.insert();
	} while ( TRUE );

	GlobalUnlock(GlobalHandle(pszWordInfoBuff));
	GlobalFree(GlobalHandle(pszWordInfoBuff));

	fclose(fpEnglishHuff);

	fclose(fpTempDat);
	fclose(fpIndexDat);
	
	remove(pszEngHuffName);
	remove(szTempDatName);
	
	// 更新数据文件的文件头信息
	if ( (fpIndexDat = fopen(szDictDatName,"r+b") ) == NULL ) {
		CString strMsg;
		strMsg.Format("Cann't write file %s !",szDictDatName);
		AfxMessageBox(strMsg);
		return FALSE;
	}
	fseek(fpIndexDat,0L,SEEK_SET);
	fwrite(&phHead,sizeof(DictHeader),1,fpIndexDat);
	fclose(fpIndexDat);
/*
	fprintf(fpLog,"字典中最长的英文词长=%d;\n允许的最大英文词长=%d;\n超过允许的词的个数=%d;\n[注] 词的超过部分将被截掉!\n",
		nMaxKeyLen,DIC_WORD_LEN,nExceedMaxNum);
*/	
	return TRUE;
}


// Rebuild the index of the user-defined dictionary.
//
//   szUserDicName - user dictionary file; handed unchanged to
//                   BuildIndexStepOne and BuildIndexStepThree.
//   szIndexName   - base name of the index to build; it is prefixed with
//                   "DictRes\\" before being passed on.
//
// Returns TRUE when every step succeeded (a completion message box is shown
// in that case) and FALSE as soon as one step fails or an argument is
// unusable.
//
// Side effect: the working directory of the process is switched to the
// directory that holds the executable, because every intermediate file is
// addressed relative to "DictRes\\".
BOOL BuildUserIndex(char *szUserDicName , char *szIndexName )
{
	static const char szDictResDir[] = "DictRes\\";

	if ( szUserDicName == NULL || szIndexName == NULL )
		return FALSE;

	// GetModuleFileName yields the full path of the executable *file*.
	// The file name has to be cut off before the path can serve as a
	// directory; handing the untouched path to _chdir could never succeed.
	// The buffer is zero-filled and one byte is held back so that the string
	// stays terminated even if the API truncates the path.
	// NOTE(review): strrchr is not DBCS-aware; a double-byte character whose
	// trail byte is 0x5C inside the executable's file name would be mistaken
	// for a separator.
	char sExeFilePath[256];
	memset(sExeFilePath,0x0,sizeof(sExeFilePath));
	if ( GetModuleFileName(NULL,sExeFilePath,sizeof(sExeFilePath)-1) != 0 ) {
		char *pszLastSep = strrchr(sExeFilePath,'\\');
		if ( pszLastSep != NULL ) {
			pszLastSep[1] = '\0';	// keep the separator so "C:\" stays a root
			_chdir(sExeFilePath);	// best effort, as before: result not checked
		}
	}

	// Input, intermediate and output files, all below DictRes.
	char szInDefineName[] = "DictRes\\define.txt";
	char szOutDefineName[] = "DictRes\\DictDefU.h";
	char szCodeTableName[] = "DictRes\\CodeTabU.txt";
	char szEngInfoName[] = "DictRes\\EngInfoU.txt";
	char szEngSortInfoName[] = "DictRes\\SEngInfo.txt";
	char szHuffmanFreqDataName[] = "DictRes\\HufFreqU.dat";
	char szEngHuffName[] = "DictRes\\EngHuffU.txt";
	char szEngSortHuffName[] = "DictRes\\SEngHuff.txt";

	// Compose "DictRes\\<szIndexName>" without overrunning the buffer
	// (sizeof(szDictResDir) already counts the terminating NUL).
	char szDicIndexName[1024];
	if ( strlen(szIndexName) + sizeof(szDictResDir) > sizeof(szDicIndexName) )
		return FALSE;
	strcpy(szDicIndexName , szDictResDir);
	strcat(szDicIndexName , szIndexName);

	// Step one of the original index build.
	if ( !BuildIndexStepOne(szInDefineName,szOutDefineName,
							szDicIndexName,szEngInfoName,
							szCodeTableName,szUserDicName) )
		return FALSE;

	// FALSE -- sort szEngInfoName in reverse order; result goes to
	// szEngSortInfoName.
	CSenLink *pSenLink = new CSenLink;
	BOOL bResult = pSenLink->SortMain(szEngInfoName,szEngSortInfoName,FALSE);
	delete pSenLink;
	if ( !bResult )
		return FALSE;

	// Step two of the original index build.
	if ( !BuildIndexStepTwo(szEngSortInfoName,
						szHuffmanFreqDataName,szEngHuffName) )
		return FALSE;
	remove(szEngSortInfoName);	// sorted info file is no longer needed

	// TRUE -- sort szEngHuffName in ascending order; result goes to
	// szEngSortHuffName.
	pSenLink = new CSenLink;
	bResult = pSenLink->SortMain(szEngHuffName,szEngSortHuffName,TRUE);
	delete pSenLink;
	// This result used to be overwritten unchecked by step three, which
	// then ran on whatever the failed sort had left behind.
	if ( !bResult )
		return FALSE;

	// Step three of the original index build; its sorted input is removed
	// whether or not the step succeeded.
	bResult = BuildIndexStepThree(szEngSortHuffName,szDicIndexName,szUserDicName);
	remove(szEngSortHuffName);
	if ( !bResult )
		return FALSE;

	AfxMessageBox("建索引完成!\n");

	// NOTE(review): fpLog is not declared or opened in this function (the
	// code that opened a log file here was commented out), so it must come
	// from file scope. Guard against closing a stream that was never
	// opened; confirm whether this close belongs here at all.
	if ( fpLog != NULL )
		fclose(fpLog);
	return TRUE;
}


// Construct an empty dictionary node: every pointer member is cleared, every
// length/count is zero and every flag is off.
DictNode::DictNode()
{
	// --- English entry, original form and ambiguity data -----------------
	m_pszEnglish = NULL;
	m_pszOrig = NULL;
	m_pszAmbig = NULL;
	m_nEnglishLen = 0;
	m_nOrigLen = 0;
	m_nAmbigLen = 0;
	m_bIsExistOrig = FALSE;
	m_bIsExistAmbig = 0;
	m_nWordStyle = 0;

	// --- list of Chinese entries -----------------------------------------
	m_pFirstChin = NULL;
	m_nChinNum = 0;

	// --- word-level attributes (yys 98.4.26) -----------------------------
	// If m_pszEnglish is a single word: the maximum length of the phrases
	// whose first word is this word.
	m_byMaxPhraseLen = 0;
	// Word form:
	//   0: original, 1: -ed1, 2: ed2, 3: -ing, 4: -s, 5: ed1,ed2,
	//   6: er, 7: est, 8: undetermined "is" or "has",
	//   9: undetermined "would" or "had", 10: unknown word, 11: number
	m_byForm = 0;
	m_nLxhCate = NULL;
	m_nLxhCateLen = 0;
	m_byGenetive = 0;			// 1: genitive (possessive)
	m_byNumber = 0;				// 0: singular, 1: plural

	// --- neighbouring word nodes -----------------------------------------
	m_pPrevWordNode = NULL;		// previous word node
	m_pNextWordNode = NULL;		// next word node

	// --- attributes taken from the dictionary (LXH) ----------------------
	m_byDictForm = 0;
	m_byDictGenetive = 0;		// 1: genitive (possessive)
	m_byDictPerson = 0;			// 1: first, 2: second, 3: third person
	m_byDictNumber = 0;			// 0: singular, 1: plural
	m_cDictTense = 0;			// tense: 0 present, 1 past, 2 future, 3 past future
	m_cDictAspect = 0;			// aspect: 0 simple, 1 progressive, 2 perfect, 3 perfect progressive
	m_cDictVoice = 0;			// 0: active, 1: passive
	m_cDictNegative = 0;		// 0: affirmative, 1: negative

	// --- tagging results (CDQ) -------------------------------------------
	m_pszPostag = NULL;			// part-of-speech tag from the statistical tagger
	m_pszBaseNPtag = NULL;		// BaseNP tagging result

	// --- translation data (xuned) ----------------------------------------
	m_pCurrSele = NULL;			// dictionary line chosen for the translation
	m_pszTranslation = NULL;	// translation after generation or selection
	m_nTranslationLen = 0;
	m_pszParseRule = NULL;		// parse rule of an inner parse-tree node
	m_nParseRule = 0;
	m_pszTransferRule = NULL;	// transfer/generation rule of an inner parse-tree node
	m_nTransferRule = 0;
	m_pszDynamicInfo = NULL;	// information produced while translating
	m_nDynamicInfo = 0;

	// --- position of the node in the sentence and in the parse tree ------
	m_bIsSentBegin = FALSE;		// node starts a sentence
	m_bIsSentEnd = FALSE;		// node ends a sentence
	m_bIsLeafNode = FALSE;		// node is a leaf of the parse tree
	m_pParent = NULL;			// parent node
	m_pChild = NULL;			// child node
	m_pLeftBrother = NULL;		// left sibling
	m_pRightBrother = NULL;		// right sibling
	m_pLeftCousin = NULL;		// left cousin
	m_pRightCousin = NULL;		// right cousin
}

// Destructor. Intentionally empty: nothing owned by the node is released
// here (the former FreeNode call is disabled).
// NOTE(review): the DictChin nodes created by AppendEmptyChin are not freed
// by this destructor - confirm that they are released elsewhere.
DictNode::~DictNode()
{
	//FreeNode(m_pDictNode);
}

DictChin *DictNode::AppendEmptyChin()
// Append a freshly allocated DictChin node to the end of the list that
// starts at m_pFirstChin and keep m_nChinNum in step.
// Returns the pointer to the node that was added.
{
	// Empty list: the new node becomes the head and the count restarts at 1.
	if ( m_pFirstChin == NULL ) {
		DictChin *pHead = new DictChin;
		pHead->m_pNextChin = NULL;
		m_pFirstChin = pHead;
		m_nChinNum = 1;
		return pHead;
	}

	// Otherwise walk to the last node and hook the new one behind it.
	DictChin *pTail = m_pFirstChin;
	while ( pTail->m_pNextChin != NULL )
		pTail = pTail->m_pNextChin;

	DictChin *pAppended = new DictChin;
	pAppended->m_pNextChin = NULL;
	pTail->m_pNextChin = pAppended;
	++m_nChinNum;

	return pAppended;
}

DictChin *DictNode::GetPointedChin(int nPointedIndex)
// Return the Chinese node at the given position of the list that starts at
// m_pFirstChin. Positions are counted from 0.
// Returns NULL when the list holds fewer than nPointedIndex+1 nodes (this
// used to walk off the end of the list and dereference NULL).
// As before, an index below 0 yields the first node.
{
	DictChin *pCurrChin = m_pFirstChin;

	// Stop early once the end of the list is reached.
	for ( int Loop=0;Loop<nPointedIndex && pCurrChin != NULL;Loop++ )
		pCurrChin = pCurrChin->m_pNextChin;

	return pCurrChin;
}

void DictNode::DeletePointedChin(int nPointedIndex)
// Delete the Chinese node at the given position of the list that starts at
// m_pFirstChin and decrement m_nChinNum. Positions are counted from 0.
// Nothing happens when the list is empty or the position does not exist.
// Previously a negative index deleted the head and left m_pFirstChin
// dangling, and an index past the end dereferenced NULL.
{
	if ( m_pFirstChin == NULL || nPointedIndex < 0 )
		return;

	// One pass: find the target together with its predecessor.
	DictChin *pLastChin = NULL;
	DictChin *pCurrChin = m_pFirstChin;
	for ( int Loop=0;Loop<nPointedIndex && pCurrChin != NULL;Loop++ ) {
		pLastChin = pCurrChin;
		pCurrChin = pCurrChin->m_pNextChin;
	}
	if ( pCurrChin == NULL )	// position lies behind the last node
		return;

	// Unlink the node before it is destroyed.
	DictChin *pNextChin = pCurrChin->m_pNextChin;
	if ( pLastChin != NULL )
		pLastChin->m_pNextChin = pNextChin;
	else
		m_pFirstChin = pNextChin;	// the head itself is removed

	delete pCurrChin;
	m_nChinNum --;
}

// Set up a searcher: no current node, system and user dictionaries both
// enabled, no memory allocated yet. Also fetches the user index name into
// szUserDictName and fills the table of powers of two, m_narrayDector.
// (Initialisation of the dictionary objects and index file pointers was
// removed from here earlier - "qlp delete 5.27".)
DictSearch::DictSearch()
{
	m_pDictNode = NULL;

	bSearchSysDict = TRUE;
	bSearchUserDict = TRUE;
	bHaveAllocMemory = FALSE;

	GetUserIndexName( szUserDictName );

	// m_narrayDector[i] = 2^i. Computed with an integer shift instead of
	// (int)pow(2,i), so the value never passes through floating point and
	// cannot be truncated to the wrong integer.
	// NOTE(review): the loop writes BYTES_BIT_NUM+1 entries (index 0 up to
	// and including BYTES_BIT_NUM) - confirm the array is declared that large.
	for ( int Loop=0;Loop<=BYTES_BIT_NUM;Loop++ )
		m_narrayDector[Loop] = 1 << Loop;

	m_nSameHushWordsNum = 0;
}

// Destructor. Intentionally empty: the statements that used to reset the
// dictionary object pointers are disabled, so nothing is released or reset
// here.
DictSearch::~DictSearch()
{
/*	m_obDiction = NULL;
	m_obSysDiction = NULL;
	m_obAddtionDiction = NULL;
	m_obUserDiction = NULL;
*/
}
  
// Create the Dictionary object for the index with base name pszDicIndexName.
//
//   pszDicIndexName - base file name; ".idx" and ".dta" are appended to it.
//   m_fpIndexDat    - passed BY VALUE: whatever is stored in it here never
//                     reaches the caller. It only serves as a local handle
//                     for checking that the ".idx" file can be opened.
//
// Returns the new Dictionary (the caller owns it), or NULL after a message
// box when the name is too long or the ".idx" file cannot be opened.
// NOTE(review): the parameter name suggests the caller expects to get the
// opened FILE* back; with this signature that cannot work - confirm.
Dictionary *DictSearch::DictInit( LPSTR pszDicIndexName ,
						  FILE *m_fpIndexDat )
{
	if ( pszDicIndexName == NULL )
		return NULL;

	char szDictIndexName[MAX_FILENAME_LEN]; // index file name (".idx")
	char szDictDtaName[MAX_FILENAME_LEN];   // second-level index file name (".dta")

	// Both suffixes have the same length, so one check covers both buffers
	// (sizeof(".idx") already counts the terminating NUL).
	if ( strlen(pszDicIndexName) + sizeof(".idx") > sizeof(szDictIndexName) ) {
		CString strMsg;
		strMsg.Format("File name %s is too long !",pszDicIndexName );
		AfxMessageBox(strMsg);
		return NULL;
	}

	strcpy(szDictIndexName,pszDicIndexName);
	strcat(szDictIndexName,".idx");

	strcpy(szDictDtaName,pszDicIndexName);
	strcat(szDictDtaName,".dta");

	// "r+b" is the standard spelling; the former "r + b" with embedded
	// blanks is not a mode string defined by the C standard.
	m_fpIndexDat = fopen(szDictIndexName , "r+b");
	if ( !m_fpIndexDat ) {
		CString strMsg;
		strMsg.Format("Cann't open file %s !",szDictIndexName );
		AfxMessageBox(strMsg);
		return NULL;
	}
	// The handle cannot be handed back through a by-value parameter, so it
	// is closed here instead of being leaked on every call.
	fclose(m_fpIndexDat);
	m_fpIndexDat = NULL;

	return new Dictionary(szDictDtaName,szDictIndexName,0);
}

// Open the user dictionary whose files share the base name
// pszUserDicIndexName (".idx", ".dta" and ".dat" are appended).
//
// On success the application object holds the opened ".dat" stream in
// m_fpUserIndexDat and a new Dictionary in m_obUserDiction, and the user
// hash table has been loaded from that stream.
//
// Returns FALSE when the name is missing or too long, when the ".dat" file
// cannot be opened, or when loading the hash table fails. In the last case
// the stream and the Dictionary stay attached to the application object;
// UserDictExit releases them.
// NOTE(review): values already stored in m_fpUserIndexDat/m_obUserDiction
// are overwritten without being released - confirm callers pair every call
// with UserDictExit.
BOOL DictSearch::InitUserDict( char *pszUserDicIndexName )
{
	if ( pszUserDicIndexName == NULL )
		return FALSE;

	char szUserDictIndexName[MAX_FILENAME_LEN]; // index file name (".idx")
	char szUserDictDtaName[MAX_FILENAME_LEN];   // second-level index file name (".dta")
	char szUserDictDatName[MAX_FILENAME_LEN];   // data file name (".dat")

	// All three suffixes have the same length, so one check protects all
	// three buffers (sizeof(".idx") already counts the terminating NUL).
	if ( strlen(pszUserDicIndexName) + sizeof(".idx") > sizeof(szUserDictIndexName) ) {
		CString strMsg;
		strMsg.Format("File name %s is too long !",pszUserDicIndexName);
		AfxMessageBox(strMsg);
		return FALSE;
	}

	strcpy(szUserDictIndexName,pszUserDicIndexName);
	strcat(szUserDictIndexName,".idx");

	strcpy(szUserDictDtaName,pszUserDicIndexName);
	strcat(szUserDictDtaName,".dta");

	strcpy(szUserDictDatName,pszUserDicIndexName);
	strcat(szUserDictDatName,".dat");

	CPosTagApp *pApp = (CPosTagApp *)AfxGetApp();

	pApp->m_fpUserIndexDat = fopen(szUserDictDatName,"r+b");
	if ( pApp->m_fpUserIndexDat == NULL ) {
		CString strMsg;
		strMsg.Format("Cann't open file %s ! ",	szUserDictDatName);
		AfxMessageBox(strMsg);
		return FALSE;
	}

	pApp->m_obUserDiction = new Dictionary(szUserDictDtaName,szUserDictIndexName,0);

	if ( !LoadUserHushTableFromIndexDat(pApp->m_fpUserIndexDat) )
		return FALSE;

	return TRUE;
}

// Release the user dictionary kept in the application object: close the
// data stream m_fpUserIndexDat and delete m_obUserDiction, resetting each
// member to NULL afterwards. Members that are already NULL are left alone.
// Always returns TRUE.
BOOL DictSearch::UserDictExit()
{
	CPosTagApp *pApp = (CPosTagApp *)AfxGetApp();

	if ( pApp->m_fpUserIndexDat != NULL ) {
		fclose(pApp->m_fpUserIndexDat);
		pApp->m_fpUserIndexDat = NULL;
	}

	if ( pApp->m_obUserDiction != NULL ) {
		delete pApp->m_obUserDiction;
		pApp->m_obUserDiction = NULL;
	}

	return TRUE;
}

BOOL DictSearch::SearchWordInit(LPSTR pszSysDicIndexName,
	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -