⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dictionary.cpp

📁 计算所汉语词法分析系统ICTCLAS介绍 词是最小的能够独立活动的有意义的语言成分。 但汉语是以字为基本的书写单位
💻 CPP
📖 第 1 页 / 共 2 页
字号:
 *
 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/

bool CDictionary::FindInOriginalTable(int nInnerCode,char *sWord,int nHandle,int *nPosRet)
{
    PWORD_ITEM pItems=m_IndexTable[nInnerCode].pWordItemHead;
	int nStart=0,nEnd=m_IndexTable[nInnerCode].nCount-1,nMid=(nStart+nEnd)/2,nCount=0,nCmpValue;
	while(nStart<=nEnd)//Binary search
	{
       nCmpValue=strcmp(pItems[nMid].sWord,sWord);
       if(nCmpValue==0&&(pItems[nMid].nHandle==nHandle||nHandle==-1))
	   {
		   if(nPosRet)
		   {
			   if(nHandle==-1)//Not very strict match
			   {//Add in 2002-1-28
				   nMid-=1;
				   while(nMid>=0&&strcmp(pItems[nMid].sWord,sWord)==0)
				   //Get the first item which match the current word
					   nMid--;
                   if(nMid<0||strcmp(pItems[nMid].sWord,sWord)!=0)
					   nMid++;
			   }
			   *nPosRet=nMid;
               return true;   
		   }
	        if(nPosRet)
				*nPosRet=nMid;
			return true;//find it
	   }
	   else if(nCmpValue<0||(nCmpValue==0&&pItems[nMid].nHandle<nHandle&&nHandle!=-1))
	   {
		   nStart=nMid+1;
	   }
	   else if(nCmpValue>0||(nCmpValue==0&&pItems[nMid].nHandle>nHandle&&nHandle!=-1))
       {
		   nEnd=nMid-1;
	   }
	   nMid=(nStart+nEnd)/2;
	}
    if(nPosRet)
	{
		//Get the previous position
		*nPosRet=nMid-1;
	}
	return false;
}

/*********************************************************************
 *
 *  Func Name  : FindInModifyTable
 *
 *  Description: judge the word and handle exist in the modified table and its items
 *              
 *
 *  Parameters : nInnerCode: the inner code of the first CHines char
 *               sWord: the word
 *               nHandle:the handle number
 *               *pFindRet: the node found
 *
 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
bool CDictionary::FindInModifyTable(int nInnerCode,char *sWord,int nHandle,PWORD_CHAIN *pFindRet)
{
   PWORD_CHAIN pCur,pPre;
   if(m_pModifyTable==NULL)//empty
	   return false;
   pCur=m_pModifyTable[nInnerCode].pWordItemHead;
   pPre=NULL;
   while(pCur!=NULL&&(_stricmp(pCur->data.sWord,sWord)<0||(_stricmp(pCur->data.sWord,sWord)==0&&pCur->data.nHandle<nHandle)))
   //sort the link chain as alphabet
   {
   	   pPre=pCur;
	   pCur=pCur->next;
   }
   if(pFindRet)
     *pFindRet=pPre;
   if(pCur!=NULL && _stricmp(pCur->data.sWord,sWord)==0&&(pCur->data.nHandle==nHandle||nHandle<0))
   {//The node exists, delete the node and return 
	   return true;
   }
   return false;
}
/*********************************************************************
 *
 *  Func Name  : GetWordType
 *
 *  Description: Get the type of word
 *              
 *
 *  Parameters : sWord: the word

 *  Returns    : the type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
int CDictionary::GetWordType(char *sWord)
{
   int nType=charType((unsigned char *)sWord),nLen=strlen(sWord);
   if(nLen>0&&nType==CT_CHINESE&&IsAllChinese((unsigned char *)sWord))
	   return WT_CHINESE;//Chinese word
   else if(nLen>0&&nType==CT_DELIMITER)
       return WT_DELIMITER;//Delimiter
   else
	   return WT_OTHER;//other invalid
}
/*********************************************************************
 *
 *  Func Name  : PreProcessing
 *
 *  Description: Get the type of word
 *              
 *
 *  Parameters : sWord: the word

 *  Returns    : the type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
bool CDictionary::PreProcessing(char *sWord, int *nId, char *sWordRet,bool bAdd)
{
   //Position for the delimeters
   int nType=charType((unsigned char *)sWord),nLen=strlen(sWord);
   int nEnd=nLen-1,nBegin=0;
   if(nLen==0)
	   return false;
   while(nEnd>=0&&sWord[nEnd]==' ')
		nEnd-=1;
   while(nBegin<=nEnd&&sWord[nBegin]==' ')
		nBegin+=1;
   if(nBegin>nEnd)
	   return false;
   if(nEnd!=nLen-1||nBegin!=0)
   {
	   strncpy(sWord,sWord+nBegin,nEnd-nBegin+1);
	   sWord[nEnd-nBegin+1]=0;
   }
/*
   if((bAdd||strlen(sWord)>4)&&IsAllChineseNum(sWord))
   {  //Only convert the Chinese Num to 3755 while 
      //Get the inner code of the first Chinese Char
       strcpy(sWord,"五十八");
   }
*/   
   if(nType==CT_CHINESE)//&&IsAllChinese((unsigned char *)sWord)
   {//Chinese word
	   *nId=CC_ID(sWord[0],sWord[1]);
		   //Get the inner code of the first Chinese Char
		strcpy(sWordRet,&sWord[2]);//store the word,not store the first Chinese Char
		return true;
   }
/* if(nType==CT_NUM&&IsAllNum((unsigned char *)sWord))
   {
	   *nId=3756;
       //Get the inner code of the first Chinese Char
       sWordRet[0]=0;//store the word,not store the first Chinese Char
	   return true;
   }
*/ if(nType==CT_DELIMITER)
   {//Delimiter
	   *nId=3755;
       //Get the inner code of the first Chinese Char
       strcpy(sWordRet,sWord);//store the word,not store the first Chinese Char
	   return true;
   }
/*
   if(nType==CT_LETTER&&IsAllLetter((unsigned char *)sWord))
   {
	   *nId=3757;
       //Get the inner code of the first Chinese Char
       sWordRet[0]=0;//store the word,not store the first Chinese Char
	   return true;
   }
   if(nType==CT_SINGLE&&IsAllSingleByte((unsigned char *)sWord))
   {
	   *nId=3758;
       //Get the inner code of the first Chinese Char
       sWordRet[0]=0;//store the word,not store the first Chinese Char
	   return true;
   }
   if(nType==CT_INDEX&&IsAllIndex((unsigned char *)sWord))
   {
	   *nId=3759;
       //Get the inner code of the first Chinese Char
       sWordRet[0]=0;//store the word,not store the first Chinese Char
	   return true;
   }
*/
   return false;//other invalid
}
/*********************************************************************
 *
 *  Func Name  : MergePOS
 *
 *  Description: Merge all the POS into nHandle,
 *              just get the word in the dictionary and set its Handle as nHandle
 *              
 *
 *  Parameters : nHandle: the only handle which will be attached to the word

 *  Returns    : the type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-21
 *********************************************************************/
bool CDictionary::MergePOS(int nHandle)
{
    int i,j,nCompare;
	char sWordPrev[WORD_MAXLENGTH];
	PWORD_CHAIN pPre,pCur,pTemp;
    if(!m_pModifyTable)//Not prepare the buffer
    {
	   m_pModifyTable=new MODIFY_TABLE[CC_NUM];
	   memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
    }
	for( i=0;i<CC_NUM;i++)//Operation in the index table
	{//delete the memory of word item array in the dictionary
		sWordPrev[0]=0;//Set empty
		for(j=0;j<m_IndexTable[i].nCount;j++)
		{
			nCompare=_stricmp(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);
            if((j==0||nCompare<0)&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
			{//Need to modify its handle
			    m_IndexTable[i].pWordItemHead[j].nHandle=nHandle;//Change its handle
				strcpy(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);//Refresh previous Word
			}
			else if(nCompare==0&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
			{//Need to delete when not delete and same as previous word
				m_IndexTable[i].pWordItemHead[j].nFrequency=-1;//Set delete flag
				m_pModifyTable[i].nDelete+=1;//Add the number of being deleted
			}
		}
	}
	for( i=0;i<CC_NUM;i++)//Operation in the modify table
	{
		 pPre=NULL;
	     pCur=m_pModifyTable[i].pWordItemHead;
		 sWordPrev[0]=0;//Set empty
         while(pCur!=NULL)
		 {
			 if(_stricmp(pCur->data.sWord,sWordPrev)>0)
			 {//The new word
				pCur->data.nHandle=nHandle;//Chang its handle
				strcpy(sWordPrev,pCur->data.sWord);//Set new previous word
				pPre=pCur;//New previous pointer
				pCur=pCur->next;
			 }
			 else
			 {//The same word as previous,delete it.
				 pTemp=pCur;
    			 if(pPre!=NULL)//pCur is the first item
					 pPre->next=pCur->next;
				 else
					 m_pModifyTable[i].pWordItemHead=pCur->next;
				 pCur=pCur->next;
	 			 delete pTemp->data.sWord;//Delete the word
				 delete pTemp;//Delete the item
			 }
		 }
   }
	return true;
}
/*********************************************************************
 *
 *  Func Name  : GetMaxMatch
 *
 *  Description: Get the max match to the word
 *              
 *
 *  Parameters : nHandle: the only handle which will be attached to the word

 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-21
 *********************************************************************/
bool CDictionary::GetMaxMatch(char *sWord, char *sWordRet,int *npHandleRet)
{
   char sWordGet[WORD_MAXLENGTH-2],sFirstChar[3];
   int nPos,nFoundPos,nTemp;
   PWORD_CHAIN pCur;
   *npHandleRet=-1;
   if(!PreProcessing(sWord, &nPos,sWordGet))
	   return false;
   sWordRet[0]=0;
   strncpy(sFirstChar,sWord,strlen(sWord)-strlen(sWordGet));//Get the first char
   sFirstChar[strlen(sWord)-strlen(sWordGet)]=0;//Set the end flag
   FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos);
   nTemp=nFoundPos;//Check its previous position
   if(nFoundPos==-1)
		nTemp=0;
   while(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)!=m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
   {//Get the next
	   nTemp+=1;
   }
   if(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
   {
	   strcpy(sWordRet,sFirstChar);
	   strcat(sWordRet,m_IndexTable[nPos].pWordItemHead[nTemp].sWord);
	   *npHandleRet=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
	   return true;
   }//Cannot get the item and retrieve the modified data if exists
    //Operation in the index table and its items 
   if(m_pModifyTable&&m_pModifyTable[nPos].pWordItemHead)//Exists 
	   pCur=m_pModifyTable[nPos].pWordItemHead;
   else
	   pCur=NULL;
   while(pCur!=NULL&&strcmp(pCur->data.sWord,sWordGet)<=0&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)//
   {
	   pCur=pCur->next;
   }
   if(pCur!=NULL&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)
   {//Get it
	   strcpy(sWordRet,sFirstChar);
	   strcat(sWordRet,pCur->data.sWord);
	   *npHandleRet=pCur->data.nHandle;
	   return true;
   }
   return false;
}
/*********************************************************************
 *
 *  Func Name  : GetPOSValue
 *
 *  Description: Get the POS value according the POS string
 *              
 *
 *  Parameters : 

 *  Returns    : the value
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-29
 *********************************************************************/
int CDictionary::GetPOSValue(char *sPOS)
{
	int nPOS;
	char *sPlusPos,sTemp[4];
	if(strlen(sPOS)<3)
	{
		nPOS=sPOS[0]*256+sPOS[1];
	}
	else
	{
		sPlusPos=strchr(sPOS,'+');
		strncpy(sTemp,sPOS,sPlusPos-sPOS);
		sTemp[sPlusPos-sPOS]=0;
        nPOS=100*GetPOSValue(sTemp);
		strncpy(sTemp,sPlusPos+1,4);
        nPOS+=atoi(sTemp);
	}
	return nPOS;
}
/*********************************************************************
 *
 *  Func Name  : GetPOSString
 *
 *  Description: Get the POS string according the POS value
 *              
 *
 *  Parameters : 

 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-29
 *********************************************************************/
bool CDictionary::GetPOSString(int nPOS, char *sPOSRet)
{
    if(nPOS>'a'*25600) 
	{
		if((nPOS/100)%256!=0)
 			sprintf(sPOSRet,"%c%c+%d",nPOS/25600,(nPOS/100)%256,nPOS%100);
		else
			sprintf(sPOSRet,"%c+%d",nPOS/25600,nPOS%100);
	}
	else
	{
	  if(nPOS>256)
		sprintf(sPOSRet,"%c%c",nPOS/256,nPOS%256);
	  else
		sprintf(sPOSRet,"%c",nPOS%256);
	}
	return true;
}

int CDictionary::GetFrequency(char *sWord, int nHandle)
{
   char sWordFind[WORD_MAXLENGTH-2];
   int nPos,nIndex;
   PWORD_CHAIN pFound;
   if(!PreProcessing(sWord, &nPos,sWordFind))
	   return 0;
   if(FindInOriginalTable(nPos,sWordFind,nHandle,&nIndex))
   {
		return m_IndexTable[nPos].pWordItemHead[nIndex].nFrequency;
   }
   if(FindInModifyTable(nPos,sWordFind,nHandle,&pFound))
   {
	   return pFound->data.nFrequency;
   }
   return 0;
}

bool CDictionary::Output(char *sFilename)
{
   FILE *fp;
   int i,j;
   PWORD_CHAIN pCur;
   char sPrevWord[WORD_MAXLENGTH]="", sCurWord[WORD_MAXLENGTH],sPOS[10];
   if((fp=fopen(sFilename,"wb"))==NULL)
	   return false;//fail while opening the file
   if(m_pModifyTable)
   {//Modification made, not to output when modify table exists.
	   return false;
   }   
   for(i=0;i<CC_NUM;i++)
   {
	   pCur=NULL;
       j=0;  
	   while(j<m_IndexTable[i].nCount)
	   {
		 GetPOSString(m_IndexTable[i].pWordItemHead[j].nHandle,sPOS);
		 //Get the POS string
		 sprintf(sCurWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),m_IndexTable[i].pWordItemHead[j].sWord);
         if(strcmp(sPrevWord,sCurWord)!=0)
			 fprintf(fp,"\n%s %s",sCurWord,sPOS);
		 else
			 fprintf(fp," %s",sPOS);
		 strcpy(sPrevWord,sCurWord);
 		 j+=1;//Get next item in the original table.
	   }
   }
   fclose(fp);
   return true;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -