📄 dictionary.cpp

📁 中科院分词系统VC++版本
💻 CPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
		sWordPrev[0]=0;//Set empty
		for(j=0;j<m_IndexTable[i].nCount;j++)
		{
			nCompare=_stricmp(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);
            if((j==0||nCompare<0)&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
			{//Need to modify its handle
			    m_IndexTable[i].pWordItemHead[j].nHandle=nHandle;//Change its handle
				strcpy(sWordPrev,m_IndexTable[i].pWordItemHead[j].sWord);//Refresh previous Word
			}
			else if(nCompare==0&&m_IndexTable[i].pWordItemHead[j].nFrequency!=-1)
			{//Need to delete when not delete and same as previous word
				m_IndexTable[i].pWordItemHead[j].nFrequency=-1;//Set delete flag
				m_pModifyTable[i].nDelete+=1;//Add the number of being deleted
			}
		}
	}
	for( i=0;i<CC_NUM;i++)//Operation in the modify table
	{
		 pPre=NULL;
	     pCur=m_pModifyTable[i].pWordItemHead;
		 sWordPrev[0]=0;//Set empty
         while(pCur!=NULL)
		 {
			 if(_stricmp(pCur->data.sWord,sWordPrev)>0)
			 {//The new word
				pCur->data.nHandle=nHandle;//Chang its handle
				strcpy(sWordPrev,pCur->data.sWord);//Set new previous word
				pPre=pCur;//New previous pointer
				pCur=pCur->next;
			 }
			 else
			 {//The same word as previous,delete it.
				 pTemp=pCur;
    			 if(pPre!=NULL)//pCur is the first item
					 pPre->next=pCur->next;
				 else
					 m_pModifyTable[i].pWordItemHead=pCur->next;
				 pCur=pCur->next;
	 			 delete pTemp->data.sWord;//Delete the word
				 delete pTemp;//Delete the item
			 }
		 }
   }
	return true;
}
/*********************************************************************
 *
 *  Func Name  : GetMaxMatch
 *
 *  Description: Find the first dictionary entry that starts with the
 *               given word. The original (index) table is scanned
 *               first; the modify table is consulted only when the
 *               original table has no such entry.
 *
 *  Parameters : sWord      : the word to look up (handed to PreProcessing)
 *               sWordRet   : receives the matched entry, rebuilt as the
 *                            leading part of sWord that PreProcessing
 *                            left out plus the stored entry text;
 *                            set to "" once PreProcessing succeeded
 *                            and nothing was found
 *               npHandleRet: receives the handle of the matched entry,
 *                            or -1 when there is no match
 *
 *  Returns    : true when an entry was found, false otherwise
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-21
 *********************************************************************/
bool CDictionary::GetMaxMatch(char *sWord, char *sWordRet,int *npHandleRet)
{
	char sWordGet[WORD_MAXLENGTH-2],sFirstChar[3];
	int nPos,nFoundPos,nTemp;
	size_t nFirstLen;
	PWORD_CHAIN pCur;
	*npHandleRet=-1;
	if(!PreProcessing(sWord, &nPos,sWordGet))
		return false;
	sWordRet[0]=0;
	//The bytes of sWord that are not part of sWordGet are kept so they
	//can be put back in front of the stored entry text.
	nFirstLen=strlen(sWord)-strlen(sWordGet);
	if(nFirstLen>=sizeof(sFirstChar))
		return false;//Does not fit sFirstChar (also catches unsigned wrap-around)
	strncpy(sFirstChar,sWord,nFirstLen);//Get the first char
	sFirstChar[nFirstLen]=0;//Set the end flag

	//Operation in the original table
	FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos);
	nTemp=nFoundPos;//Start scanning from the reported position
	if(nFoundPos==-1)
		nTemp=0;//No position reported, scan from the beginning
	//An entry is a hit when CC_Find reports sWordGet at the very start of it
	while(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)!=m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
	{//Get the next
		nTemp+=1;
	}
	if(nTemp<m_IndexTable[nPos].nCount&&CC_Find(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==m_IndexTable[nPos].pWordItemHead[nTemp].sWord)
	{
		strcpy(sWordRet,sFirstChar);
		strcat(sWordRet,m_IndexTable[nPos].pWordItemHead[nTemp].sWord);
		*npHandleRet=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
		return true;
	}

	//Cannot get the item in the original table: retrieve the modified data if it exists
	if(m_pModifyTable&&m_pModifyTable[nPos].pWordItemHead)//Exists 
		pCur=m_pModifyTable[nPos].pWordItemHead;
	else
		pCur=NULL;
	//Skip nodes that are not greater than sWordGet and do not start with it
	while(pCur!=NULL&&strcmp(pCur->data.sWord,sWordGet)<=0&&CC_Find(pCur->data.sWord,sWordGet)!=pCur->data.sWord)
	{
		pCur=pCur->next;
	}
	//Same hit test as for the original table: the node must START with sWordGet.
	//(This comparison used to be '!=', which returned non-matching nodes
	//and rejected the matching ones.)
	if(pCur!=NULL&&CC_Find(pCur->data.sWord,sWordGet)==pCur->data.sWord)
	{//Get it
		strcpy(sWordRet,sFirstChar);
		strcat(sWordRet,pCur->data.sWord);
		*npHandleRet=pCur->data.nHandle;
		return true;
	}
	return false;
}
/*********************************************************************
 *
 *  Func Name  : GetPOSValue
 *
 *  Description: Get the POS value according the POS string
 *               A plain tag of one or two characters is encoded as
 *               first*256+second. A tag of the form "<tag>+<number>"
 *               is encoded as 100*value(<tag>)+<number>.
 *
 *  Parameters : sPOS: the POS string; NULL or "" yields 0
 *
 *  Returns    : the value
 *               A string of three or more characters without '+' is
 *               malformed; only its first two characters are encoded.
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-29
 *********************************************************************/
int CDictionary::GetPOSValue(char *sPOS)
{
	int nPOS,nPrefixLen;
	char *sPlusPos,sTemp[4];
	if(sPOS==NULL||sPOS[0]==0)
		return 0;//Nothing to encode; also keeps sPOS[1] below inside the string
	sPlusPos=strchr(sPOS,'+');
	if(strlen(sPOS)<3||sPlusPos==NULL)
	{//Plain tag (or malformed long tag without '+')
		nPOS=sPOS[0]*256+sPOS[1];
	}
	else
	{
		//Copy the part before '+', never more than sTemp can hold
		nPrefixLen=static_cast<int>(sPlusPos-sPOS);
		if(nPrefixLen>static_cast<int>(sizeof(sTemp))-1)
			nPrefixLen=static_cast<int>(sizeof(sTemp))-1;
		strncpy(sTemp,sPOS,nPrefixLen);
		sTemp[nPrefixLen]=0;
		nPOS=100*GetPOSValue(sTemp);
		//Parse the number straight from the terminated input string;
		//copying it into sTemp first could leave sTemp unterminated.
		nPOS+=atoi(sPlusPos+1);
	}
	return nPOS;
}
/*********************************************************************
 *
 *  Func Name  : GetPOSString
 *
 *  Description: Get the POS string according the POS value
 *               Values above 'a'*25600 are printed as "<tag>+<number>",
 *               where <tag> is one or two characters; all other values
 *               are printed as a plain tag of one or two characters.
 *
 *  Parameters : nPOS   : the POS value
 *               sPOSRet: buffer that receives the string; its size is
 *                        not checked here
 *
 *  Returns    : always true
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-29
 *********************************************************************/
bool CDictionary::GetPOSString(int nPOS, char *sPOSRet)
{
	const int nCompoundFloor='a'*25600;
	if(nPOS<=nCompoundFloor)
	{//Plain tag, no "+number" part
		if(nPOS>256)
			sprintf(sPOSRet,"%c%c",nPOS/256,nPOS%256);
		else
			sprintf(sPOSRet,"%c",nPOS%256);
		return true;
	}

	//Compound value: split into first char, second char and number
	const int nFirstChar=nPOS/25600;
	const int nSecondChar=(nPOS/100)%256;
	const int nNumber=nPOS%100;
	if(nSecondChar==0)//Single-character tag
		sprintf(sPOSRet,"%c+%d",nFirstChar,nNumber);
	else
		sprintf(sPOSRet,"%c%c+%d",nFirstChar,nSecondChar,nNumber);
	return true;
}

int CDictionary::GetFrequency(char *sWord, int nHandle)
{
   char sWordFind[WORD_MAXLENGTH-2];
   int nPos,nIndex;
   PWORD_CHAIN pFound;
   if(!PreProcessing(sWord, &nPos,sWordFind))
	   return 0;
   if(FindInOriginalTable(nPos,sWordFind,nHandle,&nIndex))
   {
		return m_IndexTable[nPos].pWordItemHead[nIndex].nFrequency;
   }
   if(FindInModifyTable(nPos,sWordFind,nHandle,&pFound))
   {
	   return pFound->data.nFrequency;
   }
   return 0;
}

//Write the original table to sFilename as text.
//Every distinct word starts a new line ("\n<word> <POS>"); further
//entries with the same word only append " <POS>" to that line.
//Returns false, without touching the file, when a modify table exists;
//returns false when the file cannot be opened; true otherwise.
bool CDictionary::Output(char *sFilename)
{
	FILE *fp;
	int i,j;
	char sPrevWord[WORD_MAXLENGTH]="", sCurWord[WORD_MAXLENGTH],sPOS[10];
	//Check this BEFORE opening: opening with "wb" truncates the file, and
	//returning after a successful fopen would leak the handle.
	if(m_pModifyTable)
	{//Modification made, not to output when modify table exists.
		return false;
	}
	if((fp=fopen(sFilename,"wb"))==NULL)
		return false;//fail while opening the file
	for(i=0;i<CC_NUM;i++)
	{
		for(j=0;j<m_IndexTable[i].nCount;j++)
		{
			//Get the POS string
			GetPOSString(m_IndexTable[i].pWordItemHead[j].nHandle,sPOS);
			//Rebuild the full word: the two leading bytes for index i plus the stored text
			sprintf(sCurWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),m_IndexTable[i].pWordItemHead[j].sWord);
			if(strcmp(sPrevWord,sCurWord)!=0)
				fprintf(fp,"\n%s %s",sCurWord,sPOS);
			else
				fprintf(fp," %s",sPOS);
			strcpy(sPrevWord,sCurWord);
		}
	}
	fclose(fp);
	return true;
}
//Write the words of the original table to sFilename, back to back with
//no separator. An entry is written only when its frequency is above 50
//and its word differs from the word of the entry just before it.
//Returns false, without touching the file, when a modify table exists;
//returns false when the file cannot be opened; true otherwise.
bool CDictionary::OutputChars(char *sFilename)
{
	FILE *fp;
	int i,j;
	char sPrevWord[WORD_MAXLENGTH]="", sCurWord[WORD_MAXLENGTH];
	//Check this BEFORE opening: opening with "wb" truncates the file, and
	//returning after a successful fopen would leak the handle.
	if(m_pModifyTable)
	{//Modification made, not to output when modify table exists.
		return false;
	}
	if((fp=fopen(sFilename,"wb"))==NULL)
		return false;//fail while opening the file
	for(i=0;i<CC_NUM;i++)
	{
		for(j=0;j<m_IndexTable[i].nCount;j++)
		{
			//Rebuild the full word: the two leading bytes for index i plus the stored text
			sprintf(sCurWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),m_IndexTable[i].pWordItemHead[j].sWord);
			if(strcmp(sPrevWord,sCurWord)!=0&&m_IndexTable[i].pWordItemHead[j].nFrequency>50)
				fprintf(fp,"%s",sCurWord);
			//The previous word is refreshed even when nothing was written
			strcpy(sPrevWord,sCurWord);
		}
	}
	fclose(fp);
	return true;
}

bool CDictionary::Merge(CDictionary dict2, int nRatio)
//Merge dict2 into current dictionary and the frequency ratio from dict2 and current dict is nRatio
//
//Both original tables are walked in parallel, ordered by word (strcmp)
//and then by handle:
// - entry in both      : frequency becomes (nRatio*own+other)/(nRatio+1)
// - entry only here    : frequency becomes (nRatio*own)/(nRatio+1)
// - entry only in dict2: added through AddItem with frequency
//                        other/(nRatio+1), but only when other is
//                        greater than (nRatio+1)/10
//Returns false when either dictionary has a modify table, or when
//nRatio is -1 (the divisor nRatio+1 would be zero); true otherwise.
//NOTE(review): dict2 is passed by value, so a copy of the whole
//dictionary is made on every call - confirm CDictionary copies safely.
{
	int i,j,k,nCmpValue;
	char sWord[WORD_MAXLENGTH];
	const int nDivisor=nRatio+1;//Shared by every frequency computation below
	if(m_pModifyTable||dict2.m_pModifyTable)
	{//Modification made, not to merge when modify table exists.
		return false;
	}
	if(nDivisor==0)
	{//nRatio==-1 would divide by zero
		return false;
	}
	const int nMinFreq=nDivisor/10;//dict2-only entries must exceed this to be added
	for(i=0;i<CC_NUM;i++)
	{
		j=0;
		k=0;
		while(j<m_IndexTable[i].nCount&&k<dict2.m_IndexTable[i].nCount)
		{
			nCmpValue=strcmp(m_IndexTable[i].pWordItemHead[j].sWord,dict2.m_IndexTable[i].pWordItemHead[k].sWord);
			if(nCmpValue==0)//Same Words and determine the different handle
			{
				if(m_IndexTable[i].pWordItemHead[j].nHandle<dict2.m_IndexTable[i].pWordItemHead[k].nHandle)
					nCmpValue=-1;
				else if(m_IndexTable[i].pWordItemHead[j].nHandle>dict2.m_IndexTable[i].pWordItemHead[k].nHandle)
					nCmpValue=1;
			}

			if(nCmpValue==0)//Same word and same handle: blend both frequencies
			{
				m_IndexTable[i].pWordItemHead[j].nFrequency=(nRatio*m_IndexTable[i].pWordItemHead[j].nFrequency+dict2.m_IndexTable[i].pWordItemHead[k].nFrequency)/nDivisor;
				j+=1;
				k+=1;
			}
			else if(nCmpValue<0)//Get next word in the current dictionary
			{
				m_IndexTable[i].pWordItemHead[j].nFrequency=(nRatio*m_IndexTable[i].pWordItemHead[j].nFrequency)/nDivisor;
				j+=1;
			}
			else//Get next word in the second dictionary
			{
				if(dict2.m_IndexTable[i].pWordItemHead[k].nFrequency>nMinFreq)
				{
					sprintf(sWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),dict2.m_IndexTable[i].pWordItemHead[k].sWord);
					AddItem(sWord,dict2.m_IndexTable[i].pWordItemHead[k].nHandle,dict2.m_IndexTable[i].pWordItemHead[k].nFrequency/nDivisor);
				}
				k+=1;
			}
		}
		while(j<m_IndexTable[i].nCount)//words in current dictionary are left
		{
			m_IndexTable[i].pWordItemHead[j].nFrequency=(nRatio*m_IndexTable[i].pWordItemHead[j].nFrequency)/nDivisor;
			j+=1;
		}
		while(k<dict2.m_IndexTable[i].nCount)//words in Dict2 are left
		{
			if(dict2.m_IndexTable[i].pWordItemHead[k].nFrequency>nMinFreq)
			{
				sprintf(sWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),dict2.m_IndexTable[i].pWordItemHead[k].sWord);
				AddItem(sWord,dict2.m_IndexTable[i].pWordItemHead[k].nHandle,dict2.m_IndexTable[i].pWordItemHead[k].nFrequency/nDivisor);
			}
			k+=1;
		}
	}
	return true;
}
//True for the POS handles that Optimum always removes: 'x', 'g' and "qg"
//(a handle stores its tag as first*256+second).
static bool IsAlwaysDroppedPOS(int nPOS)
{
	return nPOS=='x'*256||nPOS=='g'*256||nPOS=='q'*256+'g';
}
//Delete word item which 
//(1)has POS 'x', 'g' or 'qg'
//(2)has frequency 0 and the same word as the following item, while its
//   POS value is the parent set of the following one
//for example "江泽民/n/0" will deleted, because "江泽民/nr/0" is more detail and correct
//Items are removed through DelItem. Always returns true.
bool CDictionary::Optimum()
{
	int nPrevPOS,i,j,nPrevFreq;
	char sPrevWord[WORD_MAXLENGTH],sCurWord[WORD_MAXLENGTH];
	for(i=0;i<CC_NUM;i++)
	{
		sPrevWord[0]=0;
		nPrevPOS=0;
		nPrevFreq=-1;
		for(j=0;j<m_IndexTable[i].nCount;j++)
		{
			sprintf(sCurWord,"%c%c%s",CC_CHAR1(i),CC_CHAR2(i),m_IndexTable[i].pWordItemHead[j].sWord);
			//The PREVIOUS item is judged here, because rule (2) needs to see its successor
			if(IsAlwaysDroppedPOS(nPrevPOS)||(strcmp(sPrevWord,sCurWord)==0&&nPrevFreq==0&&m_IndexTable[i].pWordItemHead[j].nHandle/256*256==nPrevPOS))
			{//Delete Previous word item
				DelItem(sPrevWord,nPrevPOS);
			}
			strcpy(sPrevWord,sCurWord);
			nPrevPOS=m_IndexTable[i].pWordItemHead[j].nHandle;
			nPrevFreq=m_IndexTable[i].pWordItemHead[j].nFrequency;
		}
		//The last item of the table has no successor, so the loop never judged it.
		//Rule (2) cannot apply to it, rule (1) still can.
		//(nPrevPOS stays 0 for an empty table, so nothing is deleted then.)
		if(IsAlwaysDroppedPOS(nPrevPOS))
			DelItem(sPrevWord,nPrevPOS);
	}
	return true;
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -