⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dictionary.cpp

📁 对中科院分词程序的简化版本
💻 CPP
📖 第 1 页 / 共 3 页
字号:
				m_IndexTable[nPos].pWordItemHead[nTemp].nFrequency=-1;
				m_pModifyTable[nPos].nDelete+=1;
				nTemp+=1;
		   }
	   }
	   return true;
   }
   //Operation in the modify table and its items 
   if(FindInModifyTable(nPos,sWordDel,nHandle,&pPre))
   {
	     pCur=m_pModifyTable[nPos].pWordItemHead;
	     if(pPre!=NULL)
			 pCur=pPre->next;
         while(pCur!=NULL && _stricmp(pCur->data.sWord,sWordDel)==0&&(pCur->data.nHandle==nHandle||nHandle<0))
		 {
			 pTemp=pCur;
    		 if(pPre!=NULL)//pCur is the first item
				 pPre->next=pCur->next;
			 else
				 m_pModifyTable[nPos].pWordItemHead=pCur->next;
			 pCur=pCur->next;
			 delete pTemp->data.sWord;//Delete the word
			 delete pTemp;
		 }
	   return true;
   }
   return false;
}
bool CDictionary::DelModified()
{
  PWORD_CHAIN pTemp,pCur;
  if(!m_pModifyTable)
	  return true;
  for(int i=0;i<CC_NUM;i++)
  {
      pCur=m_pModifyTable[i].pWordItemHead;
	  while(pCur!=NULL)
	  {
		  pTemp=pCur;
		  pCur=pCur->next;
		  delete pTemp->data.sWord;
		  delete pTemp;
	  }
  }
  delete [] m_pModifyTable;
  m_pModifyTable=NULL;
  return true;
}
/*********************************************************************
 *
 *  Func Name  : IsExist
 *
 *  Description: Tell whether the word (with the given handle) is present
 *               in the original table or in the modify table.
 *
 *  Parameters : sWord: the word; PreProcessing may trim it in place
 *               nHandle: the handle number, passed through to the finders
 *
 *  Returns    : true if found in either table; false if not found or if
 *               PreProcessing rejects the word
 *********************************************************************/
bool CDictionary::IsExist(char *sWord,  int nHandle)
{
   //PreProcessing copies the WHOLE word into this buffer for delimiters
   //(only Chinese words lose their first two bytes), so it must be able to
   //hold a full-length word rather than WORD_MAXLENGTH-2 bytes.
   char sWordFind[WORD_MAXLENGTH];
   int nPos;
   if(!PreProcessing(sWord, &nPos,sWordFind))
       return false;
   return(FindInOriginalTable(nPos,sWordFind,nHandle)||FindInModifyTable(nPos,sWordFind,nHandle));
}
/*********************************************************************
 *
 *  Func Name  : GetHandle
 *
 *  Description: Collect every handle/frequency pair stored for a word.
 *               The original table is searched first; the modify table
 *               is consulted only when the original table has no entry.
 *
 *  Parameters : sWord: the word; PreProcessing may trim it in place
 *               pnCount: receives the number of pairs written (0 on failure)
 *               pnHandle: output array of handles; no bound is checked
 *                         here, the caller must supply enough room
 *               pnFrequency: output array of frequencies, parallel to
 *                         pnHandle
 *
 *  Returns    : true if at least one entry was found, otherwise false
 *********************************************************************/
bool CDictionary::GetHandle(char *sWord,int *pnCount,int *pnHandle,int *pnFrequency)
{
   //PreProcessing copies the WHOLE word into this buffer for delimiters
   //(only Chinese words lose their first two bytes), so it must be able to
   //hold a full-length word rather than WORD_MAXLENGTH-2 bytes.
   char sWordGet[WORD_MAXLENGTH];
   int nPos,nFoundPos,nTemp;
   PWORD_CHAIN pPre,pCur;
   *pnCount=0;
   if(!PreProcessing(sWord, &nPos,sWordGet))
       return false;
   //Operation in the original (index) table and its items
   if(FindInOriginalTable(nPos,sWordGet,-1,&nFoundPos))
   {
       //With handle -1 the finder reports the FIRST item carrying this
       //word, so only a forward scan is required to collect the rest.
       pnHandle[*pnCount]=m_IndexTable[nPos].pWordItemHead[nFoundPos].nHandle;
       pnFrequency[*pnCount]=m_IndexTable[nPos].pWordItemHead[nFoundPos].nFrequency;
       *pnCount+=1;
       nTemp=nFoundPos+1;//Check the following positions
       while(nTemp<m_IndexTable[nPos].nCount&&strcmp(m_IndexTable[nPos].pWordItemHead[nTemp].sWord,sWordGet)==0)
       {
           pnHandle[*pnCount]=m_IndexTable[nPos].pWordItemHead[nTemp].nHandle;
           pnFrequency[*pnCount]=m_IndexTable[nPos].pWordItemHead[nTemp].nFrequency;
           *pnCount+=1;
           nTemp+=1;
       }
       return true;
   }
   //Operation in the modify table and its items
   if(FindInModifyTable(nPos,sWordGet,-1,&pPre))
   {
       //pPre is the node before the first match, or NULL when the first
       //match is the head of the chain
       pCur=m_pModifyTable[nPos].pWordItemHead;
       if(pPre!=NULL)
           pCur=pPre->next;
       while(pCur!=NULL && _stricmp(pCur->data.sWord,sWordGet)==0)
       {
           pnHandle[*pnCount]=pCur->data.nHandle;
           pnFrequency[*pnCount]=pCur->data.nFrequency;
           *pnCount+=1;
           pCur=pCur->next;
       }
       return true;
   }
   return false;
}
/*********************************************************************
 *
 *  Func Name  : FindInOriginalTable
 *
 *  Description: Binary search for a word/handle pair among the items of
 *               one entry of the original index table. The comparisons
 *               rely on the items being ordered by strcmp() on the word
 *               and then by handle.
 *
 *  Parameters : nInnerCode: index of the table entry to search (the inner
 *                           code of the first Chinese char)
 *               sWord: the word
 *               nHandle: the handle number; -1 matches any handle
 *               *nPosRet: optional (may be NULL). On a hit it receives the
 *                         matching position - for nHandle==-1 the first
 *                         item carrying the word. On a miss it receives
 *                         the final probe position minus one.
 *
 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/

bool CDictionary::FindInOriginalTable(int nInnerCode,char *sWord,int nHandle,int *nPosRet)
{
    PWORD_ITEM pItems=m_IndexTable[nInnerCode].pWordItemHead;
    int nLow=0;
    int nHigh=m_IndexTable[nInnerCode].nCount-1;
    int nProbe=(nLow+nHigh)/2;
    const bool bAnyHandle=(nHandle==-1);

    while(nLow<=nHigh)
    {
        const int nOrder=strcmp(pItems[nProbe].sWord,sWord);
        if(nOrder==0&&(bAnyHandle||pItems[nProbe].nHandle==nHandle))
        {//Hit
            if(nPosRet)
            {
                if(bAnyHandle)
                {//Loose match: rewind to the first item holding this word
                    while(nProbe>0&&strcmp(pItems[nProbe-1].sWord,sWord)==0)
                        nProbe--;
                }
                *nPosRet=nProbe;
            }
            return true;
        }
        //From here on an equal word implies a specific, different handle,
        //so the handle alone decides the direction.
        if(nOrder<0||(nOrder==0&&pItems[nProbe].nHandle<nHandle))
            nLow=nProbe+1;
        else
            nHigh=nProbe-1;
        nProbe=(nLow+nHigh)/2;
    }

    if(nPosRet)
        *nPosRet=nProbe-1;//Report the position before the final probe
    return false;
}

/*********************************************************************
 *
 *  Func Name  : FindInModifyTable
 *
 *  Description: judge the word and handle exist in the modified table and its items
 *              
 *
 *  Parameters : nInnerCode: the inner code of the first CHines char
 *               sWord: the word
 *               nHandle:the handle number
 *               *pFindRet: the node found
 *
 *  Returns    : success or fail
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
bool CDictionary::FindInModifyTable(int nInnerCode,char *sWord,int nHandle,PWORD_CHAIN *pFindRet)
{
   PWORD_CHAIN pCur,pPre;
   if(m_pModifyTable==NULL)//empty
	   return false;
   pCur=m_pModifyTable[nInnerCode].pWordItemHead;
   pPre=NULL;
   while(pCur!=NULL&&(_stricmp(pCur->data.sWord,sWord)<0||(_stricmp(pCur->data.sWord,sWord)==0&&pCur->data.nHandle<nHandle)))
   //sort the link chain as alphabet
   {
   	   pPre=pCur;
	   pCur=pCur->next;
   }
   if(pFindRet)
     *pFindRet=pPre;
   if(pCur!=NULL && _stricmp(pCur->data.sWord,sWord)==0&&(pCur->data.nHandle==nHandle||nHandle<0))
   {//The node exists, delete the node and return 
	   return true;
   }
   return false;
}
/*********************************************************************
 *
 *  Func Name  : GetWordType
 *
 *  Description: Classify a word from the type of its first character.
 *
 *  Parameters : sWord: the word
 *
 *  Returns    : WT_CHINESE when the first character is Chinese and
 *               IsAllChinese() accepts the word, WT_DELIMITER when the
 *               first character is a delimiter, WT_OTHER otherwise
 *               (including the empty string)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
int CDictionary::GetWordType(char *sWord)
{
   const int nFirstKind=charType((unsigned char *)sWord);
   if(strlen(sWord)==0)
       return WT_OTHER;//An empty word has no valid type

   if(nFirstKind==CT_CHINESE&&IsAllChinese((unsigned char *)sWord))
       return WT_CHINESE;//Chinese word
   if(nFirstKind==CT_DELIMITER)
       return WT_DELIMITER;//Delimiter
   return WT_OTHER;//other invalid
}
/*********************************************************************
 *
 *  Func Name  : PreProcessing
 *
 *  Description: Strip leading/trailing blanks from the word in place,
 *               classify it by its first character and split it into a
 *               table id plus the key string used for table lookups.
 *
 *  Parameters : sWord: the word; modified in place when blanks are removed
 *               nId: receives the table id - CC_ID of the first two bytes
 *                    for a Chinese word, 3755 for a delimiter
 *               sWordRet: receives the key - the word without its first
 *                    two bytes (Chinese) or the whole word (delimiter).
 *                    No length is checked here; the caller must supply a
 *                    buffer large enough for the whole word.
 *               bAdd: currently unused (only referenced by disabled code)
 *
 *  Returns    : true when the word is Chinese or a delimiter; false when
 *               it is empty, all blanks, or of any other type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
bool CDictionary::PreProcessing(char *sWord, int *nId, char *sWordRet,bool bAdd)
{
   int nLen=strlen(sWord);
   if(nLen==0)
       return false;

   //Locate the first and the last non-blank character
   int nEnd=nLen-1,nBegin=0;
   while(nEnd>=0&&sWord[nEnd]==' ')
       nEnd-=1;
   while(nBegin<=nEnd&&sWord[nBegin]==' ')
       nBegin+=1;
   if(nBegin>nEnd)//Nothing but blanks
       return false;
   if(nEnd!=nLen-1||nBegin!=0)
   {
       //Source and destination overlap inside the same buffer, which
       //strncpy does not permit; memmove is defined for this case.
       memmove(sWord,sWord+nBegin,nEnd-nBegin+1);
       sWord[nEnd-nBegin+1]=0;
   }

   //Classify AFTER trimming: with a leading blank the first character of
   //the untrimmed word is the blank itself, and the word would be
   //rejected even though the blank has just been removed.
   int nType=charType((unsigned char *)sWord);

   //Disabled in the original source: rewriting all-Chinese-number words
   //to a fixed Chinese number word before the lookup (the only use of bAdd).

   if(nType==CT_CHINESE)
   {//Chinese word
       if(sWord[1]==0)
           return false;//Lone lead byte: &sWord[2] would lie past the terminator
       *nId=CC_ID(sWord[0],sWord[1]);
       //Get the inner code of the first Chinese Char
       strcpy(sWordRet,&sWord[2]);//store the word,not store the first Chinese Char
       return true;
   }
   if(nType==CT_DELIMITER)
   {//Delimiter
       *nId=3755;
       strcpy(sWordRet,sWord);//Delimiters keep the complete word as the key
       return true;
   }

   //Disabled in the original source: dedicated ids for numbers (3756),
   //letters (3757), single-byte words (3758) and index words (3759),
   //each with an empty key.

   return false;//other invalid
}
/*********************************************************************
 *
 *  Func Name  : MergePOS
 *
 *  Description: Merge all the POS into nHandle,
 *              just get the word in the dictionary and set its Handle as nHandle
 *              
 *
 *  Parameters : nHandle: the only handle which will be attached to the word

 *  Returns    : the type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-21
 *********************************************************************/
bool CDictionary::MergePOS(int nHandle)
{
    int i,j,nCompare;
	char sWordPrev[WORD_MAXLENGTH];
	PWORD_CHAIN pPre,pCur,pTemp;
    if(!m_pModifyTable)//Not prepare the buffer
    {
	   m_pModifyTable=new MODIFY_TABLE[CC_NUM];
	   memset(m_pModifyTable,0,CC_NUM*sizeof(MODIFY_TABLE));
    }
	for( i=0;i<CC_NUM;i++)//Operation in the index table
	{//delete the memory of word item array in the dictionary

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -