📄 ftwbrk.cpp
字号:
}
// Writes one "sub-word" record into m_MemDict, starting at offset iStart.
//
// Record layout as produced here:
//   [nLen] then, for each of the nLen nodes reached by following `child`
//   links from pTrie: [node->ch] and, when node->wrap is set, an extra
//   [SUBSTRENDWORD] byte.
// If a node is still available after the run, the record is flagged with
// SUBSTRHAVECHILD and Trie2Mem() continues the output from that node.
//
// pTrie  - first node of the run; the run must contain at least nLen nodes.
// nLen   - number of characters in the run.
// iStart - offset in m_MemDict at which the record begins.
// Returns the offset just past the last byte written here, or whatever
// Trie2Mem() returns when the run has a continuation.
int ThaiLexicon::InsertSubWord(LPTRIE pTrie, unsigned char nLen, int iStart)
{
    int iWrite = iStart;
    bool fLastEndsWord = false;
    LPTRIE pNode = pTrie;

    // Record header: number of characters in the sub-word.
    m_MemDict[iWrite] = nLen;
    ++iWrite;

    for (int cRemaining = nLen; cRemaining > 0; --cRemaining)
    {
        m_MemDict[iWrite] = pNode->ch;
        ++iWrite;

        // Remember the flag of the most recent node; it is handed to
        // Trie2Mem() below.
        fLastEndsWord = pNode->wrap;
        if (fLastEndsWord)
        {
            m_MemDict[iWrite] = SUBSTRENDWORD;
            ++iWrite;
        }

        pNode = _lptrie(pNode->child);
    }

    // Nothing hangs below the run: the record is complete.
    if (!pNode)
        return iWrite;

    // Mark that a continuation follows. When the last byte written is a
    // control byte (below MAX_DICT_CTRL_CHAR) the flag is merged into it,
    // otherwise a dedicated flag byte is appended.
    const int iLast = iWrite - 1;
    if ((unsigned int)m_MemDict[iLast] < MAX_DICT_CTRL_CHAR)
    {
        m_MemDict[iLast] |= SUBSTRHAVECHILD;
    }
    else
    {
        m_MemDict[iWrite] = SUBSTRHAVECHILD;
        ++iWrite;
    }

    return Trie2Mem(_lptrie(pNode), iWrite, fLastEndsWord);
}
// Looks up the NUL-terminated byte string pchWord in the packed dictionary
// held in m_MemDict.
//
// Returns true only when every character of pchWord has been matched and the
// dictionary marks that position as a word end (ENDWORD for single-character
// nodes, SUBSTRENDWORD inside a sub-string record). An empty pchWord never
// enters the loop and yields false.
//
// Layout as read by this function: the walk cursor points at 3-byte entries.
// The low 6 bits of the first byte plus the next two bytes form an index into
// m_MemDict (assembled through cbDiff; presumably DIFFINDEX overlays the
// integer `l` with the bytes b0..b2 -- confirm against its declaration).
// The byte at that index is either a sub-string length (when it is below
// MAX_DICT_CTRL_CHAR) or a single character.
bool ThaiLexicon::SearchMemDict(unsigned char *pchWord)
{
unsigned char *pchDictWalk = m_MemDict;
unsigned char *pchWordWalk = pchWord;
DIFFINDEX cbDiff;
cbDiff.l = 0;
unsigned char iLoop,nSubStrLen;
while (*pchWordWalk)
{
// Re-entry point used by the sub-string branch below; it is only reached by
// goto while unmatched characters remain in the word.
LSrcLoopStart:
if (*pchDictWalk & ENDCHILD) // entry list exhausted: word not found
break;
// Decode the 3-byte index of the current entry; the cursor is left on the
// entry that follows it.
cbDiff.b.b2 = ((*(pchDictWalk++)) & 0x3F); // restore index to next character
cbDiff.b.b1 = *(pchDictWalk++);
cbDiff.b.b0 = *(pchDictWalk++);
// Sub-string record: the indexed byte is a length and the first character
// after it equals the current word character.
if (((nSubStrLen = m_MemDict[cbDiff.l]) < MAX_DICT_CTRL_CHAR) &&
(m_MemDict[cbDiff.l+1] == *pchWordWalk))
{
cbDiff.l ++; // step over the length byte
for (iLoop = 0; iLoop < nSubStrLen; iLoop++)
{
if (m_MemDict[cbDiff.l++] != *pchWordWalk) // word does not match
return false;
pchWordWalk++;
// Word exhausted: it is a hit only if the next dictionary byte is a control
// byte carrying SUBSTRENDWORD.
if (!(*pchWordWalk))
return ((m_MemDict[cbDiff.l] < MAX_DICT_CTRL_CHAR) &&
(m_MemDict[cbDiff.l] & SUBSTRENDWORD));
else
if (m_MemDict[cbDiff.l] < MAX_DICT_CTRL_CHAR)
{
if (m_MemDict[cbDiff.l] & SUBSTRHAVECHILD) // this means end of substr
{ // the next 3 bytes are the 'address'
pchDictWalk = m_MemDict + cbDiff.l + 1; // of the next node.
goto LSrcLoopStart;
}
else
cbDiff.l++; // skip the SUBSTRENDWORD flag byte
}
}
// The whole sub-string was consumed without reaching a continuation while
// word characters remain.
return false;
}
else
{
if (m_MemDict[cbDiff.l] == *pchWordWalk) // Does the character match?
{
pchWordWalk++; // check next character
// Word exhausted and the byte after the matched character has ENDWORD set.
if (!(*pchWordWalk) && (m_MemDict[cbDiff.l + 1] & ENDWORD))
return true;
// Descend: continue with the entries stored right after the matched character.
pchDictWalk = m_MemDict + cbDiff.l + 1;
}
// On a mismatch the cursor already sits on the following 3-byte entry, which
// the next iteration examines.
}
}
return false;
}
// Writes the in-memory dictionary to the file named by lpFileName.
//
// The file consists of one DICTHEADER (filled in here from m_cWord, m_cNode
// and m_nSize) followed by m_nSize bytes of m_MemDict. fLoadLexicon() reads
// exactly this layout from the start of the file, so the file is truncated
// on open rather than appended to: appending to an existing file would leave
// the old header/data at offset 0 and the new data would never be read back.
//
// lpFileName - NUL-terminated path, passed to fopen() as a char string.
// Returns true when header and data were written and the file closed
// without error; false otherwise (the file may then be incomplete).
bool ThaiLexicon::fSaveLexicon(unsigned char* lpFileName)
{
    if (!lpFileName)
        return false;
    // A data buffer is only required when there is something to write.
    if (!m_MemDict && m_nSize != 0)
        return false;

    // initialize header
    //lstrcpy(m_DictHeader.szDesc, "Thai Lexicon Data Version 1.1"); // <<== NYI Please check...!!!!
    m_DictHeader.chEOF = -1; //0xFF
    m_DictHeader.version.fv = 2;
    m_DictHeader.version.dv = 2;
    m_DictHeader.cWord = m_cWord;
    m_DictHeader.cNode = (unsigned int)m_cNode;
    // Simple consistency code; fLoadLexicon() recomputes and compares it.
    m_DictHeader.code = ((int)m_cWord << 1) + m_cNode;
    m_DictHeader.nSize = m_nSize;
    m_DictHeader.fValid = true;

    FILE* fn = fopen((const char*)lpFileName, "wb");
    if (!fn)
        return false;

    bool fSuccess =
        fwrite(&m_DictHeader, sizeof(unsigned char), sizeof(DICTHEADER), fn) == sizeof(DICTHEADER);

    if (fSuccess && m_nSize != 0)
        fSuccess = fwrite(m_MemDict, sizeof(unsigned char), m_nSize, fn) == (size_t)m_nSize;

    // fclose() flushes buffered output, so a failure here means the data did
    // not reach the file even though fwrite() reported success.
    if (fclose(fn) != 0)
        fSuccess = false;

    return fSuccess;
}
// Loads a dictionary file written by fSaveLexicon() into m_MemDict.
//
// Steps: read the DICTHEADER, check its consistency code
// (code == (cWord << 1) + cNode), allocate nSize bytes, read the data, and
// on success copy cWord/cNode/nSize into the member counters.
//
// lpFileName - NUL-terminated path, passed to fopen() as a char string.
// Returns true on success. If the file cannot be opened the object is left
// untouched and false is returned. On any later failure m_MemDict is released
// and set to NULL, and m_DictHeader.fValid is set to false.
bool ThaiLexicon::fLoadLexicon(unsigned char *lpFileName)
{
    if (!lpFileName)
        return false;

    // Read-only access is sufficient; requesting write access would make the
    // load fail for files the process may only read.
    FILE* fn = fopen((const char*)lpFileName, "rb");
    if (!fn)
        return false;

    bool fSuccess = false;

    // Read data header block
    const bool fHeaderRead =
        sizeof(DICTHEADER) == fread(&m_DictHeader, sizeof(unsigned char), sizeof(DICTHEADER), fn);

    // validate data
    const bool fHeaderValid = fHeaderRead &&
        ((int)m_DictHeader.code) == (int)(((int)m_DictHeader.cWord << 1) + (int)m_DictHeader.cNode);

    if (fHeaderValid)
    {
        // Release any dictionary from an earlier load before replacing it.
        delete[] m_MemDict;
        m_MemDict = NULL;

        // allocate memory for dict
        // NOTE(review): a standard throwing `new` never returns NULL, so the
        // check below only matters for toolchains with a non-throwing `new`.
        m_MemDict = new unsigned char[m_DictHeader.nSize];

        // read data -- the stream is already positioned just past the header.
        if (m_MemDict &&
            (size_t)m_DictHeader.nSize == fread(m_MemDict, sizeof(unsigned char), m_DictHeader.nSize, fn))
        {
            m_cWord = m_DictHeader.cWord;
            m_cNode = m_DictHeader.cNode;
            m_nSize = m_DictHeader.nSize;
            fSuccess = true;
        }
    }

    fclose(fn);

    if (!fSuccess)
    {
        // The buffer comes from new[], so it must be released with delete[].
        delete[] m_MemDict;
        m_MemDict = NULL;
    }

    m_DictHeader.fValid = fSuccess;
    return fSuccess;
}
//-------------------------------------------------------------------------------
// Implementation of ThaiBreakIterator class
//-------------------------------------------------------------------------------
// Constructs the iterator: remembers lpLexiconPath in m_sLexiconFile, starts
// with no lexicon object, then calls InitializeLexicon() to create one.
// The result of InitializeLexicon() is not checked here.
ThaiBreakIterator::ThaiBreakIterator(unsigned char* lpLexiconPath)
    : m_pLexicon(NULL),
      m_sLexiconFile((char*)lpLexiconPath)
{
    InitializeLexicon();
}
// Releases the lexicon object owned by this iterator.
ThaiBreakIterator::~ThaiBreakIterator()
{
    // Deleting a null pointer is a no-op, so no guard is required.
    delete m_pLexicon;
}
// (Re)creates the lexicon object, loads the dictionary file into it and
// fills the character-type lookup table from THAICHARTYPETABLE.
//
// Returns true when the lexicon object exists and its dictionary was loaded;
// false when the dictionary could not be loaded. In the latter case the
// (empty) lexicon object is kept so that callers can still query it and
// retry later.
bool ThaiBreakIterator::InitializeLexicon()
{
    // The character-type table does not depend on the dictionary file, so it
    // is populated regardless of whether loading succeeds.
    memcpy(m_rgThaiCharTypeTable, THAICHARTYPETABLE, 256*sizeof(unsigned long));

    // Drop any previous lexicon. The pointer is cleared before allocating so
    // it never dangles if the allocation fails by throwing.
    delete m_pLexicon;
    m_pLexicon = NULL;

    m_pLexicon = new ThaiLexicon;
    if (!m_pLexicon)
        return false;

    // NOTE(review): the file name is hard-coded here although the constructor
    // stores a lexicon path in m_sLexiconFile -- confirm which one is intended.
    return m_pLexicon->fLoadLexicon((unsigned char*)"ftwrk.lex");
}
#define _DICTIONARY20_
int ThaiBreakIterator::FindThaiWordBreak(const char* szText,unsigned int nStrlen, unsigned char* rgbBrk, unsigned int nMaxBrk, unsigned int uFlags)
{
unsigned char *vmOurDict = m_pLexicon->pGetMemDict();
unsigned char *pchFst = (unsigned char*)szText;
unsigned char *pchLim;
unsigned char rgWL[CWL_MAX]; // Buffer of possible word length
unsigned char iWL,iBrk,cch;
int iLoop;
unsigned char *pchDictWalk;
unsigned char *pchWordWalk/*,*pchWordWalkSav*/;
DIFFINDEX cbDiff;
unsigned char chDict;
bool fCharInfoNotMatch;
bool fCanBreakNext;
bool fPrevWordValid = false;
#if defined(_DICTIONARY20_)
int nSubStrLen;
int iSubLoop;
#endif
unsigned long dwTypeCur;
unsigned char chCur;
unsigned long THAGROUP,SYMGROUP,ENGGROUP;
int cchMaxMatch;
bool fWordWrap = (uFlags & FTWB_WORDWRAP);
bool fSeparateSymbol = ((uFlags & FTWB_SEPARATESYMBOL)?true:false);
bool fSpeller = ((uFlags & FTWB_SPELLER)?true:false);
//
// Validate Dictionary Buffer
//
if (!m_pLexicon->pGetMemDict())
{
InitializeLexicon();
vmOurDict = m_pLexicon->pGetMemDict();
if (!vmOurDict)
return 0;
}
//
// Check argument.
//
if ((nMaxBrk == 0) || (!pchFst) || (!rgbBrk))
return 0;
//
// if nStrLen = 0, Find word break point until
// end of given string.
//
if (nStrlen)
pchLim = pchFst + nStrlen;
else
pchLim = (unsigned char*)(szText + strlen(szText));
//
// Init local variables.
//
rgbBrk[0] = iBrk = 0;
cbDiff.l = 0;
//
// Format group
//
if (fSeparateSymbol)
{
ENGGROUP = XT_ENG;
SYMGROUP = XT_PUNCT | XT_WRDBEG | XT_WRDEND | XT_SNTEND;
THAGROUP = XT_THA;
}
else
{
ENGGROUP = XT_ENG | XT_PUNCT;
SYMGROUP = XT_WRDBEG | XT_WRDEND | XT_SNTEND;
THAGROUP = XT_THA;
}
//
// Loop until end of input string, or word break buffer is full.
//
pchWordWalk = pchFst;
while ( (*pchFst) &&
(*pchWordWalk) &&
(pchFst < pchLim) &&
(iBrk < nMaxBrk))
{
rgWL[0] = iWL = cch = cbDiff.l = cchMaxMatch = 0;
fCharInfoNotMatch = fCanBreakNext = false;
pchDictWalk = vmOurDict;
pchWordWalk = pchFst;
//
// Loop for get all possible next word length.
//
LStartLoop:
while ((chCur = *pchWordWalk) && (!fCharInfoNotMatch))
{
BOOL fParsedMaiYaMok = false;
dwTypeCur = XCharType(chCur);
//
// Special case for 'Dot'
//
if ((dwTypeCur & THAGROUP) ||
((*pchWordWalk == 0x2E) &&
(cch != 0) &&
(*(pchWordWalk-1) > 0x7F)))
{
goto LParseThaiChar;
}
else if (dwTypeCur & ENGGROUP)
{
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
//PREPARE_BREAK_ARRAY
if (rgbBrk[iBrk]) {rgbBrk[++iBrk] = 0;}
do {
++pchWordWalk;
++cch;
}while ((pchWordWalk < pchLim) &&
HaveType(*pchWordWalk,ENGGROUP))/* &&
(!HaveType(*pchWordWalk,XT_WRDBEG) || fSeparateSymbol))*/;
//COLLECT_SPACE;
goto LCollectSpace;
}
else if (dwTypeCur & SYMGROUP)
{
unsigned long dwSav = dwTypeCur;
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
//PREPARE_BREAK_ARRAY
if (rgbBrk[iBrk]) {rgbBrk[++iBrk] = 0;}
do {
++pchWordWalk;
++cch;
}while ((pchWordWalk < pchLim) &&
HaveType(*pchWordWalk,dwSav));
//COLLECT_SPACE;
goto LCollectSpace;
}
else if (dwTypeCur & XT_WRDBEG)
{
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
//PREPARE_BREAK_ARRAY
if (rgbBrk[iBrk]) {rgbBrk[++iBrk] = 0;}
do{
++pchWordWalk;
++cch;
}while((pchWordWalk < pchLim) &&
HaveType(*pchWordWalk,XT_WRDBEG));
//GO_NEXTLOOP
goto LStartLoop;
}
else if (dwTypeCur & XT_WRDEND)
{
while ((pchWordWalk < pchLim) && (XT_WRDEND & XCharType(*pchWordWalk)))
{
++pchWordWalk;
++cch;
}
//COLLECT_SPACE;
goto LCollectSpace;
}
else if (chCur == 0x20)
{
++pchWordWalk;
++cch;
if ((0xE6 == *pchWordWalk) &&
(((unsigned long)pchWordWalk > (unsigned long)(szText+1)) && (XCharType(*(pchWordWalk-2)) & XT_THAI))) /* MAI_YAMOK */
{
++pchWordWalk;
++cch;
}
LCollectSpace:
while(*pchWordWalk == 0x20)
{
++pchWordWalk;
++cch;
}
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
}
else if (chCur == 0xE6)
{
++pchWordWalk;
++cch;
//COLLECT_SPACE
goto LCollectSpace;
}
else if (chCur < 0x20) /* control char */
{
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
//PREPARE_BREAK_ARRAY
if (rgbBrk[iBrk]) {rgbBrk[++iBrk] = 0;}
if ((*pchWordWalk == 0x0D) &&
(pchWordWalk+1 < pchLim) &&
(*(pchWordWalk+1) == 0x0A))
{
pchWordWalk += 2;
cch += 2;
}
else
{
pchWordWalk++;
cch++;
}
//STORE_CUR_LEN
if (cch) { goto LStoreWordLen; }
}
else
{
//
// Collect non-group char
//
pchWordWalk++;
cch++;
//GO_NEXTLOOP;
goto LStartLoop;
}
LParseThaiChar:
/* No more child node? */
if (*pchDictWalk & ENDCHILD)
break;
/* Get Index */
cbDiff.b.b2 = ((*(pchDictWalk++)) & 0x3F);
cbDiff.b.b1 = *(pchDictWalk++);
cbDiff.b.b0 = *(pchDictWalk++);
#if defined(_DICTIONARY20_)
//
// Not only 1 char?
//
if (((nSubStrLen = vmOurDict[cbDiff.l]) < MAX_DICT_CTRL_CHAR) &&
(vmOurDict[cbDiff.l+1] == *pchWordWalk))
{
++cbDiff.l;
for (iSubLoop = 0; iSubLoop < nSubStrLen; iSubLoop++)
{
if (vmOurDict[cbDiff.l++] != *pchWordWalk) // word not match
{
fCharInfoNotMatch = true;
break;
}
else
{
cch++;
pchWordWalk++;
if (vmOurDict[cbDiff.l] < MAX_DICT_CTRL_CHAR)
{
if ((vmOurDict[cbDiff.l] & SUBSTRENDWORD) &&
(!HaveType(*pchWordWalk,XT_ZWIDTH)))
{
rgWL[iWL++] = cch;
rgWL[iWL] = 0;
}
if (vmOurDict[cbDiff.l] & SUBSTRHAVECHILD) // this means end of substr
{ // next 3 byte will be 'Address'
pchDictWalk = vmOurDict + cbDiff.l + 1; // of next node.
break; // goto LFTWBLoop1;
}
cbDiff.l++;
}
}
}
}
else
#endif // _DICTIONARY20_
//
// Char match?
//
if ((chDict = vmOurDict[cbDiff.l]) < *pchWordWalk)
{
//do nothing. just for speed
}
else if (chDict == *pchWordWalk)
{
++cch;
++pchWordWalk;
//
// Can break word here ?
//
if ((vmOurDict[cbDiff.l+1] & ENDWORD) &&
(!HaveType(*pchWordWalk,XT_ZWIDTH)))
{
rgWL[iWL++] = cch;
rgWL[iWL] = 0;
}
//
// Move to first child node.
//
pchDictWalk = vmOurDict + cbDiff.l + 1;
}
else // chDict > *pchWordWalk
break; // exit while loop ... same as set fCharInfoNotMatch = true;
} // while
//
// is there a possible word length?
//
#ifndef DONTUSEWEIGHT
if (iWL)
{
int w,i = iWL;
int wMaxPri, wMaxSnd, wMaxSum, iMax;
wMaxPri = wMaxSnd = wMaxSum = iMax = 0;
if (iWL == 1 &&
0 == GetWeight(vmOurDict,pchFst,pchFst+rgWL[0],pchLim,0))
{
wMaxSum = rgWL[0];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -