📄 dictionary.cpp
字号:
/*
* Copyright (c) 2001,2002,2003 Mike Matsnev. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* Mike Matsnev.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $Id: Dictionary.cpp,v 1.23.2.3 2004/07/07 12:04:46 mike Exp $
*
*/
#include <afx.h>
#include <afxtempl.h>
#include "ptr.h"
#include "zlib.h"
#include "Unicode.h"
#include "RFile.h"
#include "TextParser.h"
#include "Dictionary.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
// Dictionary reader implementing IDict on top of an RFile.
// The file is organised as a table of blocks; at most one block at a time is
// loaded into m_buffer and indexed by m_kindex / m_windex / m_pindex.
class Dict : public IDict
{
public:
// Stores fp in m_rf and tries to parse the file with OpenNew(); the outcome is
// reported through Valid().
Dict(RFile *fp);
~Dict() { }
// Total number of words as read from the file header (m_numwords).
int NumWords() { return m_numwords; }
// Text stored for word `index`, converted to wide characters with m_codepage.
// Returns a default-constructed Buffer when the word cannot be fetched.
Buffer<wchar_t> GetWordW(int index);
// NOTE(review): implementation not visible here; presumably a wrapper around
// FindImp() - confirm.
bool Find(const wchar_t *word,int& index,int& found);
// Looks `word` up through its sort key (built with m_lcid) using a binary
// search over the blocks; `found` may be NULL.
bool FindImp(const wchar_t *word,int& index,int *found);
// True when the constructor parsed the file successfully.
bool Valid() { return m_ok; }
// NOTE(review): implementation not visible here; presumably maps a word index
// to the number of its first paragraph - confirm.
int GetStartPofWord(int index);
// NOTE(review): implementation not visible here; presumably the inverse
// mapping, paragraph number to word index - confirm.
int GetWordFromP(int para);
// Total paragraph count accumulated over all blocks by OpenNew().
int GetNumP() { return m_numpara; }
protected:
// One entry of the block table read by OpenNew().
struct Block {
Buffer<char> key; /* key bytes read after the block table; presumably the sort key used to pick a block - confirm */
int wordidx; /* global index of the first word in this block */
int numwords; /* number of words in this block */
int size; /* uncompressed block size in bytes */
int csize; /* stored size in bytes; equal to size when the block is not compressed */
int off; /* offset of the block data in the file */
int numpara; /* paragraphs in this block, including one extra empty line per word */
int paraidx; /* global number of the first paragraph of this block */
};
kilo::auto_ptr<RFile> m_rf; // the dictionary file
DWORD m_numblk; // number of entries in the block table
DWORD m_numwords; // total number of words in the dictionary
int m_curblk; // index of the block currently held in m_buffer, -1 if none
CPtrArray m_windex; // per word of the current block: pointer to its text in m_buffer
CPtrArray m_kindex; // per word of the current block: pointer to its key in m_buffer
CUIntArray m_pindex; // per word of the current block: block-relative paragraph number; has one extra trailing entry
Buffer<char> m_buffer; // uncompressed data of the current block, sized for the largest block
CArray<Block,Block&> m_blocks; // block table
bool m_ok; // set once the file has been opened successfully
LCID m_lcid; // locale from the header, or the user default if that one is not installed
UINT m_ms_codepage; // code page number as stored in the header
int m_codepage; // internal code page id derived from m_ms_codepage by Unicode::GetIntCodePage
int m_numpara; // total number of paragraphs over all blocks
// Loads block `num` into m_buffer and rebuilds the three per-word indexes.
bool GetBlk(int num);
// Returns a pointer to the text of word `index` inside m_buffer, or NULL.
const char *GetWordImp(int index);
// NOTE(review): no definition visible here; presumably a reader for an
// older file layout - confirm.
bool OpenOld();
// Parses the header, the block table and the block keys.
bool OpenNew();
};
// Reads four bytes from fp and assembles them into a DWORD, least significant
// byte first. Returns 0 when fewer than four bytes could be read, so a stored
// value of 0 cannot be told apart from a read failure.
static DWORD getdword(RFile *fp) {
  BYTE raw[4];
  const int got=fp->read2(raw,4);
  if (got!=4)
    return 0;
  // Fold from the most significant byte (raw[3]) down to raw[0].
  DWORD value=0;
  for (int k=3;k>=0;--k)
    value=(value<<8)|raw[k];
  return value;
}
bool Dict::OpenNew() {
if ((m_lcid=getdword(m_rf.get()))==0)
return false;
if (!IsValidLocale(m_lcid,LCID_INSTALLED))
m_lcid=GetUserDefaultLCID();
if ((m_numwords=getdword(m_rf.get()))==0)
return false;
if ((m_ms_codepage=getdword(m_rf.get()))==0)
return false;
if ((m_codepage=Unicode::GetIntCodePage(m_ms_codepage))<0)
m_codepage=Unicode::DefaultCodePage();
DWORD btab=getdword(m_rf.get());
if (btab==0)
return false;
m_rf->seek(btab);
if ((m_numblk=getdword(m_rf.get()))==0 || m_numblk>m_numwords)
return false;
DWORD idx,i,off,maxblock,paraidx;
for (i=idx=paraidx=maxblock=0,off=20;i<(int)m_numblk;++i) {
Block blk;
int keylen;
if ((blk.size=getdword(m_rf.get()))==0)
return false;
if ((blk.csize=getdword(m_rf.get()))==0)
return false;
if ((keylen=getdword(m_rf.get()))==0)
return false;
if ((blk.numwords=getdword(m_rf.get()))==0)
return false;
if ((blk.numpara=getdword(m_rf.get()))==0)
return false;
blk.numpara+=blk.numwords; // append and empty line after each word
blk.wordidx=idx;
blk.paraidx=paraidx;
blk.off=off;
idx+=blk.numwords;
paraidx+=blk.numpara;
off+=blk.csize;
blk.key=Buffer<char>(keylen);
m_blocks.Add(blk);
if (blk.size>(int)maxblock)
maxblock=blk.size;
}
if (idx!=m_numwords)
return false;
m_numpara=paraidx;
for (i=0;i<(int)m_numblk;++i)
if ((m_rf->read2(m_blocks[i].key,m_blocks[i].key.size()))!=(DWORD)m_blocks[i].key.size())
return false;
m_buffer=Buffer<char>(maxblock);
return true;
}
// Stores fp in m_rf and tries to open it with OpenNew().
// On success Valid() returns true. On failure the object is left empty:
// the block table and buffer are released and every counter is reset to
// zero, so NumWords(), GetNumP() and the lookup functions see an empty
// dictionary instead of header values that no longer match m_blocks.
// Initialisers are listed in member declaration order, and the locale and
// code page members get defined values even if OpenNew() fails early.
Dict::Dict(RFile *fp) :
  m_rf(fp), m_numblk(0), m_numwords(0), m_curblk(-1), m_ok(false),
  m_lcid(0), m_ms_codepage(0), m_codepage(0), m_numpara(0)
{
  if (OpenNew()) {
    m_ok=true;
    return;
  }
  // OpenNew() may have filled part of the state before failing; undo it.
  m_buffer=Buffer<char>();
  m_blocks.RemoveAll();
  m_numblk=0;
  m_numwords=0;
  m_numpara=0;
  m_curblk=-1;
}
bool Dict::GetBlk(int num) {
if (num<0 || num>=(int)m_numblk)
return false;
if (m_curblk==num)
return true;
m_curblk=-1;
m_rf->seek(m_blocks[num].off);
if (m_blocks[num].size==m_blocks[num].csize) { // uncompressed
if (m_blocks[num].size!=(int)m_rf->read2(m_buffer,m_blocks[num].size))
return false;
} else {
Buffer<unsigned char> in(m_blocks[num].csize);
if (m_blocks[num].csize!=(int)m_rf->read2(in,m_blocks[num].csize))
return false;
uLongf len=m_blocks[num].size;
int ret=uncompress((unsigned char *)(char *)m_buffer,&len,
in,m_blocks[num].csize);
if (ret!=Z_OK || (int)len!=m_blocks[num].size)
return false;
}
char *p=m_buffer;
char *e=p+m_blocks[num].size;
int i,pnum;
m_windex.SetSize(m_blocks[num].numwords);
m_kindex.SetSize(m_blocks[num].numwords);
m_pindex.SetSize(m_blocks[num].numwords+1);
for (i=pnum=0;i<m_blocks[num].numwords && p<e;++i) {
m_kindex[i]=p;
m_pindex[i]=pnum;
while (p<e && *p)
++p;
if (p<e)
++p;
m_windex[i]=p;
while (p<e && *p) {
if (*p=='\n')
++pnum;
++p;
}
if (p<e)
++p;
pnum+=2; // implicit empty line after each word
}
if (i!=m_blocks[num].numwords || pnum!=m_blocks[num].numpara)
return false;
m_curblk=num;
m_pindex[m_blocks[num].numwords]=m_blocks[num].numpara;
return true;
}
// Returns the text of word `index` converted to wide characters using
// m_codepage, or a default-constructed Buffer when GetWordImp() has nothing
// for that index.
Buffer<wchar_t> Dict::GetWordW(int index) {
  const char *text=GetWordImp(index);
  if (text)
    return Unicode::ToWCbuf(m_codepage,text,strlen(text));
  return Buffer<wchar_t>();
}
// Returns the text of word `index` as a CString, with every "\n" replaced
// by "\r\n".
CString IDict::GetWord(int index) {
  CString text(Unicode::ToCS(GetWordW(index)));
  text.Replace(_T("\n"),_T("\r\n"));
  return text;
}
const char *Dict::GetWordImp(int index) {
if (index<0 || index>=(int)m_numwords)
return NULL;
if (m_curblk<0 || index<m_blocks[m_curblk].wordidx ||
index>=m_blocks[m_curblk].wordidx+m_blocks[m_curblk].numwords)
{
int low=0;
int high=m_numblk-1;
int mid;
for (int ni=0;;++ni) {
if (ni>(int)m_numblk) // prevent loops on unsorted invalid data
return NULL;
if (low>high)
return NULL;
mid=(low+high)>>1;
if (index<m_blocks[mid].wordidx)
high=mid-1;
else if (index>=m_blocks[mid].wordidx+m_blocks[mid].numwords)
low=mid+1;
else
break;
}
if (!GetBlk(mid))
return NULL;
}
return (const char *)m_windex[index-m_blocks[m_curblk].wordidx];
}
// Three-way byte comparison of buffer b1 against the l2 bytes at b2.
// A negative l2 means "use strlen(b2)". The common prefix is compared with
// memcmp; if it is equal, the shorter operand orders first (-1 / 0 / 1).
static int compare_buf_str(Buffer<char>& b1,const char *b2,int l2=-1) {
  if (l2<0)
    l2=strlen(b2);
  const int len1=b1.size();
  const int common=len1<l2 ? len1 : l2;
  const int diff=memcmp(b1,b2,common);
  if (diff!=0)
    return diff;
  if (len1<l2)
    return -1;
  if (len1>l2)
    return 1;
  return 0;
}
// Walks b1 and b2 in parallel for at most min(l2, b1.size()) bytes and
// returns how many bytes of b1 were examined. A negative l2 means
// "use strlen(b2)".
// NOTE(review): the cursor is advanced before the comparison result is
// acted on, so a mismatch at position k yields k+1 rather than k. This may
// or may not be what callers expect from a "matching length" - confirm
// against the call sites before changing it.
static int compare_buf_str_len(Buffer<char>& b1,const char *b2,int l2=-1) {
  if (l2<0)
    l2=strlen(b2);
  int limit=l2;
  if (limit>b1.size())
    limit=b1.size();
  const char *start=b1;
  const char *cur=start;
  for (;limit>0;--limit) {
    if (*cur++!=*b2++)
      break;
  }
  return cur-start;
}
// Three-way comparison of two buffers; forwards to compare_buf_str() with
// b2's data and its full size.
static inline int compare_bufs(Buffer<char>& b1,Buffer<char>& b2) {
  const char *rhs=b2;
  const int rhslen=b2.size();
  return compare_buf_str(b1,rhs,rhslen);
}
bool Dict::FindImp(const wchar_t *word,int& index,int *found) {
Buffer<char> sortkey(Unicode::SortKey(m_lcid,word));
int low=0;
int high=m_numblk-1;
int mid;
for (int ni=0;;++ni) {
if (ni>(int)m_numblk) // prevent loops on unsorted data
return false;
if (low>high) {
if (low==0) {
index=0;
if (found) {
*found=0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -