📄 encoder.cpp
字号:
#include "XWRT.h"
#include <time.h>
#if defined WIN32 || defined WIN64
#include <windows.h>
#include <conio.h>
#endif
#include "Common.h"
#include "MemBuffer.h"
#include "Encoder.h"
extern int g_fileLenMB;
// Construct an encoder: reset the UTF-8 replay state and allocate the input
// and zlib work buffers (plus the LZMA output stream when that library is used).
XWRT_Encoder::XWRT_Encoder()
    : utf8cached(0),
      utf8pos(0),
      last_c_bak(0)
{
    // One byte more than mFileBufferSize: readGetcBuffer() keeps a carried-over
    // byte in slot 0 and reads file data into the buffer starting at slot 1.
    getcBuffer = new unsigned char[mFileBufferSize + 1];
    zlibBuffer = new unsigned char[ZLIB_BUFFER_SIZE];

    // NOTE(review): plain new[] normally throws std::bad_alloc instead of
    // returning null, so this check may never fire - confirm build settings.
    const bool allocationFailed = !getcBuffer || !zlibBuffer;
    if (allocationFailed)
        OUT_OF_MEMORY();

#ifdef USE_LZMA_LIBRARY
    LZMAlib_Init(8);
    outStream = new COutFileStream;
    oStream = outStream;
#endif
}
// Release the work buffers allocated by the constructor.
XWRT_Encoder::~XWRT_Encoder()
{
    // Both buffers were created with new[], so they must be released with
    // delete[] (scalar delete on an array is undefined behaviour).
    // delete[] on a null pointer is a no-op, so no null checks are needed.
    delete[] getcBuffer;
    delete[] zlibBuffer;
}
// ENCODE_PUTC(c): append output byte "c" to cont.memout.
// Does nothing while "detect" is set. Before writing, if cont.memout->memsize
// exceeds maxMemSize, the buffers are flushed with cont.writeMemBuffers(...)
// and memsize is reset to 0.
// The macro expands to a plain braced block, and "c" appears several times in
// the expansion, so pass an expression without side effects. Some call sites in
// this file place it directly before "else" without a semicolon, so the braced
// (non do/while) shape must be kept.
#define ENCODE_PUTC(c)\
{ \
if (!detect) \
{ \
if (cont.memout->memsize>maxMemSize) \
{ \
PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \
cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream); \
cont.memout->memsize=0; \
} \
\
PRINT_CHARS(("output=%d (%c)\n",c,c)); \
cont.memout->OutTgtByte(c); \
} \
}
// READ_CHAR(c): fetch the next input byte into "c".
// Takes it from the in-memory buffer (getcBufferData/getcBufferSize) when bytes
// remain; otherwise calls readGetcBuffer(XWRT_file,c), which refills the buffer
// or sets "c" to EOF.
#define READ_CHAR(c)\
{\
if (getcBufferSize) \
{ \
c=getcBufferData[0]; \
getcBufferData++; \
getcBufferSize--; \
} \
else \
readGetcBuffer(XWRT_file,c); \
}
// ENCODE_UNICODE_GETC(c): read one input unit into "c" and update the
// character history (last_last_c <- last_c <- last_c_bak, last_c_bak <- c).
// With OPTION_UNICODE_BE a second byte d is read and c becomes 256*c+d;
// with OPTION_UNICODE_LE a second byte d is read and c becomes c+256*d.
// If the second byte is missing (d<0), d is replaced by 65536 (BE) or 256 (LE),
// so in both cases 65536 is added to the first byte.
// When the first read already returned a negative value, "c" is left as is.
#define ENCODE_UNICODE_GETC(c) \
{ \
last_last_c=last_c; \
last_c=last_c_bak; \
\
READ_CHAR(c); \
\
if (c>=0) \
{ \
int d; \
if (IF_OPTION(OPTION_UNICODE_BE)) \
{ \
READ_CHAR(d); \
if (d<0) \
d=65536; \
c=256*c+d; \
} \
else \
if (IF_OPTION(OPTION_UNICODE_LE)) \
{ \
READ_CHAR(d); \
if (d<0) \
d=256; \
c+=256*d; \
} \
} \
\
last_c_bak=c; \
}
// ENCODE_GETC(c): deliver the next byte to encode in "c".
// If bytes of a previously converted character are still pending in utf8buff
// (utf8cached>0), the next one is returned. Otherwise a unit is read with
// ENCODE_UNICODE_GETC; when one of the two-byte Unicode options is active and
// the value is >=0x80, it is converted with unicode2utf8(), the first UTF-8
// byte is returned and the remaining bytes stay cached for the next calls.
#define ENCODE_GETC(c) \
{\
if (utf8cached>0) \
{ \
c=utf8buff[utf8pos++]; \
utf8cached--; \
} \
else \
{ \
ENCODE_UNICODE_GETC(c); \
\
if (c>=0x80 && (IF_OPTION(OPTION_UNICODE_BE) || IF_OPTION(OPTION_UNICODE_LE))) \
{ \
utf8cached=unicode2utf8(c,&utf8buff[0]); \
utf8pos=0; \
\
c=utf8buff[utf8pos++]; \
utf8cached--; \
} \
} \
}
// Refill the input buffer from "file" and return the first new byte in "c"
// (EOF when nothing more can be read).
inline void XWRT_Encoder::readGetcBuffer(FILE* &file,int &c)
{
    // getcBufferDataParts==1: report EOF without touching the file;
    // larger values are counted down by one per refill.
    if (getcBufferDataParts==1)
    {
        c=EOF;
        return;
    }
    if (getcBufferDataParts>1)
        --getcBufferDataParts;

    // Keep the byte just before the current read position in slot 0; this must
    // happen before the refill overwrites the buffer and before the read
    // pointer is reset.
    getcBuffer[0]=getcBufferData[-1];
    getcBufferSize=fread_fast(getcBuffer+1,mFileBufferSize,file);
    getcBufferData=getcBuffer+1;
    getcBufferSizeBak=getcBufferSize;

    if (getcBufferSize==0)
    {
        c=EOF;
        return;
    }

    if (!detect)
        printStatus(static_cast<int>(getcBufferSize),0,true);

    // Consume the first byte of the freshly read data.
    c=*getcBufferData++;
    getcBufferSize--;
}
// Write the UTF-8 byte sequence for "cp" into "result" and return its length
// (1 to 4 bytes). "result" must have room for four bytes.
inline int XWRT_Encoder::unicode2utf8(unsigned int cp, unsigned char* result)
{
    if (cp < 0x80)
    {
        // one octet: 0xxxxxxx
        result[0] = static_cast<unsigned char>(cp);
        return 1;
    }

    if (cp < 0x800)
    {
        // two octets: 110xxxxx 10xxxxxx
        result[0] = static_cast<unsigned char>(0xc0 | (cp >> 6));
        result[1] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 2;
    }

    if (cp < 0x10000)
    {
        // three octets: 1110xxxx 10xxxxxx 10xxxxxx
        result[0] = static_cast<unsigned char>(0xe0 | (cp >> 12));
        result[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
        result[2] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 3;
    }

    // four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result[0] = static_cast<unsigned char>(0xf0 | (cp >> 18));
    result[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3f));
    result[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
    result[3] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    return 4;
}
// Encode word "s" (s_size bytes) literally, for words that are not in the dictionary.
//
// Without OPTION_LETTER_CONTAINER the bytes go to the main output; bytes flagged
// in addSymbols are preceded by CHAR_ESCAPE. With DYNAMIC_DICTIONARY, a word that
// addWord() accepts is instead written as CHAR_NEWWORD + bytes + 0.
//
// With OPTION_LETTER_CONTAINER a one-byte tag ('A'..'E') is written to the main
// output and the word bytes go to a separate container chosen by wordType
// (forced to VARWORD when the first byte is not an ASCII letter).
//
// Fix over the previous version: the word is copied into dictionary memory only
// after checking mem<dictmem_end, so a full dictionary arena is no longer written to.
inline void XWRT_Encoder::encodeAsText(unsigned char* &s,int &s_size,EWordType wordType)
{
    int i=0;

    if (!IF_OPTION(OPTION_LETTER_CONTAINER))
    {
#ifdef DYNAMIC_DICTIONARY
        // Check for free dictionary memory before copying the word into it.
        if (s_size>=WORD_MIN_SIZE && mem<dictmem_end)
        {
            memcpy(mem,s,s_size);
            if (addWord(mem,s_size)!=0)
            {
                // Advance past the stored word in steps of 4 bytes.
                mem+=(s_size/4+1)*4;

                ENCODE_PUTC(CHAR_NEWWORD);
                for (i=0; i<s_size; i++)
                    ENCODE_PUTC(s[i]);
                ENCODE_PUTC(0);
                return;
            }
        }
#endif
        for (i=0; i<s_size; i++)
        {
            if (addSymbols[s[i]])
                ENCODE_PUTC(CHAR_ESCAPE);
            ENCODE_PUTC(s[i]);
        }
        return;
    }

    // Single byte: tag 'A', byte goes to the letters container.
    if (s_size==1)
    {
        ENCODE_PUTC('A');
        cont.memout_letters->OutTgtByte(s[0]);
        return;
    }

    // Words that do not start with a letter a-z/A-Z are stored verbatim.
    i=tolower(s[0]);
    if (i<'a' || i>'z')
        wordType=VARWORD;

    switch (wordType)
    {
        case LOWERWORD:
            ENCODE_PUTC('E');
            cont.memout_words2->OutTgtByte(toupper(s[0]));
            for (i=1; i<s_size; i++)
                cont.memout_words2->OutTgtByte(s[i]);
            break;

        case FIRSTUPPER:
            ENCODE_PUTC('C');
            for (i=0; i<s_size; i++)
                cont.memout_words3->OutTgtByte(s[i]);
            break;

        case UPPERWORD:
            ENCODE_PUTC('D');
            cont.memout_words4->OutTgtByte(s[0]);
            for (i=1; i<s_size; i++)
                cont.memout_words4->OutTgtByte(tolower(s[i]));
            break;

        default:
            // Verbatim word, terminated by a space in its container.
            ENCODE_PUTC('B');
            for (i=0; i<s_size; i++)
                cont.memout_words->OutTgtByte(s[i]);
            cont.memout_words->OutTgtByte(' ');
            break;
    }

#ifdef DYNAMIC_DICTIONARY
    // Same guarded insertion as above; here the word has already been emitted.
    if (s_size>=WORD_MIN_SIZE && mem<dictmem_end)
    {
        memcpy(mem,s,s_size);
        if (addWord(mem,s_size)!=0)
            mem+=(s_size/4+1)*4;
    }
#endif
}
// Emit codeword number "i" in the LZMA codeword layout: one to four bytes.
// The value i-1 is reduced by the matching bound and split into digits; every
// leading digit is mapped through sym2codeword[dict1size+digit], the last one
// through sym2codeword[digit].
inline void XWRT_Encoder::encodeCodeWord_LZMA(int &i)
{
    int idx=i-1;

    if (idx>=bound4)
    {
        // four-byte codeword
        idx-=bound4;
        const int digit4=idx/dict123size;
        idx%=dict123size;
        const int digit3=idx/dict12size;
        idx%=dict12size;
        const int digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1size+digit4]);
        ENCODE_PUTC(sym2codeword[dict1size+digit3]);
        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        ENCODE_PUTC(sym2codeword[idx]);
        return;
    }

    if (idx>=bound3)
    {
        // three-byte codeword
        idx-=bound3;
        const int digit3=idx/dict12size;
        idx%=dict12size;
        const int digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1size+digit3]);
        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        ENCODE_PUTC(sym2codeword[idx]);
        return;
    }

    if (idx>=dict1size)
    {
        // two-byte codeword
        idx-=dict1size;
        const int digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        ENCODE_PUTC(sym2codeword[idx]);
        return;
    }

    // one-byte codeword
    ENCODE_PUTC(sym2codeword[idx]);
}
// Emit codeword number "i" in the LZ77 codeword layout: one to three bytes.
// Only the first byte is mapped through sym2codeword; the following bytes are
// the raw base-256 digits of the remaining index.
inline void XWRT_Encoder::encodeCodeWord_LZ(int &i)
{
    int idx=i-1;

    if (idx>=bound3)
    {
        // three-byte codeword
        idx-=bound3;
        const int high=idx/(256*256);
        idx%=(256*256);
        const int mid=idx/256;
        idx%=256;

        ENCODE_PUTC(sym2codeword[dict1size+dict2size+high]);
        ENCODE_PUTC(mid);
        ENCODE_PUTC(idx);
        return;
    }

    if (idx>=dict1size)
    {
        // two-byte codeword
        idx-=dict1size;
        const int mid=idx/256;
        idx%=256;

        ENCODE_PUTC(sym2codeword[dict1size+mid]);
        ENCODE_PUTC(idx);
        return;
    }

    // one-byte codeword
    ENCODE_PUTC(sym2codeword[idx]);
}
// Emit codeword number "i" in the PPM codeword layout: one to four bytes.
// The value i-1 is reduced by the matching bound and split into digits; each
// byte position uses its own sym2codeword range (offsets dict1plus2plus3,
// dict1plus2, dict1size and 0, from first to last byte).
inline void XWRT_Encoder::encodeCodeWord_PPM(int &i)
{
    int idx=i-1;
    int digit2,digit3,digit4;

    if (idx>=bound4)
    {
        // four-byte codeword
        idx-=bound4;
        digit4=idx/dict123size;
        idx%=dict123size;
        digit3=idx/dict12size;
        idx%=dict12size;
        digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1plus2plus3+digit4]);
        PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1plus2plus3+digit4]));
        ENCODE_PUTC(sym2codeword[dict1plus2+digit3]);
        PRINT_CODEWORDS(("2nd=%d ",sym2codeword[dict1plus2+digit3]));
        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        PRINT_CODEWORDS(("3rd=%d ",sym2codeword[dict1size+digit2]));
        ENCODE_PUTC(sym2codeword[idx]);
        PRINT_CODEWORDS(("4th=%d ",sym2codeword[idx]));
    }
    else if (idx>=bound3)
    {
        // three-byte codeword
        idx-=bound3;
        digit3=idx/dict12size;
        idx%=dict12size;
        digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1plus2+digit3]);
        PRINT_CODEWORDS(("1st=%d(%d) ",sym2codeword[dict1plus2+digit3],digit3));
        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        PRINT_CODEWORDS(("2nd=%d(%d) ",sym2codeword[dict1size+digit2],digit2));
        ENCODE_PUTC(sym2codeword[idx]);
        PRINT_CODEWORDS(("3rd=%d(%d) ",sym2codeword[idx],idx));
    }
    else if (idx>=dict1size)
    {
        // two-byte codeword
        idx-=dict1size;
        digit2=idx/dict1size;
        idx%=dict1size;

        ENCODE_PUTC(sym2codeword[dict1size+digit2]);
        PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1size+digit2]));
        ENCODE_PUTC(sym2codeword[idx]);
        PRINT_CODEWORDS(("2nd=%d ",sym2codeword[idx]));
    }
    else
    {
        // one-byte codeword
        ENCODE_PUTC(sym2codeword[idx]);
        PRINT_CODEWORDS(("1st=%d ",sym2codeword[idx]));
    }

    // NOTE(review): "no" is not declared in this function; presumably this only
    // builds because PRINT_CODEWORDS expands to nothing (or "no" is declared
    // elsewhere) - confirm before enabling the trace.
    PRINT_CODEWORDS((" no=%d %s\n", no-1,dict[no]));
}
// Emit codeword number "i" using the layout selected by codewordType:
// LZ77 and LZMA have dedicated encoders, everything else uses the PPM layout.
inline void XWRT_Encoder::encodeCodeWord(int &i)
{
    if (codewordType==LZ77)
    {
        encodeCodeWord_LZ(i);
        return;
    }

    if (codewordType==LZMA)
    {
        encodeCodeWord_LZMA(i);
        return;
    }

    encodeCodeWord_PPM(i);
}
// Flush the pending run of "spaces" blanks to the output and reset the counter.
// A single blank is written as ' '. Longer runs are written greedily: the
// longest run length (at most 255) that has an entry in spacesCodeword is
// emitted as a codeword; if there is none, one literal ' ' is written.
inline void XWRT_Encoder::encodeSpaces()
{
    if (spaces==1)
    {
        ENCODE_PUTC(' ');
    }
    else
    {
        while (spaces>0)
        {
            // Start from the largest run that may have a codeword and shrink
            // until an entry is found (or the index reaches 0).
            int run=(spaces>=256) ? 255 : spaces;
            while (run>0 && spacesCodeword[run]==0)
                run--;

            if (spacesCodeword[run])
            {
                encodeCodeWord(spacesCodeword[run]);
                spaces-=run;
            }
            else
            {
                ENCODE_PUTC(' ');
                spaces--;
            }
        }
    }

    spaces=0;
}
// Compute the hash-table slot for the "len" bytes at "ptr" and store it in "hash".
// The value is the polynomial hash (multiplier HASH_MULT) reduced with the mask
// HASH_TABLE_SIZE-1; for len<=0 the result is 0.
//
// The hash is accumulated in an unsigned int: the previous signed accumulation
// overflowed for longer strings (undefined behaviour for signed int) and
// differed in type from findShorterWord(), which computes the same hash in an
// unsigned int. The masked result is the same as with wrapping signed arithmetic.
inline void XWRT_Encoder::stringHash(const unsigned char *ptr, int len,int& hash)
{
    unsigned int acc=0;

    for (; len>0; len--, ptr++)
        acc=HASH_MULT*acc+*ptr;

    hash=static_cast<int>(acc&(HASH_TABLE_SIZE-1));
}
// Look up word "s" (exactly s_size bytes) in the dictionary.
// Up to three hash slots are probed: the base slot and two slots offset by
// s_size*HASH_DOUBLE_MULT and s_size*HASH_DOUBLE_MULT^2. Probing stops at the
// first empty slot (entry <= 0) or at the first entry whose length and bytes
// both match. On return "i" is the dictionary index, or -1 when nothing
// matched or the index is not below "dictionary".
inline void XWRT_Encoder::checkHashExactly(unsigned char* &s,int &s_size,int& i)
{
    int h;
    stringHash(s,s_size,h);

    // first probe
    i=word_hash[h];
    if (i>0 && (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0))
    {
        // second probe
        i=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
        if (i>0 && (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0))
        {
            // third and last probe
            i=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
            if (i>0 && (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0))
                i=-1;
        }
    }

    // Empty slot, or an index outside the active dictionary: not found.
    if (i<=0 || i>=dictionary)
        i=-1;
}
// Look up the first s_size bytes of "s" (a prefix of the original word) in the
// dictionary, starting from the precomputed hash slot "h".
// Up to three slots are probed (base slot, then offsets s_size*HASH_DOUBLE_MULT
// and s_size*HASH_DOUBLE_MULT^2). An entry is rejected when it is longer than
// s_size or its first s_size bytes differ. Returns the dictionary index, or -1
// when a probed slot is empty, all three probes are rejected, or the index is
// not below "dictionary".
inline int XWRT_Encoder::checkHash(unsigned char* &s,int &s_size,int h)
{
    // first probe
    int found=word_hash[h];
    if (found>0 && (dictlen[found]>s_size || memcmp(dict[found],s,s_size)!=0))
    {
        // second probe
        found=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
        if (found>0 && (dictlen[found]>s_size || memcmp(dict[found],s,s_size)!=0))
        {
            // third and last probe
            found=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
            if (found>0 && (dictlen[found]>s_size || memcmp(dict[found],s,s_size)!=0))
                found=-1;
        }
    }

    if (found<=0 || found>=dictionary)
        return -1;

    return found;
}
// Find the longest proper prefix of word "s" that is in the dictionary.
// Prefix lengths from WORD_MIN_SIZE+tryShorterBound up to s_size-1 are tried in
// increasing order, extending the hash incrementally by one byte per step.
// Returns the dictionary index reported by checkHash() for the longest matching
// prefix, or -1 when no prefix matches.
inline int XWRT_Encoder::findShorterWord(unsigned char* &s,int &s_size)
{
    const int firstLen=WORD_MIN_SIZE+tryShorterBound;

    // No candidate prefix exists for such a short word. Returning here also
    // avoids hashing firstLen bytes of a word that has fewer than that
    // (the previous version read them regardless of s_size).
    if (s_size<=firstLen)
        return -1;

    // Hash of the shortest candidate prefix.
    unsigned int hash=0;
    int i;
    for (i=0; i<firstLen; i++)
        hash=HASH_MULT*hash+s[i];

    int best=-1;
    for (i=firstLen; i<s_size; i++)
    {
        // "hash" currently covers s[0..i); longer matches overwrite shorter ones.
        const int ret=checkHash(s,i,hash&(HASH_TABLE_SIZE-1));
        if (ret>=0)
            best=ret;

        hash=HASH_MULT*hash+s[i];
    }

    return best;
}
// Upper-case the first s_size bytes of "s" in place (via toupper).
inline void XWRT_Encoder::toUpper(unsigned char* s,int &s_size)
{
    for (int left=s_size; left>0; --left, ++s)
        *s=static_cast<unsigned char>(toupper(*s));
}
// Lower-case the first s_size bytes of "s" in place (via tolower).
inline void XWRT_Encoder::toLower(unsigned char* s,int &s_size)
{
    for (int left=s_size; left>0; --left, ++s)
        *s=static_cast<unsigned char>(tolower(*s));
}
// Encode a token that mixes digits and other characters.
// "s" is split into alternating runs, starting with a (possibly empty) run of
// NUMBERCHAR bytes: each digit run is passed to encodeWord() as NUMBER, each
// following non-digit run to encodeAsText() as VARWORD. The member letterType
// is updated for every byte examined.
void XWRT_Encoder::encodeMixed(unsigned char* s,int s_size,EXMLState& XMLState,int& old_c)
{
    int pos=0;

    for (;;)
    {
        // --- digit run ---
        const int numberStart=pos;
        do
        {
            letterType=letterSet[s[pos++]];
        }
        while (pos<s_size && letterType==NUMBERCHAR);

        // Step back over the byte that ended the run.
        if (letterType!=NUMBERCHAR)
            pos--;

        encodeWord(s+numberStart,pos-numberStart,NUMBER,XMLState,old_c);

        if (pos>=s_size)
            break;

        // --- non-digit run ---
        const int textStart=pos;
        do
        {
            letterType=letterSet[s[pos++]];
        }
        while (pos<s_size && letterType!=NUMBERCHAR);

        // Step back over the digit that ended the run.
        if (letterType==NUMBERCHAR)
            pos--;

        // encodeAsText() takes its arguments by reference, so use named locals.
        unsigned char* text=s+textStart;
        int textLen=pos-textStart;
        encodeAsText(text,textLen,VARWORD);

        if (pos>=s_size)
            break;
    }
}
// encode word "s" using dictionary
void XWRT_Encoder::encodeWord(unsigned char* s,int s_size,EWordType wordType,EXMLState& XMLState,int& c)
{
if (detect)
{
checkWord(s,s_size,XMLState,c);
return;
}
if (s_size<1)
{
encodeSpaces();
return;
}
int i=-1;
int size=0;
int flagToEncode=-1;
bool justAdded=false;
if (XMLState==OPEN)
{
if (s_size>1)
{
PRINT_STACK(("push s=%s c=%c (%d) s_size=%d\n",s,c,c,s_size));
justAdded=true;
XMLState=ADDED;
s[s_size]=0;
stack.push_back((char*)s);
}
else
XMLState=INSIDE;
if (c=='>')
{
s[s_size++]=c;
XMLState=ADDED2; // no encoding '>'
}
}
else
if (XMLState==CLOSE || XMLState==CLOSE_EOL)
{
encodeSpaces();
static std::string str;
if (stack.size()>0)
{
str=stack.back();
stack.pop_back();
}
else
str.erase();
PRINT_STACK(("pop str=%s s=%s \n",str.c_str(),s));
if (s_size==str.size() && memcmp(s,str.c_str(),s_size)==0)
{
if (c=='>')
{
if (XMLState==CLOSE)
ENCODE_PUTC(CHAR_END_TAG)
else
ENCODE_PUTC(CHAR_END_TAG_EOL);
XMLState=CLOSED;
cont.MemBufferPopBack();
return;
}
}
memmove(s+1,s,s_size);
s[0]='<';
s[1]='/';
s_size++; // <tag -> </tag
if (c=='>')
{
s[s_size]='>';
s_size++; // </tag -> </tag>
}
if (XMLState==CLOSE_EOL)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -