📄 encoder.cpp
字号:
ENCODE_GETC(c);
if (s_size==2 && wordType==FIRSTUPPER && letterSet[c]==UPPERCHAR)
wordType=UPPERWORD;
continue;
}
#endif
}
if (wordSet[c])
{
if (c!=' ')
{
if (s_size==0)
{
if (last_c!=' ')
beforeWord=last_c;
else
beforeWord=last_last_c;
if (letterType==LOWERCHAR)
wordType=LOWERWORD;
else
if (letterType==UPPERCHAR)
wordType=FIRSTUPPER;
else
wordType=VARWORD;
}
else
{
switch (wordType)
{
case LOWERWORD:
if (letterType!=LOWERCHAR)
wordType=VARWORD;
break;
case UPPERWORD:
if (letterType!=UPPERCHAR)
wordType=VARWORD;
break;
case FIRSTUPPER:
if (letterType!=LOWERCHAR)
{
if (s_size==1 && letterType==UPPERCHAR)
wordType=UPPERWORD;
else
wordType=VARWORD;
}
break;
}
}
}
else
{
encodeWord(s,s_size,wordType,XMLState,c);
s_size=0;
spaces++;
while (true)
{
ENCODE_GETC(c);
if (c!=' ')
break;
spaces++;
}
continue;
}
s[s_size++]=c;
if (s_size>=STRING_MAX_SIZE-2)
{
encodeWord(s,s_size,wordType,XMLState,c);
s_size=0;
}
ENCODE_GETC(c);
continue;
}
encodeWord(s,s_size,wordType,XMLState,c);
s_size=0;
if (XMLState==ADDED2)
XMLState=INSIDE;
else
ENCODE_PUTC(c);
ENCODE_GETC(c);
}
encodeWord(s,s_size,wordType,XMLState,c);
s_size=0;
if (detect && !IF_OPTION(OPTION_UNICODE_LE) && !IF_OPTION(OPTION_UNICODE_BE))
{
if (unicode_le*4/3>fftell/2)
TURN_ON(OPTION_UNICODE_LE)
else
if (unicode_be*4/3>fftell/2)
TURN_ON(OPTION_UNICODE_BE)
PRINT_DICT(("unicode_le=%d unicode_be=%d uni=%d\n",unicode_le,unicode_be,IF_OPTION(OPTION_UNICODE_LE) || IF_OPTION(OPTION_UNICODE_BE)));
}
printf(" + dynamic dictionary %d/%d words\n",sizeDict,dictionary);
}
/**
 * Returns the length of the common prefix of two character buffers.
 *
 * @param offset1 first buffer
 * @param offset2 second buffer
 * @param bound   maximum number of characters to compare; values <= 0 yield 0
 * @return number of leading characters (at most `bound`) that are equal
 */
inline int common(const char* offset1,const char* offset2, int bound)
{
    int lp=0;
    // Check the bound BEFORE dereferencing so index `bound` is never read;
    // this keeps the function safe for buffers that are exactly `bound` long.
    while (lp<bound && offset1[lp]==offset2[lp])
        lp++;
    return lp;
}
/**
 * Serializes the spaces table and the sorted dictionary into a scratch buffer
 * and emits that buffer through the back-end selected by the current options.
 *
 * Buffer layout (starting at writeBuffer+3):
 *   - if OPTION_SPACES_MODELING: one count byte, then one byte per index i
 *     with spacesCont[i]>=minSpacesFreq();
 *   - three bytes holding the number of dictionary words (little-endian);
 *   - the words, each terminated by '\n'. Unless preprocType is LZ77, a word
 *     that shares a prefix with its predecessor (or whose first byte is >=128)
 *     is written as one byte (128 + shared prefix length) followed by the
 *     remaining suffix.
 *
 * @param comprLevel compression level; only used by the Zlib branch.
 */
void XWRT_Encoder::write_dict(int comprLevel)
{
    int i,count=0;
    // The storage of word_hash is reused as the scratch buffer; `bound` leaves
    // WORD_MAX_SIZE bytes of slack before its end.
    unsigned char *bound=(unsigned char*)&word_hash[0]+HASH_TABLE_SIZE*sizeof(word_hash[0])-WORD_MAX_SIZE;
    unsigned char *writeBuffer=(unsigned char*)&word_hash[0]; //putcBuffer;
    unsigned char *bufferData=writeBuffer+3;

    if (IF_OPTION(OPTION_SPACES_MODELING))
    {
        // First pass counts the qualifying run lengths, second pass lists them.
        for (i=0; i<256; i++)
            if (spacesCont[i]>=minSpacesFreq())
                count++;
        PRINT_DICT(("sp_count=%d\n",count));
        bufferData[0]=count;
        bufferData++;
        for (i=0; i<256; i++)
            if (spacesCont[i]>=minSpacesFreq())
            {
                bufferData[0]=i;
                bufferData++;
            }
    }

    // Reserve three bytes for the word count; filled in once it is known.
    unsigned char *count_header=bufferData;
    bufferData+=3;
    // size() is size_t; cast so the argument matches the %d conversion.
    PRINT_DICT(("sortedDict.size()=%d\n",(int)sortedDict.size()));

    int cmn;
    count=(int)sortedDict.size();
    for (i=0; i<count; i++)
    {
        cmn=0;
        if (i>0)
            cmn=common(sortedDict[i-1].c_str(),sortedDict[i].c_str(),min(sortedDict[i].size(),sortedDict[i-1].size()));
        // The prefix length is stored as the single byte 128+cmn, so it must
        // not exceed 127; a longer shared prefix is simply encoded as 127 and
        // the rest of the word is written literally.
        if (cmn>127)
            cmn=127;
        if ((preprocType!=LZ77) && (cmn>0 || (unsigned char)(sortedDict[i][0])>=128))
            bufferData+=sprintf((char*)bufferData,"%c%s\n",128+cmn,sortedDict[i].c_str()+cmn);
        else
            bufferData+=sprintf((char*)bufferData,"%s\n",sortedDict[i].c_str());
        // NOTE(review): the check runs after the write, so the word that
        // crosses `bound` is already in the buffer but is not included in
        // sortedDictSize below.
        if (bufferData>bound)
            break;
    }
    sortedDictSize=(int)i; // i<=count
    PRINT_DICT(("sortedDictCount=%d\n",sortedDictSize));
    count_header[0]=sortedDictSize%256;
    count_header[1]=(sortedDictSize/256)%256;
    count_header[2]=sortedDictSize/65536;

    // From here on `count` is the number of payload bytes after writeBuffer+3.
    count=(int)(bufferData-(writeBuffer+3));
    PRINT_DICT(("write_dict count=%d\n",count));

#ifdef USE_PAQ_LIBRARY
    if (IF_OPTION(OPTION_PAQ))
    {
        // Three-byte length (most significant byte first), then the payload.
        PAQ_encoder->compress(count>>16);
        PAQ_encoder->compress(count>>8);
        PAQ_encoder->compress(count);
        int last=ftell(XWRT_fileout);
        for (i=0; i<count; )
        {
            PAQ_encoder->compress(writeBuffer[3+i]);
            i++;
            // Report progress every 102400 bytes.
            if (i%102400==0)
            {
                printStatus(0,ftell(XWRT_fileout)-last,true);
                last=ftell(XWRT_fileout);
            }
        }
        printStatus(0,ftell(XWRT_fileout)-last,true);
    }
    else
#endif
#ifdef USE_LZMA_LIBRARY
    if (IF_OPTION(OPTION_LZMA))
    {
        int last=LZMAlib_GetOutputFilePos(outStream);
        LZMAlib_EncodeMemToFile(writeBuffer+3,count,outStream);
        printStatus(0,LZMAlib_GetOutputFilePos(outStream)-last,true);
    }
    else
#endif
#ifdef USE_PPMVC_LIBRARY
    if (IF_OPTION(OPTION_PPMVC))
    {
        int last=ftell(XWRT_fileout);
        PPMVClib_EncodeMemToFile(PPMVClib_order,writeBuffer+3,count,XWRT_fileout);
        printStatus(0,ftell(XWRT_fileout)-last,true);
    }
    else
#endif
#ifdef USE_ZLIB_LIBRARY
    if (IF_OPTION(OPTION_ZLIB))
        Zlib_compress(XWRT_fileout,writeBuffer+3,count,zlibBuffer,ZLIB_BUFFER_SIZE,comprLevel);
    else
#endif
    {
        // No compressor selected: three-byte length followed by the raw payload.
        PUTC(count>>16);
        PUTC(count>>8);
        PUTC(count);
        fwrite_fast((unsigned char*)writeBuffer+3,count,XWRT_fileout);
        printStatus(0,count,true);
    }
}
/**
 * Packs the current option flags into two integers.
 *
 * @param c   receives: containers(128) PAQ(64) Zlib(32) PPMVC(16) LZMA(8)
 *            binary-data(4), plus preprocType added on top
 * @param c2  receives: letter-container(128) number-container(64)
 *            spaces-modeling(32) CRLF(16) quotes-modeling(8)
 *            use-dictionary(4) unicode-LE(2) unicode-BE(1)
 */
void XWRT_Encoder::WRT_get_options(int& c,int& c2)
{
    c=c2=0;

    // First value: container / back-end selection and binary flag.
    c+=IF_OPTION(OPTION_USE_CONTAINERS) ? 128 : 0;
    c+=IF_OPTION(OPTION_PAQ) ? 64 : 0;
    c+=IF_OPTION(OPTION_ZLIB) ? 32 : 0;
    c+=IF_OPTION(OPTION_PPMVC) ? 16 : 0;
    c+=IF_OPTION(OPTION_LZMA) ? 8 : 0;
    c+=IF_OPTION(OPTION_BINARY_DATA) ? 4 : 0;
    c+=preprocType; // 0-3

    // Second value: modeling switches and detected text encoding.
    c2+=IF_OPTION(OPTION_LETTER_CONTAINER) ? 128 : 0;
    c2+=IF_OPTION(OPTION_NUMBER_CONTAINER) ? 64 : 0;
    c2+=IF_OPTION(OPTION_SPACES_MODELING) ? 32 : 0;
    c2+=IF_OPTION(OPTION_CRLF) ? 16 : 0;
    c2+=IF_OPTION(OPTION_QUOTES_MODELING) ? 8 : 0;
    c2+=IF_OPTION(OPTION_USE_DICTIONARY) ? 4 : 0;
    c2+=IF_OPTION(OPTION_UNICODE_LE) ? 2 : 0;
    c2+=IF_OPTION(OPTION_UNICODE_BE) ? 1 : 0;
}
/**
 * Drives one complete encoding run: writes the stream header, optionally runs
 * file-type detection, writes the option bytes and the dictionary, and then
 * encodes the input.
 *
 * When OPTION_BINARY_DATA is set the input is copied through ENCODE_PUTC
 * without dictionary processing and the function returns early.
 *
 * @param fileLen        input length in bytes; converted to megabytes and
 *                       capped at 255*256
 * @param type_detected  when false, WRT_detectFileType() is called first
 */
void XWRT_Encoder::WRT_start_encoding(unsigned int fileLen,bool type_detected)
{
    int c,c2,dictPathLen;
    unsigned char s[STRING_MAX_SIZE];
    unsigned char dictPath[STRING_MAX_SIZE];
    s[0]=0;
    lastAll=0;
    getcBufferDataParts=0;
    collision=0;

    // Stream signature: four header bytes followed by a version byte.
    PUTC(XWRT_HEADER[0]);
    PUTC(XWRT_HEADER[1]);
    PUTC(XWRT_HEADER[2]);
    PUTC(XWRT_HEADER[3]);
    PUTC(XWRT_VERSION-150);

    // The length in MB is later written as two bytes (see below), hence the cap.
    fileLenMB=fileLen/(1024*1024);
    if (fileLenMB>255*256)
        fileLenMB=255*256;
    g_fileLenMB=fileLenMB;
    init_PPMVC(fileLenMB,COMPRESS);
    cont.prepareMemBuffers();
    cont.memout->memsize=0;
    clock_t start_time; // in ticks
    start_time=clock();

    // Adjust the word-frequency threshold for smaller inputs.
    if (preprocType!=LZ77 && fileLenMB<32)
        minWordFreq+=3;
    if ((preprocType==PPM || preprocType==PAQ) && fileLenMB<6)
        minWordFreq=250;

    int pos=ftell(XWRT_file);
    if (!type_detected)
        WRT_detectFileType();
#ifdef DYNAMIC_DICTIONARY
    getcBufferSize=getcBufferSizeBak;
#else
    // Rewind to where the input stood before detection consumed it.
    getcBufferSize=0;
    fseek(XWRT_file, pos, SEEK_SET );
#endif

    dictPathLen=getSourcePath((char*)dictPath,sizeof(dictPath));
    // Build "<source path><s>wrt-eng.dic" only when the whole string, including
    // its terminating NUL (counted by sizeof of the literal), fits in dictPath.
    // Otherwise `s` is left untouched, exactly as when no source path exists.
    if (dictPathLen>0 && dictPathLen+strlen((char*)s)+sizeof("wrt-eng.dic")<=sizeof(dictPath))
    {
        dictPath[dictPathLen]=0;
        strcat((char*)dictPath,(char*)s);
        strcat((char*)dictPath,(char*)"wrt-eng.dic");
        strcpy((char*)s,(char*)dictPath);
    }

    WRT_get_options(c,c2); // after make_dict()
    WRT_print_options();
    // Option bytes, memory size in MB, file length in MB (high, low), extra parameter.
    PUTC(c);
    PUTC(c2);
    PUTC(maxMemSize/(1024*1024));
    PUTC(fileLenMB/256);
    PUTC(fileLenMB%256);
    PUTC(additionalParam);

    if (IF_OPTION(OPTION_BINARY_DATA))
    {
        // Binary input: pass every byte straight through until READ_CHAR
        // yields a negative value.
        start_time=clock();
        getcBuffer[0]=0;
        getcBufferData=&getcBuffer[1];
        while (true)
        {
            READ_CHAR(c);
            if (c<0)
                break;
            ENCODE_PUTC(c);
        }
        cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream);
        cont.freeMemBuffers(true);
#ifdef USE_PAQ_LIBRARY
        if (PAQ_encoder)
        {
            PAQ_encoder->flush();
            delete(PAQ_encoder);
            PAQ_encoder=NULL;
        }
#endif
        return;
    }

    PRINT_DICT(("maxMemSize=%d fileLenMB=%d preprocType=%d\n",maxMemSize,fileLenMB,preprocType));
    write_dict(additionalParam); // before initialize()
    memset(detectedSymbols,0,sizeof(detectedSymbols));
    decoding=false;
    WRT_deinitialize();
    if (!initialize(s,true))
        return;
    memset(value,0,sizeof(value));

    // detectedSymbols[0..23] are packed into three bytes, lowest index in the
    // lowest bit, and emitted either directly or through the PAQ encoder.
    if (!IF_OPTION(OPTION_PAQ))
    {
        PUTC(1*detectedSymbols[0]+2*detectedSymbols[1]+4*detectedSymbols[2]+8*detectedSymbols[3]+16*detectedSymbols[4]+32*detectedSymbols[5]+64*detectedSymbols[6]+128*detectedSymbols[7]);
        PUTC(1*detectedSymbols[8]+2*detectedSymbols[9]+4*detectedSymbols[10]+8*detectedSymbols[11]+16*detectedSymbols[12]+32*detectedSymbols[13]+64*detectedSymbols[14]+128*detectedSymbols[15]);
        PUTC(1*detectedSymbols[16]+2*detectedSymbols[17]+4*detectedSymbols[18]+8*detectedSymbols[19]+16*detectedSymbols[20]+32*detectedSymbols[21]+64*detectedSymbols[22]+128*detectedSymbols[23]);
    }
    else
    {
#ifdef USE_PAQ_LIBRARY
        PAQ_encoder->compress(1*detectedSymbols[0]+2*detectedSymbols[1]+4*detectedSymbols[2]+8*detectedSymbols[3]+16*detectedSymbols[4]+32*detectedSymbols[5]+64*detectedSymbols[6]+128*detectedSymbols[7]);
        PAQ_encoder->compress(1*detectedSymbols[8]+2*detectedSymbols[9]+4*detectedSymbols[10]+8*detectedSymbols[11]+16*detectedSymbols[12]+32*detectedSymbols[13]+64*detectedSymbols[14]+128*detectedSymbols[15]);
        PAQ_encoder->compress(1*detectedSymbols[16]+2*detectedSymbols[17]+4*detectedSymbols[18]+8*detectedSymbols[19]+16*detectedSymbols[20]+32*detectedSymbols[21]+64*detectedSymbols[22]+128*detectedSymbols[23]);
#endif
    }

    start_time=clock();
    WRT_encode(getcBufferSize);
    cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream);
    cont.freeMemBuffers(true);
#ifdef USE_PAQ_LIBRARY
    if (PAQ_encoder)
    {
        PAQ_encoder->flush();
        delete(PAQ_encoder);
        PAQ_encoder=NULL;
    }
#endif
}
/**
 * Records the pending run of spaces in the spaces histogram and resets it.
 *
 * @param c first character of the token that follows the spaces; when
 *          OPTION_SPACELESS_WORDS is on and `c` is an ASCII letter, one space
 *          is dropped from the run before it is counted
 */
inline void XWRT_Encoder::setSpaces(int c)
{
    const bool asciiLetter=(c>='a' && c<='z') || (c>='A' && c<='Z');
    if (IF_OPTION(OPTION_SPACELESS_WORDS) && spaces>0 && asciiLetter)
        spaces--;

    // Only runs of length 2..255 are counted.
    const bool countable=(spaces>1 && spaces<256);
    if (countable && IF_OPTION(OPTION_SPACES_MODELING))
        spacesCont[spaces]++;

    spaces=0;
}
/**
 * Registers one token in the dynamic dictionary: bumps its frequency when it
 * is already known, otherwise copies it into the dictionary memory and adds it.
 *
 * @param s        token bytes; may be modified in place (a '>' can be appended
 *                 and pending spaces can be prepended for XML start tags)
 * @param s_size   token length; clamped to WORD_MAX_SIZE on entry and updated
 *                 when `s` is extended
 * @param XMLState XML tag state; advanced according to the token and `c`
 * @param c        the character that follows the token
 */
inline void XWRT_Encoder::checkWord(unsigned char* &s,int &s_size,EXMLState& XMLState,int& c)
{
    if (s_size<1)
    {
        setSpaces('-');
        return;
    }
    if (s_size>WORD_MAX_SIZE)
        s_size=WORD_MAX_SIZE;

    // A token seen in a CLOSE state only closes the tag; it is not counted.
    if (XMLState==CLOSE || XMLState==CLOSE_EOL)
    {
        XMLState=CLOSED;
        setSpaces(s[0]);
        return;
    }
    if (XMLState==OPEN)
    {
        if (c!='!' && c!='?')
            XMLState=ADDED;
        else
            XMLState=INSIDE;
        // A tag that ends right after its name keeps the '>' as part of the token.
        if (c=='>')
        {
            s[s_size++]=c;
            XMLState=ADDED2;
        }
    }

    if (s[0]=='<' && (XMLState==ADDED || XMLState==ADDED2))
    {
        // Fold the pending spaces into the token itself when they fit.
        if (spaces+s_size<STRING_MAX_SIZE)
        {
            memmove(s+spaces,s,s_size);
            memset(s,' ',spaces);
            s_size+=spaces;
        }
        spaces=0;
    }
    else
        setSpaces(s[0]);

    if (s_size<WORD_MIN_SIZE)
    {
        setSpaces('-');
        return;
    }

    int i;
    checkHashExactly(s,s_size,i);
    if (i<0)
    {
        // dictbound lies WORD_MAX_SIZE bytes before the end of the dictionary
        // memory, but s_size may exceed WORD_MAX_SIZE here (appended '>' or
        // prepended spaces). Require room for the token plus a terminating
        // NUL so the copy below can never run past the buffer.
        if (dynmem>dictbound || s_size>=(dictbound+WORD_MAX_SIZE)-dynmem)
        {
            if (firstWarn)
            {
                printf("warning: dictionary too big, you can use -b option to increase buffer size\n");
                firstWarn=false;
            }
            return;
        }
        memcpy(dynmem,s,s_size);
        // dynmem is not advanced when addWord() rejects a word, so bytes of an
        // earlier, longer candidate may still follow; terminate explicitly so
        // the stored word is always a valid C string.
        dynmem[s_size]=0;
        if (addWord(dynmem,s_size)==1)
        {
            // Advance to the next multiple of four past the word (always
            // leaves at least one byte for the terminator).
            dynmem+=(s_size/4+1)*4;
            dictfreq[sizeDict-1]=1;
        }
    }
    else
    {
        dictfreq[i]++;
    }
}
/**
 * First pass over the input: allocates the dynamic-dictionary tables, runs
 * WRT_encode(0) in detect mode, and turns on OPTION_BINARY_DATA / OPTION_CRLF
 * based on the collected counters before finishing the dictionary.
 *
 * @return preprocFlag after detection
 */
int XWRT_Encoder::WRT_detectFileType()
{
    detect=true;
    memset(value,0,sizeof(value));
    memset(addSymbols,0,sizeof(addSymbols));
    memset(reservedSet,0,sizeof(reservedSet));
    memset(spacesCont,0,sizeof(spacesCont));
    quotes=0;
    spaces=0;
    firstWarn=true;
    sizeDict=1; // dictionary entries are stored from index 1
    PRINT_DICT(("maxDynDictBuf=%d maxMemSize=%d\n",maxDynDictBuf,maxMemSize));

    dictionary=maxDynDictBuf*(MAX_DYNAMIC_DICT_COUNT/256); // 512k, dblp=372k
    dictmem=(unsigned char*)calloc(dictionary*WORD_AVG_SIZE,1);
    // Keep WORD_MAX_SIZE bytes of slack before the end of dictmem.
    dictbound=dictmem+dictionary*WORD_AVG_SIZE-WORD_MAX_SIZE;
    dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1);
    dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1);
    dictfreq=(int*)calloc(sizeof(int)*(dictionary+1),1);
    memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0]));
    dynmem=dictmem;
    // The size expressions are size_t; cast so they match the %d conversions.
    PRINT_DICT(("maxDict=%d allocatedMemory=%d hashTable=%d\n",dictionary,(int)(dictionary*WORD_AVG_SIZE+sizeof(unsigned char*)*(dictionary+1)+sizeof(unsigned char)*(dictionary+1)+sizeof(int)*(dictionary+1)),(int)(HASH_TABLE_SIZE*sizeof(word_hash[0]))));

    // Detection is skipped entirely when any allocation failed.
    if (dictmem && dict && dictlen && dictfreq)
    {
        initializeLetterSet();
        clock_t start_time=clock();
        if (fileLenMB>0)
            getcBufferDataParts=1+((1024*1024/(mFileBufferSize))*(fileLenMB+1))/firstPassBlock;
        PRINT_DICT(("firstPassBlock=%d fileLenMB=%d getcBufferDataParts=%d\n",firstPassBlock,fileLenMB,getcBufferDataParts));
#ifdef DYNAMIC_DICTIONARY
        getcBufferDataParts=2;
#endif
        WRT_encode(0);
        // ftell() returns long; cast so the argument matches %d.
        PRINT_DICT(("bincount=%d/%d\n",binCount,(int)(ftell(XWRT_file)/100)));
        // More than one counted byte per hundred read marks the input as binary.
        if (binCount>ftell(XWRT_file)/100) // (for textual files in UTF-8)
            TURN_ON(OPTION_BINARY_DATA);
        // NOTE(review): value[13] and value[10] are presumably the CR and LF
        // byte counts - confirm against where `value` is filled.
        if (value[13]>value[10]/2)
            TURN_ON(OPTION_CRLF);
        PRINT_DICT(("+ WRT_detectFileType time %1.2f sec\n",double(clock()-start_time)/CLOCKS_PER_SEC));
        WRT_detectFinish();
    }

    WRT_deinitialize();
    if (collision>0)
        PRINT_DICT(("warning: hash collisions=%d\n",collision));
    detect=false;
    getcBufferDataParts=0;
    WRT_print_options();
    return preprocFlag;
}
/**
 * qsort comparator: orders two dictionary indices by strcmp() of the words
 * they refer to in `dict`.
 */
int compare_str( const void *arg1, const void *arg2 )
{
    const int *lhsIndex=(const int*)arg1;
    const int *rhsIndex=(const int*)arg2;
    const char *lhsWord=(const char*)dict[*lhsIndex];
    const char *rhsWord=(const char*)dict[*rhsIndex];
    return strcmp(lhsWord,rhsWord);
}
int compare_str_rev( const void *arg1, const void *arg2 )
{
int a=*(int*)arg1;
int b=*(int*)arg2;
int minv=min(dictlen[a],dictlen[b]);
for (int i=1; i<=minv; i++)
{
if (dict[a][dictlen[a]-i]!=dict[b][dictlen[b]-i])
return dict[a][dictlen[a]-i] - dict[b][dictlen[b]-i];
}
return dictlen[a] - dictlen[b];
}
/**
 * qsort comparator: orders two dictionary indices by descending frequency
 * taken from `dictfreq`.
 */
int compare_freq( const void *arg1, const void *arg2 )
{
    const int freqFirst=dictfreq[*(const int*)arg1];
    const int freqSecond=dictfreq[*(const int*)arg2];
    // Higher frequency sorts first.
    return freqSecond-freqFirst;
}
/**
 * Orders the dictionary words and appends them to `sortedDict`.
 *
 * Words are first sorted by descending frequency. Unless preprocType is LZ77,
 * each of the ranges [0,dict1size), [dict1size,bound3), [bound3,bound4) and
 * [bound4,size) is then sorted alphabetically on its own.
 *
 * @param size value of sizeDict; one less than this many words (indices
 *             1..size-1 of `dict`) are processed. Nothing is done when fewer
 *             than 20 words remain.
 */
void XWRT_Encoder::sortDict(int size)
{
    int i,add;
    size--;
    if (size<20)
        return;
    initializeCodeWords(size,false);

    // Count the entries taken up by quotes and spaces modeling; the range
    // boundaries are reduced by that amount.
    add=0;
    if (IF_OPTION(OPTION_QUOTES_MODELING))
        add+=2;
    if (IF_OPTION(OPTION_SPACES_MODELING))
    {
        for (i=0; i<256; i++)
            if (spacesCont[i]>=minSpacesFreq())
                add++;
    }
    dict1size-=add;
    bound3-=add;
    bound4-=add;

    // Sort indices rather than the words themselves; entry k refers to dict[k+1].
    int* inttable=new int[size];
    if (!inttable)
        OUT_OF_MEMORY();
    for (i=0; i<size; i++)
        inttable[i]=i+1;
    qsort(&inttable[0],size,sizeof(inttable[0]),compare_freq);
    if (preprocType!=LZ77)
    {
        qsort(&inttable[0],min(size,dict1size),sizeof(inttable[0]),compare_str);
        if (size>dict1size)
            qsort(&inttable[dict1size],min(size,bound3)-dict1size,sizeof(inttable[0]),compare_str);
        if (size>bound3)
            qsort(&inttable[bound3],min(size,bound4)-bound3,sizeof(inttable[0]),compare_str);
        if (size>bound4)
            qsort(&inttable[bound4],size-bound4,sizeof(inttable[0]),compare_str);
    }
    for (i=0; i<size; i++)
    {
        std::string str=(char*)dict[inttable[i]];
        sortedDict.push_back(str);
    }
    // Allocated with new[], so it must be released with delete[].
    delete[] inttable;
}
/**
 * Finishes the detection pass: decides on spaces/quotes modeling, drops
 * dictionary words that are too rare, compacts the dictionary arrays and
 * hands the result to sortDict().
 *
 * A word is kept when its frequency is at least minWordFreq, or at least
 * minWordFreq2 (minWordFreq-2 once minWordFreq is 6 or more) and its length
 * is 7 or more.
 */
void XWRT_Encoder::WRT_detectFinish()
{
    int i,j;

    // Spaces modeling is enabled only if at least one run length is frequent enough.
    TURN_OFF(OPTION_SPACES_MODELING);
    for (i=0; i<256; i++)
        if (spacesCont[i]>=minSpacesFreq())
            TURN_ON(OPTION_SPACES_MODELING);
    PRINT_DICT(("%d words ",sizeDict-1));
    sortedDict.clear();

    int num;
    int minWordFreq2;
    if (minWordFreq<6)
        minWordFreq2=minWordFreq;
    else
        minWordFreq2=minWordFreq-2;

    // Mark rejected words by zeroing their frequency.
    // NOTE(review): both loops stop before index sizeDict-1, so the last
    // added word is never examined - confirm this is intended.
    for (i=1; i<sizeDict-1; i++)
    {
        num=dictfreq[i];
        const bool keep=(num>=minWordFreq || (num>=minWordFreq2 && (dictlen[i]>=7)));
        if (!keep)
            dictfreq[i]=0;
    }

    // Compact: fill each rejected slot from the front with a kept word taken
    // from the back.
    for (i=1, j=sizeDict-2; i<j; i++)
    {
        if (dictfreq[i]>0)
            continue;
        while (j>0 && dictfreq[j]==0) j--;
        if (i>j)
            break;
        dict[i]=dict[j];
        // Move the length together with the word so the parallel arrays stay
        // consistent (compare_str_rev indexes words through dictlen).
        dictlen[i]=dictlen[j];
        dictfreq[i]=dictfreq[j];
        dictfreq[j--]=0;
    }
    sizeDict=i;
    if (sizeDict>maxDictSize)
        sizeDict=maxDictSize;
    PRINT_DICT(("reduced to %d words (freq>=%d)\n",sizeDict,minWordFreq));

    if (quotes>=minSpacesFreq())
        TURN_ON(OPTION_QUOTES_MODELING);
    sortDict(sizeDict);
    PRINT_DICT(("quotes=%d\n",quotes));
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -