⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 encoder.cpp

📁 x-wrt is the GUI configuration tool for OpenWrt, which is an open project for wireless routers
💻 CPP
📖 第 1 页 / 共 4 页
字号:
#include "XWRT.h"
#include <time.h>
#include <new>

#if defined WIN32 || defined WIN64
	#include <windows.h>
	#include <conio.h>
#endif

#include "Common.h"
#include "MemBuffer.h"
#include "Encoder.h"

extern int g_fileLenMB;

// Constructs the encoder: resets the UTF-8 conversion cache and the character
// history, and allocates the input read buffer and the zlib work buffer.
// The read buffer holds mFileBufferSize+1 bytes because readGetcBuffer() keeps
// one previously read byte in slot 0 and reads new data starting at slot 1.
XWRT_Encoder::XWRT_Encoder() : utf8cached(0), utf8pos(0), last_c_bak(0)
{
	// Plain new[] throws std::bad_alloc and never returns NULL, which would make
	// the check below dead code; the nothrow form lets OUT_OF_MEMORY() run.
	getcBuffer=new (std::nothrow) unsigned char[mFileBufferSize+1];
	zlibBuffer=new (std::nothrow) unsigned char[ZLIB_BUFFER_SIZE];

	if (!getcBuffer || !zlibBuffer)
		OUT_OF_MEMORY();

#ifdef USE_LZMA_LIBRARY
	LZMAlib_Init(8);
	outStream=new COutFileStream;
	oStream=outStream;
#endif
}

// Releases the buffers allocated by the constructor.
XWRT_Encoder::~XWRT_Encoder()
{
	// Both buffers come from new[], so the array form of delete is required;
	// delete[] on a null pointer is a no-op, so no explicit check is needed.
	delete[] getcBuffer;
	delete[] zlibBuffer;
}

// ENCODE_PUTC(c): writes one byte `c` to the main output buffer (cont.memout).
// Nothing is written while `detect` is set.
// Before the byte is stored, if cont.memout->memsize exceeds maxMemSize the
// buffers are written out through cont.writeMemBuffers() and memsize is reset to 0.
// NOTE: the macro expands to a bare `{ ... }` block and mentions `c` more than
// once, so pass only side-effect-free expressions.
#define ENCODE_PUTC(c)\
{ \
	if (!detect) \
	{ \
		if (cont.memout->memsize>maxMemSize) \
		{ \
			PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \
			cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream); \
			cont.memout->memsize=0; \
		} \
 \
		PRINT_CHARS(("output=%d (%c)\n",c,c)); \
		cont.memout->OutTgtByte(c); \
	} \
}

// READ_CHAR(c): stores the next input byte in `c`.
// While bytes remain in the in-memory buffer (getcBufferSize != 0) the byte is
// taken from getcBufferData and the pointer/counter are advanced; otherwise
// readGetcBuffer(XWRT_file,c) is called, which refills the buffer or sets c to EOF.
#define READ_CHAR(c)\
{\
	if (getcBufferSize) \
	{ \
		c=getcBufferData[0]; \
		getcBufferData++; \
		getcBufferSize--; \
	} \
	else \
		readGetcBuffer(XWRT_file,c); \
}

// ENCODE_UNICODE_GETC(c): reads one input unit into `c`.
// First the character history is shifted: last_last_c takes last_c, and last_c
// takes last_c_bak (the value produced by the previous invocation).
// One byte is read with READ_CHAR. If it is non-negative and OPTION_UNICODE_BE
// or OPTION_UNICODE_LE is set, a second byte `d` is read and the two are
// combined (BE: 256*c+d, LE: c+256*d). When the second read yields a negative
// value, d is replaced by 65536 (BE) or 256 (LE) before combining.
// The resulting value is saved in last_c_bak.
#define ENCODE_UNICODE_GETC(c) \
{ \
	last_last_c=last_c; \
	last_c=last_c_bak; \
 \
 	READ_CHAR(c); \
 \
	if (c>=0) \
	{ \
		int d; \
		if (IF_OPTION(OPTION_UNICODE_BE)) \
		{ \
		 	READ_CHAR(d); \
			if (d<0) \
				d=65536; \
			c=256*c+d; \
		} \
		else \
		if (IF_OPTION(OPTION_UNICODE_LE)) \
		{ \
		 	READ_CHAR(d); \
			if (d<0) \
				d=256; \
			c+=256*d; \
		} \
	} \
 \
	last_c_bak=c; \
}

// ENCODE_GETC(c): stores the next byte to be processed in `c`.
// If bytes from an earlier conversion are still queued in utf8buff
// (utf8cached > 0), the next queued byte is returned.
// Otherwise one unit is read with ENCODE_UNICODE_GETC. When a Unicode option
// (BE or LE) is set and the value is >= 0x80, it is converted with
// unicode2utf8() into utf8buff; the first byte is returned and the remaining
// ones stay queued for the following invocations.
#define ENCODE_GETC(c) \
{\
	if (utf8cached>0) \
	{ \
		c=utf8buff[utf8pos++]; \
		utf8cached--; \
	} \
	else \
	{ \
		ENCODE_UNICODE_GETC(c); \
	\
		if (c>=0x80 && (IF_OPTION(OPTION_UNICODE_BE) || IF_OPTION(OPTION_UNICODE_LE))) \
	 	{ \
			utf8cached=unicode2utf8(c,&utf8buff[0]); \
			utf8pos=0; \
		\
			c=utf8buff[utf8pos++]; \
			utf8cached--; \
		} \
	} \
} 

// Refills the read buffer from `file` and returns the first newly read byte in
// `c`; `c` is set to EOF when no refill is allowed or nothing could be read.
inline void XWRT_Encoder::readGetcBuffer(FILE* &file,int &c)
{
	// A part counter of exactly 1 means EOF is reported without reading;
	// larger positive values are counted down once per refill.
	if (getcBufferDataParts==1)
	{
		c=EOF;
		return;
	}
	if (getcBufferDataParts>1)
		getcBufferDataParts--;

	// Keep the byte just before the current read position in slot 0, so it
	// sits directly in front of the freshly read data.
	getcBuffer[0]=getcBufferData[-1];
	getcBufferSize=fread_fast(getcBuffer+1,mFileBufferSize,file);
	getcBufferData=getcBuffer+1;
	getcBufferSizeBak=getcBufferSize;

	if (getcBufferSize==0)
	{
		c=EOF;
		return;
	}

	if (!detect)
		printStatus((int)getcBufferSize,0,true);

	// consume the first byte of the new data
	c=*getcBufferData;
	getcBufferData++;
	getcBufferSize--;
}

// Encodes code point `cp` as a UTF-8 byte sequence written to `result`
// (up to 4 bytes) and returns the number of bytes written.
inline int XWRT_Encoder::unicode2utf8(unsigned int cp, unsigned char* result)
{
	if (cp < 0x80)
	{
		// single octet
		result[0] = static_cast<unsigned char>(cp);
		return 1;
	}

	if (cp < 0x800)
	{
		// lead byte 110xxxxx + one continuation byte
		result[0] = static_cast<unsigned char>(0xc0 | (cp >> 6));
		result[1] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
		return 2;
	}

	if (cp < 0x10000)
	{
		// lead byte 1110xxxx + two continuation bytes
		result[0] = static_cast<unsigned char>(0xe0 | (cp >> 12));
		result[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
		result[2] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
		return 3;
	}

	// lead byte 11110xxx + three continuation bytes
	result[0] = static_cast<unsigned char>(0xf0 | (cp >> 18));
	result[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3f));
	result[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
	result[3] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
	return 4;
}


// Encodes word `s` (length `s_size`) literally, for words that were not found
// in the dictionary.
//
// Without OPTION_LETTER_CONTAINER:
//   - with DYNAMIC_DICTIONARY, a word of at least WORD_MIN_SIZE bytes that
//     addWord() accepts is emitted as CHAR_NEWWORD, the word bytes and a 0 byte;
//   - otherwise every byte is written to the main stream, preceded by
//     CHAR_ESCAPE when addSymbols[] flags that byte.
// With OPTION_LETTER_CONTAINER a one-byte flag goes to the main stream and the
// letters go to a separate container:
//   'A' single letter              -> memout_letters
//   'E' LOWERWORD (first letter stored upper-cased) -> memout_words2
//   'C' FIRSTUPPER                 -> memout_words3
//   'D' UPPERWORD (rest stored lower-cased)         -> memout_words4
//   'B' anything else, followed by ' '              -> memout_words
// A word whose first character is not a letter a-z/A-Z is treated as VARWORD.
inline void XWRT_Encoder::encodeAsText(unsigned char* &s,int &s_size,EWordType wordType)
{
	int i=0;

	if (!IF_OPTION(OPTION_LETTER_CONTAINER))
	{
#ifdef DYNAMIC_DICTIONARY
		// Check the free-space bound BEFORE copying: the original copied the word
		// into `mem` first, writing to dictionary memory even when it was full.
		// The condition under which addWord() is called is unchanged.
		if (s_size>=WORD_MIN_SIZE && mem<dictmem_end)
		{
			memcpy(mem,s,s_size);

			if (addWord(mem,s_size)!=0)
			{
				// advance to the next 4-byte step past the stored word
				mem+=(s_size/4+1)*4;

				ENCODE_PUTC(CHAR_NEWWORD);
				for (i=0; i<s_size; i++)
					ENCODE_PUTC(s[i]);
				ENCODE_PUTC(0);
				return;
			}
		}
#endif

		for (i=0; i<s_size; i++)
		{
			if (addSymbols[s[i]])
				ENCODE_PUTC(CHAR_ESCAPE);
			ENCODE_PUTC(s[i]);
		}
		return;
	}


	if (s_size==1)
	{
		ENCODE_PUTC('A');
		cont.memout_letters->OutTgtByte(s[0]);
		return;
	}

	i=tolower(s[0]);
	if (i<'a' || i>'z')
		wordType=VARWORD;

	switch (wordType)
	{
		case LOWERWORD:
			ENCODE_PUTC('E');

			cont.memout_words2->OutTgtByte(toupper(s[0]));

			for (i=1; i<s_size; i++)
				cont.memout_words2->OutTgtByte(s[i]);
			break;

		case FIRSTUPPER:
			ENCODE_PUTC('C');

			for (i=0; i<s_size; i++)
				cont.memout_words3->OutTgtByte(s[i]);
			break;

		case UPPERWORD:
			ENCODE_PUTC('D');

			cont.memout_words4->OutTgtByte(s[0]);

			for (i=1; i<s_size; i++)
				cont.memout_words4->OutTgtByte(tolower(s[i]));
			break;

		default:
			ENCODE_PUTC('B');

			for (i=0; i<s_size; i++)
				cont.memout_words->OutTgtByte(s[i]);
			cont.memout_words->OutTgtByte(' ');
			break;
	}

#ifdef DYNAMIC_DICTIONARY
	// Same ordering fix as above: only copy into `mem` once it is known to be
	// below dictmem_end.
	if (s_size>=WORD_MIN_SIZE && mem<dictmem_end)
	{
		memcpy(mem,s,s_size);

		if (addWord(mem,s_size)!=0)
			mem+=(s_size/4+1)*4;
	}
#endif

}

// Emits the codeword for dictionary index `i` (1-based; i-1 is encoded) in the
// LZMA layout: 1 to 4 bytes, most significant part first. Leading bytes are
// looked up at sym2codeword[dict1size+digit], the last byte at sym2codeword[digit].
inline void XWRT_Encoder::encodeCodeWord_LZMA(int &i)
{
	int rest=i-1;

	// four-byte codeword
	if (rest>=bound4)
	{
		rest-=bound4;

		const int digit4=rest/dict123size;
		rest%=dict123size;
		const int digit3=rest/dict12size;
		rest%=dict12size;
		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1size+digit4]);
		ENCODE_PUTC(sym2codeword[dict1size+digit3]);
		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		ENCODE_PUTC(sym2codeword[rest]);
		return;
	}

	// three-byte codeword
	if (rest>=bound3)
	{
		rest-=bound3;

		const int digit3=rest/dict12size;
		rest%=dict12size;
		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1size+digit3]);
		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		ENCODE_PUTC(sym2codeword[rest]);
		return;
	}

	// two-byte codeword
	if (rest>=dict1size)
	{
		rest-=dict1size;

		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		ENCODE_PUTC(sym2codeword[rest]);
		return;
	}

	// one-byte codeword
	ENCODE_PUTC(sym2codeword[rest]);
}


// Emits the codeword for dictionary index `i` (1-based; i-1 is encoded) in the
// LZ layout: only the first byte goes through sym2codeword[]; the following
// bytes are written as raw base-256 digits.
inline void XWRT_Encoder::encodeCodeWord_LZ(int &i)
{
	int rest=i-1;

	// three-byte codeword
	if (rest>=bound3)
	{
		rest-=bound3;

		const int high=rest/(256*256);
		rest%=(256*256);
		const int mid=rest/256;
		rest%=256;

		ENCODE_PUTC(sym2codeword[dict1size+dict2size+high]);
		ENCODE_PUTC(mid);
		ENCODE_PUTC(rest);
		return;
	}

	// two-byte codeword
	if (rest>=dict1size)
	{
		rest-=dict1size;

		const int mid=rest/256;
		rest%=256;

		ENCODE_PUTC(sym2codeword[dict1size+mid]);
		ENCODE_PUTC(rest);
		return;
	}

	// one-byte codeword
	ENCODE_PUTC(sym2codeword[rest]);
}

// Emits the codeword for dictionary index `i` (1-based; i-1 is encoded) in the
// PPM layout: 1 to 4 bytes, most significant part first. Each byte position
// uses its own sym2codeword[] range (offsets dict1plus2plus3, dict1plus2,
// dict1size, 0). PRINT_CODEWORDS traces every emitted byte.
inline void XWRT_Encoder::encodeCodeWord_PPM(int &i)
{
	int rest=i-1;

	if (rest>=bound4)
	{
		// four-byte codeword
		rest-=bound4;

		const int digit4=rest/dict123size;
		rest%=dict123size;
		const int digit3=rest/dict12size;
		rest%=dict12size;
		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1plus2plus3+digit4]);
		PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1plus2plus3+digit4]));

		ENCODE_PUTC(sym2codeword[dict1plus2+digit3]);
		PRINT_CODEWORDS(("2nd=%d ",sym2codeword[dict1plus2+digit3]));

		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		PRINT_CODEWORDS(("3rd=%d ",sym2codeword[dict1size+digit2]));

		ENCODE_PUTC(sym2codeword[rest]);
		PRINT_CODEWORDS(("4th=%d ",sym2codeword[rest]));
	}
	else if (rest>=bound3)
	{
		// three-byte codeword
		rest-=bound3;

		const int digit3=rest/dict12size;
		rest%=dict12size;
		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1plus2+digit3]);
		PRINT_CODEWORDS(("1st=%d(%d) ",sym2codeword[dict1plus2+digit3],digit3));

		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		PRINT_CODEWORDS(("2nd=%d(%d) ",sym2codeword[dict1size+digit2],digit2));

		ENCODE_PUTC(sym2codeword[rest]);
		PRINT_CODEWORDS(("3rd=%d(%d) ",sym2codeword[rest],rest));
	}
	else if (rest>=dict1size)
	{
		// two-byte codeword
		rest-=dict1size;

		const int digit2=rest/dict1size;
		rest%=dict1size;

		ENCODE_PUTC(sym2codeword[dict1size+digit2]);
		PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1size+digit2]));

		ENCODE_PUTC(sym2codeword[rest]);
		PRINT_CODEWORDS(("2nd=%d ",sym2codeword[rest]));
	}
	else
	{
		// one-byte codeword
		ENCODE_PUTC(sym2codeword[rest]);
		PRINT_CODEWORDS(("1st=%d ",sym2codeword[rest]));
	}

	PRINT_CODEWORDS((" no=%d %s\n", no-1,dict[no]));
}

// Dispatches to the codeword writer matching the active codewordType:
// LZ77 -> encodeCodeWord_LZ, LZMA -> encodeCodeWord_LZMA, anything else -> encodeCodeWord_PPM.
inline void XWRT_Encoder::encodeCodeWord(int &i)
{
	if (codewordType==LZ77)
	{
		encodeCodeWord_LZ(i);
		return;
	}

	if (codewordType==LZMA)
	{
		encodeCodeWord_LZMA(i);
		return;
	}

	encodeCodeWord_PPM(i);
}

// Flushes the pending run of `spaces` space characters and resets the counter.
// A single space is written literally. Longer runs are split greedily: the
// longest run length (at most 255) that has an entry in spacesCodeword[] is
// emitted as that codeword; when no entry is found one literal space is written.
inline void XWRT_Encoder::encodeSpaces()
{
	if (spaces==1)
	{
		ENCODE_PUTC(' ');
		spaces=0;
		return;
	}

	while (spaces>0)
	{
		// candidate run length, capped at 255
		int run=spaces;
		if (run>=256)
			run=255;

		// shrink until a run length with a codeword is found (or 0 is reached)
		while (run>0 && spacesCodeword[run]==0)
			run--;

		if (spacesCodeword[run])
		{
			encodeCodeWord(spacesCodeword[run]);
			spaces-=run;
		}
		else
		{
			ENCODE_PUTC(' ');
			spaces--;
		}
	}

	spaces=0;
}

// Computes the hash-table slot for the `len` bytes at `ptr` and stores it in
// `hash`: a multiplicative hash (multiplier HASH_MULT) masked with
// HASH_TABLE_SIZE-1.
inline void XWRT_Encoder::stringHash(const unsigned char *ptr, int len,int& hash)
{
	// Accumulate in unsigned arithmetic: the multiplication is expected to wrap,
	// and wrap-around is only well defined for unsigned types (signed overflow
	// is undefined behaviour). findShorterWord() uses the same unsigned form.
	unsigned int acc=0;

	for (; len>0; len--, ptr++)
		acc=HASH_MULT*acc+*ptr;

	hash=static_cast<int>(acc&(HASH_TABLE_SIZE-1));
}

// Looks up word `s` (exact length `s_size`) in the dictionary hash table.
// On return `i` holds the dictionary index of the word, or -1 when it is not
// found. Up to three slots are probed: the primary hash, then two slots offset
// by s_size*HASH_DOUBLE_MULT and s_size*HASH_DOUBLE_MULT^2. An empty slot ends
// the search. Indices >= `dictionary` are rejected.
inline void XWRT_Encoder::checkHashExactly(unsigned char* &s,int &s_size,int& i)
{
	int h;
	stringHash(s,s_size,h);

	int slot=h;
	for (int probe=0; ; probe++)
	{
		i=word_hash[slot];

		// empty slot: the word is not present
		if (i<=0)
		{
			i=-1;
			break;
		}

		// same length and same bytes: found
		if (dictlen[i]==s_size && memcmp(dict[i],s,s_size)==0)
			break;

		// third probe also mismatched: give up
		if (probe==2)
		{
			i=-1;
			break;
		}

		slot=(probe==0)
			? (h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)
			: (h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1);
	}

	if (i>=dictionary)
		i=-1;
}

// Looks up the first `s_size` bytes of `s` (a prefix of the original word) in
// the dictionary hash table, starting from the precomputed slot `h`.
// Returns the dictionary index, or -1 when no entry matches. An entry matches
// when its length does not exceed s_size and its first s_size bytes compare
// equal. Up to three slots are probed; an empty slot ends the search.
// Indices >= `dictionary` are rejected.
inline int XWRT_Encoder::checkHash(unsigned char* &s,int &s_size,int h)
{
	int found;
	int slot=h;

	for (int probe=0; ; probe++)
	{
		found=word_hash[slot];

		// empty slot: nothing stored here
		if (found<=0)
		{
			found=-1;
			break;
		}

		// candidate is acceptable
		if (dictlen[found]<=s_size && memcmp(dict[found],s,s_size)==0)
			break;

		// third probe also mismatched: give up
		if (probe==2)
		{
			found=-1;
			break;
		}

		slot=(probe==0)
			? (h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)
			: (h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1);
	}

	if (found>=dictionary)
		found=-1;

	return found;
}

// Searches the dictionary for prefixes of word `s`.
// Prefix lengths from WORD_MIN_SIZE+tryShorterBound up to s_size-1 are tried in
// increasing order, with the hash extended one byte at a time; the index of the
// last (longest) matching prefix is returned, or -1 when none matches.
inline int XWRT_Encoder::findShorterWord(unsigned char* &s,int &s_size)
{
	unsigned int rolling=0;
	int len;

	// hash of the shortest prefix that will be tried
	for (len=0; len<WORD_MIN_SIZE+tryShorterBound; len++)
		rolling=HASH_MULT*rolling+s[len];

	int longest=-1;
	len=WORD_MIN_SIZE+tryShorterBound;
	while (len<s_size)
	{
		const int hit=checkHash(s,len,rolling&(HASH_TABLE_SIZE-1));
		if (hit>=0)
			longest=hit;

		// extend the hash to cover the next byte
		rolling=HASH_MULT*rolling+s[len];
		len++;
	}

	return longest;
}

// Upper-cases the first `s_size` bytes of `s` in place.
inline void XWRT_Encoder::toUpper(unsigned char* s,int &s_size)
{
	for (int left=s_size; left>0; left--, s++)
		*s=toupper(*s);
}


// Lower-cases the first `s_size` bytes of `s` in place.
inline void XWRT_Encoder::toLower(unsigned char* s,int &s_size)
{
	for (int left=s_size; left>0; left--, s++)
		*s=tolower(*s);
}


// Encodes a token that mixes digits and other characters by splitting it into
// alternating runs: each run of NUMBERCHAR characters is passed to encodeWord()
// as a NUMBER, each run of other characters to encodeAsText() as a VARWORD.
// Processing always starts with a (possibly empty) number run.
void XWRT_Encoder::encodeMixed(unsigned char* s,int s_size,EXMLState& XMLState,int& old_c)
{
	int pos=0;

	for (;;)
	{
		// --- run of number characters ---
		int runStart=pos;
		for (;;)
		{
			letterType=letterSet[s[pos++]];
			if (pos>=s_size || letterType!=NUMBERCHAR)
				break;
		}

		// the scan stopped on a non-number character: it belongs to the next run
		if (letterType!=NUMBERCHAR)
			pos--;

		encodeWord(s+runStart,pos-runStart,NUMBER,XMLState,old_c);

		if (pos>=s_size)
			break;

		// --- run of non-number characters ---
		runStart=pos;
		for (;;)
		{
			letterType=letterSet[s[pos++]];
			if (pos>=s_size || letterType==NUMBERCHAR)
				break;
		}

		// the scan stopped on a number character: it belongs to the next run
		if (letterType==NUMBERCHAR)
			pos--;

		unsigned char* text=s+runStart;
		int textSize=pos-runStart;
		encodeAsText(text,textSize,VARWORD);

		if (pos>=s_size)
			break;
	}
}

// encode word "s" using dictionary
void XWRT_Encoder::encodeWord(unsigned char* s,int s_size,EWordType wordType,EXMLState& XMLState,int& c)
{
	if (detect)
	{
		checkWord(s,s_size,XMLState,c);
		return;
	}

	if (s_size<1)
	{
		encodeSpaces();
		return;
	}

	int i=-1;
	int size=0;
	int flagToEncode=-1;
	bool justAdded=false;
	

	if (XMLState==OPEN)
	{
		if (s_size>1)
		{
			PRINT_STACK(("push s=%s c=%c (%d) s_size=%d\n",s,c,c,s_size));
			justAdded=true;
			XMLState=ADDED;
			s[s_size]=0;
			stack.push_back((char*)s);
		}
		else
			XMLState=INSIDE;

		if (c=='>')
		{
			s[s_size++]=c;
			XMLState=ADDED2; // no encoding '>'
		}
	}
	else
		if (XMLState==CLOSE || XMLState==CLOSE_EOL)
		{
			encodeSpaces();

			static std::string str;
			if (stack.size()>0)
			{
				str=stack.back();
				stack.pop_back();
			}
			else
				str.erase();
		
			PRINT_STACK(("pop str=%s s=%s \n",str.c_str(),s));
			if (s_size==str.size() && memcmp(s,str.c_str(),s_size)==0)
			{
				if (c=='>')
				{
					if (XMLState==CLOSE)
						ENCODE_PUTC(CHAR_END_TAG)
					else
						ENCODE_PUTC(CHAR_END_TAG_EOL);

					XMLState=CLOSED;

					cont.MemBufferPopBack();
					return; 
				}
			}

			memmove(s+1,s,s_size); 
			s[0]='<';
			s[1]='/';
			s_size++; // <tag -> </tag

			if (c=='>')
			{
				s[s_size]='>';
				s_size++; // </tag -> </tag>
			}

			if (XMLState==CLOSE_EOL)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -