⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 encoder.cpp

📁 X-Wrt is the GUI configuration tool for OpenWrt, which is an open project for wireless routers
💻 CPP
📖 第 1 页 / 共 4 页
字号:
				ENCODE_GETC(c);
				
				if (s_size==2 && wordType==FIRSTUPPER && letterSet[c]==UPPERCHAR)
					wordType=UPPERWORD;
				
				continue;
			}
#endif
		}

		if (wordSet[c])
		{
			if (c!=' ')
			{
				if (s_size==0)
				{
					if (last_c!=' ')
						beforeWord=last_c;
					else
						beforeWord=last_last_c;

					if (letterType==LOWERCHAR)
						wordType=LOWERWORD;
					else
					if (letterType==UPPERCHAR)
						wordType=FIRSTUPPER;
					else
						wordType=VARWORD;
				}
				else
				{
					switch (wordType)
					{
						case LOWERWORD:
							if (letterType!=LOWERCHAR)
								wordType=VARWORD;
							break;
						case UPPERWORD:
							if (letterType!=UPPERCHAR)
								wordType=VARWORD;
							break;
						case FIRSTUPPER:
							if (letterType!=LOWERCHAR)
							{
								if (s_size==1 && letterType==UPPERCHAR)
									wordType=UPPERWORD;
								else
									wordType=VARWORD;
							}
							break;
					}
				}
			}
			else
			{
				encodeWord(s,s_size,wordType,XMLState,c);
				s_size=0;
					
				spaces++;

				while (true) 
				{
					ENCODE_GETC(c);
					if (c!=' ')
						break;
					spaces++;
				}
				continue;
			}


			s[s_size++]=c;
			if (s_size>=STRING_MAX_SIZE-2)
			{
				encodeWord(s,s_size,wordType,XMLState,c);
				s_size=0;
			}
			ENCODE_GETC(c);
			continue;
		}
	

		encodeWord(s,s_size,wordType,XMLState,c);
		s_size=0;


		if (XMLState==ADDED2)
			XMLState=INSIDE;
		else
			ENCODE_PUTC(c);
 
		ENCODE_GETC(c);
	}

	encodeWord(s,s_size,wordType,XMLState,c);
	s_size=0;

	if (detect && !IF_OPTION(OPTION_UNICODE_LE) && !IF_OPTION(OPTION_UNICODE_BE))
	{
		if (unicode_le*4/3>fftell/2)
			TURN_ON(OPTION_UNICODE_LE)
		else
		if (unicode_be*4/3>fftell/2)
			TURN_ON(OPTION_UNICODE_BE)

		PRINT_DICT(("unicode_le=%d unicode_be=%d uni=%d\n",unicode_le,unicode_be,IF_OPTION(OPTION_UNICODE_LE) || IF_OPTION(OPTION_UNICODE_BE)));
	}

	printf(" + dynamic dictionary %d/%d words\n",sizeDict,dictionary);
}

// Returns the length of the common prefix of two character sequences.
//
// The sequences are compared position by position; the scan stops at the
// first mismatch or after `bound` characters, whichever comes first.
//
//   offset1, offset2  sequences to compare; each needs `bound` readable chars
//   bound             maximum number of characters to compare (<=0 yields 0)
//
// Returns a value in the range 0..bound.
inline int common(const char* offset1,const char* offset2, int bound)
{
	int lp=0;
	// The limit is tested before the characters are touched, so the element
	// at index `bound` is never read.
	while (lp<bound && offset1[lp]==offset2[lp])
		lp++;

	return lp;
}

// Serializes the sorted dictionary (preceded, when spaces modeling is on, by
// the table of frequent space-run lengths) into a scratch buffer and emits
// that buffer through whichever back-end is compiled in and selected.
//
//   comprLevel  only consumed by the zlib branch below.
//
// Also sets sortedDictSize to the number of words counted in the header.
void XWRT_Encoder::write_dict(int comprLevel)
{
	int i,count=0;
	// The storage of word_hash doubles as the scratch buffer. `bound` lies
	// WORD_MAX_SIZE bytes before the end of that storage.
	unsigned char *bound=(unsigned char*)&word_hash[0]+HASH_TABLE_SIZE*sizeof(word_hash[0])-WORD_MAX_SIZE;
	unsigned char *writeBuffer=(unsigned char*)&word_hash[0]; //putcBuffer;
	// Payload starts 3 bytes into the buffer; those first 3 bytes are not
	// written by this function.
	unsigned char *bufferData=writeBuffer+3;

	if (IF_OPTION(OPTION_SPACES_MODELING))
	{
		// First pass: count the run lengths whose frequency reaches the threshold.
		for (i=0; i<256; i++)
			if (spacesCont[i]>=minSpacesFreq())
				count++;

	PRINT_DICT(("sp_count=%d\n",count));
		// One byte with the number of entries, then one byte per run length.
		bufferData[0]=count;
		bufferData++;
		for (i=0; i<256; i++)
			if (spacesCont[i]>=minSpacesFreq())
			{		
				bufferData[0]=i;
				bufferData++;
			}
	}


	// Reserve 3 bytes for the word count; filled in once the loop below is done.
	unsigned char *count_header=bufferData;
	bufferData+=3;

	PRINT_DICT(("sortedDict.size()=%d\n",sortedDict.size()));

	int cmn;
	count=(int)sortedDict.size();
	for (i=0; i<count; i++)
	{
		// cmn = length of the prefix shared with the previous word (0 for the first).
		cmn=0;
		if (i>0)
			cmn=common(sortedDict[i-1].c_str(),sortedDict[i].c_str(),min(sortedDict[i].size(),sortedDict[i-1].size()));

		// Unless preprocType is LZ77, a word that shares a prefix with its
		// predecessor, or whose first byte is >=128, is written as the byte
		// 128+cmn followed by the remaining suffix. Every entry ends with '\n'.
		if ((preprocType!=LZ77) && (cmn>0 || (unsigned char)(sortedDict[i][0])>=128))
			bufferData+=sprintf((char*)bufferData,"%c%s\n",128+cmn,sortedDict[i].c_str()+cmn);
		else
			bufferData+=sprintf((char*)bufferData,"%s\n",sortedDict[i].c_str());

		// NOTE(review): the limit is checked after the write, and on break the
		// entry just written is in the buffer but not included in
		// sortedDictSize below - confirm the decoder tolerates that.
		if (bufferData>bound)
			break;
	}
	sortedDictSize=(int)i; // i<=count

	PRINT_DICT(("sortedDictCount=%d\n",sortedDictSize));

	// Word count, least significant byte first.
	count_header[0]=sortedDictSize%256;
	count_header[1]=(sortedDictSize/256)%256;
	count_header[2]=sortedDictSize/65536;


	// From here on `count` is the payload size in bytes (starting at writeBuffer+3).
	count=(int)(bufferData-(writeBuffer+3));
	PRINT_DICT(("write_dict count=%d\n",count));

	// Exactly one of the following branches emits the payload. The chain is
	// built from dangling `else`s that cross the #ifdef boundaries.
#ifdef USE_PAQ_LIBRARY
	if (IF_OPTION(OPTION_PAQ))
	{
		// Payload length is passed to the PAQ encoder as three values, most
		// significant part first.
		PAQ_encoder->compress(count>>16);
		PAQ_encoder->compress(count>>8);
		PAQ_encoder->compress(count);

		int last=ftell(XWRT_fileout);
		// The third clause of the for header is a no-op; i is advanced inside the body.
		for (i=0; i<count; i)
		{
			PAQ_encoder->compress(writeBuffer[3+i]);
			i++;
			// Report progress every 102400 input bytes.
			if (i%102400==0)
			{
				printStatus(0,ftell(XWRT_fileout)-last,true);
				last=ftell(XWRT_fileout);
			}
		}

		printStatus(0,ftell(XWRT_fileout)-last,true);		
	}
	else
#endif
#ifdef USE_LZMA_LIBRARY
	if (IF_OPTION(OPTION_LZMA))
	{
		int last=LZMAlib_GetOutputFilePos(outStream);
		LZMAlib_EncodeMemToFile(writeBuffer+3,count,outStream);
		printStatus(0,LZMAlib_GetOutputFilePos(outStream)-last,true);
	}
	else
#endif
#ifdef USE_PPMVC_LIBRARY
	if (IF_OPTION(OPTION_PPMVC))
	{
		int last=ftell(XWRT_fileout);
		PPMVClib_EncodeMemToFile(PPMVClib_order,writeBuffer+3,count,XWRT_fileout);
		printStatus(0,ftell(XWRT_fileout)-last,true);
	}
	else
#endif
#ifdef USE_ZLIB_LIBRARY
	if (IF_OPTION(OPTION_ZLIB))
		Zlib_compress(XWRT_fileout,writeBuffer+3,count,zlibBuffer,ZLIB_BUFFER_SIZE,comprLevel);
	else
#endif
	{
		// No compressor selected: three length values (high part first),
		// then the raw payload.
		PUTC(count>>16);
		PUTC(count>>8);
		PUTC(count);

		fwrite_fast((unsigned char*)writeBuffer+3,count,XWRT_fileout);

		printStatus(0,count,true);
	}
}


// Packs the currently active options into two integers, one bit weight per
// option.
//
//   c   receives the container/back-end/binary flags (weights 128..4) plus
//       preprocType added on top
//   c2  receives the modeling/dictionary/unicode flags (weights 128..1)
void XWRT_Encoder::WRT_get_options(int& c,int& c2)
{
	c=0;
	c2=0;

	// --- first value: weights 128 down to 4 ---
	if (IF_OPTION(OPTION_USE_CONTAINERS)) { c+=1<<7; }
	if (IF_OPTION(OPTION_PAQ))            { c+=1<<6; }
	if (IF_OPTION(OPTION_ZLIB))           { c+=1<<5; }
	if (IF_OPTION(OPTION_PPMVC))          { c+=1<<4; }
	if (IF_OPTION(OPTION_LZMA))           { c+=1<<3; }
	if (IF_OPTION(OPTION_BINARY_DATA))    { c+=1<<2; }

	// The preprocessing type (0-3) is added to the flags above.
	c+=preprocType;

	// --- second value: weights 128 down to 1 ---
	if (IF_OPTION(OPTION_LETTER_CONTAINER)) { c2+=1<<7; }
	if (IF_OPTION(OPTION_NUMBER_CONTAINER)) { c2+=1<<6; }
	if (IF_OPTION(OPTION_SPACES_MODELING))  { c2+=1<<5; }
	if (IF_OPTION(OPTION_CRLF))             { c2+=1<<4; }
	if (IF_OPTION(OPTION_QUOTES_MODELING))  { c2+=1<<3; }
	if (IF_OPTION(OPTION_USE_DICTIONARY))   { c2+=1<<2; }
	if (IF_OPTION(OPTION_UNICODE_LE))       { c2+=1<<1; }
	if (IF_OPTION(OPTION_UNICODE_BE))       { c2+=1<<0; }
}


// Top-level encoding driver: writes the stream header, optionally runs file
// type detection, writes the option bytes and then either copies the input
// as binary data or writes the dictionary and runs the text encoder.
//
//   fileLen        input size in bytes (only its size in MB is used here)
//   type_detected  when false, WRT_detectFileType() is run first
void XWRT_Encoder::WRT_start_encoding(unsigned int fileLen,bool type_detected)
{
	int c,c2,dictPathLen;
	unsigned char s[STRING_MAX_SIZE];
	unsigned char dictPath[STRING_MAX_SIZE];
	s[0]=0;
	lastAll=0;
	getcBufferDataParts=0;
	collision=0;


	// Stream signature: four header bytes followed by a version byte.
	PUTC(XWRT_HEADER[0]);
	PUTC(XWRT_HEADER[1]);
	PUTC(XWRT_HEADER[2]);
	PUTC(XWRT_HEADER[3]);
	PUTC(XWRT_VERSION-150);

	// File size in MB, capped at 255*256; it is written further down as the
	// two values fileLenMB/256 and fileLenMB%256.
	fileLenMB=fileLen/(1024*1024);
	if (fileLenMB>255*256)
		fileLenMB=255*256;
	g_fileLenMB=fileLenMB;
	init_PPMVC(fileLenMB,COMPRESS);

	cont.prepareMemBuffers();
	cont.memout->memsize=0;


	// NOTE(review): start_time is assigned here and twice more below, but no
	// statement visible in this function reads it (a macro might).
	clock_t start_time;  // in ticks
    start_time=clock();
	
	// Adjust the minimum word frequency depending on file size and
	// preprocessing type.
	if (preprocType!=LZ77 && fileLenMB<32)
		minWordFreq+=3;

	if ((preprocType==PPM || preprocType==PAQ) && fileLenMB<6)
		minWordFreq=250;

	// Remember the input position so it can be restored after detection
	// (only done in the non-DYNAMIC_DICTIONARY build).
	int pos=ftell(XWRT_file);
	if (!type_detected)
		WRT_detectFileType();
#ifdef DYNAMIC_DICTIONARY
	getcBufferSize=getcBufferSizeBak;
#else
	getcBufferSize=0;
	fseek(XWRT_file, pos, SEEK_SET );
#endif


	dictPathLen=getSourcePath((char*)dictPath,sizeof(dictPath));

	if (dictPathLen>0)
	{
		// s is still the empty string here, so the result is
		// <source path> + "wrt-eng.dic", copied back into s.
		// NOTE(review): strcat/strcpy are unbounded; a source path close to
		// STRING_MAX_SIZE would overflow dictPath - confirm what
		// getSourcePath guarantees.
		dictPath[dictPathLen]=0;
		strcat((char*)dictPath,(char*)s);
		strcat((char*)dictPath,(char*)"wrt-eng.dic");
		strcpy((char*)s,(char*)dictPath);
	}



	WRT_get_options(c,c2); // after make_dict()
	WRT_print_options();

	// Option bytes and stream parameters.
	PUTC(c);
	PUTC(c2);
	PUTC(maxMemSize/(1024*1024));
	PUTC(fileLenMB/256);
	PUTC(fileLenMB%256);
	PUTC(additionalParam);

	if (IF_OPTION(OPTION_BINARY_DATA))
	{
		// Binary input: every character is passed straight to ENCODE_PUTC
		// until READ_CHAR yields a negative value; no dictionary is written.
		start_time=clock();

		getcBuffer[0]=0;
		getcBufferData=&getcBuffer[1];

		while (true)
		{
			READ_CHAR(c);
			if (c<0)
				break;
			ENCODE_PUTC(c);
		}

		cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream);
		cont.freeMemBuffers(true);

#ifdef USE_PAQ_LIBRARY
		if (PAQ_encoder)
		{
			PAQ_encoder->flush();
			delete(PAQ_encoder);
			PAQ_encoder=NULL;
		}
#endif
		return;
	}

	PRINT_DICT(("maxMemSize=%d fileLenMB=%d preprocType=%d\n",maxMemSize,fileLenMB,preprocType));

	write_dict(additionalParam); // before initialize()

	memset(detectedSymbols,0,sizeof(detectedSymbols));
	decoding=false;

	WRT_deinitialize();

	// NOTE(review): this early return skips cont.writeMemBuffers /
	// cont.freeMemBuffers and the PAQ encoder cleanup done on the other
	// exit paths - confirm that is intended.
	if (!initialize(s,true))
		return;

	memset(value,0,sizeof(value));

	// detectedSymbols[0..23] are packed into three values, eight entries
	// each, entry 0/8/16 with weight 1.
	if (!IF_OPTION(OPTION_PAQ))
	{
		PUTC(1*detectedSymbols[0]+2*detectedSymbols[1]+4*detectedSymbols[2]+8*detectedSymbols[3]+16*detectedSymbols[4]+32*detectedSymbols[5]+64*detectedSymbols[6]+128*detectedSymbols[7]);
		PUTC(1*detectedSymbols[8]+2*detectedSymbols[9]+4*detectedSymbols[10]+8*detectedSymbols[11]+16*detectedSymbols[12]+32*detectedSymbols[13]+64*detectedSymbols[14]+128*detectedSymbols[15]);
		PUTC(1*detectedSymbols[16]+2*detectedSymbols[17]+4*detectedSymbols[18]+8*detectedSymbols[19]+16*detectedSymbols[20]+32*detectedSymbols[21]+64*detectedSymbols[22]+128*detectedSymbols[23]);
	}
	else
	{
#ifdef USE_PAQ_LIBRARY
		PAQ_encoder->compress(1*detectedSymbols[0]+2*detectedSymbols[1]+4*detectedSymbols[2]+8*detectedSymbols[3]+16*detectedSymbols[4]+32*detectedSymbols[5]+64*detectedSymbols[6]+128*detectedSymbols[7]);
		PAQ_encoder->compress(1*detectedSymbols[8]+2*detectedSymbols[9]+4*detectedSymbols[10]+8*detectedSymbols[11]+16*detectedSymbols[12]+32*detectedSymbols[13]+64*detectedSymbols[14]+128*detectedSymbols[15]);
		PAQ_encoder->compress(1*detectedSymbols[16]+2*detectedSymbols[17]+4*detectedSymbols[18]+8*detectedSymbols[19]+16*detectedSymbols[20]+32*detectedSymbols[21]+64*detectedSymbols[22]+128*detectedSymbols[23]);
#endif
	}

	start_time=clock();

	// Main encoding pass over the input.
	WRT_encode(getcBufferSize);

	cont.writeMemBuffers(preprocFlag,PPMVClib_order,additionalParam,PAQ_encoder,zlibBuffer,outStream);

	cont.freeMemBuffers(true);

#ifdef USE_PAQ_LIBRARY
	if (PAQ_encoder)
	{
		PAQ_encoder->flush();
		delete(PAQ_encoder);
		PAQ_encoder=NULL;
	}
#endif
}

// Accounts for the pending run of spaces that precedes a token whose first
// character is `c`, then resets the run counter.
//
// With OPTION_SPACELESS_WORDS a non-empty run in front of an ASCII letter is
// shortened by one. With OPTION_SPACES_MODELING runs of length 2..255 are
// tallied in spacesCont.
inline void XWRT_Encoder::setSpaces(int c)
{
	const bool startsWithLetter=(c>='a' && c<='z') || (c>='A' && c<='Z');

	if (IF_OPTION(OPTION_SPACELESS_WORDS) && spaces>0 && startsWithLetter)
	{
		--spaces;
	}

	if (IF_OPTION(OPTION_SPACES_MODELING) && spaces>1 && spaces<256)
	{
		++spacesCont[spaces];
	}

	spaces=0;
}

// Detection-pass handler for one collected token: updates the XML state,
// accounts for the preceding spaces and records the token in the dynamic
// dictionary (new entry with frequency 1, or frequency bump of an existing one).
//
//   s, s_size  token buffer and its length (both may be modified)
//   XMLState   current XML parsing state (updated in place)
//   c          character that terminated the token
inline void XWRT_Encoder::checkWord(unsigned char* &s,int &s_size,EXMLState& XMLState,int& c)
{
	// Empty token: only the pending spaces need to be accounted for.
	if (s_size<1)
	{
		setSpaces('-');
		return;
	}

	if (s_size>WORD_MAX_SIZE)
	{
		s_size=WORD_MAX_SIZE;
	}

	// A closing tag is never added to the dictionary.
	if (XMLState==CLOSE || XMLState==CLOSE_EOL)
	{
		XMLState=CLOSED;
		setSpaces(s[0]);
		return;
	}

	if (XMLState==OPEN)
	{
		if (c=='>')
		{
			// The terminating '>' becomes part of the token.
			s[s_size++]=c;
			XMLState=ADDED2;
		}
		else if (c=='!' || c=='?')
		{
			XMLState=INSIDE;
		}
		else
		{
			XMLState=ADDED;
		}
	}

	const bool isAddedTag=(s[0]=='<') && (XMLState==ADDED || XMLState==ADDED2);

	if (!isAddedTag)
	{
		setSpaces(s[0]);
	}
	else
	{
		// The pending spaces are prepended to the tag itself, provided the
		// result still fits below STRING_MAX_SIZE.
		if (spaces+s_size<STRING_MAX_SIZE)
		{
			memmove(s+spaces,s,s_size);
			memset(s,' ',spaces);
			s_size+=spaces;
		}
		spaces=0;
	}

	if (s_size<WORD_MIN_SIZE)
	{
		setSpaces('-');
		return;
	}

	int found;
	checkHashExactly(s,s_size,found);

	// Already known: just count the occurrence.
	if (found>=0)
	{
		dictfreq[found]++;
		return;
	}

	// Unknown token, but the dictionary memory is exhausted: warn once and drop it.
	if (dynmem>dictbound)
	{
		if (firstWarn)
		{
			printf("warning: dictionary too big, you can use -b option to increase buffer size\n");
			firstWarn=false;
		}
		return;
	}

	memcpy(dynmem,s,s_size);
	if (addWord(dynmem,s_size)==1)
	{
		// Advance to the next multiple of four past the token; the step is
		// always at least one byte longer than the token.
		dynmem+=(s_size/4+1)*4;
		dictfreq[sizeDict-1]=1;
	}
}


// Runs the detection pass over the input: allocates the dynamic dictionary
// tables, calls WRT_encode(0) with `detect` set, derives the BINARY_DATA and
// CRLF options from the gathered statistics and finishes the dictionary via
// WRT_detectFinish().
//
// Returns preprocFlag.
int XWRT_Encoder::WRT_detectFileType()
{
	detect=true;


	// Reset the statistics gathered during the pass.
	memset(value,0,sizeof(value));
	memset(addSymbols,0,sizeof(addSymbols));
	memset(reservedSet,0,sizeof(reservedSet));
	memset(spacesCont,0,sizeof(spacesCont));

	quotes=0;
	spaces=0;

	firstWarn=true;
	// Index 0 is left unused; the first word added gets index 1.
	sizeDict=1;
	PRINT_DICT(("maxDynDictBuf=%d maxMemSize=%d\n",maxDynDictBuf,maxMemSize));
	dictionary=maxDynDictBuf*(MAX_DYNAMIC_DICT_COUNT/256);  // 512k, dblp=372k
	// Zero-initialized storage for the word bytes plus the parallel tables
	// dict / dictlen / dictfreq.
	// NOTE(review): dictbound is computed before dictmem is checked for NULL
	// (the check is further down).
	dictmem=(unsigned char*)calloc(dictionary*WORD_AVG_SIZE,1);
	dictbound=dictmem+dictionary*WORD_AVG_SIZE-WORD_MAX_SIZE;
	dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1);
	dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1);
	dictfreq=(int*)calloc(sizeof(int)*(dictionary+1),1);
	memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0]));
	dynmem=dictmem;

	PRINT_DICT(("maxDict=%d allocatedMemory=%d hashTable=%d\n",dictionary,dictionary*WORD_AVG_SIZE+sizeof(unsigned char*)*(dictionary+1)+sizeof(unsigned char)*(dictionary+1)+sizeof(int)*(dictionary+1),HASH_TABLE_SIZE*sizeof(word_hash[0])));

	// The pass is silently skipped if any of the allocations failed.
	if (dictmem && dict && dictlen && dictfreq)
	{
		initializeLetterSet();

	    clock_t start_time=clock();

		if (fileLenMB>0)
			getcBufferDataParts=1+((1024*1024/(mFileBufferSize))*(fileLenMB+1))/firstPassBlock;

	PRINT_DICT(("firstPassBlock=%d fileLenMB=%d getcBufferDataParts=%d\n",firstPassBlock,fileLenMB,getcBufferDataParts));
#ifdef DYNAMIC_DICTIONARY
		getcBufferDataParts=2;
#endif
		WRT_encode(0);

		// More than 1% of the current file position counted in binCount
		// switches the input to binary mode.
		PRINT_DICT(("bincount=%d/%d\n",binCount,ftell(XWRT_file)/100));
		if (binCount>ftell(XWRT_file)/100) // (for textual files in UTF-8)
			TURN_ON(OPTION_BINARY_DATA);

		// presumably value[13] / value[10] are the counts of CR and LF
		// bytes - verify against WRT_encode
		if (value[13]>value[10]/2)
			TURN_ON(OPTION_CRLF);

	    PRINT_DICT(("+ WRT_detectFileType time %1.2f sec\n",double(clock()-start_time)/CLOCKS_PER_SEC));

		WRT_detectFinish();
	}

	WRT_deinitialize();


	if (collision>0)
		PRINT_DICT(("warning: hash collisions=%d\n",collision));


	detect=false;
	getcBufferDataParts=0;

	WRT_print_options();

	return preprocFlag;
}


// qsort comparator over dictionary indices: orders two indices by the
// strcmp order of the words they refer to in `dict`.
int compare_str( const void *arg1, const void *arg2 )
{
	const int lhsIndex=*(int*)arg1;
	const int rhsIndex=*(int*)arg2;

	const char* lhsWord=(char*)dict[lhsIndex];
	const char* rhsWord=(char*)dict[rhsIndex];

	return strcmp(lhsWord,rhsWord);
}

// qsort comparator over dictionary indices: compares the two words from
// their last character backwards. If one word is a suffix of the other, the
// shorter one sorts first.
int compare_str_rev( const void *arg1, const void *arg2 )
{
	const int lhsIndex=*(int*)arg1;
	const int rhsIndex=*(int*)arg2;

	const int lhsLen=dictlen[lhsIndex];
	const int rhsLen=dictlen[rhsIndex];
	const int shared=(lhsLen<rhsLen) ? lhsLen : rhsLen;

	// Walk both words from the end towards the front.
	for (int back=1; back<=shared; back++)
	{
		const int diff=dict[lhsIndex][lhsLen-back] - dict[rhsIndex][rhsLen-back];
		if (diff!=0)
			return diff;
	}

	return lhsLen - rhsLen;
}

// qsort comparator over dictionary indices: orders by descending frequency
// (the index with the larger dictfreq value sorts first).
int compare_freq( const void *arg1, const void *arg2 )
{
	const int* lhsIndex=(int*)arg1;
	const int* rhsIndex=(int*)arg2;

	return dictfreq[*rhsIndex]-dictfreq[*lhsIndex];
}


// Orders the dynamic dictionary and appends the words to sortedDict.
//
// The size-1 words at indices 1..size-1 are first sorted by descending
// frequency. Unless preprocType is LZ77, each of the ranges delimited by
// dict1size, bound3 and bound4 is then sorted alphabetically on its own.
// Those three limits are reduced by the number of codes taken up by quotes
// modeling (2) and by the frequent space-run lengths.
//
//   size  dictionary size including the unused slot 0; nothing is done when
//         fewer than 20 words remain after discounting that slot.
void XWRT_Encoder::sortDict(int size)
{
	int i,add;

	size--;

	if (size<20)
		return;

	initializeCodeWords(size,false);

	// Number of codes that are not available to dictionary words.
	add=0;
	if (IF_OPTION(OPTION_QUOTES_MODELING))
		add+=2;

	if (IF_OPTION(OPTION_SPACES_MODELING))
	{
		for (i=0; i<256; i++)
		if (spacesCont[i]>=minSpacesFreq())
			add++;
	}

	dict1size-=add;
	bound3-=add;
	bound4-=add;

	// Table of dictionary indices (1-based) that is permuted by the sorts below.
	int* inttable=new int[size];

	if (!inttable)
		OUT_OF_MEMORY();

	for (i=0; i<size; i++)
		inttable[i]=i+1;


	// Most frequent words first.
	qsort(&inttable[0],size,sizeof(inttable[0]),compare_freq);


	if (preprocType!=LZ77)
	{
		// Alphabetical order inside each range; the frequency order only
		// decides which range a word ends up in.
		qsort(&inttable[0],min(size,dict1size),sizeof(inttable[0]),compare_str);
		
		if (size>dict1size)
			qsort(&inttable[dict1size],min(size,bound3)-dict1size,sizeof(inttable[0]),compare_str);
		
		if (size>bound3)
			qsort(&inttable[bound3],min(size,bound4)-bound3,sizeof(inttable[0]),compare_str);
		
		if (size>bound4)
			qsort(&inttable[bound4],size-bound4,sizeof(inttable[0]),compare_str);
	}

	for (i=0; i<size; i++)
	{
		std::string str=(char*)dict[inttable[i]];
		sortedDict.push_back(str);
	}

	// Allocated with new[], so it has to be released with delete[].
	delete[] inttable;
}


// Finishes the detection pass: decides on spaces/quotes modeling, drops
// dictionary words that are too rare, compacts the dictionary tables and
// hands the result to sortDict().
void XWRT_Encoder::WRT_detectFinish()
{	
	int i,j;


	// Spaces modeling is enabled only if at least one run length is frequent enough.
	TURN_OFF(OPTION_SPACES_MODELING);

	for (i=0; i<256; i++)
		if (spacesCont[i]>=minSpacesFreq())
			TURN_ON(OPTION_SPACES_MODELING);

	PRINT_DICT(("%d words ",sizeDict-1));

	sortedDict.clear();

	// Words of at least 7 characters get a threshold relaxed by 2, but only
	// when the base threshold is 6 or more.
	int minWordFreq2;

	if (minWordFreq<6)
		minWordFreq2=minWordFreq;
	else
		minWordFreq2=minWordFreq-2;


	// Mark words below the threshold by zeroing their frequency.
	// NOTE(review): index sizeDict-1 is examined neither here nor by the
	// compaction below (which starts at sizeDict-2) - confirm this is intended.
	for (i=1; i<sizeDict-1; i++)
	{
		const int num=dictfreq[i];
		const bool keep=num>=minWordFreq || (num>=minWordFreq2 && (dictlen[i]>=7));

		if (!keep)
			dictfreq[i]=0;
	}

	// Compaction: each dropped slot found from the front is filled with a
	// kept word taken from the back.
	for (i=1, j=sizeDict-2; i<j; i++)
	{
		if (dictfreq[i]>0)
			continue;

		while (j>0 && dictfreq[j]==0) j--;

		if (i>j)
			break;

		// Move all three parallel tables together so that dictlen[i] keeps
		// describing the word now stored in slot i.
		dict[i]=dict[j];
		dictlen[i]=dictlen[j];
		dictfreq[i]=dictfreq[j];
		dictfreq[j--]=0;
	}

	sizeDict=i;

	if (sizeDict>maxDictSize)
		sizeDict=maxDictSize;

	PRINT_DICT(("reduced to %d words (freq>=%d)\n",sizeDict,minWordFreq));

	if (quotes>=minSpacesFreq())
		TURN_ON(OPTION_QUOTES_MODELING);

	sortDict(sizeDict);

	PRINT_DICT(("quotes=%d\n",quotes));
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -