⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bittoken.c

📁 一个很好的html网页分析工具
💻 C
字号:
//--------------------------------------------//                 BIT HTML LIB////file:  BitToken.c//usage: for token//---------------------------------------------#include "BitHtml.h"//--------------------------------------BitTokenContext * Bit_NewContext(){	BitTokenContext *global_cx;	global_cx=(BitPTokenContext)malloc(sizeof(BitTokenContext));	Bit_TokenList_Init(global_cx);	global_cx->curPosition=0;	global_cx->strBuffer=NULL;	global_cx->pTagList=Bit_GetHTMLTagList();	return global_cx;}BitToken *Bit_NewToken(int type){	BitToken *token=(BitPToken)malloc(sizeof(BitToken));	Bit_TokenAttrList_Init(token);	token->pData=NULL;	token->type=type;	token->end=FALSE;	return token;}BitTokenAttr *Bit_NewTokenAttribute(int type){	BitTokenAttr *tokenAttr;	tokenAttr=(BitPTokenAttr)malloc(sizeof(BitTokenAttr));	tokenAttr->type=type;	tokenAttr->value=NULL;	return tokenAttr;}void Token_EraseSpace(BitTokenContext *global_cx){	char ch;	while(ch=BitString_GetCharAt(global_cx->strBuffer,global_cx->curPosition))	{		if(ch==' ' || ch=='	' || ch=='\r'|| ch=='\n')  //space,tab,\n			global_cx->curPosition++;		else break;	}}
/*void Token_ConvertIfNeed(char * aString){	int start=0,end=0;	char item=0,item1=0;	char *str;	int mLength;	if(!aString)		return ;	str=BitString_NewChars("      ");	mLength=strlen(aString);	while(end < mLength)	{		start=BitString_FindChar(aString,mLength,end,'&',TRUE);		if(start == -1)			return ;		end=start+1;		item=BitString_GetCharAt(aString,end);		switch(item)		{
		case '#': //maybe a "&#(number)" or"&#X(number)"			{ 				int bTen=TRUE;				char result=0;				int  err;				item1=BitString_GetCharAt(aString,++end);				if(item1 == 'x' || item1== 'X')				{//16					bTen=FALSE;					item1=BitString_GetCharAt(aString,++end);				}				while( Bit_CharISDigit(item1,bTen)) 				{					BitString_AppendChar(str,item1);					item1=BitString_GetCharAt(aString,++end);				}				result=BitString_ToInteger(str,&err,bTen?10:16);				if(result)				{					BitString_Cut(aString,start,end-start-1);					BitString_SetCharAt(aString,result,start);					end=start+1;				}				start=end;				break;			}        case 'a':            {                BitString_Cut(aString,start,5); 		    	BitString_SetCharAt(aString,'&',start);                 start+=1;                break;            }        case 'c':            {                 BitString_Cut(aString,start,5); 		    	 BitString_SetCharAt(aString,'@',start);                  start+=1;                 break;            }

        case 'n':
            {
                BitString_Cut(aString,start,5); 
				BitString_SetCharAt(aString,' ',start); 
                start+=1;
                break;
            }		}//switch	}	free(str);	return;}*/
/*
 * Copy the text between curPosition and the index reported by
 * BitString_FindCharInSet() for the character set sUntil into a newly
 * allocated string.
 *
 * On success curPosition is left one before the reported index and the
 * caller owns the returned string (release it with free()).
 * Returns NULL, leaving curPosition untouched, when the search reports -1
 * or when the allocation fails.
 */
char *Token_ReadUntil(BitTokenContext *global_cx,char *sUntil)
{
	int stop;
	int length;
	char *pStr;

	stop = BitString_FindCharInSet(global_cx->strBuffer, sUntil, 0, global_cx->curPosition);
	if (stop == -1)
		return NULL;

	length = stop - global_cx->curPosition;
	pStr = malloc(length + 2);
	if (!pStr)
		return NULL;

	BitString_Mid(global_cx->strBuffer, pStr, global_cx->curPosition, length);
	global_cx->curPosition = stop - 1;
	return pStr;
}

/*
 * Move curPosition to the index reported by BitString_FindCharInSet() for
 * the character set sUntil. The position is unchanged when the search
 * reports -1.
 */
void Token_ThrowTo(BitTokenContext *global_cx,char *sUntil)
{
	int stop = BitString_FindCharInSet(global_cx->strBuffer, sUntil, 0, global_cx->curPosition);

	if (stop != -1)
		global_cx->curPosition = stop;
}

/*
 * Read an attribute value starting at curPosition (leading whitespace is
 * skipped first). Three forms are handled: "double quoted", 'single quoted'
 * and bare (terminated by a space or '>').
 *
 * Returns whatever Token_ReadUntil() returned: a malloc'ed string owned by
 * the caller, or NULL.
 */
char *Token_GetAttribute(BitTokenContext *global_cx)
{
	char *pAttr = NULL;
	char quote;

	Token_EraseSpace(global_cx);
	quote = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);

	if (quote == '\"' || quote == '\'')
	{
		/* step over the opening quote */
		global_cx->curPosition++;
		pAttr = Token_ReadUntil(global_cx, (quote == '\"') ? "\">" : "\'>");
		/* Token_ReadUntil stops one short of the delimiter; step onto it */
		global_cx->curPosition++;
		/* consume the closing quote, but never a '>' */
		if (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) == quote)
			global_cx->curPosition++;
	}
	else
	{
		pAttr = Token_ReadUntil(global_cx, " >");
		global_cx->curPosition++;
	}
	return pAttr;
}

/* Look up a tag name through the context's tag table (fGetTagID). */
int Bit_GetTagNumber(BitTokenContext *global_cx,char * name_element)
{
	return global_cx->pTagList->fGetTagID(name_element);
}

/*
 * Look up an attribute name through the context's tag table (fGetAtrID).
 * The Type argument is accepted but not used by this implementation.
 */
int Bit_GetAtrNumber(BitTokenContext *global_cx,int Type,char * name_element)
{
	return global_cx->pTagList->fGetAtrID(name_element);
}

//-----------------------------------------------------
/*
 * Consume one tag. The caller (Bit_Tokenize) invokes this with curPosition
 * on the opening '<'.
 *
 * - "<!..." is skipped and "<>" / "<<" are ignored: returns FALSE, no token.
 * - A name for which Bit_GetTagNumber() returns -1 is stored as a token of
 *   type -1 whose pData holds the text after '<' up to the next '>' or '<'.
 * - An end tag ("</name>") yields a token with end == TRUE.
 * - Otherwise the attributes are parsed and attached to the token.
 *
 * Returns TRUE when the tag was consumed, FALSE when it was ignored or the
 * token could not be allocated.
 */
int Token_ConsumTag(BitTokenContext *global_cx)
{
	char *pStr;
	char item;
	int type;
	int pos = global_cx->curPosition;
	BitToken *pToken = NULL;
	int bEndTag = FALSE;

	global_cx->curPosition++;
	Token_EraseSpace(global_cx);
	item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);

	if ('!' == item)
	{
		/* comment / declaration such as <!-- : skip it; the text itself is
		 * not needed, so release it instead of leaking it */
		free(Token_ReadUntil(global_cx, "><"));
		return FALSE;
	}
	else if ('/' == item)
	{
		bEndTag = TRUE;
		global_cx->curPosition++;
		Token_EraseSpace(global_cx);
	}
	else if (('>' == item) || ('<' == item))
	{
		/* empty tag like <   > or << */
		return FALSE;
	}

	/* tag name ends at space, '=', '>', CR, LF or tab */
	pStr = Token_ReadUntil(global_cx, " =>\r\n\t");
	if (!pStr)
	{
		/* no tag name could be read: just skip ahead */
		Token_ThrowTo(global_cx, "><");
		return TRUE;
	}

	type = Bit_GetTagNumber(global_cx, pStr);
	/* The name is only needed for the lookup (assumes fGetTagID does not
	 * keep the pointer); the original code leaked it. */
	free(pStr);

	pToken = Bit_NewToken(type);
	if (!pToken)
	{
		Token_ThrowTo(global_cx, "><");
		return FALSE;
	}

	if (type == -1)
	{
		/* unknown tag: keep its raw text, starting right after the '<' */
		global_cx->curPosition = pos + 1;
		pToken->pData = Token_ReadUntil(global_cx, "><");
		global_cx->curPosition++;
		Bit_TokenList_AddTail(global_cx, pToken);
		return TRUE;
	}

	if (bEndTag)
	{
		/* an end tag carries no attributes: skip to its '>' */
		pToken->end = TRUE;
		Token_ThrowTo(global_cx, ">");
		Bit_TokenList_AddTail(global_cx, pToken);
		return TRUE;
	}

	/* A known start tag: start processing this token's attributes. */
	global_cx->curPosition++;
	Token_EraseSpace(global_cx);
	while (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) != '>')
	{
		BitPTokenAttr pAttr;

		/* attribute name ends at space, '=', ',', '>', CR, LF or tab */
		pStr = Token_ReadUntil(global_cx, " =,>\r\n\t");
		if (!pStr)
		{
			/* Nothing was read, so the position did not move; looping again
			 * would repeat the same failed read forever. */
			break;
		}

		type = Bit_GetAtrNumber(global_cx, pToken->type, pStr);
		free(pStr);
		if (type == -1)
		{
			/* not a valid attribute: stop parsing attributes */
			break;
		}

		pAttr = Bit_NewTokenAttribute(type);
		if (!pAttr)
			break;
		Bit_TokenAttrList_AddTail(pToken, pAttr);

		global_cx->curPosition++;
		Token_EraseSpace(global_cx);
		if (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) == '=')
		{
			global_cx->curPosition++;
			pAttr->value = Token_GetAttribute(global_cx); /* read a word */
		}
		/* any other character ('>', '<', ...) means a value-less attribute */

		Token_EraseSpace(global_cx);
	}

	/* error tolerance: also stop at '<' in case the '>' is missing */
	Token_ThrowTo(global_cx, "><");
	Bit_TokenList_AddTail(global_cx, pToken);
	return TRUE;
}

/*
 * Replace carriage returns in aString with spaces, in place. For a CR that
 * is followed by LF, BitString_Cut(aString, i, 1) is applied first.
 * Does nothing when either argument is NULL.
 */
void Token_ConvertRtToSpace(BitTokenContext *global_cx,char * aString)
{
	int i;
	int mLength;

	if (!global_cx || !aString)
		return;

	/* measured only after the NULL check: strlen(NULL) is undefined */
	mLength = strlen(aString);
	for (i = 0; i < mLength; i++)
	{
		if (BitString_GetCharAt(aString, i) != 0x0D)
			continue;

		if (BitString_GetCharAt(aString, i + 1) == 0x0A)
			BitString_Cut(aString, i, 1);
		BitString_SetCharAt(aString, ' ', i);
	}
}

/*
 * Consume plain text up to the next '<' (or '>', for error tolerance) and
 * append it to the token list as a token of type pTagList->nPlainText.
 * The token takes ownership of the text; pData is NULL when no text could
 * be read.
 *
 * Returns TRUE on success, FALSE when the token could not be created.
 */
int Token_Consum_PlainText(BitTokenContext *global_cx)
{
	char *pStr = Token_ReadUntil(global_cx, "<>");
	BitToken *pToken = Bit_NewToken(global_cx->pTagList->nPlainText);

	if (!pToken)
	{
		/* nobody else owns the text yet */
		free(pStr);
		return FALSE;
	}

	pToken->pData = pStr;
	Bit_TokenList_AddTail(global_cx, pToken);
	return TRUE;
}

//------------------------------------
/*
 * Tokenize strBuffer from curPosition up to bufferLength, appending the
 * tokens to the context's token list. Always returns 0.
 *
 * Example of adding a token by hand:
 *
 *	BitToken *token;
 *	BitTokenAttr *tokenAttr;
 *	token=Bit_NewToken(1);
 *	tokenAttr=Bit_NewTokenAttribute(0);
 *	tokenAttr->value=BitString_NewChars("http://a.htm");
 *	Bit_TokenAttrList_AddTail(token,tokenAttr);
 *	Bit_TokenList_AddTail(global_cx,token);
 */
int Bit_Tokenize(BitTokenContext *global_cx)
{
	char item;

	Token_EraseSpace(global_cx);
	while (global_cx->curPosition < global_cx->bufferLength)
	{
		item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);
		switch (item)
		{
		case '<':
			Token_ConsumTag(global_cx);
			global_cx->curPosition++;
			break;

		case 0xD:	/* '\r' */
		case 0xA:	/* '\n' */
		case ' ':
		case '>':	/* error tolerance: stray '>' */
			global_cx->curPosition++;
			break;

		default:
			Token_Consum_PlainText(global_cx);
			global_cx->curPosition++;
			break;
		}
	}
	return 0;
}

/*
 * Tokenize global_strBuffer (global_bufferLength characters) in chunks of
 * at most 2048 characters. Each chunk is copied into a freshly allocated
 * strBuffer, trimmed backwards to a tag boundary and handed to
 * Bit_Tokenize(). Returns without tokenizing when the chunk buffer cannot
 * be allocated.
 */
void Bit_BeginToken(BitTokenContext *global_cx)
{
	int step_length = 2048;
	int step = step_length;

	global_cx->global_curPosition = 0;
	/* +2 leaves room for the '<' appended below plus the terminator */
	global_cx->strBuffer = malloc(step + 2);
	if (!global_cx->strBuffer)
		return;

	/* step by step */
	while (global_cx->global_curPosition < global_cx->global_bufferLength)
	{
		/* read step or countleft number of chars to strBuffer */
		int countleft = global_cx->global_bufferLength - global_cx->global_curPosition;

		if (step > countleft)
			step = countleft;
		BitString_Mid(global_cx->global_strBuffer, global_cx->strBuffer,
		              global_cx->global_curPosition, step);
		global_cx->global_curPosition += step;
		global_cx->curPosition = step;	/* to the end */

		/* The chunk may end in the middle of a tag: walk backwards, moving
		 * the global position along, until a '<' or '>' is found. */
		while (global_cx->curPosition > 0)
		{
			char item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);

			if (item == '<')
			{
				global_cx->curPosition--;
				break;
			}
			else if (item == '>')
			{
				global_cx->global_curPosition++;
				break;
			}
			global_cx->curPosition--;
			global_cx->global_curPosition--;
		}

		if (global_cx->curPosition > 0)
		{
			/* the buffer contains at least one tag: tokenize it */
			global_cx->bufferLength = global_cx->curPosition;
			global_cx->curPosition = 0;
			Bit_Tokenize(global_cx);
		}
		else if (global_cx->curPosition == 0)
		{
			/* error tolerance: no tag found in this chunk */
			if (step == step_length)
			{
				/* maybe it is a long plain text: tokenize the full chunk */
				global_cx->global_curPosition += step;
				global_cx->bufferLength = step;
				/* gives the plain-text reader a '<' to stop at */
				BitString_AppendChar(global_cx->strBuffer, '<');
				Bit_Tokenize(global_cx);
			}
			else
			{
				/* only a little trailing text is left; it does not form a
				 * tag, so it is not tokenized */
				break;
			}
		}
		else
		{
			/* curPosition < 0, e.g. "<a href=" (error tolerance) */
			break;
		}
	}
}
/* Map a NULL string to "(null)" so it can be passed to a "%s" conversion. */
static const char *BitToken_StrOrNull(const char *s)
{
	return s ? s : "(null)";
}

/*
 * Write a human-readable dump of the tokenizer state to the file `filename`
 * (opened with mode "w").
 *
 * The dump contains the two position counters followed by one record per
 * token: tag name, end-tag marker, type, and either the text (for
 * HTML_TEXT tokens) or the list of attributes.
 *
 * Returns silently when an argument is NULL or the file cannot be opened.
 */
void Bit_SaveTokenResult(BitTokenContext *global_cx,char * filename)
{
	BitTokenList *pTtokenList;
	BitTokenAttrList *pTokenAttrList;
	FILE *fp;

	if (!global_cx || !filename)
		return;

	fp = fopen(filename, "w");
	if (fp == NULL)
		return;

	fprintf(fp, "global_cx->curPosition:%d\n", global_cx->curPosition);
	fprintf(fp, "global_cx->global_curPosition:%d\n\n", global_cx->global_curPosition);

	/* iteration starts at tokenList->next, i.e. the first node is skipped */
	for (pTtokenList = global_cx->tokenList->next;
	     pTtokenList != NULL;
	     pTtokenList = pTtokenList->next)
	{
		fprintf(fp, "---------------------------------------\n");

		/* name */
		fprintf(fp, "Token Name:%s", BitHTML_GetTagName(pTtokenList->token->type));
		if (pTtokenList->token->end == TRUE)
			fprintf(fp, "  </end tag>\n");
		else
			fprintf(fp, "\n");

		/* type */
		fprintf(fp, "Token Type:%d\n", pTtokenList->token->type);

		if (pTtokenList->token->type == HTML_TEXT)
		{
			/* plain text: print the data; pData may be NULL, and passing
			 * NULL to %s is undefined behaviour */
			fprintf(fp, "Token->pData:%s\n", BitToken_StrOrNull(pTtokenList->token->pData));
			continue;
		}

		/* any other token: print its attributes */
		for (pTokenAttrList = pTtokenList->token->attrList->next;
		     pTokenAttrList != NULL;
		     pTokenAttrList = pTokenAttrList->next)
		{
			fprintf(fp, "\nAttr->type:%d\n", pTokenAttrList->attr->type);
			fprintf(fp, "AttrName:%s\n", BitHTML_GetAtrName(pTokenAttrList->attr->type));
			/* an attribute without "=value" keeps value == NULL */
			fprintf(fp, "Attr->value:%s\n", BitToken_StrOrNull(pTokenAttrList->attr->value));
		}
	}
	fclose(fp);
}

/*
 * Release a tokenizer context: both text buffers, the token list (through
 * Bit_TokenList_Destroy()) and finally the context itself.
 *
 * A NULL context is accepted and ignored. Always returns 0.
 */
int Bit_DestroyToken(BitTokenContext *global_cx)
{
	if (!global_cx)
		return 0;

	free(global_cx->strBuffer);
	free(global_cx->global_strBuffer);
	Bit_TokenList_Destroy(global_cx);
	free(global_cx);
	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -