📄 bittoken.c
字号:
//--------------------------------------------// BIT HTML LIB////file: BitToken.c//usage: for token//---------------------------------------------#include "BitHtml.h"//--------------------------------------BitTokenContext * Bit_NewContext(){ BitTokenContext *global_cx; global_cx=(BitPTokenContext)malloc(sizeof(BitTokenContext)); Bit_TokenList_Init(global_cx); global_cx->curPosition=0; global_cx->strBuffer=NULL; global_cx->pTagList=Bit_GetHTMLTagList(); return global_cx;}BitToken *Bit_NewToken(int type){ BitToken *token=(BitPToken)malloc(sizeof(BitToken)); Bit_TokenAttrList_Init(token); token->pData=NULL; token->type=type; token->end=FALSE; return token;}BitTokenAttr *Bit_NewTokenAttribute(int type){ BitTokenAttr *tokenAttr; tokenAttr=(BitPTokenAttr)malloc(sizeof(BitTokenAttr)); tokenAttr->type=type; tokenAttr->value=NULL; return tokenAttr;}void Token_EraseSpace(BitTokenContext *global_cx){ char ch; while(ch=BitString_GetCharAt(global_cx->strBuffer,global_cx->curPosition)) { if(ch==' ' || ch==' ' || ch=='\r'|| ch=='\n') //space,tab,\n global_cx->curPosition++; else break; }}
/*void Token_ConvertIfNeed(char * aString){ int start=0,end=0; char item=0,item1=0; char *str; int mLength; if(!aString) return ; str=BitString_NewChars(" "); mLength=strlen(aString); while(end < mLength) { start=BitString_FindChar(aString,mLength,end,'&',TRUE); if(start == -1) return ; end=start+1; item=BitString_GetCharAt(aString,end); switch(item) {
case '#': //maybe a "&#(number)" or"&#X(number)" { int bTen=TRUE; char result=0; int err; item1=BitString_GetCharAt(aString,++end); if(item1 == 'x' || item1== 'X') {//16 bTen=FALSE; item1=BitString_GetCharAt(aString,++end); } while( Bit_CharISDigit(item1,bTen)) { BitString_AppendChar(str,item1); item1=BitString_GetCharAt(aString,++end); } result=BitString_ToInteger(str,&err,bTen?10:16); if(result) { BitString_Cut(aString,start,end-start-1); BitString_SetCharAt(aString,result,start); end=start+1; } start=end; break; } case 'a': { BitString_Cut(aString,start,5); BitString_SetCharAt(aString,'&',start); start+=1; break; } case 'c': { BitString_Cut(aString,start,5); BitString_SetCharAt(aString,'@',start); start+=1; break; }
case 'n':
{
BitString_Cut(aString,start,5);
BitString_SetCharAt(aString,' ',start);
start+=1;
break;
} }//switch } free(str); return;}*/
/*
 * Copy the text between curPosition and the next character that belongs to
 * sUntil into a freshly allocated string.
 *
 * On success curPosition is left on the character just BEFORE the
 * terminator and the caller owns (and must free) the returned string.
 * Returns NULL, with curPosition unchanged, when no terminator is found or
 * the allocation fails.
 *
 * NOTE(review): the two spare bytes in the allocation presumably leave room
 * for the terminator BitString_Mid() writes -- confirm against BitString.
 */
char *Token_ReadUntil(BitTokenContext *global_cx, char *sUntil)
{
    int stop;
    int length;
    char *pStr = NULL;

    stop = BitString_FindCharInSet(global_cx->strBuffer, sUntil, 0, global_cx->curPosition);
    if (stop == -1)
        return NULL;

    length = stop - global_cx->curPosition;
    pStr = (char *)malloc(length + 2);
    if (pStr == NULL)
        return NULL;

    BitString_Mid(global_cx->strBuffer, pStr, global_cx->curPosition, length);
    global_cx->curPosition = stop - 1;  // old is i
    //Token_ConvertIfNeed(pStr);
    return pStr;
}

/*
 * Move curPosition onto the next character that belongs to sUntil.
 * Leaves curPosition unchanged when none is found.
 */
void Token_ThrowTo(BitTokenContext *global_cx, char *sUntil)
{
    int stop = BitString_FindCharInSet(global_cx->strBuffer, sUntil, 0, global_cx->curPosition);

    if (stop == -1)
        return;
    global_cx->curPosition = stop;  // old is i+1
}

/*
 * Read one attribute value starting at curPosition (leading whitespace is
 * skipped).  Handles "double quoted", 'single quoted' and bare values; a
 * quoted value also ends at '>' so that a missing closing quote cannot
 * swallow the rest of the document.
 *
 * Returns a newly allocated string owned by the caller, or NULL when
 * Token_ReadUntil() fails.
 */
char *Token_GetAttribute(BitTokenContext *global_cx)
{
    char *pAttr = NULL;

    Token_EraseSpace(global_cx);
    switch (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition))
    {
    case '\"':
        global_cx->curPosition++;   // step over the opening quote
        pAttr = Token_ReadUntil(global_cx, "\">");
        global_cx->curPosition++;   // move onto the terminator
        if (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) == '\"')
            global_cx->curPosition++;   // consume the closing quote
        break;

    case '\'':
        global_cx->curPosition++;   // step over the opening quote
        pAttr = Token_ReadUntil(global_cx, "\'>");
        global_cx->curPosition++;   // move onto the terminator
        if (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) == '\'')
            global_cx->curPosition++;   // consume the closing quote
        break;

    default:
        pAttr = Token_ReadUntil(global_cx, " >");
        global_cx->curPosition++;   // move onto the terminator
        break;
    }
    return pAttr;
}

/* Look up the numeric id of a tag name through the context's tag list. */
int Bit_GetTagNumber(BitTokenContext *global_cx, char *name_element)
{
    return global_cx->pTagList->fGetTagID(name_element);
}

/*
 * Look up the numeric id of an attribute name through the context's tag
 * list.  Type (the owning tag's id) is accepted but not used by the lookup.
 */
int Bit_GetAtrNumber(BitTokenContext *global_cx, int Type, char *name_element)
{
    (void)Type;
    return global_cx->pTagList->fGetAtrID(name_element);
}

//-----------------------------------------------------
/*
 * Consume one tag.  curPosition must be on the '<' that opens it.
 *
 * Appends at most one token to the context's token list:
 *   - unknown tag name (id -1): a token whose pData holds the raw text
 *     between '<' and the next '>' or '<';
 *   - end tag: a token with end == TRUE;
 *   - start tag: a token carrying the attributes that could be recognised.
 *
 * Returns FALSE for "<!...", for the empty forms "< >" / "<<" and when a
 * token cannot be allocated; TRUE otherwise.
 *
 * NOTE(review): the name strings are freed right after the id lookup; this
 * assumes fGetTagID / fGetAtrID do not keep the pointer -- confirm.
 */
int Token_ConsumTag(BitTokenContext *global_cx)
{
    char *pStr;
    char item;
    int pos = global_cx->curPosition;
    BitToken *pToken = NULL;
    int bEndTag = FALSE;
    int type;

    /*
    if( '<' != BitString_GetCharAt(global_cx->strBuffer,global_cx->curPosition))
        return FALSE;
    */
    global_cx->curPosition++;   // step over '<'
    Token_EraseSpace(global_cx);

    item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);
    if ('!' == item)
    {
        // comment / declaration such as <!-- : skipped, its text is not kept
        free(Token_ReadUntil(global_cx, "><"));
        return FALSE;
    }
    else if ('/' == item)
    {
        bEndTag = TRUE;
        global_cx->curPosition++;
        Token_EraseSpace(global_cx);
    }
    else if (('>' == item) || ('<' == item))
    {
        // empty tag like < > or <<
        return FALSE;
    }

    pStr = Token_ReadUntil(global_cx, " =>\r\n\t");   // space, '=', '>', CR, LF, tab
    if (!pStr)
    {
        /* no tag name could be read: just throw the tag away */
        Token_ThrowTo(global_cx, "><");
        return TRUE;
    }

    /* get the tag type and create a new token */
    type = Bit_GetTagNumber(global_cx, pStr);
    free(pStr);     // the name was only needed for the lookup
    pToken = Bit_NewToken(type);
    if (!pToken)
        return FALSE;

    if (type == -1)
    {
        /* unknown tag: keep its raw text, re-read from just after '<' */
        global_cx->curPosition = pos + 1;   // old is pos
        pToken->pData = Token_ReadUntil(global_cx, "><");
        //Token_ThrowTo(global_cx,"><");
        global_cx->curPosition++;
        Bit_TokenList_AddTail(global_cx, pToken);
        return TRUE;
    }

    if (bEndTag)
    {
        /* an end tag carries no attributes: skip to its '>' */
        pToken->end = TRUE;
        Token_ThrowTo(global_cx, ">");
        Bit_TokenList_AddTail(global_cx, pToken);
        return TRUE;
    }

    /*
     * We have a known start tag; now collect its attributes.
     */
    global_cx->curPosition++;
    Token_EraseSpace(global_cx);
    while (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition) != '>')
    {
        BitPTokenAttr pAttr = NULL;
        int attrType;

        pStr = Token_ReadUntil(global_cx, " =,>\r\n\t");  // space, '=', ',', '>', CR, LF, tab
        if (!pStr)
        {
            /* No terminator left (truncated tag): curPosition can no longer
               advance, so stop instead of looping forever. */
            break;
        }

        attrType = Bit_GetAtrNumber(global_cx, pToken->type, pStr);
        free(pStr);     // the name was only needed for the lookup
        if (attrType == -1)
        {
            /* invalid attribute: give up on the rest of this tag */
            break;
        }

        pAttr = Bit_NewTokenAttribute(attrType);
        if (!pAttr)
            break;
        Bit_TokenAttrList_AddTail(pToken, pAttr);

        global_cx->curPosition++;   // move onto the terminator after the name
        Token_EraseSpace(global_cx);
        switch (BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition))
        {
        case '=':
            global_cx->curPosition++;
            pAttr->value = Token_GetAttribute(global_cx);   /* read a word */
            break;
        case '>':
        case '<':   // fault tolerance: also accept '<'
        default:
            break;  // attribute without a value
        }

        Token_EraseSpace(global_cx);
    }

    Token_ThrowTo(global_cx, "><");     // fault tolerance: also stop at '<'
    Bit_TokenList_AddTail(global_cx, pToken);
    return TRUE;
}

/*
 * Replace line breaks in aString with spaces, in place: a CR LF pair is
 * collapsed (one character removed with BitString_Cut) and the remaining
 * character becomes a space; a lone CR becomes a space.
 *
 * Does nothing when either argument is NULL.
 */
void Token_ConvertRtToSpace(BitTokenContext *global_cx, char *aString)
{
    int i;
    int mLength;

    if (!global_cx || !aString)
        return;

    /* measured only after the NULL check: strlen(NULL) is undefined */
    mLength = (int)strlen(aString);
    for (i = 0; i < mLength; i++)
    {
        if (BitString_GetCharAt(aString, i) != 0x0D)
            continue;

        if (BitString_GetCharAt(aString, i + 1) == 0x0A)
            BitString_Cut(aString, i, 1);
        BitString_SetCharAt(aString, ' ', i);
    }
}

/*
 * Consume plain text from curPosition up to the next '<' or '>' ('>' is
 * accepted for fault tolerance) and append it as a plain-text token.
 *
 * Returns TRUE when a token was appended, FALSE when it could not be
 * allocated.
 *
 * NOTE(review): when no terminator is found the token is still appended,
 * with pData == NULL and curPosition unchanged.
 */
int Token_Consum_PlainText(BitTokenContext *global_cx)
{
    BitToken *pToken = NULL;
    char *pStr = NULL;

    pStr = Token_ReadUntil(global_cx, "<>");
    //Token_ConvertRtToSpace(global_cx,pStr);

    pToken = Bit_NewToken(global_cx->pTagList->nPlainText);
    if (!pToken)
    {
        free(pStr);     // nobody else owns the text yet
        return FALSE;
    }

    pToken->pData = pStr;
    Bit_TokenList_AddTail(global_cx, pToken);
    return TRUE;
}

//------------------------------------
/*
 * Tokenize strBuffer from curPosition up to bufferLength, appending tokens
 * to the context's token list.  Always returns 0.
 *
 * Each consumer leaves curPosition on the last character it handled, which
 * is why every branch advances by one afterwards.
 */
int Bit_Tokenize(BitTokenContext *global_cx)
{
/*  // test for how to add 1 token
    BitToken *token;
    BitTokenAttr *tokenAttr;
    char *s="i am bobo";
    token=Bit_NewToken(1);
    tokenAttr=Bit_NewTokenAttribute(0);
    tokenAttr->value=BitString_NewChars("http://a.htm");
    Bit_TokenAttrList_AddTail(token,tokenAttr);
    Bit_TokenList_AddTail(global_cx,token);*/
    char item;

    Token_EraseSpace(global_cx);
    while (global_cx->curPosition < global_cx->bufferLength)
    {
        item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);
        switch (item)
        {
        case '<':
            Token_ConsumTag(global_cx);
            global_cx->curPosition++;
            break;

        case 0xD:   // '\r'
        case 0xA:   // '\n'
        case ' ':
        case '>':   // fault tolerance: stray '>'
            global_cx->curPosition++;
            break;

        default:
            Token_Consum_PlainText(global_cx);
            global_cx->curPosition++;
            break;
        } //switch
    } //while
    return 0;
}

/*
 * Tokenize the whole input (global_strBuffer, global_bufferLength) in
 * chunks of up to 2048 characters.
 *
 * Each chunk is copied into strBuffer and then shortened from its end so
 * that it does not stop in the middle of a tag; the part that was cut off
 * is read again as the start of the next chunk.
 *
 * Allocates strBuffer (released by Bit_DestroyToken).  Returns without
 * tokenizing anything when that allocation fails.
 */
void Bit_BeginToken(BitTokenContext *global_cx)
{
    int step_length = 2048;
    int step = step_length;

    global_cx->global_curPosition = 0;
    global_cx->strBuffer = (char *)malloc(step + 2);    // +2: room for the appended '<' below
    if (global_cx->strBuffer == NULL)
        return;

    // step by step
    while (global_cx->global_curPosition < global_cx->global_bufferLength)
    {
        // read step or countleft number of chars to strBuffer
        int countleft = global_cx->global_bufferLength - global_cx->global_curPosition;

        if (step > countleft)
            step = countleft;
        BitString_Mid(global_cx->global_strBuffer, global_cx->strBuffer,
                      global_cx->global_curPosition, step);
        global_cx->global_curPosition += step;
        global_cx->curPosition = step;  // to the end

        // Now check whether the chunk ends on a complete tag; walk backward
        // until the last '<' or '>' (index 0 itself is not examined).
        while (global_cx->curPosition > 0)
        {
            char item = BitString_GetCharAt(global_cx->strBuffer, global_cx->curPosition);

            if (item == '<')
            {
                // unfinished tag: cut the chunk before it
                global_cx->curPosition--;
                break;
            }
            else if (item == '>')
            {
                // complete tag: the next chunk starts right after it
                global_cx->global_curPosition++;
                break;
            }
            global_cx->curPosition--;
            global_cx->global_curPosition--;
        } //while still not a complete tag

        if (global_cx->curPosition > 0)
        {
            // the buffer contains at least 1 tag, just tokenize it
            global_cx->bufferLength = global_cx->curPosition;
            global_cx->curPosition = 0;
            Bit_Tokenize(global_cx);
        }
        else if (global_cx->curPosition == 0)
        {
            // fault tolerance: no tag boundary found in this chunk
            if (step == step_length)
            {
                // maybe it is a long plain text: take the whole chunk back
                // and terminate it with '<' so the plain-text reader stops
                global_cx->global_curPosition += step;
                global_cx->bufferLength = step;
                BitString_AppendChar(global_cx->strBuffer, '<');
                Bit_Tokenize(global_cx);
            }
            else
            {
                // a short tail of text that forms no tag: not tokenized
                break;
            }
        }
        else
        {
            // curPosition<0, like "<a href=" (fault tolerance)
            break;
        }
    } //while for step by step
}
/* Text to print for a string that may be NULL (passing NULL to %s is undefined). */
static const char *BitToken_PrintableStr(const char *s)
{
    return (s != NULL) ? s : "(null)";
}

/*
 * Dump the tokenizer state and every token in the list to a text file,
 * for debugging.
 *
 * global_cx: context whose token list is written.
 * filename:  path of the file to create or overwrite.
 *
 * Plain-text tokens (type HTML_TEXT) are written with their pData; all
 * other tokens are written with their attribute list.  Strings that are
 * NULL -- e.g. the value of an attribute written without "=value" -- are
 * printed as "(null)".
 *
 * Does nothing when an argument is NULL or the file cannot be opened.
 */
void Bit_SaveTokenResult(BitTokenContext *global_cx, char *filename)
{
    BitTokenList *pTtokenList;
    BitTokenAttrList *pTokenAttrList;
    FILE *fp;

    if (global_cx == NULL || filename == NULL)
        return;

    fp = fopen(filename, "w");
    if (fp == NULL)
        return;

    fprintf(fp, "global_cx->curPosition:%d\n", global_cx->curPosition);
    fprintf(fp, "global_cx->global_curPosition:%d\n\n", global_cx->global_curPosition);

    /* the walk starts at ->next, i.e. the first list node is not printed */
    pTtokenList = (global_cx->tokenList != NULL) ? global_cx->tokenList->next : NULL;
    for (; pTtokenList != NULL; pTtokenList = pTtokenList->next)
    {
        fprintf(fp, "---------------------------------------\n");

        // name
        fprintf(fp, "Token Name:%s",
                BitToken_PrintableStr(BitHTML_GetTagName(pTtokenList->token->type)));
        if (pTtokenList->token->end == TRUE)
            fprintf(fp, " </end tag>\n");
        else
            fprintf(fp, "\n");

        // type
        fprintf(fp, "Token Type:%d\n", pTtokenList->token->type);

        if (pTtokenList->token->type == HTML_TEXT)
        {
            // plain text: print the data
            fprintf(fp, "Token->pData:%s\n",
                    BitToken_PrintableStr(pTtokenList->token->pData));
            continue;
        }

        // any other token: print its attributes (again starting at ->next)
        pTokenAttrList = (pTtokenList->token->attrList != NULL)
                             ? pTtokenList->token->attrList->next
                             : NULL;
        for (; pTokenAttrList != NULL; pTokenAttrList = pTokenAttrList->next)
        {
            fprintf(fp, "\nAttr->type:%d\n", pTokenAttrList->attr->type);
            fprintf(fp, "AttrName:%s\n",
                    BitToken_PrintableStr(BitHTML_GetAtrName(pTokenAttrList->attr->type)));
            fprintf(fp, "Attr->value:%s\n",
                    BitToken_PrintableStr(pTokenAttrList->attr->value));
        }
    }

    fclose(fp);
}
/*
 * Release a tokenizer context: the working buffer, the input buffer, the
 * token list and finally the context itself.
 *
 * global_strBuffer is freed here, so the buffer handed to the context must
 * be heap-allocated (or NULL).  Passing a NULL context is a no-op, like
 * free(NULL).
 *
 * Always returns 0.
 */
int Bit_DestroyToken(BitTokenContext *global_cx)
{
    if (global_cx == NULL)
        return 0;

    free(global_cx->strBuffer);
    free(global_cx->global_strBuffer);
    Bit_TokenList_Destroy(global_cx);
    free(global_cx);
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -