📄 htmlfnct.h
字号:
/* OpenWebSpider * * Authors: Stefano Alimonti AND Stefano Fantin * Version: 0.7 * E-Mails: shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it * * * This file is part of OpenWebSpider * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */#ifndef __HTMLFNCT#define __HTMLFNCT/* ForgePacket * hst -> packet <- * hst.Page = "/prova.htm" ==> packet = "GET /prova.htm HTTP/1.1" */int ForgeHTTPPacket(struct sHost hst,char * packet){char unicodedFilename[MAXURLSIZE];int g=0;int i; for(i=0;(unsigned)i<strlen(hst.Page);i++) if(hst.Page[i]==' ') { strcat(unicodedFilename,"%20"); g+=3; } else { unicodedFilename[g++]=hst.Page[i]; unicodedFilename[g]=0; } sprintf(packet,"GET %s HTTP/1.0\r\nAccept: */*\r\nHost: %s\r\nUser-Agent: OpenWebSpider/%s (http://www.openwebspider.org)\r\n\r\n",unicodedFilename,hst.Host,VERSION);return 1;}/* ParseHTTPRequest * recvdpkt -> htmlOut <- maxout -> * Return the packet without the HTTP header */int ParseHTTPRequest(char* recvdpkt,char* htmlOut,int maxout,char* httpHeader, char* stuff,int level){int c;char* pTmp;int loc;char *sLocation=stuff;struct sHost locHost; if(strnicmp(recvdpkt,"HTTP/1",6)!=0) return 0; memset(httpHeader,0,MAXHTTPSTATUSSIZE); memset(sLocation,0,MAXHOSTSIZE); pTmp=strstr(recvdpkt,"\r\n"); if(pTmp && pTmp>recvdpkt && pTmp-recvdpkt<MAXHTTPSTATUSSIZE) { strncpy(httpHeader,recvdpkt,pTmp-recvdpkt); } /* recvdpkt: "HTTP/1.1 302 Found Date: Sun, 27 Mar 2005 09:15:55 GMT Server: Apache/1.3.33 (Unix) PHP/4.3.10 X-Powered-By: PHP/4.3.10 Location: http://www.openwebspider.org Connection: close Transfer-Encoding: chunked Content-Type: text/html "*/ loc=0; if(strnicmp(recvdpkt,"HTTP/1.1 302",12)==0 || strnicmp(recvdpkt,"HTTP/1.0 302",12)==0) loc=1; for(c=0;c<(signed)strlen(recvdpkt);c++) { if(loc==1) { if(strncmp(recvdpkt+c,"Location",8)==0) { pTmp=strstr(recvdpkt+c+10,"\r\n"); if(pTmp) { if(pTmp-(recvdpkt+c+10)<MAXURLSIZE) { strncpy(sLocation,recvdpkt+c+10,pTmp-(recvdpkt+c+10)); ParseUrl(sLocation,&locHost,&IndexingHost); AddUrl(locHost,level,NULL); } else { strcpy(sLocation,"<Url too long>"); } } } } if(strncmp(recvdpkt+c,"\r\n\r\n",4)==0) { memcpy(htmlOut,recvdpkt+c+4,MIN(maxout-((recvdpkt+c+4)-recvdpkt),MAXPACKETSIZE-1)); return (strnicmp(recvdpkt,"HTTP/1.1 200",12)==0 || strnicmp(recvdpkt,"HTTP/1.0 200",12)==0)?2:1; } }return 0;}char* RemoveTag(char* html, char* startTag, char* endTag){char* startTagFound;char* endTagFound; //<!-- remove html comments--> (Marius Roibu) startTagFound = my_stristr(html,startTag); if(startTagFound) endTagFound = my_stristr(startTagFound,endTag); /* if startTagFound<endTagFound -> HTML is wrong */ while((startTagFound!= NULL) && (endTagFound!= NULL) && startTagFound<endTagFound ) { endTagFound += strlen(endTag); startTagFound[0] = 0; strcat(html, " "); strcat(html, endTagFound); startTagFound = my_stristr(html,startTag); endTagFound = my_stristr(html,endTag); } //end alterationreturn html;}/* BetweenTag * html text -> tag -> maxout out <- * html: "<p align=center>bye bye</p> ==> "align=center>ciao ciao" */int BetweenTag(char* html, char* tag,char* out,int endwithstarttag,int maxout){char tmptag1[MAXTAGSIZE+1], tmptag2[MAXTAGSIZE+3];int iRelPos=0;char* tmpPacket;char* startTag;char* tmpP; sprintf(tmptag1,"<%s",tag); sprintf(tmptag2,"</%s>",tag); tmpPacket = malloc(MAXPACKETSIZE); if(tmpPacket==NULL) MemoryCorruptedHandler("BetweenTag"); atoupper(html,tmpPacket,MAXPACKETSIZE-1); do { if((startTag=my_stristr(tmpPacket+iRelPos,_strupr(tmptag1)))<tmpPacket) { FREE(tmpPacket); return -1; } iRelPos=(startTag-tmpPacket)+strlen(tmptag1); //loop untile the tag is followed by a char that's not ' ' or '>' or CRLF or a tab /* tmpPacket[iRelPos]!=' ' && tmpPacket[iRelPos]!='>' && tmpPacket[iRelPos]!='\r' && tmpPacket[iRelPos]!='\n' && tmpPacket[iRelPos]!='\t') return BetweenTag(startTag+strlen(tmptag1)+1, tag, out,endwithstarttag,maxout); */ }while(tmpPacket[iRelPos]!=' ' && tmpPacket[iRelPos]!='>' && tmpPacket[iRelPos]!='\r' && tmpPacket[iRelPos]!='\n' && tmpPacket[iRelPos]!='\t'); if(endwithstarttag==1) //Ex. <A href=sample.c>sample code</A> tmpP=my_stristr(tmpPacket+iRelPos,_strupr(tmptag2)); else //Ex. <IMG src=sample.jpg> tmpP=strchr(tmpPacket+iRelPos,'>'); if(tmpP>tmpPacket+iRelPos && tmpP-(tmpPacket+iRelPos)<=maxout) { strncpy(out,html+iRelPos,tmpP-(tmpPacket+iRelPos)); out[tmpP-(tmpPacket+iRelPos)]=0; FREE(tmpPacket); return iRelPos+1; } out[0]=0; FREE(tmpPacket);return -1;}/* UnHtml * html -> text <- * html = "<p align="left"><font face="Arial" size="2">TesT123</font></p>" * text => TesT123 */int UnHtml(char* html, char* text,int maxout){int i, m, x=0, pOpen=0;unsigned char curC; memset(text,0,maxout); RemoveTag(html,"<!--","-->"); RemoveTag(html,"<script","</script>"); RemoveTag(html,"<style","</style>"); m=MIN((signed)strlen(html),maxout); for(i=0;i<m;i++) { curC=html[i]; if(curC=='<') { pOpen=1; /* "a<br>b" => "a b"; "a <br>b" => "a b" */ if(x && text[x-1]!=' ') text[x++]=' '; } else if(curC=='>') pOpen=0; if(pOpen==0 && curC!='>') { /* RemoveShit */ if( /*!( (curC>=32 && curC<=126) || (curC>=192 && curC<=255) ) ||*/ curC=='\'' || curC=='\"' || curC=='\\' || curC=='\n' || curC=='\r' || curC=='\t') curC=' '; if(x && text[x-1]==' ' && curC==' ') //if x>0 and last char is space and current char is space -> don't add this char continue; else text[x++]=curC; } }return x;}/* LookForUrls * html -> AddUrl() <- */int LookForUrls(char *html,struct sHost hst){char a2a[MAXTAGLENGTH]; //<a>...........</a>char tmpurl[MAXURLSIZE];char trimurl[MAXURLSIZE];char encodedurl[MAXURLSIZE];char fnd[MAXDESCRIPTIONSIZE];char strComment[MAXDESCRIPTIONSIZE];int strlenhtml;int c,i,x,tmpc,y;struct sHost tmphst;int apix=0;int stage;int nUrlFound=0;struct sHost sBaseHref;struct sHost* sReferringHost=&hst; RemoveTag(html,"<!--","-->"); strlenhtml=strlen(html); for(y=0;taglist[y].flag!=-1;y++) { tmpc=c=0; while(c<strlenhtml) { if((tmpc = BetweenTag(html+c,taglist[y].bTag,a2a,taglist[y].flag ,sizeof(a2a)))==-1) break; ReplaceChr(a2a,'\n',' '); ReplaceChr(a2a,'\r',' '); c += tmpc+strlen(taglist[y].bTag); stage=0; x=0; apix=0; for(i=0;i<(signed)strlen(a2a);i++) { switch(stage) { case 0: //looks for start tag if(strnicmp(a2a+i,taglist[y].eTag,strlen(taglist[y].eTag))==0) { stage=1; //start tag found i+=strlen(taglist[y].eTag); memset(tmpurl,0,sizeof(tmpurl)); } break; case 1: //looks for '\"' or '\'' if(a2a[i]=='\"' || a2a[i]=='\'' ) //start '"' found { stage=2; apix=1; break; } else if(a2a[i]=='=') break; if(a2a[i]!=' ' && a2a[i]!='\n' && a2a[i]!='\r') //If a2a[i] is not a delimiter consider it as data (apix=0) { stage=2; i--; } break; case 2: /* URL too long */ if(x>=MAXURLSIZE-1) { stage=3; tmpurl[0]=0; break; } if(apix==1) /* "<a href="test.htm">test</a>" OR "<a href='test.htm'>test</a>" */ { if(a2a[i]!='\"' && a2a[i]!='\'' ) //while end '"' is not found { tmpurl[x++]=a2a[i]; break; } else //end '"' found { stage=3; tmpurl[x]=0; break; } } else /* "<a href=test.htm>test</a>" */ { /*if(strlen(a2a+i)!=1) { tmpurl[x++]=a2a[i]; stage=3; tmpurl[x]=0; break; } else */ if(a2a[i]!=' ' && a2a[i]!='>' && a2a[i]!='\"' && a2a[i]!='\'' && strlen(a2a+i)!=1) { tmpurl[x++]=a2a[i]; break; } else //end '"' found { stage=3; tmpurl[x]=0; break; } } } /*switch*/ if(stage==3) //exits from for{} break; } /*for*/ if(stage==3) if(tmpurl[0]!=0 && strnicmp(tmpurl,"javascript:",11)!=0) { strtrim(tmpurl, trimurl); memset(encodedurl,0,sizeof(encodedurl)); unencode(trimurl,trimurl+strlen(trimurl),encodedurl); //Support 4 unicode fnd[0]=0; if(stricmp(taglist[y].bTag,"base")==0) //if TAG is BASE { if(ParseUrl(encodedurl,&sBaseHref,NULL)==-1) continue; /* sReferringHost has the same hostname and port and has an host_id */ if( sReferringHost && sReferringHost->host_id!=0 && strcmp( sReferringHost->Host, sBaseHref.Host ) == 0 && sReferringHost->port == sBaseHref.port ) { /* yes: this page is from the same domain: use currentHost host_id */ sBaseHref.host_id = sReferringHost->host_id; } sReferringHost=&sBaseHref; continue; } else if(stricmp(taglist[y].bTag,"a")==0) //if TAG is A { //This shit is needed cause a2a doesn't start and doesn't end respectively with <> a2a[0]='<'; strcat(a2a,">"); //looks for the href's comment <a href...>XXX</a> UnHtml(a2a,fnd,sizeof(fnd)); UnToken(fnd,"\r\n\t",strComment,strlen(fnd)); strtrim(strComment,strComment); OnlyOneSpace(strComment,fnd,sizeof(fnd)); } if(ParseUrl(encodedurl,&tmphst,sReferringHost)==-1) continue; if(bTokenIn(encodedurl,"<>\r\n\t\\",strlen(encodedurl))==0) { tmphst.viewed = 0; memcpy(tmphst.Description,fnd,MAXDESCRIPTIONSIZE-1); if(tmphst.type != 3) //Add only HTML or plain/text file or custom html files { /*if(strchr(tmphst.Page,' ')>tmphst.Page) printf("\n\nasd\n\n"); */ nUrlFound++; AddUrl(tmphst,hst.level,&hst); } } } } }return nUrlFound;}#endif/*EOF*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -