📄 urlfunct.h
字号:
return 1; } for(i=0;HtmlExtensions[i][0]!=0;i++) { if(stricmp(rPage+strlen(rPage)-strlen(HtmlExtensions[i]),(char*)HtmlExtensions[i])==0) { Host->type = 1; // Html file return 1; } } for(i=0;PlainTextExtension[i][0]!=0;i++) { if(stricmp(rPage+strlen(rPage)-strlen(PlainTextExtension[i]),(char*)PlainTextExtension[i])==0) { Host->type = 2; return 1; } } /*Support for custom extensions*/ /*TO TEST*/ for(i=0;CustomExtensions[i][0]!=0;i++) { if(stricmp(rPage+strlen(rPage)-strlen(CustomExtensions[i]),(char*)CustomExtensions[i])==0) { Host->type = 4; return 1; } } if(bArgs==1) Host->type = 1; else Host->type = 3; //discard it return 1;}/* PortNumFromHostname* hostname -><-* hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;*/unsigned int PortNumFromHostname(char* hostname){ unsigned int i; for(i=0;i<strlen(hostname);i++) if(hostname[i]==':') break; if(i!=strlen(hostname)) { hostname[i]=0; return (unsigned)atoi(hostname+i+1); } return PORT;}int GenerateURL(struct sHost Host,char* URL){ char port[5]; sprintf(port,"%d",Host.port); strcpy(URL,"http://"); strcat(URL,Host.Host); strcat(URL,":"); strcat(URL,port); strcat(URL,Host.Page); return 1;}/* ParseUrl* Url <- sHost* Url: "http://www.test.com/page.htm" ==>* ==> sHost.Url = Url && sHost.Host = "www.test.com" && sHost.Page = "page.htm"*/int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost){ char tUrl[MAXURLSIZE]; char BaseDir[MAXPAGESIZE]; unsigned int offset=0,i; char* token1=NULL; char* tmpPage; if(url==NULL || sh==NULL) return -1; if(strlen(url)>MAXURLSIZE-1) return -1; if( strnicmp(url,"ftp://",6)==0 || strnicmp(url,"mailto:",7)==0 || strnicmp(url,"about:",6)==0 || strnicmp(url,"irc://",6)==0 || strnicmp(url,"news://",7)==0 || strnicmp(url,"https://",8)==0) //protocols not supported return -1; memset(sh,0,sizeof(struct sHost)); memset(tUrl,0,MAXURLSIZE); for(i=0;i<strlen(url);i++) { if(url[i]=='#') { url[i]=0; break; } } if(url[0]==0) return -1; if(strnicmp(url,"http://",7)==0) { if(strlen(url)==7) return -1; else offset=7; } if(strncmp(url,"//",2)==0) { if(strlen(url)==7) return -1; else offset=2; } strncpy(tUrl,url+offset,strlen(url)-offset); tUrl[strlen(url)-offset]=0; if(offset>0) //url with prefix: "http://" || "//" { for(i=0;i<strlen(tUrl);i++) { if(tUrl[i]=='/' || tUrl[i]=='?') { token1=tUrl+i; break; } } if(token1>tUrl) //is there a '/'? { strncpy(sh->Host,tUrl,token1-tUrl); //yes: the host is the part of the string before '/' and the page the rest strncpy(sh->Page,token1,MAXPAGESIZE-1); if(strnicmp(sh->Page,"mailto:",7)==0) return -1; } else //no: the host is the url and the page is the index { strncpy(sh->Host,tUrl,MAXHOSTSIZE-1); strcpy(sh->Page,"/"); } sh->port = PortNumFromHostname(sh->Host); strtrim(sh->Host,sh->Host); tmpPage = malloc(MAXPAGESIZE); strtrim(sh->Page,tmpPage); ReplaceStr(tmpPage,sh->Page,"&","&"); FREE(tmpPage); /* currentHost has the same hostname and port and has an host_id */ if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port ) { /* yes: this page is from the same domain: use currentHost host_id */ sh->host_id = currentHost->host_id; } if(CheckPage(sh->Page)==-1) return -1; return PageType(sh); } else //now we expect a relative url { if(strlen(url)>MAXPAGESIZE-1) return -1; if(currentHost==NULL) //if we haven't a reference host we can't continue return -1; strncpy(sh->Host,currentHost->Host,MIN(MAXHOSTSIZE-1,strlen(currentHost->Host))); if(tUrl[0]!='/') //if the first char is not '/' we must consider the current directory GetDir(currentHost->Page,BaseDir); else BaseDir[0]=0; for(i=strlen(tUrl);i>0;i--) //is there a '.' before last '/'? if(tUrl[i]=='/') break; else if(tUrl[i]=='.') //yes: this is a page Ex. "/sources.html" { if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE) return -1; strcpy(sh->Page,BaseDir); strcat(sh->Page,tUrl); //get the port from the current Host sh->port = currentHost->port; strtrim(sh->Host,sh->Host); tmpPage = malloc(MAXPAGESIZE); strtrim(sh->Page,tmpPage); ReplaceStr(tmpPage,sh->Page,"&","&"); FREE(tmpPage); /* currentHost has the same hostname and port and has an host_id */ if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port ) { /* yes: this page is from the same domain: use currentHost host_id */ sh->host_id = currentHost->host_id; } if(CheckPage(sh->Page)==-1) return -1; return PageType(sh); } //is there a '.' before last '/'? : no: if i==strlen(tUrl) this is a directory Ex. "sample/" if((unsigned)i==strlen(tUrl)-1) { if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE) return -1; strcpy(sh->Page,BaseDir); strcat(sh->Page,tUrl); //get the port from the current Host sh->port = currentHost->port; strtrim(sh->Host,sh->Host); tmpPage = malloc(MAXPAGESIZE); strtrim(sh->Page,tmpPage); ReplaceStr(tmpPage,sh->Page,"&","&"); FREE(tmpPage); /* currentHost has the same hostname and port and has an host_id */ if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port ) { /* yes: this page is from the same domain: use currentHost host_id */ sh->host_id = currentHost->host_id; } if(CheckPage(sh->Page)==-1) return -1; return PageType(sh); } else //in this case we have a page like: "dir1/something" we consider it a directory { if(strlen(BaseDir)+strlen(tUrl)+1>=MAXPAGESIZE) return -1; strcpy(sh->Page,BaseDir); strcat(sh->Page,tUrl); if(strchr(tUrl,'?')==NULL) //if there is a '?' in the "directory" we consider it a page strcat(sh->Page,"/"); //get the port from the current Host sh->port = currentHost->port; strtrim(sh->Host,sh->Host); tmpPage = malloc(MAXPAGESIZE); strtrim(sh->Page,tmpPage); ReplaceStr(tmpPage,sh->Page,"&","&"); FREE(tmpPage); /* currentHost has the same hostname and port and has an host_id */ if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port ) { /* yes: this page is from the same domain: use currentHost host_id */ sh->host_id = currentHost->host_id; } if(CheckPage(sh->Page)==-1) return -1; return PageType(sh); } } }/* GetHostId* if the host exists in the table hostlist returns its id* else returns 0*/int GetHostId(struct sHost host){ char* sqlQuery; MYSQL_RES gRes; MYSQL_RES** tmpRes=NULL; MYSQL_ROW row; unsigned int ret; tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES)); sqlQuery = malloc(MAXQUERYSIZE); if(tmpRes==NULL || sqlQuery==NULL) MemoryCorruptedHandler("GetHostId"); snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id FROM %s.hostlist WHERE hostname='%s' AND port = %d LIMIT 1", DB1, host.Host, host.port); my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX); FREE(sqlQuery); row = mysql_fetch_row(&gRes); if(row) ret = atoi(row[0]); else ret = 0; if(*tmpRes) { mysql_free_result(*tmpRes); } FREE(tmpRes); return ret;}int pRelationships(struct sHost* links,struct sHost* linked,int level){ char* sqlQuery; int host_id; int linkedhost_id; if(bTesting==1) return 1; if( level>0 && level<3 ) { if( ((links? (*links) : IndexingHost ).host_id) == 0 ) host_id = GetHostId( (links? *links:IndexingHost) ); else host_id = ((links? (*links) : IndexingHost ).host_id); if( linked->host_id == 0 ) linkedhost_id = GetHostId( *linked ); else linkedhost_id = linked->host_id; if( host_id==0 || linkedhost_id==0 ) return 0; sqlQuery = malloc(MAXQUERYSIZE); if(level==1) //saves hostname only snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'/','/', '%s')",host_id,linkedhost_id , linked->Description ); else if(level==2) snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'%s','%s', '%s')",host_id,linkedhost_id,(links?links->Page:"/"),linked->Page , linked->Description ); my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1); FREE(sqlQuery); return 1; } else return 0;}/* unencode* transform the gave unicoded string in an unencoded string*/void unencode(char *src, char *last, char *dest){ for(; src <= last; src++, dest++) { if(*src == '%') { int code; if(sscanf(src+1, "%2x", &code) != 1) code = '?'; *dest = (char)code; src +=2; } else *dest = *src; } *dest = '\0'; return;}#endif/*EOF*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -