📄 spider.c
字号:
/*
 * Process one sibling list of the node tree, then descend into its children.
 *
 * Pass 1 walks `node` and its `brother` chain and calls HandOneNode() on
 * every entry whose IsHandled flag is 0. When DEBUG is set, the tree rooted
 * at the global NodeHeader is printed after each such call.
 * Pass 2 walks the same chain again and recurses into every `child` list
 * whose first entry is still unhandled.
 *
 * node: first entry of the sibling list; a null pointer is a no-op.
 */
void HandleInitNode(WEBNODE * node)
{
    WEBNODE * cur;

    if(!node)
        return;

    /* Pass 1: handle every unhandled sibling on this level. */
    for(cur = node; cur; cur = cur->brother)
    {
        if(cur->IsHandled != 0)
            continue;

        HandOneNode(cur);
        if(DEBUG)
        {
            fprintf(stdout, "\nDisplay.%5d:", FileNumber);
            DisplayNode(NodeHeader); /* display every node */
        }
    }

    /* Pass 2: descend into child lists that have not been processed yet. */
    for(cur = node; cur; cur = cur->brother)
    {
        if(cur->child && cur->child->IsHandled == 0)
            HandleInitNode(cur->child);
    }
}
/**************************************************************
Purpose: display the information of all nodes
***************************************************************/
/*
 * Print a sibling list and, recursively, every child list below it.
 *
 * Each node is written to stdout as "host:port<dir><page> => file IsHandled".
 * A page value of "@" is printed as an empty string. When dir is exactly "/"
 * it is printed as-is; otherwise it is wrapped as "/dir/".
 * All siblings of a level are printed before any of their children.
 *
 * NodeHeader: first entry of the sibling list to print (may be null).
 */
void DisplayNode(WEBNODE * NodeHeader)
{
    WEBNODE * cur;
    const char * page;
    int atRoot;

    fprintf(stdout, "\n");

    /* First all siblings on this level... */
    for(cur = NodeHeader; cur; cur = cur->brother)
    {
        atRoot = (strcmp(cur->dir, "/") == 0);
        page = strcmp(cur->page, "@") ? cur->page : "";

        if(atRoot)
            fprintf(stdout, "\t%s:%d%s%s => %s %d\n", cur->host, cur->port, cur->dir, page, cur->file, cur->IsHandled);
        else
            fprintf(stdout, "\t%s:%d/%s/%s => %s %d\n", cur->host, cur->port, cur->dir, page, cur->file, cur->IsHandled);
    }

    /* ...then each sibling's subtree. */
    for(cur = NodeHeader; cur; cur = cur->brother)
    {
        if(cur->child)
            DisplayNode(cur->child);
    }
}
/**************************************************************
功能:处理单个节点信息
***************************************************************/
/*
 * Process a single node: resolve its host, build the HTTP request text in
 * the global `request` buffer, then call DoneWithList(1) and AnalyzePage().
 *
 * Side effects: sets the globals NodeCurr and host, and overwrites `request`.
 * The process exits with status 1 when the host name cannot be resolved.
 *
 * NOTE(review): gethostbyname() reports failures through h_errno rather than
 * errno, so the strerror(errno) text in the debug message may not describe
 * the real cause - confirm and consider hstrerror(h_errno).
 * NOTE(review): sprintf() into `request` is unbounded; the buffer's size is
 * not visible from this function.
 *
 * node: the node to fetch; becomes the current node (NodeCurr).
 */
void HandOneNode(WEBNODE * node)
{
    char UserAgent[1024] = "";
    char Accept[1024] = "";
    char AcceptLanguage[1024] = "";
    char AcceptEncoding[1024] = "";
    char AcceptCharset[1024] = "";
    char KeepAlive[1024] = "";
    char Connection[1024] = "";
    char ContentType[1024] = "";
    const char * page;
    int atRoot;

    NodeCurr = node;

    /* get ip address by domain */
    host = gethostbyname(NodeCurr->host);
    if(host == NULL)
    {
        if(DEBUG)
            fprintf(stderr,"\tGethostname '%s' error, %s\n", NodeCurr->host, strerror(errno));
        exit(1);
    }

    /* Get client browser information */
    GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType);

    /* A page of "@" stands for "no page" and is sent as an empty string. */
    atRoot = (strcmp(NodeCurr->dir, "/") == 0);
    page = strcmp(NodeCurr->page, "@") ? NodeCurr->page : "";

    if(!atRoot)
        sprintf(request, "GET /%s/%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, page, NodeCurr->host, UserAgent, Accept, Connection);
    else
        sprintf(request, "GET %s%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, page, NodeCurr->host, UserAgent, Accept, Connection);

    DoneWithList(1);
    AnalyzePage(NodeCurr);
}
/**************************************************************
功能:从字符串 src 中分析出邮件地址保存到文件
***************************************************************/
/*
 * Scan `src` for e-mail-like tokens around each '@' and append them to
 * "email.txt", one per line, followed by a tab and the URL of the current
 * node (NodeCurr).
 *
 * For every '@', GetBeforePos()/GetAfterPos() give the start and the
 * one-past-end of the surrounding run of address characters. The token is
 * recorded only when it is longer than 3 characters.
 *
 * Fixes relative to the previous version:
 *  - a token that does not fit into the 1024-byte scratch buffer is skipped
 *    instead of overflowing it;
 *  - the scan no longer reads one byte past the terminating NUL when an
 *    address is the last thing in the string;
 *  - GetBeforePos() is not called when '@' is the very first byte, and a
 *    start position that ran below `src` is clamped to `src`.
 *
 * NOTE(review): GetBeforePos() itself has no lower bound, so it can still
 * read (not copy) bytes before `src` when the text starts with address
 * characters; fixing that needs a change to its interface.
 *
 * src: NUL-terminated text to scan; a null pointer is a no-op.
 */
void GetEmail(char * src)
{
    char * scan, * at, * begin, * end;
    char myemail[1024] = "";
    size_t addrLen;
    FILE * mailfp = NULL;

    if(!src)
        return;
    if((mailfp = fopen("email.txt", "a+")) == NULL)
        return;

    scan = src;
    while((at = strchr(scan, '@')) != NULL)
    {
        begin = end = NULL;

        /* Nothing precedes an '@' at the very start of the buffer. */
        if(at == src)
            begin = at;
        else
            GetBeforePos(at, &begin);
        GetAfterPos(at, &end);

        /* Never let the token start before the caller's buffer. */
        if(begin && begin < src)
            begin = src;

        if(begin && end && end > begin && (size_t)(end - begin) > 3)
        {
            addrLen = (size_t)(end - begin);

            /* Leave room for the terminating NUL; oversized tokens are skipped. */
            if(addrLen < sizeof myemail)
            {
                memset(myemail, 0, sizeof myemail);
                memcpy(myemail, begin, addrLen);
                if(strcmp(NodeCurr->dir, "/"))
                    fprintf(mailfp, "%s\thttp://%s/%s/%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
                else
                    fprintf(mailfp, "%s\thttp://%s%s%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
            }

            /* `end` is the delimiter after the token; stop if it is the NUL. */
            if(*end == '\0')
                break;
            scan = end + 1;
        }
        else
        {
            /* Not an address: resume right after this '@'. */
            scan = at + 1;
        }
    }

    fclose(mailfp);
}
/**************************************************************
功能:从 src 中找出前面的字母、数字等内含,即 email 地址中 @ 的前面部分
***************************************************************/
/*
 * Find the start of the run of address characters that ends just before
 * `src` (typically the '@' of an e-mail address).
 *
 * Scans backwards from src - 1 while the byte is an ASCII letter, a digit,
 * '.', '-' or '_', and stores in *d the address of the first byte of that
 * run. If the byte right before `src` is not an address character, *d is
 * set to `src` itself.
 *
 * Fix: the null guard used to test `src - 1`, which is never null, so a
 * null `src` was dereferenced. It now tests `src` and yields *d = NULL.
 *
 * NOTE: the scan has no lower bound. The caller must guarantee that a
 * non-address byte precedes the run inside the same buffer.
 *
 * src: position to scan backwards from (exclusive).
 * d:   out parameter receiving the start of the run, or NULL if src is NULL.
 */
void GetBeforePos(char * src, char ** d)
{
    char * x;
    char c;

    if(!src)
    {
        *d = 0;
        return;
    }

    x = src - 1;
    for(;;)
    {
        c = *x;
        if((c >= 'a' && c <= 'z') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') ||
           c == '.' || c == '-' || c == '_')
        {
            x--;
            continue;
        }
        break;
    }

    /* x stopped on the first non-address byte; the run starts right after it. */
    *d = x + 1;
}
/**************************************************************
功能:从 src 中找出后面的字母、数字等内含,即 email 地址中 @ 的后面部分
***************************************************************/
/*
 * Find the end of the run of address characters that starts just after
 * `src` (typically the '@' of an e-mail address).
 *
 * Scans forward from src + 1 while the byte is an ASCII letter, a digit,
 * '.', '-' or '_', and stores in *d the address of the first byte that is
 * not part of the run (possibly the terminating NUL).
 *
 * Fix: the null guard used to test `src + 1`, which is never null, so a
 * null `src` was dereferenced. It now tests `src` and yields *d = NULL.
 *
 * src: position to scan forward from (exclusive); must point at a byte that
 *      is followed by a NUL-terminated string.
 * d:   out parameter receiving the one-past-end position, or NULL if src is
 *      NULL.
 */
void GetAfterPos(char * src, char ** d)
{
    char * x;
    char c;

    if(!src)
    {
        *d = 0;
        return;
    }

    x = src + 1;
    for(;;)
    {
        c = *x;
        if((c >= 'a' && c <= 'z') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') ||
           c == '.' || c == '-' || c == '_')
        {
            x++;
            continue;
        }
        break;   /* also stops on the terminating NUL */
    }

    *d = x;
}
/**************************************************************
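Purpose: scan forward in src over letters, digits and URL punctuation, i.e. find the end of the part of a web address that follows the host name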
***************************************************************/
/*
 * Find the end of the run of URL characters that starts at `src`.
 *
 * The run consists of ASCII letters, digits and the punctuation
 * . - _ = : / ? &  . *d receives the address of the first byte that is not
 * part of the run (possibly the terminating NUL).
 *
 * src: NUL-terminated text to scan from (inclusive).
 * d:   out parameter; set to NULL when src is NULL.
 */
void GetAfterPosWithSlash(char * src, char ** d)
{
    static const char urlChars[] =
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789"
        ".-_=:/?&";

    if(!src)
    {
        *d = 0;
        return;
    }

    /* strspn() gives the length of the leading run made only of urlChars. */
    *d = src + strspn(src, urlChars);
}
/**************************************************************
功能:为 myanchor 分配 len 大小的内存
***************************************************************/
/*
 * Make *myanchor point to a zero-filled buffer of len + 1 bytes.
 *
 * If *myanchor is NULL a new buffer is allocated; otherwise the existing one
 * is resized. In both cases the previous contents are discarded, because the
 * whole buffer is cleared.
 *
 * Fixes relative to the previous version:
 *  - the allocation result is checked before it is written to (it used to
 *    be passed straight to memset, crashing on failure);
 *  - realloc's result no longer overwrites the only pointer to the old
 *    block, so nothing leaks on failure;
 *  - a negative len is rejected instead of wrapping to a huge size.
 *
 * On failure (negative len or out of memory) the old buffer is released and
 * *myanchor is set to NULL, so callers can test the result.
 *
 * myanchor: address of the owning pointer; must be NULL or a heap pointer.
 * len:      number of usable bytes, excluding the terminating NUL.
 */
void GetMemory(char ** myanchor, int len)
{
    char * fresh;
    size_t bytes;

    if(!myanchor)
        return;

    if(len < 0)
    {
        free(*myanchor);
        *myanchor = NULL;
        return;
    }

    bytes = (size_t)len + 1;

    /* realloc(NULL, n) behaves like malloc(n), so one call covers both cases. */
    fresh = realloc(*myanchor, bytes);
    if(!fresh)
    {
        free(*myanchor);
        *myanchor = NULL;
        return;
    }

    memset(fresh, 0, bytes);
    *myanchor = fresh;
}
/**************************************************************
功能:从 src 中分析出网页链接,并加入到当前节点的子节点上
***************************************************************/
/*
 * Extract every href target from `src` and hand it to AddChildNode() as a
 * child of the current node (NodeCurr).
 *
 * Three forms are recognised: href='...', href="..." and an unquoted
 * href=... whose end is found with GetAfterPosWithSlash(). Empty targets
 * are ignored.
 *
 * Fixes relative to the previous version:
 *  - links are processed in document order. Previously the single-quoted
 *    form was searched first across the whole remaining text, so
 *    double-quoted or unquoted links before it were skipped;
 *  - a quoted href with no closing quote is skipped instead of passing NULL
 *    to strlen();
 *  - the scan stops at the terminating NUL instead of stepping past it;
 *  - the scratch buffer is released before returning (it was leaked);
 *  - a failed GetMemory() ends the scan instead of writing through NULL.
 *
 * src: NUL-terminated HTML text; a null pointer is a no-op.
 */
void GetLink(char * src)
{
    char * cursor, * start, * end;
    char * myanchor = 0;
    char quote;
    int len = 0;

    if(!src)
        return;

    cursor = src;
    while((cursor = strstr(cursor, "href=")) != NULL)
    {
        start = cursor + 5;
        end = NULL;

        if(*start == '\'' || *start == '"')
        {
            quote = *start;
            start++;
            end = strchr(start, quote);
            if(!end)
            {
                /* Unterminated quote: skip this href, keep looking for others. */
                cursor = start;
                continue;
            }
        }
        else
        {
            GetAfterPosWithSlash(start, &end);
            if(!end)
                break;
        }

        len = (int)(end - start);
        GetMemory(&myanchor, len);
        if(!myanchor)
            break;
        memcpy(myanchor, start, (size_t)len);

        if(strlen(myanchor) > 0)
            AddChildNode(NodeCurr, myanchor);

        /* `end` is the delimiter after the target; stop if it is the NUL. */
        if(*end == '\0')
            break;
        cursor = end + 1;
    }

    /* The buffer was already reused for every link, so nothing can still
       rely on its contents; release it. */
    free(myanchor);
}
/**************************************************************
功能:为当前节点增加子节点
***************************************************************/
/*
 * Turn the link text `src` into a (host, port, dir, page) tuple and, unless
 * an identical tuple already exists in the tree rooted at NodeHeader, attach
 * a new WEBNODE for it below `node`.
 *
 * node: the page on which the link was found; supplies host, port and dir
 *       for links that are not resolved by GetHost().
 * src:  the link target as it appeared in the page (not modified here).
 *
 * Uses names that are not declared in this function: ret, e, NodeHeader,
 * NodeTail and FileNumber.
 * NOTE(review): none of the malloc() results below are checked.
 */
void AddChildNode(WEBNODE * node, char * src)
{
int WebPort, len;
char * WebHost = 0, * PageAddress = 0, * WebDir = 0, * pC = 0;
WEBNODE * NewNode;
char filename[MAXFILENAME + 1] = "";
char IsFromRoot = 0;
/* Reject links that are not crawled: no link at all, mail links, anything
   containing one of the listed extensions, javascript: links, and links
   containing '+'. The extension tests are substring matches anywhere in src. */
if(!src) return;
if(!strncasecmp(src, "mailto:", strlen("mailto:"))) return ;
if(strstr(src, ".css")) return;
if(strstr(src, ".xml")) return;
if(strstr(src, ".ico")) return;
if(strstr(src, ".jpg")) return;
if(strstr(src, ".gif")) return;
if(strstr(src, "javascript:")) return;
if(strstr(src, "+")) return;
/* NOTE(review): a non-zero result from GetHost() is treated below as "src
   carries no host of its own" (a relative link) - confirm against GetHost(). */
ret = GetHost(src, &WebHost, &PageAddress, &WebPort, &WebDir);
if(ret)
{
/* Relative link: inherit host and port from the referring node. */
len = strlen(node->host);
GetMemory(&WebHost, len);
strcpy(WebHost, node->host);
WebPort = node->port;
/* A leading '/' means the path is relative to the site root rather than to
   node->dir. */
IsFromRoot = !strncmp(src, "/", 1);
/* NOTE(review): presumably Rstrchr() stores in pC the last '/' of its first
   argument, or 0 when there is none - confirm. `src + 1` is never null, so
   the final else below is not expected to run. */
if(IsFromRoot && (src + 1))
Rstrchr(src + 1, '/', &pC);
else if(!IsFromRoot)
Rstrchr(src, '/', &pC);
else
pC = 0;
if(pC)
{
/* The link has a directory part (everything before pC) and a page part
   (everything after pC). */
if(IsFromRoot)
len = strlen(src + 1) - strlen(pC);
else
len = strlen(src) - strlen(pC) + strlen(node->dir) + 1;
GetMemory(&WebDir, len);
if(IsFromRoot)
memcpy(WebDir, src + 1, len);
else
{
/* WebDir = node->dir + "/" + directory part of src. The buffer was
   zero-filled by GetMemory(), so strcat() finds a terminator. */
memcpy(WebDir, node->dir, strlen(node->dir));
strcat(WebDir, "/");
memcpy(WebDir + strlen(node->dir) + 1, src, strlen(src) - strlen(pC));
}
/* `pC + 1` is never null for a valid pC, so the else branch below is
   effectively dead; a link ending in '/' yields an empty page string. */
if(pC + 1)
{
len = strlen(pC + 1);
GetMemory(&PageAddress, len);
strcpy(PageAddress, pC + 1);
}
else
{
/* NOTE(review): `e` is defined elsewhere; presumably its first character is
   the "@" no-page placeholder - confirm. */
len = 1;
GetMemory(&PageAddress, len);
memcpy(PageAddress, e, len);
}
}
else
{
/* No '/' inside the link: it is a bare page name. */
if(IsFromRoot)
{
/* Directory is the single character e[1]. NOTE(review): presumably "/" -
   confirm against the definition of `e`. */
len = 1;
GetMemory(&WebDir, len);
memcpy(WebDir, e + 1, len);
len = strlen(src + 1);
GetMemory(&PageAddress, len);
memcpy(PageAddress, src + 1, len);
}
else
{
/* Same directory as the referring node; the whole link is the page. */
len = strlen(node->dir);
GetMemory(&WebDir, len);
memcpy(WebDir, node->dir, len);
len = strlen(src);
GetMemory(&PageAddress, len);
memcpy(PageAddress, src, len);
}
}
}
/* Skip links whose tuple is already present anywhere in the tree. */
ret = IsExistWeb(NodeHeader, WebHost, PageAddress, WebPort, WebDir);
if(ret) goto __ReturnAdd;
/* Link the new node: as the first child of `node`, or after NodeTail, the
   most recently added node. NOTE(review): this assumes NodeTail is the last
   entry of node's child list - confirm against the callers. */
if(node->child == NULL)
NewNode = node->child = (WEBNODE *)malloc(sizeof(WEBNODE));
else
NodeTail->brother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE));
memset(NewNode, 0, sizeof(WEBNODE));
/* The node owns private copies of all strings; the temporaries are freed
   at the end of this function. */
NewNode->host = (char *)malloc(strlen(WebHost) + 1);
memset(NewNode->host, 0, strlen(WebHost) + 1);
NewNode->page = (char *)malloc(strlen(PageAddress) + 1);
memset(NewNode->page, 0, strlen(PageAddress) + 1);
NewNode->dir = (char *)malloc(strlen(WebDir) + 1);
memset(NewNode->dir, 0, strlen(WebDir) + 1);
NewNode->file = (char *)malloc(MAXFILENAME + 1);
memset(NewNode->file, 0, MAXFILENAME + 1);
strcpy(NewNode->host, WebHost);
strcpy(NewNode->page, PageAddress);
strcpy(NewNode->dir, WebDir);
/* Each new node gets the next sequential file name; FileNumber is
   post-incremented. */
sprintf(filename, "file%05d.html", FileNumber++);
strcpy(NewNode->file, filename);
NewNode->port = WebPort;
NewNode->IsHandled = 0;
NewNode->brother = 0;
NewNode->child = 0;
NodeTail = NewNode;
/* Common exit: release the temporary strings (free(NULL) is harmless). */
__ReturnAdd:
free(WebHost);
free(PageAddress);
free(WebDir);
}
/**************************************************************
功能:检查是否已经处理过的网页
***************************************************************/
/*
 * Check whether a page identified by (host, page, port, dir) is already
 * present in the tree that starts at `node`.
 *
 * The sibling list of `node` is compared first; only if none matches are the
 * child lists searched recursively. The result of each recursive call is
 * stored in `ret`, which is not declared in this function.
 *
 * Returns 1 when a node on this level matches, 2 when the match was found
 * somewhere below this level, and 0 when there is no match.
 */
int IsExistWeb(WEBNODE * node, char * host, char * page, int port, char * dir)
{
    WEBNODE * cur;

    /* Level first: compare every sibling against the requested tuple. */
    for(cur = node; cur; cur = cur->brother)
    {
        if(strcmp(cur->host, host) != 0)
            continue;
        if(strcmp(cur->page, page) != 0)
            continue;
        if(cur->port != port)
            continue;
        if(strcmp(cur->dir, dir) != 0)
            continue;
        return 1;
    }

    /* Then depth: search each sibling's child list. */
    for(cur = node; cur; cur = cur->brother)
    {
        if(!cur->child)
            continue;
        ret = IsExistWeb(cur->child, host, page, port, dir);
        if(ret)
            return 2;
    }

    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -