📄 htrobot.c
字号:
HTChunk_puts(buffer, value); } } if (!first) HTChunk_puts(buffer, ")"); HTLog_addLine(mr->conneg, HTChunk_data(buffer)); HTChunk_delete(buffer); HT_FREE(uri); } } /* Count the amount of body data that we have read */ if (HTRequest_method(request) == METHOD_GET) { int length = HTAnchor_length(HTRequest_anchor(request)); if (length > 0) mr->get_bytes += length; mr->get_docs++; } else if (HTRequest_method(request) == METHOD_HEAD) { int length = HTAnchor_length(HTRequest_anchor(request)); if (length > 0) mr->head_bytes += length; mr->head_docs++; } else { mr->other_docs++; } if (!(mr->flags & MR_BFS)) { /* Delete this thread */ Finger_delete(finger); /* Should we stop? */ if (mr->cnt <= 0) { if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n"); Cleanup(mr, 0); /* No way back from here */ } } if (SHOW_QUIET(mr)) HTPrint(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s"); return HT_OK;}PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response, void * param, int status) { Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; HTParentAnchor * dest = finger->dest; HyperDoc * hd = HTAnchor_document(dest); int depth = (hd ? hd->depth : -1); if (hd) set_error_state_hyperdoc(hd,request); if(hd && (HTRequest_method(request)== METHOD_HEAD) && (depth < mr->depth)) { hd->method = METHOD_GET; HTQueue_append(mr->queue, (void *)hd); (mr->cq)++; } Finger_delete(finger); if(!(mr->flags & MR_PREEMPTIVE)) Serving_queue(mr); return HT_OK;}PUBLIC void Serving_queue(Robot *mr){ BOOL abort = NO; Finger *nfinger; while(!abort) { if(!HTQueue_isEmpty(mr->queue)) { HTRequest *newreq; HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue); if(nhd) { char *uri = HTAnchor_address((HTAnchor *)nhd->anchor); HTQueue_dequeue(mr->queue); (mr->cq)--; nfinger = Finger_new(mr, nhd->anchor, nhd->method); newreq = nfinger->request; if(SHOW_QUIET(mr)) HTPrint("Request from QUEUE %s\n",uri); HT_FREE(uri); if(SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq); HTRequest_setParent(newreq,get_last_parent(nhd->anchor)); /* @@@ Should be done using a timer and not sleep! @@@ */#if 0 if(mr->waits) sleep(mr->waits);#endif if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES) { if (SHOW_QUIET(mr)) HTPrint("not tested!\n"); Finger_delete(nfinger); } } else abort = YES; } else abort = YES; } if(SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq); if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE))) { if(mr->cnt > 0) if(SHOW_QUIET(mr)) HTPrint("%d requests were not served\n", mr->cnt); if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n"); Cleanup(mr, 0); /* No way back from here */ }}/* ------------------------------------------------------------------------- *//* HTEXT INTERFACE *//* ------------------------------------------------------------------------- */PUBLIC BOOL Robot_registerHTMLParser (void){ HText_registerCDCallback(RHText_new, RHText_delete); HText_registerLinkCallback(RHText_foundLink); return YES;}PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor, HTStream * stream){ HText * me; Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; char * robots = NULL; if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL) HT_OUTOFMEM("HText_new2"); /* Bind the HText object together with the Request Object */ me->request = request; me->follow = YES; /* Check to see if we have any meta tags */ if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) { char * strval = NULL; char * ptr = NULL; char * token = NULL; StrAllocCopy(strval, robots); ptr = strval; while ((token = HTNextField(&ptr)) != NULL) { if (!strcasecomp(token, "nofollow")) { me->follow = NO; break; } } HT_FREE(strval); } /* Add this HyperDoc object to our list */ if (!mr->htext) mr->htext = HTList_new(); HTList_addObject(mr->htext, (void *) me); return me;}PRIVATE BOOL RHText_delete (HText * me) { if (me) { HT_FREE(me); return YES; } return NO;}PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor){ if (text && anchor) { Finger * finger = (Finger *) HTRequest_context(text->request); Robot * mr = finger->robot; HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor); HTParentAnchor * dest_parent = HTAnchor_parent(dest); char * uri = HTAnchor_address((HTAnchor *) dest_parent); HyperDoc * hd = HTAnchor_document(dest_parent); HTParentAnchor * referer = HTRequest_anchor(text->request); BOOL match = text->follow; BOOL check = NO; /* These are new variables */ HyperDoc * nhd = NULL; BOOL follow = YES; /* These three variables were moved */ /*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/ HTParentAnchor * last_anchor = HTRequest_anchor(text->request); HyperDoc * last_doc = HTAnchor_document(last_anchor); int depth = last_doc ? last_doc->depth+1 : 0; if (!uri) return; if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n"); if (hd) { if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n"); hd->hits++;#ifdef HT_MYSQL if (mr->sqllog) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (ref_addr) { HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri, "referer", NULL); HT_FREE(ref_addr); } }#endif HT_FREE(uri); return; } /* Check for prefix match */ if (match && mr->prefix) { match = HTStrMatch(mr->prefix, uri) ? YES : NO; }#ifdef HT_POSIX_REGEX /* ** Check for any regular expression. The include may override ** the prefix matching */ if (mr->include) { match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES; } if (match && mr->exc_robot) { match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO; } if (match && mr->exclude) { match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO; } if (match && mr->check) { check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES; }#endif if(uri && test_for_blank_spaces(uri)) follow = NO; else if (mr->ndoc == 0) /* Number of Documents is reached */ follow = NO; /* Test whether we already have a hyperdoc for this document */ if(!hd && dest_parent) { nhd = HyperDoc_new(mr, dest_parent, depth); mr->cdepth[depth]++; } /* Test whether we already have a hyperdoc for this document */ if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) { if (mr->flags & MR_BFS) { nhd->method = METHOD_HEAD; HTQueue_enqueue(mr->queue, (void *) nhd); (mr->cq)++; if(mr->ndoc > 0) mr->ndoc--; } else { Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET); HTRequest * newreq = newfinger->request; HTRequest_setParent(newreq, referer); if (check || depth >= mr->depth) { if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth); HTRequest_setMethod(newreq, METHOD_HEAD); } else { if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth); } if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) { if (SHOW_QUIET(mr)) HTPrint("not tested!\n"); Finger_delete(newfinger); } } } else { if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");#ifdef HT_MYSQL if (mr->reject || mr->sqllog) {#else if (mr->reject) {#endif if (referer) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (mr->reject && ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);#ifdef HT_MYSQL if (mr->sqllog && mr->sqlexternals && ref_addr) HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri, "referer", NULL);#endif HT_FREE(ref_addr); } } } HT_FREE(uri); }}PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor, const char *alt, const char * align, BOOL isMap){ if (text && anchor) { Finger * finger = (Finger *) HTRequest_context(text->request); Robot * mr = finger->robot; if (mr->flags & MR_IMG) { HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor); HTParentAnchor * dest_parent = HTAnchor_parent(dest); char * uri = HTAnchor_address((HTAnchor *) dest_parent); HyperDoc * hd = HTAnchor_document(dest_parent); HTParentAnchor * referer = HTRequest_anchor(text->request); BOOL match = YES; if (!uri) return; if (hd) { if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n"); hd->hits++;#ifdef HT_MYSQL if (mr->sqllog) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (ref_addr) { HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri, "image", alt); HT_FREE(ref_addr); } }#endif HT_FREE(uri); return; } /* Check for prefix match */ if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;#ifdef HT_POSIX_REGEX /* ** Check for any regular expression. The include may override ** the prefix matching */ if (mr->include) { match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES; } if (match && mr->exc_robot) { match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO; } if (match && mr->exclude) { match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO; }#endif /* Test whether we already have a hyperdoc for this document */ if (match && dest) { Finger * newfinger = Finger_new(mr, dest_parent, mr->flags & MR_SAVE ? METHOD_GET : METHOD_HEAD); HTRequest * newreq = newfinger->request; HyperDoc_new(mr, dest_parent, 1); HTRequest_setParent(newreq, referer); /* Check whether we should report missing ALT tags */ if (mr->noalttag && (alt==NULL || *alt=='\0')) { if (referer) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri); HT_FREE(ref_addr); } } if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri); if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) { if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n"); Finger_delete(newfinger); } } else { if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");#ifdef HT_MYSQL if (mr->reject || mr->sqllog) {#else if (mr->reject) {#endif if (referer) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (mr->reject && ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);#ifdef HT_MYSQL if (mr->sqllog && mr->sqlexternals && ref_addr) HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri, "image", alt);#endif HT_FREE(ref_addr); } } } HT_FREE(uri); } }}PRIVATE void RHText_foundLink (HText * text, int element_number, int attribute_number, HTChildAnchor * anchor, const BOOL * present, const char ** value){ if (text && anchor) { Finger * finger = (Finger *) HTRequest_context(text->request); Robot * mr = finger->robot; if (SHOW_QUIET(mr)) HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n", element_number, attribute_number, anchor); if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) || (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND)) RHText_foundImage(text, anchor, NULL, NULL, NO); else RHText_foundAnchor(text, anchor); }}PUBLIC char * get_robots_txt(char * uri){ char *str = NULL; HTChunk * chunk; HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri)); HTRequest *request = HTRequest_new(); HTRequest_setOutputFormat(request, WWW_SOURCE); HTRequest_setPreemptive(request, YES); HTRequest_setMethod(request, METHOD_GET); chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request); str = HTChunk_toCString(chunk); HTRequest_delete(request); return str;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -