📄 htrobot.c
字号:
/* Find mediatype distribution */ if (mr->mtfile) { HTList * mtdist = mediatype_distribution(array); if (mtdist) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged media type distribution in file `%s\'\n", mr->mtfile); log_meta_distribution(mr->mtfile, mtdist); delete_meta_distribution(mtdist); } } /* Find charset distribution */ if (mr->charsetfile) { HTList * charsetdist = charset_distribution(array); if (charsetdist) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged charset distribution in file `%s\'\n", mr->charsetfile); log_meta_distribution(mr->charsetfile, charsetdist); delete_meta_distribution(charsetdist); } } /* Add as may other stats here as you like */ /* ... */ /* Delete the array */ HTArray_delete(array); } } return YES;}PRIVATE HTParentAnchor *get_last_parent(HTParentAnchor *anchor){ HTAnchor *anc; HTList *sources = anchor->sources; while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) { HTParentAnchor *panchor = HTAnchor_parent(anc); return panchor; } return NULL;}PRIVATE HTLink *HTLink_find_type(HTAnchor * src, HTAnchor * dest, char *linktype){ if(src && dest && linktype) { HTLink * link = HTAnchor_mainLink(src); HTList * sublinks = HTAnchor_subLinks(src); HTLinkType type = (HTLinkType)HTAtom_caseFor(linktype); HTAnchor *sdest = HTLink_destination(link); if (link && sdest == dest && type == HTLink_type(link)) return link; else if (sublinks) { while ((link = (HTLink *) HTList_nextObject (sublinks))) { sdest = HTLink_destination(link); if (sdest == dest && HTLink_type(link) == type) return link; } } } return NULL;}PRIVATE voidupdate_incoming_links(HTParentAnchor *anchor, HTParentAnchor *nanchor){ if(anchor && nanchor) { HTAnchor *anc; HTList *sources = anchor->sources; while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) { HTParentAnchor *panchor = HTAnchor_parent(anc); if((HTLink_find((HTAnchor *)panchor,(HTAnchor *)anchor)) && (!HTLink_find_type((HTAnchor *)panchor, (HTAnchor *)nanchor,"redirection"))) { HTLink_add((HTAnchor *)panchor,(HTAnchor *)nanchor, (HTLinkType) HTAtom_caseFor("redirection"), METHOD_HEAD); } } }} PRIVATE voidupdate_hyperdoc(HyperDoc *hd,HTRequest *request){ if(hd && request) { HTParentAnchor *anchor = hd->anchor; HTParentAnchor *nanchor = HTRequest_anchor(request); HTParentAnchor *parent = HTRequest_parent(request); HyperDoc *nhd = HTAnchor_document(nanchor); char *tit = (char *) HTAnchor_title(nanchor); if(nhd && tit) StrAllocCopy(nhd->title,tit); if (anchor != nanchor) { if(nhd) { /* The redirected anchor has a Hyperdoc */ if(nhd != hd) { hd->code = REDIR_CODE; HTAnchor_setDocument(anchor,(void *)nhd); if(!HTLink_find_type((HTAnchor *)parent, (HTAnchor *)nanchor,"redirection")) { HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor, (HTLinkType) HTAtom_caseFor("redirection"), METHOD_HEAD); } } } else { /* The redirected anchor does not have a Hyperdoc */ hd->anchor = nanchor; HTAnchor_setDocument(nanchor,(void *) hd); if(!HTLink_find_type((HTAnchor *)parent,(HTAnchor *)nanchor, "redirection")) { HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor, (HTLinkType) HTAtom_caseFor("redirection") , METHOD_HEAD); } } update_incoming_links(anchor,nanchor); } }}PRIVATE voidset_error_state_hyperdoc(HyperDoc * hd, HTRequest *request){ HTList * cur = HTRequest_error(request); HTError *pres; Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; while((pres = (HTError *) HTList_nextObject(cur)) != NULL) { int code =HTErrors[HTError_index(pres)].code; hd->code = code; if((mr->flags & MR_REDIR) && code >= 200 && code < 300 ) update_hyperdoc(hd,request); }}#if 0PRIVATE inttest_for_blank_spaces(char *uri){ char *ptr = uri; for(;*ptr!='\0';ptr++) if(*ptr == ' ') return 1; return 0;}#endif/* Create a Command Line Object** ----------------------------*/PUBLIC Robot * Robot_new (void){ Robot * me; if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL) HT_OUTOFMEM("Robot_new"); me->hyperdoc = HTList_new(); me->htext = HTList_new(); me->timer = DEFAULT_TIMEOUT*MILLIES; me->waits = 0; me->cwd = HTGetCurrentDirectoryURL(); me->output = OUTPUT; me->cnt = 0; me->ndoc = -1; me->fingers = HTList_new(); /* This is new */ me->queue = HTQueue_new(); me->cq = 0; me->furl = NULL; return me;}/* Delete a Command Line Object** ----------------------------*/PRIVATE BOOL Robot_delete (Robot * mr){ if (mr) { HTList_delete(mr->fingers); /* Calculate statistics */ calculate_statistics(mr); if (mr->hyperdoc) { HTList * cur = mr->hyperdoc; HyperDoc * pres; while ((pres = (HyperDoc *) HTList_nextObject(cur))) HyperDoc_delete(pres); HTList_delete(mr->hyperdoc); } if (mr->htext) { HTList * cur = mr->htext; HText * pres; while ((pres = (HText *) HTList_nextObject(cur))) RHText_delete(pres); HTList_delete(mr->htext); } /* Close all the log files */ if (mr->flags & MR_LOGGING) { if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n"); } if (mr->log) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in general log file `%s\'\n", HTLog_accessCount(mr->log), mr->logfile); HTLog_close(mr->log); } if (mr->ref) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in referer log file `%s\'\n", HTLog_accessCount(mr->ref), mr->reffile); HTLog_close(mr->ref); } if (mr->reject) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in rejected log file `%s\'\n", HTLog_accessCount(mr->reject), mr->rejectfile); HTLog_close(mr->reject); } if (mr->notfound) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in not found log file `%s\'\n", HTLog_accessCount(mr->notfound), mr->notfoundfile); HTLog_close(mr->notfound); } if (mr->conneg) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n", HTLog_accessCount(mr->conneg), mr->connegfile); HTLog_close(mr->conneg); } if (mr->noalttag) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n", HTLog_accessCount(mr->noalttag), mr->noalttagfile); HTLog_close(mr->noalttag); } if (mr->output && mr->output != STDOUT) fclose(mr->output); if (mr->flags & MR_TIME) { time_t local = time(NULL); if (SHOW_REAL_QUIET(mr)) HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES)); } /* This is new */ HT_FREE(mr->cdepth); HT_FREE(mr->furl);#ifdef HT_POSIX_REGEX if (mr->include) { regfree(mr->include); HT_FREE(mr->include); } if (mr->exclude) { regfree(mr->exclude); HT_FREE(mr->exclude); } if (mr->exc_robot) { regfree(mr->exc_robot); HT_FREE(mr->exc_robot); } if (mr->check) { regfree(mr->check); HT_FREE(mr->check); }#endif#ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_close(mr->sqllog); mr->sqllog = NULL; }#endif if (mr->queue) HTQueue_delete(mr->queue); HT_FREE(mr->cwd); HT_FREE(mr->prefix); HT_FREE(mr->img_prefix); HT_FREE(mr); return YES; } return NO;}/*** This function creates a new finger object and initializes it with a new request*/PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method){ Finger * me; HTRequest * request = HTRequest_new(); if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL) HT_OUTOFMEM("Finger_new"); me->robot = robot; me->request = request; me->dest = dest; HTList_addObject(robot->fingers, (void *)me); /* Set the context for this request */ HTRequest_setContext (request, me); /* Check the various flags to customize the request */ if (robot->flags & MR_PREEMPTIVE) HTRequest_setPreemptive(request, YES); if (robot->flags & MR_VALIDATE) HTRequest_setReloadMode(request, HT_CACHE_VALIDATE); if (robot->flags & MR_END_VALIDATE) HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE); /* We wanna make sure that we are sending a Host header (default) */ HTRequest_addRqHd(request, HT_C_HOST); /* Set the method for this request */ HTRequest_setMethod(request, method); robot->cnt++; return me;}PRIVATE int Finger_delete (Finger * me){ HTList_removeObject(me->robot->fingers, (void *)me); /* Done with one more */ me->robot->cnt--; /* See if we don't need to keep all the metadata around in the anchors */ if (!(me->robot->flags & MR_KEEP_META)) HTAnchor_clearHeader(HTRequest_anchor(me->request)); /* ** If we are down at one request then flush the output buffer */ if (me->request) { if (me->robot->cnt == 1) HTRequest_forceFlush(me->request); HTRequest_delete(me->request); me->request = NULL; } /* ** Delete the request and free myself */ HT_FREE(me); return YES;}PRIVATE BOOL check_constraints(Robot * mr, char *prefix, char *uri){ BOOL match = YES; /* Check for prefix match */ if (prefix) { match = HTStrMatch(prefix, uri) ? YES : NO; } #ifdef HT_POSIX_REGEX /* Check for any regular expression */ if (match && mr->include) { match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES; } if (match && mr->exc_robot) { match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO; } if (match && mr->exclude) { match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO; } #endif return match;}/*** Cleanup and make sure we close all connections including the persistent** ones*/PUBLIC void Cleanup (Robot * me, int status){ /* ** First we clean up the robot itself and calculate the various ** statistics. This can actually take some time as a lot of data ** has to be manipulated */ Robot_delete(me); /* ** Then we shut down libwww */ HTProfile_delete();#ifdef HT_MEMLOG HTMemLog_close();#endif#ifdef VMS exit(status ? status : 1);#else exit(status ? status : 0);#endif}#ifdef HT_POSIX_REGEXPRIVATE char * get_regerror (int errcode, regex_t * compiled){ size_t length = regerror (errcode, compiled, NULL, 0); char * str = NULL; if ((str = (char *) HT_MALLOC(length+1)) == NULL) HT_OUTOFMEM("get_regerror"); (void) regerror (errcode, compiled, str, length); return str;}PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags){ regex_t * regex = NULL; if (regex_str && *regex_str) { int status; if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL) HT_OUTOFMEM("get_regtype"); if ((status = regcomp(regex, regex_str, cflags))) { char * err_msg = get_regerror(status, regex); if (SHOW_REAL_QUIET(mr)) HTPrint("Regular expression error: %s\n", err_msg); HT_FREE(err_msg); Cleanup(mr, -1); } } return regex;}#endifPUBLIC void VersionInfo (void){ HTPrint("\nW3C OpenSource Software"); HTPrint("\n-----------------------\n\n"); HTPrint("\tWebbot version %s\n", APP_VERSION); HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version()); HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE); HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n"); HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n"); HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n"); HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");}/* redirection_handler** -------------------** If we are set up to handle redirections then handle it here.*/PUBLIC int redirection_handler (HTRequest * request, HTResponse * response, void * param, int status) { Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; HTParentAnchor * me = HTRequest_anchor(request); HTAnchor * redirection = HTResponse_redirection(response); HTParentAnchor * redirection_parent = HTAnchor_parent(redirection); HyperDoc * redirection_hd = HTAnchor_document(redirection_parent); char * uri = NULL; char * redirection_parent_addr = NULL; BOOL match = YES; BOOL check = NO; /* In case we didn't get any redirection destination */ if (!redirection) return HT_OK; /* Get the addresses */ uri = HTAnchor_address((HTAnchor *) me); redirection_parent_addr = HTAnchor_address((HTAnchor *) redirection_parent); if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking redirecting from `%s\' to `%s\'\n", uri, redirection_parent_addr); /* Log the event */#ifdef HT_MYSQL if (mr->sqllog && redirection_parent_addr) HTSQLLog_addLinkRelationship(mr->sqllog, redirection_parent_addr, uri, "redirection", NULL);#endif /* Check our constraints matcher */ match = check_constraints(mr,mr->prefix, redirection_parent_addr);#ifdef HT_POSIX_REGEX /* See if we should do a HEAD or a GET on this URI */ if (match && mr->check) { check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES; }#endif /* ** If we already have a HyperDoc for the redirected anchor ** then update it */ if (match) { if ((redirection_hd = HTAnchor_document(redirection_parent)) != NULL) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -