📄 htrobot.c
字号:
/* Calculate efficiency */ if (mr->time > 0) { ms_t t = HTGetTimeInMillis() - mr->time; if (t > 0) { double loadfactor = (mr->get_bytes / (t * 0.001)); double reqprsec = (total_docs / (t * 0.001)); double secs = t / 1000.0; char bytes[50]; if (SHOW_REAL_QUIET(mr)) HTPrint("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n", total_docs, secs, reqprsec); HTNumToStr(mr->get_bytes, bytes, 50); if (SHOW_REAL_QUIET(mr)) HTPrint("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n", mr->get_docs, bytes, loadfactor); HTNumToStr(mr->head_bytes, bytes, 50); if (SHOW_REAL_QUIET(mr)) HTPrint("\tDid a HEAD on %ld document(s) with a total of %s bytes\n", mr->head_docs, bytes); } } /* Create an array of existing anchors */ if (total_docs > 1) { HTArray * array = HTAnchor_getArray(total_docs); if (array) { /* Distributions */ if (mr->flags & MR_DISTRIBUTIONS) { if (SHOW_REAL_QUIET(mr)) HTPrint("\nDistributions:\n"); } /* Sort after hit counts */ if (mr->hitfile) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged hit count distribution in file `%s\'\n", mr->hitfile); calculate_hits(mr, array); } /* Sort after link relations */#ifdef HT_MYSQL if (mr->relfile || mr->sqllog) {#else if (mr->relfile) {#endif if (mr->relfile && SHOW_REAL_QUIET(mr)) HTPrint("\tLogged link relationship distribution in file `%s\'\n", mr->relfile); calculate_linkRelations(mr, array); } /* Sort after modified date */ if (mr->lmfile) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged last modified distribution in file `%s\'\n", mr->lmfile); calculate_lm(mr, array); } /* Sort after title */ if (mr->titlefile) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged title distribution in file `%s\'\n", mr->titlefile); calculate_title(mr, array); } /* Find mediatype distribution */ if (mr->mtfile) { HTList * mtdist = mediatype_distribution(array); if (mtdist) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged media type distribution in file `%s\'\n", mr->mtfile); log_meta_distribution(mr->mtfile, mtdist); delete_meta_distribution(mtdist); } } /* Find charset distribution */ if (mr->charsetfile) { HTList * charsetdist = charset_distribution(array); if (charsetdist) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged charset distribution in file `%s\'\n", mr->charsetfile); log_meta_distribution(mr->charsetfile, charsetdist); delete_meta_distribution(charsetdist); } } /* Add as may other stats here as you like */ /* ... */ /* Delete the array */ HTArray_delete(array); } } return YES;}PRIVATE HTParentAnchor *get_last_parent(HTParentAnchor *anchor){ HTAnchor *anc; HTList *sources = anchor->sources; while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) { HTParentAnchor *panchor = HTAnchor_parent(anc); return panchor; } return NULL;}PRIVATE voidset_error_state_hyperdoc(HyperDoc * hd, HTRequest *request){ HTList * cur = HTRequest_error(request); HTError *pres; while((pres = (HTError *) HTList_nextObject(cur)) != NULL) { int code =HTErrors[HTError_index(pres)].code; hd->code = code; }}PRIVATE inttest_for_blank_spaces(char *uri){ char *ptr = uri; for(;*ptr!='\0';ptr++) if(*ptr == ' ') return 1; return 0;}/* Create a Command Line Object** ----------------------------*/PUBLIC Robot * Robot_new (void){ Robot * me; if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL) HT_OUTOFMEM("Robot_new"); me->hyperdoc = HTList_new(); me->htext = HTList_new(); me->timer = DEFAULT_TIMEOUT*MILLIES; me->waits = 0; me->cwd = HTGetCurrentDirectoryURL(); me->output = OUTPUT; me->cnt = 0; me->ndoc = -1; me->fingers = HTList_new(); /* This is new */ me->queue = HTQueue_new(); me->cq = 0; me->furl = NULL; return me;}/* Delete a Command Line Object** ----------------------------*/PRIVATE BOOL Robot_delete (Robot * mr){ if (mr) { HTList_delete(mr->fingers); /* Calculate statistics */ calculate_statistics(mr); if (mr->hyperdoc) { HTList * cur = mr->hyperdoc; HyperDoc * pres; while ((pres = (HyperDoc *) HTList_nextObject(cur))) HyperDoc_delete(pres); HTList_delete(mr->hyperdoc); } if (mr->htext) { HTList * cur = mr->htext; HText * pres; while ((pres = (HText *) HTList_nextObject(cur))) RHText_delete(pres); HTList_delete(mr->htext); } /* Close all the log files */ if (mr->flags & MR_LOGGING) { if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n"); } if (mr->log) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in general log file `%s\'\n", HTLog_accessCount(mr->log), mr->logfile); HTLog_close(mr->log); } if (mr->ref) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in referer log file `%s\'\n", HTLog_accessCount(mr->ref), mr->reffile); HTLog_close(mr->ref); } if (mr->reject) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in rejected log file `%s\'\n", HTLog_accessCount(mr->reject), mr->rejectfile); HTLog_close(mr->reject); } if (mr->notfound) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in not found log file `%s\'\n", HTLog_accessCount(mr->notfound), mr->notfoundfile); HTLog_close(mr->notfound); } if (mr->conneg) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n", HTLog_accessCount(mr->conneg), mr->connegfile); HTLog_close(mr->conneg); } if (mr->noalttag) { if (SHOW_REAL_QUIET(mr)) HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n", HTLog_accessCount(mr->noalttag), mr->noalttagfile); HTLog_close(mr->noalttag); } if (mr->output && mr->output != STDOUT) fclose(mr->output); if (mr->flags & MR_TIME) { time_t local = time(NULL); if (SHOW_REAL_QUIET(mr)) HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES)); } /* This is new */#if 0 if (mr->cdepth) FT_FREE(mr->cdepth);#endif if(mr->furl) HT_FREE(mr->furl);#ifdef HT_POSIX_REGEX if (mr->include) { regfree(mr->include); HT_FREE(mr->include); } if (mr->exclude) { regfree(mr->exclude); HT_FREE(mr->exclude); } if (mr->exc_robot) { regfree(mr->exc_robot); HT_FREE(mr->exc_robot); } if (mr->check) { regfree(mr->check); HT_FREE(mr->check); }#endif#ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_close(mr->sqllog); mr->sqllog = NULL; }#endif if (mr->queue) HTQueue_delete(mr->queue); HT_FREE(mr->cwd); HT_FREE(mr->prefix); HT_FREE(mr->img_prefix); HT_FREE(mr); return YES; } return NO;}/*** This function creates a new finger object and initializes it with a new request*/PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method){ Finger * me; HTRequest * request = HTRequest_new(); if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL) HT_OUTOFMEM("Finger_new"); me->robot = robot; me->request = request; me->dest = dest; HTList_addObject(robot->fingers, (void *)me); /* Set the context for this request */ HTRequest_setContext (request, me); /* Check the various flags to customize the request */ if (robot->flags & MR_PREEMPTIVE) HTRequest_setPreemptive(request, YES); if (robot->flags & MR_VALIDATE) HTRequest_setReloadMode(request, HT_CACHE_VALIDATE); if (robot->flags & MR_END_VALIDATE) HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE); /* We wanna make sure that we are sending a Host header (default) */ HTRequest_addRqHd(request, HT_C_HOST); /* Set the method for this request */ HTRequest_setMethod(request, method); robot->cnt++; return me;}PRIVATE int Finger_delete (Finger * me){ HTList_removeObject(me->robot->fingers, (void *)me); me->robot->cnt--; /* ** If we are down at one request then flush the output buffer */ if (me->request) { if (me->robot->cnt == 1) HTRequest_forceFlush(me->request); HTRequest_delete(me->request); } /* ** Delete the request and free myself */ HT_FREE(me); return YES;}/*** Cleanup and make sure we close all connections including the persistent** ones*/PUBLIC void Cleanup (Robot * me, int status){ HTProfile_delete(); Robot_delete(me);#ifdef HT_MEMLOG HTMemLog_close();#endif#ifdef VMS exit(status ? status : 1);#else exit(status ? status : 0);#endif}#ifdef HT_POSIX_REGEXPRIVATE char * get_regerror (int errcode, regex_t * compiled){ size_t length = regerror (errcode, compiled, NULL, 0); char * str = NULL; if ((str = (char *) HT_MALLOC(length+1)) == NULL) HT_OUTOFMEM("get_regerror"); (void) regerror (errcode, compiled, str, length); return str;}PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags){ regex_t * regex = NULL; if (regex_str && *regex_str) { int status; if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL) HT_OUTOFMEM("get_regtype"); if ((status = regcomp(regex, regex_str, cflags))) { char * err_msg = get_regerror(status, regex); if (SHOW_REAL_QUIET(mr)) HTPrint("Regular expression error: %s\n", err_msg); HT_FREE(err_msg); Cleanup(mr, -1); } } return regex;}#endifPUBLIC void VersionInfo (void){ HTPrint("\nW3C OpenSource Software"); HTPrint("\n-----------------------\n\n"); HTPrint("\tWebbot version %s\n", APP_VERSION); HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version()); HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE); HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n"); HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n"); HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n"); HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");}/* terminate_handler** -----------------** This function is registered to handle the result of the request.** If no more requests are pending then terminate program*/PUBLIC int terminate_handler (HTRequest * request, HTResponse * response, void * param, int status) { Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; if (SHOW_QUIET(mr)) HTPrint("Robot....... done with %s\n", HTAnchor_physical(finger->dest));#ifdef HT_MYSQL if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);#endif /* Check if negotiated resource and whether we should log that*/ if (mr->conneg) { HTAssocList * cur = HTResponse_variant(response); if (cur) { BOOL first = YES; HTChunk * buffer = HTChunk_new(128); char * uri = HTAnchor_address((HTAnchor *) finger->dest); HTAssoc * pres; HTChunk_puts(buffer, uri); while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) { char * value = HTAssoc_value(pres); if (first) { HTChunk_puts(buffer, "\t("); first = NO; } else HTChunk_puts(buffer, ", "); /* Output the name */ HTChunk_puts(buffer, HTAssoc_name(pres)); /* Only output the value if not empty string */ if (value && *value) { HTChunk_puts(buffer, "=");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -