⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htrobot.c

📁 www工具包. 这是W3C官方支持的www支撑库. 其中提供通用目的的客户端的WebAPI: complete HTTP/1.1 (with caching, pipelining, PUT, POS
💻 C
📖 第 1 页 / 共 3 页
字号:
            /* Find mediatype distribution */	    if (mr->mtfile) {		HTList * mtdist = mediatype_distribution(array);		if (mtdist) {		    if (SHOW_REAL_QUIET(mr))			HTPrint("\tLogged media type distribution in file `%s\'\n",				mr->mtfile);		    log_meta_distribution(mr->mtfile, mtdist);		    delete_meta_distribution(mtdist);		}	    }            /* Find charset distribution */	    if (mr->charsetfile) {		HTList * charsetdist = charset_distribution(array);		if (charsetdist) {		    if (SHOW_REAL_QUIET(mr))			HTPrint("\tLogged charset distribution in file `%s\'\n",				mr->charsetfile);		    log_meta_distribution(mr->charsetfile, charsetdist);		    delete_meta_distribution(charsetdist);		}	    }            /* Add as may other stats here as you like */	    /* ... */	    	    /* Delete the array */            HTArray_delete(array);        }    }    return YES;}PRIVATE HTParentAnchor *get_last_parent(HTParentAnchor *anchor){  HTAnchor *anc;  HTList *sources = anchor->sources;  while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)    {      HTParentAnchor *panchor = HTAnchor_parent(anc);      return panchor;    }  return NULL;}PRIVATE HTLink *HTLink_find_type(HTAnchor * src, HTAnchor * dest, char *linktype){    if(src && dest && linktype)    {	HTLink * link = HTAnchor_mainLink(src);	HTList * sublinks = HTAnchor_subLinks(src);	HTLinkType type = (HTLinkType)HTAtom_caseFor(linktype);	HTAnchor *sdest = HTLink_destination(link);	if (link && sdest == dest && type == HTLink_type(link))	    return link;	else if (sublinks) {	    while ((link = (HTLink *) HTList_nextObject (sublinks))) {		sdest = HTLink_destination(link);		if (sdest == dest && HTLink_type(link) == type) 		    return link;	    }	}    }    return NULL;}PRIVATE voidupdate_incoming_links(HTParentAnchor *anchor, HTParentAnchor *nanchor){    if(anchor && nanchor) {	HTAnchor *anc;	HTList *sources = anchor->sources;	while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) {	    HTParentAnchor *panchor = HTAnchor_parent(anc);	    if((HTLink_find((HTAnchor *)panchor,(HTAnchor *)anchor)) &&	       (!HTLink_find_type((HTAnchor *)panchor,				  (HTAnchor *)nanchor,"redirection"))) {		HTLink_add((HTAnchor *)panchor,(HTAnchor *)nanchor, 			   (HTLinkType) HTAtom_caseFor("redirection"), 				    METHOD_HEAD);	    }	}    }}	PRIVATE voidupdate_hyperdoc(HyperDoc *hd,HTRequest *request){    if(hd && request) {	HTParentAnchor *anchor = hd->anchor;	HTParentAnchor *nanchor = HTRequest_anchor(request);	HTParentAnchor *parent = HTRequest_parent(request);	HyperDoc *nhd = HTAnchor_document(nanchor);	char *tit = (char *) HTAnchor_title(nanchor);	if(nhd && tit)	    StrAllocCopy(nhd->title,tit);	if (anchor != nanchor) {	    if(nhd) { 	    /* The redirected anchor has a Hyperdoc */		if(nhd != hd) {		    hd->code = REDIR_CODE;		    HTAnchor_setDocument(anchor,(void *)nhd);		    if(!HTLink_find_type((HTAnchor *)parent,					 (HTAnchor *)nanchor,"redirection")) {			HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor, 				   (HTLinkType) HTAtom_caseFor("redirection"), 				   METHOD_HEAD);		    }		}	    } else { /* The redirected anchor does not have a Hyperdoc */		hd->anchor = nanchor;		HTAnchor_setDocument(nanchor,(void *) hd);		if(!HTLink_find_type((HTAnchor *)parent,(HTAnchor *)nanchor,				     "redirection")) {		    HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor, 			      (HTLinkType) HTAtom_caseFor("redirection") , 			       METHOD_HEAD);		}	    }	    update_incoming_links(anchor,nanchor);	}    }}PRIVATE voidset_error_state_hyperdoc(HyperDoc * hd, HTRequest *request){    HTList * cur = HTRequest_error(request);    HTError *pres;    Finger * finger = (Finger *) HTRequest_context(request);    Robot * mr = finger->robot;    while((pres = (HTError *) HTList_nextObject(cur)) != NULL) {	int code =HTErrors[HTError_index(pres)].code;	hd->code = code;	if((mr->flags & MR_REDIR) && code >= 200 && code < 300 )	    update_hyperdoc(hd,request);    }}#if 0PRIVATE inttest_for_blank_spaces(char *uri){  char *ptr = uri;  for(;*ptr!='\0';ptr++)    if(*ptr == ' ')      return 1;  return 0;}#endif/*	Create a Command Line Object**	----------------------------*/PUBLIC Robot * Robot_new (void){    Robot * me;    if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)	HT_OUTOFMEM("Robot_new");    me->hyperdoc = HTList_new();    me->htext = HTList_new();    me->timer = DEFAULT_TIMEOUT*MILLIES;    me->waits = 0;    me->cwd = HTGetCurrentDirectoryURL();    me->output = OUTPUT;    me->cnt = 0;    me->ndoc = -1;    me->fingers = HTList_new();    /* This is new */    me->queue = HTQueue_new();    me->cq = 0;    me->furl = NULL;    return me;}/*	Delete a Command Line Object**	----------------------------*/PRIVATE BOOL Robot_delete (Robot * mr){    if (mr) {	HTList_delete(mr->fingers);       	/* Calculate statistics */	calculate_statistics(mr);        if (mr->hyperdoc) {	    HTList * cur = mr->hyperdoc;	    HyperDoc * pres;	    while ((pres = (HyperDoc *) HTList_nextObject(cur)))		HyperDoc_delete(pres);	    HTList_delete(mr->hyperdoc);	}	if (mr->htext) {	    HTList * cur = mr->htext;	    HText * pres;	    while ((pres = (HText *) HTList_nextObject(cur)))		RHText_delete(pres);	    HTList_delete(mr->htext);	}	/* Close all the log files */	if (mr->flags & MR_LOGGING) {	    if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n");	}	if (mr->log) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in general log file `%s\'\n",			HTLog_accessCount(mr->log), mr->logfile);	    HTLog_close(mr->log);	}	if (mr->ref) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in referer log file `%s\'\n",			HTLog_accessCount(mr->ref), mr->reffile);	    HTLog_close(mr->ref);	}	if (mr->reject) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in rejected log file `%s\'\n",			HTLog_accessCount(mr->reject), mr->rejectfile);	    HTLog_close(mr->reject);	}	if (mr->notfound) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in not found log file `%s\'\n",			HTLog_accessCount(mr->notfound), mr->notfoundfile);	    HTLog_close(mr->notfound);	}	if (mr->conneg) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n",			HTLog_accessCount(mr->conneg), mr->connegfile);	    HTLog_close(mr->conneg);	}	if (mr->noalttag) {	    if (SHOW_REAL_QUIET(mr))		HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n",			HTLog_accessCount(mr->noalttag), mr->noalttagfile);	    HTLog_close(mr->noalttag);	}	if (mr->output && mr->output != STDOUT) fclose(mr->output);	if (mr->flags & MR_TIME) {	    time_t local = time(NULL);	    if (SHOW_REAL_QUIET(mr))		HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));	}	/* This is new */	HT_FREE(mr->cdepth);	HT_FREE(mr->furl);#ifdef HT_POSIX_REGEX	if (mr->include) {	    regfree(mr->include);	    HT_FREE(mr->include);	}	if (mr->exclude) {	    regfree(mr->exclude);	    HT_FREE(mr->exclude);	}	if (mr->exc_robot) {	    regfree(mr->exc_robot);	    HT_FREE(mr->exc_robot);	}	if (mr->check) {	    regfree(mr->check);	    HT_FREE(mr->check);	}#endif#ifdef HT_MYSQL	if (mr->sqllog) {	    HTSQLLog_close(mr->sqllog);	    mr->sqllog = NULL;	}#endif	if (mr->queue) HTQueue_delete(mr->queue);	HT_FREE(mr->cwd);	HT_FREE(mr->prefix);	HT_FREE(mr->img_prefix);	HT_FREE(mr);	return YES;    }    return NO;}/***  This function creates a new finger object and initializes it with a new request*/PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method){    Finger * me;    HTRequest * request = HTRequest_new();    if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)	HT_OUTOFMEM("Finger_new");    me->robot = robot;    me->request = request;    me->dest = dest;    HTList_addObject(robot->fingers, (void *)me);    /* Set the context for this request */    HTRequest_setContext (request, me);    /* Check the various flags to customize the request */    if (robot->flags & MR_PREEMPTIVE)	HTRequest_setPreemptive(request, YES);    if (robot->flags & MR_VALIDATE)	HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);    if (robot->flags & MR_END_VALIDATE)	HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);    /* We wanna make sure that we are sending a Host header (default) */    HTRequest_addRqHd(request, HT_C_HOST);    /* Set the method for this request */    HTRequest_setMethod(request, method);    robot->cnt++;    return me;}PRIVATE int Finger_delete (Finger * me){    HTList_removeObject(me->robot->fingers, (void *)me);    /* Done with one more */    me->robot->cnt--;    /* See if we don't need to keep all the metadata around in the anchors */    if (!(me->robot->flags & MR_KEEP_META))	HTAnchor_clearHeader(HTRequest_anchor(me->request));    /*    **  If we are down at one request then flush the output buffer    */    if (me->request) {	if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);	HTRequest_delete(me->request);	me->request = NULL;    }    /*    **  Delete the request and free myself    */    HT_FREE(me);    return YES;}PRIVATE BOOL check_constraints(Robot * mr, char *prefix, char *uri){    BOOL match = YES;    /* Check for prefix match */    if (prefix) {	match = HTStrMatch(prefix, uri) ? YES : NO;    }  #ifdef HT_POSIX_REGEX    /* Check for any regular expression */    if (match && mr->include) {	match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;    }    if (match && mr->exc_robot) {	match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;    }    if (match && mr->exclude) {	match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;    }  #endif    return match;}/***  Cleanup and make sure we close all connections including the persistent**  ones*/PUBLIC void Cleanup (Robot * me, int status){    /*    **  First we clean up the robot itself and calculate the various    **  statistics. This can actually take some time as a lot of data    **  has to be manipulated    */    Robot_delete(me);    /*    **  Then we shut down libwww    */    HTProfile_delete();#ifdef HT_MEMLOG    HTMemLog_close();#endif#ifdef VMS    exit(status ? status : 1);#else    exit(status ? status : 0);#endif}#ifdef HT_POSIX_REGEXPRIVATE char * get_regerror (int errcode, regex_t * compiled){    size_t length = regerror (errcode, compiled, NULL, 0);    char * str = NULL;    if ((str = (char *) HT_MALLOC(length+1)) == NULL)	HT_OUTOFMEM("get_regerror");    (void) regerror (errcode, compiled, str, length);    return str;}PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags){    regex_t * regex = NULL;    if (regex_str && *regex_str) {	int status;	if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)	    HT_OUTOFMEM("get_regtype");	if ((status = regcomp(regex, regex_str, cflags))) {	    char * err_msg = get_regerror(status, regex);	    if (SHOW_REAL_QUIET(mr))		HTPrint("Regular expression error: %s\n", err_msg);	    HT_FREE(err_msg);	    Cleanup(mr, -1);	}    }    return regex;}#endifPUBLIC void VersionInfo (void){    HTPrint("\nW3C OpenSource Software");    HTPrint("\n-----------------------\n\n");    HTPrint("\tWebbot version %s\n", APP_VERSION);    HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version());    HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE);    HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");    HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");    HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");    HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");}/*	redirection_handler**	-------------------**	If we are set up to handle redirections then handle it here.*/PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,				void * param, int status) {    Finger * finger = (Finger *) HTRequest_context(request);    Robot * mr = finger->robot;    HTParentAnchor * me = HTRequest_anchor(request);    HTAnchor * redirection = HTResponse_redirection(response);    HTParentAnchor * redirection_parent = HTAnchor_parent(redirection);    HyperDoc * redirection_hd = HTAnchor_document(redirection_parent);    char * uri = NULL;    char * redirection_parent_addr = NULL;    BOOL match = YES;    BOOL check = NO;    /* In case we didn't get any redirection destination */    if (!redirection) return HT_OK;    /* Get the addresses */    uri = HTAnchor_address((HTAnchor *) me);    redirection_parent_addr = HTAnchor_address((HTAnchor *) redirection_parent);    if (SHOW_QUIET(mr))	HTPrint("Robot....... Checking redirecting from `%s\' to `%s\'\n",		uri, redirection_parent_addr);    /* Log the event */#ifdef HT_MYSQL    if (mr->sqllog && redirection_parent_addr)	HTSQLLog_addLinkRelationship(mr->sqllog, redirection_parent_addr,				     uri, "redirection", NULL);#endif    /* Check our constraints matcher */    match = check_constraints(mr,mr->prefix, redirection_parent_addr);#ifdef HT_POSIX_REGEX    /* See if we should do a HEAD or a GET on this URI */    if (match && mr->check) {	check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;    }#endif    /*    ** If we already have a HyperDoc for the redirected anchor    ** then update it    */    if (match) {	if ((redirection_hd = HTAnchor_document(redirection_parent)) != NULL) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -