⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htrobot.c

📁 www工具包
💻 C
📖 第 1 页 / 共 3 页
字号:
		    HTChunk_puts(buffer, value);		}	    }	    if (!first) HTChunk_puts(buffer, ")");	    HTLog_addLine(mr->conneg, HTChunk_data(buffer));	    HTChunk_delete(buffer);	    HT_FREE(uri);	}    }    /* Count the amount of body data that we have read */    if (HTRequest_method(request) == METHOD_GET) {	int length = HTAnchor_length(HTRequest_anchor(request));	if (length > 0) mr->get_bytes += length;	mr->get_docs++;    } else if (HTRequest_method(request) == METHOD_HEAD) {	int length = HTAnchor_length(HTRequest_anchor(request));	if (length > 0) mr->head_bytes += length;	mr->head_docs++;    } else {	mr->other_docs++;    }    if (!(mr->flags & MR_BFS)) {	/* Delete this thread */	Finger_delete(finger);	/* Should we stop? */	if (mr->cnt <= 0) {	    if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");	    Cleanup(mr, 0);			/* No way back from here */	}    }    if (SHOW_QUIET(mr)) HTPrint("             %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");    return HT_OK;}PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,			       void * param, int status) {    Finger * finger = (Finger *) HTRequest_context(request);    Robot * mr = finger->robot;    HTParentAnchor * dest = finger->dest;    HyperDoc * hd = HTAnchor_document(dest);    int depth = (hd ? hd->depth : -1);    if (hd) set_error_state_hyperdoc(hd,request);          if(hd && (HTRequest_method(request)== METHOD_HEAD) &&        (depth < mr->depth))      {	hd->method = METHOD_GET;	HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;      }    Finger_delete(finger);    if(!(mr->flags & MR_PREEMPTIVE))      Serving_queue(mr);    return HT_OK;}PUBLIC void Serving_queue(Robot *mr){  BOOL abort = NO;  Finger *nfinger;    while(!abort)    {      if(!HTQueue_isEmpty(mr->queue))	{	  HTRequest *newreq;	  	  HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);	  	  if(nhd)	    {	      char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);	      HTQueue_dequeue(mr->queue); (mr->cq)--;	      nfinger = Finger_new(mr, nhd->anchor, nhd->method); 	      	      newreq = nfinger->request;	      if(SHOW_QUIET(mr))  HTPrint("Request from QUEUE  %s\n",uri);	      HT_FREE(uri);	      if(SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq);	      HTRequest_setParent(newreq,get_last_parent(nhd->anchor));	      /* @@@ Should be done using a timer and not sleep! @@@ */#if 0	      if(mr->waits)		  sleep(mr->waits);#endif	      	      if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES) 		{		  if (SHOW_QUIET(mr)) HTPrint("not tested!\n");		  Finger_delete(nfinger);		}	    }	  else	    abort = YES;	}      else	abort = YES;    }  if(SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq);    if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))      {	if(mr->cnt > 0)	  if(SHOW_QUIET(mr)) HTPrint("%d requests were not served\n", mr->cnt);	if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");	Cleanup(mr, 0);			/* No way back from here */      }}/* ------------------------------------------------------------------------- *//*				HTEXT INTERFACE				     *//* ------------------------------------------------------------------------- */PUBLIC BOOL Robot_registerHTMLParser (void){    HText_registerCDCallback(RHText_new, RHText_delete);    HText_registerLinkCallback(RHText_foundLink);    return YES;}PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,			    HTStream * stream){    HText * me;    Finger * finger = (Finger *) HTRequest_context(request);    Robot * mr = finger->robot;    char * robots = NULL;    if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)	HT_OUTOFMEM("HText_new2");    /* Bind the HText object together with the Request Object */    me->request = request;    me->follow = YES;    /* Check to see if we have any meta tags */    if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {	char * strval = NULL;	char * ptr = NULL;	char * token = NULL;	StrAllocCopy(strval, robots);	ptr = strval;	while ((token = HTNextField(&ptr)) != NULL) {	    if (!strcasecomp(token, "nofollow")) {		me->follow = NO;		break;	    }	}	HT_FREE(strval);    }    /* Add this HyperDoc object to our list */    if (!mr->htext) mr->htext = HTList_new();    HTList_addObject(mr->htext, (void *) me);    return me;}PRIVATE BOOL RHText_delete (HText * me) {    if (me) {	HT_FREE(me);	return YES;    }    return NO;}PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor){    if (text && anchor) {	Finger * finger = (Finger *) HTRequest_context(text->request);	Robot * mr = finger->robot;	HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);	HTParentAnchor * dest_parent = HTAnchor_parent(dest);	char * uri = HTAnchor_address((HTAnchor *) dest_parent);	HyperDoc * hd = HTAnchor_document(dest_parent);	HTParentAnchor * referer = HTRequest_anchor(text->request);	BOOL match = text->follow;	BOOL check = NO;	/* These are new variables */	HyperDoc * nhd = NULL;	BOOL follow = YES;	/* These three variables were moved */	/*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/	HTParentAnchor * last_anchor = HTRequest_anchor(text->request);	HyperDoc * last_doc = HTAnchor_document(last_anchor);	int depth = last_doc ? last_doc->depth+1 : 0;	if (!uri) return;	if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");        if (hd) {	    if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");            hd->hits++;#ifdef HT_MYSQL	    if (mr->sqllog) {		char * ref_addr = HTAnchor_address((HTAnchor *) referer);		if (ref_addr) {		    HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,						 "referer", NULL);		    HT_FREE(ref_addr);		}	    }#endif	    HT_FREE(uri);	    return;	}	/* Check for prefix match */	if (match && mr->prefix) {	    match = HTStrMatch(mr->prefix, uri) ? YES : NO;	}#ifdef HT_POSIX_REGEX	/*	**  Check for any regular expression. The include may override	**  the prefix matching	*/	if (mr->include) {	    match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;	}	if (match && mr->exc_robot) {	    match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;	}	if (match && mr->exclude) {	    match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;	}	if (match && mr->check) {	    check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;	}#endif	if(uri && test_for_blank_spaces(uri))	  follow = NO;	else if (mr->ndoc == 0) /* Number of Documents is reached */	  follow = NO;	/* Test whether we already have a hyperdoc for this document */	if(!hd && dest_parent)	  {	    nhd = HyperDoc_new(mr, dest_parent, depth);	    mr->cdepth[depth]++;	  }	/* Test whether we already have a hyperdoc for this document */        if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {	    if (mr->flags & MR_BFS) {		nhd->method = METHOD_HEAD;		HTQueue_enqueue(mr->queue, (void *) nhd);		(mr->cq)++;		if(mr->ndoc > 0) mr->ndoc--;	    } else {		Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);		HTRequest * newreq = newfinger->request;		HTRequest_setParent(newreq, referer);		if (check || depth >= mr->depth) {		    if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth);		    HTRequest_setMethod(newreq, METHOD_HEAD);		} else {		    if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth);		}		if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {		    if (SHOW_QUIET(mr)) HTPrint("not tested!\n");		    Finger_delete(newfinger);		}	    }	} else {	    if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");#ifdef HT_MYSQL	    if (mr->reject || mr->sqllog) {#else		    if (mr->reject) {#endif		if (referer) {		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);		    if (mr->reject && ref_addr)			HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);#ifdef HT_MYSQL		    if (mr->sqllog && mr->sqlexternals && ref_addr)			HTSQLLog_addLinkRelationship(mr->sqllog,						     ref_addr, uri,						     "referer", NULL);#endif		    HT_FREE(ref_addr);		}	    }	}	HT_FREE(uri);    }}PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,				const char *alt, const char * align, BOOL isMap){    if (text && anchor) {	Finger * finger = (Finger *) HTRequest_context(text->request);	Robot * mr = finger->robot;	if (mr->flags & MR_IMG) {	    HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);	    HTParentAnchor * dest_parent = HTAnchor_parent(dest);	    char * uri = HTAnchor_address((HTAnchor *) dest_parent);	    HyperDoc * hd = HTAnchor_document(dest_parent);	    HTParentAnchor * referer = HTRequest_anchor(text->request);	    BOOL match = YES;	    if (!uri) return;	    if (hd) {		if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");		hd->hits++;#ifdef HT_MYSQL		if (mr->sqllog) {		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);		    if (ref_addr) {			HTSQLLog_addLinkRelationship(mr->sqllog,						     ref_addr, uri,						     "image", alt);			HT_FREE(ref_addr);		    }		}#endif		HT_FREE(uri);		return;	    }	    /* Check for prefix match */	    if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;#ifdef HT_POSIX_REGEX	/*	**  Check for any regular expression. The include may override	**  the prefix matching	*/	if (mr->include) {	    match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;	}	if (match && mr->exc_robot) {	    match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;	}	if (match && mr->exclude) {	    match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;	}#endif	    /* Test whether we already have a hyperdoc for this document */	    if (match && dest) {		Finger * newfinger = Finger_new(mr, dest_parent,						mr->flags & MR_SAVE ?						METHOD_GET : METHOD_HEAD);		HTRequest * newreq = newfinger->request;		HyperDoc_new(mr, dest_parent, 1);		HTRequest_setParent(newreq, referer);		/* Check whether we should report missing ALT tags */		if (mr->noalttag && (alt==NULL || *alt=='\0')) {		    if (referer) {			char * ref_addr = HTAnchor_address((HTAnchor *) referer);			if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);			HT_FREE(ref_addr);		    }		}				if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri);		if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {		    if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n");		    Finger_delete(newfinger);		}	    } else {		if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");#ifdef HT_MYSQL		if (mr->reject || mr->sqllog) {#else			if (mr->reject) {#endif		    if (referer) {			char * ref_addr = HTAnchor_address((HTAnchor *) referer);			if (mr->reject && ref_addr)			    HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);#ifdef HT_MYSQL			if (mr->sqllog && mr->sqlexternals && ref_addr)			    HTSQLLog_addLinkRelationship(mr->sqllog,							 ref_addr, uri,							 "image", alt);#endif			HT_FREE(ref_addr);		    }		}	    }	    HT_FREE(uri);	}    }}PRIVATE void RHText_foundLink (HText * text,			       int element_number, int attribute_number,			       HTChildAnchor * anchor,			       const BOOL * present, const char ** value){    if (text && anchor) {	Finger * finger = (Finger *) HTRequest_context(text->request);	Robot * mr = finger->robot;	if (SHOW_QUIET(mr))	    HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n",		    element_number, attribute_number, anchor);	if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) || 	    (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND))	    RHText_foundImage(text, anchor, NULL, NULL, NO);	else	    RHText_foundAnchor(text, anchor);    }}PUBLIC char * get_robots_txt(char * uri){  char *str = NULL;  HTChunk * chunk;  HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));  HTRequest *request = HTRequest_new();  HTRequest_setOutputFormat(request, WWW_SOURCE);  HTRequest_setPreemptive(request, YES);  HTRequest_setMethod(request, METHOD_GET);  chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);  str = HTChunk_toCString(chunk);  HTRequest_delete(request);  return str;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -