⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htrobot.c

📁 www工具包
💻 C
📖 第 1 页 / 共 3 页
字号:
/*
**	@(#) $Id: HTRobot.c,v 1.83 1999/02/23 17:53:30 frystyk Exp $
**
**	W3C Webbot can be found at "http://www.w3.org/Robot/"
**
**	Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
**	Institute of Technology, Institut National de Recherche en
**	Informatique et en Automatique, Keio University). All Rights
**	Reserved. This program is distributed under the W3C's Software
**	Intellectual Property License. This program is distributed in the hope
**	that it will be useful, but WITHOUT ANY WARRANTY; without even the
**	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**	details.
**
**  Authors:
**	HFN		Henrik Frystyk Nielsen, (frystyk@w3.org)
**	BR		Bob Racko
**	JP		John Punin
**
**  History:
**	Dec 04 95	First version
**	Oct 1998	Split into separate files
*/

#include "HTRobMan.h"
#include "HTQueue.h"
#include "HTAncMan.h"

#define SHOW_QUIET(mr)		((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)	((mr) && !((mr)->flags & MR_REAL_QUIET))

PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};

/*
**  Some sorting algorithms
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;

/*
**  The callbacks that we need from the libwww HTML parser
*/
PRIVATE HText_new	RHText_new;
PRIVATE HText_delete	RHText_delete;
PRIVATE HText_foundLink	RHText_foundLink;

/* ------------------------------------------------------------------------- */

/*	Create a "HyperDoc" object
**	--------------------------
**	A HyperDoc object contains information about whether we have already
**	started checking the anchor and the depth in our search.
**
**	The new object starts with one hit and a code of -1, takes the next
**	running index from the robot (mr->cindex is incremented), is attached
**	to the anchor as its document and is appended to mr->hyperdoc (the
**	list is created on first use).
**
**	mr	robot context; dereferenced unconditionally, must not be NULL
**	anchor	anchor the HyperDoc is bound to
**	depth	search depth recorded in the object
**
**	Returns the new HyperDoc. Allocation failure is handed to
**	HT_OUTOFMEM.
*/
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
{
    HyperDoc * hd;
    if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
        HT_OUTOFMEM("HyperDoc_new");
    hd->depth = depth;
    hd->hits = 1;
    hd->code = -1;
    hd->index = ++mr->cindex;

    /* Bind the HyperDoc object together with the Anchor Object */
    hd->anchor = anchor;
    HTAnchor_setDocument(anchor, (void *) hd);

    /* Add this HyperDoc object to our list */
    if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
    HTList_addObject(mr->hyperdoc, (void *) hd);
    return hd;
}

/*	Delete a "HyperDoc" object
**	--------------------------
**	Frees the object itself. Returns YES if something was freed and NO
**	if hd was NULL.
*/
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
{
    if (hd) {
        HT_FREE (hd);
        return YES;
    }
    return NO;
}

/*
**  Sort the anchor array and log reference count
**
**  Sorts `array' with HitSort and writes one "<hits> <uri>" line to the log
**  named by mr->hitfile for every anchor that has both an address and a
**  HyperDoc attached.
**
**  Returns NO if mr or array is NULL, otherwise YES (also when the log
**  could not be opened, in which case nothing is sorted or written).
*/
PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
{
    HTLog * log;

    if (!mr || !array) return NO;

    log = HTLog_open(mr->hitfile, YES, YES);
    if (log) {
        void ** data = NULL;
        HTParentAnchor * anchor;

        HTArray_sort(array, HitSort);
        for (anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
             anchor;
             anchor = (HTParentAnchor *) HTArray_nextObject(array, data)) {
            char * uri = HTAnchor_address((HTAnchor *) anchor);
            HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
            if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
            HT_FREE(uri);
        }
        HTLog_close(log);
    }
    return YES;
}

/*
**  Comparator for calculate_hits. `a' and `b' point at HTParentAnchor
**  pointers. Anchors with more hits compare as "smaller", anchors that have
**  a HyperDoc compare as "smaller" than anchors without one, and two
**  anchors without a HyperDoc compare equal.
*/
PRIVATE int HitSort (const void * a, const void * b)
{
    HyperDoc * aa = (HyperDoc *) HTAnchor_document(*(HTParentAnchor **) a);
    HyperDoc * bb = (HyperDoc *) HTAnchor_document(*(HTParentAnchor **) b);

    if (aa && bb) return (bb->hits > aa->hits) - (bb->hits < aa->hits);

    /*
    **  Only the presence of a document matters here. Subtracting the two
    **  pointers (unrelated objects, or NULL) would be undefined behaviour.
    */
    return (bb != NULL) - (aa != NULL);
}

/*
**  Record one link relationship "src_uri --> address of dest".
**
**  The relationship is written to the text log when `log' is non-NULL and,
**  when built with HT_MYSQL, to mr->sqllog when that is set. Nothing is
**  recorded unless both the source and the destination address are
**  available. The destination address is always released before returning;
**  src_uri stays owned by the caller.
*/
PRIVATE void log_link_relation (Robot * mr, HTLog * log, HTLinkType linktype,
                                char * src_uri, HTParentAnchor * dest)
{
    char * dest_uri = HTAnchor_address((HTAnchor *) dest);

    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
        if (mr->sqllog) {
            HTSQLLog_addLinkRelationship (mr->sqllog,
                                          src_uri, dest_uri,
                                          HTAtom_name(linktype),
                                          NULL);
        }
#endif
        if (log) {
            HTFormat format = HTAnchor_format(dest);
            HTLog_addText(log, "%s %s %s --> %s\n",
                          HTAtom_name(linktype),
                          format != WWW_UNKNOWN ?
                          HTAtom_name(format) : "<unknown>",
                          src_uri, dest_uri);
        }
    }

    /* Released on every path, including when src_uri was unavailable */
    HT_FREE(dest_uri);
}

/*
**  Log link relations for every anchor in the array
**
**  If mr->relation is set, only the link of that type is looked up for each
**  anchor. Otherwise the main link and all sublinks that carry a link type
**  are reported. The text log is only opened when mr->relfile is set.
**
**  Returns NO if mr or array is NULL, otherwise YES.
*/
PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
{
    HTLog * log;
    void ** data = NULL;
    HTParentAnchor * anchor;

    if (!mr || !array) return NO;

    log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;

    for (anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
         anchor;
         anchor = (HTParentAnchor *) HTArray_nextObject(array, data)) {

        /*
        **  If we have a specific link relation to look for then do this.
        **  Otherwise look for all link relations.
        */
        if (mr->relation) {
            HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
            if (link) {
                HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
                char * src_uri = HTAnchor_address((HTAnchor *) anchor);

                log_link_relation(mr, log, mr->relation, src_uri, dest);

                /* Cleanup */
                HT_FREE(src_uri);
            }
        } else {
            HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
            HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
            char * src_uri = HTAnchor_address((HTAnchor *) anchor);
            HTLinkType linktype;

            /* First look in the main link */
            if (link && (linktype = HTLink_type(link))) {
                HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
                log_link_relation(mr, log, linktype, src_uri, dest);
            }

            /* and then in any sublinks */
            if (sublinks) {
                HTLink * pres;
                while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
                    if ((linktype = HTLink_type(pres))) {
                        HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
                        log_link_relation(mr, log, linktype, src_uri, dest);
                    }
                }
            }

            /* Cleanup */
            HT_FREE(src_uri);
        }
    }

    if (log) HTLog_close(log);
    return YES;
}

/*
**  Sort the anchor array and log last modified date
**
**  Sorts `array' with LastModifiedSort and writes one "<date> <uri>" line
**  to the log named by mr->lmfile for every anchor that has an address and
**  a last-modified time greater than zero.
**
**  Returns NO if mr or array is NULL, otherwise YES (also when the log
**  could not be opened, in which case nothing is sorted or written).
*/
PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
{
    HTLog * log;

    if (!mr || !array) return NO;

    log = HTLog_open(mr->lmfile, YES, YES);
    if (log) {
        void ** data = NULL;
        HTParentAnchor * anchor;

        HTArray_sort(array, LastModifiedSort);
        for (anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
             anchor;
             anchor = (HTParentAnchor *) HTArray_nextObject(array, data)) {
            char * uri = HTAnchor_address((HTAnchor *) anchor);
            time_t lm = HTAnchor_lastModified(anchor);
            if (uri && lm > 0)
                HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
            HT_FREE(uri);
        }
        HTLog_close(log);
    }
    return YES;
}

/*
**  Comparator for calculate_lm. `a' and `b' point at HTParentAnchor
**  pointers; the anchor with the later last-modified time compares as
**  "smaller".
*/
PRIVATE int LastModifiedSort (const void * a, const void * b)
{
    time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
    time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);

    /*
    **  Compare instead of returning bb - aa: the difference of two time_t
    **  values need not fit in an int and could come back with the wrong
    **  sign after truncation.
    */
    return (bb > aa) - (bb < aa);
}

/*
**  Sort the anchor array and log the document title
**
**  Sorts `array' with TitleSort and writes one "<charset> `<title>' <uri>"
**  line to the log named by mr->titlefile for every anchor that has an
**  address. A missing charset or title is written as "<none>".
**
**  Returns NO if mr or array is NULL, otherwise YES (also when the log
**  could not be opened, in which case nothing is sorted or written).
*/
PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
{
    HTLog * log;

    if (!mr || !array) return NO;

    log = HTLog_open(mr->titlefile, YES, YES);
    if (log) {
        void ** data = NULL;
        HTParentAnchor * anchor;

        HTArray_sort(array, TitleSort);
        for (anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
             anchor;
             anchor = (HTParentAnchor *) HTArray_nextObject(array, data)) {
            char * uri = HTAnchor_address((HTAnchor *) anchor);
            const char * title = HTAnchor_title(anchor);
            HTCharset charset = HTAnchor_charset(anchor);
            if (uri) HTLog_addText(log, "%s `%s\' %s\n",
                                   charset ? HTAtom_name(charset) : "<none>",
                                   title ? title : "<none>",
                                   uri);
            HT_FREE(uri);
        }
        HTLog_close(log);
    }
    return YES;
}

/*
**  Comparator for calculate_title. `a' and `b' point at HTParentAnchor
**  pointers; titles are compared with strcasecomp with the operands
**  swapped, and a missing title is treated as the empty string.
*/
PRIVATE int TitleSort (const void * a, const void * b)
{
    const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
    const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
    return strcasecomp(bb?bb:"", aa?aa:"");
}

/*
**  Calculate distributions for media types. 
The same mechanism**  can be used for other characteristics with relatively**  few outcomes.*/PRIVATE HTList * mediatype_distribution (HTArray * array){    if (array) {	HTList * mt = HTList_new();	MetaDist * pres = NULL;	void ** data = NULL;	HTParentAnchor * anchor = NULL;	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);	while (anchor) {	    HTFormat format = HTAnchor_format(anchor);	    if (format && format != WWW_UNKNOWN) {		HTList * cur = mt;		/* If found then increase counter */		while ((pres = (MetaDist *) HTList_nextObject(cur))) {		    if (pres->name == format) {			pres->hits++;			break;		    }		}		/* If not found then add new format to list */		if (!pres) {                    if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)        	         HT_OUTOFMEM("mediatype_distribution");		    pres->name = format;		    pres->hits = 1;		    HTList_addObject(mt, pres);		    HTList_insertionSort(mt, FormatSort);		}	    }	    /* Find next anchor in array */	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);	}	return mt;    }    return NULL;}/***  Calculate distributions for charsets. 
The same mechanism**  can be used for other characteristics with relatively**  few outcomes.*/PRIVATE HTList * charset_distribution (HTArray * array){    if (array) {	HTList * cs = HTList_new();	MetaDist * pres = NULL;	void ** data = NULL;	HTParentAnchor * anchor = NULL;	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);	while (anchor) {	    HTCharset charset = HTAnchor_charset(anchor);	    if (charset) {		HTList * cur = cs;		/* If found then increase counter */		while ((pres = (MetaDist *) HTList_nextObject(cur))) {		    if (pres->name == charset) {			pres->hits++;			break;		    }		}		/* If not found then add new format to list */		if (!pres) {                    if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)        	         HT_OUTOFMEM("charset_distribution");		    pres->name = charset;		    pres->hits = 1;		    HTList_addObject(cs, pres);		    HTList_insertionSort(cs, FormatSort);		}	    }	    /* Find next anchor in array */	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);	}	return cs;    }    return NULL;}PRIVATE int FormatSort (const void * a, const void * b){    MetaDist * aa = (MetaDist *) a;    MetaDist * bb = (MetaDist *) b;    return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));}PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution){    if (logfile && distribution) {        HTLog * log = HTLog_open(logfile, YES, YES);	if (log) {	    HTList * cur = distribution;	    MetaDist * pres;	    while ((pres = (MetaDist *) HTList_nextObject(cur))) {		if (pres->name) {		    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));		}	    }	    HTLog_close(log);	}    }    return NO;}PRIVATE BOOL delete_meta_distribution (HTList * distribution){    if (distribution) {	HTList * cur = distribution;	MetaDist * pres;	while ((pres = (MetaDist *) HTList_nextObject(cur)))	    HT_FREE(pres);	HTList_delete(distribution);		return YES;	    }    return NO;}/*	Statistics**	----------**	
Calculates a bunch of statistics for the anchors traversed*/PRIVATE BOOL calculate_statistics (Robot * mr){    long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;    if (!mr) return NO;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -