📄 htrobot.c
字号:
/*
**  @(#) $Id: HTRobot.c,v 1.83 1999/02/23 17:53:30 frystyk Exp $
**
**  W3C Webbot can be found at "http://www.w3.org/Robot/"
**
**  Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
**  Institute of Technology, Institut National de Recherche en
**  Informatique et en Automatique, Keio University). All Rights
**  Reserved. This program is distributed under the W3C's Software
**  Intellectual Property License. This program is distributed in the hope
**  that it will be useful, but WITHOUT ANY WARRANTY; without even the
**  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**  PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**  details.
**
**  Authors:
**      HFN     Henrik Frystyk Nielsen, (frystyk@w3.org)
**      BR      Bob Racko
**      JP      John Punin
**
**  History:
**      Dec 04 95   First version
**      Oct 1998    Split into separate files
*/

#include "HTRobMan.h"
#include "HTQueue.h"
#include "HTAncMan.h"

/* Both macros are true when a robot is given and the named flag is NOT set */
#define SHOW_QUIET(mr)          ((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)     ((mr) && !((mr)->flags & MR_REAL_QUIET))

PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};

/*
**  Some sorting algorithms
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;

/*
**  The callbacks that we need from the libwww HTML parser
*/
PRIVATE HText_new       RHText_new;
PRIVATE HText_delete    RHText_delete;
PRIVATE HText_foundLink RHText_foundLink;

/* ------------------------------------------------------------------------- */

/*  Create a "HyperDoc" object
**  --------------------------
**  A HyperDoc object contains information about whether we have already
**  started checking the anchor and the depth in our search.
**
**  The new object starts with one hit, a code of -1 and the next index
**  taken from mr->cindex. It is bound to the anchor as its document and
**  appended to mr->hyperdoc (the list is created on first use).
**
**  mr is dereferenced unconditionally and must not be NULL.
*/
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
{
    HyperDoc * hd;
    if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
        HT_OUTOFMEM("HyperDoc_new");
    hd->depth = depth;
    hd->hits = 1;
    hd->code = -1;
    hd->index = ++mr->cindex;

    /* Bind the HyperDoc object together with the Anchor Object */
    hd->anchor = anchor;
    HTAnchor_setDocument(anchor, (void *) hd);

    /* Add this HyperDoc object to our list */
    if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
    HTList_addObject(mr->hyperdoc, (void *) hd);
    return hd;
}

/*  Delete a "HyperDoc" object
**  --------------------------
**  Frees the object itself. Returns YES if an object was given, else NO.
*/
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
{
    if (hd) {
        HT_FREE (hd);
        return YES;
    }
    return NO;
}

/*
**  Sort the anchor array and log reference count
**
**  Writes one "<hits> <uri>" line to the log opened on mr->hitfile for
**  every anchor that has both an address and a HyperDoc attached.
**  Returns YES whenever mr and array are given (also when the log could
**  not be opened), otherwise NO.
*/
PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = HTLog_open(mr->hitfile, YES, YES);
        if (log) {
            void ** data = NULL;
            HTParentAnchor * anchor = NULL;
            HTArray_sort(array, HitSort);
            anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
            while (anchor) {
                char * uri = HTAnchor_address((HTAnchor *) anchor);
                HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
                if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
                HT_FREE(uri);
                anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
            }
            HTLog_close(log);
        }
        return YES;
    }
    return NO;
}

/*
**  Comparer for calculate_hits: orders anchors by descending hit count.
**  Anchors without a HyperDoc sort after those that have one; two anchors
**  without a HyperDoc compare equal.
*/
PRIVATE int HitSort (const void * a, const void * b)
{
    HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
    HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
    if (aa && bb) return (bb->hits > aa->hits) - (bb->hits < aa->hits);

    /*
    ** At least one side has no HyperDoc. Compare on presence only rather
    ** than subtracting the two pointers, which is not defined for
    ** unrelated or NULL pointers.
    */
    return (bb != NULL) - (aa != NULL);
}

/*
**  Record a single link relation of the given type from src_uri to the
**  dest anchor. The destination address is resolved here and is always
**  released again, whether or not anything was logged. Nothing is
**  recorded unless both addresses are available.
*/
PRIVATE void log_link_relation (Robot * mr, HTLog * log, HTLinkType type,
                                char * src_uri, HTParentAnchor * dest)
{
    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
        if (mr->sqllog) {
            HTSQLLog_addLinkRelationship (mr->sqllog,
                                          src_uri, dest_uri,
                                          HTAtom_name(type),
                                          NULL);
        }
#endif
        if (log) {
            HTFormat format = HTAnchor_format(dest);
            HTLog_addText(log, "%s %s %s --> %s\n",
                          HTAtom_name(type),
                          format != WWW_UNKNOWN ?
                          HTAtom_name(format) : "<unknown>",
                          src_uri, dest_uri);
        }
    }
    HT_FREE(dest_uri);
}

/*
**  Walk the anchor array and log link relations
**
**  The array is traversed in its current order (no sorting is done here).
**  A log is only opened when mr->relfile is set. Returns YES whenever mr
**  and array are given, otherwise NO.
*/
PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
        void ** data = NULL;
        HTParentAnchor * anchor = NULL;
        anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
        while (anchor) {

            /*
            ** If we have a specific link relation to look for then do this.
            ** Otherwise look for all link relations.
            */
            if (mr->relation) {
                HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor,
                                                      mr->relation);
                if (link) {
                    char * src_uri = HTAnchor_address((HTAnchor *) anchor);
                    log_link_relation(mr, log, mr->relation, src_uri,
                                      HTAnchor_parent(HTLink_destination(link)));
                    HT_FREE(src_uri);
                }
            } else {
                HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
                HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
                char * src_uri = HTAnchor_address((HTAnchor *) anchor);
                HTLinkType linktype;

                /* First look in the main link */
                if (link && (linktype = HTLink_type(link))) {
                    log_link_relation(mr, log, linktype, src_uri,
                                      HTAnchor_parent(HTLink_destination(link)));
                }

                /* and then in any sublinks */
                if (sublinks) {
                    HTLink * pres;
                    while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
                        if ((linktype = HTLink_type(pres))) {
                            log_link_relation(mr, log, linktype, src_uri,
                                              HTAnchor_parent(HTLink_destination(pres)));
                        }
                    }
                }

                /* Cleanup */
                HT_FREE(src_uri);
            }
            anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
        }
        if (log) HTLog_close(log);
        return YES;
    }
    return NO;
}

/*
**  Sort the anchor array and log last modified date
**
**  Writes one "<date> <uri>" line to the log opened on mr->lmfile for
**  every anchor that has an address and a last-modified time greater
**  than zero. Returns YES whenever mr and array are given (also when the
**  log could not be opened), otherwise NO.
*/
PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = HTLog_open(mr->lmfile, YES, YES);
        if (log) {
            void ** data = NULL;
            HTParentAnchor * anchor = NULL;
            HTArray_sort(array, LastModifiedSort);
            anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
            while (anchor) {
                char * uri = HTAnchor_address((HTAnchor *) anchor);
                time_t lm = HTAnchor_lastModified(anchor);
                if (uri && lm > 0)
                    HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
                HT_FREE(uri);
                anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
            }
            HTLog_close(log);
        }
        return YES;
    }
    return NO;
}

/*
**  Comparer for calculate_lm: most recently modified anchor first.
**  The two times are compared rather than subtracted so that the result
**  keeps the right sign when the difference does not fit in an int.
*/
PRIVATE int LastModifiedSort (const void * a, const void * b)
{
    time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
    time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
    return (bb > aa) - (bb < aa);
}

/*
**  Sort the anchor array and log the document title
**
**  Writes one "<charset> `<title>' <uri>" line to the log opened on
**  mr->titlefile for every anchor that has an address; a missing charset
**  or title is written as "<none>". Returns YES whenever mr and array
**  are given (also when the log could not be opened), otherwise NO.
*/
PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = HTLog_open(mr->titlefile, YES, YES);
        if (log) {
            void ** data = NULL;
            HTParentAnchor * anchor = NULL;
            HTArray_sort(array, TitleSort);
            anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
            while (anchor) {
                char * uri = HTAnchor_address((HTAnchor *) anchor);
                const char * title = HTAnchor_title(anchor);
                HTCharset charset = HTAnchor_charset(anchor);
                if (uri) HTLog_addText(log, "%s `%s\' %s\n",
                                       charset ? HTAtom_name(charset) : "<none>",
                                       title ? title : "<none>",
                                       uri);
                HT_FREE(uri);
                anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
            }
            HTLog_close(log);
        }
        return YES;
    }
    return NO;
}

/*
**  Comparer for calculate_title: compares the titles with strcasecomp,
**  passing b's title first. A missing title is treated as the empty
**  string.
*/
PRIVATE int TitleSort (const void * a, const void * b)
{
    const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
    const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
    return strcasecomp(bb?bb:"", aa?aa:"");
}

/*
**  Calculate distributions for media types. The same mechanism
**  can be used for other characteristics with relatively
**  few outcomes.
**
**  Returns a new list of MetaDist objects, one per distinct format seen
**  (anchors with no format or WWW_UNKNOWN are skipped), or NULL if no
**  array is given. The list is to be released by the caller, e.g. with
**  delete_meta_distribution().
*/
PRIVATE HTList * mediatype_distribution (HTArray * array)
{
    if (array) {
        HTList * mt = HTList_new();
        MetaDist * pres = NULL;
        void ** data = NULL;
        HTParentAnchor * anchor = NULL;
        anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
        while (anchor) {
            HTFormat format = HTAnchor_format(anchor);
            if (format && format != WWW_UNKNOWN) {
                HTList * cur = mt;

                /* If found then increase counter */
                while ((pres = (MetaDist *) HTList_nextObject(cur))) {
                    if (pres->name == format) {
                        pres->hits++;
                        break;
                    }
                }

                /* If not found then add new format to list */
                if (!pres) {
                    if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
                        HT_OUTOFMEM("mediatype_distribution");
                    pres->name = format;
                    pres->hits = 1;
                    HTList_addObject(mt, pres);
                    HTList_insertionSort(mt, FormatSort);
                }
            }

            /* Find next anchor in array */
            anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
        }
        return mt;
    }
    return NULL;
}

/*
**  Calculate distributions for charsets. The same mechanism
**  can be used for other characteristics with relatively
**  few outcomes.
**
**  Returns a new list of MetaDist objects, one per distinct charset seen
**  (anchors without a charset are skipped), or NULL if no array is
**  given. The list is to be released by the caller, e.g. with
**  delete_meta_distribution().
*/
PRIVATE HTList * charset_distribution (HTArray * array)
{
    if (array) {
        HTList * cs = HTList_new();
        MetaDist * pres = NULL;
        void ** data = NULL;
        HTParentAnchor * anchor = NULL;
        anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
        while (anchor) {
            HTCharset charset = HTAnchor_charset(anchor);
            if (charset) {
                HTList * cur = cs;

                /* If found then increase counter */
                while ((pres = (MetaDist *) HTList_nextObject(cur))) {
                    if (pres->name == charset) {
                        pres->hits++;
                        break;
                    }
                }

                /* If not found then add new charset to list */
                if (!pres) {
                    if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
                        HT_OUTOFMEM("charset_distribution");
                    pres->name = charset;
                    pres->hits = 1;
                    HTList_addObject(cs, pres);
                    HTList_insertionSort(cs, FormatSort);
                }
            }

            /* Find next anchor in array */
            anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
        }
        return cs;
    }
    return NULL;
}

/*
**  Comparer used when inserting into a distribution list: compares the
**  atom names of two MetaDist objects with strcmp, b's name first.
*/
PRIVATE int FormatSort (const void * a, const void * b)
{
    const MetaDist * aa = (const MetaDist *) a;
    const MetaDist * bb = (const MetaDist *) b;
    return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
}

/*
**  Write a distribution list to the named log file, one "<hits> <name>"
**  line per entry that has a name. Returns YES if the log was opened and
**  written, NO if an argument is missing or the log could not be opened.
*/
PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
{
    if (logfile && distribution) {
        HTLog * log = HTLog_open(logfile, YES, YES);
        if (log) {
            HTList * cur = distribution;
            MetaDist * pres;
            while ((pres = (MetaDist *) HTList_nextObject(cur))) {
                if (pres->name) {
                    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
                }
            }
            HTLog_close(log);
            return YES;
        }
    }
    return NO;
}

/*
**  Free every MetaDist object in a distribution list and then the list
**  itself. Returns YES if a list was given, otherwise NO.
*/
PRIVATE BOOL delete_meta_distribution (HTList * distribution)
{
    if (distribution) {
        HTList * cur = distribution;
        MetaDist * pres;
        while ((pres = (MetaDist *) HTList_nextObject(cur)))
            HT_FREE(pres);
        HTList_delete(distribution);
        return YES;
    }
    return NO;
}

/*  Statistics
**  ----------
**  Calculates a bunch of statistics for the anchors traversed
**
**  NOTE(review): mr is dereferenced in the initializer of total_docs
**  before the NULL check below, so that check cannot protect against a
**  NULL mr.
*/
PRIVATE BOOL calculate_statistics (Robot * mr)
{
    long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
    if (!mr) return NO;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -