📄 visitors.c
字号:
/* visitors -- very fast web logs analyzer. * * Copyright (C) 2004-2006 Salvatore Sanfilippo <antirez@invece.org> * All Rights Reserved. * * This software is released under the terms of the GPL license version 2. * Read the COPYING file in this distribution for more details. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <time.h>#include <stdarg.h>#include <errno.h>#include <locale.h>#include <ctype.h>#include "aht.h"#include "antigetopt.h"#include "sleep.h"#include "blacklist.h"/* Max length of an error stored in the visitors handle */#define VI_ERROR_MAX 1024/* Max length of a log line */#define VI_LINE_MAX 4096/* Max number of filenames in the command line */#define VI_FILENAMES_MAX 1024/* Max number of prefixes in the command line */#define VI_PREFIXES_MAX 1024/* Max number of --grep --exclude patterns in the command line */#define VI_GREP_PATTERNS_MAX 1024/* Abbreviation length for HTML outputs */#define VI_HTML_ABBR_LEN 100/* Version as a string */#define VI_DATE_MAX 64/* Max length of a log entry date */#define VI_VERSION_STR "0.7"/*------------------------------- data structures ----------------------------*//* visitors handle */struct vih { int startt; int endt; int processed; int invalid; int blacklisted; int hour[24]; int weekday[7]; int weekdayhour[7][24]; /* hour and weekday combined data */ int monthday[12][31]; /* month and day combined data */ struct hashtable visitors; struct hashtable googlevisitors; struct hashtable pages; struct hashtable images; struct hashtable error404; struct hashtable pageviews; struct hashtable pageviews_grouped; struct hashtable referers; struct hashtable referersage; struct hashtable date; struct hashtable googledate; struct hashtable adsensed; struct hashtable month; struct hashtable googlemonth; struct hashtable agents; struct hashtable googled; struct hashtable googlevisits; struct hashtable googlekeyphrases; struct hashtable googlekeyphrasesage; struct hashtable trails; struct hashtable tld; 
struct hashtable os; struct hashtable browsers; struct hashtable robots; struct hashtable googlehumanlanguage; struct hashtable screenres; struct hashtable screendepth; char *error;};/* info associated with a line of log */struct logline { char *host; char *date; char *hour; char *timezone; char *req; char *ref; char *agent; time_t time; struct tm tm;};/* output module structure. See below for the definition of * the text and html output modules. */struct outputmodule { void (*print_header)(FILE *fp); void (*print_footer)(FILE *fp); void (*print_title)(FILE *fp, char *title); void (*print_subtitle)(FILE *fp, char *title); void (*print_numkey_info)(FILE *fp, char *key, int val); void (*print_keykey_entry)(FILE *fp, char *key1, char *key2, int num); void (*print_numkey_entry)(FILE *fp, char *key, int val, char *link, int num); void (*print_numkeybar_entry)(FILE *fp, char *key, int max, int tot, int this); void (*print_numkeycomparativebar_entry)(FILE *fp, char *key, int tot, int this); void (*print_bidimentional_map)(FILE *fp, int xlen, int ylen, char **xlabel, char **ylabel, int *value); void (*print_hline)(FILE *fp); void (*print_credits)(FILE *fp); void (*print_report_link)(FILE *fp, char *report);};/* Just a string with cached length */struct vistring { char *str; int len;};/* Grep pattern for --grep --exclude */#define VI_PATTERNTYPE_GREP 0#define VI_PATTERNTYPE_EXCLUDE 1struct greppat { int type; char *pattern;};/* ---------------------- global configuration parameters ------------------- */int Config_debug = 0;int Config_max_referers = 20;int Config_max_referers_age = 20;int Config_max_pages = 20;int Config_max_images = 20;int Config_max_error404 = 20;int Config_max_agents = 20;int Config_max_googled = 20;int Config_max_adsensed = 20;int Config_max_google_keyphrases = 20;int Config_max_google_keyphrases_age = 20;int Config_max_trails = 20;int Config_max_tld = 20;int Config_max_robots = 20;int Config_process_agents = 0;int Config_process_google = 0;int 
Config_process_google_keyphrases = 0;int Config_process_google_keyphrases_age = 0;int Config_process_google_human_language = 0;int Config_process_web_trails = 0;int Config_process_weekdayhour_map = 0;int Config_process_monthday_map = 0;int Config_process_referers_age = 0;int Config_process_tld = 0;int Config_process_os = 0;int Config_process_browsers = 0;int Config_process_error404 = 0;int Config_process_pageviews = 0;int Config_process_monthly_visitors = 1;int Config_process_robots = 0;int Config_process_screen_info = 0;int Config_graphviz_mode = 0;int Config_graphviz_ignorenode_google = 0;int Config_graphviz_ignorenode_external = 0;int Config_graphviz_ignorenode_noreferer = 0;int Config_tail_mode = 0;int Config_stream_mode = 0;int Config_update_every = 60*10; /* update every 10 minutes for default. */int Config_reset_every = 0; /* never reset for default */int Config_time_delta = 0; /* adjustable time difference */int Config_filter_spam = 0;int Config_ignore_404 = 0;char *Config_output_file = NULL; /* stdout if not set. 
*/struct outputmodule *Output = NULL; /* intialized to 'text' in main() *//* Prefixes */int Config_prefix_num = 0; /* number of set prefixes */struct vistring Config_prefix[VI_PREFIXES_MAX];/* Grep/Exclude array */struct greppat Config_grep_pattern[VI_GREP_PATTERNS_MAX];int Config_grep_pattern_num = 0; /* number of set patterns *//*----------------------------------- Tables ---------------------------------*/static char *vi_wdname[7] = {"Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"};#if 0static int vi_monthdays[12] = {31, 29, 31, 30, 31, 30 , 31, 31, 30, 31, 30, 31};#endif/* -------------------------------- prototypes ------------------------------ */void vi_clear_error(struct vih *vih);void vi_tail(int filec, char **filev);/*------------------- Options parsing help functions ------------------------ */void ConfigAddGrepPattern(char *pattern, int type){ char *s; int len = strlen(pattern); if (Config_grep_pattern_num == VI_GREP_PATTERNS_MAX) { fprintf(stderr, "Too many grep/exclude options specified\n"); exit(1); } s = malloc(strlen(pattern)+3); s[0] = '*'; memcpy(s+1, pattern, len); s[len+1] = '*'; s[len+2] = '\0'; Config_grep_pattern[Config_grep_pattern_num].type = type; Config_grep_pattern[Config_grep_pattern_num].pattern = s; Config_grep_pattern_num++;}/*------------------------------ support functions -------------------------- *//* Returns non-zero if the link seems like a google link, zero otherwise. * Note that this function only checks for a prefix of www.google.<something>. * so may be fooled. */int vi_is_google_link(char *s){ return !strncmp(s, "http://www.google.", 18);}/* Returns non-zero if the user agent appears to be the GoogleBot. */int vi_is_googlebot_agent(char *agent) { if (strstr(agent, "Googlebot") || strstr(agent, "googlebot")) return 1; return 0;}/* Returns non-zero if the user agent appears to be the Mediapartners-Google. 
*/int vi_is_adsensebot_agent(char *agent) { if (strstr(agent, "Mediapartners-Google")) return 1; return 0;}int vi_is_yahoobot_agent(char *agent) { if (strstr(agent, "Yahoo! Slurp")) return 1; return 0;}int vi_is_msbot_agent(char *agent) { if (strstr(agent, "msn.com/msnbot.htm")) return 1; return 0;}/* Try to guess if a given agent string is about a crawler/bot * of some time. This function MUST be conservative, because * false negatives are acceptable while false positives arent. */int vi_is_genericbot_agent(char *agent) { if (strstr(agent, "crawler") || strstr(agent, "Crawler") || strstr(agent, "bot/") || strstr(agent, "Bot/") || strstr(agent, "bot.htm") || strstr(agent, "+http://")) return 1; return 0;}int vi_is_bot_agent(char *agent) { if (vi_is_googlebot_agent(agent) || vi_is_adsensebot_agent(agent) || vi_is_yahoobot_agent(agent) || vi_is_msbot_agent(agent)) return 1; return 0;}/* Returns non-zero if the url matches some user-specified prefix. * being a link "internal" to the site. Otherwise zero is returned. * * When there is a match, the value returned is the length of * the matching prefix. */int vi_is_internal_link(char *url){ int i, l; if (!Config_prefix_num) return 0; /* no prefixes set? */ l = strlen(url); for (i = 0; i < Config_prefix_num; i++) { if (Config_prefix[i].len <= l && !strncasecmp(url, Config_prefix[i].str, Config_prefix[i].len)) { return Config_prefix[i].len; } } return 0;}/* returns non-zero if the URL 's' seems an image or a CSS file. 
*/

/* Returns non-zero if the string 's', whose length is 'slen', ends with
 * 'suffix'. The comparison is case sensitive. A suffix longer than the
 * string never matches, so no byte before the start of 's' is ever read. */
static int vi_ends_with(const char *s, size_t slen, const char *suffix)
{
    size_t suflen = strlen(suffix);

    if (suflen > slen) return 0;
    return memcmp(s + slen - suflen, suffix, suflen) == 0;
}

/* vi_is_image() only looks at the file extension, that must be written
 * all lowercase or all uppercase: .css .jpg .gif .png .ico .swf .js .jpeg.
 * Strings shorter than five characters are never considered a match. */
int vi_is_image(char *s)
{
    static const char *const ext[] = {
        ".css", ".jpg", ".gif", ".png", ".ico", ".swf", ".js", ".jpeg",
        ".CSS", ".JPG", ".GIF", ".PNG", ".ICO", ".SWF", ".JS", ".JPEG",
    };
    size_t len = strlen(s);
    size_t i;

    if (len < 5) return 0;
    for (i = 0; i < sizeof(ext)/sizeof(ext[0]); i++) {
        if (vi_ends_with(s, len, ext[i])) return 1;
    }
    return 0;
}

/* returns non-zero if the URL 's' seems a real page.
 *
 * A URL is considered a page when one of the following is true:
 *
 * 1) It ends with a slash.
 * 2) It ends with one of the known page extensions, written all
 *    lowercase or all uppercase: .html .htm .php .asp .jsp .xdl .xhtml
 *    .xml .cgi .pl .shtml.
 * 3) It has no extension at all: there is no dot in the string, or the
 *    last dot comes before the last slash.
 *
 * The empty string has no dot, so it is reported as a page (rule 3)
 * without reading the byte that comes before the start of the string. */
int vi_is_pageview(char *s)
{
    static const char *const ext[] = {
        ".html", ".htm", ".php", ".asp", ".jsp", ".xdl",
        ".xhtml", ".xml", ".cgi", ".pl", ".shtml",
        ".HTML", ".HTM", ".PHP", ".ASP", ".JSP", ".XDL",
        ".XHTML", ".XML", ".CGI", ".PL", ".SHTML",
    };
    size_t len = strlen(s);
    size_t i;
    char *dot, *slash;

    if (len > 0 && s[len-1] == '/') return 1;
    /* Every extension is compared using its own length, so ".xhtml" is
     * matched as a whole and short names like "/a.pl" are not missed. */
    for (i = 0; i < sizeof(ext)/sizeof(ext[0]); i++) {
        if (vi_ends_with(s, len, ext[i])) return 1;
    }
    dot = strrchr(s, '.');
    if (!dot) return 1;
    slash = strrchr(s, '/');
    if (slash && slash > dot) return 1;
    return 0;
}

/* returns non-zero if 'ip' seems a string representing an IP address
 * like "1.2.3.4". Note that 'ip' is always an IP or an hostname
 * so this function actually tests if the string pointed by 'ip' only
 * contains characters in the "[0-9.]" set. The empty string passes
 * the test as well. */
int vi_is_numeric_address(char *ip)
{
    return strspn(ip, "0123456789.") == strlen(ip);
}

/* returns the time converted into a time_t value.
* On error (time_t) -1 is returned. * Note that this function is specific for the following format: * "10/May/2004:04:15:33". Works if the month is not an abbreviation, or if the * year is abbreviated to only the last two digits. * The time can be omitted like in "10/May/2004". */time_t parse_date(char *s, struct tm *tmptr){ struct tm tm; time_t t; char *months[] = { "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", }; char *day, *month, *year, *time = NULL; char monthaux[32]; int i, len; /* make a copy to mess with it */ len = strlen(s); if (len >= 32) goto fmterr; memcpy(monthaux, s, len); monthaux[len] = '\0'; /* Inizialize the tm structure. We just fill three fields */ tm.tm_sec = 0; tm.tm_min = 0; tm.tm_hour = 0; tm.tm_mday = 0; tm.tm_mon = 0; tm.tm_year = 0; tm.tm_wday = 0; tm.tm_yday = 0; tm.tm_isdst = -1; /* search delimiters */ day = monthaux; if ((month = strchr(day, '/')) == NULL) goto fmterr; *month++ = '\0'; if ((year = strchr(month, '/')) == NULL) goto fmterr; *year++ = '\0'; /* time, optional for this parser. 
*/ if ((time = strchr(year, ':')) != NULL) { *time++ = '\0'; } /* convert day */ tm.tm_mday = atoi(day); if (tm.tm_mday < 1 || tm.tm_mday > 31) goto fmterr; /* convert month */ if (strlen(month) < 3) goto fmterr; month[0] = tolower(month[0]); month[1] = tolower(month[1]); month[2] = tolower(month[2]); for (i = 0; i < 12; i++) { if (memcmp(month, months[i], 3) == 0) break; } if (i == 12) goto fmterr; tm.tm_mon = i; /* convert year */ tm.tm_year = atoi(year); if (tm.tm_year > 100) { if (tm.tm_year < 1900 || tm.tm_year > 2500) goto fmterr; tm.tm_year -= 1900; } else { /* if the year is in two-digits form, the 0 - 68 range * is converted to 2000 - 2068 */ if (tm.tm_year < 69) tm.tm_year += 100; } /* convert time */ if (time) { /* format is HH:MM:SS */ if (strlen(time) < 8) goto fmterr; tm.tm_hour = ((time[0]-'0')*10)+(time[1]-'0'); if (tm.tm_hour < 0 || tm.tm_hour > 23) goto fmterr; tm.tm_min = ((time[3]-'0')*10)+(time[4]-'0'); if (tm.tm_min < 0 || tm.tm_min > 59) goto fmterr; tm.tm_sec = ((time[6]-'0')*10)+(time[7]-'0'); if (tm.tm_sec < 0 || tm.tm_sec > 60) goto fmterr; } t = mktime(&tm); if (t == (time_t)-1) goto fmterr; t += (Config_time_delta*3600); if (tmptr) { struct tm *auxtm; if ((auxtm = localtime(&t)) != NULL) *tmptr = *auxtm; } return t;fmterr: /* format error */ return (time_t) -1;}/* returns 1 if the given date is Saturday or Sunday. * Zero is otherwise returned. */int vi_is_weekend(char *s){ struct tm tm; if (parse_date(s, &tm) != (time_t)-1) { if (tm.tm_wday == 0 || tm.tm_wday == 6) return 1; } return 0;}#if 0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -