⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 select_test.c

📁 使用支持增量学习的监督式学习方法,包括几种不同的分类算法。
💻 C
字号:
/* Copyright (C) 2001-2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Test program. * Test classification of mail and optionally calculate statistics about it. * * @author  Mikael Ylikoski * @date    2001-2002 */#include <getopt.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/socket.h>#include <sys/un.h>#include <time.h>#include <unistd.h>#include "box.h"#include "collection.h"#include "combiner.h"#include "document.h"#include "protocol_c.h"#include "statlib.h"#include "utility.h"static char *adr_str;			/**< Socket address string */static char *mailbox_file;		/**< Mailbox file */static protocol_c_data *pdata;		/**< Protocol data */static test_data *td;			/**< Test data */static enum csm csm;			/**< Class sequence mode */static enum msm msm;			/**< Message sequence mode */static enum trm trm;			/**< Training mode */static enum plotting_mode plot_mode;	/**< Plotting mode */static int print_plot;			/**< Print plot */static int test_and_stat;		/**< Do stats directly */static int mean_n;			/**< Number to average over */static int rank_limit;			/**< Rank limit */static int plot_c;			/**< Plotted classifier */static int use_subject;			/**< Add subject to body text */static int use_combiner;		/**< Use combiner classifier */static int nob;				/**< Number of boxes */static int noc;				/**< Number of classifiers *//** * Command line options. 
*/static struct option longopts[] = {    { "address", required_argument, NULL, 'a' },    { "mailbox-file", required_argument, NULL, 'b' },    { "class-seq", required_argument, NULL, 'c' },    { "conf-file", required_argument, NULL, 'f' },    { "message-seq", required_argument, NULL, 'm' },    { "plot", required_argument, NULL, 'p' },    { "rank-limit", required_argument, NULL, 'r' },    { "train-mode", required_argument, NULL, 't' },    { "combiner", no_argument, NULL, 'C' },    { "plot-classifier", required_argument, NULL, 'P' },    { "test-option", required_argument, NULL, 'T' },    { 0, 0, 0, 0 }};/** * Open document. */static intopen_doc (void) {    const char *text;    int len, bno, mno;    document *doc;    text_part *tp;    protocol_c_open (pdata);    bno = collection_get_class ();    mno = collection_get_msg ();    doc = collection_get_document ();    tp = document_get_parts (doc);    for (; tp; tp = tp->next) {	len = tp->len;	if (len > 10000)	    len = 10000;	protocol_c_part (pdata, "text", tp->charset, tp->text, len);    }    text = document_get_subject (doc);    if (text)	protocol_c_part (pdata, "subject", NULL, text, strlen (text));    text = document_get_from_name (doc);    if (text)	protocol_c_part (pdata, "from", NULL, text, strlen (text));    collection_drop_document (doc);    return 0;}/** * Close document. */static intclose_doc (void) {    protocol_c_close (pdata);    return 0;}/** * Train classifier with a message. * * @param bno  mailbox number of training message * @param mno  message number of training message * @return Zero if ok, or nonzero if there was an error. */static inttrain_msg (void) {    int i, bno, mno;    bno = collection_get_class ();    mno = collection_get_msg ();    i = protocol_c_learn (pdata, bno);    if (i)	return -1;    if (!test_and_stat)	printf ("> %d/%d\n", bno, mno);    return 0;}/** * Test classification of a message and print the result. 
* * @param bno  mailbox number of test message * @param mno  message number of test message * @return Zero if ok, or nonzero if there was an error. */static inttest_msg (void) {    char buf[10];    int *il, i, k;    int bno, mno;    bno = collection_get_class ();    mno = collection_get_msg ();    if (test_and_stat) {	td->bno = bno;	td->mno = mno;    } else	printf ("< %d/%d\n", bno, mno);    for (k = 0; k < noc; k++) {	/* if (score)	   dl = protocol_c_classify_score (pdata, "0");	   if (test_and_stat) {	   if (dl->len < nob) {	   dl->array = my_realloc (dl->array, nob * sizeof(double));	   for (i = dl->len; i < nob; i++)	   dl->array[i] = 0;	   }	   td->res[0].cm = SCORE;	   td->res[0].len = dl->len;	   td->res[0].u.slist = dl->array;	   free (dl);	   } else {	   printf ("c:");	   for (i = 0; i < dl->len; i++)	   printf (" %.3f", dl->array[i]);	   free (dl->array);	   free (dl);	   printf ("\n");	   }	*/	/* if (rank) */	sprintf (buf, "%d", k);	il = protocol_c_classify_rank (pdata, buf);	if (!il) {	    fprintf (stderr, "Error: Communication error\n");	    exit (1);	}	if (test_and_stat) {	    for (i = 0; 1; i++)		if (il[i] == -1)		    break;	    td->res[k].cm = RANK;	    td->res[k].len = i;	    td->res[k].u.rlist = il;	} else {	    printf ("c%d:", k);	    if (il[0] == -1)		printf (" -1");	    else		for (i = 0; il[i] != -1; i++)		    printf (" %d", il[i]);	    free (il);	    printf ("\n");	}    }    /* combiner */    if (use_combiner) {	combi_results *cr;	cr = my_malloc (sizeof(combi_results));	cr->res = my_malloc (noc * sizeof(int *));	cr->nor = noc;	for (i = 0; i < noc; i++)	    cr->res[i] = td->res[i].u.rlist;	il = combiner_combine_rank (NULL, cr);	td->res[noc].cm = RANK;	td->res[noc].len = td->res[0].len;	td->res[noc].u.rlist = il;	free (cr->res);	free (cr);    }    if (test_and_stat) {	if (statlib_update_stats (td)) {	    printf ("Error updating stats!\n");	    return -1;	}	for (i = 0; i < noc; i++) {	    if (td->res[i].cm == SCORE && td->res[i].u.slist) {		free 
(td->res[i].u.slist);		td->res[i].u.slist = NULL;	    } else if (td->res[i].cm == RANK && td->res[i].u.rlist) {		free (td->res[i].u.rlist);		td->res[i].u.rlist = NULL;	    }	}	if (use_combiner) {	    free (td->res[i].u.rlist);	    td->res[i].u.rlist = NULL;	}	td->bno = -1;	td->mno = -1;	if (print_plot) {	    if (mno == 0)		printf ("# New box: %d\n", bno);	    statlib_print_plot_data ();	}    } else	printf (";\n");    return 0;}/** * Read command line options. * * @param argc  argument count * @param argv  argument vector * @return Zero if ok, or nonzero otherwise. */static intread_opts (int argc, char *argv[]) {    int i, retval;    retval = 0;    while ((i = getopt_long (argc, argv, "a:b:c:f:m:p:r:t:CP:Q:T:",			     longopts, NULL)) != EOF) {        switch (i) {        case 'a':	    adr_str = optarg;            break;        case 'b':            mailbox_file = optarg;            break;        case 'c':	    if (!strcmp (optarg, "cross"))		csm = CROSS;	    else if (!strcmp (optarg, "linear"))		csm = LINEAR;	    else if (!strcmp (optarg, "random"))		csm = RANDOM;	    else if (!strcmp (optarg, "time"))		csm = TIME;	    else {		fprintf (stderr, "Error: Unknown class sequence mode: %s\n",			 optarg);		retval = -1;		break;	    }            break;	case 'f':	    break;	case 'm':	    if (!strcmp (optarg, "linear"))		msm = LINEAR;	    else if (!strcmp (optarg, "random"))		msm = RANDOM;	    else {		fprintf (stderr, "Error: Unknown message sequence mode: %s\n",			 optarg);		retval = -1;	    }	    break;        case 'p':	    if (!strcmp (optarg, "last_n")) {		plot_mode = N_AVERAGE;		print_plot = 1;		// FIXME should set rank_limit	    } else if (!strcmp (optarg, "total")) {		plot_mode = TOTAL;		print_plot = 1;	    } else if (!strcmp (optarg, "off")) {		//plot_mode = NONE;		print_plot = 0;	    } else {		fprintf (stderr, "Error: Unknown plot type: %s\n", optarg);		retval = -1;	    }            break;        case 'r':	    rank_limit = atoi (optarg);	    if (rank_limit < 1) {		
fprintf (stderr, "Error: Incorrect rank_limit value: %s\n",			 optarg);		retval = -1;		break;	    }            break;	case 't':	    if (!strcmp (optarg, "all"))		trm = ALL;	    else if (!strcmp (optarg, "interleaved"))		trm = ALL_INTERLEAVED;	    else if (!strcmp (optarg, "class_percentage"))		trm = CLASS_PERCENTAGE;	    else if (!strcmp (optarg, "total_percentage"))		trm = TOTAL_PERCENTAGE;	    else {		fprintf (stderr, "Error: Unknown training mode: %s\n", optarg);		retval = -1;	    }	    break;        case 'C':	    use_combiner = 1;            break;        case 'P':	    plot_c = atoi (optarg);            break;	case 'Q':	// Quickoption for selectd	    break;	case 'T':	// Just for my convenience, should be removed	    if (!strcmp (optarg, "cross"))		csm = CROSS;	    else if (!strcmp (optarg, "combi")) {		use_combiner = 1;		plot_c = 3;	    } else {		fprintf (stderr, "Error: Unknown test option: %s\n", optarg);		retval = -1;	    }	    break;        default:	    printf ("Usage: ...\n");	    return -1;        }    }    return retval;}/** * Read configuration file. * * @return Zero if ok, or nonzero if there was an error. 
*/static intread_config (const char *file) {    FILE *fd;    int retval;    conf_pair *cp;    fd = fopen (file, "r");    if (!fd)	return -1;    retval = 0;    cp = my_malloc (sizeof(conf_pair));    while (get_next_configuration (fd, cp)) {	if (!strcmp (cp->key, "address")) {	    if (cp->value)		adr_str = my_strdup (cp->value);	} else if (!strcmp (cp->key, "mailbox_file"))	    mailbox_file = my_strdup (cp->value);	else if (!strcmp (cp->key, "combiner"))	    use_combiner = !strcmp (cp->value, "on");	else if (!strcmp (cp->key, "rank_limit")) {	    rank_limit = atoi (cp->value);	    if (rank_limit < 1) {		fprintf (stderr, "Error: Incorrect rank_limit value: %s\n",			 cp->key);		retval = -1;		break;	    }	} else if (!strcmp (cp->key, "plot"))	    if (!strcmp (cp->value, "last_n")) {		plot_mode = N_AVERAGE;		print_plot = 1;		// FIXME should set rank_limit	    } else if (!strcmp (cp->value, "total")) {		plot_mode = TOTAL;		print_plot = 1;	    } else if (!strcmp (cp->value, "off")) {		//plot_mode = NONE;		print_plot = 0;	    } else {		fprintf (stderr, "Error: Unknown plot type: %s\n", cp->value);		return -1;	    }	else if (!strcmp (cp->key, "plot_classifier"))	    plot_c = atoi (cp->value);	else if (!strcmp (cp->key, "class_seq"))	    if (!strcmp (cp->value, "cross"))		csm = CROSS;	    else if (!strcmp (cp->value, "linear"))		csm = LINEAR;	    else if (!strcmp (cp->value, "random"))		csm = RANDOM;	    else if (!strcmp (cp->value, "time"))		csm = TIME;	    else {		fprintf (stderr, "Error: Unknown class sequence mode: %s\n",			 cp->value);		retval = -1;		break;	    }	else if (!strcmp (cp->key, "msg_seq"))	    if (!strcmp (cp->value, "linear"))		msm = LINEAR;	    else if (!strcmp (cp->value, "random"))		msm = RANDOM;	    else {		fprintf (stderr, "Error: Unknown message sequence mode: %s\n",			 cp->value);		retval = -1;		break;	    }	else if (!strcmp (cp->key, "training_mode"))	    if (!strcmp (cp->value, "all"))		trm = ALL;	    else if (!strcmp (cp->value, "interleaved"))		
trm = ALL_INTERLEAVED;	    else if (!strcmp (cp->value, "class_percentage"))		trm = CLASS_PERCENTAGE;	    else if (!strcmp (cp->value, "total_percentage"))		trm = TOTAL_PERCENTAGE;	    else {		fprintf (stderr, "Error: Unknown training mode: %s\n",			 cp->value);		retval = -1;		break;	    }	else {	    fprintf (stderr, "Error: Unknown configuration key: %s\n",		     cp->key);	    retval = -1;	    break;	}    }    free (cp);    fclose (fd);    return retval;}/** * Read mailbox names. * * @return Zero if ok, or nonzero if there was an error. */static intread_mailconfig (void) {    FILE *fp;    char buf[128];    int i;    box *b;    fp = fopen (mailbox_file, "r");    if (!fp)	return -1;    collection_init (csm, msm, trm, RFC822);    for (i = 0; get_line_nows (fp, buf, 128) && i < 128; i++) {	b = box_new (buf);	if (!b) {	    fprintf (stderr, "Error: Cannot read mailbox '%s'!\n", buf);	    return -1;	}	collection_add_box (b);    }    nob = i;    fclose (fp);    return 0;}/** * Main program. */intmain (int argc, char *argv[]) {    char *conf_file;    int i;    /* Default configuration */    conf_file = NULL;    adr_str = NULL;    mailbox_file = NULL;    csm = TIME;    msm = LINEAR_SEQ;    trm = ALL_INTERLEAVED;    plot_mode = N_AVERAGE;    print_plot = 1;    test_and_stat = 1;    mean_n = 50;    rank_limit = 3;    plot_c = 0;    use_subject = 1;    use_combiner = 0;    nob = 0;    /* Initialization */    srand (time (NULL));    /* Find configuration file */    for (i = 1; i < argc; i++)	if (!strcmp (argv[i], "-f")) {	    if (argc > i + 1 && *argv[i + 1] != '-')		conf_file = argv[i + 1];	    else		printf ("error in option -f\n");	} else if (!strncmp (argv[i], "--conf-file=", 12)) {	    if (argv[i][12] != '\0')		conf_file = &argv[i][12];	    else		printf ("error in option --conf-file\n");	}    /* Read configuration file */    if (conf_file)	if (read_config (conf_file)) {	    fprintf (stderr, "Error: Cannot read configuration!\n");	    return 1;	}    /* Read command line 
options */    if (read_opts (argc, argv)) {	fprintf (stderr, "Error: Cannot read options!\n");	return 1;    }    /* Read mailbox configuration */    if (!mailbox_file) {	fprintf (stderr, "Error: No mailbox configuration file specified!\n");	return 1;    }    if (read_mailconfig ()) {	fprintf (stderr, "Error: Cannot read mailbox configuration!\n");	return 1;    }    pdata = protocol_c_new (15000, adr_str);    i = protocol_c_open (pdata);    if (i) {	fprintf (stderr, "Error: Cannot connect to daemon!\n");	return 1;    }    noc = protocol_c_get_integer (pdata, "noc");    protocol_c_close (pdata);    if (noc < 1) {	fprintf (stderr, "Error: No classifiers\n");	return 1;    }    if (use_combiner)	noc++;		// Decremented later    if (plot_c > noc) {	fprintf (stderr, "Error: Incorrect classifier to plot.\n");	return 1;    }    printf ("# noc=%d ", noc);    if (csm == TIME)	printf ("time ");    else if (csm == CROSS)	printf ("cross ");    if (use_combiner)	printf ("combi ");    printf ("\n");    if (test_and_stat) {	td = my_malloc (sizeof(test_data));	td->res = my_malloc (sizeof(cls_res) * noc);	for (i = 0; i < noc; i++) {	    td->res[i].len = 0;	    td->res[i].u.slist = NULL;	    td->res[i].u.rlist = NULL;	}	td->bno = -1;	td->mno = -1;	td->nor = noc;	statlib_initialize (plot_mode, rank_limit, mean_n, plot_c, noc, nob);	if (print_plot)	    statlib_print_plot_header ();    } else {	/* Print header */	printf ("! 
noc=%d nob=%d nom=%d\n", noc, nob, collection_get_nod ());    }    if (use_combiner)	noc--;    if (trm == ALL_INTERLEAVED)	while (collection_next_document ()) {	    open_doc ();	    if (test_msg ())		return 1;	    if (train_msg ())		return 1;	    close_doc ();	}    else {	while (collection_next_document ()) {	    open_doc ();	    if (train_msg ())		return 1;	    close_doc ();	}	while (collection_next_test_document ())	    open_doc ();	    if (test_msg ())		return 1;	    close_doc ();    }    if (test_and_stat)	statlib_print_results ();    else	printf ("# End of testing\n");    /*i = dict_get_size (vectorizer_get_dictionary (vec));      printf ("## Total number of words: %d\n", i);*/    //from_print (fdb);    return 0;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -