📄 select_test.c
字号:
/* Copyright (C) 2001-2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Test program. * Test classification of mail and optionally calculate statistics about it. * * @author Mikael Ylikoski * @date 2001-2002 */#include <getopt.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/socket.h>#include <sys/un.h>#include <time.h>#include <unistd.h>#include "box.h"#include "collection.h"#include "combiner.h"#include "document.h"#include "protocol_c.h"#include "statlib.h"#include "utility.h"static char *adr_str; /**< Socket address string */static char *mailbox_file; /**< Mailbox file */static protocol_c_data *pdata; /**< Protocol data */static test_data *td; /**< Test data */static enum csm csm; /**< Class sequence mode */static enum msm msm; /**< Message sequence mode */static enum trm trm; /**< Training mode */static enum plotting_mode plot_mode; /**< Plotting mode */static int print_plot; /**< Print plot */static int test_and_stat; /**< Do stats directly */static int mean_n; /**< Number to average over */static int rank_limit; /**< Rank limit */static int plot_c; /**< Plotted classifier */static int use_subject; /**< Add subject to body text */static int use_combiner; /**< Use combiner classifier */static int nob; /**< Number of boxes */static int noc; /**< Number of classifiers *//** * Command line options. */static struct option longopts[] = { { "address", required_argument, NULL, 'a' }, { "mailbox-file", required_argument, NULL, 'b' }, { "class-seq", required_argument, NULL, 'c' }, { "conf-file", required_argument, NULL, 'f' }, { "message-seq", required_argument, NULL, 'm' }, { "plot", required_argument, NULL, 'p' }, { "rank-limit", required_argument, NULL, 'r' }, { "train-mode", required_argument, NULL, 't' }, { "combiner", no_argument, NULL, 'C' }, { "plot-classifier", required_argument, NULL, 'P' }, { "test-option", required_argument, NULL, 'T' }, { 0, 0, 0, 0 }};/** * Open document. */static intopen_doc (void) { const char *text; int len, bno, mno; document *doc; text_part *tp; protocol_c_open (pdata); bno = collection_get_class (); mno = collection_get_msg (); doc = collection_get_document (); tp = document_get_parts (doc); for (; tp; tp = tp->next) { len = tp->len; if (len > 10000) len = 10000; protocol_c_part (pdata, "text", tp->charset, tp->text, len); } text = document_get_subject (doc); if (text) protocol_c_part (pdata, "subject", NULL, text, strlen (text)); text = document_get_from_name (doc); if (text) protocol_c_part (pdata, "from", NULL, text, strlen (text)); collection_drop_document (doc); return 0;}/** * Close document. */static intclose_doc (void) { protocol_c_close (pdata); return 0;}/** * Train classifier with a message. * * @param bno mailbox number of training message * @param mno message number of training message * @return Zero if ok, or nonzero if there was an error. */static inttrain_msg (void) { int i, bno, mno; bno = collection_get_class (); mno = collection_get_msg (); i = protocol_c_learn (pdata, bno); if (i) return -1; if (!test_and_stat) printf ("> %d/%d\n", bno, mno); return 0;}/** * Test classification of a message and print the result. * * @param bno mailbox number of test message * @param mno message number of test message * @return Zero if ok, or nonzero if there was an error. */static inttest_msg (void) { char buf[10]; int *il, i, k; int bno, mno; bno = collection_get_class (); mno = collection_get_msg (); if (test_and_stat) { td->bno = bno; td->mno = mno; } else printf ("< %d/%d\n", bno, mno); for (k = 0; k < noc; k++) { /* if (score) dl = protocol_c_classify_score (pdata, "0"); if (test_and_stat) { if (dl->len < nob) { dl->array = my_realloc (dl->array, nob * sizeof(double)); for (i = dl->len; i < nob; i++) dl->array[i] = 0; } td->res[0].cm = SCORE; td->res[0].len = dl->len; td->res[0].u.slist = dl->array; free (dl); } else { printf ("c:"); for (i = 0; i < dl->len; i++) printf (" %.3f", dl->array[i]); free (dl->array); free (dl); printf ("\n"); } */ /* if (rank) */ sprintf (buf, "%d", k); il = protocol_c_classify_rank (pdata, buf); if (!il) { fprintf (stderr, "Error: Communication error\n"); exit (1); } if (test_and_stat) { for (i = 0; 1; i++) if (il[i] == -1) break; td->res[k].cm = RANK; td->res[k].len = i; td->res[k].u.rlist = il; } else { printf ("c%d:", k); if (il[0] == -1) printf (" -1"); else for (i = 0; il[i] != -1; i++) printf (" %d", il[i]); free (il); printf ("\n"); } } /* combiner */ if (use_combiner) { combi_results *cr; cr = my_malloc (sizeof(combi_results)); cr->res = my_malloc (noc * sizeof(int *)); cr->nor = noc; for (i = 0; i < noc; i++) cr->res[i] = td->res[i].u.rlist; il = combiner_combine_rank (NULL, cr); td->res[noc].cm = RANK; td->res[noc].len = td->res[0].len; td->res[noc].u.rlist = il; free (cr->res); free (cr); } if (test_and_stat) { if (statlib_update_stats (td)) { printf ("Error updating stats!\n"); return -1; } for (i = 0; i < noc; i++) { if (td->res[i].cm == SCORE && td->res[i].u.slist) { free (td->res[i].u.slist); td->res[i].u.slist = NULL; } else if (td->res[i].cm == RANK && td->res[i].u.rlist) { free (td->res[i].u.rlist); td->res[i].u.rlist = NULL; } } if (use_combiner) { free (td->res[i].u.rlist); td->res[i].u.rlist = NULL; } td->bno = -1; td->mno = -1; if (print_plot) { if (mno == 0) printf ("# New box: %d\n", bno); statlib_print_plot_data (); } } else printf (";\n"); return 0;}/** * Read command line options. * * @param argc argument count * @param argv argument vector * @return Zero if ok, or nonzero otherwise. */static intread_opts (int argc, char *argv[]) { int i, retval; retval = 0; while ((i = getopt_long (argc, argv, "a:b:c:f:m:p:r:t:CP:Q:T:", longopts, NULL)) != EOF) { switch (i) { case 'a': adr_str = optarg; break; case 'b': mailbox_file = optarg; break; case 'c': if (!strcmp (optarg, "cross")) csm = CROSS; else if (!strcmp (optarg, "linear")) csm = LINEAR; else if (!strcmp (optarg, "random")) csm = RANDOM; else if (!strcmp (optarg, "time")) csm = TIME; else { fprintf (stderr, "Error: Unknown class sequence mode: %s\n", optarg); retval = -1; break; } break; case 'f': break; case 'm': if (!strcmp (optarg, "linear")) msm = LINEAR; else if (!strcmp (optarg, "random")) msm = RANDOM; else { fprintf (stderr, "Error: Unknown message sequence mode: %s\n", optarg); retval = -1; } break; case 'p': if (!strcmp (optarg, "last_n")) { plot_mode = N_AVERAGE; print_plot = 1; // FIXME should set rank_limit } else if (!strcmp (optarg, "total")) { plot_mode = TOTAL; print_plot = 1; } else if (!strcmp (optarg, "off")) { //plot_mode = NONE; print_plot = 0; } else { fprintf (stderr, "Error: Unknown plot type: %s\n", optarg); retval = -1; } break; case 'r': rank_limit = atoi (optarg); if (rank_limit < 1) { fprintf (stderr, "Error: Incorrect rank_limit value: %s\n", optarg); retval = -1; break; } break; case 't': if (!strcmp (optarg, "all")) trm = ALL; else if (!strcmp (optarg, "interleaved")) trm = ALL_INTERLEAVED; else if (!strcmp (optarg, "class_percentage")) trm = CLASS_PERCENTAGE; else if (!strcmp (optarg, "total_percentage")) trm = TOTAL_PERCENTAGE; else { fprintf (stderr, "Error: Unknown training mode: %s\n", optarg); retval = -1; } break; case 'C': use_combiner = 1; break; case 'P': plot_c = atoi (optarg); break; case 'Q': // Quickoption for selectd break; case 'T': // Just for my convenience, should be removed if (!strcmp (optarg, "cross")) csm = CROSS; else if (!strcmp (optarg, "combi")) { use_combiner = 1; plot_c = 3; } else { fprintf (stderr, "Error: Unknown test option: %s\n", optarg); retval = -1; } break; default: printf ("Usage: ...\n"); return -1; } } return retval;}/** * Read configuration file. * * @return Zero if ok, or nonzero if there was an error. */static intread_config (const char *file) { FILE *fd; int retval; conf_pair *cp; fd = fopen (file, "r"); if (!fd) return -1; retval = 0; cp = my_malloc (sizeof(conf_pair)); while (get_next_configuration (fd, cp)) { if (!strcmp (cp->key, "address")) { if (cp->value) adr_str = my_strdup (cp->value); } else if (!strcmp (cp->key, "mailbox_file")) mailbox_file = my_strdup (cp->value); else if (!strcmp (cp->key, "combiner")) use_combiner = !strcmp (cp->value, "on"); else if (!strcmp (cp->key, "rank_limit")) { rank_limit = atoi (cp->value); if (rank_limit < 1) { fprintf (stderr, "Error: Incorrect rank_limit value: %s\n", cp->key); retval = -1; break; } } else if (!strcmp (cp->key, "plot")) if (!strcmp (cp->value, "last_n")) { plot_mode = N_AVERAGE; print_plot = 1; // FIXME should set rank_limit } else if (!strcmp (cp->value, "total")) { plot_mode = TOTAL; print_plot = 1; } else if (!strcmp (cp->value, "off")) { //plot_mode = NONE; print_plot = 0; } else { fprintf (stderr, "Error: Unknown plot type: %s\n", cp->value); return -1; } else if (!strcmp (cp->key, "plot_classifier")) plot_c = atoi (cp->value); else if (!strcmp (cp->key, "class_seq")) if (!strcmp (cp->value, "cross")) csm = CROSS; else if (!strcmp (cp->value, "linear")) csm = LINEAR; else if (!strcmp (cp->value, "random")) csm = RANDOM; else if (!strcmp (cp->value, "time")) csm = TIME; else { fprintf (stderr, "Error: Unknown class sequence mode: %s\n", cp->value); retval = -1; break; } else if (!strcmp (cp->key, "msg_seq")) if (!strcmp (cp->value, "linear")) msm = LINEAR; else if (!strcmp (cp->value, "random")) msm = RANDOM; else { fprintf (stderr, "Error: Unknown message sequence mode: %s\n", cp->value); retval = -1; break; } else if (!strcmp (cp->key, "training_mode")) if (!strcmp (cp->value, "all")) trm = ALL; else if (!strcmp (cp->value, "interleaved")) trm = ALL_INTERLEAVED; else if (!strcmp (cp->value, "class_percentage")) trm = CLASS_PERCENTAGE; else if (!strcmp (cp->value, "total_percentage")) trm = TOTAL_PERCENTAGE; else { fprintf (stderr, "Error: Unknown training mode: %s\n", cp->value); retval = -1; break; } else { fprintf (stderr, "Error: Unknown configuration key: %s\n", cp->key); retval = -1; break; } } free (cp); fclose (fd); return retval;}/** * Read mailbox names. * * @return Zero if ok, or nonzero if there was an error. */static intread_mailconfig (void) { FILE *fp; char buf[128]; int i; box *b; fp = fopen (mailbox_file, "r"); if (!fp) return -1; collection_init (csm, msm, trm, RFC822); for (i = 0; get_line_nows (fp, buf, 128) && i < 128; i++) { b = box_new (buf); if (!b) { fprintf (stderr, "Error: Cannot read mailbox '%s'!\n", buf); return -1; } collection_add_box (b); } nob = i; fclose (fp); return 0;}/** * Main program. */intmain (int argc, char *argv[]) { char *conf_file; int i; /* Default configuration */ conf_file = NULL; adr_str = NULL; mailbox_file = NULL; csm = TIME; msm = LINEAR_SEQ; trm = ALL_INTERLEAVED; plot_mode = N_AVERAGE; print_plot = 1; test_and_stat = 1; mean_n = 50; rank_limit = 3; plot_c = 0; use_subject = 1; use_combiner = 0; nob = 0; /* Initialization */ srand (time (NULL)); /* Find configuration file */ for (i = 1; i < argc; i++) if (!strcmp (argv[i], "-f")) { if (argc > i + 1 && *argv[i + 1] != '-') conf_file = argv[i + 1]; else printf ("error in option -f\n"); } else if (!strncmp (argv[i], "--conf-file=", 12)) { if (argv[i][12] != '\0') conf_file = &argv[i][12]; else printf ("error in option --conf-file\n"); } /* Read configuration file */ if (conf_file) if (read_config (conf_file)) { fprintf (stderr, "Error: Cannot read configuration!\n"); return 1; } /* Read command line options */ if (read_opts (argc, argv)) { fprintf (stderr, "Error: Cannot read options!\n"); return 1; } /* Read mailbox configuration */ if (!mailbox_file) { fprintf (stderr, "Error: No mailbox configuration file specified!\n"); return 1; } if (read_mailconfig ()) { fprintf (stderr, "Error: Cannot read mailbox configuration!\n"); return 1; } pdata = protocol_c_new (15000, adr_str); i = protocol_c_open (pdata); if (i) { fprintf (stderr, "Error: Cannot connect to daemon!\n"); return 1; } noc = protocol_c_get_integer (pdata, "noc"); protocol_c_close (pdata); if (noc < 1) { fprintf (stderr, "Error: No classifiers\n"); return 1; } if (use_combiner) noc++; // Decremented later if (plot_c > noc) { fprintf (stderr, "Error: Incorrect classifier to plot.\n"); return 1; } printf ("# noc=%d ", noc); if (csm == TIME) printf ("time "); else if (csm == CROSS) printf ("cross "); if (use_combiner) printf ("combi "); printf ("\n"); if (test_and_stat) { td = my_malloc (sizeof(test_data)); td->res = my_malloc (sizeof(cls_res) * noc); for (i = 0; i < noc; i++) { td->res[i].len = 0; td->res[i].u.slist = NULL; td->res[i].u.rlist = NULL; } td->bno = -1; td->mno = -1; td->nor = noc; statlib_initialize (plot_mode, rank_limit, mean_n, plot_c, noc, nob); if (print_plot) statlib_print_plot_header (); } else { /* Print header */ printf ("! noc=%d nob=%d nom=%d\n", noc, nob, collection_get_nod ()); } if (use_combiner) noc--; if (trm == ALL_INTERLEAVED) while (collection_next_document ()) { open_doc (); if (test_msg ()) return 1; if (train_msg ()) return 1; close_doc (); } else { while (collection_next_document ()) { open_doc (); if (train_msg ()) return 1; close_doc (); } while (collection_next_test_document ()) open_doc (); if (test_msg ()) return 1; close_doc (); } if (test_and_stat) statlib_print_results (); else printf ("# End of testing\n"); /*i = dict_get_size (vectorizer_get_dictionary (vec)); printf ("## Total number of words: %d\n", i);*/ //from_print (fdb); return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -