⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 selectd.c

📁 使用具有增量学习的监控式学习方法。包括几个不同的分类算法。
💻 C
📖 第 1 页 / 共 2 页
字号:
/* Copyright (C) 2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * The Select daemon. * * @author  Mikael Ylikoski * @date    2002 */#include <ctype.h>#include <getopt.h>#include <signal.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <unistd.h>#include "classifier.h"#include "dictionary.h"#include "doc_classifier.h"#include "holders.h"#include "identifier.h"#include "languages.h"#include "multi.h"#include "protocol_s.h"#include "utility.h"#include "vector.h"#include "vectorizer.h"typedef struct langentry_ langentry;/** * Language entry. */struct langentry_ {    char *name;			/**< Language name */    char *locale;		/**< Language locale */    langentry *next;		/**< Next language */};/** * Classifier entry. */typedef struct {    char *name;			/**< Classifier name */    doc_classifier *cls;	/**< Classifier */} cls_entry;/** * Classifier options. */typedef struct {    char *name;			/**< Name, or NULL if none */    char *cls;			/**< Classifier */    char *type;			/**< Classifier type */    char *options;		/**< Classifier options, or NULL if none */    char *vec;			/**< Vectorizer */    char *sel;			/**< Feature selector, or NULL if none */    char *tok;			/**< Tokenizer */    char *nor;			/**< Normalizer, or NULL if none */} cls_opt;static char *adr_str;		/**< Socket address string */static char *data_dir;		/**< Database directory */static char *share_dir;		/**< Share directory */static char *plugin_dir;	/**< Plugin directory */static int use_subject;		/**< Add subject to body text */static int use_id;		/**< Use language identifier */static int use_stemmer;		/**< Use stemmers */static int use_stopwords;	/**< Use stopwords */static identifier *id;		/**< Language identifier */static langentry *langlist;	/**< List of languages to handle */static protocol_s_data *pdata;	/**< Protocol data */static dict *folders;		/**< Folder -> class table */static cls_entry *classifiers;	/**< Classifier table */static cls_opt *cls_opts;	/**< Classifier options */static int noc;			/**< Number of classifiers *//** * Command line options. */static struct option longopts[] = {    { "address", required_argument, NULL, 'a' },    { "daemon", required_argument, NULL, 'd' },    { "conf-file", required_argument, NULL, 'f' },    { "quickoption", required_argument, NULL, 'Q' },    { 0, 0, 0, 0 }};/** * Make a document from some parts. * * @param doc    document to add to * @param parts  document parts */static intsetup_document (document *doc, part *parts) {    int len;    if (!strcmp (parts->type, PART_TEXT)) {	len = strlen (parts->string);	// FIXME allow NULL chars in string?	document_add_text (doc, parts->charset, parts->string, len);    } else if (!strcmp (parts->type, PART_RFC822)) {	document_set_rfc822 (doc, parts->string);    } else if (!strcmp (parts->type, PART_FROM)) {	document_set_from (doc, parts->string);    } else if (!strcmp (parts->type, PART_SUBJECT)) {	document_set_subject (doc, parts->string);    } else if (!strcmp (parts->type, PART_HEADER)) {	fprintf (stderr, "Warning: Header parts not implemented!\n");    } else if (!strcmp (parts->type, PART_LANGUAGE)) {	document_set_language (doc, parts->string);    } else {	fprintf (stderr, "Warning: Unknown part type: %s\n", parts->type);    }    free (parts->type);    free (parts);    return 0;}/** * Determine language of document. * * @param doc  document */static voidsetup_language (document *doc) {    const char *lang, *charset;    int len;    text_part *tp;    /* Identify language (only on first text part) */    lang = document_get_language (doc);    if (!lang) {	if (id) {	    tp = document_get_parts (doc);	    if (tp) {		if (tp->charset)		    charset = tp->charset;		else		    charset = document_get_charset (doc);		len = tp->len;		if (len > 500)		    len = 500;		lang = identifier_guess_language (id, tp->text, len, charset);	    }	    if (!lang)		lang = LANGUAGE_UNKNOWN;	} else	    lang = LANGUAGE_UNKNOWN;	// FIXME	document_set_language (doc, (char *)lang);    }}/** * Perform classification. * * @param doc  document to classify * @param cl   classifier string * @param typ  classification type: "s", "r" or "t" * @return Zero if ok, or nonzero otherwise. */static intdo_classify_doc (document *doc, const char *cl, const char *typ) {    int i, j, *il;    double *dl;    doc_classifier *cls;    setup_language (doc);    /* Find classifier */    if (cl) {	if (cl[0] == '"') {	    j = strlen (cl) - 2;	    for (i = 0; i < noc; i++)		if (!strncmp (&cl[1], classifiers[i].name, j))		    break;	} else {	    i = atoi (cl);	}    } else	i = 0;    if (i < 0 || i >= noc) {	//protocol_s_error (pdata);	return -1;    }    cls = classifiers[i].cls;    if (!typ || !strcmp (typ, "s")) {	dl = doc_classifier_classify_score (cls, doc);	if (dl) {	    i = doc_classifier_get_noc (cls);	    protocol_s_classify_score (pdata, dl, i);	    free (dl);	    return 0;	}	protocol_s_classify_score (pdata, NULL, 0);    } else if (!strcmp (typ, "r")) {	il = doc_classifier_classify_rank (cls, doc);	if (il) {	    for (i = 0; il[i] != -1; i++)		;	    protocol_s_classify_rank (pdata, il, i);	    free (il);	    return 0;	}	protocol_s_classify_rank (pdata, NULL, 0);    } else if (!strcmp (typ, "t")) {	i = doc_classifier_classify_top (cls, doc);	protocol_s_classify_top (pdata, i);    } else {	fprintf (stderr, "Error: Unknown classification type: %s\n", typ);	return -1;    }    return 0;}/** * Learn document. * * @param doc    document to learn * @param class  class to learn * @return Zero if ok, or nonzero otherwise. */static intdo_learn (document *doc, char *class) {    int i, j;    setup_language (doc);    if (class[0] == '"') {	i = strlen (class);	class[i - 1] = '\0';	//j = doc_classifier_get_noc (cls);	//j = dict_insert_word_x (folders, &class[1], j);	j = dict_insert_word (folders, &class[1]);    } else {	j = atoi (class);    }    if (j < 0)	return -1;    if (doc)	for (i = 0; i < noc; i++)	    doc_classifier_learn (classifiers[i].cls, doc, j);    return 0;}/** * Unlearn document. * * @param doc    document to learn * @param class  class to learn * @return Zero if ok, or nonzero otherwise. */static intdo_unlearn (document *doc, char *class) {    int i, j;    setup_language (doc);    if (class[0] == '"') {	i = strlen (class);	class[i - 1] = '\0';	//j = doc_classifier_get_noc (cls);	//j = dict_insert_word_x (folders, &class[1], j);	j = dict_find_word (folders, &class[1]);    } else {	j = atoi (class);    }    if (j < 0)	return -1;    if (doc)	for (i = 0; i < noc; i++)	    doc_classifier_unlearn (classifiers[i].cls, doc, j);    return 0;}/** * Write a folder number and name for the client. */static voidwrite_folder (gpointer key, gpointer value, gpointer null) {    protocol_s_write_integer (pdata, *(int *)value);    protocol_s_write_string (pdata, "=");    protocol_s_write_string (pdata, (char *)key);    protocol_s_write_string (pdata, ";");}/** * Answer a get request. * * @param key  key string to get */static intdo_get (const char *key) {    int i;    if (!strcmp (key, "folders")) {	protocol_s_write_string (pdata, "l:");	protocol_s_write_integer (pdata, dict_get_size (folders));	protocol_s_write_string (pdata, ":");	dict_for_each (folders, write_folder);    } else if (!strcmp (key, "classifiers")) {	protocol_s_write_string (pdata, "l:");	protocol_s_write_integer (pdata, noc);	protocol_s_write_string (pdata, ":");	for (i = 0; i < noc; i++) {	    protocol_s_write_integer (pdata, i);	    protocol_s_write_string (pdata, "=");	    protocol_s_write_string (pdata, classifiers[i].name);	    protocol_s_write_string (pdata, ";");	}    } else if (!strcmp (key, "noc")) {	protocol_s_write_integer (pdata, noc);    } else if (!strcmp (key, "nof")) {	protocol_s_write_integer (pdata, dict_get_size (folders));    } else {	protocol_s_write_string (pdata, "ERROR");    }    protocol_s_send (pdata);    return -1;}/** * Answer a set request. */static intdo_set (const char *key, const char *value) {    protocol_s_write_string (pdata, "Not implemented yet");    protocol_s_send (pdata);    return -1;}/** * Answer a client request. * * @param doc  current document * @return Zero if ok, or nonzero otherwise. */static intread_request (document *doc) {    int retval;    protocol_s_request *pr;    retval = 0;    pr = protocol_s_read_request (pdata);    if (!pr) {	fprintf (stderr, "Error: No request\n");	return -1;    }    switch (pr->type) {    case REQUEST_OPEN:	break;    case REQUEST_CLOSE:	retval = -1;	break;    case REQUEST_PARTS:	if (pr->parts)	    setup_document (doc, pr->parts);	else	    fprintf (stderr, "Error: Cannot read part\n");	break;    case REQUEST_CLASSIFY:	do_classify_doc (doc, pr->str, pr->str2);	break;    case REQUEST_LEARN:	do_learn (doc, pr->str);	break;    case REQUEST_UNLEARN:	do_unlearn (doc, pr->str);	break;    case REQUEST_GET:	do_get (pr->str);	break;    case REQUEST_SET:	do_set (pr->str, pr->str2);	break;    case REQUEST_X:	break;    }    free (pr);    return retval;}/** * Handle a session with a client. * * @param fd  file descriptor of client */static voidhandle_session (int fd) {    int i;    document *doc;    /* Create empty document */    doc = document_new (NULL, PLAIN);    protocol_s_reinit (pdata, fd);    /* Receive and process requests */    while (1) {	i = protocol_s_receive (pdata);	if (i)	    break;	i = read_request (doc);	if (i)	    break;    }    /* Free document */    document_free (doc);}/** * Save a classifier database. * * @param cl  classifier * @param co  classifier options * @return Zero if ok, or nonzero otherwise. */static intsave_doc_classifier (doc_classifier *cl, cls_opt *co) {    FILE *fp;    char buf[200];    int i;    vectorizer *vec;    vec = doc_classifier_get_vectorizer (cl);    i = strlen (data_dir);    if (i > 150)	// FIXME inte 150	return -1;    memcpy (buf, data_dir, i);    memcpy (&buf[i], "/saved.", 7);    strcpy (&buf[i + 7], co->name);    i = strlen (buf);    if (vec) {	strcpy (&buf[i], ".vect");	fp = fopen (buf, "w");	vectorizer_save (vec, fp);	fclose (fp);    }    strcpy (&buf[i], ".db");    fp = fopen (buf, "w");    fprintf (fp, "classifier %s\n", co->cls);    fprintf (fp, "type %s\n", co->type);    doc_classifier_save (fp, cl);    fclose (fp);    return 0;}#include "multi_one.h"/** * Load a classifier database. * * maybe move this to doc_classifier_load */static doc_classifier *load_classifier (FILE *f, cls_opt *co, languages *langs, vectorizer *vec) {    char *cls;    classifier *cl;    conf_pair cp;    const doc_classifier_functions *dof;    multi_functions *mf;    get_next_configuration (f, &cp);    if (strcmp (cp.key, "classifier")) {	fprintf (stderr, "Error 1\n");	return NULL;    }    cls = my_strdup (cp.value);    get_next_configuration (f, &cp);    if (strcmp (cp.key, "type")) {	fprintf (stderr, "Error 2\n");	return NULL;    }    if (!strcmp (cp.value, "document")) {	dof = holders_find_doc_classifier (cls);	if (!dof) {	    fprintf (stderr, "Error: Cannot find classifier: %s\n", cls);	    return NULL;	}	return doc_classifier_new (dof->load (f), dof, NULL, DOCUMENT);    } else if (!strncmp (cp.value, "multi_", 6)) {	mf = holders_find_classifier (cls);	if (!mf) {	    fprintf (stderr, "Error: Cannot find classifier: %s\n", cls);	    return NULL;	}	if (!strcmp (cp.value, "multi_one"))	    cl = multi_one_max_load (f, mf);	else if (!strcmp (cp.value, "multi_rest"))	    cl = multi_one_rest_load (f, mf);	else if (!strcmp (cp.value, "multi_linmax"))	    cl = multi_one_lin_load (f, mf);	else if (!strcmp (cp.value, "multi_uc"))	    cl = multi_one_uc_load (f, mf);	else {	    fprintf (stderr, "Error: Loading of classifier type not "		     "implemented yet: %s\n", cp.value);	    return NULL;	}	dof = classifier_get_doc_classifier_functions ();	return doc_classifier_new (cl, dof, vec, VECTOR);    } else {	fprintf (stderr, "Error: Unknown classifier type: %s\n", cp.value);	return NULL;    }    return NULL;}/** * Create a classifier. * * @param co     classifier options * @param langs  languages */static doc_classifier *create_classifier (cls_opt *co, languages *langs) {    classifier *cl;				/* Classifier */    const doc_classifier_functions *dof;	/* Classifier functions  */    multi_functions *mf;			/* Multi functions */    tokenizer *tok;				/* Tokenizer */    vectorizer *vec;				/* Vectorizer */    enum multi_method mm;			/* Classifier multi method */    if (!co->cls) {	fprintf (stderr, "Error: Unspecified classifier\n");	return NULL;    }    if (!strcmp (co->type, "document")) {	dof = holders_find_doc_classifier (co->cls);	if (!dof) {	    fprintf (stderr, "Error: Cannot find classifier: %s\n", co->cls);	    return NULL;	}	return doc_classifier_new (dof->new (co->options), dof, NULL,				   DOCUMENT);    } else {	if (!strcmp (co->type, "multi_one"))	    mm = ONE_MAX;	else if (!strcmp (co->type, "multi_rest"))	    mm = REST_MAX;	else if (!strcmp (co->type, "multi_linmax"))	    mm = LIN_MAX;	else if (!strcmp (co->type, "multi_uc"))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -