📄 selectd.c
字号:
/* Copyright (C) 2002  Mikael Ylikoski
 * See the accompanying file "README" for the full copyright notice
 */

/**
 * @file
 * The Select daemon.
 *
 * @author  Mikael Ylikoski
 * @date    2002
 */

#include <ctype.h>
#include <getopt.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "classifier.h"
#include "dictionary.h"
#include "doc_classifier.h"
#include "holders.h"
#include "identifier.h"
#include "languages.h"
#include "multi.h"
#include "protocol_s.h"
#include "utility.h"
#include "vector.h"
#include "vectorizer.h"

typedef struct langentry_ langentry;

/**
 * Language entry.
 * Entries are chained into a list through the @c next member.
 */
struct langentry_ {
    char *name;			/**< Language name */
    char *locale;		/**< Language locale */
    langentry *next;		/**< Next language */
};

/**
 * Classifier entry.
 * Pairs a classifier with the name it is listed under.
 */
typedef struct {
    char *name;			/**< Classifier name */
    doc_classifier *cls;	/**< Classifier */
} cls_entry;

/**
 * Classifier options.
 * All members are strings; the ones marked "or NULL" are optional.
 */
typedef struct {
    char *name;			/**< Name, or NULL if none */
    char *cls;			/**< Classifier */
    char *type;			/**< Classifier type */
    char *options;		/**< Classifier options, or NULL if none */
    char *vec;			/**< Vectorizer */
    char *sel;			/**< Feature selector, or NULL if none */
    char *tok;			/**< Tokenizer */
    char *nor;			/**< Normalizer, or NULL if none */
} cls_opt;

/* File-scope daemon state, shared by the functions in this file. */
static char *adr_str;		/**< Socket address string */
static char *data_dir;		/**< Database directory */
static char *share_dir;		/**< Share directory */
static char *plugin_dir;	/**< Plugin directory */
static int use_subject;		/**< Add subject to body text */
static int use_id;		/**< Use language identifier */
static int use_stemmer;		/**< Use stemmers */
static int use_stopwords;	/**< Use stopwords */
static identifier *id;		/**< Language identifier */
static langentry *langlist;	/**< List of languages to handle */
static protocol_s_data *pdata;	/**< Protocol data */
static dict *folders;		/**< Folder -> class table */
static cls_entry *classifiers;	/**< Classifier table */
static cls_opt *cls_opts;	/**< Classifier options */
static int noc;			/**<
Number of classifiers *//** * Command line options. */static struct option longopts[] = { { "address", required_argument, NULL, 'a' }, { "daemon", required_argument, NULL, 'd' }, { "conf-file", required_argument, NULL, 'f' }, { "quickoption", required_argument, NULL, 'Q' }, { 0, 0, 0, 0 }};/** * Make a document from some parts. * * @param doc document to add to * @param parts document parts */static intsetup_document (document *doc, part *parts) { int len; if (!strcmp (parts->type, PART_TEXT)) { len = strlen (parts->string); // FIXME allow NULL chars in string? document_add_text (doc, parts->charset, parts->string, len); } else if (!strcmp (parts->type, PART_RFC822)) { document_set_rfc822 (doc, parts->string); } else if (!strcmp (parts->type, PART_FROM)) { document_set_from (doc, parts->string); } else if (!strcmp (parts->type, PART_SUBJECT)) { document_set_subject (doc, parts->string); } else if (!strcmp (parts->type, PART_HEADER)) { fprintf (stderr, "Warning: Header parts not implemented!\n"); } else if (!strcmp (parts->type, PART_LANGUAGE)) { document_set_language (doc, parts->string); } else { fprintf (stderr, "Warning: Unknown part type: %s\n", parts->type); } free (parts->type); free (parts); return 0;}/** * Determine language of document. * * @param doc document */static voidsetup_language (document *doc) { const char *lang, *charset; int len; text_part *tp; /* Identify language (only on first text part) */ lang = document_get_language (doc); if (!lang) { if (id) { tp = document_get_parts (doc); if (tp) { if (tp->charset) charset = tp->charset; else charset = document_get_charset (doc); len = tp->len; if (len > 500) len = 500; lang = identifier_guess_language (id, tp->text, len, charset); } if (!lang) lang = LANGUAGE_UNKNOWN; } else lang = LANGUAGE_UNKNOWN; // FIXME document_set_language (doc, (char *)lang); }}/** * Perform classification. 
* * @param doc document to classify * @param cl classifier string * @param typ classification type: "s", "r" or "t" * @return Zero if ok, or nonzero otherwise. */static intdo_classify_doc (document *doc, const char *cl, const char *typ) { int i, j, *il; double *dl; doc_classifier *cls; setup_language (doc); /* Find classifier */ if (cl) { if (cl[0] == '"') { j = strlen (cl) - 2; for (i = 0; i < noc; i++) if (!strncmp (&cl[1], classifiers[i].name, j)) break; } else { i = atoi (cl); } } else i = 0; if (i < 0 || i >= noc) { //protocol_s_error (pdata); return -1; } cls = classifiers[i].cls; if (!typ || !strcmp (typ, "s")) { dl = doc_classifier_classify_score (cls, doc); if (dl) { i = doc_classifier_get_noc (cls); protocol_s_classify_score (pdata, dl, i); free (dl); return 0; } protocol_s_classify_score (pdata, NULL, 0); } else if (!strcmp (typ, "r")) { il = doc_classifier_classify_rank (cls, doc); if (il) { for (i = 0; il[i] != -1; i++) ; protocol_s_classify_rank (pdata, il, i); free (il); return 0; } protocol_s_classify_rank (pdata, NULL, 0); } else if (!strcmp (typ, "t")) { i = doc_classifier_classify_top (cls, doc); protocol_s_classify_top (pdata, i); } else { fprintf (stderr, "Error: Unknown classification type: %s\n", typ); return -1; } return 0;}/** * Learn document. * * @param doc document to learn * @param class class to learn * @return Zero if ok, or nonzero otherwise. */static intdo_learn (document *doc, char *class) { int i, j; setup_language (doc); if (class[0] == '"') { i = strlen (class); class[i - 1] = '\0'; //j = doc_classifier_get_noc (cls); //j = dict_insert_word_x (folders, &class[1], j); j = dict_insert_word (folders, &class[1]); } else { j = atoi (class); } if (j < 0) return -1; if (doc) for (i = 0; i < noc; i++) doc_classifier_learn (classifiers[i].cls, doc, j); return 0;}/** * Unlearn document. * * @param doc document to learn * @param class class to learn * @return Zero if ok, or nonzero otherwise. 
*/static intdo_unlearn (document *doc, char *class) { int i, j; setup_language (doc); if (class[0] == '"') { i = strlen (class); class[i - 1] = '\0'; //j = doc_classifier_get_noc (cls); //j = dict_insert_word_x (folders, &class[1], j); j = dict_find_word (folders, &class[1]); } else { j = atoi (class); } if (j < 0) return -1; if (doc) for (i = 0; i < noc; i++) doc_classifier_unlearn (classifiers[i].cls, doc, j); return 0;}/** * Write a folder number and name for the client. */static voidwrite_folder (gpointer key, gpointer value, gpointer null) { protocol_s_write_integer (pdata, *(int *)value); protocol_s_write_string (pdata, "="); protocol_s_write_string (pdata, (char *)key); protocol_s_write_string (pdata, ";");}/** * Answer a get request. * * @param key key string to get */static intdo_get (const char *key) { int i; if (!strcmp (key, "folders")) { protocol_s_write_string (pdata, "l:"); protocol_s_write_integer (pdata, dict_get_size (folders)); protocol_s_write_string (pdata, ":"); dict_for_each (folders, write_folder); } else if (!strcmp (key, "classifiers")) { protocol_s_write_string (pdata, "l:"); protocol_s_write_integer (pdata, noc); protocol_s_write_string (pdata, ":"); for (i = 0; i < noc; i++) { protocol_s_write_integer (pdata, i); protocol_s_write_string (pdata, "="); protocol_s_write_string (pdata, classifiers[i].name); protocol_s_write_string (pdata, ";"); } } else if (!strcmp (key, "noc")) { protocol_s_write_integer (pdata, noc); } else if (!strcmp (key, "nof")) { protocol_s_write_integer (pdata, dict_get_size (folders)); } else { protocol_s_write_string (pdata, "ERROR"); } protocol_s_send (pdata); return -1;}/** * Answer a set request. */static intdo_set (const char *key, const char *value) { protocol_s_write_string (pdata, "Not implemented yet"); protocol_s_send (pdata); return -1;}/** * Answer a client request. * * @param doc current document * @return Zero if ok, or nonzero otherwise. 
*/static intread_request (document *doc) { int retval; protocol_s_request *pr; retval = 0; pr = protocol_s_read_request (pdata); if (!pr) { fprintf (stderr, "Error: No request\n"); return -1; } switch (pr->type) { case REQUEST_OPEN: break; case REQUEST_CLOSE: retval = -1; break; case REQUEST_PARTS: if (pr->parts) setup_document (doc, pr->parts); else fprintf (stderr, "Error: Cannot read part\n"); break; case REQUEST_CLASSIFY: do_classify_doc (doc, pr->str, pr->str2); break; case REQUEST_LEARN: do_learn (doc, pr->str); break; case REQUEST_UNLEARN: do_unlearn (doc, pr->str); break; case REQUEST_GET: do_get (pr->str); break; case REQUEST_SET: do_set (pr->str, pr->str2); break; case REQUEST_X: break; } free (pr); return retval;}/** * Handle a session with a client. * * @param fd file descriptor of client */static voidhandle_session (int fd) { int i; document *doc; /* Create empty document */ doc = document_new (NULL, PLAIN); protocol_s_reinit (pdata, fd); /* Receive and process requests */ while (1) { i = protocol_s_receive (pdata); if (i) break; i = read_request (doc); if (i) break; } /* Free document */ document_free (doc);}/** * Save a classifier database. * * @param cl classifier * @param co classifier options * @return Zero if ok, or nonzero otherwise. */static intsave_doc_classifier (doc_classifier *cl, cls_opt *co) { FILE *fp; char buf[200]; int i; vectorizer *vec; vec = doc_classifier_get_vectorizer (cl); i = strlen (data_dir); if (i > 150) // FIXME inte 150 return -1; memcpy (buf, data_dir, i); memcpy (&buf[i], "/saved.", 7); strcpy (&buf[i + 7], co->name); i = strlen (buf); if (vec) { strcpy (&buf[i], ".vect"); fp = fopen (buf, "w"); vectorizer_save (vec, fp); fclose (fp); } strcpy (&buf[i], ".db"); fp = fopen (buf, "w"); fprintf (fp, "classifier %s\n", co->cls); fprintf (fp, "type %s\n", co->type); doc_classifier_save (fp, cl); fclose (fp); return 0;}#include "multi_one.h"/** * Load a classifier database. 
* * maybe move this to doc_classifier_load */static doc_classifier *load_classifier (FILE *f, cls_opt *co, languages *langs, vectorizer *vec) { char *cls; classifier *cl; conf_pair cp; const doc_classifier_functions *dof; multi_functions *mf; get_next_configuration (f, &cp); if (strcmp (cp.key, "classifier")) { fprintf (stderr, "Error 1\n"); return NULL; } cls = my_strdup (cp.value); get_next_configuration (f, &cp); if (strcmp (cp.key, "type")) { fprintf (stderr, "Error 2\n"); return NULL; } if (!strcmp (cp.value, "document")) { dof = holders_find_doc_classifier (cls); if (!dof) { fprintf (stderr, "Error: Cannot find classifier: %s\n", cls); return NULL; } return doc_classifier_new (dof->load (f), dof, NULL, DOCUMENT); } else if (!strncmp (cp.value, "multi_", 6)) { mf = holders_find_classifier (cls); if (!mf) { fprintf (stderr, "Error: Cannot find classifier: %s\n", cls); return NULL; } if (!strcmp (cp.value, "multi_one")) cl = multi_one_max_load (f, mf); else if (!strcmp (cp.value, "multi_rest")) cl = multi_one_rest_load (f, mf); else if (!strcmp (cp.value, "multi_linmax")) cl = multi_one_lin_load (f, mf); else if (!strcmp (cp.value, "multi_uc")) cl = multi_one_uc_load (f, mf); else { fprintf (stderr, "Error: Loading of classifier type not " "implemented yet: %s\n", cp.value); return NULL; } dof = classifier_get_doc_classifier_functions (); return doc_classifier_new (cl, dof, vec, VECTOR); } else { fprintf (stderr, "Error: Unknown classifier type: %s\n", cp.value); return NULL; } return NULL;}/** * Create a classifier. 
* * @param co classifier options * @param langs languages */static doc_classifier *create_classifier (cls_opt *co, languages *langs) { classifier *cl; /* Classifier */ const doc_classifier_functions *dof; /* Classifier functions */ multi_functions *mf; /* Multi functions */ tokenizer *tok; /* Tokenizer */ vectorizer *vec; /* Vectorizer */ enum multi_method mm; /* Classifier multi method */ if (!co->cls) { fprintf (stderr, "Error: Unspecified classifier\n"); return NULL; } if (!strcmp (co->type, "document")) { dof = holders_find_doc_classifier (co->cls); if (!dof) { fprintf (stderr, "Error: Cannot find classifier: %s\n", co->cls); return NULL; } return doc_classifier_new (dof->new (co->options), dof, NULL, DOCUMENT); } else { if (!strcmp (co->type, "multi_one")) mm = ONE_MAX; else if (!strcmp (co->type, "multi_rest")) mm = REST_MAX; else if (!strcmp (co->type, "multi_linmax")) mm = LIN_MAX; else if (!strcmp (co->type, "multi_uc"))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -