📄 naivebayes.c
字号:
/* Copyright (C) 2001-2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Naive Bayes learning algorithm. * * Should be used with tf vectorizer, without any normalizer. * * Implemented after description in Joachims, T., <em>A Probabilistic Analysis * of the Rocchio Algorithm with TFIDF for Text Categorization</em>, 1997. * * @author Mikael Ylikoski * @date 2001-2002 */#include <float.h>#include <math.h>#include <stdio.h>#include <stdlib.h>#include "multi.h"#include "naivebayes.h"#include "utility.h"#include "vector.h"/** * Naive Bayes classifier global data. */typedef struct { int nod; /**< Number Of Documents in total */ int now; /**< Number Of Words in dictionary */} naivebayes_db;/** * Naive Bayes classifier class data. */typedef struct { vector *tf; /**< Vector with term frequencies for class */ int nod; /**< Number Of Documents for class */ float now; /**< Number Of Words for class = vector_sum (tf) */} naivebayes_class;/** * Create a new classifier database. * * @param opts classifier options, not used * @return The new classifier database. */void *naivebayes_new_db (const char *opts) { naivebayes_db *db; db = my_malloc (sizeof(naivebayes_db)); db->nod = 0; db->now = 0; return db;}/** * Create a new classifier. * * @return The classifier. */void *naivebayes_new (void) { naivebayes_class *nbc; nbc = my_malloc (sizeof(naivebayes_class)); nbc->tf = NULL; nbc->nod = 0; nbc->now = 0; return nbc;}void *naivebayes_copy (void *data) { naivebayes_class *nbc; naivebayes_class *onbc; onbc = (naivebayes_class *)data; nbc = my_malloc (sizeof(naivebayes_class)); if (onbc->tf) { nbc->tf = vector_copy (onbc->tf); if (!nbc->tf) { free (nbc); return NULL; } } else nbc->tf = NULL; nbc->nod = onbc->nod; nbc->now = onbc->now; return nbc;}voidnaivebayes_free (void *data) { naivebayes_class *nbc; nbc = (naivebayes_class *)data; if (nbc->tf) vector_free (nbc->tf); free (nbc);}/** * Train classifier with a document vector. * * @param db classifier database * @param data class data * @param v term frequency vector for document to learn * @param class document class: 1 or -1 * @return 0 if ok; -1 otherwise. */intnaivebayes_learn (void *db, void *data, vector *v, int class) { int i; naivebayes_class *nbc; naivebayes_db *ndb; if (class != 1) return -1; nbc = (naivebayes_class *)data; ndb = (naivebayes_db *)db; if (nbc->tf == NULL) nbc->tf = vector_copy (v); else vector_add (nbc->tf, v); nbc->nod++; nbc->now += vector_sum (v); ndb->nod++; i = vector_dim (v); if (i > ndb->now) ndb->now = i; return 0;}/** * Unlearn classifier with a document vector. * * @param db classifier database * @param data class data * @param v term frequency vector for document to unlearn * @param class document class; 1 or -1 * @return 0 if ok; -1 otherwise. */intnaivebayes_unlearn (void *db, void *data, vector *v, int class) { naivebayes_class *nbc; naivebayes_db *ndb; if (class != 1) return -1; nbc = (naivebayes_class *)data; ndb = (naivebayes_db *)db; if (nbc->tf == NULL) return -1; vector_sub (nbc->tf, v); nbc->nod--; nbc->now -= vector_sum (v); ndb->nod--; if (vector_dim (v) == ndb->now) ndb->now = vector_dim (nbc->tf); return 0;}intnaivebayes_remove_global (void *db, vector *v) { naivebayes_db *ndb; ndb = (naivebayes_db *)db; ndb->now -= v->nel; /* i = vector_dim (v); if (i > ndb->now) ndb->now = i; */ return 0;}intnaivebayes_remove (void *db, void *data, vector *v) { naivebayes_class *nbc; naivebayes_db *ndb; nbc = (naivebayes_class *)data; ndb = (naivebayes_db *)db; if (nbc->tf == NULL) return 0; nbc->now -= vector_remove_v (nbc->tf, v); //nbc->now -= ; return 0;}/** * Classify a document. * * @param db classifier database * @param data class data * @param v term frequency vector for the document to classify * @return The number of the most probable class. */doublenaivebayes_classify (void *db, void *data, vector *v) { int i, j; double d, e, p; naivebayes_class *nbc; naivebayes_db *ndb; nbc = (naivebayes_class *)data; ndb = (naivebayes_db *)db; if (!nbc->tf) return 0; if (nbc->now == 0) return 0; d = ndb->now + nbc->now; p = log (nbc->nod / (double)ndb->nod); for (i = 0; i < v->nel; i++) { e = (1 + vector_get_value (nbc->tf, v->name[i])) / d; for (j = v->value[i]; j > 0; j--) /* McCallum & Nigam */ p += log (e / (double)j); /*p += log (e);*/ /* Aas & Eikvil */ } return p;}void *naivebayes_load_db (FILE *file) { int i; naivebayes_db *ndb; ndb = my_malloc (sizeof(naivebayes_db)); i = fscanf (file, "nod %d\n", &ndb->nod); i = fscanf (file, "now %d\n", &ndb->now); return ndb;}void *naivebayes_load_class (FILE *file) { int i; naivebayes_class *ncl; ncl = my_malloc (sizeof(naivebayes_class)); i = fscanf (file, "nod %d\n", &ncl->nod); if (i != 1) { free (ncl); return NULL; } i = fscanf (file, "now %f\n", &ncl->now); if (i != 1) { free (ncl); return NULL; } fscanf (file, "vec "); ncl->tf = vector_load (file); if (!ncl->tf) { free (ncl); return NULL; } fscanf (file, "\n"); return ncl;}intnaivebayes_save_db (FILE *file, void *db) { naivebayes_db *ndb; ndb = (naivebayes_db *)db; fprintf (file, "nod %d\n", ndb->nod); fprintf (file, "now %d\n", ndb->now); return 0;}intnaivebayes_save_class (FILE *file, void *data) { naivebayes_class *ncl; ncl = (naivebayes_class *)data; fprintf (file, "nod %d\n", ncl->nod); fprintf (file, "now %f\n", ncl->now); fprintf (file, "vec "); vector_save (ncl->tf, file); fprintf (file, "\n"); return 0;}/** * Keep cygwin happy. */intmain (void) { return 0;}/** * Naive Bayes classifier name. */const char *my_classifier_name = "NaiveBayes";/** * Naive Bayes classifier functions. */const multi_functions my_functions = { .new_db = naivebayes_new_db, .new = naivebayes_new, .copy = naivebayes_copy, .free = naivebayes_free, .learn = naivebayes_learn, .unlearn = naivebayes_unlearn, .remove = naivebayes_remove, .remove_db = naivebayes_remove_global, .classify = naivebayes_classify, .load_db = naivebayes_load_db, .load_class = naivebayes_load_class, .save_db = naivebayes_save_db, .save_class = naivebayes_save_class, .option = 0};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -