📄 ngram.c
字号:
/* Copyright (C) 2001-2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * N-gram learning learning algorithm. * * Should be used with ngram tokenizer and tf vectorizer without a normalizer. * * @author Mikael Ylikoski * @date 2001-2002 */#include <float.h>#include <math.h>#include <stdio.h>#include <stdlib.h>#include "multi.h"#include "ngram.h"#include "utility.h"#include "vector.h"/** * N-gram classifier global data. */struct ngram_db_ { int now; /**< Number Of Words in dictionary */};/** * N-gram classifier class data. */typedef struct { vector *class; /**< Vector with word frequencies for class */} ngram_class;/** * Create a new global state. * * @return The new global state. */void *ngram_new_db (const char *opts) { ngram_db *db; db = my_malloc (sizeof(ngram_db)); db->now = 0; return db;}/** * Create a new classifier. * * @return The classifier. */void *ngram_new (void) { ngram_class *nc; nc = my_malloc (sizeof(ngram_class)); nc->class = NULL; return nc;}void *ngram_copy (void *data) { ngram_class *nc; ngram_class *nnc; nc = (ngram_class *)data; nnc = my_malloc (sizeof(ngram_class)); if (nc->class) nnc->class = vector_copy (nc->class); else nnc->class = NULL; return nnc;}voidngram_free (void *data) { ngram_class *nc; nc = (ngram_class *)data; if (nc->class) vector_free (nc->class); free (nc);}/** * Train classifier with a document vector. * * @param db classifier database * @param data class data * @param v term frequency vector for document to learn * @param class document class: 1 or -1 * @return 0 if ok; -1 otherwise. */intngram_learn (void *db, void *data, vector *v, int class) { int i; ngram_class *nbc; ngram_db *ndb; if (class != 1) return -1; nbc = (ngram_class *)data; ndb = (ngram_db *)db; if (nbc->class == NULL) nbc->class = vector_copy (v); else vector_add (nbc->class, v); i = vector_dim (v); if (i > ndb->now) ndb->now = i; return 0;}/** * Unlearn classifier with a document vector. * * @param db classifier database * @param data class data * @param v term frequency vector for document to unlearn * @param class document class; 1 or -1 * @return 0 if ok; -1 otherwise. */intngram_unlearn (void *db, void *data, vector *v, int class) { ngram_class *nbc; ngram_db *ndb; if (class != 1) return -1; nbc = (ngram_class *)data; ndb = (ngram_db *)db; if (nbc->class == NULL) return -1; vector_sub (nbc->class, v); if (vector_dim (v) == ndb->now) ndb->now = vector_dim (nbc->class); return 0;}/** * Classify a document. * * @param db classifier database * @param data class data * @param v term frequency vector for the document to classify * @return The relative probability of the class. */doublengram_classify (void *db, void *data, vector *v) { ngram_class *nbc; ngram_db *ndb; nbc = (ngram_class *)data; ndb = (ngram_db *)db; if (!nbc->class) return 0; return -vector_relative_entropy (v, nbc->class, ndb->now);}void *ngram_load_db (FILE *file) { int i; ngram_db *ndb; ndb = my_malloc (sizeof(ngram_db)); i = fread (&ndb->now, sizeof(int), 1, file); if (i != 1) { free (ndb); return NULL; } return ndb;}void *ngram_load_class (FILE *file) { ngram_class *ncl; ncl = my_malloc (sizeof(ngram_class)); ncl->class = vector_load (file); if (!ncl->class) { free (ncl); return NULL; } return ncl;}intngram_save_db (FILE *file, void *db) { int i; ngram_db *ndb; ndb = (ngram_db *)db; i = fwrite (&ndb->now, sizeof(int), 1, file); if (i != 1) return -1; return 0;}intngram_save_class (FILE *file, void *data) { ngram_class *ncl; ncl = (ngram_class *)data; vector_save (ncl->class, file); return 0;}/** * Keep cygwin happy. */intmain (void) { return 0;}/** * N-gram classifier name. */const char *my_classifier_name = "N-gram";/** * N-gram classifier functions. */const multi_functions my_functions = { .new_db = ngram_new_db, .new = ngram_new, .copy = ngram_copy, .free = ngram_free, .learn = ngram_learn, .classify = ngram_classify, .load_db = ngram_load_db, .load_class = ngram_load_class, .save_db = ngram_save_db, .save_class = ngram_save_class, .option = 0};/** * Load a db from a text_cat language model file. *multi_db *ngram_load_file_db (char *file) { int i, j; FILE *f; f = fopen(file, "r"); for (i = 0; i < 400; i++) { for (j = 0; j < 9; j++) if (0) ; scanf(); } return NULL;}*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -