📄 doc_classifier.c
字号:
/* Copyright (C) 2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Document classifier. * * @author Mikael Ylikoski * @date 2002 */#include <stdlib.h>#include <string.h>#include "doc_classifier.h"#include "document.h"#include "utility.h"#include "vector.h"/** * Document classifier database. */struct doc_classifier_ { enum cl_type type; /**< Classifier type */ const doc_classifier_functions *funcs; /**< Classifier functions */ void *data; /**< Classifier database */ vectorizer *vec; /**< Vectorizer */};/** * Create a new document classifier. */doc_classifier *doc_classifier_new (void *data, const doc_classifier_functions *funcs, vectorizer *vec, enum cl_type clt) { doc_classifier *cl; cl = my_malloc (sizeof(doc_classifier)); cl->type = clt; cl->data = data; cl->funcs = funcs; cl->vec = vec; return cl;}/** * Get vectorizer. */vectorizer *doc_classifier_get_vectorizer (doc_classifier *cl) { return cl->vec;}/** * Get number of classes. */intdoc_classifier_get_noc (doc_classifier *cl) { return cl->funcs->info (cl->data);}/** * Get a vector corresponding to a document. * * @param cl classifier * @param d document * @param new indicates whether this is for (un)learning * @return The new vector. */static vector *doc_to_vector (doc_classifier *cl, document *d, int new) { const char *lang, *text, *charset, *cs; int len; text_part *tp; vector *u, *v; lang = document_get_language (d); charset = document_get_charset (d); tp = document_get_parts (d); v = NULL; for (; tp; tp = tp->next) { len = tp->len; if (len > 10000) len = 10000; if (tp->charset) cs = tp->charset; else cs = charset; v = vectorizer_vectorize_text (cl->vec, v, tp->text, len, lang, cs); } text = document_get_subject (d); if (text) v = vectorizer_vectorize_text (cl->vec, v, text, strlen (text), lang, charset); if (v) { u = vectorizer_normalize_vector (cl->vec, v, new); if (u) { if (cl->funcs->remove) cl->funcs->remove (cl->data, u); vector_free (u); } } return v;}/** * Learn a document. */intdoc_classifier_learn (doc_classifier *cl, document *d, int class) { int i; vector *v; switch (cl->type) { case DOCUMENT: return cl->funcs->learn (cl->data, d, class); case VECTOR: v = doc_to_vector (cl, d, 1); i = cl->funcs->learn (cl->data, v, class); vector_free (v); return i; } return -1;}/** * Unlearn a vector. */intdoc_classifier_unlearn (doc_classifier *cl, document *d, int class) { int i; vector *v; switch (cl->type) { case DOCUMENT: if (cl->funcs->unlearn) return cl->funcs->unlearn (cl->data, d, class); break; case VECTOR: v = doc_to_vector (cl, d, -1); i = cl->funcs->unlearn (cl->data, v, class); vector_free (v); return i; } return -1;}/** * Classify a document. */intdoc_classifier_classify_top (doc_classifier *cl, document *d) { int i; vector *v; switch (cl->type) { case DOCUMENT: if (cl->funcs->classify_top) return cl->funcs->classify_top (cl->data, d); break; case VECTOR: v = doc_to_vector (cl, d, 0); i = cl->funcs->classify_top (cl->data, v); vector_free (v); return i; } return -1;}/** * Classify a document. */int *doc_classifier_classify_rank (doc_classifier *cl, document *d) { int *il; vector *v; switch (cl->type) { case DOCUMENT: if (cl->funcs->classify_rank) return cl->funcs->classify_rank (cl->data, d); break; case VECTOR: v = doc_to_vector (cl, d, 0); il = cl->funcs->classify_rank (cl->data, v); vector_free (v); return il; } return NULL;}/** * Classify a document. */double *doc_classifier_classify_score (doc_classifier *cl, document *d) { double *dl; vector *v; switch (cl->type) { case DOCUMENT: if (cl->funcs->classify_score) return cl->funcs->classify_score (cl->data, d); break; case VECTOR: v = doc_to_vector (cl, d, 0); dl = cl->funcs->classify_score (cl->data, v); vector_free (v); return dl; } return NULL;}/** * Save a document classifier. * * @param fp file to save to * @param cl classifier to save * @return Zero if ok, or nonzero otherwise. */intdoc_classifier_save (FILE *fp, doc_classifier *cl) { if (cl->funcs->save) return cl->funcs->save (fp, cl->data); return -1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -