⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc_classifier.c

📁 使用具有增量学习的监督式学习方法。包括几个不同的分类算法。
💻 C
字号:
/* Copyright (C) 2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Document classifier. * * @author  Mikael Ylikoski * @date    2002 */#include <stdlib.h>#include <string.h>#include "doc_classifier.h"#include "document.h"#include "utility.h"#include "vector.h"/** * Document classifier database. */struct doc_classifier_ {    enum cl_type type;				/**< Classifier type */    const doc_classifier_functions *funcs;	/**< Classifier functions */    void *data;					/**< Classifier database */    vectorizer *vec;				/**< Vectorizer */};/** * Create a new document classifier. */doc_classifier *doc_classifier_new (void *data, const doc_classifier_functions *funcs,		    vectorizer *vec, enum cl_type clt) {    doc_classifier *cl;    cl = my_malloc (sizeof(doc_classifier));    cl->type = clt;    cl->data = data;    cl->funcs = funcs;    cl->vec = vec;    return cl;}/** * Get vectorizer. */vectorizer *doc_classifier_get_vectorizer (doc_classifier *cl) {    return cl->vec;}/** * Get number of classes. */intdoc_classifier_get_noc (doc_classifier *cl) {    return cl->funcs->info (cl->data);}/** * Get a vector corresponding to a document. * * @param cl   classifier * @param d    document * @param new  indicates whether this is for (un)learning * @return The new vector. 
*/static vector *doc_to_vector (doc_classifier *cl, document *d, int new) {    const char *lang, *text, *charset, *cs;    int len;    text_part *tp;    vector *u, *v;    lang = document_get_language (d);    charset = document_get_charset (d);    tp = document_get_parts (d);    v = NULL;    for (; tp; tp = tp->next) {	len = tp->len;	if (len > 10000)	    len = 10000;	if (tp->charset)	    cs = tp->charset;	else	    cs = charset;	v = vectorizer_vectorize_text (cl->vec, v, tp->text, len, lang, cs);    }    text = document_get_subject (d);    if (text)	v = vectorizer_vectorize_text (cl->vec, v, text, strlen (text), lang,				       charset);    if (v) {	u = vectorizer_normalize_vector (cl->vec, v, new);	if (u) {	    if (cl->funcs->remove)		cl->funcs->remove (cl->data, u);	    vector_free (u);	}    }    return v;}/** * Learn a document. */intdoc_classifier_learn (doc_classifier *cl, document *d, int class) {    int i;    vector *v;    switch (cl->type) {    case DOCUMENT:	return cl->funcs->learn (cl->data, d, class);    case VECTOR:	v = doc_to_vector (cl, d, 1);	i = cl->funcs->learn (cl->data, v, class);	vector_free (v);	return i;    }    return -1;}/** * Unlearn a vector. */intdoc_classifier_unlearn (doc_classifier *cl, document *d, int class) {    int i;    vector *v;    switch (cl->type) {    case DOCUMENT:	if (cl->funcs->unlearn)	    return cl->funcs->unlearn (cl->data, d, class);	break;    case VECTOR:	v = doc_to_vector (cl, d, -1);	i = cl->funcs->unlearn (cl->data, v, class);	vector_free (v);	return i;    }    return -1;}/** * Classify a document. */intdoc_classifier_classify_top (doc_classifier *cl, document *d) {    int i;    vector *v;    switch (cl->type) {    case DOCUMENT:	if (cl->funcs->classify_top)	    return cl->funcs->classify_top (cl->data, d);	break;    case VECTOR:	v = doc_to_vector (cl, d, 0);	i = cl->funcs->classify_top (cl->data, v);	vector_free (v);	return i;    }    return -1;}/** * Classify a document. 
*/int *doc_classifier_classify_rank (doc_classifier *cl, document *d) {    int *il;    vector *v;    switch (cl->type) {    case DOCUMENT:	if (cl->funcs->classify_rank)	    return cl->funcs->classify_rank (cl->data, d);	break;    case VECTOR:	v = doc_to_vector (cl, d, 0);	il = cl->funcs->classify_rank (cl->data, v);	vector_free (v);	return il;    }    return NULL;}/** * Classify a document. */double *doc_classifier_classify_score (doc_classifier *cl, document *d) {    double *dl;    vector *v;    switch (cl->type) {    case DOCUMENT:	if (cl->funcs->classify_score)	    return cl->funcs->classify_score (cl->data, d);	break;    case VECTOR:	v = doc_to_vector (cl, d, 0);	dl = cl->funcs->classify_score (cl->data, v);	vector_free (v);	return dl;    }    return NULL;}/** * Save a document classifier. * * @param fp  file to save to * @param cl  classifier to save * @return Zero if ok, or nonzero otherwise. */intdoc_classifier_save (FILE *fp, doc_classifier *cl) {    if (cl->funcs->save)	return cl->funcs->save (fp, cl->data);    return -1;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -