vectorizer.c
/* Copyright (C) 2001-2002 Mikael Ylikoski
 * See the accompanying file "README" for the full copyright notice
 */

/**
 * @file
 * Handle transformation of texts into vectors.
 *
 * @author Mikael Ylikoski
 * @date 2001-2002
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dictionary.h"
#include "selector.h"
#include "tokenizer.h"
#include "utility.h"
#include "vector.h"
#include "vectorizer.h"

/**
 * Vectorizer type.
 */
enum vect_type {
    VECT_BOOL,     /**< Boolean representation */
    VECT_TF,       /**< Term Frequency */
    VECT_TFIDF,    /**< Term Frequency - Inverse Document Frequency */
    VECT_HADAMARD  /**< Hadamard product representation */
};

/**
 * Text vectorizer.
 */
struct vectorizer_ {
    enum vect_type type;  /**< Vectorizer type */
    void *db;             /**< Vectorizer database */
    dict *dt;             /**< Dictionary */
    tokenizer *tok;       /**< Tokenizer */
    selector *sel;        /**< Selector */
    vector *(*vectorize_text) (vectorizer *, vector *);    /**< Text vectorization function */
    void (*vectorize_post) (vectorizer *, vector *, int);  /**< Text vectorization post process */
    void (*remove_terms) (vectorizer *, vector *);         /**< Term selection removal */
    void (*normalize) (vector *);                          /**< Normalization function */
    int autobias;         /**< Add constant bias term */
};

/**
 * TF-IDF database.
 */
typedef struct {
    vector *df;  /**< Document Frequency vector */
    int nod;     /**< Number Of Documents */
} tfidf_db;

/**
 * Hadamard database.
 */
typedef struct {
    vector *tf;  /**< Total Term Frequency */
} hadamard_db;

/**
 * Create a set-of-words (boolean) vector from the vectorizer's tokenizer.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @return The set of words vector.
 */
static vector *
vectorizer_bool_vectorize_text (vectorizer *vec, vector *v)
{
    int i, j;
    const char *s;

    if (!v)
        v = vector_new (50);
    if (vec->autobias) {
        j = dict_insert_word (vec->dt, "@AUTOBIAS@");
        vector_set_value (v, j, 1);
    }
    for (i = 0; (s = tokenizer_next_token (vec->tok)); i++) {
        j = dict_insert_word (vec->dt, s);
        vector_set_value (v, j, 1);
    }
    return v;
}

/**
 * Create a boolean vectorizer.
 *
 * @param tok the tokenizer to use.
 * @return The new boolean vectorizer.
 */
vectorizer *
vectorizer_bool_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_BOOL;
    vec->db = NULL;
    vec->dt = dict_new ();
    vec->tok = tok;
    vec->vectorize_text = vectorizer_bool_vectorize_text;
    vec->vectorize_post = NULL;
    vec->remove_terms = NULL;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}

/**
 * Create a Term Frequency vector from the vectorizer's tokenizer.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @return The Term Frequency vector.
 */
static vector *
vectorizer_tf_vectorize_text (vectorizer *vec, vector *v)
{
    int i, j;
    const char *s;

    if (!v)
        v = vector_new (50);
    if (vec->autobias) {
        j = dict_insert_word (vec->dt, "@AUTOBIAS@");
        vector_set_value (v, j, 1);
    }
    for (i = 0; (s = tokenizer_next_token (vec->tok)); i++) {
        j = dict_insert_word (vec->dt, s);
        vector_inc_value (v, j);
    }
    return v;
}

/**
 * Create a Term Frequency vectorizer.
 *
 * @param tok tokenizer to use
 * @return The new Term Frequency vectorizer.
 */
vectorizer *
vectorizer_tf_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_TF;
    vec->db = NULL;
    vec->dt = dict_new ();
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = NULL;
    vec->remove_terms = NULL;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}
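/*
 * Usage sketch (added; not part of the original file): how a caller might turn
 * one text into a term-frequency vector using vectorizer_tf_create() above and
 * vectorizer_vectorize_text() defined further below.  The tokenizer type name
 * "word" and the "en"/"ISO-8859-1" language and charset values are made-up
 * placeholders, not names taken from this project.
 *
 * @code
 * tokenizer *tok = tokenizer_new ("word");   // "word" is a hypothetical type name
 * vectorizer *vec = vectorizer_tf_create (tok);
 * vector *v = vectorizer_vectorize_text (vec, NULL, text, strlen (text),
 *                                        "en", "ISO-8859-1");
 * @endcode
 */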
/**
 * TF-IDF post processing.
 */
static void
vectorizer_tfidf_vectorize_post (vectorizer *vec, vector *v, int new)
{
    if (new > 0) {
        vector_add_v (((tfidf_db *)vec->db)->df, v, 1);
        ((tfidf_db *)vec->db)->nod++;
    } else if (new < 0) {
        vector_add_v (((tfidf_db *)vec->db)->df, v, -1);
        ((tfidf_db *)vec->db)->nod--;
    }
    vector_tfidf (v, ((tfidf_db *)vec->db)->df, ((tfidf_db *)vec->db)->nod);
}

/**
 * TF-IDF term removal.
 */
static void
vectorizer_tfidf_remove_terms (vectorizer *vec, vector *v)
{
    vector_remove_v (((tfidf_db *)vec->db)->df, v);
}

/**
 * Create a TF-IDF vectorizer.
 *
 * @param tok tokenizer to use
 * @return The new TF-IDF vectorizer.
 */
vectorizer *
vectorizer_tfidf_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(tfidf_db) ? sizeof(vectorizer) : sizeof(vectorizer));
    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_TFIDF;
    vec->dt = dict_new ();
    if (!vec->dt) {
        free (vec);
        return NULL;
    }
    vec->db = my_malloc (sizeof(tfidf_db));
    ((tfidf_db *)vec->db)->df = vector_new (1000);
    if (!((tfidf_db *)vec->db)->df) {
        dict_free (vec->dt);
        free (vec->db);
        free (vec);
        return NULL;
    }
    ((tfidf_db *)vec->db)->nod = 0;
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = vectorizer_tfidf_vectorize_post;
    vec->remove_terms = vectorizer_tfidf_remove_terms;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}

/**
 * Hadamard post processing.
 */
static void
vectorizer_hadamard_vectorize_post (vectorizer *vec, vector *v, int new)
{
    if (new > 0)
        vector_add (((hadamard_db *)vec->db)->tf, v);
    else if (new < 0)
        vector_sub (((hadamard_db *)vec->db)->tf, v);
    vector_elem_mul (v, ((hadamard_db *)vec->db)->tf);
}

/**
 * Create a Hadamard vectorizer.
 *
 * The Hadamard product representation consists of the vector where the i:th
 * entry is the product of the frequency of the i:th keyword in the document
 * and its frequency over all documents in the training set.
 * See:
 * L. Manevitz and M. Yousef, <em>Document classification via neural networks
 * trained exclusively with positive examples</em>, 2001.
 *
 * @param tok tokenizer to use
 * @return The new Hadamard vectorizer.
 */
vectorizer *
vectorizer_hadamard_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_HADAMARD;
    vec->dt = dict_new ();
    vec->db = my_malloc (sizeof(hadamard_db));
    ((hadamard_db *)vec->db)->tf = vector_new (1000);
    if (!((hadamard_db *)vec->db)->tf) {
        dict_free (vec->dt);
        free (vec->db);
        free (vec);
        return NULL;
    }
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = vectorizer_hadamard_vectorize_post;
    vec->remove_terms = NULL; // FIXME
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 1;
    return vec;
}

/**
 * Set the normalization function for a vectorizer to use.
 *
 * @param vec vectorizer
 * @param normalize normalization function
 * @return Zero if ok.
 */
int
vectorizer_set_normalizer (vectorizer *vec, void (*normalize) (vector *))
{
    vec->normalize = normalize;
    return 0;
}

/**
 * Set autobias.
 * Off by default.
 *
 * @param vec vectorizer
 * @param bool 1 or 0, for on or off
 */
void
vectorizer_set_autobias (vectorizer *vec, int bool)
{
    vec->autobias = bool;
}

/**
 * Set the dictionary of a vectorizer.
 * The old dictionary is freed.
 *
 * @param vec vectorizer
 * @param dt new dictionary
 */
void
vectorizer_set_dictionary (vectorizer *vec, dict *dt)
{
    dict_free (vec->dt);
    vec->dt = dt;
}

/**
 * Get the dictionary of a vectorizer.
 *
 * @param vec vectorizer to use
 * @return The dictionary
 */
dict *
vectorizer_get_dictionary (vectorizer *vec)
{
    return vec->dt;
}

/**
 * Set the selector of a vectorizer.
 * The dictionary is also put in reverse-lookup mode, see dict_set_reverse().
 *
 * @param vec vectorizer
 * @param sel selector to use
 */
void
vectorizer_set_selector (vectorizer *vec, selector *sel)
{
    vec->sel = sel;
    dict_set_reverse (vec->dt);
}
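/*
 * Illustration (added; not part of the original file): the textbook TF-IDF
 * weight that the TF-IDF vectorizer above is built around.  The actual
 * weighting is implemented in vector_tfidf() in vector.c, which is not shown
 * in this file, so this is the conventional definition rather than a
 * statement about that code.
 *
 * @code
 * #include <math.h>
 *
 * // Weight of a term occurring tf times in a document that appears in df of
 * // the nod training documents, e.g. 3 * log(1000 / 10) ~= 13.8.
 * static double
 * textbook_tfidf (double tf, double df, double nod)
 * {
 *     return tf * log (nod / df);
 * }
 * @endcode
 */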
/**
 * Normalize a vector.
 *
 * If new == 0 then don't update text statistics.
 * Use this when classifying a vector.
 * If new == 1 then update text statistics.
 * Use this when learning a vector.
 * If new == -1 then downdate text statistics.
 * Use this when unlearning a vector.
 *
 * @param vec vectorizer
 * @param v vector
 * @param new indicates whether text statistics should be updated
 * @return The vector of removed term numbers from the selector, or NULL.
 */
vector *
vectorizer_normalize_vector (vectorizer *vec, vector *v, int new)
{
    int i;
    vector *u;

    u = NULL;
    if (vec->vectorize_post)
        vec->vectorize_post (vec, v, new);
    if (vec->sel) {
        selector_weight (vec->sel, v);
        //if (class != -1)
        selector_update (vec->sel, v, 0); // FIXME 0 should be class
        u = selector_get_removable (vec->sel);
        if (u) {
            for (i = 0; i < u->nel; i++)
                dict_remove_word_num (vec->dt, u->value[i]);
            if (vec->remove_terms)
                vec->remove_terms (vec, u);
        }
    }
    if (vec->normalize)
        vec->normalize (v);
    return u;
}

/**
 * Vectorize text.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @param text text to vectorize
 * @param n text length
 * @param lang language
 * @param charset character set
 * @return A vector corresponding to the text
 */
vector *
vectorizer_vectorize_text (vectorizer *vec, vector *v, const char *text,
                           int n, const char *lang, const char *charset)
{
    tokenizer_set_text (vec->tok, text, n, charset);
    tokenizer_set_language (vec->tok, lang);
    return vec->vectorize_text (vec, v);
}

/**
 * Save a vectorizer to a file.
 *
 * @param vec vectorizer to save
 * @param f file to save to
 * @return Zero if ok, nonzero otherwise.
 */
int
vectorizer_save (vectorizer *vec, FILE *f)
{
    fprintf (f, "vectorizer ");
    switch (vec->type) {
    case VECT_BOOL:
        fprintf (f, "bool\n");
        break;
    case VECT_TF:
        fprintf (f, "tf\n");
        break;
    case VECT_TFIDF:
        fprintf (f, "tfidf\n");
        break;
    case VECT_HADAMARD:
        fprintf (f, "hadamard\n");
        break;
    }
    fprintf (f, "tokenizer ");
    tokenizer_save (vec->tok, f); // FIXME not nice
    fprintf (f, "\n");
    fprintf (f, "normalizer\n"); // FIXME
    fprintf (f, "autobias %d\n", vec->autobias);
    fprintf (f, "dictionary ");
    dict_save (f, vec->dt);
    fprintf (f, "\n");
    if (vec->type == VECT_TFIDF) {
        fprintf (f, "nod %d\n", ((tfidf_db *)vec->db)->nod);
        fprintf (f, "df ");
        vector_save (((tfidf_db *)vec->db)->df, f);
        fprintf (f, "\n");
    }
    return 0;
}
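/*
 * Sketch (added; not part of the original file) of the layout that
 * vectorizer_save() above writes and vectorizer_load() below reads back,
 * based only on the fprintf/fscanf calls in those two functions.  The "..."
 * parts are produced by tokenizer_save(), dict_save() and vector_save(),
 * whose formats are not visible in this file; the values are examples.
 *
 *   vectorizer tfidf
 *   tokenizer ...
 *   normalizer
 *   autobias 0
 *   dictionary ...
 *   nod 42          (TF-IDF vectorizers only)
 *   df ...          (TF-IDF vectorizers only)
 */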
/**
 * Load a vectorizer from a file.
 *
 * @param f file to load from
 * @param langs languages
 * @return The vectorizer.
 */
vectorizer *
vectorizer_load (FILE *f, languages *langs)
{
    char *vs;
    int ab;
    conf_pair cp;
    tokenizer *tok;
    vectorizer *vec;

    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "vectorizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    vs = my_strdup (cp.value);
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "tokenizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    tok = tokenizer_new (cp.value);
    if (!tok) {
        fprintf (stderr, "Error: Unknown tokenizer type: %s\n", cp.value);
        return NULL;
    }
    tokenizer_set_languages (tok, langs);
    if (!strcmp (vs, "bool"))
        vec = vectorizer_bool_create (tok);
    else if (!strcmp (vs, "tf"))
        vec = vectorizer_tf_create (tok);
    else if (!strcmp (vs, "tfidf"))
        vec = vectorizer_tfidf_create (tok);
    else
        return NULL;
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "normalizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    // FIXME normalizer
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "autobias")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    ab = strtol (cp.value, NULL, 10);
    vectorizer_set_autobias (vec, ab);
    dict_free (vec->dt);
    fscanf (f, "dictionary ");
    vec->dt = dict_load (f);
    fscanf (f, "\n");
    if (!strcmp (vs, "tfidf")) {
        vector_free (((tfidf_db *)vec->db)->df);
        fscanf (f, "nod %d\n", &((tfidf_db *)vec->db)->nod);
        fscanf (f, "df ");
        ((tfidf_db *)vec->db)->df = vector_load (f);
        fscanf (f, "\n");
    }
    return vec;
}
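/*
 * End-to-end sketch (added; not part of the original file): learning one text
 * with a TF-IDF vectorizer and persisting the result with vectorizer_save(),
 * then restoring it with vectorizer_load().  The tokenizer type "word", the
 * "en"/"ISO-8859-1" values and the file name are made-up placeholders, and
 * the `langs` object is assumed to be set up elsewhere in the program.
 *
 * @code
 * tokenizer *tok = tokenizer_new ("word");          // hypothetical type name
 * vectorizer *vec = vectorizer_tfidf_create (tok);
 *
 * vector *v = vectorizer_vectorize_text (vec, NULL, text, strlen (text),
 *                                        "en", "ISO-8859-1");
 * vectorizer_normalize_vector (vec, v, 1);          // 1 = update df/nod statistics
 *
 * FILE *f = fopen ("vectorizer.db", "w");
 * vectorizer_save (vec, f);
 * fclose (f);
 *
 * f = fopen ("vectorizer.db", "r");
 * vectorizer *loaded = vectorizer_load (f, langs);  // langs: languages * from elsewhere
 * fclose (f);
 * @endcode
 */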