vectorizer.c
/* Copyright (C) 2001-2002 Mikael Ylikoski
 * See the accompanying file "README" for the full copyright notice
 */

/**
 * @file
 * Handle transformation of texts into vectors.
 *
 * @author Mikael Ylikoski
 * @date 2001-2002
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dictionary.h"
#include "selector.h"
#include "tokenizer.h"
#include "utility.h"
#include "vector.h"
#include "vectorizer.h"

/**
 * Vectorizer type.
 */
enum vect_type {
    VECT_BOOL,     /**< Boolean representation */
    VECT_TF,       /**< Term Frequency */
    VECT_TFIDF,    /**< Term Frequency - Inverse Document Frequency */
    VECT_HADAMARD  /**< Hadamard product representation */
};

/**
 * Text vectorizer.
 */
struct vectorizer_ {
    enum vect_type type;  /**< Vectorizer type */
    void *db;             /**< Vectorizer database */
    dict *dt;             /**< Dictionary */
    tokenizer *tok;       /**< Tokenizer */
    selector *sel;        /**< Selector */
    vector *(*vectorize_text) (vectorizer *, vector *);    /**< Text vectorization function */
    void (*vectorize_post) (vectorizer *, vector *, int);  /**< Text vectorization post process */
    void (*remove_terms) (vectorizer *, vector *);         /**< Term selection removal */
    void (*normalize) (vector *);                          /**< Normalization function */
    int autobias;         /**< Add constant bias term */
};

/**
 * TF-IDF database.
 */
typedef struct {
    vector *df;  /**< Document Frequency vector */
    int nod;     /**< Number Of Documents */
} tfidf_db;

/**
 * Hadamard database.
 */
typedef struct {
    vector *tf;  /**< Total Term Frequency */
} hadamard_db;

/**
 * Create a set-of-words (boolean) vector from the vectorizer's tokenizer.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @return The set of words vector.
 */
static vector *
vectorizer_bool_vectorize_text (vectorizer *vec, vector *v)
{
    int i, j;
    const char *s;

    if (!v)
        v = vector_new (50);
    if (vec->autobias) {
        j = dict_insert_word (vec->dt, "@AUTOBIAS@");
        vector_set_value (v, j, 1);
    }
    for (i = 0; (s = tokenizer_next_token (vec->tok)); i++) {
        j = dict_insert_word (vec->dt, s);
        vector_set_value (v, j, 1);
    }
    return v;
}

/**
 * Create a boolean vectorizer.
 *
 * @param tok the tokenizer to use.
 * @return The new boolean vectorizer.
 */
vectorizer *
vectorizer_bool_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_BOOL;
    vec->db = NULL;
    vec->dt = dict_new ();
    vec->tok = tok;
    vec->vectorize_text = vectorizer_bool_vectorize_text;
    vec->vectorize_post = NULL;
    vec->remove_terms = NULL;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}

/**
 * Create a Term Frequency vector from the vectorizer's tokenizer.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @return The Term Frequency vector.
 */
static vector *
vectorizer_tf_vectorize_text (vectorizer *vec, vector *v)
{
    int i, j;
    const char *s;

    if (!v)
        v = vector_new (50);
    if (vec->autobias) {
        j = dict_insert_word (vec->dt, "@AUTOBIAS@");
        vector_set_value (v, j, 1);
    }
    for (i = 0; (s = tokenizer_next_token (vec->tok)); i++) {
        j = dict_insert_word (vec->dt, s);
        vector_inc_value (v, j);
    }
    return v;
}

/**
 * Create a Term Frequency vectorizer.
 *
 * @param tok tokenizer to use
 * @return The new Term Frequency vectorizer.
 */
vectorizer *
vectorizer_tf_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_TF;
    vec->db = NULL;
    vec->dt = dict_new ();
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = NULL;
    vec->remove_terms = NULL;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}
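/*
 * Usage sketch (added; not part of the original file): how a caller might turn
 * one text into a term-frequency vector using vectorizer_tf_create() above and
 * vectorizer_vectorize_text() defined further below.  The tokenizer type name
 * "word" and the "en"/"ISO-8859-1" language and charset values are made-up
 * placeholders, not names taken from this project.
 *
 * @code
 * tokenizer *tok = tokenizer_new ("word");   // "word" is a hypothetical type name
 * vectorizer *vec = vectorizer_tf_create (tok);
 * vector *v = vectorizer_vectorize_text (vec, NULL, text, strlen (text),
 *                                        "en", "ISO-8859-1");
 * @endcode
 */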
/**
 * TF-IDF post processing.
 */
static void
vectorizer_tfidf_vectorize_post (vectorizer *vec, vector *v, int new)
{
    if (new > 0) {
        vector_add_v (((tfidf_db *)vec->db)->df, v, 1);
        ((tfidf_db *)vec->db)->nod++;
    } else if (new < 0) {
        vector_add_v (((tfidf_db *)vec->db)->df, v, -1);
        ((tfidf_db *)vec->db)->nod--;
    }
    vector_tfidf (v, ((tfidf_db *)vec->db)->df, ((tfidf_db *)vec->db)->nod);
}

/**
 * TF-IDF term removal.
 */
static void
vectorizer_tfidf_remove_terms (vectorizer *vec, vector *v)
{
    vector_remove_v (((tfidf_db *)vec->db)->df, v);
}

/**
 * Create a TF-IDF vectorizer.
 *
 * @param tok tokenizer to use
 * @return The new TF-IDF vectorizer.
 */
vectorizer *
vectorizer_tfidf_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(tfidf_db) ? sizeof(vectorizer) : sizeof(vectorizer));
    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_TFIDF;
    vec->dt = dict_new ();
    if (!vec->dt) {
        free (vec);
        return NULL;
    }
    vec->db = my_malloc (sizeof(tfidf_db));
    ((tfidf_db *)vec->db)->df = vector_new (1000);
    if (!((tfidf_db *)vec->db)->df) {
        dict_free (vec->dt);
        free (vec->db);
        free (vec);
        return NULL;
    }
    ((tfidf_db *)vec->db)->nod = 0;
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = vectorizer_tfidf_vectorize_post;
    vec->remove_terms = vectorizer_tfidf_remove_terms;
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 0;
    return vec;
}

/**
 * Hadamard post processing.
 */
static void
vectorizer_hadamard_vectorize_post (vectorizer *vec, vector *v, int new)
{
    if (new > 0)
        vector_add (((hadamard_db *)vec->db)->tf, v);
    else if (new < 0)
        vector_sub (((hadamard_db *)vec->db)->tf, v);
    vector_elem_mul (v, ((hadamard_db *)vec->db)->tf);
}

/**
 * Create a Hadamard vectorizer.
 *
 * The Hadamard product representation consists of the vector where the i:th
 * entry is the product of the frequency of the i:th keyword in the document
 * and its frequency over all documents in the training set.
 * See:
 * L. Manevitz and M. Yousef, <em>Document classification via neural networks
 * trained exclusively with positive examples</em>, 2001.
 *
 * @param tok tokenizer to use
 * @return The new Hadamard vectorizer.
 */
vectorizer *
vectorizer_hadamard_create (tokenizer *tok)
{
    vectorizer *vec;

    vec = my_malloc (sizeof(vectorizer));
    vec->type = VECT_HADAMARD;
    vec->dt = dict_new ();
    vec->db = my_malloc (sizeof(hadamard_db));
    ((hadamard_db *)vec->db)->tf = vector_new (1000);
    if (!((hadamard_db *)vec->db)->tf) {
        dict_free (vec->dt);
        free (vec->db);
        free (vec);
        return NULL;
    }
    vec->tok = tok;
    vec->vectorize_text = vectorizer_tf_vectorize_text;
    vec->vectorize_post = vectorizer_hadamard_vectorize_post;
    vec->remove_terms = NULL; // FIXME
    vec->sel = NULL;
    vec->normalize = NULL;
    vec->autobias = 1;
    return vec;
}

/**
 * Set the normalization function for a vectorizer to use.
 *
 * @param vec vectorizer
 * @param normalize normalization function
 * @return Zero if ok.
 */
int
vectorizer_set_normalizer (vectorizer *vec, void (*normalize) (vector *))
{
    vec->normalize = normalize;
    return 0;
}

/**
 * Set autobias.
 * Off by default.
 *
 * @param vec vectorizer
 * @param bool 1 or 0, for on or off
 */
void
vectorizer_set_autobias (vectorizer *vec, int bool)
{
    vec->autobias = bool;
}

/**
 * Set the dictionary of a vectorizer.
 * The old dictionary is freed.
 *
 * @param vec vectorizer
 * @param dt new dictionary
 */
void
vectorizer_set_dictionary (vectorizer *vec, dict *dt)
{
    dict_free (vec->dt);
    vec->dt = dt;
}

/**
 * Get the dictionary of a vectorizer.
 *
 * @param vec vectorizer to use
 * @return The dictionary
 */
dict *
vectorizer_get_dictionary (vectorizer *vec)
{
    return vec->dt;
}

/**
 * Set the selector of a vectorizer.
 * The dictionary is also put in reverse-lookup mode, see dict_set_reverse().
 *
 * @param vec vectorizer
 * @param sel selector to use
 */
void
vectorizer_set_selector (vectorizer *vec, selector *sel)
{
    vec->sel = sel;
    dict_set_reverse (vec->dt);
}
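/*
 * Illustration (added; not part of the original file): the textbook TF-IDF
 * weight that the TF-IDF vectorizer above is built around.  The actual
 * weighting is implemented in vector_tfidf() in vector.c, which is not shown
 * in this file, so this is the conventional definition rather than a
 * statement about that code.
 *
 * @code
 * #include <math.h>
 *
 * // Weight of a term occurring tf times in a document that appears in df of
 * // the nod training documents, e.g. 3 * log(1000 / 10) ~= 13.8.
 * static double
 * textbook_tfidf (double tf, double df, double nod)
 * {
 *     return tf * log (nod / df);
 * }
 * @endcode
 */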
/**
 * Normalize a vector.
 *
 * If new == 0 then don't update text statistics.
 * Use this when classifying a vector.
 * If new == 1 then update text statistics.
 * Use this when learning a vector.
 * If new == -1 then downdate text statistics.
 * Use this when unlearning a vector.
 *
 * @param vec vectorizer
 * @param v vector
 * @param new indicates whether text statistics should be updated
 * @return The vector of removed term numbers from the selector, or NULL.
 */
vector *
vectorizer_normalize_vector (vectorizer *vec, vector *v, int new)
{
    int i;
    vector *u;

    u = NULL;
    if (vec->vectorize_post)
        vec->vectorize_post (vec, v, new);
    if (vec->sel) {
        selector_weight (vec->sel, v);
        //if (class != -1)
        selector_update (vec->sel, v, 0); // FIXME 0 should be class
        u = selector_get_removable (vec->sel);
        if (u) {
            for (i = 0; i < u->nel; i++)
                dict_remove_word_num (vec->dt, u->value[i]);
            if (vec->remove_terms)
                vec->remove_terms (vec, u);
        }
    }
    if (vec->normalize)
        vec->normalize (v);
    return u;
}

/**
 * Vectorize text.
 *
 * @param vec vectorizer to use
 * @param v vector to fill, or NULL to allocate a new one
 * @param text text to vectorize
 * @param n text length
 * @param lang language
 * @param charset character set
 * @return A vector corresponding to the text
 */
vector *
vectorizer_vectorize_text (vectorizer *vec, vector *v, const char *text,
                           int n, const char *lang, const char *charset)
{
    tokenizer_set_text (vec->tok, text, n, charset);
    tokenizer_set_language (vec->tok, lang);
    return vec->vectorize_text (vec, v);
}

/**
 * Save a vectorizer to a file.
 *
 * @param vec vectorizer to save
 * @param f file to save to
 * @return Zero if ok, nonzero otherwise.
 */
int
vectorizer_save (vectorizer *vec, FILE *f)
{
    fprintf (f, "vectorizer ");
    switch (vec->type) {
    case VECT_BOOL:
        fprintf (f, "bool\n");
        break;
    case VECT_TF:
        fprintf (f, "tf\n");
        break;
    case VECT_TFIDF:
        fprintf (f, "tfidf\n");
        break;
    case VECT_HADAMARD:
        fprintf (f, "hadamard\n");
        break;
    }
    fprintf (f, "tokenizer ");
    tokenizer_save (vec->tok, f); // FIXME not nice
    fprintf (f, "\n");
    fprintf (f, "normalizer\n"); // FIXME
    fprintf (f, "autobias %d\n", vec->autobias);
    fprintf (f, "dictionary ");
    dict_save (f, vec->dt);
    fprintf (f, "\n");
    if (vec->type == VECT_TFIDF) {
        fprintf (f, "nod %d\n", ((tfidf_db *)vec->db)->nod);
        fprintf (f, "df ");
        vector_save (((tfidf_db *)vec->db)->df, f);
        fprintf (f, "\n");
    }
    return 0;
}
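/*
 * Sketch (added; not part of the original file) of the layout that
 * vectorizer_save() above writes and vectorizer_load() below reads back,
 * based only on the fprintf/fscanf calls in those two functions.  The "..."
 * parts are produced by tokenizer_save(), dict_save() and vector_save(),
 * whose formats are not visible in this file; the values are examples.
 *
 *   vectorizer tfidf
 *   tokenizer ...
 *   normalizer
 *   autobias 0
 *   dictionary ...
 *   nod 42          (TF-IDF vectorizers only)
 *   df ...          (TF-IDF vectorizers only)
 */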
/**
 * Load a vectorizer from a file.
 *
 * @param f file to load from
 * @param langs languages
 * @return The vectorizer.
 */
vectorizer *
vectorizer_load (FILE *f, languages *langs)
{
    char *vs;
    int ab;
    conf_pair cp;
    tokenizer *tok;
    vectorizer *vec;

    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "vectorizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    vs = my_strdup (cp.value);
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "tokenizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    tok = tokenizer_new (cp.value);
    if (!tok) {
        fprintf (stderr, "Error: Unknown tokenizer type: %s\n", cp.value);
        return NULL;
    }
    tokenizer_set_languages (tok, langs);
    if (!strcmp (vs, "bool"))
        vec = vectorizer_bool_create (tok);
    else if (!strcmp (vs, "tf"))
        vec = vectorizer_tf_create (tok);
    else if (!strcmp (vs, "tfidf"))
        vec = vectorizer_tfidf_create (tok);
    else
        return NULL;
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "normalizer")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    // FIXME normalizer
    get_next_configuration (f, &cp);
    if (strcmp (cp.key, "autobias")) {
        fprintf (stderr, "Error\n");
        return NULL;
    }
    ab = strtol (cp.value, NULL, 10);
    vectorizer_set_autobias (vec, ab);
    dict_free (vec->dt);
    fscanf (f, "dictionary ");
    vec->dt = dict_load (f);
    fscanf (f, "\n");
    if (!strcmp (vs, "tfidf")) {
        vector_free (((tfidf_db *)vec->db)->df);
        fscanf (f, "nod %d\n", &((tfidf_db *)vec->db)->nod);
        fscanf (f, "df ");
        ((tfidf_db *)vec->db)->df = vector_load (f);
        fscanf (f, "\n");
    }
    return vec;
}
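/*
 * End-to-end sketch (added; not part of the original file): learning one text
 * with a TF-IDF vectorizer and persisting the result with vectorizer_save(),
 * then restoring it with vectorizer_load().  The tokenizer type "word", the
 * "en"/"ISO-8859-1" values and the file name are made-up placeholders, and
 * the `langs` object is assumed to be set up elsewhere in the program.
 *
 * @code
 * tokenizer *tok = tokenizer_new ("word");          // hypothetical type name
 * vectorizer *vec = vectorizer_tfidf_create (tok);
 *
 * vector *v = vectorizer_vectorize_text (vec, NULL, text, strlen (text),
 *                                        "en", "ISO-8859-1");
 * vectorizer_normalize_vector (vec, v, 1);          // 1 = update df/nod statistics
 *
 * FILE *f = fopen ("vectorizer.db", "w");
 * vectorizer_save (vec, f);
 * fclose (f);
 *
 * f = fopen ("vectorizer.db", "r");
 * vectorizer *loaded = vectorizer_load (f, langs);  // langs: languages * from elsewhere
 * fclose (f);
 * @endcode
 */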