📄 identifier.c

📁 使用具有增量学习的监督式学习方法，包含几种不同的分类算法。
💻 C
字号:
/* Copyright (C) 2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Language identifier. * Uses N-gram classifier with N = 1..3. * * @author  Mikael Ylikoski * @date    2002 */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "classifier.h"#include "dictionary.h"#include "holders.h"#include "identifier.h"#include "multi_one.h"#include "tokenizer.h"#include "utility.h"#include "vector.h"#include "vectorizer.h"/** * Language identifier. */struct identifier_ {    char **langs;	/**< Array of language strings */    int nol;		/**< Number Of Languages */    classifier *cl;	/**< Language classifier */    vectorizer *vec;	/**< Vectorizer */};/** * Get the number of a language. * * @param id    identifier to use * @param lang  language * @return The number corresponding to the language. */static intidentifier_find_language (identifier *id, const char *lang) {    int i;    for (i = 0; i < id->nol; i++)	if (!strcmp (lang, id->langs[i]))	    return i;    return -1;}/** * Create a new language identifier. * * @return The new language identifier. */identifier *identifier_new (void) {    identifier *id;    multi_functions *mf;    tokenizer *tok;    mf = holders_find_classifier ("N-gram");    if (!mf)	return NULL;    id = my_malloc (sizeof(identifier));    id->cl = multi_one_max_new (mf, mf->new_db (NULL), 2);    tok = tokenizer_ngram_new ();    tokenizer_set_minmax (tok, 1, 3);    id->vec = vectorizer_tf_create (tok);    vectorizer_set_autobias (id->vec, 0);    id->langs = NULL;    id->nol = 0;    return id;}/** * Identify language of text. * * @param id       identifier to use * @param text     text to identify language of * @param len      length of text * @param charset  character set string * @return String containing the language name, or NULL if unable to guess. 
*/const char *identifier_guess_language (identifier *id, const char *text, int len,			   const char *charset) {    int i;    vector *v;    v = vectorizer_vectorize_text (id->vec, NULL, text, len, NULL, charset);    vectorizer_normalize_vector (id->vec, v, 0);    i = classifier_classify_top (id->cl, v);    vector_free (v);    if (i < 0)	return NULL;    return id->langs[i];}/** * Add a new language to an identifier. * If the language is already present, return its number. * * @param id    identifier to use * @param lang  language to add * @return Number of the language */static intidentifier_add_language (identifier *id, const char *lang) {    int i;    i = identifier_find_language (id, lang);    if (i < 0) {	id->langs = my_realloc (id->langs, sizeof(char *) * (id->nol + 1));	i = id->nol;	id->langs[i] = my_strdup (lang);	if (!id->langs[i])	    return -1;	id->nol++;    }    return i;}/** * Load language from file. * * @param id        identifier to use * @param filename  filename * @param lang      language string * @return Zero if ok, or nonzero otherwise. */intidentifier_load_language (identifier *id, const char *filename,			  const char *lang) {    char buf[31];    float f;    int i;    FILE *fp;    dict *dt;    vector *v;    fp = fopen (filename, "r");    if (!fp)	return -1;    dt = vectorizer_get_dictionary (id->vec);    v = vector_new (10);    // FIXME is this a safe way to read UTF-8?    while (fscanf (fp, "%30s %f\n", buf, &f) == 2) {	for (i = 0; buf[i] != '\0'; i++)	    if (buf[i] == '_')		buf[i] = ' ';	i = dict_insert_word (dt, buf);	vector_set_value (v, i, f);    }    i = identifier_add_language (id, lang);    classifier_learn (id->cl, v, i);    vector_free (v);    fclose (fp);    return 0;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -