📄 identifier.c
字号:
/* Copyright (C) 2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Language identifier. * Uses N-gram classifier with N = 1..3. * * @author Mikael Ylikoski * @date 2002 */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "classifier.h"#include "dictionary.h"#include "holders.h"#include "identifier.h"#include "multi_one.h"#include "tokenizer.h"#include "utility.h"#include "vector.h"#include "vectorizer.h"/** * Language identifier. */struct identifier_ { char **langs; /**< Array of language strings */ int nol; /**< Number Of Languages */ classifier *cl; /**< Language classifier */ vectorizer *vec; /**< Vectorizer */};/** * Get the number of a language. * * @param id identifier to use * @param lang language * @return The number corresponding to the language. */static intidentifier_find_language (identifier *id, const char *lang) { int i; for (i = 0; i < id->nol; i++) if (!strcmp (lang, id->langs[i])) return i; return -1;}/** * Create a new language identifier. * * @return The new language identifier. */identifier *identifier_new (void) { identifier *id; multi_functions *mf; tokenizer *tok; mf = holders_find_classifier ("N-gram"); if (!mf) return NULL; id = my_malloc (sizeof(identifier)); id->cl = multi_one_max_new (mf, mf->new_db (NULL), 2); tok = tokenizer_ngram_new (); tokenizer_set_minmax (tok, 1, 3); id->vec = vectorizer_tf_create (tok); vectorizer_set_autobias (id->vec, 0); id->langs = NULL; id->nol = 0; return id;}/** * Identify language of text. * * @param id identifier to use * @param text text to identify language of * @param len length of text * @param charset character set string * @return String containing the language name, or NULL if unable to guess. */const char *identifier_guess_language (identifier *id, const char *text, int len, const char *charset) { int i; vector *v; v = vectorizer_vectorize_text (id->vec, NULL, text, len, NULL, charset); vectorizer_normalize_vector (id->vec, v, 0); i = classifier_classify_top (id->cl, v); vector_free (v); if (i < 0) return NULL; return id->langs[i];}/** * Add a new language to an identifier. * If the language is already present, return its number. * * @param id identifier to use * @param lang language to add * @return Number of the language */static intidentifier_add_language (identifier *id, const char *lang) { int i; i = identifier_find_language (id, lang); if (i < 0) { id->langs = my_realloc (id->langs, sizeof(char *) * (id->nol + 1)); i = id->nol; id->langs[i] = my_strdup (lang); if (!id->langs[i]) return -1; id->nol++; } return i;}/** * Load language from file. * * @param id identifier to use * @param filename filename * @param lang language string * @return Zero if ok, or nonzero otherwise. */intidentifier_load_language (identifier *id, const char *filename, const char *lang) { char buf[31]; float f; int i; FILE *fp; dict *dt; vector *v; fp = fopen (filename, "r"); if (!fp) return -1; dt = vectorizer_get_dictionary (id->vec); v = vector_new (10); // FIXME is this a safe way to read UTF-8? while (fscanf (fp, "%30s %f\n", buf, &f) == 2) { for (i = 0; buf[i] != '\0'; i++) if (buf[i] == '_') buf[i] = ' '; i = dict_insert_word (dt, buf); vector_set_value (v, i, f); } i = identifier_add_language (id, lang); classifier_learn (id->cl, v, i); vector_free (v); fclose (fp); return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -