📄 tokenizer.c
字号:
/* Copyright (C) 2001-2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Text tokenizer. * Divides a text into tokens. Can optionally remove stopwords and perform * stemming. * * @author Mikael Ylikoski * @date 2001-2002 */#include <ctype.h>#include <errno.h>#include <glib.h>#include <iconv.h>#include <locale.h>#include <stdlib.h>#include <string.h>#include "languages.h"#include "stemmer.h"#include "stopword.h"#include "tokenizer.h"#include "utility.h"/** * Tokenizer encoding mode. */enum tok_mode { BYTE, /**< Byte encoding */ UTF8 /**< UTF-8 encoding */};/** * Tokenizer */struct tokenizer_ { const char *text; /**< Text start */ const char *end; /**< Text end */ const char *pos; /**< Text position */ word_stopper *stopper; /**< Stopwords */ stemmer_functions *stemmer; /**< Stemmer */ languages *languages; /**< Languages */ int lower; /**< If 1 then make words lowercase */ int minlen; /**< Ignore words shorter than this */ int maxlen; /**< Ignore words longer than this unless 0 */ const char *(*next_token) (tokenizer *tok); /**< Tokenizing function */ char *buffer; /**< UTF-8 buffer, length (maxlen * 6 + 1) */ //enum tok_mode tm; /**< Encoding handling */ /* For ngram */ int state; /**< State of tokenizer */ /* For UTF-8 mode */ const char *charset; /**< Character encoding */ iconv_t cd; /**< Conversion state */ int wbl; /**< Length of wbuf */ gunichar *wbuf; /**< UCS-4 buffer, length (maxlen + 1) */ int wbs; /**< Current start position */ int wbe; /**< Current end position */ char *is; /**< Input start */ size_t il; /**< Length length */};/** * Create the basis for a new tokenizer. * * @return The new tokenizer. 
*/
/* Defaults set below: lowercase on, minlen 2, maxlen 30.  The tokenizing
 * function and every buffer are left NULL and must be set by the caller. */
static tokenizer *
tokenizer_new_default (void) {
    tokenizer *t;

    t = my_malloc (sizeof(tokenizer));
    t->text = NULL;
    t->end = NULL;
    t->pos = NULL;
    t->stopper = NULL;
    t->stemmer = NULL;
    t->languages = NULL;
    t->lower = 1;
    t->minlen = 2;
    t->maxlen = 30;
    t->next_token = NULL;	/* Must be set by caller */
    t->buffer = NULL;		/* Must be set by caller */

    t->state = 0;

    t->charset = NULL;
    t->cd = NULL;
    t->wbl = 0;
    t->wbuf = NULL;
    t->wbs = 0;
    t->wbe = 0;
    t->is = NULL;
    t->il = 0;

    return t;
}

/**
 * Convert more input into the free part of the work buffer.
 *
 * Runs iconv() from tok->is / tok->il into tok->wbuf[tok->wbe .. tok->wbl)
 * and moves tok->wbe past the characters that were produced.
 *
 * @param tok tokenizer to use
 * @return 0 on success, -1 if iconv() reported an invalid input sequence
 *         (EILSEQ); in that case tok->wbe is left unchanged.  Other iconv()
 *         errors are not treated as failures here.
 */
static int
tokenizer_utf8_fill_wbuf (tokenizer *tok) {
    char *os;
    size_t ol, rc;

    os = (char *)&tok->wbuf[tok->wbe];
    ol = (tok->wbl - tok->wbe) * sizeof(gunichar);
    rc = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
    /* iconv() returns size_t; its error value is (size_t)-1 */
    if (rc == (size_t)-1 && errno == EILSEQ)
	return -1;
    tok->wbe = tok->wbl - (int)(ol / sizeof(gunichar));
    return 0;
}

/**
 * Get next alpha token, utf8 characters.
 *
 * A token is a maximal run of characters in tok->wbuf for which
 * g_unichar_isalpha() is true.  Runs shorter than tok->minlen or longer than
 * tok->maxlen are skipped, and so are words matched by tok->stopper when one
 * is set.  The token is lowercased if tok->lower is set, written to
 * tok->buffer as UTF-8 and passed through tok->stemmer when one is set.
 *
 * @param tok tokenizer to use
 * @return The next token, or NULL if there are no more tokens or the input
 *         contains an invalid sequence.  Without a stemmer the result is
 *         tok->buffer, which the next call overwrites; with a stemmer it is
 *         whatever stem_word() returns.
 */
static const char *
tokenizer_alpha_next_token_utf8 (tokenizer *tok) {
    int i, j, start, end, len;
    int skip_tail = 0;	/* set while the rest of an over-long word that
			 * ran past the buffer still has to be discarded */
    gunichar u;

    /* Rejected words and stopwords restart this loop; looping instead of
     * recursing keeps stack use constant however many words are skipped. */
    for (;;) {
	/* Find first alpha char, refilling the buffer when it runs dry */
	start = -1;
	while (start == -1) {
	    if (tok->wbs >= tok->wbe) {
		tok->wbs = tok->wbe = 0;

		// Fill buffer
		if (tokenizer_utf8_fill_wbuf (tok) != 0)
		    return NULL;

		// Check if no more letters
		if (tok->wbe == 0)
		    return NULL;
	    }

	    if (skip_tail) {
		/* Flush until next non-alpha */
		while (tok->wbs < tok->wbe
		       && g_unichar_isalpha (tok->wbuf[tok->wbs]))
		    tok->wbs++;
		if (tok->wbs < tok->wbe)
		    skip_tail = 0;
	    }

	    for (i = tok->wbs; i < tok->wbe; i++)
		if (g_unichar_isalpha (tok->wbuf[i])) {
		    start = i;
		    break;
		}

	    if (start == -1) {
		if (tok->il == 0)
		    return NULL;
		tok->wbs = tok->wbe = 0;
	    }
	}

	if (start > tok->wbl - tok->maxlen - 2) {	// FIXME maybe not - 2
	    // Move buffer to beginning; source and destination may overlap,
	    // so this has to be memmove rather than memcpy
	    memmove (tok->wbuf, &tok->wbuf[start],
		     (tok->wbe - start) * sizeof(gunichar));
	    tok->wbe -= start;
	    start = 0;

	    // Fill buffer
	    if (tokenizer_utf8_fill_wbuf (tok) != 0)
		return NULL;

	    // Check if no more letters
	    if (tok->wbe == 0)
		return NULL;
	}

	/* Find next non-alpha */
	for (end = start; end < tok->wbe; end++)
	    if (!g_unichar_isalpha (tok->wbuf[end]))
		break;
	tok->wbs = end;

	len = end - start;
	if (len < tok->minlen || len > tok->maxlen) {
	    /* An over-long word that reaches the end of the buffer may
	     * continue in the input not yet converted; discard that tail
	     * too so it is not returned as a token of its own. */
	    if (len > tok->maxlen && end == tok->wbe)
		skip_tail = 1;
	    continue;
	}

	/* Convert to lowercase UTF-8 */
	j = 0;
	for (i = start; i < end; i++) {
	    if (tok->lower)
		u = g_unichar_tolower (tok->wbuf[i]);
	    else
		u = tok->wbuf[i];
	    j += g_unichar_to_utf8 (u, &tok->buffer[j]);
	}
	tok->buffer[j] = '\0';

	/* Stopwords and stemming */
	if (tok->stopper && stopword_is (tok->stopper, tok->buffer))
	    continue;
	if (tok->stemmer)
	    return tok->stemmer->stem_word (tok->buffer);
	return tok->buffer;
    }
}

/** * Get next alpha token, utf8 characters. */static const char *tokenizer_alpha_next_token_utf8_circ (tokenizer *tok) { char *os; size_t ol; int i, j; int start, end; gunichar u; // FIXME wbs och wbe s鋞ts inte alltid korrekt while (1) { /* Fill wbuf as far as possible */ printf ("wbuf 0<%d:%d>:", tok->wbs, tok->wbe); for (i = 0; i <= tok->maxlen; i++) printf ("%c", tok->wbuf[i]); printf (":\n"); if (tok->wbs < 0) { printf ("0\n"); os = (char *)tok->wbuf; ol = (tok->maxlen + 1) * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = tok->maxlen - ol / sizeof(gunichar); if (tok->wbe < 0) return NULL; tok->wbs = 0; } else if (tok->wbs <= tok->wbe) { printf ("1\n"); os = (char *)&tok->wbuf[tok->wbe + 1]; ol = (tok->maxlen - tok->wbe) * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = tok->maxlen - ol / sizeof(gunichar); printf ("wbuf 1<%d:%d>:", tok->wbs, tok->wbe); for (i = 0; i <= tok->maxlen; i++) printf ("%c", tok->wbuf[i]); printf (":\n"); if (tok->wbs > 0 && tok->wbe == tok->maxlen) { printf ("2\n"); os = (char *)tok->wbuf; ol = tok->wbs * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = tok->wbs - ol / sizeof(gunichar) - 1; if (tok->wbe < 0) tok->wbe = tok->maxlen - 1; } } else { // (tok->wbs > tok->wbe) printf ("3\n"); os = (char *)&tok->wbuf[tok->wbe + 1]; ol =
(tok->wbs - tok->wbe - 1) * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = tok->wbs - ol / sizeof(gunichar) - 1; } printf ("wbuf 2<%d:%d>:", tok->wbs, tok->wbe); for (i = 0; i <= tok->maxlen; i++) printf ("%c", tok->wbuf[i]); printf (":\n"); /* Find first alpha char */ start = -1; if (tok->wbs <= tok->wbe) { printf ("4\n"); for (i = tok->wbs; i <= tok->wbe; i++) if (g_unichar_isalpha (tok->wbuf[i])) { start = i; break; } } else if (tok->wbs > tok->wbe) { printf ("5\n"); for (i = tok->wbs; i <= tok->maxlen; i++) if (g_unichar_isalpha (tok->wbuf[i])) { start = i; break; } if (start == -1) { printf ("6\n"); for (i = 0; i <= tok->wbe; i++) if (g_unichar_isalpha (tok->wbuf[i])) { start = i; break; } } } else return NULL; printf ("start:%d:\n", start); tok->wbs = i + 1; if (tok->wbs > tok->maxlen) { tok->wbs = 0; if (tok->wbe == tok->maxlen) tok->wbs = -1; } if (start != -1) break; if (tok->il == 0) return NULL; } /* Fill wbuf as far as possible */ if (start <= tok->wbe) { os = (char *)&tok->wbuf[tok->wbe + 1]; ol = (tok->maxlen - tok->wbe) * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = tok->maxlen - ol / sizeof(gunichar); if (start > 0 && tok->wbe == tok->maxlen) { os = (char *)tok->wbuf; ol = start * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = start - ol / sizeof(gunichar) - 1; if (tok->wbe < 0) tok->wbe = tok->maxlen - 1; } } else { os = (char *)&tok->wbuf[tok->wbe + 1]; ol = (start - tok->wbe - 1) * sizeof(gunichar); i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol); tok->wbe = start - ol / sizeof(gunichar) - 1; } /* Find next non-alpha */ end = -1; if (start < tok->wbe) { printf ("7\n"); for (i = start; i <= tok->wbe; i++) if (!g_unichar_isalpha (tok->wbuf[i])) break; end = i; if (end - start < tok->minlen || end - start > tok->maxlen) end = -1; } else { // (start > tok->wbe) printf ("8\n"); for (i = start; i <= tok->maxlen; i++) if (!g_unichar_isalpha 
(tok->wbuf[i])) { end = i; break; } if (end == -1) { printf ("9\n"); for (i = 0; i <= tok->wbe; i++) if (!g_unichar_isalpha (tok->wbuf[i])) { end = i; break; } if (start - end < 1 || start - end > tok->maxlen - tok->minlen + 1) end = -1; } } printf ("end:%d:\n", end); if (end == -1) { /* Flush until next non-alpha */ printf ("to long\n"); return tokenizer_alpha_next_token_utf8_circ (tok); } tok->wbs = end; if (end) ; /* Convert to lowercase utf8 */ if (start < end) { printf ("10\n"); j = 0; for (i = start; i < end; i++) { if (tok->lower) u = g_unichar_tolower (tok->wbuf[i]); else u = tok->wbuf[i]; j += g_unichar_to_utf8 (u, &tok->buffer[j]); } tok->buffer[j] = '\0'; } else { printf ("11\n"); j = 0; for (i = start; i <= tok->maxlen; i++) { if (tok->lower)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -