📄 tokenizer.c
字号:
u = g_unichar_tolower (tok->wbuf[i]); else u = tok->wbuf[i]; j += g_unichar_to_utf8 (u, &tok->buffer[j]); } for (i = 0; i < end; i++) { if (tok->lower) u = g_unichar_tolower (tok->wbuf[i]); else u = tok->wbuf[i]; j += g_unichar_to_utf8 (u, &tok->buffer[j]); } tok->buffer[j] = '\0'; } /* Stopwords and stemming */ printf ("buffer:%s:\n", tok->buffer); if (tok->stopper && stopword_is (tok->stopper, tok->buffer)) return tokenizer_alpha_next_token_utf8_circ (tok); if (tok->stemmer) return tok->stemmer->stem_word (tok->buffer); return tok->buffer;}/** * Get next alpha token, byte characters. */static const char *tokenizer_alpha_next_token_byte (tokenizer *tok) { const char *c, *d; int i; for (c = tok->pos; c < tok->end && !isalpha (*c); c++) ; if (c >= tok->end) return NULL; for (d = c; d < tok->end && isalpha (*d); d++) ; tok->pos = d + 1; if (d - c < tok->minlen || (tok->maxlen && d - c > tok->maxlen)) return tokenizer_alpha_next_token_byte (tok); if (tok->lower) for (i = 0; c < d; i++, c++) tok->buffer[i] = tolower (*c); else for (i = 0; c < d; i++, c++) tok->buffer[i] = *c; tok->buffer[i] = '\0'; if (tok->stopper && stopword_is (tok->stopper, tok->buffer)) return tokenizer_alpha_next_token_byte (tok); if (tok->stemmer) return tok->stemmer->stem_word (tok->buffer); return tok->buffer;}/** * Create a new alpha tokenizer. * * @return The new alpha tokenizer. */tokenizer *tokenizer_alpha_new (void) { tokenizer *t; t = tokenizer_new_default (); t->buffer = my_malloc (t->maxlen * 6 + 1); t->next_token = tokenizer_alpha_next_token_utf8; t->wbl = 1024; t->wbuf = my_malloc (t->wbl * sizeof(gunichar)); return t;}/** * Get next non-whitespace token, byte characters. * * @param tok tokenizer to use * @return The next token, or NULL if there are no more tokens. 
*/static const char *tokenizer_ws_next_token_byte (tokenizer *tok) { const char *c, *d; int i; for (c = tok->pos; c < tok->end && isspace (*c); c++) ; if (c >= tok->end) return NULL; for (d = c; d < tok->end && !isspace (*d); d++) ; tok->pos = d + 1; if (d - c < tok->minlen || (tok->maxlen && d - c > tok->maxlen)) return tokenizer_ws_next_token_byte (tok); if (tok->lower) for (i = 0; c < d; i++, c++) tok->buffer[i] = tolower (*c); else for (i = 0; c < d; i++, c++) tok->buffer[i] = *c; tok->buffer[i] = '\0'; if (tok->stopper && stopword_is (tok->stopper, tok->buffer)) return tokenizer_ws_next_token_byte (tok); if (tok->stemmer) return tok->stemmer->stem_word (tok->buffer); return tok->buffer;}/** * Create a new whitespace tokenizer. * * @return The new whitespace tokenizer. */tokenizer *tokenizer_ws_new (void) { tokenizer *t; t = tokenizer_new_default (); t->buffer = my_malloc (t->maxlen * 6 + 1); t->next_token = tokenizer_ws_next_token_byte; return t;}/** * Get next N-gram token, byte characters. */static const char *tokenizer_ngram_next_token_byte (tokenizer *tok) { int i; if (tok->state == tok->maxlen || tok->pos + tok->state >= tok->end) { tok->pos++; tok->state = tok->minlen; } else tok->state++; if (tok->pos + tok->minlen >= tok->end) return NULL; if (tok->lower) for (i = 0; i < tok->state; i++) tok->buffer[i] = tolower (tok->pos[i]); else for (i = 0; i < tok->state; i++) tok->buffer[i] = tok->pos[i]; tok->buffer[tok->state] = '\0'; return tok->buffer;}/** * Create a new N-gram tokenizer. * * @return The new N-gram tokenizer. */tokenizer *tokenizer_ngram_new (void) { tokenizer *t; t = tokenizer_new_default (); t->minlen = 1; t->maxlen = 3; t->buffer = my_malloc (t->maxlen * 6 + 1); t->next_token = tokenizer_ngram_next_token_byte; return t;}/** * Get next Null token. */static const char *tokenizer_null_next_token (tokenizer *tok) { return NULL;}/** * Create a new Null tokenizer. * * @return The new Null tokenizer. 
*/tokenizer *tokenizer_null_new (void) { tokenizer *t; t = tokenizer_new_default (); t->next_token = tokenizer_null_next_token; return t;}tokenizer *tokenizer_new (const char *str) { if (!strcmp (str, "alpha")) return tokenizer_alpha_new (); //else if (!strcmp (str, "alpha.byte")) //return tokenizer_alpha_byte_new (); else if (!strcmp (str, "wspace.byte")) return tokenizer_ws_new (); else if (!strcmp (str, "ngram.byte")) return tokenizer_ngram_new (); else if (!strcmp (str, "null")) return tokenizer_null_new (); return NULL;}/** * Set minlen and maxlen for tokenizer to use. * * @param tok tokenizer to use * @param min minimum length * @param max maximum length * @return Zero if ok, nonzero otherwise. */inttokenizer_set_minmax (tokenizer *tok, int min, int max) { char *buf; if (max > tok->maxlen) { buf = my_malloc (max + 1); tok->buffer = buf; } tok->minlen = min; tok->maxlen = max; return 0;}/** * Set wordstopper for tokenizer to use. * * @param tok the tokenizer * @param ws wordstopper to use */voidtokenizer_set_stopwords (tokenizer *tok, word_stopper *ws) { tok->stopper = ws;}/** * Set wordstemmer for tokenizer to use. * * @param tok the tokenizer * @param sf stemmer to use */voidtokenizer_set_stemmer (tokenizer *tok, stemmer_functions *sf) { tok->stemmer = sf;}/** * Set multiple language handler for the tokenizer. * * @param tok the tokenizer * @param langs languages */voidtokenizer_set_languages (tokenizer *tok, languages *langs) { tok->languages = langs;}/** * Set stemmer and stopwords according to a language. * * @param tok tokenizer * @param lang language string * @return Zero if ok, nonzero otherwise. 
*/inttokenizer_set_language (tokenizer *tok, const char *lang) { language *l; if (lang) { if (!tok->languages) return -1; l = languages_find_language (tok->languages, lang); if (!l) return -1; tok->stemmer = l->stemmer; tok->stopper = l->stopper; if (l->locale) setlocale (LC_CTYPE, l->locale); else setlocale (LC_CTYPE, "C"); } else { tok->stemmer = NULL; tok->stopper = NULL; setlocale (LC_CTYPE, "C"); } return 0;}/** * Set new text for tokenizer to operate on. * * @param tok the tokenizer * @param text text to use * @param size length of text in bytes * @param charset character set of text */voidtokenizer_set_text (tokenizer *tok, const char *text, int size, const char *charset) { tok->text = text; tok->end = text + size; tok->pos = text; tok->state = 0; tok->charset = charset; if (tok->cd) iconv_close (tok->cd); if (tok->charset) tok->cd = iconv_open ("UCS-4LE", tok->charset); else tok->cd = (iconv_t)-1; if (tok->cd == (iconv_t)-1) { //FIXME should first try to set according to language locale tok->cd = iconv_open ("UCS-4LE", "ISO-8859-1"); } tok->wbs = 0; tok->wbe = 0; tok->is = (char *)text; tok->il = size;}/** * Get next token from tokenizer. * * @param tok the tokenizer to use * @return The next token, or NULL if there are no more tokens. */const char *tokenizer_next_token (tokenizer *tok) { return tok->next_token (tok);}/** * Save a tokenizer to a file. * * @param tok tokenizer to save * @param f file to save to * @return Zero if ok, nonzero otherwise. 
*/
int
tokenizer_save (tokenizer *tok, FILE *f) {
  const char *name;

  /* The tokenizer type is identified by the token function it uses. */
  if (tok->next_token == tokenizer_alpha_next_token_utf8)
    name = "alpha";
  else if (tok->next_token == tokenizer_alpha_next_token_byte)
    name = "alpha.byte";
  else if (tok->next_token == tokenizer_ws_next_token_byte)
    name = "wspace.byte";
  else if (tok->next_token == tokenizer_ngram_next_token_byte)
    name = "ngram.byte";
  else if (tok->next_token == tokenizer_null_next_token)
    name = "null";
  else
    return -1;

  /* FIXME: only the type name is written; the minimum/maximum length and
   * the lowercase setting are not saved yet. */

  /* Report a failed write instead of claiming success. */
  if (fprintf (f, "%s", name) < 0)
    return -1;
  return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -