📄 tokenizer.c

📁 使用具有增量学习的监督式学习方法。包括几个不同的分类算法。
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (C) 2001-2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Text tokenizer. * Divides a text into tokens. Can optionally remove stopwords and perform * stemming. * * @author  Mikael Ylikoski * @date    2001-2002 */#include <ctype.h>#include <errno.h>#include <glib.h>#include <iconv.h>#include <locale.h>#include <stdlib.h>#include <string.h>#include "languages.h"#include "stemmer.h"#include "stopword.h"#include "tokenizer.h"#include "utility.h"/** * Tokenizer encoding mode. */enum tok_mode {    BYTE,	/**< Byte encoding */    UTF8	/**< UTF-8 encoding */};/** * Tokenizer */struct tokenizer_ {    const char *text;		/**< Text start */    const char *end;		/**< Text end */    const char *pos;		/**< Text position */    word_stopper *stopper;	/**< Stopwords */    stemmer_functions *stemmer;	/**< Stemmer */    languages *languages;	/**< Languages */    int lower;			/**< If 1 then make words lowercase */    int minlen;			/**< Ignore words shorter than this */    int maxlen;			/**< Ignore words longer than this unless 0 */    const char *(*next_token) (tokenizer *tok);				/**< Tokenizing function */    char *buffer;		/**< UTF-8 buffer, length (maxlen * 6 + 1) */    //enum tok_mode tm;		/**< Encoding handling */    /* For ngram */    int state;			/**< State of tokenizer */    /* For UTF-8 mode */    const char *charset;	/**< Character encoding */    iconv_t cd;			/**< Conversion state */    int wbl;			/**< Length of wbuf */    gunichar *wbuf;		/**< UCS-4 buffer, length (maxlen + 1) */    int wbs;			/**< Current start position */    int wbe;			/**< Current end position */    char *is;			/**< Input start */    size_t il;			/**< Length length */};/** * Create the basis for a new tokenizer. * * @return The new tokenizer. 
*/
static tokenizer *tokenizer_new_default (void) {
    tokenizer *t;

    /* NOTE(review): the result of my_malloc is used unchecked; presumably the
     * wrapper aborts on allocation failure -- confirm in utility.h. */
    t = my_malloc (sizeof *t);
    t->text = NULL;
    t->end = NULL;
    t->pos = NULL;
    t->stopper = NULL;
    t->stemmer = NULL;
    t->languages = NULL;
    t->lower = 1;
    t->minlen = 2;
    t->maxlen = 30;
    t->next_token = NULL;       /* Must be set by caller */
    t->buffer = NULL;           /* Must be set by caller */
    t->state = 0;
    t->charset = NULL;
    t->cd = NULL;
    t->wbl = 0;
    t->wbuf = NULL;
    t->wbs = 0;
    t->wbe = 0;
    t->is = NULL;
    t->il = 0;
    return t;
}

/**
 * Convert more input into the free tail of the UCS-4 buffer.
 *
 * Runs iconv from the current input position into wbuf, starting at index
 * wbe, and updates wbe to the new end of the converted data.
 *
 * @param tok  tokenizer to use
 * @return 0 if wbuf holds data afterwards, -1 if the input contained an
 *         invalid sequence (EILSEQ) or wbuf is empty after the conversion.
 */
static int tokenizer_utf8_fill_wbuf (tokenizer *tok) {
    char *os = (char *)&tok->wbuf[tok->wbe];
    size_t ol = (tok->wbl - tok->wbe) * sizeof(gunichar);
    size_t rc;

    /* iconv reports failure as (size_t)-1.  Only an invalid input sequence
     * is fatal here; other failures still leave converted data behind. */
    rc = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
    if (rc == (size_t)-1 && errno == EILSEQ)
        return -1;

    /* ol is the number of output bytes still unused */
    tok->wbe = tok->wbl - (int)(ol / sizeof(gunichar));
    if (tok->wbe == 0)
        return -1;
    return 0;
}

/**
 * Get next alpha token, utf8 characters.
 *
 * A token is a run of characters for which g_unichar_isalpha is true.
 * Runs shorter than minlen or longer than maxlen characters are skipped,
 * as are tokens rejected by the stopword list.  The token is written to
 * tok->buffer as UTF-8 (lowercased when tok->lower is set) and passed
 * through the stemmer when one is configured.
 *
 * FIXME (kept from the original): after a run is rejected as too long,
 * scanning resumes right behind the examined characters, so the rest of an
 * over-long word may come back as a token of its own.
 *
 * @param tok  tokenizer to use
 * @return The next token, or NULL if there are no more tokens.  The result
 *         is tok->buffer or whatever the stemmer returns for it; tok->buffer
 *         is overwritten by the next call.
 */
static const char *tokenizer_alpha_next_token_utf8 (tokenizer *tok) {
    int i, j, start, end;
    gunichar u;

    /* Rejected tokens are retried by looping rather than by self-recursion,
     * so stack use does not grow with the number of skipped words. */
    for (;;) {
        /* Find first alpha char, refilling the buffer when it runs dry */
        for (;;) {
            if (tok->wbs >= tok->wbe) {
                tok->wbs = tok->wbe = 0;        // Maybe not be needed
                if (tokenizer_utf8_fill_wbuf (tok) != 0)
                    return NULL;
            }

            start = -1;
            for (i = tok->wbs; i < tok->wbe; i++) {
                if (g_unichar_isalpha (tok->wbuf[i])) {
                    start = i;
                    break;
                }
            }
            if (start != -1)
                break;
            if (tok->il == 0)
                return NULL;                    /* Input exhausted */
            tok->wbs = tok->wbe = 0;
        }

        /* Too little room behind start for a maximum-length token */
        if (start > tok->wbl - tok->maxlen - 2) {       // FIXME maybe not - 2
            /* Move buffer to beginning.  Source and destination overlap
             * whenever more than start elements remain, so memmove is
             * required; memcpy would be undefined behaviour. */
            memmove (tok->wbuf, &tok->wbuf[start],
                     (tok->wbe - start) * sizeof(gunichar));
            tok->wbe -= start;
            start = 0;

            if (tokenizer_utf8_fill_wbuf (tok) != 0)
                return NULL;
        }

        /* Find next non-alpha */
        for (i = start; i < tok->wbe; i++)
            if (!g_unichar_isalpha (tok->wbuf[i]))
                break;
        end = i;
        tok->wbs = end;

        /* Flush until next non-alpha */        // FIXME
        if (end - start < tok->minlen || end - start > tok->maxlen)
            continue;

        /* Convert to lowercase UTF-8 */
        j = 0;
        for (i = start; i < end; i++) {
            if (tok->lower)
                u = g_unichar_tolower (tok->wbuf[i]);
            else
                u = tok->wbuf[i];
            j += g_unichar_to_utf8 (u, &tok->buffer[j]);
        }
        tok->buffer[j] = '\0';

        /* Stopwords and stemming */
        if (tok->stopper && stopword_is (tok->stopper, tok->buffer))
            continue;
        if (tok->stemmer)
            return tok->stemmer->stem_word (tok->buffer);
        return tok->buffer;
    }
}

/**
 * Get next alpha token, utf8 characters.
 *
 * NOTE(review): variant of the tokenizer above that still prints debug
 * output with printf.  Only the first part of this function is visible in
 * this listing; it is reproduced unchanged apart from restored line breaks.
 */
static const char *tokenizer_alpha_next_token_utf8_circ (tokenizer *tok) {
    char *os;
    size_t ol;
    int i, j;
    int start, end;
    gunichar u;

    // FIXME wbs and wbe are not always set correctly
    while (1) {
        /* Fill wbuf as far as possible */
        printf ("wbuf 0<%d:%d>:", tok->wbs, tok->wbe);
        for (i = 0; i <= tok->maxlen; i++)
            printf ("%c", tok->wbuf[i]);
        printf (":\n");
        if (tok->wbs < 0) {
            printf ("0\n");
            os = (char *)tok->wbuf;
            ol = (tok->maxlen + 1) * sizeof(gunichar);
            i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
            tok->wbe = tok->maxlen - ol / sizeof(gunichar);
            if (tok->wbe < 0)
                return NULL;
            tok->wbs = 0;
        } else if (tok->wbs <= tok->wbe) {
            printf ("1\n");
            os = (char *)&tok->wbuf[tok->wbe + 1];
            ol = (tok->maxlen - tok->wbe) * sizeof(gunichar);
            i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
            tok->wbe = tok->maxlen - ol / sizeof(gunichar);
            printf ("wbuf 1<%d:%d>:", tok->wbs, tok->wbe);
            for (i = 0; i <= tok->maxlen; i++)
                printf ("%c", tok->wbuf[i]);
            printf (":\n");
            if (tok->wbs > 0 && tok->wbe == tok->maxlen) {
                printf ("2\n");
                os = (char *)tok->wbuf;
                ol = tok->wbs * sizeof(gunichar);
                i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
                tok->wbe = tok->wbs - ol / sizeof(gunichar) - 1;
                if (tok->wbe < 0)
                    tok->wbe = tok->maxlen - 1;
            }
        } else {        // (tok->wbs > tok->wbe)
            printf ("3\n");
            os = (char *)&tok->wbuf[tok->wbe + 1];
            ol = (tok->wbs - tok->wbe - 1) * sizeof(gunichar);
            i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
            tok->wbe = tok->wbs - ol / sizeof(gunichar) - 1;
        }
        printf ("wbuf 2<%d:%d>:", tok->wbs, tok->wbe);
        for (i = 0; i <= tok->maxlen; i++)
            printf ("%c", tok->wbuf[i]);
        printf (":\n");

        /* Find first alpha char */
        start = -1;
        if (tok->wbs <= tok->wbe) {
            printf ("4\n");
            for (i = tok->wbs; i <= tok->wbe; i++)
                if (g_unichar_isalpha (tok->wbuf[i])) {
                    start = i;
                    break;
                }
        } else if (tok->wbs > tok->wbe) {
            printf ("5\n");
            for (i = tok->wbs; i <= tok->maxlen; i++)
                if (g_unichar_isalpha (tok->wbuf[i])) {
                    start = i;
                    break;
                }
            if (start == -1) {
                printf ("6\n");
                for (i = 0; i <= tok->wbe; i++)
                    if (g_unichar_isalpha (tok->wbuf[i])) {
                        start = i;
                        break;
                    }
            }
        } else
            return NULL;
        printf ("start:%d:\n", start);
        tok->wbs = i + 1;
        if (tok->wbs > tok->maxlen) {
            tok->wbs = 0;
            if (tok->wbe == tok->maxlen)
                tok->wbs = -1;
        }
        if (start != -1)
            break;
        if (tok->il == 0)
            return NULL;
    }

    /* Fill wbuf as far as possible */
    if (start <= tok->wbe) {
        os = (char *)&tok->wbuf[tok->wbe + 1];
        ol = (tok->maxlen - tok->wbe) * sizeof(gunichar);
        i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
        tok->wbe = tok->maxlen - ol / sizeof(gunichar);
        if (start > 0 && tok->wbe == tok->maxlen) {
            os = (char *)tok->wbuf;
            ol = start * sizeof(gunichar);
            i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
            tok->wbe = start - ol / sizeof(gunichar) - 1;
            if (tok->wbe < 0)
                tok->wbe = tok->maxlen - 1;
        }
    } else {
        os = (char *)&tok->wbuf[tok->wbe + 1];
        ol = (start - tok->wbe - 1) * sizeof(gunichar);
        i = iconv (tok->cd, &tok->is, &tok->il, &os, &ol);
        tok->wbe = start - ol / sizeof(gunichar) - 1;
    }

    /* Find next non-alpha */
    end = -1;
    if (start < tok->wbe) {
        printf ("7\n");
        for (i = start; i <= tok->wbe; i++)
            if (!g_unichar_isalpha (tok->wbuf[i]))
                break;
        end = i;
        if (end - start < tok->minlen || end - start > tok->maxlen)
            end = -1;
    } else {    // (start > tok->wbe)
        printf ("8\n");
        for (i = start; i <= tok->maxlen; i++)
            if (!g_unichar_isalpha (tok->wbuf[i])) {
                end = i;
                break;
            }
        if (end == -1) {
            printf ("9\n");
            for (i = 0; i <= tok->wbe; i++)
                if (!g_unichar_isalpha (tok->wbuf[i])) {
                    end = i;
                    break;
                }
            if (start - end < 1 || start - end > tok->maxlen - tok->minlen + 1)
                end = -1;
        }
    }
    printf ("end:%d:\n", end);
    if (end == -1) {
        /* Flush until next non-alpha */
        printf ("to long\n");
        return tokenizer_alpha_next_token_utf8_circ (tok);
    }
    tok->wbs = end;
    if (end)
        ;

    /* Convert to lowercase utf8 */
    if (start < end) {
        printf ("10\n");
        j = 0;
        for (i = start; i < end; i++) {
            if (tok->lower)
                u = g_unichar_tolower (tok->wbuf[i]);
            else
                u = tok->wbuf[i];
            j += g_unichar_to_utf8 (u, &tok->buffer[j]);
        }
        tok->buffer[j] = '\0';
    } else {
        printf ("11\n");
        j = 0;
        for (i = start; i <= tok->maxlen; i++) {
            if (tok->lower)
12 下一页
💿 文件大小 472 K
👤 上传用户 huanzhudev
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#增量 #分类算法 #监控
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -