📄 reply.c

📁 使用具有增量学习能力的监督式学习方法，包含几种不同的分类算法。
💻 C
字号:
/* Copyright (C) 2001-2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Reply classifier. * * Classifies replies to msgs as the same class as the original msgs. * Replies are usually written within a few days, so the database should be * limited to a time frame of a few days. * A double linked list is used to record the age of the threads. A thread * is as old as its latest message. * * Replies are identified by Subject lines beginning with "Re:". * A possible future extension is to use the In-Reply-To and References fields. * * @author  Mikael Ylikoski * @date    2001-2002 */#include <ctype.h>#include <stdlib.h>#include <string.h>#include "doc_classifier.h"#include "reply.h"#include "utility.h"typedef struct rply_ rply;/** * Reply classifier. */struct reply_ {    GHashTable *ht;	/**< Hash table */    int size;		/**< Maximum size */    int nor;		/**< Number Of Replies */    rply *first;	/**< First (oldest) reply */    rply *last;		/**< Last (newest) reply */};/** * Reply. */struct rply_ {    char *subject;	/**< Mail subject */    int class;		/**< Class */    rply *prev;    rply *next;};/** * Create a new Reply classifier. * * @param size  maximum number of addresses to remember * @return The new classifier. 
*/reply *reply_new (int size) {    reply *rp;    if (size < 1)	return NULL;    rp = my_malloc (sizeof(reply));    rp->ht = g_hash_table_new (g_str_hash, g_str_equal);    if (!rp->ht) {	free (rp);	return NULL;    }    rp->size = size;    rp->nor = 0;    rp->first = NULL;    rp->last = NULL;    return rp;}void *reply_load (FILE *f) {    char *str;    int i, j, k;    reply *rdb;    fscanf (f, "size %d\n", &i);    rdb = reply_new (i);    fscanf (f, "nor %d\n", &j);    for (i = 0; i < j; i++) {	fscanf (f, "%d:", &k);	str = my_malloc (k + 1);	fread (str,  sizeof(char), k, f);	str[k] = '\0';	fscanf (f, "=%d;", &k);	reply_learn (rdb, str, k);	// FIXME	free (str);    }    fscanf (f, "\n");    return rdb;}intreply_save (FILE *f, void *db) {    reply *rdb;    rply *rp;    rdb = (reply *)db;    fprintf (f, "size %d\n", rdb->size);    fprintf (f, "nor %d\n", rdb->nor);    for (rp = rdb->first; rp; rp = rp->next)	fprintf (f, "%d:%s=%d;", strlen (rp->subject), rp->subject, rp->class);    fprintf (f, "\n");    return 0;}/** * Remove prefixed 'Re:' and white-space from a subject line. * * @param sub  subject line * @return A new subject line without any prefixed 'Re:' or white-space. */static const char *reply_trim_subject (const char *sub) {    int i, j;    j = strlen (sub);    for (i = 0; i < j;) {	while (sub[i] == ' ' || sub[i] == '\t')	    i++;	if ((sub[i] == 'R' || sub[i] == 'r') &&	    (sub[i + 1] == 'e' || sub[i + 1] == 'E') &&	    (sub[i + 2] == ':'))	    i += 3;	else	    break;    }    return &sub[i];}/** * "Add" an reply. 
* * @param rp     reply classifier * @param sub    subject line * @param class  class */intreply_learn (reply *rp, const char *sub, int class) {    char *s;    rply *r, *p;    s = (char *)reply_trim_subject (sub);    r = g_hash_table_lookup (rp->ht, s);    if (!r) {	r = my_malloc (sizeof(rply));	r->subject = my_strdup (s);	r->prev = NULL;	r->next = NULL;	if (rp->nor == rp->size) {	    p = rp->first;	    rp->first = p->next;	    g_hash_table_remove (rp->ht, p->subject);	    free (p->subject);	    free (p);	} else	    rp->nor++;	g_hash_table_insert (rp->ht, r->subject, r);    }    r->class = class;    /* put reply last in list */    if (rp->first == NULL) {		/* list is empty */	rp->first = r;	rp->last = r;    } else if (r != rp->last) {		/* r is not last in list */	if (r == rp->first) {		/* r is first in list */	    rp->first = r->next;	    r->next->prev = NULL;	} else if (r->next != NULL) {	/* r is inside list */	    r->prev->next = r->next;	    r->next->prev = r->prev;	}				/* else r is not in list */	r->prev = rp->last;	r->next = NULL;	rp->last->next = r;	rp->last = r;    }					/* else r is already last in list */    return 0;}/** * Classify a reply. 
* * @param rp   reply classifier * @param sub  subject line */intreply_classify (reply *rp, const char *sub) {    rply *r;    char *s;    s = (char *)reply_trim_subject (sub);    r = g_hash_table_lookup (rp->ht, s);    if (r)	return r->class;    return -1;}void *reply_new_doc (const char *opts) {    int n;    n = 100;    if (opts) {	n = get_opt_int (opts, "n=");	if (n < 1)	    n = 100;    }    return reply_new (n);}intreply_learn_doc (void *db, void *data, int class) {    const char *sub;    document *doc;    reply *rp;    rp = (reply *)db;    doc = (document *)data;    sub = document_get_subject (doc);    if (!sub)	return -1;    return reply_learn (rp, sub, class);}int *reply_classify_doc_rank (void *db, void *data) {    const char *sub;    int *il;    document *doc;    reply *rp;    rp = (reply *)db;    doc = (document *)data;    sub = document_get_subject (doc);    if (!sub)	return NULL;    il = my_malloc (2 * sizeof(int));    il[0] = reply_classify (rp, sub);    il[1] = -1;    return il;}intreply_classify_doc_top (void *db, void *data) {    const char *sub;    document *doc;    reply *rp;    rp = (reply *)db;    doc = (document *)data;    sub = document_get_subject (doc);    if (!sub)	return 0;    return reply_classify (rp, sub);}/** * Keep cygwin happy. */intmain (void) {    return 0;}const char *my_doc_classifier_name = "Reply";const doc_classifier_functions my_functions = {    .new = reply_new_doc,    .load = reply_load,    .save = reply_save,    .learn = reply_learn_doc,    .classify_rank = reply_classify_doc_rank,    .classify_top = reply_classify_doc_top};
💿 文件大小 472 K
👤 上传用户 huanzhudev
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#增量 #分类算法 #监控
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -