⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 collection.c

📁 使用具有增量学习的监督式学习方法，包含几种不同的分类算法。
💻 C
字号:
/* Copyright (C) 2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * Document traverser. * Goes through a collection of documents in some specified order. * * @author  Mikael Ylikoski * @date    2002 */#include <stdio.h>#include <stdlib.h>#include <time.h>#include "bitarray.h"#include "box.h"#include "collection.h"#include "document.h"#include "utility.h"/** * Mailbox training information. */typedef struct {    box *b;		/**< Box */    int nom;		/**< Number Of Messages */    int notm;		/**< Number Of Trained Messages */    bitarray *trb;	/**< Bitarray indicating trained messages */    document *cache;	/**< Cache of last referenced message */    int cache_num;	/**< Number of cached message, or -1 if none */    int drop;		/**< Cache access counter */} box_info;/** * List entry for time order. */typedef struct timee_ timee;struct timee_ {    int box;		/**< Mailbox */    int msg;		/**< Message number */    time_t time;	/**< Message sent time */    timee *next;	/**< Next timee in line */};box_info *boxes;	/**< Mailboxes */int nob;		/**< Number Of mailboxes */int nom;		/**< Number Of Messages in total */int notm;		/**< Number Of Trained Messages in total */enum document_type docs_type;	/**< Type of documents */enum csm csm;		/**< */enum msm msm;		/**< */enum trm trm;		/**< */double tt;		/**< Percentage of msgs to use for training */timee *time_first;	/**< First entry in list of times */int box_no;		/**< Current box number */int msg_no;		/**< Current message number *//** * Put document in box's cache. */static voidput_in_cache (int b, int m) {    char *src;    if (boxes[b].cache_num != m) {	if (boxes[b].cache)	    document_free (boxes[b].cache);	src = box_get_source (boxes[b].b, m);	boxes[b].cache = document_new (src, docs_type);	// FIXME check if NULL	boxes[b].cache_num = m;	boxes[b].drop = 0;    }}/** * Get time of a document. 
*/static inline time_tget_time (int b, int m) {    put_in_cache (b, m);    return document_get_time (boxes[b].cache);}/** * Insert element in time list. */static inttime_insert (timee *tim) {    timee *ti, *te;    if (tim->msg >= boxes[tim->box].nom) {	free (tim);	return -1;    }    tim->time = get_time (tim->box, tim->msg);    if (time_first == NULL) {	tim->next = time_first;	time_first = tim;    } else if (tim->time < time_first->time) {	tim->next = time_first;	time_first = tim;    } else {	te = time_first;	for (ti = te->next; ti != NULL; ti = ti->next) {	    if (tim->time < ti->time) {		tim->next = ti;		te->next = tim;		return 0;	    }	    te = ti;	}	tim->next = NULL;	te->next = tim;    }    return 0;}/** * Get next box in time list, and update time list. */static inttime_next (void) {    int i;    timee *tim;    if (!time_first)	return -1;    tim = time_first;    time_first = time_first->next;    i = tim->box;    tim->msg++;    time_insert (tim);    return i;}/** * Test whether a class is finished training. * * @param cls  class to check * @return Nonzero if the class is finished training, zero otherwise. */static intclass_finished (int cls) {    if (cls < 0 || cls >= nob)	return 1;    switch (trm) {    case ALL:	return boxes[cls].notm == boxes[cls].nom;    case ALL_INTERLEAVED:	return boxes[cls].notm == boxes[cls].nom;    case CLASS_PERCENTAGE:	return boxes[cls].notm >= boxes[cls].nom * tt;    case TOTAL_PERCENTAGE:	return boxes[cls].notm == boxes[cls].nom;    }    return 0;}/** * Test whether training is finished. * * @return Nonzero if training is finished, zero otherwise. */static intall_finished (void) {    int i;    switch (trm) {    case ALL:	return notm == nom;    case ALL_INTERLEAVED:	return notm == nom;    case CLASS_PERCENTAGE:	for (i = 0; i < nob; i++)	    if (!class_finished (i))		return 0;	return 1;	/* 0 */    case TOTAL_PERCENTAGE:	return notm >= nom * tt;    }    return 0;}/** * Change to next training class. 
* * @param inc  indicates whether to force a change of class. */static voidnext_class (int inc) {    switch (csm) {    case LINEAR:	if (inc || box_no == -1)	    box_no++;	break;    case CROSS:	box_no++;	if (box_no == nob)	    box_no = 0;	break;    case RANDOM:	box_no = rand () * (double)nob / (RAND_MAX + 1.0);	break;    case TIME:	box_no = time_next ();	break;    }}/** * Traverses the training messages in all mailclasses. * * @return Nonzero if there is another training message, zero otherwise. */intcollection_next_document (void) {    if (all_finished ())	return 0;    for (next_class (0); class_finished (box_no); next_class (1))	;    switch (msm) {    case LINEAR_SEQ:	msg_no = boxes[box_no].notm++;	notm++;	break;    case RANDOM_SEQ:	msg_no = bitarray_random_zero (boxes[box_no].trb);	bitarray_set_bit (boxes[box_no].trb, msg_no);	boxes[box_no].notm++;	notm++;	break;    }    return 1;}/** * Traverses the test messages in all mailclasses. * The test messages are all untrained messages, except in case where * all messages should be both trained and tested. * The message sequence mode is ignored, messages are always taken in * linear order. * * @return Nonzero if there is another message, zero otherwise. */intcollection_next_test_document (void) {    if (box_no >= nob)	return 0;    if (trm == ALL) {	if (boxes[box_no].notm == 0) {	    box_no++;	    return collection_next_test_document ();	}	msg_no = boxes[box_no].notm--;    } else {	if (boxes[box_no].notm == boxes[box_no].nom) {	    box_no++;	    return collection_next_test_document ();	}	switch (msm) {	case LINEAR_SEQ:	    msg_no = boxes[box_no].notm++;	    break;	case RANDOM_SEQ:	    msg_no = bitarray_first_zero (boxes[box_no].trb);	    bitarray_set_bit (boxes[box_no].trb, msg_no);	    boxes[box_no].notm++;	    break;	}    }    return 1;}/** * Initialize collection. 
*/intcollection_init (enum csm c, enum msm m, enum trm t, enum document_type dt) {    boxes = my_malloc (sizeof(box_info) * 128);    nob = 0;    nom = 0;    notm = 0;    docs_type = dt;    csm = c;    msm = m;    trm = t;    tt = 0.7;    time_first = NULL;    box_no = -1;    msg_no = -1;    return 0;}/** * Add box to collection. */intcollection_add_box (box *b) {    timee *tim;    if (nob == 128)	return -1;	// FIXME realloc instead    boxes[nob].b = b;    boxes[nob].nom = box_get_nod (b);    boxes[nob].notm = 0;    boxes[nob].cache = NULL;    boxes[nob].cache_num = -1;    if (csm == TIME) {	tim = my_malloc (sizeof(timee));	tim->box = nob;	tim->msg = 0;	time_insert (tim);    }    if (msm == RANDOM_SEQ)	boxes[nob].trb = bitarray_new (boxes[nob].nom);    nom += boxes[nob].nom;    nob++;    return 0;}/** * Get total number of messages in collection. */intcollection_get_nod (void) {    return nom;}/** * Get total number of messages in collection. */intcollection_get_notd (void) {    return notm;}/** * Get current document from collection. */document *collection_get_document (void) {    put_in_cache (box_no, msg_no);    return boxes[box_no].cache;}/** * Drop a document. */voidcollection_drop_document (document *doc) {    if (boxes[box_no].cache == doc) {	if (++boxes[box_no].drop == 2) {	    document_free (boxes[box_no].cache);	    boxes[box_no].cache = NULL;	    boxes[box_no].cache_num = -1;	}    }	// else assert (0);}/** * Get current class number. */intcollection_get_class (void) {    return box_no;}/** * Get current message number. */intcollection_get_msg (void) {    return msg_no;}/** * Get number of messages in class. */intcollection_get_class_nod (int class) {    return boxes[class].nom;}/** * Get number of trained messages in class. */intcollection_get_class_notd (int class) {    return boxes[class].notm;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -