📄 collection.c
字号:
/* Copyright (C) 2002 Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * Document traverser. * Goes through a collection of documents in some specified order. * * @author Mikael Ylikoski * @date 2002 */#include <stdio.h>#include <stdlib.h>#include <time.h>#include "bitarray.h"#include "box.h"#include "collection.h"#include "document.h"#include "utility.h"/** * Mailbox training information. */typedef struct { box *b; /**< Box */ int nom; /**< Number Of Messages */ int notm; /**< Number Of Trained Messages */ bitarray *trb; /**< Bitarray indicating trained messages */ document *cache; /**< Cache of last referenced message */ int cache_num; /**< Number of cached message, or -1 if none */ int drop; /**< Cache access counter */} box_info;/** * List entry for time order. */typedef struct timee_ timee;struct timee_ { int box; /**< Mailbox */ int msg; /**< Message number */ time_t time; /**< Message sent time */ timee *next; /**< Next timee in line */};box_info *boxes; /**< Mailboxes */int nob; /**< Number Of mailboxes */int nom; /**< Number Of Messages in total */int notm; /**< Number Of Trained Messages in total */enum document_type docs_type; /**< Type of documents */enum csm csm; /**< */enum msm msm; /**< */enum trm trm; /**< */double tt; /**< Percentage of msgs to use for training */timee *time_first; /**< First entry in list of times */int box_no; /**< Current box number */int msg_no; /**< Current message number *//** * Put document in box's cache. */static voidput_in_cache (int b, int m) { char *src; if (boxes[b].cache_num != m) { if (boxes[b].cache) document_free (boxes[b].cache); src = box_get_source (boxes[b].b, m); boxes[b].cache = document_new (src, docs_type); // FIXME check if NULL boxes[b].cache_num = m; boxes[b].drop = 0; }}/** * Get time of a document. */static inline time_tget_time (int b, int m) { put_in_cache (b, m); return document_get_time (boxes[b].cache);}/** * Insert element in time list. */static inttime_insert (timee *tim) { timee *ti, *te; if (tim->msg >= boxes[tim->box].nom) { free (tim); return -1; } tim->time = get_time (tim->box, tim->msg); if (time_first == NULL) { tim->next = time_first; time_first = tim; } else if (tim->time < time_first->time) { tim->next = time_first; time_first = tim; } else { te = time_first; for (ti = te->next; ti != NULL; ti = ti->next) { if (tim->time < ti->time) { tim->next = ti; te->next = tim; return 0; } te = ti; } tim->next = NULL; te->next = tim; } return 0;}/** * Get next box in time list, and update time list. */static inttime_next (void) { int i; timee *tim; if (!time_first) return -1; tim = time_first; time_first = time_first->next; i = tim->box; tim->msg++; time_insert (tim); return i;}/** * Test whether a class is finished training. * * @param cls class to check * @return Nonzero if the class is finished training, zero otherwise. */static intclass_finished (int cls) { if (cls < 0 || cls >= nob) return 1; switch (trm) { case ALL: return boxes[cls].notm == boxes[cls].nom; case ALL_INTERLEAVED: return boxes[cls].notm == boxes[cls].nom; case CLASS_PERCENTAGE: return boxes[cls].notm >= boxes[cls].nom * tt; case TOTAL_PERCENTAGE: return boxes[cls].notm == boxes[cls].nom; } return 0;}/** * Test whether training is finished. * * @return Nonzero if training is finished, zero otherwise. */static intall_finished (void) { int i; switch (trm) { case ALL: return notm == nom; case ALL_INTERLEAVED: return notm == nom; case CLASS_PERCENTAGE: for (i = 0; i < nob; i++) if (!class_finished (i)) return 0; return 1; /* 0 */ case TOTAL_PERCENTAGE: return notm >= nom * tt; } return 0;}/** * Change to next training class. * * @param inc indicates whether to force a change of class. */static voidnext_class (int inc) { switch (csm) { case LINEAR: if (inc || box_no == -1) box_no++; break; case CROSS: box_no++; if (box_no == nob) box_no = 0; break; case RANDOM: box_no = rand () * (double)nob / (RAND_MAX + 1.0); break; case TIME: box_no = time_next (); break; }}/** * Traverses the training messages in all mailclasses. * * @return Nonzero if there is another training message, zero otherwise. */intcollection_next_document (void) { if (all_finished ()) return 0; for (next_class (0); class_finished (box_no); next_class (1)) ; switch (msm) { case LINEAR_SEQ: msg_no = boxes[box_no].notm++; notm++; break; case RANDOM_SEQ: msg_no = bitarray_random_zero (boxes[box_no].trb); bitarray_set_bit (boxes[box_no].trb, msg_no); boxes[box_no].notm++; notm++; break; } return 1;}/** * Traverses the test messages in all mailclasses. * The test messages are all untrained messages, except in case where * all messages should be both trained and tested. * The message sequence mode is ignored, messages are always taken in * linear order. * * @return Nonzero if there is another message, zero otherwise. */intcollection_next_test_document (void) { if (box_no >= nob) return 0; if (trm == ALL) { if (boxes[box_no].notm == 0) { box_no++; return collection_next_test_document (); } msg_no = boxes[box_no].notm--; } else { if (boxes[box_no].notm == boxes[box_no].nom) { box_no++; return collection_next_test_document (); } switch (msm) { case LINEAR_SEQ: msg_no = boxes[box_no].notm++; break; case RANDOM_SEQ: msg_no = bitarray_first_zero (boxes[box_no].trb); bitarray_set_bit (boxes[box_no].trb, msg_no); boxes[box_no].notm++; break; } } return 1;}/** * Initialize collection. */intcollection_init (enum csm c, enum msm m, enum trm t, enum document_type dt) { boxes = my_malloc (sizeof(box_info) * 128); nob = 0; nom = 0; notm = 0; docs_type = dt; csm = c; msm = m; trm = t; tt = 0.7; time_first = NULL; box_no = -1; msg_no = -1; return 0;}/** * Add box to collection. */intcollection_add_box (box *b) { timee *tim; if (nob == 128) return -1; // FIXME realloc instead boxes[nob].b = b; boxes[nob].nom = box_get_nod (b); boxes[nob].notm = 0; boxes[nob].cache = NULL; boxes[nob].cache_num = -1; if (csm == TIME) { tim = my_malloc (sizeof(timee)); tim->box = nob; tim->msg = 0; time_insert (tim); } if (msm == RANDOM_SEQ) boxes[nob].trb = bitarray_new (boxes[nob].nom); nom += boxes[nob].nom; nob++; return 0;}/** * Get total number of messages in collection. */intcollection_get_nod (void) { return nom;}/** * Get total number of messages in collection. */intcollection_get_notd (void) { return notm;}/** * Get current document from collection. */document *collection_get_document (void) { put_in_cache (box_no, msg_no); return boxes[box_no].cache;}/** * Drop a document. */voidcollection_drop_document (document *doc) { if (boxes[box_no].cache == doc) { if (++boxes[box_no].drop == 2) { document_free (boxes[box_no].cache); boxes[box_no].cache = NULL; boxes[box_no].cache_num = -1; } } // else assert (0);}/** * Get current class number. */intcollection_get_class (void) { return box_no;}/** * Get current message number. */intcollection_get_msg (void) { return msg_no;}/** * Get number of messages in class. */intcollection_get_class_nod (int class) { return boxes[class].nom;}/** * Get number of trained messages in class. */intcollection_get_class_notd (int class) { return boxes[class].notm;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -