⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 box.c

📁 使用具有增量学习的监督式学习方法。包括几个不同的分类算法。
💻 C
字号:
/* Copyright (C) 2002  Mikael Ylikoski * See the accompanying file "README" for the full copyright notice *//** * @file * Document source input. * Functions to read document sources from files. * * @author  Mikael Ylikoski * @date    2002 */#include <dirent.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/stat.h>#include "box.h"#include "utility.h"/** * Box type. */enum input_type {    DIRECTORY,		/**< Directory with files (ie mh-format) */    MBOX		/**< One file with emails (ie mbox-format) */};/** * Pointer to document source. */typedef struct {    union {	char *name;	/**< File name (for DIRECTORY) */	int offset;	/**< Offset into file in bytes (for MBOX) */    } u;    int len;		/**< Length of source in bytes */} document_source;/** * Mailbox. */struct box_ {    enum input_type it;		/**< Input type */    char *name;			/**< File/directory name */    int len;			/**< Length of name string */    FILE *file;			/**< File pointer */    int nod;			/**< Number Of Documents */    document_source *docs;	/**< Documents */};/** * Comparison function for mail file names. * Shorter names are less than longer. * Names of equal length are compared alphabetically. */static intnamecomp (const void *e1, const void *e2) {    char *a1, *a2;    int l1, l2;    a1 = ((document_source *)e1)->u.name;    a2 = ((document_source *)e2)->u.name;    l1 = strlen (a1);    l2 = strlen (a2);    if (l1 < l2)	return -1;    if (l1 > l2)	return 1;    return strcmp (a1, a2);}/** * Create new box. * * @param file  The name of the directory or file to create a box from. * @return The new box. 
*/box *box_new (const char *file) {    int i, j, k, len, size;    struct stat sstat;    struct dirent *dp;    box *b;    DIR *dirp;    char buf[20];    i = stat (file, &sstat);    if (i)	return NULL;    b = my_malloc (sizeof(box));    b->name = my_strdup (file);    b->len = strlen (file);    if (S_ISDIR (sstat.st_mode))	b->it = DIRECTORY;    else	b->it = MBOX;    size = 10;    b->docs = my_malloc (size * sizeof(document_source));    switch (b->it) {    case DIRECTORY:	dirp = opendir (b->name);	if (!dirp) {	    free (b->docs);	    free (b->name);	    free (b);	    return NULL;	}	len = 10;	b->name = my_realloc (b->name, b->len + len + 2);	b->name[b->len++] = '/';	for (b->nod = 0; (dp = readdir (dirp));) {	    if (dp->d_name[0] == '.')		continue;	    i = strlen (dp->d_name);	    if (i > len) {		len = i;		b->name = my_realloc (b->name, b->len + len + 1);	    }	    strcpy (&b->name[b->len], dp->d_name);	    j = stat (b->name, &sstat);	    if (j)		continue;	    if (S_ISDIR (sstat.st_mode))		continue;	    if (b->nod == size) {		size += 10;		b->docs = my_realloc (b->docs, size * sizeof(document_source));	    }	    b->docs[b->nod].len = sstat.st_size;	    b->docs[b->nod].u.name = my_strdup (dp->d_name);	    b->nod++;	}	closedir (dirp);	qsort (b->docs, b->nod, sizeof(document_source), namecomp);	break;    case MBOX:	b->file = fopen (b->name, "r");	b->nod = 0;	len = 1;	// indicates that next line is possible start	for (j = 0; 1; j += i) {	    // Get (beginning of) line	    for (i = 0; i < 19;) {		k = getc (b->file);		if (k == EOF)		    break;		buf[i++] = k;		if (k == '\n')		    break;	    }	    buf[i] = '\0';	    if (i == 19 && k != '\n')		// Flush line		while ((k = getc (b->file)) != EOF) {		    i++;		    if (k == '\n')			break;		}	    if (k == EOF)		break;	    // Check line for pine folder data	    if (b->nod == 0 && i > 17 &&		!strncmp (buf, "From MAILER-DAEMON", 18))		continue;	    // Check line FIXME shouldn't need strchr (buf, '@')	    if (len && !strncmp (buf, "From ", 
5) && strchr (buf, '@')) {		if (b->nod == size) {		    size += 10;		    b->docs = my_realloc (b->docs,					  size * sizeof(document_source));		}		if (b->nod > 0)		    b->docs[b->nod - 1].len = j - b->docs[b->nod - 1].u.offset;		b->docs[b->nod].u.offset = j + i;		b->nod++;	    }	    len = (i == 1);	}	if (b->nod > 0)	    b->docs[b->nod - 1].len = j + i - b->docs[b->nod - 1].u.offset;	break;    }    return b;}/** * Free memory used by box. * * @param b  box to free */voidbox_free (box *b) {    int i;    switch (b->it) {    case DIRECTORY:	for (i = 0; i < b->nod; i++)	    free (b->docs[i].u.name);	break;    case MBOX:	fclose (b->file);	break;    }    free (b->name);    free (b->docs);    free (b);}/** * Get number of documents in box. * * @param b  box to get nod from * @return The number of documents. */intbox_get_nod (box *b) {    return b->nod;}/** * Get source of document in box. * * @param b    box of document * @param num  number of document * @return The document source. */char *box_get_source (box *b, int num) {    char *src;    int i, j;    FILE *file = NULL;    if (num >= b->nod)	return NULL;    src = my_malloc (b->docs[num].len + 1);    switch (b->it) {    case DIRECTORY:	strcpy (&b->name[b->len], b->docs[num].u.name);	file = fopen (b->name, "r");	break;    case MBOX:	file = b->file;	fseek (file, b->docs[num].u.offset, SEEK_SET);	break;    }    for (i = 0; i < b->docs[num].len; i++) {	j = getc (file);	if (j == EOF)	    break;	src[i] = j;    }    src[i] = '\0';    if (b->it == DIRECTORY)	fclose (file);    return src;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -