⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dice.c

📁 卡内基梅隆大学MaCallum开发的文本分类系统
💻 C
字号:
/*
 * dice.c -- generate synthetic documents by sampling words from a
 * probability distribution.
 *
 * The distribution is read from DISTFILE as whitespace-separated
 * "weight word" pairs, for example:
 *
 *     0.022 foo
 *     0.015 bar
 *     0.001 baz
 *     ...
 *
 * The weights do not need to sum to 1; sampling is done against the
 * cumulative total that was read.
 *
 * NDOCS files are written into DIRNAME (created if it does not exist),
 * each named PREFIX followed by a five-digit document index.  Every file
 * holds NWORDS_PER_DOC words, one per line, followed by one empty line.
 * Progress information and errors go to stderr.
 *
 * rand() is never seeded, so repeated runs with the same arguments draw
 * the same pseudo-random sequence.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/* Defaults for command-line arguments. */

/* The number of documents to generate */
int ndocs = 100;

/* The number of words per document */
int nwords_per_doc = 20;

/* Prefix to each filename created */
const char *prefix = NULL;

/* Directory into which to place the documents. */
const char *dirname = NULL;

/* Number of distinct noise words (-v). */
int noise_vocab_size = 0;

/* Probability of emitting a noise word instead of a sampled one (-f). */
float noise_vocab_fraction = 0;

/* maximum number of words */
#define MAX	99999

/* word[i].P is the CUMULATIVE weight of words 0..i; word[i].w is the
   heap-allocated spelling of word i. */
struct {
  float P;
  char *w;
} word[MAX];

/* Print the command-line synopsis to stderr. */
void
print_usage (const char *argv[])
{
  fprintf (stderr, "usage: %s "
           "[-d dirname] [-p prefix] [-l doclen] [-n ndocs]\n"
           "[-v noisevocabsize] [-f noisevocabfrac] distfile\n"
           " Will output NDOCS files each of length DOCLEN with"
           " filenames having \n"
           " PREFIX to directory DIRNAME.\n"
           " With probability NOISEVOCABFRAC, instead of picking"
           " a word from the\n"
           " distribution specified by DISTFILE, a word will be chosen"
           " uniformly \n"
           " from one of NOISEVOCABSIZE noise-words\n"
           , argv[0]);
}

/* Return the argument that follows option argv[*argi], advancing *argi
   past it.  Exits with a usage message when the option is the last
   element of argv, instead of handing a NULL pointer to the caller. */
static const char *
option_arg (int argc, const char *argv[], int *argi)
{
  if (*argi + 1 >= argc)
    {
      fprintf (stderr, "%s: option `%s' requires an argument\n",
               argv[0], argv[*argi]);
      print_usage (argv);
      exit (-1);
    }
  *argi += 1;
  return argv[*argi];
}

/* Exit with a diagnostic naming the option whose value TEXT is invalid. */
static void
bad_option_value (const char *argv[], const char *opt, const char *text)
{
  fprintf (stderr, "%s: invalid value `%s' for option `%s'\n",
           argv[0], text, opt);
  print_usage (argv);
  exit (-1);
}

/* Parse TEXT as a decimal int; exits on malformed or out-of-range input. */
static int
parse_int (const char *argv[], const char *opt, const char *text)
{
  char *end;
  long value;

  errno = 0;
  value = strtol (text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE
      || value < INT_MIN || value > INT_MAX)
    bad_option_value (argv, opt, text);
  return (int) value;
}

/* Parse TEXT as a floating-point number; exits on malformed input. */
static float
parse_float (const char *argv[], const char *opt, const char *text)
{
  char *end;
  double value;

  errno = 0;
  value = strtod (text, &end);
  if (end == text || *end != '\0' || errno == ERANGE)
    bad_option_value (argv, opt, text);
  return (float) value;
}

/* Write one noise word: "noise" followed by the decimal digits of a
   random index in [0, noise_vocab_size), least significant digit first,
   each digit d spelled as the letter 'a' + d.  Index 0 yields the bare
   word "noise".  Requires noise_vocab_size > 0. */
static void
emit_noise_word (FILE *fp)
{
  int wn = rand () % noise_vocab_size;

  fputs ("noise", fp);
  while (wn)
    {
      fputc ('a' + wn % 10, fp);
      wn /= 10;
    }
  fputc ('\n', fp);
}

/* Write one word drawn from the first N entries of the cumulative table.
   Requires N > 0. */
static void
emit_sampled_word (FILE *fp, int n)
{
  double r = rand () / (double) RAND_MAX * word[n - 1].P;
  int k = 0;

  /* The k < n - 1 bound keeps the scan inside the table even if the
     cumulative weights are not monotone (e.g. negative weights). */
  while (k < n - 1 && word[k].P < r)
    k++;
  fprintf (fp, "%s\n", word[k].w);
}

/* Read "weight word" pairs from DISTFILE into word[], storing cumulative
   weights.  Returns the number of words read; exits on any error. */
static int
read_distribution (const char *distfile)
{
  FILE *fp;
  char s[256];
  float x;
  double cumulative = 0.0;
  int i = 0;

  fp = fopen (distfile, "r");
  if (fp == NULL)
    {
      fprintf (stderr, "Error opening distribution file `%s'\n", distfile);
      perror ("dice");
      exit (-1);
    }

  /* %255s bounds the token to the size of s, leaving room for the NUL. */
  while (i < MAX && fscanf (fp, "%f %255s", &x, s) == 2)
    {
      size_t len = strlen (s) + 1;

      cumulative += x;
      word[i].P = (float) cumulative;
      word[i].w = malloc (len);
      if (word[i].w == NULL)
        {
          fprintf (stderr, "Error: out of memory reading `%s'\n", distfile);
          exit (-1);
        }
      memcpy (word[i].w, s, len);
      i++;
    }
  fclose (fp);

  if (i >= MAX)
    {
      fprintf (stderr, "Error: number of words exceeds %d\n", MAX);
      exit (-1);
    }
  if (i == 0)
    {
      fprintf (stderr, "Error: no words read from `%s'\n", distfile);
      exit (-1);
    }
  return i;
}

/*
 * Entry point.  Parses the options, loads the distribution, creates the
 * output directory if needed, and writes the documents.
 * Exits with status 0 on success and -1 on any error.
 */
int
main (int argc, const char *argv[])
{
  int argi, N, i;
  FILE *fp;
  char docname[1024];
  const char *distfile;
  const char *opt;
  int e;

  for (argi = 1; argi < argc; argi++)
    {
      if (argv[argi][0] != '-')
        break;
      opt = argv[argi];
      switch (opt[1])
        {
        case 'd':
          dirname = option_arg (argc, argv, &argi);
          break;
        case 'p':
          prefix = option_arg (argc, argv, &argi);
          break;
        case 'l':
          nwords_per_doc = parse_int (argv, opt,
                                      option_arg (argc, argv, &argi));
          break;
        case 'n':
          ndocs = parse_int (argv, opt, option_arg (argc, argv, &argi));
          break;
        case 'v':
          noise_vocab_size = parse_int (argv, opt,
                                        option_arg (argc, argv, &argi));
          break;
        case 'f':
          noise_vocab_fraction = parse_float (argv, opt,
                                              option_arg (argc, argv, &argi));
          break;
        case '?':
        case 'h':
          print_usage (argv);
          exit (0);
        default:
          fprintf (stderr, "%s: unrecognized option `%s'\n",
                   argv[0], argv[argi]);
          print_usage (argv);
          exit (-1);
        }
    }

  if (argi >= argc)
    {
      fprintf (stderr, "%s: missing distribution file\n", argv[0]);
      print_usage (argv);
      exit (-1);
    }
  distfile = argv[argi];

  /* Noise words are drawn with rand () % noise_vocab_size, so a noise
     fraction without a positive vocabulary size cannot be honoured. */
  if (noise_vocab_fraction && noise_vocab_size <= 0)
    {
      fprintf (stderr, "%s: option -f requires a positive -v value\n",
               argv[0]);
      exit (-1);
    }

  /* -d is optional; without it the documents go to the current directory. */
  if (dirname == NULL)
    dirname = ".";

  if (dirname[0] == '/')
    fprintf (stderr, "Output to %s\n", dirname);
  else
    fprintf (stderr, "Output to ./%s\n", dirname);

  /* read in prob. distribution */
  N = read_distribution (distfile);
  fprintf (stderr, "Cumulative Prob.=%f\n", word[N - 1].P);

  /* Create the directory if necessary */
  e = mkdir (dirname, 0777);
  if (e != 0 && errno != EEXIST)
    {
      fprintf (stderr, "Error creating directory `%s'\n", dirname);
      perror ("dice");
      exit (-1);
    }

  /* generate documents */
  for (i = 0; i < ndocs; i++)
    {
      int j;
      int len;

      if (prefix)
        len = snprintf (docname, sizeof docname, "%s/%s%05d",
                        dirname, prefix, i);
      else
        len = snprintf (docname, sizeof docname, "%s/%05d", dirname, i);
      if (len < 0 || (size_t) len >= sizeof docname)
        {
          fprintf (stderr, "Error: output file name too long\n");
          exit (-1);
        }

      fp = fopen (docname, "w");
      if (fp == NULL)
        {
          fprintf (stderr, "Error creating file `%s'\n", docname);
          perror ("dice");
          exit (-1);
        }

      /* each with NWORDS_PER_DOC words */
      for (j = 0; j < nwords_per_doc; j++)
        {
          /* A noise word is chosen WITH probability noise_vocab_fraction,
             as the usage text states. */
          if (noise_vocab_fraction
              && rand () / (double) RAND_MAX < noise_vocab_fraction)
            emit_noise_word (fp);
          else
            emit_sampled_word (fp, N);
        }
      fprintf (fp, "\n");

      /* fclose flushes, so a failure here means the document is incomplete. */
      if (fclose (fp) != 0)
        {
          fprintf (stderr, "Error writing file `%s'\n", docname);
          perror ("dice");
          exit (-1);
        }
    }

  return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -