📄 dicefactory.c
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <assert.h>#include <sys/stat.h>#include <sys/types.h>#include <fcntl.h>#include <unistd.h>#include <errno.h>#include <bow/libbow.h>/* Defaults for command-line arguments. *//* The number of documents to generate */int ndocs = 100;/* The number of words per document */int nwords_per_doc = 50;/* Prefix to each filename created */const char *prefix = NULL;/* Directory into which to place the documens. */const char *dirname = NULL;int noise_vocab_size = 0;float noise_vocab_fraction = 0;int print_multinomial_header = 0;int print_multinomials_only = 0;/* maximum number of words */#define MAXN 99999int df_alphas[MAXN];int df_alphas_sum;char *df_words[MAXN];/* A multinomial sampled from the Dir(alphas) */float df_p[MAXN];voidprint_usage (const char *argv[]){ fprintf (stderr, "usage: %s " "[-d dirname] [-p prefix] [-l doclen] [-n ndocs] [-P]\n" "[-v noisevocabsize] [-f noisevocabfrac] [-m] distfile\n" " Will output NDOCS files each of length DOCLEN with" " filenames having \n" " PREFIX to directory DIRNAME.\n" " With probability NOISEVOCABFRAC, instead of picking" " a word from the\n" " distribution specified by DISTFILE, a word will be chosen" " uniformly \n" " from one of NOISEVOCABSIZE noise-words\n" " If the -P option is given, print each document's multinomial" " as a header.\n" " If the -m option is given, don't create documents, just" " print sampled multinomials\n" , argv[0]);}#if 1/* Return a sample from the Gamma distribution, with parameter IA *//* From Numerical "Recipes in C", page 292 */doublebow_gamma_distribution (int ia){ int j; double am, e, s, v1, v2, x, y; assert (ia >= 1) ; if (ia < 6) { x = 1.0; for (j = 1; j <= ia; j++) x *= bow_random_01 (); x = - log (x); } else { do { do { do { v1 = 2.0 * bow_random_01 () - 1.0; v2 = 2.0 * bow_random_01 () - 1.0; } while (v1 * v1 + v2 * v2 > 1.0); y = v2 / v1; am = ia - 1; s = sqrt (2.0 * am + 1.0); x = s * y + am; } while (x <= 0.0); e = (1.0 + y * y) * exp (am * log (x/am) - s * y); } while (bow_random_01 () > e); } return x;}#else//#error This one does not work for small alphas/* From Larry Wasserman */doublebow_gamma_distribution (double a){ double b, c, u, v, w, x, y, z; int accept; b = a-1.0; c = 3.0*a - 3.0/4.0; accept =0; while(accept==0){ u = bow_random_01 (); v = bow_random_01 (); if(u==0.0)u=0.00001; if(v==0.0)v=0.00001; if(u==1.0)u=0.99999; if(v==1.0)v=0.99999; w = u*(1.0-u); y = sqrt(c/w)*(u-0.5); x = b+y; if(x>=0.0){ z = 64.0*w*w*w*v*v; if(z<=1.0-2.0*y*y/x)accept=1; if(accept==0){ if(log(z)<=2.0*(b*log(x/b)-y))accept=1; } } } return x;}#endifintrandom_index_from_multinomial (float *p, int size){ float r = bow_random_01 (); int k; float sum; for (k = -1, sum = 0; sum < r; sum += p[k]) k++; return k;}voidnormalize_multinomial (float *p, int size){ int i; float sum = 0; for (i = 0; i < size; i++) sum += p[i]; assert (sum); for (i = 0; i < size; i++) p[i] /= sum;}/* Note that, unfortunately, here the ALPHAS must be integers. I don't know how to sample from a Dirichlet with continuous alphas. */voidrandom_multinomial_from_dirichlet (int *alphas, int size, float *p){ float p_sum = 0; int i; for (i = 0; i < size; i++) { p[i] = bow_gamma_distribution ((double)alphas[i]); p_sum += p[i]; } for (i = 0; i < size; i++) p[i] /= p_sum;}intmain (int argc, const char *argv[]){ int argi, N, i=0; float x; char s[256]; FILE *fp; char docname[1024]; const char *distfile; int e; bow_random_set_seed (); for (argi = 1; argi < argc; argi++) { if (argv[argi][0] != '-') break; switch (argv[argi][1]) { case 'm': print_multinomials_only = 1; break; break; case 'd': dirname = argv[++argi]; break; case 'p': prefix = argv[++argi]; break; case 'l': nwords_per_doc = atoi (argv[++argi]); break; case 'n': ndocs = atoi (argv[++argi]); break; case 'v': noise_vocab_size = atoi (argv[++argi]); break; case 'f': noise_vocab_fraction = atof (argv[++argi]); break; case 'P': print_multinomial_header = 1; break; case '?': case 'h': print_usage (argv); exit (0); default: fprintf (stderr, "%s: unrecognized option `%s'\n", argv[0], argv[argi]); print_usage (argv); exit (-1); } } distfile = argv[argi]; if (dirname && dirname[0] == '/') fprintf (stderr, "Output to %s\n", dirname); else fprintf (stderr, "Output to ./%s\n", dirname); /* mkdir (dirname, S_IRWXU | S_IRWXG | S_IRWXO); */ /* read in prob. distribution */ fp = fopen (distfile, "r"); df_alphas_sum = 0; for (i = 0; i < MAXN && fscanf(fp, "%f %s", &x, s)==2; i++) { df_alphas[i] = x; assert (df_alphas[i] == x); df_alphas_sum += x; df_words[i] = strdup (s); } fclose (fp); if (i>=MAXN) { printf("Error: number of words exceeds %d\n", MAXN); exit (-1); } N = i; fprintf(stderr, "Sum of alphas = %d\n", df_alphas_sum); if (!print_multinomials_only) { /* Create the directory if necessary */ e = mkdir (dirname, 0777); if (e != 0 && errno != EEXIST) { fprintf (stderr, "Error creating directory `%s'\n", dirname); perror ("dicefactory"); exit (-1); } } /* generate documents */ for (i = 0; i < ndocs; i++) { /* each with NWORDS_PER_DOC words */ int j; if (!print_multinomials_only) { if (prefix) sprintf (docname, "%s/%s%05d", dirname, prefix, i); else sprintf (docname, "%s/%05d", dirname, i); fp = fopen (docname, "w"); assert (fp); } /* Sample a multinomial from the Dirichlet */ random_multinomial_from_dirichlet (df_alphas, N, df_p); if (print_multinomials_only) { int k; for (k = 0; k < N; k++) printf ("%f ", df_p[k]); printf ("\n"); continue; } if (print_multinomial_header) { int k; for (k = 0; k < N; k++) fprintf (fp, "%f %s\n", df_p[k], df_words[k]); fprintf (fp, "\n"); } for (j=0; j<nwords_per_doc; j++) { if (noise_vocab_fraction && rand()/(float)RAND_MAX > noise_vocab_fraction) { int wn = rand () % noise_vocab_size; fprintf (fp, "noise"); /* Convert number WN into alphabetics */ while (wn) { fprintf (fp, "%c", 'a' + wn % 10); wn /= 10; } fprintf (fp, "\n"); } else { int k = random_index_from_multinomial (df_p, N); fprintf(fp, "%s\n", df_words[k]); } } fprintf (fp, "\n"); fclose (fp); } exit (0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -