📄 barrel.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 3 页
字号:
  bow_verbosify (bow_progress, 		 "Removing words by information gain:          ");  /* Hide words from the BARREL. */  for (i = num_words_to_keep; i < wi2ig_size; i++)    {      /* Hide the WI from BARREL. */      bow_wi2dvf_hide_wi (barrel->wi2dvf, wiig_list[i].wi);      if (i % 100 == 0)	bow_verbosify (bow_progress, "\b\b\b\b\b\b\b\b\b%9d", wi2ig_size - i);     }  /* Now that we have reduce vocabulary size, don't add more words to the     vocabulary.  For example, when doing --test-files, don't include     in the QUERY_WV words that aren't in the current reduced vocabulary,     the total number of words in the QUERY_WV will be too large! */  bow_word2int_do_not_add = 1;  bow_verbosify (bow_progress, "\n");}/* Set the BARREL->WI2DVF->ENTRY[WI].IDF to the sum of the COUNTS for   the given WI among those documents in the training set. */voidbow_barrel_set_idf_to_count_in_train (bow_barrel *barrel){  bow_wi2dvf *wi2dvf = barrel->wi2dvf;  int wi, nwi, dvi;  bow_dv *dv;  bow_cdoc *cdoc;  nwi = MIN (wi2dvf->size, bow_num_words());  for (wi = 0; wi < nwi; wi++)    {      dv = bow_wi2dvf_dv (wi2dvf, wi);      if (!dv)	continue;      dv->idf = 0;      for (dvi = 0; dvi < dv->length; dvi++)	{	  cdoc = bow_array_entry_at_index (barrel->cdocs, dv->entry[dvi].di);	  if (cdoc->type == bow_doc_train)	    dv->idf += dv->entry[dvi].count;	}    }}/* Return the number of unique words among those documents with TYPE   tag (train, test, unlabeled, etc) equal to TYPE. 
*/
int
bow_barrel_num_unique_words_of_type (bow_barrel *doc_barrel, int type)
{
  int word_limit;
  int tally = 0;
  int wi;

  word_limit = MIN (doc_barrel->wi2dvf->size, bow_num_words ());
  for (wi = 0; wi < word_limit; wi++)
    {
      bow_dv *vec = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
      int k;

      /* A word index with no document vector contributes nothing. */
      if (!vec)
        continue;

      /* One matching document is enough to count this word once. */
      for (k = 0; k < vec->length; k++)
        {
          bow_cdoc *doc = bow_array_entry_at_index (doc_barrel->cdocs,
                                                    vec->entry[k].di);

          if (doc->type != type)
            continue;
          tally++;
          break;
        }
    }
  return tally;
}

/* Write the fields of CDOC to FP in this order: type, normalizer,
   prior, word_count, filename, class.  The class is written as a short
   when bow_file_format_version is below 5 and as an int otherwise.
   Returns the sum of the values returned by the bow_fwrite_* calls. */
int
_bow_barrel_cdoc_write (bow_cdoc *cdoc, FILE *fp)
{
  int written = 0;

  written += bow_fwrite_int (cdoc->type, fp);
  written += bow_fwrite_float (cdoc->normalizer, fp);
  written += bow_fwrite_float (cdoc->prior, fp);
  written += bow_fwrite_int (cdoc->word_count, fp);
  written += bow_fwrite_string (cdoc->filename, fp);
  if (bow_file_format_version >= 5)
    written += bow_fwrite_int (cdoc->class, fp);
  else
    written += bow_fwrite_short (cdoc->class, fp);
  return written;
}

/* Fill CDOC from FP, reading the fields in the same order in which
   _bow_barrel_cdoc_write emits them.  CDOC->class_probs is not stored
   in the file and is set to NULL.  Returns the sum of the values
   returned by the bow_fread_* calls. */
int
_bow_barrel_cdoc_read (bow_cdoc *cdoc, FILE *fp)
{
  int nread;
  int doc_type;

  /* The type is read through a plain int and then assigned to the
     field. */
  nread = bow_fread_int (&doc_type, fp);
  cdoc->type = doc_type;
  cdoc->class_probs = NULL;

  nread += bow_fread_float (&(cdoc->normalizer), fp);
  nread += bow_fread_float (&(cdoc->prior), fp);
  nread += bow_fread_int (&(cdoc->word_count), fp);
  nread += bow_fread_string ((char**)&(cdoc->filename), fp);

  if (bow_file_format_version >= 5)
    nread += bow_fread_int (&(cdoc->class), fp);
  else
    {
      /* Files older than format version 5 store the class as a short. */
      short legacy_class;

      nread += bow_fread_short (&legacy_class, fp);
      cdoc->class = legacy_class;
    }
  return nread;
}

/* Create and return a `barrel' by reading data from the file-pointer FP. 
*/bow_barrel *bow_barrel_new_from_data_fp (FILE *fp){  bow_barrel *ret;  int version_tag;  int method_id;  version_tag = fgetc (fp);  /* xxx assert (version_tag >= 0); */  if (version_tag <= 0)    return NULL;  if (_bow_barrel_version != -1 && _bow_barrel_version != version_tag)    bow_error ("Trying to read bow_barrel's with different version numbers");  _bow_barrel_version = version_tag;  ret = bow_malloc (sizeof (bow_barrel));  if (_bow_barrel_version < 3)    {      bow_fread_int (&method_id, fp);      bow_error ("Can no longer read barrels earlier than version 3");      /* ret->method = _old_bow_methods[method_id]; */    }  else    {      char *method_string;      bow_fread_string (&method_string, fp);      ret->method = (rainbow_method*) bow_method_at_name (method_string);      bow_free (method_string);    }  ret->cdocs =     bow_array_new_from_data_fp ((int(*)(void*,FILE*))_bow_barrel_cdoc_read,				 _bow_barrel_cdoc_free, fp);  assert (ret->cdocs->length);  if (bow_file_format_version > 5)    ret->classnames = bow_int4str_new_from_fp (fp);  else    ret->classnames = NULL;    ret->wi2dvf = bow_wi2dvf_new_from_data_fp (fp);  assert (ret->wi2dvf->num_words);  return ret;}/* Decide whether to keep this or not.  Currently it it used by   rainbow-h.c. */bow_barrel *bow_barrel_new_from_data_file (const char *filename){  FILE *fp;  bow_barrel *ret_barrel;  int wi;  bow_dv *dv;  int dv_count = 0;  fp = bow_fopen (filename, "rb");  ret_barrel = bow_barrel_new_from_data_fp (fp);  if (ret_barrel)    {      /* Read in all the dvf's so that we can close the FP. */      for (wi = 0; wi < ret_barrel->wi2dvf->size; wi++)	{	  dv = bow_wi2dvf_dv (ret_barrel->wi2dvf, wi);	  if (dv)	    dv_count++;	}      ret_barrel->wi2dvf->fp = NULL;      assert (dv_count);    }  fclose (fp);  return ret_barrel;}/* Read a line from FP until a newline, and return a newly malloc'ed   buffer containing the line read. 
*/char *getline (FILE *fp){  int bufsize = 1024;  int buflen = 0;  char *buf = bow_malloc (bufsize);  int byte;  while ((byte = fgetc (fp)) != EOF	 && byte != '\n')    {      buf[buflen++] = byte;      if (buflen >= bufsize)	{	  bufsize *= 2;	  buf = bow_realloc (buf, bufsize);	}    }  if (byte == EOF)    {      bow_free (buf);      return NULL;    }  buf[buflen] = '\0';  return buf;}/* Create a new barrel and fill it from contents in --print-barrel=FORMAT   read in from FILENAME. */bow_barrel *bow_barrel_new_from_printed_barrel_file (const char *filename,					 const char *format){  FILE *fp;  enum {    word_index,    word_string,    word_string_and_index,    word_empty  } word_format = word_string_and_index;  enum {    binary_count,    integer_count  } word_count_format = integer_count;  int sparse_format = 1;  int di;  bow_cdoc cdoc;  int wi;  float count;  int int_count;  char datafilename[BOW_MAX_WORD_LENGTH];  char classname[BOW_MAX_WORD_LENGTH];  int word_count_column;  int num_chars_read;  char *buf, *line;  bow_barrel *ret;  /* Returns 1 on success, 0 on failure. 
*/  int read_word_count (char **string, int *wi, float *count)    {      char word[BOW_MAX_WORD_LENGTH];      int ret = 0;      int num_chars_read;      switch (word_format)	{	case word_index:	  if (sscanf (*string, "%d %f%n", wi, count, &num_chars_read) == 2)	    ret = 1;	  break;	case word_string:	  if (sscanf (*string, "%s %f%n", word, count, &num_chars_read) == 2)	  {	    ret = 1;	    *wi = bow_word2int (word);	  }	  break;	case word_string_and_index:	  if (sscanf (*string,"%s %d %f%n",word,wi,count,&num_chars_read) == 3)	    ret = 1;	  break;	case word_empty:	  if (sscanf (*string, "%f%n", count, &num_chars_read) == 1)	  {	    ret = 1;	    *wi = word_count_column;	  }	  break;	}      if (word_count_format == binary_count)	*count = (*count > 0);      if (ret)	*string += num_chars_read;      return ret;    }  if (format && strchr (format, 'a'))    sparse_format = 0;  if (format && strchr (format, 'b'))    word_count_format = binary_count;  if (format && strchr (format, 'n'))    word_format = word_index;  else if (format && strchr (format, 'w'))    word_format = word_string;  else if (format && strchr (format, 'e'))    word_format = word_empty;  ret = bow_barrel_new (0, 0, sizeof (bow_cdoc), _bow_barrel_cdoc_free);  ret->classnames = bow_int4str_new (0);  fp = bow_fopen (filename, "r");  /* Each time through the loop reads one line. 
*/  while ((buf = getline (fp)))    {      line = buf;      if (sscanf (line, "%s%n", datafilename, &num_chars_read) != 1)	bow_error ("Didn't find expected filename");      line += num_chars_read;      if (sscanf (line, "%s%n", classname, &num_chars_read) != 1)	bow_error ("Didn't find expected classname");      line += num_chars_read;      cdoc.filename = strdup (datafilename);      assert (cdoc.filename);      cdoc.class = bow_str2int (ret->classnames, classname);      cdoc.type = bow_doc_train;      cdoc.prior = 1.0f;      cdoc.class_probs = NULL;      di = bow_array_append (ret->cdocs, &cdoc);      while (read_word_count (&line, &wi, &count))	{	  if (count)	    {	      int_count = rint (count);	      bow_wi2dvf_add_wi_di_count_weight (&(ret->wi2dvf),						 wi, di, int_count, count);	    }	  else	    assert (sparse_format == 0);	}      bow_free (buf);    }  return ret;}/* Write BARREL to the file-pointer FP in a machine independent format. */voidbow_barrel_write (bow_barrel *barrel, FILE *fp){  if (!barrel)    {      fputc (0, fp);		/* 0 version_tag means NULL barrel */      return;    }  fputc (BOW_DEFAULT_BARREL_VERSION, fp);  _bow_barrel_version = BOW_DEFAULT_BARREL_VERSION;  bow_fwrite_string (barrel->method->name, fp);  bow_array_write (barrel->cdocs,		   (int(*)(void*,FILE*))_bow_barrel_cdoc_write, fp);  bow_int4str_write (barrel->classnames, fp);  /* The wi2dvf must be written last because when we read it, we don't     actually read the whole thing; we only read the seek-table. */  bow_wi2dvf_write (barrel->wi2dvf, fp);}/* Print barrel to FP in human-readable and awk-accessible format. 
*/voidbow_barrel_printf_old1 (bow_barrel *barrel, FILE *fp, const char *format){    bow_dv_heap *heap;		/* a heap of "document vectors" */  int current_di;  bow_cdoc *cdoc;  bow_verbosify (bow_progress, "Printing barrel:          ");  heap = bow_make_dv_heap_from_wi2dvf (barrel->wi2dvf);  /* Keep going until the heap is empty */  while (heap->length > 0)    {      /* Set the current document we're working on */      current_di = heap->entry[0].current_di;      assert (heap->entry[0].dv->idf == heap->entry[0].dv->idf);  /* NaN */      if (current_di % 10 == 0)	bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", current_di);      /* Here we should check if this di is part of some training set and	 move on if it isn't. */          /* Get the document */      cdoc = bow_cdocs_di2doc (barrel->cdocs, current_di);#if 0      /* If it's not a model document, then move on to next one */      if (cdoc->type != model)	{	  do 	    {	      bow_dv_heap_update (heap);	    }	  while ((current_di == heap->entry[0].current_di)		 && (heap->length > 0));	  	  /* Try again */	  continue;	}#endif      fprintf (fp, "%s", cdoc->filename);          /* Loop over all words in this document, printing out the         FORMAT-requested statistics. */      do 	{#if 0
💿 文件大小 522 K
👤 上传用户 yuanata
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#mitchell #tom #机器学习 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -