📄 barrel.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	  int wi;	  for (wi = 0; heap->entry[0].wi > wi; wi++)	    fprintf (fp, " 0");#endif	  fprintf (fp, "  %s %d %d", 		   bow_int2word (heap->entry[0].wi),		   heap->entry[0].wi,		   heap->entry[0].dv->entry[heap->entry[0].index].count);	  /* Update the heap, we are done with this di, move it to its	     new position */	  bow_dv_heap_update (heap);#if 0	  for (; heap->entry[0].wi > wi; wi++)	    fprintf (fp, " 0");#endif	}       while ((current_di == heap->entry[0].current_di)	     && (heap->length > 0));      fprintf (fp, "\n");    }  bow_free (heap);  bow_verbosify (bow_progress, "\n"); }/* Print barrel to FP in human-readable and awk-accessible format.   Step through each CDOC in BARREL->CDOCS instead of using a heap.     This way we even print out the documents that have zero words.    This function runs much more slowly than the one above. */voidbow_new_slow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){  int di;  bow_cdoc *cdoc;  bow_de *de;  int wi, max_wi;  bow_verbosify (bow_progress, "Printing barrel:          ");  max_wi = barrel->wi2dvf->size;  for (di = 0; di < barrel->cdocs->length; di++)    {      if (barrel->cdocs->length - di % 10 == 0)	bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", 		       barrel->cdocs->length - di);      cdoc = bow_array_entry_at_index (barrel->cdocs, di);      fprintf (fp, "%s", cdoc->filename);      for (wi = 0; wi < max_wi; wi++)	{	  de = bow_wi2dvf_entry_at_wi_di (barrel->wi2dvf, wi, di);	  if (de)	    fprintf (fp, "  %s %d %d", 		     bow_int2word (wi),		     wi,		     de->count);	}      fprintf (fp, "\n");    }  bow_verbosify (bow_progress, "\n"); }/* Print barrel to FP in various formats.   Defaults are first in lists:   s - sparse       OR  a - all    i - integer      OR  b - binary   c - combination  OR  n - word index   OR  w - word string   OR e - empty   OR   I - UC Irvine format, same as Sahami's "feat-sel" format.   *//* Print document, but print only those documents for which the   function PRINT_IF_TRUE returns non-zero. */voidbow_barrel_printf_selected (bow_barrel *barrel, FILE *fp, 			    const char *format,			    int (*print_if_true)(bow_cdoc*)){  enum {    word_index,    word_string,    word_string_and_index,    word_empty,    word_long  } word_format = word_string_and_index;  enum {    binary_count,    integer_count  } word_count_format = integer_count;  int doing_uci_format = 0;  int doing_ipl_format = 0;  int sparse_format = 1;  bow_dv_heap *heap;  bow_wv *wv;  int di;  bow_cdoc *cdoc;  int wi, wvi;  bow_dv *dv;  int last_di;  void print_word_count (int wi, int count)    {      int oi;      const char *word;      if (word_count_format == binary_count)	count = (count > 0);      switch (word_format)	{	case word_index:	  printf ("%d %d  ", wi, count);	  break;	case word_string:	  printf ("%s %d  ", bow_int2word (wi), count);	  break;	case word_string_and_index:	  printf ("%s %d %d  ", bow_int2word (wi), wi, count);	  break;	case word_empty:	  printf ("%d  ", count);	  break;	case word_long:	  word = bow_int2word (wi);	  for (oi = 0; oi < count; oi++)	    printf("%s ", word);	}    }  if (format && strchr (format, 'I'))    {      doing_uci_format = 1;      sparse_format = 0;      word_count_format = binary_count;      word_format = word_empty;    }  if (format && strchr (format, 'P'))    doing_ipl_format = 1;  if (format && strchr (format, 'a'))    sparse_format = 0;  if (format && strchr (format, 'b'))    word_count_format = binary_count;  if (format && strchr (format, 'n'))    word_format = word_index;  else if (format && strchr (format, 'w'))    word_format = word_string;  else if (format && strchr (format, 'e'))    word_format = word_empty;  else if (format && strchr (format, 'l'))    word_format = word_long;  if (doing_uci_format)    {      /* Print the number of dimentions and the number of features */      printf ("%d\n%d\n", 	      barrel->wi2dvf->num_words, 	      barrel->cdocs->length);    }  heap = bow_test_new_heap (barrel);  wv = NULL;  last_di = -1;  while ((di = bow_heap_next_wv (heap, barrel, &wv, print_if_true))	 != -1)    {      /* Print documents that have no words in them. while (last_di ); */      cdoc = bow_array_entry_at_index (barrel->cdocs, di);      if (!doing_uci_format && !doing_ipl_format)	printf ("%s %s  ", cdoc->filename, 		bow_barrel_classname_at_index (barrel, cdoc->class));      else if (doing_ipl_format)	printf ("%s %s  ", bow_barrel_classname_at_index (barrel, cdoc->class),		cdoc->filename);      if (sparse_format)	{	  for (wvi = 0; wvi < wv->num_entries; wvi++)	    print_word_count (wv->entry[wvi].wi, wv->entry[wvi].count);	}      else	{	  for (wi = 0, wvi = 0; wi < barrel->wi2dvf->size; wi++)	    {	      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);	      if (!dv)		continue;	      if (wv->entry[wvi].wi < wi && wvi < wv->num_entries)		wvi++;	      assert (wv->entry[wvi].wi >= wi || wvi >= wv->num_entries);	      if ((wvi < wv->num_entries) && wv->entry[wvi].wi == wi)		print_word_count (wi, wv->entry[wvi].count);	      else		print_word_count (wi, 0);	    }	}      if (doing_uci_format)	/* Print the class index. */	printf (": %d", cdoc->class);      printf ("\n");    }}voidbow_barrel_printf (bow_barrel *barrel, FILE *fp, const char *format){  bow_barrel_printf_selected (barrel, fp, format, bow_cdoc_yes);}/* Print on stdout the number of times WORD occurs in the various   docs/classes of BARREL. */voidbow_barrel_print_word_count (bow_barrel *barrel, const char *word){  int wi;  bow_dv *dv;  int dvi;  bow_cdoc *cdoc;    wi = bow_word2int (word);  if (wi == -1)    {      fprintf (stderr, "No such word `%s' in dictionary\n", word);      exit (-1);    }  dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);  if (!dv)    {      fprintf (stderr, "No document vector for word `%s'\n", word);      return;    }  for (dvi = 0; dvi < dv->length; dvi++)     {      cdoc = bow_array_entry_at_index (barrel->cdocs, 				       dv->entry[dvi].di);      printf ("%9d / %9d  (%9.5f) %s\n", 	      dv->entry[dvi].count, 	      cdoc->word_count,	      ((float)dv->entry[dvi].count / cdoc->word_count),	      cdoc->filename);    }}/* For copying a class barrel.  Doesn't deal with class_probs at all. */bow_barrel *bow_barrel_copy (bow_barrel *barrel){  int ci;  int wi;  int dvi;  bow_dv *dv;  bow_dv *copy_dv;  bow_barrel *copy = bow_barrel_new(barrel->wi2dvf->size, 				    bow_barrel_num_classes(barrel),				    barrel->cdocs->entry_size,				    barrel->cdocs->free_func);  copy->method = barrel->method;  copy->is_vpc = 1;  copy->classnames = bow_int4str_new(0);  /* Initialize the CDOCS and CLASSNAMES parts of the copy.     Create BOW_CDOC structures for each class, and append them to the     copy->cdocs array. */  for (ci = 0; ci < bow_barrel_num_classes(barrel) ; ci++)    {      bow_cdoc *old_cdoc = bow_array_entry_at_index(barrel->cdocs, ci);      bow_cdoc cdoc;      cdoc.type = old_cdoc->type;      cdoc.normalizer = old_cdoc->normalizer;      cdoc.word_count = old_cdoc->word_count;      cdoc.prior = old_cdoc->prior;      cdoc.filename = strdup (old_cdoc->filename);      cdoc.class = old_cdoc->class;      cdoc.class_probs = NULL;      bow_array_append (copy->cdocs, &cdoc);      bow_str2int (copy->classnames, cdoc.filename);    }  /* set up the wi2dvf structure */  for (wi = 0; wi < barrel->wi2dvf->size; wi++)    {      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);      if (!dv)	continue;      for (dvi = 0; dvi < dv->length; dvi++)	bow_wi2dvf_add_wi_di_count_weight 	  (&(copy->wi2dvf), 	   wi, dv->entry[dvi].di, 	   dv->entry[dvi].count,	   dv->entry[dvi].weight);            /* Set the IDF of the class's wi2dvf directly from the doc's	 wi2dvf */            copy_dv = bow_wi2dvf_dv (copy->wi2dvf, wi);      copy_dv->idf = dv->idf;    }  return (copy);}/* Define an iterator over the columns of a barrel  */struct bow_barrel_iterator_context {  bow_barrel *barrel;  int ci;  bow_dv *dv;  int dvi;};#define CONTEXT ((struct bow_barrel_iterator_context*)context)static voidbarrel_iterator_reset_at_wi (int wi, void *context){  bow_cdoc *cdoc;  CONTEXT->dv =  bow_wi2dvf_dv (CONTEXT->barrel->wi2dvf, wi);  CONTEXT->dvi = 0;  /* Advance to the first document matching our criterion */  while (CONTEXT->dv && CONTEXT->dvi < CONTEXT->dv->length)    {      cdoc = bow_array_entry_at_index (CONTEXT->barrel->cdocs,				       CONTEXT->dv->entry[CONTEXT->dvi].di);      if (cdoc->class == CONTEXT->ci && cdoc->type == bow_doc_train)	break;      CONTEXT->dvi++;    }}static intbarrel_iterator_advance_to_next_di (void *context){  bow_cdoc *cdoc;  if (CONTEXT->dv == NULL)     return 0;  CONTEXT->dvi++;  while (CONTEXT->dvi < CONTEXT->dv->length)    {      cdoc = bow_array_entry_at_index (CONTEXT->barrel->cdocs,				       CONTEXT->dv->entry[CONTEXT->dvi].di);      if (cdoc->class == CONTEXT->ci && cdoc->type == bow_doc_train)	break;      CONTEXT->dvi++;    }  if (CONTEXT->dvi >= CONTEXT->dv->length)    return 0;  return 1;}static intbarrel_iterator_doc_index (void *context){  if (CONTEXT->dv == NULL || CONTEXT->dvi >= CONTEXT->dv->length)    return INT_MIN;  return CONTEXT->dv->entry[CONTEXT->dvi].di;}static doublebarrel_iterator_count_for_doc (void *context){  if (CONTEXT->dv == NULL || CONTEXT->dvi >= CONTEXT->dv->length)    return 0.0/0;		/* NaN */  return CONTEXT->dv->entry[CONTEXT->dvi].count;}bow_iterator_double *bow_barrel_iterator_for_ci_new (bow_barrel *barrel, int ci){  bow_iterator_double *ret;  void *context;  ret = bow_malloc (sizeof (bow_iterator_double) + 		    sizeof (struct bow_barrel_iterator_context));  ret->reset = barrel_iterator_reset_at_wi;  ret->advance = barrel_iterator_advance_to_next_di;  ret->index = barrel_iterator_doc_index;  ret->value = barrel_iterator_count_for_doc;  context = ret->context = (char*)ret + sizeof (bow_iterator_double);  CONTEXT->barrel = barrel;  CONTEXT->ci = ci;  CONTEXT->dv = NULL;  CONTEXT->dvi = 0;  return ret;}#undef CONTEXT/* Free the memory held by BARREL. */voidbow_barrel_free (bow_barrel *barrel){  if (barrel->wi2dvf)    bow_wi2dvf_free (barrel->wi2dvf);  if (barrel->cdocs)    bow_array_free (barrel->cdocs);  if (barrel->classnames)    bow_int4str_free (barrel->classnames);  bow_free (barrel);}
上一页 1 23
💿 文件大小 522 K
👤 上传用户 yuanata
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#mitchell #tom #机器学习 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -