📄 split.c

📁 《机器学习》作者 Tom Mitchell 的书上代码
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 2 3
				bow_test_set_files_use_basename));	  if (index != -1)	    bow_verbosify (bow_quiet, "WARNING: Repeated file basename `%s'\n", 			   bow_int2str (map, si));	  bow_str2int (map2, bow_basename (bow_int2str (map, si),					   bow_test_set_files_use_basename));	}      bow_int4str_free (map);      map = map2;    }      for (di = 0; di < docs->length; di++)    {      cdoc = bow_array_entry_at_index (docs, di);      if (bow_test_set_files_use_basename)	filename = bow_basename (cdoc->filename, 				 bow_test_set_files_use_basename);      else	filename = cdoc->filename;      if (bow_str2int_no_add (map, filename) != -1)	{	  /* This filename is in the map; tag this cdoc. */	  if (cdoc->type != bow_doc_untagged)	    bow_verbosify (bow_quiet, "Duplicate tags requested for %s.  "			   "Using first tag.\n", filename);	  else	    {	      cdoc->type = type;	      files_count++;	    }	}    }    switch (type)    {    case bow_doc_train:      type_name = "train";      break;    case bow_doc_test:      type_name = "test";      break;    case bow_doc_unlabeled:      type_name = "unlabeled";      break;    case bow_doc_ignore:      type_name = "ignore";      break;    case bow_doc_validation:      type_name = "validation";      break;    default:      bow_error ("No implementation for this type.");    }  bow_verbosify (bow_progress, 		 "Using %s, placed %d documents in the %s set\n", 		 test_files_filename, files_count,		 type_name);  bow_int4str_free (map);  return;}/* Postprocess the tags on documents by setting untagged documents to   train or test, depending on context. 
*/voidbow_set_doc_types_of_remaining (bow_array *docs, int type){  int di;  bow_cdoc *cdoc;  char *type_name = NULL;  int num_found = 0;  for (di = 0; di < docs->length; di++)    {      cdoc = bow_array_entry_at_index (docs, di);      if (cdoc->type == bow_doc_untagged)	{	  cdoc->type = type;	  num_found++;	}    }  switch (type)    {    case bow_doc_train:      type_name = "train";      break;    case bow_doc_test:      type_name = "test";      break;    case bow_doc_unlabeled:      type_name = "unlabeled";      break;    case bow_doc_ignore:      type_name = "ignore";      break;    case bow_doc_validation:      type_name = "validation";      break;    default:      bow_error ("No implementation for this type.");    }  bow_verbosify (bow_progress, "Placed remaining %d documents in the %s set:\n",		 num_found, type_name);}/* Use the command line arguments to create the appropriate train/test split */voidbow_set_doc_types (bow_array *docs, int num_classes, bow_int4str *classnames){        /* int num_docs; */  int ti;  int num_types;  int num_remaining = 0;  /* note it is important that ignore comes first, so we can      ignore them when doing even prior random splits later */  struct {    bow_files_source_type source;    float fraction;    int number;    char *filename;    bow_split_fancy_count *fancy_counts;    bow_files_source_type doc_type;  } types[] = {{bow_ignore_files_source, 		bow_ignore_fraction, 		bow_ignore_number, 		bow_ignore_filename, 		bow_ignore_fancy_counts,		bow_doc_ignore},	       {bow_test_files_source, 		bow_test_fraction, 		bow_test_number, 		bow_test_filename, 		bow_test_fancy_counts,		bow_doc_test},	       {bow_train_files_source, 		bow_train_fraction, 		bow_train_number, 		bow_train_filename, 		bow_train_fancy_counts,		bow_doc_train},	       {bow_unlabeled_files_source, 		bow_unlabeled_fraction, 		bow_unlabeled_number, 		bow_unlabeled_filename, 		bow_unlabeled_fancy_counts,		bow_doc_unlabeled},	       {bow_validation_files_source, 		
bow_validation_fraction, 		bow_validation_number, 		bow_validation_filename, 		bow_validation_fancy_counts,		bow_doc_validation},	       {0,0,0,0,0,0}};    /* First set all files to be untagged. */  bow_set_all_docs_untagged (docs);  /* count the number of document types */  for (num_types = 0; types[num_types].source; num_types++);  /* First, set document types based on input files */  for (ti = 0; ti < num_types; ti++)    {      if (types[ti].source == bow_files_source_file)	bow_set_docs_to_type (docs,			      types[ti].filename,			      types[ti].doc_type);    }  /* Second, set document types based on specific numbers per class */  for (ti = 0; ti < num_types; ti++)    {      if (types[ti].source == bow_files_source_num_per_class)	{	  int ci;	  int *class_nums;	  class_nums = bow_malloc (sizeof (int) * num_classes);	  for (ci = 0; ci < num_classes; ci ++)	    class_nums[ci] = types[ti].number;	  	  bow_set_doc_types_randomly_by_count_per_class (docs, num_classes,							 classnames,							 class_nums, 							 types[ti].doc_type,							 bow_doc_untagged);	  bow_free(class_nums);	}      if (types[ti].source == bow_files_source_num_per_class_remaining)	{	  int ci;	  int *class_nums;	  bow_error ("prc suffix not yet implemented.");	  class_nums = bow_malloc (sizeof (int) * num_classes);	  for (ci = 0; ci < num_classes; ci ++)	    class_nums[ci] = types[ti].number;	  	  bow_set_doc_types_randomly_by_count_per_class (docs, num_classes,							 classnames,							 class_nums, 							 types[ti].doc_type,							 bow_doc_untagged);	  bow_free(class_nums);	}      else if (types[ti].source == bow_files_source_fancy_counts)	{	  int *counts = bow_malloc (sizeof (int) * num_classes);	  int ci;	  bow_split_fancy_count *class_count;	  assert (classnames);	  for (ci = 0; ci < num_classes; ci++)	    counts[ci] = -1;	  for (class_count = types[ti].fancy_counts; 	       class_count->class_name; 	       class_count++)	    {	      int class = -1;	      for (ci = 0; ci < num_classes; 
ci++)		{		  const char *name = bow_int2str (classnames, ci);		  if (!strcmp(class_count->class_name, name))		    {		      class = ci;		      break;		    }		}	      if (class == -1)		bow_error ("Unknown class %s.\n", class_count->class_name);	      	      counts[class] = class_count->num_docs;	    }	  for (ci = 0; ci < num_classes; ci++)	    if (counts[ci] == -1)	      bow_error("Under-specified class counts");	  	  bow_set_doc_types_randomly_by_count_per_class (docs, num_classes,							 classnames,							 counts, 							 types[ti].doc_type,							 bow_doc_untagged);	  bow_free(counts);	}    }  /* Third, do any random class-proportioned splits to set     document types */  for (ti = 0; ti < num_types; ti++)    {      if (types[ti].source == bow_files_source_fraction)	{	  bow_set_doc_types_randomly_by_fraction	    (docs, num_classes, classnames, types[ti].fraction,	     types[ti].doc_type, bow_doc_untagged);	}      else if (types[ti].source == bow_files_source_fraction_remaining)	{	  bow_set_doc_types_randomly_by_fraction_remaining	    (docs, num_classes, classnames, types[ti].fraction,	     types[ti].doc_type);	}      else if (types[ti].source == bow_files_source_number)	{	  bow_set_doc_types_randomly_by_count (docs, num_classes, classnames,					       types[ti].number,					       types[ti].doc_type, 0, 					       bow_doc_untagged);	}      else if (types[ti].source == bow_files_source_number_remaining)	{	  bow_set_doc_types_randomly_by_count (docs, num_classes, classnames,					       types[ti].number,					       types[ti].doc_type, 1,					       bow_doc_untagged);	}    }  /* Set remaining untagged docs if appropriate */  for (ti = 0; ti < num_types; ti++)    {      if (types[ti].source == bow_files_source_remaining)	{	  assert (num_remaining == 0);	  num_remaining = 1;	  bow_set_doc_types_of_remaining (docs, types[ti].doc_type);	}    }  /* Now that the training documents are fixed, check if any document     types feed from them */  for (ti = 0; ti < num_types; 
ti++)    {      if (types[ti].source == bow_files_source_fraction_train)	{	  bow_set_doc_types_randomly_by_fraction	    (docs, num_classes, classnames, types[ti].fraction,	     types[ti].doc_type,	     bow_doc_train);	}      else if (types[ti].source == bow_files_source_number_train)	{	  bow_set_doc_types_randomly_by_count (docs, num_classes, classnames,					       types[ti].number,					       types[ti].doc_type, 0,					       bow_doc_train);	}    }}/* Use the command line arguments to create the appropriate train/test split */voidbow_set_doc_types_for_barrel (bow_barrel *barrel){  bow_set_doc_types (barrel->cdocs, bow_barrel_num_classes (barrel), 		     barrel->classnames);}#define BOW_DOC_IS_X(X) \int bow_doc_is_ ## X (bow_doc *doc) { return (doc->type == bow_doc_ ## X); }BOW_DOC_IS_X(train)BOW_DOC_IS_X(test)BOW_DOC_IS_X(unlabeled)BOW_DOC_IS_X(untagged)BOW_DOC_IS_X(validation)BOW_DOC_IS_X(ignore)BOW_DOC_IS_X(pool)BOW_DOC_IS_X(waiting)/* return 1 for all training and unlabeled docs */intbow_cdoc_is_train_or_unlabeled (bow_cdoc *cdoc){  return ((cdoc->type == bow_doc_unlabeled) ||	  (cdoc->type == bow_doc_train));}void _register_split_args () __attribute__ ((constructor));void _register_split_args (){  static int done = 0;  if (done)     return;  bow_argp_add_child (&bow_split_argp_child);  done = 1;}
上一页 1 2 3
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -