/* crossbow.c -- document indexing and classification over a class
   hierarchy (excerpt; recovered from a web paste that destroyed the
   original line breaks). */
crossbow_new_root_from_dir (const char *dirname0, treenode *parent){ DIR *dir; struct dirent *dirent_p; struct stat st; char dirname[PATH_MAX]; char child_dirname[PATH_MAX]; treenode *ret; const char *basename; int i; /* Make DIRNAME be a copy of DIRNAME0, but without any trailing / */ strcpy (dirname, dirname0); i = strlen (dirname); if (dirname[i-1] == '/') dirname[i-1] = '\0'; if ((basename = strrchr (dirname, '/'))) basename++; else basename = dirname; if (parent) bow_verbosify (bow_progress, "Building tree for %s%s\n", parent->name, basename); else bow_verbosify (bow_progress, "Building root for %s\n", basename); ret = bow_treenode_new (parent, 8, basename); if (!(dir = opendir (dirname))) bow_error ("Couldn't open directory `%s'", dirname); while ((dirent_p = readdir (dir))) { sprintf (child_dirname, "%s/%s", dirname, dirent_p->d_name); stat (child_dirname, &st); if (S_ISDIR (st.st_mode) && strcmp (dirent_p->d_name, "unlabeled") && strcmp (dirent_p->d_name, ".") && strcmp (dirent_p->d_name, "..")) { /* This directory entry is a subdirectory. Recursively descend into it and append its files also. 
*/ crossbow_new_root_from_dir (child_dirname, ret); } else if (S_ISREG (st.st_mode)) { bow_verbosify (bow_verbose, "Ignoring file in hier non-leaf `%s'\n", dirent_p->d_name); } } closedir (dir); return ret;}/* Forward declare this function */void crossbow_index_multiclass_list ();static int text_file_count;/* CONTEXT points to the class index CI */int crossbow_index_filename (const char *filename, void *context){ FILE *fp; bow_wv *wv; crossbow_doc doc; int i; char dir[1024]; char munged_filename[1024]; char *last_slash; fp = fopen (filename, "r"); if (fp == NULL) { perror ("crossbow_index_filename"); fprintf (stderr, "Couldn't open `%s' for reading\n", filename); return 0; } if (bow_fp_is_text (fp)) { text_file_count++; wv = bow_wv_new_from_text_fp (fp, filename); if (strstr (filename, "unlabeled/")) { const char *u = strstr (filename, "unlabeled/"); int ulen = strlen ("unlabeled/"); strncpy (munged_filename, filename, u - filename); strcpy (munged_filename + (u - filename), u + ulen); } else strcpy (munged_filename, filename); if (wv) { /* Create and add an entry for the doc-info array */ doc.filename = strdup (munged_filename); assert (doc.filename); doc.tag = bow_doc_train; doc.word_count = bow_wv_word_count (wv); doc.wv_seek_pos = ftell (crossbow_wv_fp); doc.di = bow_array_next_index (crossbow_docs); /* Make DIR be the directory portion of FILENAME. It will be used as a classname. */ strcpy (dir, munged_filename); last_slash = strrchr (dir, '/'); *last_slash = '\0'; if (crossbow_arg_state.what_doing == crossbow_index_multiclass_list) { doc.ci = -1; } else {#if CLASSES_FROM_DIRS doc.ci = bow_str2int (crossbow_classnames, dir); bow_verbosify (bow_progress, "Putting file %s into class %s\n", filename, dir);#else doc.ci = *(int*)context;#endif } doc.cis_size = 0; doc.cis = NULL; i = bow_array_append (crossbow_docs, &doc); assert (i == doc.di); i = bow_str2int (crossbow_filename2di, munged_filename); assert (i == doc.di); /* Write the WV to disk. 
*/ bow_wv_write (wv, crossbow_wv_fp); } else bow_verbosify (bow_progress, "Empty WV in file `%s'\n", filename); bow_wv_free (wv); } fclose (fp); bow_verbosify (bow_progress, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b" "%6d : %6d", text_file_count, bow_num_words ()); return 1;}voidcrossbow_index_multiclass_list (){ FILE *listfp; char fn[BOW_MAX_WORD_LENGTH]; char line[BOW_MAX_WORD_LENGTH]; char *lineptr; char *classptr, *fileptr; crossbow_doc *doc; int ci, cisi; treenode *tn; bow_random_set_seed (); listfp = bow_fopen (crossbow_arg_state.multiclass_list_filename, "r"); /* Create some empty recepticals */ assert (crossbow_docs == NULL); crossbow_docs = bow_array_new (0, sizeof (crossbow_doc), crossbow_doc_free); crossbow_filename2di = bow_int4str_new (0); crossbow_classnames = bow_int4str_new (0); /* Create a root */ crossbow_root = bow_treenode_new_root (10); /* Open the file to which we will write the WV's for each indexed file. */ sprintf (fn, "%s/wvs", bow_data_dirname); crossbow_wv_fp = bow_fopen (fn, "wb"); /* If we are pruning the vocabulary by occurrence count, then read all the documents to get the word counts, and limit the vocabulary appropriately. */ if (bow_prune_vocab_by_occur_count_n) { bow_verbosify (bow_progress, "Scanning files to remove words " "occurring less than %d times...", bow_prune_vocab_by_occur_count_n); while (fgets (line, BOW_MAX_WORD_LENGTH, listfp)) { /* Skip empty lines (LINE[] includes the newline. */ if (strlen (line) <= 1) continue; assert (strlen (line) < BOW_MAX_WORD_LENGTH-1); lineptr = line; fileptr = strtok (lineptr, " \t\n\r"); assert (fileptr); bow_words_add_occurrences_from_file (fileptr); } /* Rewind the file back to the beginning so we can read it again. */ fseek (listfp, 0, SEEK_SET); bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. 
*/ bow_word2int_do_not_add = 1; bow_verbosify (bow_progress, "Done.\n"); } /* From all the lines, read the filenames, one per line, until there are no more left. */ while (fgets (line, BOW_MAX_WORD_LENGTH, listfp)) { /* Skip empty lines (LINE[] includes the newline. */ if (strlen (line) <= 1) continue; assert (strlen (line) < BOW_MAX_WORD_LENGTH-1); lineptr = line; fileptr = strtok (lineptr, " \t\n\r"); assert (fileptr); crossbow_index_filename (fileptr, NULL); /* Get the DOC just created by CROSSBOW_INDEX_FILENAME */ doc = bow_array_entry_at_index (crossbow_docs, crossbow_docs->length-1); /* Grab all the classnames, and set their CLASSPROBS to non-zero */ while ((classptr = strtok (lineptr, " \t\n\r"))) { if (strlen (classptr) == 0) continue; /* No good reason, just a silly check: */ assert (strlen (classptr) > 2); if (bow_str2int_no_add (crossbow_classnames, classptr) == -1) { /* This class hasn't been seen before; create a node for it */ tn = bow_treenode_new (crossbow_root, 2, classptr); ci = bow_str2int (crossbow_classnames, classptr); assert (tn->ci_in_parent == ci); } if (doc->cis == NULL) { doc->cis_size = 1; doc->cis = bow_malloc (sizeof (typeof (doc->cis[0])) * doc->cis_size); } else { doc->cis_size++; doc->cis = bow_realloc (doc->cis, (sizeof (typeof (doc->cis[0])) * doc->cis_size)); } ci = bow_str2int (crossbow_classnames, classptr); /* Be that the same class isn't listed twice for the same file */ for (cisi = 0; cisi < doc->cis_size-1; cisi++) assert (doc->cis[cisi] != ci); doc->cis[doc->cis_size-1] = ci; } /* Set DOC->CI from a randomly-selected entry in DOC->CIS so that the test/train splitting routines in split.c will work. 
*/ doc->ci = doc->cis[random() % doc->cis_size]; } /* Reallocate space for WORDS and NEW_WORDS distributions now that we have indexed all the files and know what the complete vocabulary size is */ bow_treenode_realloc_words_all (crossbow_root); crossbow_classes_count = crossbow_classnames->str_array_length; fclose (crossbow_wv_fp); crossbow_wv_fp = NULL; fclose (listfp); crossbow_archive (bow_data_dirname);}voidcrossbow_index (){ char fn[1024]; int argi; int ci; text_file_count = 0; /* If we are pruning the vocabulary by occurrence count, then read all the documents to get the word counts, and limit the vocabulary appropriately. */ if (bow_prune_vocab_by_occur_count_n) { /* Parse all the documents to get word occurrence counts. */ for (argi = crossbow_arg_state.non_option_argi; argi < crossbow_argc; argi++) bow_words_add_occurrences_from_text_dir (crossbow_argv[argi], ""); bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } if (crossbow_arg_state.vocab_map) { /* Set the vocabulary to be the vocab map. */ bow_words_set_map (crossbow_arg_state.vocab_map, 1); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } assert (crossbow_docs == NULL); crossbow_docs = bow_array_new (0, sizeof (crossbow_doc), crossbow_doc_free); crossbow_filename2di = bow_int4str_new (0); crossbow_classnames = bow_int4str_new (0); /* Read all the documents and write them as bow_wv's to the appropriate file in the model directory. Also add entries to the CROSSBOW_CLASSNAMES map. This must be done before creating TREENODE's so that we know how big to make the their vocabulary distributions. 
*/ sprintf (fn, "%s/wvs", bow_data_dirname); crossbow_wv_fp = bow_fopen (fn, "wb"); for (argi = crossbow_arg_state.non_option_argi; argi < crossbow_argc; argi++) { ci = argi - crossbow_arg_state.non_option_argi; bow_map_filenames_from_dir (crossbow_index_filename, &ci, crossbow_argv[argi], "");#if !CLASSES_FROM_DIRS bow_str2int (crossbow_classnames, crossbow_argv[argi]);#endif } fclose (crossbow_wv_fp); crossbow_wv_fp = NULL; bow_verbosify (bow_progress, "\n");#if CLASSES_FROM_DIRS /* The number of classes equals the number of entries in the classname -> class index map. */ crossbow_classes_count = crossbow_classnames->str_array_length;#else /* Remember the number of topic class tags */ crossbow_classes_count = crossbow_argc - crossbow_arg_state.non_option_argi;#endif /* Build the hierarchy of treenode's. This must be done after the documents have been read so that the treenode's know how big to make their vocabulary distributions. */ if (crossbow_arg_state.build_hier_from_dir) { /* xxx This scheme currently makes it difficult to have a class distribution in each node, and to set the hierarchy from a directory structure. 
*/ assert (crossbow_argc - crossbow_arg_state.non_option_argi == 1); crossbow_root = crossbow_new_root_from_dir (crossbow_argv[crossbow_arg_state.non_option_argi], NULL); bow_treenode_set_classes_uniform (crossbow_root, crossbow_classes_count); } else { /* Create just a single root node */ crossbow_root = bow_treenode_new_root (10); /* If there was more than one directory specified on command-line allocate space to hold a class distribution */ if (crossbow_classes_count > 1) bow_treenode_set_classes_uniform (crossbow_root, crossbow_classes_count); } crossbow_archive (bow_data_dirname);}/* Return a bow_wa containing the classification scores (log probabilities) of DOC indexed by the leaf indices */bow_wa *//crossbow_classify_doc_new_wa (crossbow_doc *doc)crossbow_classify_doc_new_wa (bow_wv *wv){ int li, leaf_count; double leaf_membership; treenode **leaves, *iterator, *leaf; bow_wa *wa; /* Classify the documents in the TAG-set */ leaf_count = bow_treenode_leaf_count (crossbow_root); leaves = alloca (leaf_count * sizeof (void*)); wa = bow_wa_new (leaf_count); /* Get the membership probability of each leaf */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) leaf_membership = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_membership = bow_treenode_log_local_prob_of_wv (leaf, wv); leaf_membership += log (leaf->prior); bow_wa_append (wa, li, leaf_membership); leaves[li] = leaf; } return wa;}intcrossbow_classify_doc (crossbow_doc *doc, int verbose, FILE *out){ int li, leaf_count; double leaf_membership; treenode **leaves, *iterator, *leaf; bow_wv *wv; bow_wa *wa; int ret;#if 0 double inverse_rank_sum = 0; int rank; double score_diff_sum = 0;#endif int word_count; /* Classify the documents in the TAG-set */ leaf_count = bow_treenode_leaf_count (crossbow_root); leaves = alloca (leaf_count * sizeof (void*)); wv = crossbow_wv_at_di (doc->di); assert (wv); word_count = bow_wv_word_count (wv); wa = 
bow_wa_new (leaf_count); /* Get the membership probability of each leaf */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) leaf_membership = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_membership = bow_treenode_log_local_prob_of_wv (leaf, wv); leaf_membership += log (leaf->prior);#define DOC_LENGTH_SCORE_TRANSFORM 1#if DOC_LENGTH_SCORE_TRANSFORM leaf_membership /= ((word_count + 1) / MIN(9,word_count));#endif bow_wa_append (wa, li, leaf_membership); leaves[li] = leaf; } /* Print the results. */ assert (wa->length == leaf_count); bow_wa_sort (wa); leaf = bow_treenode_descendant_matching_name (crossbow_root, doc->filename); if (leaf && !strcmp (leaf->name, leaves[wa->entry[0].wi]->name)) ret = 1; else ret = 0;#if 0 for (rank = 0; rank < wa->length; rank++) if (! strcmp (leaf->name, leaves[wa->entry[rank].wi]->name)) { inverse_rank_sum += 1.0 / (rank + 1); score_diff_sum += wa->entry[0].weight - wa->entry[rank].weight; break; }#endif if (verbose) { fprintf (out, "%s %s ", doc->filename, leaf ? leaf->name : "<unknown>"); if (verbose >= 2) for (li = 0; li < leaf_count; li++) fprintf (out, "%s:%g ", leaves[wa->entry[li].wi]->name, wa->entry[li].weight);
/* (Web-page chrome from the code-sharing site removed; the source code
   above is truncated mid-function in the original paste.) */