📄 crossbow.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* A clustering front-end to libbow. */

/* Copyright (C) 1997, 1998, 1999, 2000 Andrew McCallum

   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

/* Some naming conventions:
   Use x_count; not num_x or x_size.
   func_x_all() is the same as func_x(), except that it iterates over
   all nodes in the tree.
*/

#include <bow/libbow.h>
#include <argp.h>
#include <bow/crossbow.h>

/* For query serving on a socket */
#include <errno.h>              /* needed on DEC Alpha's */
#include <unistd.h>             /* for getopt(), maybe */
#include <stdlib.h>             /* for atoi() */
#include <string.h>             /* for strrchr() */
#include <sys/types.h>
#include <sys/socket.h>
#ifndef WINNT
#include <sys/un.h>
#endif /* WINNT */
#include <netinet/in.h>
#include <netdb.h>
#include <strings.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>

/* For mkdir() and stat() */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* For opendir */
#include <dirent.h>

#define SHRINKAGE 1
#define CLASSES_FROM_DIRS 1

extern void crossbow_hem_cluster ();
extern void crossbow_hem_full_em ();
extern void crossbow_hem_fienberg ();
extern int crossbow_hem_deterministic_horizontal;
extern int crossbow_hem_restricted_horizontal;
extern int crossbow_hem_shrinkage;

/* The version number of this program. */
#define CROSSBOW_MAJOR_VERSION 0
#define CROSSBOW_MINOR_VERSION 0

/* Global variables. */

/* The top of the hierarchy */
treenode *crossbow_root;

/* The list of documents */
bow_array *crossbow_docs;

/* A hashtable from doc filenames to document indices */
bow_int4str *crossbow_filename2di;

/* The number of classes in a "supervised" setting */
int crossbow_classes_count;

/* A mapping between classnames and class indices */
bow_int4str *crossbow_classnames;

/* FILE* to file containing WV's of the documents */
FILE *crossbow_wv_fp;

/* Access to the arguments of main() */
int crossbow_argc;
char **crossbow_argv;

struct crossbow_arg_state
{
  /* What is this invocation of crossbow supposed to do? */
  void (*what_doing)();
  int non_option_argi;
  const char *server_port_num;
  int serve_with_forking;
  const char *cluster_output_dir;
  int build_hier_from_dir;
  const char *print_file_prefix;
  const char *printing_tag;
  const char *classify_files_dirname;
  const char *multiclass_list_filename;
  bow_int4str *vocab_map;
} crossbow_arg_state;


/* Functions for creating, reading, writing a crossbow_doc */

/* Write the fields of DOC to FP: filename, tag, word_count,
   wv_seek_pos, di and ci, followed (for archive format version 7 and
   later) by cis_size and the CIS array.  Returns the sum of the
   return values of the individual bow_fwrite_*() calls. */
int
crossbow_doc_write (crossbow_doc *doc, FILE *fp)
{
  int ret;
  int i;

  ret = bow_fwrite_string (doc->filename, fp);
  ret += bow_fwrite_int (doc->tag, fp);
  ret += bow_fwrite_int (doc->word_count, fp);
  ret += bow_fwrite_int (doc->wv_seek_pos, fp);
  ret += bow_fwrite_int (doc->di, fp);
  ret += bow_fwrite_int (doc->ci, fp);
  if (bow_file_format_version >= 7)
    {
      ret += bow_fwrite_int (doc->cis_size, fp);
      for (i = 0; i < doc->cis_size; i++)
        ret += bow_fwrite_int (doc->cis[i], fp);
    }
  return ret;
}

/* Fill in DOC from FP, reading the fields in the order that
   crossbow_doc_write() emits them.  The WV and CIS_MIXTURE members
   are reset to NULL.  Returns the sum of the return values of the
   individual bow_fread_*() calls. */
int
crossbow_doc_read (crossbow_doc *doc, FILE *fp)
{
  int ret;
  int tag;
  int i;

  ret = bow_fread_string ((char**)&(doc->filename), fp);
  /* Read the tag through a plain int, then assign it to DOC->TAG. */
  ret += bow_fread_int (&tag, fp);
  doc->tag = tag;
  ret += bow_fread_int (&(doc->word_count), fp);
  ret += bow_fread_int (&(doc->wv_seek_pos), fp);
  ret += bow_fread_int (&(doc->di), fp);
  ret += bow_fread_int (&(doc->ci), fp);
  if (bow_file_format_version >= 7)
    {
      ret += bow_fread_int (&(doc->cis_size), fp);
      if (doc->cis_size)
        {
          doc->cis = bow_malloc (doc->cis_size * sizeof (doc->cis[0]));
          for (i = 0; i < doc->cis_size; i++)
            ret += bow_fread_int (&(doc->cis[i]), fp);
        }
      else
        doc->cis = NULL;
    }
  else
    {
      /* Older archives carry no CIS data.  Give the fields defined
         values instead of leaving them uninitialized. */
      doc->cis_size = 0;
      doc->cis = NULL;
    }
  doc->wv = NULL;
  doc->cis_mixture = NULL;
  return ret;
}

/* Release the filename owned by DOC.
   NOTE(review): DOC->CIS, which crossbow_doc_read() allocates, is not
   released here -- confirm whether it is freed elsewhere. */
void
crossbow_doc_free (crossbow_doc *doc)
{
  /* free(NULL) is a no-op, so no guard is needed. */
  free ((void*)doc->filename);
}

/* Return the WV of the DI'th document.  The WV is read from
   CROSSBOW_WV_FP at the document's WV_SEEK_POS on first use and
   cached in the document entry afterwards. */
bow_wv *
crossbow_wv_at_di (int di)
{
  crossbow_doc *doc;

  doc = bow_array_entry_at_index (crossbow_docs, di);
  if (!doc->wv)
    {
      fseek (crossbow_wv_fp, doc->wv_seek_pos, SEEK_SET);
      doc->wv = bow_wv_new_from_data_fp (crossbow_wv_fp);
    }
  return doc->wv;
}

/* Load all the document WV's into their DOC's.  Reads
   CROSSBOW_WV_FP sequentially from its beginning; asserts that no
   document has a WV yet and that each document's WV_SEEK_POS matches
   the current file position. */
void
crossbow_load_doc_wvs ()
{
  int di;
  crossbow_doc *doc;

  fseek (crossbow_wv_fp, 0, SEEK_SET);
  for (di = 0; di < crossbow_docs_count; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      assert (doc->wv == NULL);
      assert (doc->wv_seek_pos == ftell (crossbow_wv_fp));
      doc->wv = bow_wv_new_from_data_fp (crossbow_wv_fp);
    }
}

/* Convert the array of log-probabilities into a normalized array of
   probabilities, in place.  Every entry of LOG_PROBS must be <= 0 and
   NUM_ENTRIES must be at least 1 (both are asserted). */
void
crossbow_convert_log_probs_to_probs (double *log_probs, int num_entries)
{
  double max_log_prob;
  int i;
  double normalizer;

  /* Renormalize by adding a constant to the LOG_PROBS: subtracting
     the maximum makes the largest entry exactly 0 before exp(). */
  max_log_prob = -DBL_MAX;
  for (i = 0; i < num_entries; i++)
    {
      assert (log_probs[i] <= 0);
      if (log_probs[i] > max_log_prob)
        max_log_prob = log_probs[i];
    }
  assert (max_log_prob != -DBL_MAX);
  for (i = 0; i < num_entries; i++)
    log_probs[i] -= max_log_prob;

  /* Exponentiate the log_probs to get probabilities, and renormalize
     by dividing by their sum. */
  normalizer = 0;
  for (i = 0; i < num_entries; i++)
    {
      log_probs[i] = exp (log_probs[i]);
      normalizer += log_probs[i];
    }
  for (i = 0; i < num_entries; i++)
    {
      log_probs[i] /= normalizer;
      assert (log_probs[i] >= 0);
      assert (log_probs[i] <= 1);
    }
}

/* Print the filenames of the documents that are most probable in each
   leaf.  For every leaf, the leaf name is printed, followed by up to
   NUM_TO_PRINT lines holding a per-word log score and the final path
   component of a document filename.  At most one line per indexed
   document is printed. */
void
crossbow_leaf_document_probs_print (int num_to_print)
{
  int num_leaves;
  treenode *iterator, *leaf;
  double *leaf_membership;
  int li, di, i;
  int print_count;
  bow_wa **was;                 /* being used with di's instead of wi's */
  bow_wv *wv;
  crossbow_doc *doc;
  const char *slash;

  num_leaves = bow_treenode_leaf_count (crossbow_root);
  leaf_membership = alloca (num_leaves * sizeof (double));
  was = alloca (num_leaves * sizeof (was[0]));
  for (li = 0; li < num_leaves; li++)
    was[li] = bow_wa_new (crossbow_root->words_capacity+2);

  /* Score every document against every leaf. */
  for (di = 0; di < crossbow_docs->length; di++)
    {
      wv = crossbow_wv_at_di (di);
      for (iterator = crossbow_root, li = 0;
           (leaf = bow_treenode_iterate_leaves (&iterator));
           li++)
        {
          leaf_membership[li] = (log (leaf->prior)
                                 + bow_treenode_log_prob_of_wv (leaf, wv));
          /* Divide by the word count to get a per-word score. */
          leaf_membership[li] /= bow_wv_word_count (wv);
        }
      for (li = 0; li < num_leaves; li++)
        bow_wa_append (was[li], di, leaf_membership[li]);
    }

  /* Each leaf's list received exactly one entry per document above,
     so never index past that many entries. */
  print_count = num_to_print;
  if (print_count > crossbow_docs->length)
    print_count = crossbow_docs->length;

  for (iterator = crossbow_root, li = 0;
       (leaf = bow_treenode_iterate_leaves (&iterator));
       li++)
    {
      bow_wa_sort (was[li]);
      fprintf (stdout, "%s\n", leaf->name);
      for (i = 0; i < print_count; i++)
        {
          doc = bow_array_entry_at_index (crossbow_docs,
                                          was[li]->entry[i].wi);
          /* strrchr() returns NULL when the filename has no '/'; fall
             back to printing the whole filename in that case. */
          slash = strrchr (doc->filename, '/');
          fprintf (stdout, "%20.10f %s\n",
                   was[li]->entry[i].weight,
                   slash ? slash + 1 : doc->filename);
        }
    }

  /* Free the WAS */
  for (li = 0; li < num_leaves; li++)
    bow_wa_free (was[li]);
}

/* Store "DIRNAME/NAME" in BUF, which holds SIZE bytes.  Calls
   bow_error() instead of overflowing or silently truncating when the
   path does not fit. */
static void
crossbow_archive_file_path (char *buf, size_t size,
                            const char *dirname, const char *name)
{
  int len;

  len = snprintf (buf, size, "%s/%s", dirname, name);
  if (len < 0 || (size_t) len >= size)
    bow_error ("Archive path `%s/%s' is too long", dirname, name);
}

/* Write CROSSBOW_ROOT to disk in directory DIRNAME.  Alongside the
   tree this writes the format version, the document list, the
   classnames map, the vocabulary, the filename-to-index map and
   CROSSBOW_CLASSES_COUNT, each to its own file inside DIRNAME.
   NOTE(review): bow_fopen() results are used unchecked, as before;
   presumably bow_fopen() reports failures itself -- confirm. */
void
crossbow_archive (const char *dirname)
{
  FILE *fp;
  char buf[1024];
#if 0
  struct stat st;
  int e;

  /* Create the data directory, if it doesn't exist already. */
  e = stat (crossbow_arg_state.cluster_output_dir, &st);
  if (e == 0)
    {
      /* Assume this means the file exists. */
      if (!S_ISDIR (st.st_mode))
        bow_error ("`%s' already exists, but is not a directory",
                   crossbow_arg_state.cluster_output_dir);
    }
  else
    {
      if (mkdir (crossbow_arg_state.cluster_output_dir, 0777) == 0)
        bow_verbosify (bow_quiet, "Created directory `%s'.\n",
                       dirname);
      else if (errno != EEXIST)
        bow_error ("Couldn't create default data directory `%s'",
                   dirname);
    }
#endif

  /* Write archive file format version */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "version");
  bow_write_format_version_to_file (buf);

  /* Write the tree */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "tree");
  fp = bow_fopen (buf, "wb");
  bow_treenode_write (crossbow_root, fp);
  fclose (fp);

  /* Write the list of info about documents */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "docs");
  fp = bow_fopen (buf, "wb");
  bow_array_write (crossbow_docs, (int(*)(void*,FILE*))crossbow_doc_write, fp);
  fclose (fp);

  /* Write the classnames map. */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "classnames");
  fp = bow_fopen (buf, "w");
  bow_int4str_write (crossbow_classnames, fp);
  fclose (fp);

  /* Write the vocabulary */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "vocabulary");
  fp = bow_fopen (buf, "wb");
  bow_words_write (fp);
  fclose (fp);

  /* Write the map from doc filenames to document indices */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "filename2di");
  fp = bow_fopen (buf, "w");
  bow_int4str_write (crossbow_filename2di, fp);
  fclose (fp);

  /* Write the CROSSBOW_CLASSES_COUNT */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "classes-count");
  fp = bow_fopen (buf, "wb");
  bow_fwrite_int (crossbow_classes_count, fp);
  fclose (fp);
}

/* Read CROSSBOW_ROOT from disk in directory DIRNAME.  Also restores
   the document list, classnames map, vocabulary and filename-to-index
   map, opens CROSSBOW_WV_FP on DIRNAME/wvs, loads every document's
   WV, and sets CROSSBOW_CLASSES_COUNT. */
void
crossbow_unarchive (const char *dirname)
{
  FILE *fp;
  char buf[1024];
  int di;
  crossbow_doc *doc;

  /* Read the archive file format version */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "version");
  bow_read_format_version_from_file (buf);

  /* Read the hierarchy */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "tree");
  fp = bow_fopen (buf, "rb");
  crossbow_root = bow_treenode_new_from_fp (fp);
  fclose (fp);

  /* Read the list of info about documents */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "docs");
  fp = bow_fopen (buf, "rb");
  crossbow_docs =
    bow_array_new_with_entry_size_from_data_fp
    (sizeof (crossbow_doc),
     (int(*)(void*,FILE*))crossbow_doc_read,
     crossbow_doc_free, fp);
  fclose (fp);

  /* Read the classnames map */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "classnames");
  fp = bow_fopen (buf, "r");
  crossbow_classnames = bow_int4str_new_from_fp (fp);
  fclose (fp);

  /* Read the vocabulary */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "vocabulary");
  fp = bow_fopen (buf, "rb");
  bow_words_read_from_fp (fp);
  fclose (fp);

  /* Read the mapping from doc filenames to document indices */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "filename2di");
  fp = bow_fopen (buf, "r");
  crossbow_filename2di = bow_int4str_new_from_fp (fp);
  fclose (fp);

  /* Open the document FP so it is ready for later reading.  It is
     deliberately left open in CROSSBOW_WV_FP. */
  crossbow_archive_file_path (buf, sizeof (buf), dirname, "wvs");
  crossbow_wv_fp = bow_fopen (buf, "rb");

  /* Load all the WV's of the documents into memory. */
  crossbow_load_doc_wvs ();

  /* Initialize CROSSBOW_CLASSES_COUNT. */
  if (bow_file_format_version >= 7)
    {
      crossbow_archive_file_path (buf, sizeof (buf), dirname,
                                  "classes-count");
      fp = bow_fopen (buf, "rb");
      bow_fread_int (&crossbow_classes_count, fp);
      fclose (fp);
    }
  else
    {
      crossbow_classes_count = 0;
      /* Step through all the documents, and record the largest class index */
      for (di = 0; di < crossbow_docs->length; di++)
        {
          doc = bow_array_entry_at_index (crossbow_docs, di);
          if (doc->ci + 1 > crossbow_classes_count)
            crossbow_classes_count = doc->ci + 1;
        }
    }
}

treenode *
12 3 下一页
💿 文件大小 12 K
👤 上传用户 yjpynnpl
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -