/* crossbow.c -- document indexing and classification over a class
   hierarchy (excerpt; recovered from a web paste that destroyed the
   original line breaks). */
crossbow_new_root_from_dir (const char *dirname0, treenode *parent){ DIR *dir; struct dirent *dirent_p; struct stat st; char dirname[PATH_MAX]; char child_dirname[PATH_MAX]; treenode *ret; const char *basename; int i; /* Make DIRNAME be a copy of DIRNAME0, but without any trailing / */ strcpy (dirname, dirname0); i = strlen (dirname); if (dirname[i-1] == '/') dirname[i-1] = '\0'; if ((basename = strrchr (dirname, '/'))) basename++; else basename = dirname; if (parent) bow_verbosify (bow_progress, "Building tree for %s%s\n", parent->name, basename); else bow_verbosify (bow_progress, "Building root for %s\n", basename); ret = bow_treenode_new (parent, 8, basename); if (!(dir = opendir (dirname))) bow_error ("Couldn't open directory `%s'", dirname); while ((dirent_p = readdir (dir))) { sprintf (child_dirname, "%s/%s", dirname, dirent_p->d_name); stat (child_dirname, &st); if (S_ISDIR (st.st_mode) && strcmp (dirent_p->d_name, "unlabeled") && strcmp (dirent_p->d_name, ".") && strcmp (dirent_p->d_name, "..")) { /* This directory entry is a subdirectory. Recursively descend into it and append its files also. 
*/ crossbow_new_root_from_dir (child_dirname, ret); } else if (S_ISREG (st.st_mode)) { bow_verbosify (bow_verbose, "Ignoring file in hier non-leaf `%s'\n", dirent_p->d_name); } } closedir (dir); return ret;}/* Forward declare this function */void crossbow_index_multiclass_list ();static int text_file_count;/* CONTEXT points to the class index CI */int crossbow_index_filename (const char *filename, void *context){ FILE *fp; bow_wv *wv; crossbow_doc doc; int i; char dir[1024]; char munged_filename[1024]; char *last_slash; fp = fopen (filename, "r"); if (fp == NULL) { perror ("crossbow_index_filename"); fprintf (stderr, "Couldn't open `%s' for reading\n", filename); return 0; } if (bow_fp_is_text (fp)) { text_file_count++; wv = bow_wv_new_from_text_fp (fp, filename); if (strstr (filename, "unlabeled/")) { const char *u = strstr (filename, "unlabeled/"); int ulen = strlen ("unlabeled/"); strncpy (munged_filename, filename, u - filename); strcpy (munged_filename + (u - filename), u + ulen); } else strcpy (munged_filename, filename); if (wv) { /* Create and add an entry for the doc-info array */ doc.filename = strdup (munged_filename); assert (doc.filename); doc.tag = bow_doc_train; doc.word_count = bow_wv_word_count (wv); doc.wv_seek_pos = ftell (crossbow_wv_fp); doc.di = bow_array_next_index (crossbow_docs); /* Make DIR be the directory portion of FILENAME. It will be used as a classname. */ strcpy (dir, munged_filename); last_slash = strrchr (dir, '/'); *last_slash = '\0'; if (crossbow_arg_state.what_doing == crossbow_index_multiclass_list) { doc.ci = -1; } else {#if CLASSES_FROM_DIRS doc.ci = bow_str2int (crossbow_classnames, dir); bow_verbosify (bow_progress, "Putting file %s into class %s\n", filename, dir);#else doc.ci = *(int*)context;#endif } doc.cis_size = 0; doc.cis = NULL; i = bow_array_append (crossbow_docs, &doc); assert (i == doc.di); i = bow_str2int (crossbow_filename2di, munged_filename); assert (i == doc.di); /* Write the WV to disk. 
*/ bow_wv_write (wv, crossbow_wv_fp); } else bow_verbosify (bow_progress, "Empty WV in file `%s'\n", filename); bow_wv_free (wv); } fclose (fp); bow_verbosify (bow_progress, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b" "%6d : %6d", text_file_count, bow_num_words ()); return 1;}voidcrossbow_index_multiclass_list (){ FILE *listfp; char fn[BOW_MAX_WORD_LENGTH]; char line[BOW_MAX_WORD_LENGTH]; char *lineptr; char *classptr, *fileptr; crossbow_doc *doc; int ci, cisi; treenode *tn; bow_random_set_seed (); listfp = bow_fopen (crossbow_arg_state.multiclass_list_filename, "r"); /* Create some empty recepticals */ assert (crossbow_docs == NULL); crossbow_docs = bow_array_new (0, sizeof (crossbow_doc), crossbow_doc_free); crossbow_filename2di = bow_int4str_new (0); crossbow_classnames = bow_int4str_new (0); /* Create a root */ crossbow_root = bow_treenode_new_root (10); /* Open the file to which we will write the WV's for each indexed file. */ sprintf (fn, "%s/wvs", bow_data_dirname); crossbow_wv_fp = bow_fopen (fn, "wb"); /* If we are pruning the vocabulary by occurrence count, then read all the documents to get the word counts, and limit the vocabulary appropriately. */ if (bow_prune_vocab_by_occur_count_n) { bow_verbosify (bow_progress, "Scanning files to remove words " "occurring less than %d times...", bow_prune_vocab_by_occur_count_n); while (fgets (line, BOW_MAX_WORD_LENGTH, listfp)) { /* Skip empty lines (LINE[] includes the newline. */ if (strlen (line) <= 1) continue; assert (strlen (line) < BOW_MAX_WORD_LENGTH-1); lineptr = line; fileptr = strtok (lineptr, " \t\n\r"); assert (fileptr); bow_words_add_occurrences_from_file (fileptr); } /* Rewind the file back to the beginning so we can read it again. */ fseek (listfp, 0, SEEK_SET); bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. 
*/ bow_word2int_do_not_add = 1; bow_verbosify (bow_progress, "Done.\n"); } /* From all the lines, read the filenames, one per line, until there are no more left. */ while (fgets (line, BOW_MAX_WORD_LENGTH, listfp)) { /* Skip empty lines (LINE[] includes the newline. */ if (strlen (line) <= 1) continue; assert (strlen (line) < BOW_MAX_WORD_LENGTH-1); lineptr = line; fileptr = strtok (lineptr, " \t\n\r"); assert (fileptr); crossbow_index_filename (fileptr, NULL); /* Get the DOC just created by CROSSBOW_INDEX_FILENAME */ doc = bow_array_entry_at_index (crossbow_docs, crossbow_docs->length-1); /* Grab all the classnames, and set their CLASSPROBS to non-zero */ while ((classptr = strtok (lineptr, " \t\n\r"))) { if (strlen (classptr) == 0) continue; /* No good reason, just a silly check: */ assert (strlen (classptr) > 2); if (bow_str2int_no_add (crossbow_classnames, classptr) == -1) { /* This class hasn't been seen before; create a node for it */ tn = bow_treenode_new (crossbow_root, 2, classptr); ci = bow_str2int (crossbow_classnames, classptr); assert (tn->ci_in_parent == ci); } if (doc->cis == NULL) { doc->cis_size = 1; doc->cis = bow_malloc (sizeof (typeof (doc->cis[0])) * doc->cis_size); } else { doc->cis_size++; doc->cis = bow_realloc (doc->cis, (sizeof (typeof (doc->cis[0])) * doc->cis_size)); } ci = bow_str2int (crossbow_classnames, classptr); /* Be that the same class isn't listed twice for the same file */ for (cisi = 0; cisi < doc->cis_size-1; cisi++) assert (doc->cis[cisi] != ci); doc->cis[doc->cis_size-1] = ci; } /* Set DOC->CI from a randomly-selected entry in DOC->CIS so that the test/train splitting routines in split.c will work. 
*/ doc->ci = doc->cis[random() % doc->cis_size]; } /* Reallocate space for WORDS and NEW_WORDS distributions now that we have indexed all the files and know what the complete vocabulary size is */ bow_treenode_realloc_words_all (crossbow_root); crossbow_classes_count = crossbow_classnames->str_array_length; fclose (crossbow_wv_fp); crossbow_wv_fp = NULL; fclose (listfp); crossbow_archive (bow_data_dirname);}voidcrossbow_index (){ char fn[1024]; int argi; int ci; text_file_count = 0; /* If we are pruning the vocabulary by occurrence count, then read all the documents to get the word counts, and limit the vocabulary appropriately. */ if (bow_prune_vocab_by_occur_count_n) { /* Parse all the documents to get word occurrence counts. */ for (argi = crossbow_arg_state.non_option_argi; argi < crossbow_argc; argi++) bow_words_add_occurrences_from_text_dir (crossbow_argv[argi], ""); bow_words_remove_occurrences_less_than (bow_prune_vocab_by_occur_count_n); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } if (crossbow_arg_state.vocab_map) { /* Set the vocabulary to be the vocab map. */ bow_words_set_map (crossbow_arg_state.vocab_map, 1); /* Now insist that future calls to bow_word2int*() will not register new words. */ bow_word2int_do_not_add = 1; } assert (crossbow_docs == NULL); crossbow_docs = bow_array_new (0, sizeof (crossbow_doc), crossbow_doc_free); crossbow_filename2di = bow_int4str_new (0); crossbow_classnames = bow_int4str_new (0); /* Read all the documents and write them as bow_wv's to the appropriate file in the model directory. Also add entries to the CROSSBOW_CLASSNAMES map. This must be done before creating TREENODE's so that we know how big to make the their vocabulary distributions. 
*/ sprintf (fn, "%s/wvs", bow_data_dirname); crossbow_wv_fp = bow_fopen (fn, "wb"); for (argi = crossbow_arg_state.non_option_argi; argi < crossbow_argc; argi++) { ci = argi - crossbow_arg_state.non_option_argi; bow_map_filenames_from_dir (crossbow_index_filename, &ci, crossbow_argv[argi], "");#if !CLASSES_FROM_DIRS bow_str2int (crossbow_classnames, crossbow_argv[argi]);#endif } fclose (crossbow_wv_fp); crossbow_wv_fp = NULL; bow_verbosify (bow_progress, "\n");#if CLASSES_FROM_DIRS /* The number of classes equals the number of entries in the classname -> class index map. */ crossbow_classes_count = crossbow_classnames->str_array_length;#else /* Remember the number of topic class tags */ crossbow_classes_count = crossbow_argc - crossbow_arg_state.non_option_argi;#endif /* Build the hierarchy of treenode's. This must be done after the documents have been read so that the treenode's know how big to make their vocabulary distributions. */ if (crossbow_arg_state.build_hier_from_dir) { /* xxx This scheme currently makes it difficult to have a class distribution in each node, and to set the hierarchy from a directory structure. 
*/ assert (crossbow_argc - crossbow_arg_state.non_option_argi == 1); crossbow_root = crossbow_new_root_from_dir (crossbow_argv[crossbow_arg_state.non_option_argi], NULL); bow_treenode_set_classes_uniform (crossbow_root, crossbow_classes_count); } else { /* Create just a single root node */ crossbow_root = bow_treenode_new_root (10); /* If there was more than one directory specified on command-line allocate space to hold a class distribution */ if (crossbow_classes_count > 1) bow_treenode_set_classes_uniform (crossbow_root, crossbow_classes_count); } crossbow_archive (bow_data_dirname);}/* Return a bow_wa containing the classification scores (log probabilities) of DOC indexed by the leaf indices */bow_wa *//crossbow_classify_doc_new_wa (crossbow_doc *doc)crossbow_classify_doc_new_wa (bow_wv *wv){ int li, leaf_count; double leaf_membership; treenode **leaves, *iterator, *leaf; bow_wa *wa; /* Classify the documents in the TAG-set */ leaf_count = bow_treenode_leaf_count (crossbow_root); leaves = alloca (leaf_count * sizeof (void*)); wa = bow_wa_new (leaf_count); /* Get the membership probability of each leaf */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) leaf_membership = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_membership = bow_treenode_log_local_prob_of_wv (leaf, wv); leaf_membership += log (leaf->prior); bow_wa_append (wa, li, leaf_membership); leaves[li] = leaf; } return wa;}intcrossbow_classify_doc (crossbow_doc *doc, int verbose, FILE *out){ int li, leaf_count; double leaf_membership; treenode **leaves, *iterator, *leaf; bow_wv *wv; bow_wa *wa; int ret;#if 0 double inverse_rank_sum = 0; int rank; double score_diff_sum = 0;#endif int word_count; /* Classify the documents in the TAG-set */ leaf_count = bow_treenode_leaf_count (crossbow_root); leaves = alloca (leaf_count * sizeof (void*)); wv = crossbow_wv_at_di (doc->di); assert (wv); word_count = bow_wv_word_count (wv); wa = 
bow_wa_new (leaf_count); /* Get the membership probability of each leaf */ for (iterator = crossbow_root, li = 0; (leaf = bow_treenode_iterate_leaves (&iterator)); li++) { if (crossbow_hem_shrinkage) leaf_membership = bow_treenode_log_prob_of_wv (leaf, wv); else leaf_membership = bow_treenode_log_local_prob_of_wv (leaf, wv); leaf_membership += log (leaf->prior);#define DOC_LENGTH_SCORE_TRANSFORM 1#if DOC_LENGTH_SCORE_TRANSFORM leaf_membership /= ((word_count + 1) / MIN(9,word_count));#endif bow_wa_append (wa, li, leaf_membership); leaves[li] = leaf; } /* Print the results. */ assert (wa->length == leaf_count); bow_wa_sort (wa); leaf = bow_treenode_descendant_matching_name (crossbow_root, doc->filename); if (leaf && !strcmp (leaf->name, leaves[wa->entry[0].wi]->name)) ret = 1; else ret = 0;#if 0 for (rank = 0; rank < wa->length; rank++) if (! strcmp (leaf->name, leaves[wa->entry[rank].wi]->name)) { inverse_rank_sum += 1.0 / (rank + 1); score_diff_sum += wa->entry[0].weight - wa->entry[rank].weight; break; }#endif if (verbose) { fprintf (out, "%s %s ", doc->filename, leaf ? leaf->name : "<unknown>"); if (verbose >= 2) for (li = 0; li < leaf_count; li++) fprintf (out, "%s:%g ", leaves[wa->entry[li].wi]->name, wa->entry[li].weight);
/* (Web-page chrome from the code-sharing site removed; the source code
   above is truncated mid-function in the original paste.) */