📄 treenode.c
字号:
/* treenode.c - Functions for hierarchical word distributions. *//* Copyright (C) 1997, 1998, 1999 Andrew McCallum Written by: Andrew Kachites McCallum <mccallum@cs.cmu.edu> This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#include <bow/libbow.h>#include <argp.h>#include <bow/crossbow.h>#define USE_ACCELERATED_EM 0#define MISC_STAYS_FLAT 1/* Accellerated EM: still gauranteed to converge below 2.0. 1.8 is good */#define EM_ACCELERATION 1.8/* Functions for creating, reading, writing the treenode *//* Create and return a new treenode, adding it as a child of PARENT, if PARENT is non-NULL. */treenode *bow_treenode_new (treenode *parent, int children_capacity, const char *name){ treenode *ret; int i; /* To avoid malloc'ing zero bytes */ if (children_capacity == 0) children_capacity = 1; ret = bow_malloc (sizeof (treenode)); /* Set relationship with parent. */ ret->parent = parent; if (parent) { ret->ci_in_parent = parent->children_count; if (parent->children_count >= parent->children_capacity) { parent->children_capacity *= 2; parent->children = bow_realloc (parent->children, parent->children_capacity * sizeof (void*)); } parent->children[parent->children_count++] = ret; if (name) { /* +1 for the /, +1 for the terminating 0 */ ret->name = bow_malloc (strlen (parent->name) + strlen (name) + 2); sprintf ((char*)ret->name, "%s%s/", parent->name, name); } else { ret->name = bow_malloc (strlen (parent->name) + 20); sprintf ((char*)ret->name, "%s%d/", parent->name, ret->ci_in_parent); } ret->words_capacity = parent->words_capacity; ret->classes_capacity = parent->classes_capacity; } else { ret->ci_in_parent = -1; ret->name = strdup ("/"); ret->words_capacity = bow_num_words (); ret->classes_capacity = 0; } ret->children_count = 0; ret->children_capacity = children_capacity; assert (children_capacity); ret->children = bow_malloc (ret->children_capacity * sizeof (void*)); ret->words = bow_malloc (ret->words_capacity * sizeof (double)); ret->new_words = bow_malloc (ret->words_capacity * sizeof (double)); ret->new_words_normalizer = 0; for (i = 0; i < ret->words_capacity; i++) { ret->words[i] = 0; ret->new_words[i] = 0; } ret->prior = 1.0; ret->new_prior = 1.0; if (parent) ret->depth = parent->depth + 1; else ret->depth = 0; /* Initialize ancestor mixture weights, LAMBDAS, to use exclusively the local estimate. */ ret->lambdas = bow_malloc ((ret->depth + 2) * sizeof (double)); ret->new_lambdas = bow_malloc ((ret->depth + 2) * sizeof (double)); ret->lambdas[0] = 1.0; ret->new_lambdas[0] = 1.0; for (i = 1; i < ret->depth + 2; i++) { ret->lambdas[i] = 0; ret->new_lambdas[i] = 0; } /* Initialize the CLASSES distribution later, only if requested. */ if (ret->classes_capacity == 0) { ret->classes = NULL; ret->new_classes = NULL; } else { ret->classes = bow_malloc (ret->classes_capacity * sizeof (double)); ret->new_classes = bow_malloc (ret->classes_capacity * sizeof (double)); for (i = 0; i < ret->classes_capacity; i++) { ret->classes[i] = 1.0 / ret->classes_capacity; ret->new_classes[i] = 0.0; } } /* Initialize the DI_WI_NEW_WORDS later, only if requested. */ ret->di_loo = NULL; ret->di_wvi_loo = NULL; ret->new_di_loo = NULL; ret->new_di_wvi_loo = NULL; return ret;}/* Free the memory allocate by TN and its children */voidbow_treenode_free (treenode *tn){ int ci; assert (tn == NULL && tn->ci_in_parent == -1); for (ci = 0; ci < tn->children_count; ci++) bow_treenode_free (tn->children[ci]); if (tn->children) bow_free (tn->children); if (tn->words) bow_free (tn->words); if (tn->new_words) bow_free (tn->new_words); if (tn->lambdas) bow_free (tn->lambdas); if (tn->new_lambdas) bow_free (tn->new_lambdas); if (tn->name) bow_free ((char*)tn->name); bow_free (tn);}/* Create and return a new treenode with the proper settings to be the root treenode */treenode *bow_treenode_new_root (int children_count){ return bow_treenode_new (NULL, children_count, NULL);}/* Reallocate memory for the WORDS and NEW_WORDS arrays, big enough for the vocabulary of size bow_num_words(). This is useful when the tree has been created before all the documents have been indexed. */voidbow_treenode_realloc_words_all (treenode *root){ int i; if (bow_num_words () > root->words_capacity) { root->words_capacity = bow_num_words (); root->words = bow_realloc (root->words, root->words_capacity * sizeof (double)); root->new_words = bow_realloc (root->words, root->words_capacity * sizeof (double)); root->new_words_normalizer = 0; for (i = 0; i < root->words_capacity; i++) { root->words[i] = 0; root->new_words[i] = 0; } } for (i = 0; i < root->children_count; i++) bow_treenode_realloc_words_all (root->children[i]);}/* Add to parent a CHILD that was previously created with a NULL parent. */voidbow_treenode_add_child (treenode *parent, treenode *child){ assert (parent->children_count < parent->children_capacity); assert (child->ci_in_parent == -1); child->parent = parent; child->ci_in_parent = parent->children_count; parent->children[parent->children_count++] = child;}/* Detach CHILD from PARENT, shifting remaining children, and updating the remaining children's CI_IN_PARENT. */voidbow_treenode_remove_child (treenode *parent, treenode *child){ bow_error ("%s: Not yet implemented", __PRETTY_FUNCTION__);}/* To this node and all its children, add a child named "Misc" if not there already. */voidbow_treenode_add_misc_child_all (treenode *root){ int ci; /* If ROOT is a leaf, just return immediately */ if (root->children_count == 0) return; /* Search for a pre-existing "Misc" child */ for (ci = 0; ci < root->children_count; ci++) if (strstr (root->children[ci]->name, "/Misc/")) goto do_children; /* Add a "Misc" child */ bow_treenode_new (root, 0, "Misc"); /* Recursively handle children */ do_children: for (ci = 0; ci < root->children_count; ci++) bow_treenode_add_misc_child_all (root->children[ci]);}/* Return the "next" treenode in a traversal of the tree. CONTEXT should be initialized to the root of the tree, otherwise strange results will ensue: nodes on the path from the initial CONTEXT node to the root will be skipped by the iteration. When the traversal is finished, NULL will be returned. */treenode *bow_treenode_iterate_all (treenode **context){ treenode *ret; if (*context == NULL) return NULL; /* Save the context as this call's return value */ ret = *context; /* Update the context for the next call. */ if ((*context)->children_count) *context = (*context)->children[0]; else { while ((*context)->parent && ((*context)->ci_in_parent == (*context)->parent->children_count-1)) { *context = (*context)->parent; } if ((*context)->parent) *context = (*context)->parent->children[(*context)->ci_in_parent+1]; else *context = NULL; } return ret;}/* Same as above, but only return the leaf nodes. */treenode *bow_treenode_iterate_leaves (treenode **context){ treenode *ret; while ((ret = bow_treenode_iterate_all (context)) && ret->children_count != 0) ; return ret;}treenode *bow_treenode_iterate_all_under_node (treenode **context, treenode *node){ treenode *ret; if (*context == NULL) return NULL; /* Save the context as this call's return value */ ret = *context; /* Update the context for the next call. */ if ((*context)->children_count) *context = (*context)->children[0]; else { while ((*context)->parent != node && ((*context)->ci_in_parent == (*context)->parent->children_count-1)) { *context = (*context)->parent; } if ((*context)->parent != node) *context = (*context)->parent->children[(*context)->ci_in_parent+1]; else *context = NULL; } return ret;}/* Same as above, but only return the leaf nodes. */treenode *bow_treenode_iterate_leaves_under_node (treenode **context, treenode *node){ treenode *ret; while ((ret = bow_treenode_iterate_all_under_node (context, node)) && ret->children_count != 0) ; return ret;}/* Return the deepest descendant with a ->NAME that is contained in NAME */treenode *bow_treenode_descendant_matching_name (treenode *root, const char *name){ int ci; treenode *tr;#if WHIZBANG char buf[256]; strcpy (buf, "./data"); strcat (buf, root->name); if (strstr (name, buf) != name)#else if (!strstr (name, root->name)) return NULL;#endif for (ci = 0; ci < root->children_count; ci++) { if ((tr = bow_treenode_descendant_matching_name (root->children[ci], name))) return tr; }#if WHIZBANG assert (root->depth == 2);#endif return root;}/* Archiving */#define BOW_TREENODE_HEADER_STRING "treenode\n"/* Write a treenode (and all its children) to FP. */voidbow_treenode_write (treenode *tn, FILE *fp){ int i; /* Write a tag that will later help verify we are reading correctly. */ bow_fwrite_string (BOW_TREENODE_HEADER_STRING, fp); /* If TN is NULL, write a 0 and return */ if (tn) bow_fwrite_int (1, fp); else { bow_fwrite_int (0, fp); return; } /* Write the name */ bow_fwrite_string (tn->name, fp); /* Write the multinomial */ bow_fwrite_int (tn->words_capacity, fp); bow_fwrite_double (tn->new_words_normalizer, fp); for (i = 0; i < tn->words_capacity; i++) bow_fwrite_double (tn->words[i], fp); /* Write the prior */ bow_fwrite_double (tn->prior, fp); bow_fwrite_double (tn->new_prior, fp); /* Write the lambda mixture weights */ bow_fwrite_int (tn->depth, fp); for (i = 0; i < tn->depth + 2; i++) { bow_fwrite_double (tn->lambdas[i], fp); bow_fwrite_double (tn->new_lambdas[i], fp); } /* Write the class distribution */ bow_fwrite_int (tn->classes_capacity, fp); if (tn->classes_capacity) { for (i = 0; i < tn->classes_capacity; i++) bow_fwrite_double (tn->classes[i], fp); } /* Write the children treenodes */ bow_fwrite_int (tn->children_count, fp); bow_fwrite_int (tn->children_capacity, fp); bow_fwrite_int (tn->ci_in_parent, fp); for (i = 0; i < tn->children_count; i++) bow_treenode_write (tn->children[i], fp);}/* Read and return a new treenode (and all its children) from FP. */treenode *bow_treenode_new_from_fp (FILE *fp){ char *header; treenode *tn; int i; /* Verify that we are starting read from the correct place in the FP */ bow_fread_string (&header, fp); if (strcmp (header, BOW_TREENODE_HEADER_STRING) != 0) bow_error ("Trying to read a treenode from bad FILE* location"); bow_free (header); /* If a NULL treenode was written, return NULL. */ bow_fread_int (&i, fp); if (i == 0) return NULL; tn = bow_malloc (sizeof (treenode)); tn->parent = NULL; /* Read the name */ bow_fread_string ((char**)&(tn->name), fp); /* Read the multinomial */ bow_fread_int (&(tn->words_capacity), fp); tn->words = bow_malloc (tn->words_capacity * sizeof (double)); tn->new_words = bow_malloc (tn->words_capacity * sizeof (double)); bow_fread_double (&(tn->new_words_normalizer), fp); for (i = 0; i < tn->words_capacity; i++) { bow_fread_double (&(tn->words[i]), fp);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -