kbest.c

来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 605 行 · 第 1/2 页

C
605
字号
/*********************************************************************************       This file is part of the General Hidden Markov Model Library,*       GHMM version 0.8_beta1, see http://ghmm.org**       Filename: ghmm/ghmm/kbest.c*       Authors:  Anyess von Bock, Alexander Riemer, Janne Grunau**       Copyright (C) 1998-2004 Alexander Schliep*       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln*       Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,*                               Berlin**       Contact: schliep@ghmm.org**       This library is free software; you can redistribute it and/or*       modify it under the terms of the GNU Library General Public*       License as published by the Free Software Foundation; either*       version 2 of the License, or (at your option) any later version.**       This library is distributed in the hope that it will be useful,*       but WITHOUT ANY WARRANTY; without even the implied warranty of*       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*       Library General Public License for more details.**       You should have received a copy of the GNU Library General Public*       License along with this library; if not, write to the Free*       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA***       This file is version $Revision: 1713 $*                       from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $*             last change by $Author: grunau $.********************************************************************************/#ifdef HAVE_CONFIG_H#  include "../config.h"#endif#include <stdlib.h>#include <math.h>#include "mes.h"#include "mprintf.h"#include "model.h"#include "kbest.h"#include "ghmm_internals.h"/* threshold probability (logarithmized) */#define KBEST_THRESHOLD -3.50655789732  /* log(0.03) => threshold: 3% of most probable partial hypothesis */#define KBEST_EPS 1E-15/*============================================================================*//** Data type for linked list of hypotheses    Stores the actual label, a link to the parent hypothesis, a counter of the    links to this hypothesis and the gamma values */struct hypo_List {  int hyp_c;  int refcount;  int chosen;  int gamma_states;  double *gamma_a;  int *gamma_id;  struct hypo_List *next;  struct hypo_List *parent;};typedef struct hypo_List hypoList;/*============================================================================*//* inserts new hypothesis into list at position indicated by pointer plist */static void ighmm_hlist_insert (hypoList ** plist, int newhyp,                                hypoList * parlist);/*============================================================================*//* removes hypothesis at position indicated by pointer plist from the list */static void ighmm_hlist_remove (hypoList ** plist);/*============================================================================*//**   Propagates list of hypotheses forward by extending each old hypothesis to   the possible new hypotheses depending on the states in which the old   hypothesis could end and the reachable labels   @return number of old hypotheses   @param mo:         pointer to the ghmm_dmodel   @param h:          pointer to list of hypotheses   @param hplus:      address of a pointer to store the propagated hypotheses   @param labels:     number of labels   @param nr_s:       number states which have assigned the index aa label   @param max_out:    maximum number of out_states over all states with the index aa label */static int ighmm_hlist_prop_forward (ghmm_dmodel * mo, hypoList * h, hypoList ** hplus, int labels,                     int *nr_s, int *max_out);/*============================================================================*//**   Calculates the logarithm of sum(exp(log_a[j,a_pos])+exp(log_gamma[j,g_pos]))   which corresponds to the logarithm of the sum of a[j,a_pos]*gamma[j,g_pos]   @return log. sum for products of a row from gamma and a row from matrix A   @param log_a:      transition matrix with logarithmic values (1.0 for log(0))   @param s:          ghmm_dstate whose gamma-value is calculated   @param parent:     a pointer to the parent hypothesis*/static double ighmm_log_gamma_sum (double *log_a, ghmm_dstate * s, hypoList * parent);/*============================================================================*//**  Builds logarithmic transition matrix from the states' in_a values  the row for each state is the logarithmic version of the state's in_a  @return transition matrix with logarithmic values, 1.0 if a[i,j] = 0  @param s:           array of all states of the model  @param N:           number of states in the model */static double **kbest_buildLogMatrix (ghmm_dstate * s, int N){#define CUR_PROC "kbest_buildLogMatrix"  int i, j;  double **log_a;               /* log(a(i,j)) => log_a[i*N+j] */  /* create & initialize matrix: */  ARRAY_MALLOC (log_a, N);  for (i = 0; i < N; i++) {    ARRAY_MALLOC (log_a[i], s[i].in_states);    for (j = 0; j < s[i].in_states; j++)      log_a[i][j] = log (s[i].in_a[j]);  }  return log_a;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  GHMM_LOG(LCONVERTED, "kbest_buildLogMatrix failed\n");  exit (1);#undef CUR_PROC}/*============================================================================*//**   Calculates the most probable labeling for the given sequence in the given   model using k-best decoding.   Labels must be from interval [0:max_label] without gaps!!! (not checked)   Model must not have silent states. (checked in Python wrapper)   @return array of labels (internal representation)   @param mo:         pointer to a ghmm_dmodel   @param o_seq:      output sequence (array of internal representation chars)   @param seq_len:    length of output sequence   @param k:          number of hypotheses to keep for each state   @param log_p:      variable reference to store the log prob. of the labeling */int *ghmm_dmodel_label_kbest (ghmm_dmodel * mo, int *o_seq, int seq_len, int k, double *log_p){#define CUR_PROC "ghmm_dl_kbest"  int i, t, c, l, m;            /* counters */  int no_oldHyps;               /* number of hypotheses until position t-1 */  int b_index, i_id;            /* index for addressing states' b arrays */  int no_labels = 0;  int exists, g_nr;  int *states_wlabel;  int *label_max_out;  char *str;  /* logarithmized transition matrix A, log(a(i,j)) => log_a[i*N+j],     1.0 for zero probability */  double **log_a;  /* matrix of hypotheses, holds for every position in the sequence a list     of hypotheses */  hypoList **h;  hypoList *hP;  /* vectors for rows in the matrices */  int *hypothesis;  /* pointer & prob. of the k most probable hypotheses for each state     - matrices of dimensions #states x k:  argm(i,l) => argmaxs[i*k+l] */  double *maxima;  hypoList **argmaxs;  /* pointer to & probability of most probable hypothesis in a certain state */  hypoList *argmax;  double sum;  /* break if sequence empty or k<1: */  if (seq_len <= 0 || k <= 0)    return NULL;  ARRAY_CALLOC (h, seq_len);  /** 1. Initialization (extend empty hypothesis to #labels hypotheses of         length 1): */  /* get number of labels (= maximum label + 1)     and number of states with those labels */  ARRAY_CALLOC (states_wlabel, mo->N);  ARRAY_CALLOC (label_max_out, mo->N);  for (i = 0; i < mo->N; i++) {    c = mo->label[i];    states_wlabel[c]++;    if (c > no_labels)      no_labels = c;    if (mo->s[i].out_states > label_max_out[c])      label_max_out[c] = mo->s[i].out_states;  }  /* add one to the maximum label to get the number of labels */  no_labels++;  ARRAY_REALLOC (states_wlabel, no_labels);  ARRAY_REALLOC (label_max_out, no_labels);  /* initialize h: */  hP = h[0];  for (i = 0; i < mo->N; i++) {    if (mo->s[i].pi > KBEST_EPS) {      /* printf("Found State %d with initial probability %f\n", i, mo->s[i].pi); */      exists = 0;      while (hP != NULL) {        if (hP->hyp_c == mo->label[i]) {          /* add entry to the gamma list */          g_nr = hP->gamma_states;          hP->gamma_id[g_nr] = i;          hP->gamma_a[g_nr] =            log (mo->s[i].pi) +            log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);          hP->gamma_states = g_nr + 1;          exists = 1;          break;        }        else          hP = hP->next;      }      if (!exists) {        ighmm_hlist_insert (&(h[0]), mo->label[i], NULL);        /* initiallize gamma-array with safe size (number of states) and add the first entry */        ARRAY_MALLOC (h[0]->gamma_a, states_wlabel[mo->label[i]]);        ARRAY_MALLOC (h[0]->gamma_id, states_wlabel[mo->label[i]]);        h[0]->gamma_id[0] = i;        h[0]->gamma_a[0] =          log (mo->s[i].pi) +          log (mo->s[i].b[get_emission_index (mo, i, o_seq[0], 0)]);        h[0]->gamma_states = 1;        h[0]->chosen = 1;      }      hP = h[0];    }  }  /* reallocating the gamma list to the real size */  hP = h[0];  while (hP != NULL) {    ARRAY_REALLOC (hP->gamma_a, hP->gamma_states);    ARRAY_REALLOC (hP->gamma_id, hP->gamma_states);    hP = hP->next;  }  /* calculate transition matrix with logarithmic values: */  log_a = kbest_buildLogMatrix (mo->s, mo->N);  /* initialize temporary arrays: */  ARRAY_MALLOC (maxima, mo->N * k);                             /* for each state save k */  ARRAY_MALLOC (argmaxs, mo->N * k);  /*------ Main loop: Cycle through the sequence: ------*/  for (t = 1; t < seq_len; t++) {    /* put o_seq[t-1] in emission history: */    update_emission_history (mo, o_seq[t - 1]);    /** 2. Propagate hypotheses forward and update gamma: */    no_oldHyps =      ighmm_hlist_prop_forward (mo, h[t - 1], &(h[t]), no_labels, states_wlabel,                     label_max_out);    /* printf("t = %d (%d), no of old hypotheses = %d\n", t, seq_len, no_oldHyps); */    /*-- calculate new gamma: --*/    hP = h[t];    /* cycle through list of hypotheses */    while (hP != NULL) {      for (i = 0; i < hP->gamma_states; i++) {        /* if hypothesis hP ends with label of state i:           gamma(i,c):= log(sum(exp(a(j,i)*exp(oldgamma(j,old_c)))))           + log(b[i](o_seq[t]))           else: gamma(i,c):= -INF (represented by 1.0) */        i_id = hP->gamma_id[i];        hP->gamma_a[i] = ighmm_log_gamma_sum (log_a[i_id], &mo->s[i_id], hP->parent);        b_index = get_emission_index (mo, i_id, o_seq[t], t);        if (b_index < 0) {          hP->gamma_a[i] = 1.0;          if (mo->order[i_id] > t)            continue;          else {            str = ighmm_mprintf (NULL, 0,                           "i_id: %d, o_seq[%d]=%d\ninvalid emission index!\n",                           i_id, t, o_seq[t]);            GHMM_LOG(LCONVERTED, str);            m_free (str);          }        }        else          hP->gamma_a[i] += log (mo->s[i_id].b[b_index]);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?