📄 s3_align.c
字号:
/* ==================================================================== * Copyright (c) 1995-2004 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * align.c -- Viterbi time align transcripts to speech. * * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1996 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** * * HISTORY * * 13-Sep-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed align_sen_active to flag active senones instead of building a list * of them. * * 24-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Stripped alternative pronunciation spec in input transcript to obtain * base word. * * 15-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Created. *//** \file s3_align.c \brief Engine for Sphinx 3 aligner */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <s3types.h>#include "mdef.h"#include "tmat.h"#include "dict.h"#include "logs3.h"#include "s3_align.h"/** * SOME ASSUMPTIONS * - All phones (ciphones and triphones) have same HMM topology with n_state states. * - Initial state = state 0; final state = state n_state-1. * - Final state is a non-emitting state with no arcs out of it. * - Some form of Bakis topology (ie, no cycles, except for self-transitions). *//** * Phone-level sentence HMM structures: * pnode_t: nodes of phones forming sentence HMM. * plink_t: a link between two pnode_t nodes. * A phone node may have multiple successors and/or predecessors because of multiple * alternative pronunciations for a word, as well as the presence of OPTIONAL filler * words. * * Assumptions: * - No cycles in phone level sentence HMM. */typedef struct pnode_s { s3wid_t wid; /** Parent word id */ s3cipid_t ci; /** CI phone id corresponding to this node */ s3cipid_t lc; /** Left context CI phone */ s3cipid_t rc; /** Right context CI phone */ int8 pos; /** Phone position within word for this node */ s3pid_t pid; /** Triphone id for this node */ int32 id; /** Unique id for identifying node, debugging */ struct plink_s *succlist; /* Links to successor nodes */ struct plink_s *predlist; /* Links to predecessor nodes */ struct pnode_s *next; /* For building various lists of nodes */ struct pnode_s *alloc_next; /* Linear list of all allocated structures */ struct snode_s *startstate; /* Start state of underlying HMM */} pnode_t;/** * A may have links (transitions) to several successor or predecessor nodes. * They are captured by a list of the following plink_t type. */typedef struct plink_s { pnode_t *node; /** Target node for this link for a given parent node */ struct plink_s *next; /** Next link for same parent node */} plink_t;/** * Viterbi search history for each state at each time. */typedef struct history_s { int32 score; struct snode_s *snode; /** State for which this history node created */ struct history_s *pred; /** Previous frame history */ struct history_s *alloc_next; /** Linear list of all allocated history nodes */} history_t;static history_t *hist_head; /** Head of list of all history nodes *//** * State DAG structures similar to phone DAG structures. */typedef struct snode_s { pnode_t *pnode; /** Parent phone node */ struct slink_s *succlist; /** List of successor states */ struct slink_s *predlist; /** List of predecessor states */ int32 score, newscore; /** Score at start, end of each frame */ history_t *hist, *newhist; /** Path history at start, end of each frame */ int32 active_frm; /** Frame no. most recently active */ s3senid_t sen; /** Senone id, BAD_S3SENID if dummy node (head/tail) */ int8 state; /** Local state no. (within parent HMM) */} snode_t;typedef struct slink_s { snode_t *node; struct slink_s *next; int32 prob;} slink_t;static pnode_t phead, ptail; /** Dummies at the beginning and end of the sent hmm */static pnode_t *pnode_list; /** List of all dynamically allocated pnodes */static int32 n_pnode; /** #pnodes allocated (used to ID each pnode) */static snode_t shead, stail; /** State-level DAG head and tail */static dict_t *dict; /** The dictionary */static mdef_t *mdef; /** Model definition */static tmat_t *tmat; /** Transition probability matrices */static s3wid_t silwid, startwid, finishwid; /** Base wids */static s3wid_t *fillwid; /** BAD_S3WID terminated array of optional filler basewid */static snode_t **cur_active; /** NULL-terminated active state list for current frame */static snode_t **next_active; /** Similar list for next frame */static int32 active_list_size = 0;#define ACTIVE_LIST_SIZE_INCR 16380static int32 n_active;static int32 curfrm; /** Current frame */static int32 beam; /** Pruning beamwidth */static int32 *score_scale; /** Score by which state scores scaled in each frame *//** Lists of state, phone and word-level alignments for most recent utterance */static align_stseg_t *align_stseg;static align_phseg_t *align_phseg;static align_wdseg_t *align_wdseg;/** Free all allocated pnodes */static void pnodes_free ( void ){ pnode_t *p; while (pnode_list) { p = pnode_list->alloc_next; listelem_free ((char *) pnode_list, sizeof(pnode_t)); pnode_list = p; }}/** Free the specified set of plinks */static void plinks_free (plink_t *l){ plink_t *tmp; while (l) { tmp = l->next; listelem_free ((char *) l, sizeof(plink_t)); l = tmp; }}/** * Append a pnode to a list of pnodes (maintained in a list of plinks). */static plink_t *append_pnode (plink_t *list, pnode_t *node){ plink_t *l; l = (plink_t *) listelem_alloc (sizeof(plink_t)); l->node = node; l->next = list; return l;}/** * Allocate a pnode with the given attributes and automatically link it to the global * list. Return the allocated node pointer. */static pnode_t *alloc_pnode (s3wid_t w, int32 pos, s3cipid_t ci, s3cipid_t lc, s3cipid_t rc, word_posn_t wpos){ pnode_t *p; p = (pnode_t *) listelem_alloc (sizeof(pnode_t)); p->wid = w; p->ci = ci; p->lc = lc; p->rc = rc; p->pos = pos; p->pid = mdef_phone_id_nearest (mdef, ci, lc, rc, wpos); p->succlist = NULL; p->predlist = NULL; p->next = NULL; p->id = n_pnode++; p->startstate = NULL; p->alloc_next = pnode_list; pnode_list = p; return p;}/** * Link source and destination phone HMM nodes. */static void link_pnodes (pnode_t *src, pnode_t *dst){ src->succlist = append_pnode (src->succlist, dst); dst->predlist = append_pnode (dst->predlist, src);}/** * Create phone-level HMM for a single word and append to partial sentence HMM. * Replicate HMM nodes as needed to account for all possible left and right context * phones. * Return a list of the final HMM nodes for the single word appended. */static pnode_t *append_word (s3wid_t w, pnode_t *prev_end, s3cipid_t *pred_ci, s3cipid_t *succ_ci){ int32 i, M, N, m, n, pronlen, pron; pnode_t *node, *nodelist, *p; for (i = 0; IS_S3CIPID(pred_ci[i]); i++); M = (i > 0) ? i : 1; /* #predecessor CI phones */ for (i = 0; IS_S3CIPID(succ_ci[i]); i++); N = (i > 0) ? i : 1; /* #successor CI phones */ if ((pronlen = dict->word[w].pronlen) == 1) { /* Single phone case; replicated MxN times for all possible contexts */ nodelist = NULL; for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { node = alloc_pnode (w, 0, dict->word[w].ciphone[0], pred_ci[m], succ_ci[n], WORD_POSN_SINGLE); /* Link to all predecessor nodes matching context requirements */ for (p = prev_end; p; p = p->next) { if ((p->ci == node->lc) && ((NOT_S3CIPID(p->rc)) || (p->rc == node->ci))) { link_pnodes (p, node); } } node->next = nodelist; nodelist = node; } } return nodelist; } /* Multi-phone case. First phone, replicated M times */ nodelist = NULL; for (m = 0; m < M; m++) { node = alloc_pnode (w, 0, dict->word[w].ciphone[0], pred_ci[m], dict->word[w].ciphone[1], WORD_POSN_BEGIN); /* Link to predecessor node(s) matching context requirements */ for (p = prev_end; p; p = p->next) { if ((p->ci == node->lc) && ((NOT_S3CIPID(p->rc)) || (p->rc == node->ci))) { link_pnodes (p, node); } } node->next = nodelist; nodelist = node; } /* Intermediate phones */ for (pron = 1; pron < pronlen-1; pron++) { node = alloc_pnode (w, pron, dict->word[w].ciphone[pron], dict->word[w].ciphone[pron-1], dict->word[w].ciphone[pron+1], WORD_POSN_INTERNAL); for (p = nodelist; p; p = p->next) link_pnodes (p, node); nodelist = node; } /* Final phone, replicated N times */ prev_end = nodelist; nodelist = NULL; for (n = 0; n < N; n++) { node = alloc_pnode (w, pron, dict->word[w].ciphone[pron], dict->word[w].ciphone[pron-1], succ_ci[n], WORD_POSN_END); for (p = prev_end; p; p = p->next) link_pnodes (p, node); node->next = nodelist; nodelist = node; } return nodelist;}static void build_pred_ci (pnode_t *nodelist, s3cipid_t *pred_ci){ int32 i, p; pnode_t *node; for (p = 0; p < mdef->n_ciphone; p++) pred_ci[p] = 0; for (node = nodelist; node; node = node->next) if (node->ci != BAD_S3CIPID) pred_ci[(unsigned)node->ci] = 1; i = 0; for (p = 0; p < mdef->n_ciphone; p++) { if (pred_ci[p]) pred_ci[i++] = p; } pred_ci[i] = BAD_S3CIPID;}static void build_succ_ci (s3wid_t w, int32 append_filler, s3cipid_t *succ_ci){ int32 i, p; for (p = 0; p < mdef->n_ciphone; p++) succ_ci[p] = 0; for (; IS_S3WID(w); w = dict->word[w].alt) succ_ci[(unsigned)dict->word[w].ciphone[0]] = 1; if (append_filler) { for (i = 0; IS_S3WID(fillwid[i]); i++) for (w = fillwid[i]; IS_S3WID(w); w = dict->word[w].alt) succ_ci[(unsigned)dict->word[w].ciphone[0]] = 1; } i = 0; for (p = 0; p < mdef->n_ciphone; p++) { if (succ_ci[p]) succ_ci[i++] = p; } succ_ci[i] = BAD_S3CIPID;}/** * Append a new word to partially build phone-level sentence HMM. (Handle alternative * pronunciations.) Link new word to end phones of previous words. * Append optional filler words before w, if indicated. * Also Link prev_end into the global node list. * Return value: list of end phone nodes for w. (NOTE: these are not yet linked into * the global node list.) */static pnode_t *append_transcript_word (s3wid_t w, /** Transcript word to be appended */ pnode_t *prev_end, /** Previous end points to be attached to w */ s3wid_t nextw, /** Next word to follow w (ignoring optional fillers) */ int32 prefix_filler, /** Whether optional filler words to precede w */ int32 append_filler) /** Whether optional filler words to follow w */{ int32 i; pnode_t *new_end, *tmp_end, *node; s3cipid_t pred_ci[256], succ_ci[256]; s3wid_t fw; if (mdef->n_ciphone >= 256) E_FATAL("Increase pred_ci, succ_ci array sizes to > #CIphones (%d)\n", mdef->n_ciphone); assert (prev_end != NULL); /* Add optional silence/filler words before w, if indicated */ if (prefix_filler) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -