📄 s3_align.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* ==================================================================== * Copyright (c) 1995-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * align.c -- Viterbi time align transcripts to speech. *  * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1996 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** *  * HISTORY *  * 13-Sep-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Changed align_sen_active to flag active senones instead of building a list * 		of them. *  * 24-Jul-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Stripped alternative pronunciation spec in input transcript to obtain * 		base word. *  * 15-Jul-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Created. *//** \file s3_align.c    \brief Engine for Sphinx 3 aligner */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <s3types.h>#include "mdef.h"#include "tmat.h"#include "dict.h"#include "logs3.h"#include "s3_align.h"/** * SOME ASSUMPTIONS *   - All phones (ciphones and triphones) have same HMM topology with n_state states. *   - Initial state = state 0; final state = state n_state-1. *   - Final state is a non-emitting state with no arcs out of it. *   - Some form of Bakis topology (ie, no cycles, except for self-transitions). *//** * Phone-level sentence HMM structures: *     pnode_t: nodes of phones forming sentence HMM. *     plink_t: a link between two pnode_t nodes. * A phone node may have multiple successors and/or predecessors because of multiple * alternative pronunciations for a word, as well as the presence of OPTIONAL filler * words. *  * Assumptions: *   - No cycles in phone level sentence HMM. */typedef struct pnode_s {    s3wid_t wid;	/** Parent word id */    s3cipid_t ci;	/** CI phone id corresponding to this node */    s3cipid_t lc;	/** Left context CI phone */    s3cipid_t rc;	/** Right context CI phone */    int8 pos;		/** Phone position within word for this node */    s3pid_t pid;	/** Triphone id for this node */    int32 id;		/** Unique id for identifying node, debugging */    struct plink_s *succlist;	/* Links to successor nodes */    struct plink_s *predlist;	/* Links to predecessor nodes */    struct pnode_s *next;	/* For building various lists of nodes */    struct pnode_s *alloc_next;	/* Linear list of all allocated structures */    struct snode_s *startstate;	/* Start state of underlying HMM */} pnode_t;/** * A may have links (transitions) to several successor or predecessor nodes. * They are captured by a list of the following plink_t type. */typedef struct plink_s {    pnode_t *node;		/** Target node for this link for a given parent node */    struct plink_s *next;	/** Next link for same parent node */} plink_t;/** * Viterbi search history for each state at each time. */typedef struct history_s {    int32 score;    struct snode_s *snode;		/** State for which this history node created */    struct history_s *pred;		/** Previous frame history */    struct history_s *alloc_next;	/** Linear list of all allocated history nodes */} history_t;static history_t *hist_head;		/** Head of list of all history nodes *//** * State DAG structures similar to phone DAG structures. */typedef struct snode_s {    pnode_t *pnode;		/** Parent phone node */    struct slink_s *succlist;	/** List of successor states */    struct slink_s *predlist;	/** List of predecessor states */    int32 score, newscore;	/** Score at start, end of each frame */    history_t *hist, *newhist;	/** Path history at start, end of each frame */    int32 active_frm;		/** Frame no. most recently active */    s3senid_t sen;		/** Senone id, BAD_S3SENID if dummy node (head/tail) */    int8 state;			/** Local state no. (within parent HMM) */} snode_t;typedef struct slink_s {    snode_t *node;    struct slink_s *next;    int32 prob;} slink_t;static pnode_t phead, ptail;	/** Dummies at the beginning and end of the sent hmm */static pnode_t *pnode_list;	/** List of all dynamically allocated pnodes */static int32 n_pnode;		/** #pnodes allocated (used to ID each pnode) */static snode_t shead, stail;	/** State-level DAG head and tail */static dict_t *dict;		/** The dictionary */static mdef_t *mdef;		/** Model definition */static tmat_t *tmat;		/** Transition probability matrices */static s3wid_t silwid, startwid, finishwid;	/** Base wids */static s3wid_t *fillwid;	/** BAD_S3WID terminated array of optional filler basewid */static snode_t **cur_active;	/** NULL-terminated active state list for current frame */static snode_t **next_active;	/** Similar list for next frame */static int32 active_list_size = 0;#define ACTIVE_LIST_SIZE_INCR	16380static int32 n_active;static int32 curfrm;		/** Current frame */static int32 beam;		/** Pruning beamwidth */static int32 *score_scale;	/** Score by which state scores scaled in each frame *//** Lists of state, phone and word-level alignments for most recent utterance */static align_stseg_t *align_stseg;static align_phseg_t *align_phseg;static align_wdseg_t *align_wdseg;/** Free all allocated pnodes */static void pnodes_free ( void ){    pnode_t *p;        while (pnode_list) {	p = pnode_list->alloc_next;	listelem_free ((char *) pnode_list, sizeof(pnode_t));	pnode_list = p;    }}/** Free the specified set of plinks */static void plinks_free (plink_t *l){    plink_t *tmp;        while (l) {	tmp = l->next;	listelem_free ((char *) l, sizeof(plink_t));	l = tmp;    }}/** * Append a pnode to a list of pnodes (maintained in a list of plinks). */static plink_t *append_pnode (plink_t *list, pnode_t *node){    plink_t *l;        l = (plink_t *) listelem_alloc (sizeof(plink_t));    l->node = node;    l->next = list;    return l;}/** * Allocate a pnode with the given attributes and automatically link it to the global * list.  Return the allocated node pointer. */static pnode_t *alloc_pnode (s3wid_t w, int32 pos,			     s3cipid_t ci, s3cipid_t lc, s3cipid_t rc,			     word_posn_t wpos){    pnode_t *p;    p = (pnode_t *) listelem_alloc (sizeof(pnode_t));    p->wid = w;    p->ci = ci;    p->lc = lc;    p->rc = rc;    p->pos = pos;    p->pid = mdef_phone_id_nearest (mdef, ci, lc, rc, wpos);    p->succlist = NULL;    p->predlist = NULL;    p->next = NULL;        p->id = n_pnode++;        p->startstate = NULL;        p->alloc_next = pnode_list;    pnode_list = p;        return p;}/** * Link source and destination phone HMM nodes. */static void link_pnodes (pnode_t *src, pnode_t *dst){    src->succlist = append_pnode (src->succlist, dst);    dst->predlist = append_pnode (dst->predlist, src);}/** * Create phone-level HMM for a single word and append to partial sentence HMM. * Replicate HMM nodes as needed to account for all possible left and right context * phones. * Return a list of the final HMM nodes for the single word appended. */static pnode_t *append_word (s3wid_t w,			     pnode_t *prev_end,			     s3cipid_t *pred_ci,			     s3cipid_t *succ_ci){    int32 i, M, N, m, n, pronlen, pron;    pnode_t *node, *nodelist, *p;        for (i = 0; IS_S3CIPID(pred_ci[i]); i++);    M = (i > 0) ? i : 1;	/* #predecessor CI phones */        for (i = 0; IS_S3CIPID(succ_ci[i]); i++);    N = (i > 0) ? i : 1;	/* #successor CI phones */        if ((pronlen = dict->word[w].pronlen) == 1) {	/* Single phone case; replicated MxN times for all possible contexts */	nodelist = NULL;	for (m = 0; m < M; m++) {	    for (n = 0; n < N; n++) {		node = alloc_pnode (w, 0,				    dict->word[w].ciphone[0], pred_ci[m], succ_ci[n],				    WORD_POSN_SINGLE);		/* Link to all predecessor nodes matching context requirements */		for (p = prev_end; p; p = p->next) {		    if ((p->ci == node->lc) &&			((NOT_S3CIPID(p->rc)) || (p->rc == node->ci))) {			link_pnodes (p, node);		    }		}		node->next = nodelist;		nodelist = node;	    }	}		return nodelist;    }    /* Multi-phone case.  First phone, replicated M times */    nodelist = NULL;    for (m = 0; m < M; m++) {	node = alloc_pnode (w, 0,			    dict->word[w].ciphone[0],			    pred_ci[m],			    dict->word[w].ciphone[1],			    WORD_POSN_BEGIN);	/* Link to predecessor node(s) matching context requirements */	for (p = prev_end; p; p = p->next) {	    if ((p->ci == node->lc) &&		((NOT_S3CIPID(p->rc)) || (p->rc == node->ci))) {		link_pnodes (p, node);	    }	}	node->next = nodelist;	nodelist = node;    }    /* Intermediate phones */    for (pron = 1; pron < pronlen-1; pron++) {	node = alloc_pnode (w, pron,			    dict->word[w].ciphone[pron],			    dict->word[w].ciphone[pron-1],			    dict->word[w].ciphone[pron+1],			    WORD_POSN_INTERNAL);	for (p = nodelist; p; p = p->next)	    link_pnodes (p, node);	nodelist = node;    }    /* Final phone, replicated N times */    prev_end = nodelist;    nodelist = NULL;    for (n = 0; n < N; n++) {	node = alloc_pnode (w, pron,			    dict->word[w].ciphone[pron],			    dict->word[w].ciphone[pron-1],			    succ_ci[n],			    WORD_POSN_END);	for (p = prev_end; p; p = p->next)	    link_pnodes (p, node);	node->next = nodelist;	nodelist = node;    }    return nodelist;}static void build_pred_ci (pnode_t *nodelist, s3cipid_t *pred_ci){    int32 i, p;    pnode_t *node;        for (p = 0; p < mdef->n_ciphone; p++)	pred_ci[p] = 0;    for (node = nodelist; node; node = node->next)	if (node->ci != BAD_S3CIPID)	    pred_ci[(unsigned)node->ci] = 1;    i = 0;    for (p = 0; p < mdef->n_ciphone; p++) {	if (pred_ci[p])	    pred_ci[i++] = p;    }    pred_ci[i] = BAD_S3CIPID;}static void build_succ_ci (s3wid_t w, int32 append_filler, s3cipid_t *succ_ci){    int32 i, p;        for (p = 0; p < mdef->n_ciphone; p++)	succ_ci[p] = 0;    for (; IS_S3WID(w); w = dict->word[w].alt)	succ_ci[(unsigned)dict->word[w].ciphone[0]] = 1;        if (append_filler) {	for (i = 0; IS_S3WID(fillwid[i]); i++)	    for (w = fillwid[i]; IS_S3WID(w); w = dict->word[w].alt)		succ_ci[(unsigned)dict->word[w].ciphone[0]] = 1;    }    i = 0;    for (p = 0; p < mdef->n_ciphone; p++) {	if (succ_ci[p])	    succ_ci[i++] = p;    }    succ_ci[i] = BAD_S3CIPID;}/** * Append a new word to partially build phone-level sentence HMM.  (Handle alternative * pronunciations.)  Link new word to end phones of previous words. * Append optional filler words before w, if indicated. * Also Link prev_end into the global node list. * Return value: list of end phone nodes for w.  (NOTE: these are not yet linked into * the global node list.) */static pnode_t *append_transcript_word    (s3wid_t w,			/** Transcript word to be appended */     pnode_t *prev_end,		/** Previous end points to be attached to w */     s3wid_t nextw,		/** Next word to follow w (ignoring optional fillers) */     int32 prefix_filler,	/** Whether optional filler words to precede w */     int32 append_filler)	/** Whether optional filler words to follow w */{    int32 i;    pnode_t *new_end, *tmp_end, *node;    s3cipid_t pred_ci[256], succ_ci[256];    s3wid_t fw;        if (mdef->n_ciphone >= 256)	E_FATAL("Increase pred_ci, succ_ci array sizes to > #CIphones (%d)\n",		mdef->n_ciphone);    assert (prev_end != NULL);        /* Add optional silence/filler words before w, if indicated */    if (prefix_filler) {
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -