📄 lm_test.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
字号:
/* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * A test program for the large language model module. * You should give this program an ARGS file, and a text file containing * a list of N-grams. It will output to stdout the language model * scores for each N-gram. * */#include <stdio.h>#include <string.h>#include <lm.h>#include <logs3.h>#include <s3types.h>#include "cmd_ln_args.h"#include "metrics.h"#define MAX_NGRAMS 5100#define MAX_STRLEN 100int read_ngrams(char *ngrams_file, char **ngrams,                 s3lmwid_t *wid[], int32 nwords[], int max_lines, lm_t *lm);int ngram2wid(char *word, int length, s3lmwid_t *w, lm_t *lm);int score_ngram(s3lmwid_t *wid, int nwd, lm_t *lm);int main(int argc, char *argv[]){    char *lm_file;    char *args_file;    char *ngrams_file;    char *lmLoadTimer = "LM Load";    char *lmLookupTimer = "LM Lookup";    char *ngrams[MAX_NGRAMS];    float64 lw, wip, uw, logbase;    int i, n;        int32 nwords[MAX_NGRAMS];    int scores[MAX_NGRAMS];    lm_t *lm;    s3lmwid_t *wid[MAX_NGRAMS];    if (argc < 3) {        E_FATAL("USAGE: %s <lm_file> <args_file> <ngrams_file>\n", argv[0]);    }    args_file = argv[1];    lm_file = argv[2];    ngrams_file = argv[3];    parse_args_file(args_file);    lw = cmd_ln_float32("-lw");    wip = cmd_ln_float32("-wip");    uw = cmd_ln_float32("-uw");    logbase = cmd_ln_float32("-logbase");    logs3_init(logbase);    metricsStart(lmLoadTimer);        /* initialize the language model */    lm = lm_read(lm_file, lw, wip, uw);    metricsStop(lmLoadTimer);    /* read in all the N-grams */    n = read_ngrams(ngrams_file, ngrams, wid, nwords, MAX_NGRAMS, lm);    metricsStart(lmLookupTimer);    /* scores the N-grams */    for (i = 0; i < n; i++) {        scores[i] = score_ngram(wid[i], nwords[i], lm);    }    metricsStop(lmLookupTimer);    for (i = 0; i < n; i++) {        printf("%-10d %s\n", scores[i], ngrams[i]);    }    printf("Bigram misses: %d \n", lm->n_bg_bo);    printf("Trigram misses: %d \n", lm->n_tg_bo);    fflush(stdout);    metricsPrint();    return 0;}/** * Reads all the N-grams in the given N-gram file into the array of strings. * * args: * ngrams_file - the N-gram file to read N-grams from * ngrams - the array of string to read N-grams into * * returns: the number of ngrams read */int read_ngrams(char *ngrams_file,                 char **ngrams,                 s3lmwid_t *wid[],                 int32 nwords[],                int max_lines,                 lm_t *lm){    FILE *fp;    char line_read[MAX_STRLEN];    int n, length;        if ((fp = fopen(ngrams_file, "r")) == NULL) {        E_FATAL("Unable to open N-gram file %s\n", ngrams_file);    }    n = 0;    /* read each line in the file into the ngrams array */    while (fgets(line_read, MAX_STRLEN, fp) != NULL) {        if (n < max_lines) {            length = strlen(line_read);            line_read[length-1] = '\0';            ngrams[n] = (char *) ckd_calloc(length, sizeof(char));            strncpy(ngrams[n], line_read, length-1);            wid[n] = (s3lmwid_t *) ckd_calloc(3, sizeof(s3lmwid_t));            nwords[n] = ngram2wid(line_read, length, wid[n], lm);            n++;        } else {            break;        }    }    return n;}/** * Map the given ngram string to an array of word IDs of the individual * words in the ngram. * * args: * ngram - the ngram string to map * length - the length of the ngram string * w - the word ID array * lm - the language model to use * * returns: * the number of words in the ngram string, or 0 if the string contains an * unknown word */int ngram2wid(char *ngram, int length, s3lmwid_t *w, lm_t *lm){    char *word[1024];    int nwd;    int i;        if ((nwd = str2words(ngram, word, length)) < 0)	E_FATAL("Increase word[] and w[] arrays size\n");        for (i = 0; i < nwd; i++) {	w[i] = lm_wid (lm, word[i]);	if (NOT_S3LMWID(w[i])) {	    E_ERROR("Unknown word: %s\n", word[i]);	    return 0;	}    }    return nwd;}/** * Scores the given N-gram using the given language model. * * args: * wid - the IDs of the sequence of words in the n-gram * nwd - the number of words in the n-gram * lm - the language model to use * * return: the language model score of the given sequence of words */int score_ngram(s3lmwid_t *wid, int nwd, lm_t *lm){    int32 score;        score = 0;    if (nwd == 3) {      /* The last argument is a hack: the information there - the dict       * ID - is never used if LM classes are not used, and classes       * are not used in this code. Therefore, the last argument here       * is a nop.       */      score = lm_tg_score(lm, wid[0], wid[1], wid[2], 0);    } else if (nwd == 2) {      /* Ditto.       */      score = lm_bg_score(lm, wid[0], wid[1], 0);    } else if (nwd == 1) {      /* Ditto.       */      score = lm_ug_score(lm, wid[0], 0);    } else {        printf("%d grams not supported\n", nwd);    }        return score;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -