📄 lm_utt_test.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
字号:
/* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * A test program for the large language model module. * You should give this program an ARGS file, and a text file containing * a list of N-grams. It will output to stdout the language model * scores for each N-gram. * */#include <stdio.h>#include <string.h>#include <s3types.h>#include <lm.h>#include <logs3.h>#include "metrics.h"#include "cmd_ln_args.h"#define MAX_NGRAMS 5000#define MAX_STRLEN 100#define MAX_WORDS_PER_NGRAM 3int read_ngrams(FILE* fp, char **ngrams,                 s3lmwid_t wid[][], int32 nwords[], int max_lines, lm_t *lm);int ngram2wid(char *word, int length, s3lmwid_t *w, lm_t *lm);int score_ngram(s3lmwid_t *wid, int nwd, lm_t *lm);int has_more_utterances(FILE* fp);/* FIX ME!, why do we use this function instead of strcmp? */int str_cmp(char *s1, char *s2);int main(int argc, char *argv[]){    char *lm_file;    char *args_file;    char *ngrams_file;    char *lmLoadTimer = "LM Load";    char *lmLookupTimer = "LM Lookup";    char *ngrams[MAX_NGRAMS];    float64 lw, wip, uw, logbase;    int i, n;        int32 nwords[MAX_NGRAMS];    int scores[MAX_NGRAMS];    lm_t *lm;    s3lmwid_t wid[MAX_NGRAMS][MAX_WORDS_PER_NGRAM];    FILE* fp;    if (argc < 3) {        E_FATAL("USAGE: %s <lm_file> <args_file> <ngrams_file>\n", argv[0]);    }    args_file = argv[1];    lm_file = argv[2];    ngrams_file = argv[3];    parse_args_file(args_file);    lw = cmd_ln_float32("-lw");    wip = cmd_ln_float32("-wip");    uw = cmd_ln_float32("-uw");    logbase = cmd_ln_float32("-logbase");    logs3_init(logbase);    metricsStart(lmLoadTimer);        /* initialize the language model */    lm = lm_read(lm_file, lw, wip, uw);    metricsStop(lmLoadTimer);    if ((fp = fopen(ngrams_file, "r")) == NULL) {        E_FATAL("Unable to open N-gram file %s\n", ngrams_file);    }        while (has_more_utterances(fp)) {      /* read in all the N-grams */      n = read_ngrams(fp, ngrams, wid, nwords, MAX_NGRAMS, lm);            metricsStart(lmLookupTimer);      /* scores the N-grams */      for (i = 0; i < n; i++) {        scores[i] = score_ngram(wid[i], nwords[i], lm);        printf("%-10d %s\n", scores[i], ngrams[i]);	/*	printf("%-10d %s %d %d %d\n", scores[i], ngrams[i], 	       wid[i][0], wid[i][1], wid[i][2]);	*/      }      /* reset cache if <END_UTT> was reached */      if (n != MAX_NGRAMS) {	lm_cache_reset(lm);      }      metricsStop(lmLookupTimer);    }    printf("Bigram misses: %d \n", lm->n_bg_bo);    printf("Trigram misses: %d \n", lm->n_tg_bo);    fflush(stdout);    metricsPrint();    return 0;}/** * Returns true if the given file has more utterances in it. * * args: * fp - the file pointer * * returns: 0 if there are more utterances, 1 if there are no more */int has_more_utterances(FILE* fp){    int i;    char line_read[MAX_STRLEN];    if (fgets(line_read, MAX_STRLEN, fp) != NULL) {        if (str_cmp("<START_UTT>\n", line_read) != 0) {            /* if not <START_UTT>, we want to push the line back */            for (i = strlen(line_read) - 1; i >= 0; i--) {                ungetc(line_read[i], fp);            }        }        return 1;    }    return 0;}/** * Reads all the N-grams in the given N-gram file into the array of strings. * * args: * ngrams_file - the N-gram file to read N-grams from * ngrams - the array of string to read N-grams into * * returns: the number of ngrams read */int read_ngrams(FILE *fp,                char **ngrams,                 s3lmwid_t wid[][MAX_WORDS_PER_NGRAM],                 int32 nwords[],                int max_lines,                 lm_t *lm){    char line_read[MAX_STRLEN];    int n, length;        n = 0;    /* read each line in the file into the ngrams array */    while (fgets(line_read, MAX_STRLEN, fp) != NULL) {      if (str_cmp("<END_UTT>\n", line_read) == 0) {	break;      }      if (n < max_lines) {	length = strlen(line_read);	line_read[length-1] = '\0';	ngrams[n] = (char *) ckd_calloc(length, sizeof(char));	strncpy(ngrams[n], line_read, length-1);	nwords[n] = ngram2wid(line_read, length, wid[n], lm);	n++;      } else {	break;      }    }    return n;}/** * Compares the two given strings for equality. Unlike the strcmp() * function in the standard libraries, this method returns immediately * if any characters don't match up. * * args: * s1 - the first string * s2 - the second string * * return: * 0 if the two strings are completely equal * -1 otherwise */int str_cmp(char *s1, char *s2){  while (*s1 != '\0' && *s2 != '\0') {    if (*s1++ != *s2++) {      return -1;    }  }  if (*s1 == '\0' && *s2 == '\0') {    return 0;  } else {    return -1;  }}/** * Map the given ngram string to an array of word IDs of the individual * words in the ngram. * * args: * ngram - the ngram string to map * length - the length of the ngram string * w - the word ID array * lm - the language model to use * * returns: * the number of words in the ngram string, or 0 if the string contains an * unknown word */int ngram2wid(char *ngram, int length, s3lmwid_t w[], lm_t *lm){    char word[1024];    int nwd;    int i;    i = 0, nwd = 0;    while (1) {      if (*ngram == ' ' || *ngram == '\0') {	word[i++] = '\0';	w[nwd] = lm_wid(lm, word);	if (NOT_S3LMWID(w[nwd])) {	  E_ERROR("Unknown word: %s\n", word[nwd]);	  return 0;	}	nwd++;	if (*ngram == '\0') {	  break;	}	i = 0;      } else {	word[i++] = *ngram;      }      ngram++;    }    return nwd;}/** * Scores the given N-gram using the given language model. * * args: * wid - the IDs of the sequence of words in the n-gram * nwd - the number of words in the n-gram * lm - the language model to use * * return: the language model score of the given sequence of words */int score_ngram(s3lmwid_t *wid, int nwd, lm_t *lm){    int32 score;        score = 0;    if (nwd == 3) {      /* The last argument is a hack: the information there - the dict       * ID - is never used if LM classes are not used, and classes       * are not used in this code. Therefore, the last argument here       * is a nop.       */      score = lm_tg_score(lm, wid[0], wid[1], wid[2], 0);    } else if (nwd == 2) {      /* Ditto.       */      score = lm_bg_score(lm, wid[0], wid[1], 0);    } else if (nwd == 1) {      /* Ditto.       */      score = lm_ug_score(lm, wid[0], 0);    } else {        printf("%d grams not supported\n", nwd);    }        return score;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -