📄 corpus.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* ====================================================================
 * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * corpus.c -- Corpus-file related misc functions.
 *
 * **********************************************
 * CMU ARPA Speech Project
 *
 * Copyright (c) 1996-2004 Carnegie Mellon University.
 * ALL RIGHTS RESERVED.
 * **********************************************
 *
 * HISTORY
 *
 * 09-Dec-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
 * 		Added ctl_process_utt ().
 *
 * 01-Mar-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
 * 		Added check for already existing file extension in ctl_infile().
 *
 * 23-Mar-1998	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
 * 		Added a general purpose data argument to ctl_process() and its function
 * 		argument func.
 *
 * 22-Nov-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
 * 		Added an optional validation function argument and an optional
 *		duplicate-resolution function argument to both corpus_load_headid() and
 * 		corpus_load_tailid().
 *
 * 25-Oct-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
 * 		Started.
 */

#include "corpus.h"
#include "kb.h"


/*
 * Load a corpus file in which every non-blank line starts with its ID.
 *
 * The first whitespace-delimited token of a line is the ID; everything after
 * that token (including the separating whitespace, minus the trailing
 * newline) is stored as the entry string and indexed by the ID.
 *
 * file:        path of the corpus file; failure to open it is fatal.
 * validate:    optional; called with the text following the ID.  A zero
 *              return causes the line to be skipped (with a log message).
 * dup_resolve: optional; called as dup_resolve(existing_entry, new_text) when
 *              an ID is seen again.  Negative => fatal error, positive =>
 *              the new text replaces the existing entry, zero => the
 *              existing entry is kept.  If NULL, any duplicate ID is fatal.
 *
 * Returns the newly allocated corpus.
 */
corpus_t *corpus_load_headid (char *file,
			      int32 (*validate)(char *str),
			      int32 (*dup_resolve)(char *s1, char *s2))
{
    FILE *fp;
    char line[16384];
    /* Same size as line: a token extracted from line by "%s" can never overflow it */
    char wd[sizeof(line)];
    char *id;
    int32 j, k, m, n;
    corpus_t *corp;

    E_INFO("Loading corpus (%s)\n", file);

    if ((fp = fopen(file, "r")) == NULL)
	E_FATAL_SYSTEM("fopen(%s,r) failed\n", file);

    corp = (corpus_t *) ckd_calloc (1, sizeof(corpus_t));

    /* First pass: count non-blank lines to size the hash table and string array */
    n = 0;
    while (fgets (line, sizeof(line), fp) != NULL) {
	/* Skip empty lines */
	if (sscanf (line, "%s", wd) == 1)
	    n++;
    }
    rewind (fp);

    corp->ht = hash_new (n, HASH_CASE_YES);
    corp->n = 0;
    corp->str = (char **) ckd_calloc (n, sizeof(char *));

    /* Second pass: n now counts the entries actually stored */
    n = 0;
    while (fgets (line, sizeof(line), fp) != NULL) {
	/* Skip blank lines; on success k is the offset just past the ID token */
	if (sscanf (line, "%s%n", wd, &k) != 1)
	    continue;

	/* Eliminate the line-terminating newline */
	j = strlen(line);
	if ((j > 0) && (line[j-1] == '\n'))
	    line[j-1] = '\0';

	/* Validate if a validation function is given */
	if (validate && (! (*validate)(line+k))) {
	    E_INFO("Corpus validation %s failed; skipping\n", wd);
	    continue;
	}

	id = ckd_salloc (wd);
	/*
	 * A return value other than n is treated as "ID already present", and
	 * that value m is used as the index of the existing entry.
	 */
	if ((m = hash_enter (corp->ht, id, n)) != n) {
	    /* Duplicate entry */
	    if (! dup_resolve)
		E_FATAL("corpus_load_headid(%s) failed; duplicate ID: %s\n", file, id);
	    else {
		/* Invoke the application provided duplicate resolver function */
		if ((j = (*dup_resolve)(corp->str[m], line+k)) < 0)
		    E_FATAL("corpus_load_headid(%s) failed; duplicate ID: %s\n", file, id);

		/* The copy of the ID made for this line is not needed for a duplicate */
		ckd_free (id);
		if (j > 0) {
		    /* Overwrite the original with the new entry */
		    ckd_free (corp->str[m]);
		    corp->str[m] = ckd_salloc (line+k);
		} else {
		    /* Retain the original entry, discard the new one */
		}
	    }
	} else {
	    /* Fill in new entry */
	    corp->str[n] = ckd_salloc (line+k);
	    n++;
	}
    }
    corp->n = n;

    fclose (fp);

    E_INFO("%s: %d entries\n", file, n);

    return corp;
}


/*
 * Split a trailing "(uttid)" off the end of line.
 *
 * Trailing newline/space/tab characters are ignored when locating the closing
 * parenthesis.  The uttid must be non-empty and must not contain spaces or
 * tabs.  On success the uttid (without parentheses) is copied into uttid and
 * line is truncated in place at the opening parenthesis.
 *
 * NOTE: the copy is an unchecked strcpy; the caller must supply a uttid
 * buffer large enough for any uttid that can occur in line (a buffer as
 * large as line itself is always sufficient).
 *
 * Returns 0 on success, -1 if no well-formed "(uttid)" ends the line (in
 * which case line is left unmodified and uttid is set to the empty string).
 */
static int32 sep_tailid (char *line, char *uttid)
{
    int32 i, k, l;

    l = strlen(line);
    uttid[0] = '\0';

    /* Find last close-paren */
    for (i = l-1;
	 (i >= 0) && ((line[i] == '\n') || (line[i] == ' ') || (line[i] == '\t'));
	 --i);
    if ((i < 0) || (line[i] != ')'))		/* Missing uttid */
	return -1;
    k = i;

    /* Find closest open-paren; no spaces allowed in uttid */
    for (--i; (i >= 0) && (line[i] != ' ') && (line[i] != '\t') && (line[i] != '('); --i);
    if ((i < 0) || (k-i < 2) || (line[i] != '('))	/* Empty or missing uttid */
	return -1;

    /* Remove parentheses and copy uttid */
    line[k] = '\0';
    strcpy (uttid, line+i+1);

    /* Strip uttid from line */
    line[i] = '\0';

    return 0;
}


/*
 * Load a corpus file in which every non-blank line ends with "(ID)".
 *
 * The parenthesized ID is split off the end of each line with sep_tailid();
 * the text preceding the opening parenthesis is stored as the entry string
 * and indexed by the ID.  A non-blank line without a well-formed trailing
 * "(ID)" is a fatal error.
 *
 * file:        path of the corpus file; failure to open it is fatal.
 * validate:    optional; called with the line text after the ID has been
 *              stripped.  A zero return causes the line to be skipped.
 * dup_resolve: optional; called as dup_resolve(existing_entry, new_text) when
 *              an ID is seen again.  Negative => fatal error, positive =>
 *              the new text replaces the existing entry, zero => the
 *              existing entry is kept.  If NULL, any duplicate ID is fatal.
 *
 * Returns the newly allocated corpus.
 */
corpus_t *corpus_load_tailid (char *file,
			      int32 (*validate)(char *str),
			      int32 (*dup_resolve)(char *s1, char *s2))
{
    FILE *fp;
    char line[16384];
    /*
     * Same size as line: neither the "%s" scans below nor the strcpy inside
     * sep_tailid() can then write past the end of uttid.
     */
    char uttid[sizeof(line)];
    char *id;
    int32 j, m, n;
    corpus_t *corp;

    E_INFO("Loading corpus (%s)\n", file);

    if ((fp = fopen(file, "r")) == NULL)
	E_FATAL_SYSTEM("fopen(%s,r) failed\n", file);

    corp = (corpus_t *) ckd_calloc (1, sizeof(corpus_t));

    /* First pass: count non-blank lines to size the hash table and string array */
    n = 0;
    while (fgets (line, sizeof(line), fp) != NULL) {
	/* Skip empty lines */
	if (sscanf (line, "%s", uttid) == 1)
	    n++;
    }
    rewind (fp);

    corp->ht = hash_new (n, 0 /* Not no-case */);
    corp->n = 0;
    corp->str = (char **) ckd_calloc (n, sizeof(char *));

    /* Second pass: n now counts the entries actually stored */
    n = 0;
    while (fgets (line, sizeof(line), fp) != NULL) {
	/* Skip blank lines */
	if (sscanf (line, "%s", uttid) < 1)
	    continue;

	/* Look for a (uttid) at the end */
	if (sep_tailid (line, uttid) < 0)
	    E_FATAL("corpus_load_tailid(%s) failed; bad line: %s\n", file, line);

	/* Validate if a validation function is given */
	if (validate && (! (*validate)(line))) {
	    E_INFO("Corpus validation %s failed; skipping\n", uttid);
	    continue;
	}

	id = ckd_salloc (uttid);
	/*
	 * A return value other than n is treated as "ID already present", and
	 * that value m is used as the index of the existing entry.
	 */
	if ((m = hash_enter (corp->ht, id, n)) != n) {
	    /* Duplicate entry */
	    if (! dup_resolve)
		E_FATAL("corpus_load_tailid(%s) failed; duplicate ID: %s\n", file, id);
	    else {
		/* Invoke the application provided duplicate resolver function */
		if ((j = (*dup_resolve)(corp->str[m], line)) < 0)
		    E_FATAL("corpus_load_tailid(%s) failed; duplicate ID: %s\n", file, id);

		/* The copy of the ID made for this line is not needed for a duplicate */
		ckd_free (id);
		if (j > 0) {
		    /* Overwrite the original with the new entry */
		    ckd_free (corp->str[m]);
		    corp->str[m] = ckd_salloc (line);
		} else {
		    /* Retain the original entry, discard the new one */
		}
	    }
	} else {
	    /* Fill in new entry */
	    corp->str[n] = ckd_salloc (line);
	    n++;
	}
    }
    corp->n = n;

    fclose (fp);

    E_INFO("%s: %d entries\n", file, n);

    return corp;
}


/*
 * Look up the entry string stored for id in corp.
 *
 * Returns the stored string (still owned by the corpus), or NULL if the hash
 * lookup for id fails.
 */
char *corpus_lookup (corpus_t *corp, char *id)
{
    int32 n;

    if (hash_lookup (corp->ht, id, &n) < 0)
	return NULL;

    assert ((n >= 0) && (n < corp->n));
    return (corp->str[n]);
}


#if _CORPUS_TEST_
/*
 * Interactive test driver: loads a head-ID corpus and a tail-ID corpus, then
 * repeatedly reads an ID from stdin and prints what each corpus holds for it.
 * Terminates when no further ID can be read (e.g. end of input).
 */
int main (int argc, char *argv[])
{
    corpus_t *ch, *ct;
    char id[4096], *str;

    if (argc != 3)
	E_FATAL("Usage: %s headid-corpusfile tailid-corpusfile\n", argv[0]);

    ch = corpus_load_headid (argv[1], NULL, NULL);
    ct = corpus_load_tailid (argv[2], NULL, NULL);

    for (;;) {
	printf ("> ");
	/* Field width keeps the token within id[]; stop instead of spinning on EOF */
	if (scanf ("%4095s", id) != 1)
	    break;

	str = corpus_lookup (ch, id);
	if (str == NULL)
	    printf ("%s Not found in 1\n", id);
	else
	    printf ("%s(1): %s\n", id, str);

	str = corpus_lookup (ct, id);
	if (str == NULL)
	    printf ("%s Not found in 2\n", id);
	else
	    printf ("%s(2): %s\n", id, str);
    }

    return 0;
}
#endif


int32 ctl_read_entry (FILE *fp, char *uttfile, int32 *sf, int32 *ef, char *uttid)
{
    char line[16384];
    char base[16384];
    int32 k;

    do {
	if (fgets (line, sizeof(line), fp) == NULL)
	    return -1;
	if (line[0] == '#')
	    k = 0;
	else
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -