lm_3g.c

来自「WinCE平台上的语音识别程序」· C语言代码 · 共 2,105 行 · 第 1/5 页
2,105 行
        fgets(string, sizeof(string), fp);    while ((strcmp(string, "\\data\\\n") != 0) && (!feof(fp)));    if (strcmp(string, "\\data\\\n") != 0)        E_FATAL("No \\data\\ mark in LM file\n");    *n_ug = *n_bg = *n_tg = 0;    while (fgets(string, sizeof(string), fp) != NULL) {        if (sscanf(string, "ngram %d=%d", &ngram, &ngram_cnt) != 2)            break;        switch (ngram) {        case 1:            *n_ug = ngram_cnt;            break;        case 2:            *n_bg = ngram_cnt;            break;        case 3:            *n_tg = ngram_cnt;            break;        default:            E_FATAL("Unknown ngram (%d)\n", ngram);            break;        }    }    /* Position file to just after the unigrams header '\1-grams:\' */    while ((strcmp(string, "\\1-grams:\n") != 0) && (!feof(fp)))        fgets(string, sizeof(string), fp);    /* Check counts;  NOTE: #trigrams *CAN* be 0 */    if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0))        E_FATAL("Bad or missing ngram count\n");}/* * Read in the unigrams from given file into the LM structure model.  On * entry to this procedure, the file pointer is positioned just after the * header line '\1-grams:'. */static voidReadUnigrams(FILE * fp, lm_t * model){    char string[256];    char name[128];    int32 wcnt;    float p1, bo_wt;    E_INFO("Reading unigrams\n");    wcnt = 0;    while ((fgets(string, sizeof(string), fp) != NULL) &&           (strcmp(string, "\\2-grams:\n") != 0)) {        if (sscanf(string, "%f %s %f", &p1, name, &bo_wt) != 3) {            if (string[0] != '\n')                E_WARN("Format error; unigram ignored:%s", string);            continue;        }        if (wcnt >= model->ucount)            E_FATAL("Too many unigrams\n");        /* Associate name with word id */        word_str[wcnt] = ckd_salloc(name);        hash_table_enter(model->HT, word_str[wcnt], (void *) wcnt);        model->unigrams[wcnt].prob1.f = p1;        model->unigrams[wcnt].bo_wt1.f = bo_wt;        model->unigrams[wcnt].mapid = wcnt;        wcnt++;    }    if (model->ucount != wcnt) {        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",               model->ucount, wcnt);        model->ucount = wcnt;    }}/* * Read bigrams from given file into given model structure.  File may be arpabo * or arpabo-id format, depending on idfmt = 0 or 1. */static voidReadBigrams(FILE * fp, lm_t * model, int32 idfmt){    char string[1024], word1[256], word2[256];    int32 w1, w2, prev_w1, bgcount, p;    bigram_t *bgptr;    float p2, bo_wt;    int32 n_fld, n;    E_INFO("Reading bigrams\n");    bgcount = 0;    bgptr = model->bigrams;    prev_w1 = -1;    n_fld = (model->tcount > 0) ? 4 : 3;    bo_wt = 0.0;    while (fgets(string, sizeof(string), fp) != NULL) {        if (!idfmt)            n = sscanf(string, "%f %s %s %f", &p2, word1, word2, &bo_wt);        else            n = sscanf(string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt);        if (n < n_fld) {            if (string[0] != '\n')                break;            continue;        }        if (!idfmt) {            if ((w1 = wstr2wid(model, word1)) == NO_WORD)                E_FATAL("Unknown word: %s\n", word1);            if ((w2 = wstr2wid(model, word2)) == NO_WORD)                E_FATAL("Unknown word: %s\n", word2);        }        else {            if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w1 < 0)                || (w2 < 0))                E_FATAL("Bad bigram: %s", string);        }        /* HACK!! to quantize probs to 4 decimal digits */        p = p2 * 10000;        p2 = p * 0.0001;        p = bo_wt * 10000;        bo_wt = p * 0.0001;        if (bgcount >= model->bcount)            E_FATAL("Too many bigrams\n");        bgptr->wid = w2;        bgptr->prob2 = sorted_id(&sorted_prob2, &p2);        if (model->tcount > 0)            bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt);        if (w1 != prev_w1) {            if (w1 < prev_w1)                E_FATAL("Bigrams not in unigram order\n");            for (prev_w1++; prev_w1 <= w1; prev_w1++)                model->unigrams[prev_w1].bigrams = bgcount;            prev_w1 = w1;        }        bgcount++;        bgptr++;        if ((bgcount & 0x0000ffff) == 0) {            E_INFOCONT(".");        }    }    if ((strcmp(string, "\\end\\\n") != 0)        && (strcmp(string, "\\3-grams:\n") != 0))        E_FATAL("Bad bigram: %s\n", string);    for (prev_w1++; prev_w1 <= model->ucount; prev_w1++)        model->unigrams[prev_w1].bigrams = bgcount;}/* * Very similar to ReadBigrams. */static voidReadTrigrams(FILE * fp, lm_t * model, int32 idfmt){    char string[1024], word1[256], word2[256], word3[256];    int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg,        p;    int32 seg, prev_seg, prev_seg_lastbg;    trigram_t *tgptr;    bigram_t *bgptr;    float p3;    E_INFO("Reading trigrams\n");    tgcount = 0;    tgptr = model->trigrams;    prev_w1 = -1;    prev_w2 = -1;    prev_bg = -1;    prev_seg = -1;    while (fgets(string, sizeof(string), fp) != NULL) {        if (!idfmt)            n = sscanf(string, "%f %s %s %s", &p3, word1, word2, word3);        else            n = sscanf(string, "%f %d %d %d", &p3, &w1, &w2, &w3);        if (n != 4) {            if (string[0] != '\n')                break;            continue;        }        if (!idfmt) {            if ((w1 = wstr2wid(model, word1)) == NO_WORD)                E_FATAL("Unknown word: %s\n", word1);            if ((w2 = wstr2wid(model, word2)) == NO_WORD)                E_FATAL("Unknown word: %s\n", word2);            if ((w3 = wstr2wid(model, word3)) == NO_WORD)                E_FATAL("Unknown word: %s\n", word3);        }        else {            if ((w1 >= model->ucount) || (w2 >= model->ucount)                || (w3 >= model->ucount) || (w1 < 0) || (w2 < 0)                || (w3 < 0))                E_FATAL("Bad trigram: %s", string);        }        /* HACK!! to quantize probs to 4 decimal digits */        p = p3 * 10000;        p3 = p * 0.0001;        if (tgcount >= model->tcount)            E_FATAL("Too many trigrams\n");        tgptr->wid = w3;        tgptr->prob3 = sorted_id(&sorted_prob3, &p3);        if ((w1 != prev_w1) || (w2 != prev_w2)) {            /* Trigram for a new bigram; update tg info for all previous bigrams */            if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2)))                E_FATAL("Trigrams not in bigram order\n");            bg = (w1 !=                  prev_w1) ? model->unigrams[w1].bigrams : prev_bg + 1;            endbg = model->unigrams[w1 + 1].bigrams;            bgptr = model->bigrams + bg;            for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);            if (bg >= endbg)                E_FATAL("Missing bigram for trigram: %s", string);            /* bg = bigram entry index for <w1,w2>.  Update tseg_base */            seg = bg >> LOG_BG_SEG_SZ;            for (i = prev_seg + 1; i <= seg; i++)                model->tseg_base[i] = tgcount;            /* Update trigrams pointers for all bigrams until bg */            if (prev_seg < seg) {                int32 tgoff = 0;                if (prev_seg >= 0) {                    tgoff = tgcount - model->tseg_base[prev_seg];                    if (tgoff > 65535)                        E_FATAL("Offset from tseg_base > 65535\n");                }                prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;                bgptr = model->bigrams + prev_bg;                for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;                     prev_bg++, bgptr++)                    bgptr->trigrams = tgoff;                for (; prev_bg <= bg; prev_bg++, bgptr++)                    bgptr->trigrams = 0;            }            else {                int32 tgoff;                tgoff = tgcount - model->tseg_base[prev_seg];                if (tgoff > 65535)                    E_FATAL("Offset from tseg_base > 65535\n");                bgptr = model->bigrams + prev_bg;                for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)                    bgptr->trigrams = tgoff;            }            prev_w1 = w1;            prev_w2 = w2;            prev_bg = bg;            prev_seg = seg;        }        tgcount++;        tgptr++;        if ((tgcount & 0x0000ffff) == 0) {            E_INFOCONT(".");        }    }    if (strcmp(string, "\\end\\\n") != 0)        E_FATAL("Bad trigram: %s\n", string);    for (prev_bg++; prev_bg <= model->bcount; prev_bg++) {        if ((prev_bg & (BG_SEG_SZ - 1)) == 0)            model->tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;        if ((tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535)            E_FATAL("Offset from tseg_base > 65535\n");        model->bigrams[prev_bg].trigrams =            tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ];    }}static FILE *lm_file_open(char const *filename, int32 usepipe){    FILE *fp;    if (usepipe) {#if defined(_WIN32_WCE) || defined(GNUWINCE)        E_FATAL("No popen() on WinCE\n");#else        char command[1024];#ifdef WIN32        /* FIXME: COMPLETELY BOGUS!!! */        sprintf(command, "D:\\compress\\gzip.exe -d -c %s", filename);        if ((fp = _popen(command, "r")) == NULL)            E_FATAL("Cannot popen %s\n", command);#else        sprintf(command, "zcat %s", filename);        if ((fp = popen(command, "r")) == NULL)            E_FATAL("Cannot popen %s\n", command);#endif                          /* !WIN32 */#endif                          /* !GNUWINCE */    }    else {        fp = myfopen(filename, "r");    }    return (fp);}static int32lm_get_classid(lm_t * model, char *name){    int32 i;    if (!model->lmclass)        return -1;    for (i = 0; i < model->n_lmclass; i++) {        if (strcmp(lmclass_getname(model->lmclass[i]), name) == 0)            return (i + LM_CLASSID_BASE);    }    return -1;}static int32get_dict_size(int32 *inout_n_unigram, char const *lmname){    int32 dict_size, max_new_oov;    dict_size = word_dict->dict_entry_count;    E_INFO("%d words in dictionary\n", dict_size);    /*     * If this is the "BASE" LM also count space for OOVs and words added at run time.     * UGLY!!  Assumes that OOVs will only be added to LM with no name.     */    if (lmname[0] == '\0') {        int32 first_oov, last_oov;        first_oov = dict_get_first_initial_oov();        last_oov = dict_get_last_initial_oov();        *inout_n_unigram += (last_oov - first_oov + 1);    }    /* Add space for words added in at run time */    max_new_oov = cmd_ln_int32("-maxnewoov");    *inout_n_unigram += max_new_oov;    if (dict_size >= 65535)        E_FATAL("#dict-words(%d) > 65534\n", dict_size);    return dict_size;}static int32lmtext_load(char const *filename, char const *lmname, lm_t **out_model){    FILE *fp;    size_t k;    int32 usingPipe = FALSE;    int32 n_unigram;    int32 n_bigram;    int32 n_trigram;    lm_t *model;    int32 idfmt, dict_size, i;    char *kbdumpdir, dumpfile[1024];    /* Check if a compressed file */    k = strlen(filename);#ifdef WIN32    usingPipe = (k > 3) && ((strcmp(filename + k - 3, ".gz") == 0)                            || (strcmp(filename + k - 3, ".GZ") == 0));#else    usingPipe = (k > 2) && ((strcmp(filename + k - 2, ".Z") == 0)                            || (strcmp(filename + k - 2, ".z") == 0));#endif    /* Check if an .arpabo-id format file; More HACK!! Hardwired check for -id */    if (usingPipe)        k -= 2;    idfmt = ((k > 3) && (strncmp(filename + k - 3, "-id", 3) == 0));    fp = lm_file_open(filename, usingPipe);    /* Read #unigrams, #bigrams, #trigrams from file */    ReadNgramCounts(fp, &n_unigram, &n_bigram, &n_trigram);    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);    /* Determine dictionary size (for dict-wid -> LM-wid map) */    dict_size = get_dict_size(&n_unigram, lmname);    /* Allocate space for word strings. */    word_str = ckd_calloc(n_unigram, sizeof(char *));    /* Allocate space for LM, including initial OOVs and placeholders; initialize it */    model = NewModel(n_unigram, n_bigram, n_trigram, dict_size);    /* Create name for binary dump form of Darpa LM file */#ifdef WIN32    for (i = strlen(filename) - 1;         (i >= 0) && (filename[i] != '\\') && (filename[i] != '/');         --i);#else    for (i = strlen(filename) - 1; (i >= 0) && (filename[i] != '/');         --i);#endif    i++;    kbdumpdir = cmd_ln_str("-lmdumpdir");    /* form dumpfilename */    if (kbdumpdir)
lm_3g.c - 源码说明

本页面展示了「WinCE平台上的语音识别程序」中的 lm_3g.c 源码文件，采用 C语言编程语言编写，共 2,105 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WinCE相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?