⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 paper.ps

📁 这是一款很好用的工具包
💻 PS
📖 第 1 页 / 共 4 页
字号:
2[54 54 50 42 54 1[46 58 54 71 50 2[29 58 58 46 50 5454 50 54 11[37 37 37 37 37 2[19 46[{TeXBase1Encoding ReEncodeFont}4875.000000 /Times-Bold rf /Fm 134[50 1[72 2[28 39 33 1[5050 50 78 28 50 1[28 50 50 1[44 50 44 50 44 11[72 61 5566 1[55 2[89 61 2[33 5[66 1[72 6[28 10[28 25 1[25 44[{TeXBase1Encoding ReEncodeFont}33 100.000000 /Times-Romanrf /Fn 139[28 39 39 2[50 50 1[28 44 5[44 50 44 1[50 13[5017[61 65[{TeXBase1Encoding ReEncodeFont}13 100.000000/Times-Italic rf /Fo 104[100 62[72 2[72 66 55 72 2[7872 94 66 78 1[39 1[78 1[66 72 1[66 72 65[{TeXBase1Encoding ReEncodeFont}17 100.000000 /Times-Bold rf end%%EndProlog%%BeginSetup%%Feature: *Resolution 600dpiTeXDict begin%%EndSetup%%Page: 1 11 0 bop 404 260 a Fo(SRILM)26 b(\227)e(AN)h(EXTENSIBLE)i(LANGU)-6b(A)h(GE)25 b(MODELING)f(T)n(OOLKIT)1562 482 y Fn(Andr)l(eas)g(Stolc)n(k)o(e)977 703 y Fm(Speech)i(T)-7 b(echnology)23 b(and)i(Research)h(Laboratory)1022 814 y(SRI)g(International,)e(Menlo)g(P)o(ark,)h(CA,)g(U.S.A.)1335 924 y(http://www)-6 b(.speech.sri.com/)5991235 y Fl(ABSTRA)l(CT)-184 1395 y Fk(S)t(R)t(I)t(L)t(M)23b Fj(is)g(a)g(collection)g(of)g(C++)g(libraries,)h(e)o(x)o(ecutable)g(programs,)h(and)-186 1482 y(helper)16 b(scripts)f(designed)i(to)e(allo)n(w)h(both)g(production)h(of)e(and)h(e)o(xperimen-)-1861569 y(tation)21 b(with)f(statistical)f(language)j(models)g(for)e(speech)i(recognition)g(and)-186 1656 y(other)k(applications.)46b Fk(S)t(R)t(I)t(L)t(M)26 b Fj(is)f(freely)g(a)o(v)n(ailable)h(for)g(noncommercial)-186 1742 y(purposes.)41 b(The)24 b(toolkit)h(supports)g(creation)g(and)g(e)n(v)n(aluation)h(of)e(a)g(v)n(ari-)-1861829 y(ety)i(of)g(language)i(model)f(types)f(based)h(on)g(N-gram)f(statistics,)h(as)f(well)-186 1916 y(as)k(se)n(v)o(eral)g(related)f(tasks,)k(such)d(as)g(statistical)e(tagging)j(and)f(manipu-)-1862003 y(lation)e(of)g(N-best)g(lists)g(and)g(w)o(ord)h(lattices.)50b(This)28 b(paper)h(summarizes)-186 2089 y(the)20 b(functionality)g(of)g(the)g(toolkit)g(and)g(discusses)h(its)e(design)i(and)f(imple-)-1862176 y(mentation,)h(highlighting)g(ease)g(of)f(rapid)g(prototyping,)i(reusability)-5 b(,)21 b(and)-186 2263 y(combinability)f(of)f(tools.)452 2469 y Fl(1.)45 b(INTR)n(ODUCTION)-186 2635 y Fj(Statistical)33b(language)k(modeling)f(is)e(the)h(science)h(\(and)g(often)f(art\))f(of)-186 2722 y(b)o(uilding)41 b(models)g(that)g(estimate)f(the)h(prior)f(probabilities)h(of)g(w)o(ord)-186 2809 y(strings.)28b(Language)22 b(modeling)f(has)g(man)o(y)h(applications)f(in)f(natural)h(lan-)-186 2896 y(guage)f(technology)h(and)e(other)g(areas)g(where)h(sequences)g(of)f(discrete)g(ob-)-186 2982 y(jects)24b(play)h(a)g(role,)g(with)f(prominent)i(roles)e(in)g(speech)i(recognition)g(and)-186 3069 y(natural)18 b(language)i(tagging)f(\(including)g(specialized)f(tasks)h(such)f(as)g(part-)-1863156 y(of-speech)25 b(tagging,)g(w)o(ord)g(and)f(sentence)h(se)o(gmentation,)g(and)g(shallo)n(w)-186 3243 y(parsing\).)56b(As)29 b(pointed)i(out)f(in)f([1],)j(the)e(main)g(techniques)h(for)e(ef)n(fec-)-186 3329 y(ti)n(v)o(e)20 b(language)h(modeling)f(ha)o(v)o(e)g(been)h(kno)n(wn)g(for)e(at)h(least)f(a)h(decade,)h(al-)-1863416 y(though)h(one)g(suspects)g(that)f(important)g(adv)n(ances)i(are)e(possible,)h(and)f(in-)-186 3503 y(deed)d(needed,)h(to)f(bring)g(about)g(signi\002cant)g(breakthroughs)i(in)d(the)h(appli-)-1863590 y(cation)i(areas)g(cited)g(abo)o(v)o(e\227such)i(breakthroughs)g(just)d(ha)o(v)o(e)h(been)h(v)o(ery)-186 3676 y(hard)e(to)g(come)h(by)f([2,)g(3].)-61 3770 y(V)-8 b(arious)28 b(softw)o(are)g(packages)i(for)e(statistical)f(language)j(modeling)-186 3857 y(ha)o(v)o(e)20b(been)h(in)e(use)i(for)e(man)o(y)i(years\227the)f(basic)g(algorithms)g(are)g(simple)-186 3944 y(enough)h(that)e(one)g(can)h(easily)f(implement)g(them)g(with)g(reasonable)h(ef)n(fort)-1864030 y(for)29 b(research)i(use.)55 b(One)30 b(such)g(package,)j(the)d(CMU-Cambridge)g(LM)-186 4117 y(toolkit)c([1],)g(has)g(been)h(in)e(wide)h(use)g(in)g(the)f(research)i(community)g(and)-1864204 y(has)22 b(greatly)f(f)o(acilitated)g(the)g(construction)i(of)e(language)i(models)f(\(LMs\))-186 4291 y(for)d(man)o(y)g(practitioners.)-61 4384 y(This)35 b(paper)h(describes)h(a)e(f)o(airly)h(recent)g(addition)g(to)g(the)g(set)f(of)-186 4471 y(publicly)23b(a)o(v)n(ailable)g(LM)f(tools,)h(the)f(SRI)g(Language)i(Modeling)f(T)-6 b(oolkit)-186 4558 y(\()r Fk(S)t(R)t(I)t(L)t(M)r Fj(\).)52b(Compared)31 b(to)e(e)o(xisting)h(LM)f(tools,)34 b Fk(S)t(R)t(I)t(L)t(M)29 b Fj(of)n(fers)h(a)f(pro-)-186 4645 y(gramming)e(interf)o(ace)f(and)h(an)f(e)o(xtensible)g(set)g(of)g(LM)f(classes,)j(se)n(v)o(eral)-186 4731 y(non-standard)e(LM)e(types,)h(and)g(more)f(a)g(comprehensi)n(v)o(e)j(functionality)-186 4818 y(that)i(goes)h(be)o(yond)g(language)h(modeling)f(to)e(include)i(tagging,)i(N-best)-186 4905y(rescoring,)22 b(and)f(other)g(applications.)29 b(This)20b(paper)h(describes)h(the)e(design)-186 4992 y(philosophy)32b(and)f(k)o(e)o(y)h(implementation)f(choices)g(in)i Fk(S)t(R)t(I)t(L)t(M)r Fj(,)e(summa-)-186 5078 y(rizes)19 b(its)f(capabilities,)g(and)i(concludes)g(by)f(discussing)h(de\002ciencies)g(and)-1865165 y(plans)j(for)g(future)g(de)n(v)o(elopment.)37 b(F)o(or)22b(lack)h(of)g(space)g(we)g(must)g(refer)f(to)-186 5252y(other)16 b(publications)h(for)f(an)h(introduction)g(to)f(language)h(modeling)g(and)g(its)-186 5339 y(role)i(in)g(speech)h(recognition)g(and)f(other)g(areas)h([3,)e(4)q(].)2359 1235 y Fl(2.)45b(DESIGN)18 b(GO)m(ALS)h(AND)f(HIST)o(OR)m(Y)1977 1389y Fk(S)t(R)t(I)t(L)t(M)j Fj(gre)n(w)f(out)h(of)f(a)h(dissatisf)o(action)g(with)f(pre)n(viously)h(a)o(v)n(ailable)g(LM)19751476 y(tools)e(at)g(SRI,)e(and)i(a)g(desire)g(to)f(design)i(an)f(LM)f(toolkit)h(from)f(the)h(ground)1975 1563 y(up,)g(with)g(the)g(follo)n(wing)g(goals)h(in)f(mind:)2020 1678 y Fi(\017)42 b Fj(Ef)n(\002cient)31 b(and)i(careful)f(implementation)g(of)g(state-of-the-art)g(LM)21001765 y(algorithms,)g(to)d(support)h(de)n(v)o(elopment)h(of)f(competiti)n(v)o(e)g(systems,)2100 1851 y(mainly)19 b(in)g(speech)h(recognition.)2020 1966 y Fi(\017)42 b Fj(Fle)o(xibility)27 b(and)i(e)o(xtendibility)-5 b(,)31 b(so)e(as)f(to)g(f)o(acilitate)g(research)h(into)21002053 y(ne)n(w)21 b(types)h(of)f(LMs,)g(while)g(being)h(able)g(to)f(reuse)g(e)o(xisting)h(compo-)2100 2140 y(nents.)20202255 y Fi(\017)42 b Fj(A)23 b(rational,)h(clean)f(softw)o(are)h(design,)h(pro)o(viding)f(both)g(an)f(applica-)2100 2342y(tion)18 b(programming)h(interf)o(ace)f(\(API\))e(and)i(a)g(con)m(v)o(enient)h(toolbox)g(of)2100 2428 y(commands)h(for)f(LM)g(b)o(uilding)g(and)h(testing.)2100 2544 y(The)28 b(design)i(w)o(as)f(in\003uenced)g(by)g(other)g(related)g(softw)o(are)g(imple-)1975 2630y(mentations.)67 b(The)33 b(CMU-Cambridge)h(toolkit)f([1],)j(and)d(discussions)1975 2717 y(with)19 b(its)f(original)g(author)m(,)h(Roni)g(Rosenfeld,)g(serv)o(ed)g(as)g(a)f(general)i(inspi-)19752804 y(ration)g(and)g(reference)g(point.)k(The)c(HTK)e(Lattice)h(T)-6b(oolkit)19 b([5])g(\(to)g(which)1977 2891 y Fk(S)t(R)t(I)t(L)t(M)dFj(has)h(an)f(interf)o(ace\))g(pro)o(vided)h(man)o(y)g(good)g(ideas)f(for)g(a)g(viable)g(and)1975 2977 y(ef)n(\002cient)i(API)f(for)h(language)h(models.)24 b(The)17 b(decision)i(to)f(e)o(xplore)g(object-)1975 3064 y(oriented)g(design)f(w)o(as)g(based)h(on)f(a)g(prior)g(project,)g(an)g(implementation)h(of)1975 3151 y(v)n(arious)23b(types)f(of)g(statistical)f(grammars)h(in)g(the)g(Common)g(Lisp)g(Object)1975 3238 y(System)g([6].)31 b(The)22 b(softw)o(are)g(b)o(uild)f(system)h(w)o(as)g(borro)n(wed)h(from)f(SRI')l(s)19753334 y(Decipher)2250 3302 y Fk(T)t(M)2369 3334 y Fj(speech)e(recognition)g(system)f([7)q(].)2100 3420 y(A)c(\002rst)f(implementation)i(with)f(minimal)g(functionality)h(for)f(standard)19753507 y(N-gram)k(models)h(w)o(as)e(created)i(prior)e(to)h(the)g(1995)g(Johns)h(Hopkins)g(Lan-)1975 3594 y(guage)j(Modeling)g(Summer)e(W)-6b(orkshop)23 b([8].)30 b(By)22 b(the)f(end)h(of)g(the)f(w)o(ork-)19753681 y(shop,)h(support)g(for)f(dynamic)h(LM)e(interpolation)i(and)f(N-best)g(rescoring)1975 3768 y(had)f(been)f(added,)h(and)f(a)g(small)f(community)i(of)e(users)h(outside)g(SRI)f(with)1975 3854y(an)25 b(associated)g(mailing)f(list)f(e)o(xisted.)39b(Ov)o(er)24 b(the)g(ne)o(xt)g(four)g(years)h(a)f(se-)19753941 y(ries)30 b(of)g(alpha)h(v)o(ersions)g(were)f(made)g(a)o(v)n(ailable)g(to)g(this)g(small)g(group,)1975 4028 y(while)19b(much)g(of)g(the)g(current)g(functionality)g(\(described)h(belo)n(w\))f(w)o(as)f(be-)1975 4115 y(ing)e(added.)23 b(In)15 b(July)h(1999)h(a)e(beta)g(v)o(ersion)h(w)o(as)g(released)g(for)f(general)h(dis-)19754201 y(trib)o(ution)j(under)g(an)h(open)f(source)h(license,)f(follo)n(wed)g(about)g(a)g(year)g(later)1975 4288 y(by)f(v)o(ersion)f(1.0.)23b(As)16 b(of)h(this)f(writing,)h(the)g(latest)f(released)h(v)o(ersion)g(is)g(1.3,)1975 4375 y(which)29 b(added)g(a)f(w)o(ord)g(graph)h(rescoring)g(tool,)h(a)d(test)h(suite,)i(and)e(sup-)19754462 y(port)d(for)g(W)m(indo)n(ws)h(platforms)f(\(pre)n(vious)h(v)o(ersions)f(were)g(Unix-only\).)1975 4548 y(Most)f(ongoing)g(go)o(v)o(ernment-funded)i(LM)c(research)h(and)h(de)n(v)o(elopment)19754635 y(at)d(SRI)f(is)h(based)g(on)j Fk(S)t(R)t(I)t(L)t(M)rFj(;)19 b(we)i(therefore)h(e)o(xpect)f(a)g(steady)h(stream)f(of)19754722 y(functionality)f(enhancements)h(\(as)e(well)f(as)h(b)o(ug)g(\002x)o(es\))g(to)g(continue.)2594 4891 y Fl(3.)45 b(FUNCTION)o(ALITY)1975 5045 y(3.1.)g(Basic)19 b(LM)f(operations)1975 5165y Fj(The)30 b(main)f(purpose)h(of)i Fk(S)t(R)t(I)t(L)t(M)dFj(is)g(to)g(support)h(language)g(model)g(esti-)19755252 y(mation)22 b(and)h(e)n(v)n(aluation.)32 b(Estimation)21b(means)h(the)g(creation)g(of)g(a)f(model)1975 5339 y(from)30b(training)g(data;)35 b(e)n(v)n(aluation)c(means)g(computing)g(the)f(probability)p eop%%Page: 2 22 1 bop -186 83 a Fj(of)17 b(a)f(test)h(corpus,)h(con)m(v)o(entionally)g(e)o(xpressed)g(as)f(the)g(test)f(set)h(perple)o(xity)-5b(.)-186 170 y(Since)16 b(most)h(LMs)f(in)j Fk(S)t(R)t(I)t(L)t(M)dFj(are)h(based)g(on)g(N-gram)g(statistics,)f(the)g(tools)-186257 y(to)j(accomplish)h(these)g(tw)o(o)f(purposes)i(are)e(named)hFh(ngram-count)e Fj(and)-186 343 y Fh(ngram)p Fj(,)k(respecti)n(v)o(ely)-5 b(.)35 b(A)23 b(standard)g(LM)g(\(trigram)f(with)h(Good-T)m(uring)-186 430 y(discounting)d(and)g(Katz)f(back)o(of)n(f)h(for)f(smoothing\))h(w)o(ould)g(be)f(created)g(by)-186 571y Fh(ngram-count)43 b(-text)h(TRAINDATA)f(-lm)i(LM)-61711 y Fj(The)16 b(resulting)g(LM)g(may)h(then)f(be)h(e)n(v)n(aluated)g(on)g(a)f(test)f(corpus)j(using)-186 852 y Fh(ngram)44b(-lm)g(LM)h(-ppl)f(TESTDATA)f(-debug)h(2)-61 993 y Fj(The)29b Fh(ngram)44 b(-debug)29 b Fj(option)i(controls)f(the)g(le)n(v)o(el)g(of)g(detail)g(of)-186 1079 y(diagnostic)g(output.)55b(A)29 b(v)n(alue)h(of)g(2)f(means)h(that)f(probabilities)h(are)f(to)-186 1166 y(be)20 b(reported)g(at)f(the)h(w)o(ord)g(le)n(v)o(el,)f(including)i(the)f(order)g(of)f(N-gram)h(used,)-186 1253y(in)d(addition)g(to)f(the)h(standard)h(log)e(probabilities)h(and)h(perple)o(xities.)k(Some)-186 1340 y(additional)16 b(statistics)e(that)g(also)h(help)h(gauge)f(LM)g(quality)g(are)g(the)g(number)-1861426 y(of)24 b(out-of-v)o(ocab)o(ulary)i(\(OO)l(V\))e(w)o(ords)h(and)h(the)e(\223hit)g(rates\224)h(of)f(v)n(arious)-186 1513y(le)n(v)o(els)18 b(of)h(N-grams)f(\(in)g(LMs)h(based)g(on)g(N-grams\))f([1];)g(these)h(are)f(either)-186 1600 y(computed)i(by)eFh(ngram)f Fj(itself)h(or)g(\(as)g(in)g(the)g(case)h(of)f(hit)g(rates\))g(tallied)g(by)-186 1687 y(auxiliary)h(scripts)g(that)g(analyze)h(the)f Fh(ngram)f Fj(output.)-59 1773 y Fk(S)t(R)t(I)t(L)t(M)23 b Fj(by)g(itself)g(performs)g(no)h(te)o(xt)f(conditioning,)i(and)f(treats)f(e)n(v-)-186 1860 y(erything)30 b(between)f(whitespace)h(as)f(a)f(w)o(ord.)54 b(Normalization)29 b(and)g(to-)-1861947 y(k)o(enization)d(of)e(te)o(xt)h(are)f(highly)i(corpus-dependent,)i(and)d(are)g(typically)-186 2034 y(accomplished)c(with)d(\002lters)g(that)h(preprocess)h(the)f(data.)-186 2194 y Fl(3.2.)45b(Bells)18 b(and)g(whistles)-186 2314 y Fj(The)32 b(programs)hFh(ngram-count)d Fj(and)j Fh(ngram)e Fj(ha)o(v)o(e)h(a)g(rather)g(lar)o(ge)-186 2401 y(number)23 b(of)e(options)i(to)e(control)h(the)g(man)o(y)g(parameters)g(of)g(LM)f(estima-)-186 2487 y(tion)i(and)h(testing.)36 b(The)23 b(most)g(important)g(parameters)h(for)f(LM)g(training)-1862574 y(are)-141 2688 y Fi(\017)42 b Fj(the)21 b(order)g(of)g(N-grams)h(to)f(use)g(\(e.g.,)g(unigram,)h(bigram\).)30 b(There)22b(is)-61 2775 y(no)d(b)o(uilt-in)f(limit)g(on)h(the)g(length)h(of)f(N-grams.)-141 2888 y Fi(\017)42 b Fj(the)37 b(type)g(of)g(discounting)i(algorithm)e(to)g(use.)78 b(Currently)37 b(sup-)-612975 y(ported)27 b(methods)h(include)g(Good-T)m(uring,)h(absolute,)h(W)m(itten-Bell,)-61 3062 y(and)15 b(modi\002ed)g(Kneser)o(-Ne)o(y)f([9)q(].)21 b(Each)15 b(of)f(these)h(discounting)h(meth-)-613149 y(ods)23 b(requires)g(its)f(o)n(wn)h(set)g(of)g(parameters,)g(as)g(well)f(as)h(a)g(choice)g(of)-61 3235 y(whether)18 b(higher)o(-)g(and)h(lo)n(wer)o(-order)f(estimates)g(are)g(to)g(be)h(combined)-613322 y(via)g(back)o(of)n(f)h(or)f(interpolation.)-1413436 y Fi(\017)42 b Fj(an)22 b(optional)g(prede\002ned)h(v)o(ocab)o(ulary)g(to)f(limit)f(or)g(augment)i(the)f(set)-61 3523y(of)c(w)o(ords)i(from)f(the)g(training)g(data.)-1413636 y Fi(\017)42 b Fj(whether)28 b(unkno)n(wn)j(w)o(ords)e(are)f(to)h(be)f(discarded)i(or)e(treated)h(as)f(a)-61 3723 y(special)19b(\223unkno)n(wn)i(w)o(ord\224)e(tok)o(en.)-141 3837y Fi(\017)42 b Fj(whether)19 b(to)g(collapse)g(case)g(distinctions)h(in)f(the)g(input)g(te)o(xt.)-186 3950 y(Be)o(yond)i(LM)g(estimation,)fFh(ngram-count)e Fj(performs)j(useful)g(N-gram)-186 4037y(count)29 b(manipulations,)j(such)d(as)f(generating)i(counts)f(from)g(te)o(xt,)h(sum-)-186 4124 y(ming)16 b(count)f(\002les,)g(and)h(recomputing)h(lo)n(wer)o(-order)e(counts)h(from)f(higher)o(-)-1864211 y(order)g(counts.)23 b Fh(ngram-count)13 b Fj(handles)j(inte)o(ger)f(or)h(fractional)f(counts,)-186 4297 y(although)27b(only)e(a)g(subset)h(of)f(the)h(smoothing)g(algorithms)g(supports)g(the)-186 4384 y(latter)37 b(\(generally)i(speaking,)44b(those)38 b(that)g(rely)g(on)g(counts-of-counts)-1864471 y(statistics)18 b(do)h(not\).)-61 4558 y(The)i(main)g(parameters)g(controlling)h(LM)f(e)n(v)n(aluation)i(are)e(the)g(order)-1864645 y(of)e(N-gram)g(to)g(use)g(\(which)h(can)f(be)g(lo)n(wer)g(than)h(what)f(the)g(LM)g(includes,)-186 4731 y(so)f(that)g(a)g(4-gram)g(model)h(may)f(con)m(v)o(eniently)i(be)e(used)h(also)f(as)g(a)g(bigram)-186 4818 y(or)25 b(trigram)h(model\),)h(and)f(the)g(v)n(ariant)f(of)h(N-gram)g(model)g(to)f(use\227for)-186 4905 y(e)o(xample,)32b(a)d(w)o(ord-based,)j(class-based,)g(or)d(interpolated)h(N-gram,)h(as)-186 4992 y(well)17 b(as)h(an)o(y)g(additional)g(parameters)h

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -