⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tokenizedlmtest.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.aliasi.test.unit.lm;import com.aliasi.lm.CompiledTokenizedLM;import com.aliasi.lm.TokenizedLM;import com.aliasi.lm.UniformBoundaryLM;import com.aliasi.lm.TrieIntSeqCounter;import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;import com.aliasi.tokenizer.TokenizerFactory;import com.aliasi.test.unit.BaseTestCase;import com.aliasi.symbol.SymbolTable;import com.aliasi.util.ScoredObject;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.util.Random;public class TokenizedLMTest extends BaseTestCase {    private static final int MAX_NGRAM = 3;    private static final double LAMBDA_FACTOR = 4.0;    public void testTrainSequence() {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        TokenizedLM lm = new TokenizedLM(tf,3);        SymbolTable st = lm.symbolTable();        TrieIntSeqCounter counter = lm.sequenceCounter();        // automatically get the BOUNDARY_TOKEN incremented to start        assertEquals(1,counter.count(new int[0],0,0));        String ab = "a b";        String ac = "a c";        String abc = "a b c";        lm.trainSequence(ab,2);        lm.trainSequence(ac,3);        lm.trainSequence(abc,4);                int a = st.symbolToID("a");        int b = st.symbolToID("b");        int c = st.symbolToID("c");                assertEquals(2,counter.count(new int[] { a, b },0,2));        assertEquals(3,counter.count(new int[] { a, c },0,2));        assertEquals(4,counter.count(new int[] { a, b, c},0,3));        assertEquals(5,counter.extensionCount(new int[] { a },0,1));        lm.trainSequence("a a a c c c",111);        assertEquals(111,counter.count(new int[] { c, c, c},0,3));        assertEquals(111,counter.extensionCount(new int[] { c, c},0,2));        lm.trainSequence("",999);        assertEquals(1000,counter.count(new int[0],0,0));    }    public void testZeroGram() {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        try {             new TokenizedLM(tf,                            0,                            new UniformBoundaryLM(16),                            new UniformBoundaryLM(16),                            LAMBDA_FACTOR);            fail();        } catch (IllegalArgumentException e) {            succeed();        }    }    public void testUnigram() {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        TokenizedLM lm             = new TokenizedLM(tf,                              1,                              new UniformBoundaryLM(16),                              new UniformBoundaryLM(16),                              LAMBDA_FACTOR);        lm.train("John Smith");    }    public void testBiggerGram() {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        TokenizedLM lm             = new TokenizedLM(tf,                              4,                              new UniformBoundaryLM(16),                              new UniformBoundaryLM(16),                              LAMBDA_FACTOR);        lm.train("John Smith");    }    public void testChiSquaredIndependence() {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        TokenizedLM lm             = new TokenizedLM(tf,                              3,                              new UniformBoundaryLM(16),                              new UniformBoundaryLM(16),                              LAMBDA_FACTOR);        lm.train("a b c a b d a b e a b");        SymbolTable table = lm.symbolTable();        assertEquals(5,table.numSymbols());        int aI = table.symbolToID("a");        int bI = table.symbolToID("b");        int cI = table.symbolToID("c");        int dI = table.symbolToID("d");        int eI = table.symbolToID("e");        assertTrue(aI >= 0);        assertTrue(bI >= 0);        assertTrue(cI >= 0);        assertTrue(dI >= 0);        assertTrue(eI >= 0);            assertTrue(lm.chiSquaredIndependence(new int[] { aI, bI })                   > lm.chiSquaredIndependence(new int[] { bI, cI }));        assertTrue(lm.chiSquaredIndependence(new int[] { cI, aI })                   > lm.chiSquaredIndependence(new int[] { cI, eI }));    }    public void testConstantSubModels() throws ClassNotFoundException, IOException {        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();        TokenizedLM lm            = new TokenizedLM(tf,                              MAX_NGRAM,                              new UniformBoundaryLM(127), // unknown tok                              new UniformBoundaryLM(15), // whitespace                              LAMBDA_FACTOR);        // INITIAL SYMBOL TRIE        // 2        // (EOS) 1        // P("")        // P(EOS|EOS) + Pws("")        // P(EOS|EOS) = P(EOS) = lambda() * PML(EOS)        double lambda_ = 1.0/(1.0 + 4.0*1.0);        double pml_EOS = 1.0;        double p_EOS = lambda_ * pml_EOS;        double pws_ = 1.0/16.0;        assertEstimate(com.aliasi.util.Math.log2(p_EOS * pws_),                       lm,"");        // "a"        // P(UNK|EOS) * P(EOS|EOS,UNK) * Ptok("a") * Pws("") * Pws("")        // P(UNK|EOS) = P(UNK) = (1 - lambda())        // P(EOS|EOS,UNK) = P(EOS|EOS) = P(EOS)        double p_UNK = 1.0 - lambda_;        double ptok_a = (1.0/128.0) * (1.0/128.0);        assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_EOS                                                  * ptok_a                                                  * pws_ * pws_),                       lm,"a");        // P("a b")        // = P(UNK|EOS) P(UNK|EOS,UNK) P(EOS|UNK,UNK)         //   Pws("") Pws("") Pws(" ")         //   Ptok(a) Ptok(b)        // P(UNK|EOS) = P(UNK)        // P(UNK|EOS,UNK) = P(UNK|UNK) = P(UNK)        // P(EOS|UNK) = P(EOS)        double ptok_b = ptok_a;        double pws_s = pws_ * 1.0/16.0;        assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_EOS                                                 * pws_ * pws_ * pws_s                                                 * ptok_a * ptok_b),                       lm,"a b");        double ptok_c = ptok_b;        assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_UNK * p_EOS                                                 * pws_ * pws_ * pws_s * pws_s                                                 * ptok_a * ptok_b * ptok_c),                       lm,"a b c");        double ptok_d = ptok_b;        assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_UNK * p_UNK * p_EOS                                                 * pws_ * pws_ * pws_s * pws_s * pws_s                                                 * ptok_a * ptok_b * ptok_c * ptok_d),                       lm,"a b c d");        // ============ train on "a" ===================        // go through same estimates, see above            lm.train("a");        // EOS 2        //     a 1        //       EOS 1        // a 1        //   EOS 1        // P("")        // P(EOS|EOS) + Pws("")        // P(EOS|EOS) = lambda(EOS) * PML(EOS|EOS)        //            + (1-lambda(EOS)) * P(EOS)        //            = (1-lambda(EOS)) *(lambda() * PML(EOS))        lambda_ = 3.0/(3.0 + 4.0*2.0);        pml_EOS = 2.0/3.0;        p_EOS = lambda_ * pml_EOS;        double lambda_EOS = 1.0 / (1.0 + 4.0 * 1.0);        double p_EOS_giv_EOS = (1.0 - lambda_EOS) * p_EOS;        assertEstimate(com.aliasi.util.Math.log2(p_EOS_giv_EOS * pws_),                       lm,"");        // "a"        // P(a|EOS) + P(EOS|EOS,a) + Pws("") + Pws("")        // P(a|EOS) = lambda(EOS) Pml(a|EOS) + (1-lambda(EOS)) P(a)            //    P(a) = lambda() * Pml(a)        // P(EOS|EOS,a) = lambda(EOS,a) Pml(EOS|EOS,a)         //              + (1-lambda(EOS,a)) P(EOS|a)        //         //     P(EOS|a) = lambda(a) Pml(EOS|a) 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -