⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tokenizedlmtest.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        //              + (1-lambda(a)) P(EOS)
        //         P(EOS) = lambda() * Pml(EOS)
        lambda_EOS = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_A_giv_EOS = 1.0;
        lambda_ = 3.0 / (3.0 + 4.0 * 2.0);
        double pml_A = 1.0 / 3.0;
        double p_A = lambda_ * pml_A;
        double p_A_giv_EOS = lambda_EOS * pml_A_giv_EOS
            + (1.0 - lambda_EOS) * p_A;

        double lambda_EOS_A = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_EOS_giv_EOS_A = 1.0;
        double lambda_A = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_EOS_giv_A = 1.0;
        pml_EOS = 2.0 / 3.0;
        p_EOS = lambda_ * pml_EOS;
        double p_EOS_giv_A = lambda_A * pml_EOS_giv_A
            + (1.0 - lambda_A) * p_EOS;
        double p_EOS_giv_EOS_A = lambda_EOS_A * pml_EOS_giv_EOS_A
            + (1.0 - lambda_EOS_A) * p_EOS_giv_A;

        assertEstimate(com.aliasi.util.Math.log2(p_A_giv_EOS
                                                 * p_EOS_giv_EOS_A
                                                 * pws_ * pws_),
                       lm,"a");
    }

    /**
     * Checks that a dynamic model and its compiled round-trip copy
     * produce matching log2 estimates, both before any training and
     * after each successive call to <code>train</code>.
     *
     * @throws ClassNotFoundException declared by the estimate helpers
     * @throws IOException declared by the estimate helpers
     */
    public void testTwo() throws ClassNotFoundException, IOException {
        TokenizedLM lm
            = new TokenizedLM(new IndoEuropeanTokenizerFactory(),
                              MAX_NGRAM,
                              new UniformBoundaryLM(127), // unknown tok
                              new UniformBoundaryLM(15), // whitespace
                              LAMBDA_FACTOR);

        // untrained model
        assertEqEstimate(lm,"a");
        assertEqEstimate(lm,"a b");
        assertEqEstimate(lm,"a a b");
        assertEqEstimate(lm,"a b a");

        lm.train("a");
        assertEqEstimate(lm,"a");
        assertEqEstimate(lm,"a b");
        assertEqEstimate(lm,"a a b");
        assertEqEstimate(lm,"a b a");

        lm.train("a b c");
        assertEqEstimate(lm,"a");
        assertEqEstimate(lm,"a b");
        assertEqEstimate(lm,"a b e");

        lm.train("x y");
        assertEqEstimate(lm,"x y a b e x y");
        assertEqEstimate(lm,"");
        assertEqEstimate(lm,"x");
    }

    /**
     * Checks the collocations returned for bigrams and trigrams on two
     * small training sets, and that the call
     * <code>collocations(1,1,3)</code> is rejected with an
     * <code>IllegalArgumentException</code>.
     */
    public void testCollocs() {
        TokenizedLM lm
            = new TokenizedLM(new IndoEuropeanTokenizerFactory(),4);
        lm.train("a b c d");
        lm.train("a b e f");
        lm.train("c f e");
        // NOTE(review): arguments look like (nGram, minCount, maxReturned)
        // -- confirm against the TokenizedLM API.
        ScoredObject[] collocs
            = lm.collocations(2,1,2);
        assertEquals(2,collocs.length);
        assertEqualsArray(new String[] { "a", "b" },
                          (String[]) collocs[0].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) collocs[1].getObject());

        lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(),4);
        lm.train("a b c d");
        lm.train("a b c e");
        lm.train("d e f");
        lm.train("f d e");
        lm.train("e f d");
        collocs = lm.collocations(3,1,2);
        assertEquals(2,collocs.length);
        // only the top-ranked trigram is pinned; the second is unchecked
        assertEqualsArray(new String[] { "a", "b", "c" },
                          (String[]) collocs[0].getObject());

        try {
            lm.collocations(1,1,3);
            fail();
        } catch (IllegalArgumentException e) {
            succeed();
        }
    }

    /**
     * Trains two models on overlapping data and checks the top-ranked
     * bigram reported by <code>newTerms</code> and
     * <code>oldTerms</code> in each direction, then checks the
     * ordering of <code>frequentTerms</code> and
     * <code>infrequentTerms</code> on the first model.
     */
    public void testNewAndOldTerms() {
        TokenizedLM lm1
            = new TokenizedLM(new IndoEuropeanTokenizerFactory(),3);
        TokenizedLM lm2
            = new TokenizedLM(new IndoEuropeanTokenizerFactory(),3);

        // need several instances to overcome boundaries and unknowns
        lm1.train("b c d");
        lm1.train("b c d");
        lm1.train("b c d");
        lm1.train("b c f");

        lm2.train("b c x");
        lm2.train("b c x");
        lm2.train("b c x");
        lm2.train("b c y");

        ScoredObject[] newTerms1 = lm1.newTerms(2,1,3,lm2);
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) newTerms1[0].getObject());

        ScoredObject[] newTerms2 = lm2.newTerms(2,1,2,lm1);
        assertEqualsArray(new String[] { "c", "x" },
                          (String[]) newTerms2[0].getObject());

        ScoredObject[] oldTerms1 = lm1.oldTerms(2,1,3,lm2);
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) oldTerms1[0].getObject());

        ScoredObject[] oldTerms2 = lm2.oldTerms(2,1,3,lm1);
        assertEqualsArray(new String[] { "c", "y" },
                          (String[]) oldTerms2[0].getObject());

        ScoredObject[] fTerms1 = lm1.frequentTerms(2,10);
        assertEqualsArray(new String[] { "b", "c" },
                          (String[]) fTerms1[0].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) fTerms1[1].getObject());
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) fTerms1[2].getObject());

        // infrequent terms of the same model: expected in reverse order
        ScoredObject[] rareTerms1 = lm1.infrequentTerms(2,10);
        assertEqualsArray(new String[] { "b", "c" },
                          (String[]) rareTerms1[2].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) rareTerms1[1].getObject());
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) rareTerms1[0].getObject());
    }

    /**
     * Asserts that the model's log2 estimate for the sequence is
     * within 0.005 of the expected value, and that the compiled
     * round-trip copy of the model agrees with the model itself.
     *
     * @param estimate expected log2 estimate
     * @param lm model under test
     * @param cSeq character sequence to estimate
     * @throws ClassNotFoundException declared for signature
     * compatibility with {@link #assertEqEstimate}
     * @throws IOException declared for signature compatibility with
     * {@link #assertEqEstimate}
     */
    private void assertEstimate(double estimate,
                                TokenizedLM lm,
                                CharSequence cSeq)
        throws ClassNotFoundException, IOException {

        assertEquals(estimate,lm.log2Estimate(cSeq),0.005);
        assertEqEstimate(lm,cSeq.toString());
    }

    /**
     * Asserts that the model and the compiled model obtained by
     * writing it out and reading it back give log2 estimates for the
     * sequence that differ by at most 0.005.
     *
     * @param lm model under test
     * @param cSeq character sequence to estimate
     * @throws ClassNotFoundException part of the declared signature
     * @throws IOException part of the declared signature
     */
    public void assertEqEstimate(TokenizedLM lm, CharSequence cSeq)
        throws ClassNotFoundException, IOException {

        assertEquals(lm.log2Estimate(cSeq),
                     writeRead(lm).log2Estimate(cSeq),
                     0.005);
    }

    /**
     * Compiles the model to an in-memory byte array and reads the
     * result back as a compiled model.  I/O and class-loading errors
     * are reported through <code>fail</code> rather than thrown.
     *
     * @param lm model to compile
     * @return the compiled model read back from the byte array
     */
    private static CompiledTokenizedLM writeRead(TokenizedLM lm) {
        try {
            ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
            ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
            lm.compileTo(objOut);
            // make sure buffered object data reaches bytesOut before
            // the bytes are copied out
            objOut.flush();
            ByteArrayInputStream bytesIn
                = new ByteArrayInputStream(bytesOut.toByteArray());
            ObjectInputStream objIn = new ObjectInputStream(bytesIn);
            return (CompiledTokenizedLM) objIn.readObject();
        } catch (IOException e) {
            fail(e.toString());
        } catch (ClassNotFoundException e) {
            fail(e.toString());
        }
        return null; // bogus unreachable; compiler doesn't know fail()
    }

    /**
     * Checks that training a sequence repeatedly one call at a time
     * is equivalent to training it once with a count, over 100 random
     * five-token sequences with random counts.
     */
    public void testMultipleIncrements() {
        Random random = new Random();
        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
        TokenizedLM lm1 = new TokenizedLM(tf,3);
        TokenizedLM lm2 = new TokenizedLM(tf,3);
        for (int i = 0; i < 100; ++i) {
            StringBuffer sb = new StringBuffer();
            for (int k = 0; k < 5; ++k) {
                // single character with code in 0..15, then a space
                sb.append((char)random.nextInt(16));
                sb.append(' ');
            }
            // nextInt(10) excludes the bound
            int trainingCount = random.nextInt(10); // train 0 to 9 times
            incrementAssertSynched(lm1,lm2,sb,trainingCount);
        }
    }

    /**
     * Trains the first model with the sequence <code>count</code>
     * separate times and the second model once with that count, then
     * asserts the two models agree.
     *
     * @param lm1 model trained by repeated single calls
     * @param lm2 model trained by one counted call
     * @param cs training sequence
     * @param count number of training instances
     */
    void incrementAssertSynched(TokenizedLM lm1,
                                TokenizedLM lm2,
                                CharSequence cs,
                                int count) {
        for (int i = 0; i < count; ++i)
            lm1.train(cs);
        lm2.train(cs,count);
        assertSynched(lm1,lm2);
    }

    /**
     * Asserts the two models agree on random sequences of zero to
     * four tokens, repeating each length 100 times.
     *
     * @param lm1 first model
     * @param lm2 second model
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2) {
        for (int i = 0; i < 100; ++i)
            for (int k = 0; k < 5; ++k)
                assertSynched(lm1,lm2,k);
    }

    /**
     * Asserts the two models give log2 estimates within 0.0001 of
     * each other on one random sequence of <code>k</code> characters
     * (codes 0..15), each followed by a space.
     *
     * @param lm1 first model
     * @param lm2 second model
     * @param k number of random characters in the sequence
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2, int k) {
        Random random = new Random();
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < k; ++i) {
            sb.append((char)random.nextInt(16));
            sb.append(' ');
        }
        assertEquals(lm1.log2Estimate(sb),lm2.log2Estimate(sb),0.0001);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -