📄 tokenizedlmtest.java
字号:
// + (1-lambda(a)) P(EOS)
        // P(EOS) = lambda() * Pml(EOS)
        lambda_EOS = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_A_giv_EOS = 1.0;
        lambda_ = 3.0 / (3.0 + 4.0 * 2.0);
        double pml_A = 1.0 / 3.0;
        double p_A = lambda_ * pml_A;
        double p_A_giv_EOS = lambda_EOS * pml_A_giv_EOS + (1.0 - lambda_EOS) * p_A;

        double lambda_EOS_A = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_EOS_giv_EOS_A = 1.0;
        double lambda_A = 1.0 / (1.0 + 4.0 * 1.0);
        double pml_EOS_giv_A = 1.0;
        pml_EOS = 2.0 / 3.0;
        p_EOS = lambda_ * pml_EOS;
        double p_EOS_giv_A = lambda_A * pml_EOS_giv_A + (1.0 - lambda_A) * p_EOS;
        double p_EOS_giv_EOS_A = lambda_EOS_A * pml_EOS_giv_EOS_A + (1.0 - lambda_EOS_A) * p_EOS_giv_A;

        assertEstimate(com.aliasi.util.Math.log2(p_A_giv_EOS * p_EOS_giv_EOS_A * pws_ * pws_), lm, "a");
    }

    /**
     * Checks that a dynamic model and its compiled round-trip copy agree on
     * log2 estimates, both before any training and after each successive
     * training call.
     */
    public void testTwo() throws ClassNotFoundException, IOException {
        TokenizedLM lm
            = new TokenizedLM(new IndoEuropeanTokenizerFactory(),
                              MAX_NGRAM,
                              new UniformBoundaryLM(127), // unknown tok
                              new UniformBoundaryLM(15), // whitespace
                              LAMBDA_FACTOR);

        // untrained model
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a a b");
        assertEqEstimate(lm, "a b a");

        lm.train("a");
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a a b");
        assertEqEstimate(lm, "a b a");

        lm.train("a b c");
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a b e");

        lm.train("x y");
        assertEqEstimate(lm, "x y a b e x y");
        assertEqEstimate(lm, "");
        assertEqEstimate(lm, "x");
    }

    /**
     * Checks the token sequences returned by {@code collocations()} for two
     * small training sets, and that {@code collocations(1,1,3)} is rejected
     * with an {@link IllegalArgumentException}.
     */
    public void testCollocs() {
        TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 4);
        lm.train("a b c d");
        lm.train("a b e f");
        lm.train("c f e");

        ScoredObject[] collocs = lm.collocations(2, 1, 2);
        assertEquals(2, collocs.length);
        assertEqualsArray(new String[] { "a", "b" },
                          (String[]) collocs[0].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) collocs[1].getObject());

        lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 4);
        lm.train("a b c d");
        lm.train("a b c e");
        lm.train("d e f");
        lm.train("f d e");
        lm.train("e f d");

        collocs = lm.collocations(3, 1, 2);
        assertEquals(2, collocs.length);
        assertEqualsArray(new String[] { "a", "b", "c" },
                          (String[]) collocs[0].getObject());

        try {
            lm.collocations(1, 1, 3);
            fail();
        } catch (IllegalArgumentException e) {
            succeed();
        }
    }

    /**
     * Trains two models on overlapping data and checks the top-ranked
     * results of {@code newTerms()}, {@code oldTerms()},
     * {@code frequentTerms()} and {@code infrequentTerms()}.
     */
    public void testNewAndOldTerms() {
        TokenizedLM lm1 = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
        TokenizedLM lm2 = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);

        // need several instances to overcome boundaries and unknowns
        lm1.train("b c d");
        lm1.train("b c d");
        lm1.train("b c d");
        lm1.train("b c f");

        lm2.train("b c x");
        lm2.train("b c x");
        lm2.train("b c x");
        lm2.train("b c y");

        ScoredObject[] newTerms1 = lm1.newTerms(2, 1, 3, lm2);
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) newTerms1[0].getObject());

        ScoredObject[] newTerms2 = lm2.newTerms(2, 1, 2, lm1);
        assertEqualsArray(new String[] { "c", "x" },
                          (String[]) newTerms2[0].getObject());

        ScoredObject[] oldTerms1 = lm1.oldTerms(2, 1, 3, lm2);
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) oldTerms1[0].getObject());

        ScoredObject[] oldTerms2 = lm2.oldTerms(2, 1, 3, lm1);
        assertEqualsArray(new String[] { "c", "y" },
                          (String[]) oldTerms2[0].getObject());

        ScoredObject[] fTerms1 = lm1.frequentTerms(2, 10);
        assertEqualsArray(new String[] { "b", "c" },
                          (String[]) fTerms1[0].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) fTerms1[1].getObject());
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) fTerms1[2].getObject());

        // same three bigrams expected, in the reverse order
        ScoredObject[] fTerms2 = lm1.infrequentTerms(2, 10);
        assertEqualsArray(new String[] { "b", "c" },
                          (String[]) fTerms2[2].getObject());
        assertEqualsArray(new String[] { "c", "d" },
                          (String[]) fTerms2[1].getObject());
        assertEqualsArray(new String[] { "c", "f" },
                          (String[]) fTerms2[0].getObject());
    }

    /**
     * Asserts that the model's log2 estimate for the sequence is within
     * 0.005 of the expected value, then that the compiled copy of the model
     * agrees with it.
     *
     * @param estimate expected log2 estimate
     * @param lm model under test
     * @param cSeq character sequence to estimate
     */
    private void assertEstimate(double estimate, TokenizedLM lm,
                                CharSequence cSeq)
        throws ClassNotFoundException, IOException {

        assertEquals(estimate, lm.log2Estimate(cSeq), 0.005);
        assertEqEstimate(lm, cSeq.toString());
    }

    /**
     * Asserts that the model and its compiled round-trip copy produce log2
     * estimates for the sequence that differ by at most 0.005.
     *
     * @param lm model under test
     * @param cSeq character sequence to estimate
     */
    public void assertEqEstimate(TokenizedLM lm, CharSequence cSeq)
        throws ClassNotFoundException, IOException {

        assertEquals(lm.log2Estimate(cSeq),
                     writeRead(lm).log2Estimate(cSeq),
                     0.005);
    }

    /**
     * Compiles the model to an in-memory byte array and reads the compiled
     * model back from it. Any {@link IOException} or
     * {@link ClassNotFoundException} is reported through {@code fail()}.
     *
     * @param lm model to compile
     * @return the compiled model that was read back
     */
    private static CompiledTokenizedLM writeRead(TokenizedLM lm) {
        try {
            ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
            ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
            lm.compileTo(objOut);
            // Close (and thereby flush) before snapshotting the bytes so that
            // nothing is left behind in the object stream's internal buffer.
            objOut.close();

            ByteArrayInputStream bytesIn
                = new ByteArrayInputStream(bytesOut.toByteArray());
            ObjectInputStream objIn = new ObjectInputStream(bytesIn);
            try {
                return (CompiledTokenizedLM) objIn.readObject();
            } finally {
                objIn.close();
            }
        } catch (IOException e) {
            fail(e.toString());
        } catch (ClassNotFoundException e) {
            fail(e.toString());
        }
        return null; // bogus unreachable; compiler doesn't know fail()
    }

    /**
     * Checks that training one model repeatedly on a sequence and training
     * another model once with the matching count leave the two models
     * producing the same estimates.
     */
    public void testMultipleIncrements() {
        Random random = new Random();
        TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
        TokenizedLM lm1 = new TokenizedLM(tf, 3);
        TokenizedLM lm2 = new TokenizedLM(tf, 3);
        for (int i = 0; i < 100; ++i) {
            StringBuffer sb = new StringBuffer();
            for (int k = 0; k < 5; ++k) {
                // NOTE(review): this yields characters U+0000..U+000F, not
                // letters -- confirm that is the intended alphabet.
                sb.append((char) random.nextInt(16));
                sb.append(' ');
            }
            int trainingCount = random.nextInt(10); // train 0 to 9 times
            incrementAssertSynched(lm1, lm2, sb, trainingCount);
        }
    }

    /**
     * Trains {@code lm1} on the sequence {@code count} times, trains
     * {@code lm2} once with {@code count} as the count argument, and then
     * asserts the two models are in sync.
     */
    void incrementAssertSynched(TokenizedLM lm1, TokenizedLM lm2,
                                CharSequence cs, int count) {
        for (int i = 0; i < count; ++i) {
            lm1.train(cs);
        }
        lm2.train(cs, count);
        assertSynched(lm1, lm2);
    }

    /**
     * Compares the two models on 100 rounds of random sequences built from
     * 0 through 4 random characters.
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2) {
        for (int i = 0; i < 100; ++i) {
            for (int k = 0; k < 5; ++k) {
                assertSynched(lm1, lm2, k);
            }
        }
    }

    /**
     * Builds a random sequence of {@code k} characters, each followed by a
     * space, and asserts both models give log2 estimates for it that differ
     * by at most 0.0001.
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2, int k) {
        Random random = new Random();
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < k; ++i) {
            sb.append((char) random.nextInt(16));
            sb.append(' ');
        }
        assertEquals(lm1.log2Estimate(sb), lm2.log2Estimate(sb), 0.0001);
    }

}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -