📄 tokenizedlmtest.java
字号:
package com.aliasi.test.unit.lm;

import com.aliasi.lm.CompiledTokenizedLM;
import com.aliasi.lm.TokenizedLM;
import com.aliasi.lm.TrieIntSeqCounter;
import com.aliasi.lm.UniformBoundaryLM;

import com.aliasi.symbol.SymbolTable;

import com.aliasi.test.unit.BaseTestCase;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.ScoredObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import java.util.Random;

/**
 * Tests for {@link TokenizedLM}: weighted sequence training and the
 * resulting n-gram counts, rejection of a zero n-gram order, chi-squared
 * collocation scoring, and estimates checked against hand-computed
 * probabilities.
 */
public class TokenizedLMTest extends BaseTestCase {

    /** N-gram order used by the hand-computed estimate tests. */
    private static final int MAX_NGRAM = 3;

    /** Interpolation hyperparameter handed to every model under test. */
    private static final double LAMBDA_FACTOR = 4.0;

    public void testTrainSequence() {
        TokenizerFactory tokenizerFactory = new IndoEuropeanTokenizerFactory();
        TokenizedLM lm = new TokenizedLM(tokenizerFactory, 3);
        SymbolTable symbolTable = lm.symbolTable();
        TrieIntSeqCounter seqCounter = lm.sequenceCounter();

        // the constructor registers one boundary-token event up front,
        // so the root of the counter trie starts out at 1
        assertEquals(1, seqCounter.count(new int[0], 0, 0));

        lm.trainSequence("a b", 2);
        lm.trainSequence("a c", 3);
        lm.trainSequence("a b c", 4);

        int idA = symbolTable.symbolToID("a");
        int idB = symbolTable.symbolToID("b");
        int idC = symbolTable.symbolToID("c");

        // each trained sequence shows up with exactly its training weight
        assertEquals(2, seqCounter.count(new int[] { idA, idB }, 0, 2));
        assertEquals(3, seqCounter.count(new int[] { idA, idC }, 0, 2));
        assertEquals(4, seqCounter.count(new int[] { idA, idB, idC }, 0, 3));
        // "a b" (weight 2) and "a c" (weight 3) both extend the context "a"
        assertEquals(5, seqCounter.extensionCount(new int[] { idA }, 0, 1));

        lm.trainSequence("a a a c c c", 111);
        assertEquals(111, seqCounter.count(new int[] { idC, idC, idC }, 0, 3));
        assertEquals(111, seqCounter.extensionCount(new int[] { idC, idC }, 0, 2));

        // an empty sequence only bumps the root count: 1 initial + 999 = 1000
        lm.trainSequence("", 999);
        assertEquals(1000, seqCounter.count(new int[0], 0, 0));
    }

    public void testZeroGram() {
        TokenizerFactory tokenizerFactory = new IndoEuropeanTokenizerFactory();
        try {
            // an n-gram order of zero is illegal and must be rejected
            new TokenizedLM(tokenizerFactory, 0,
                            new UniformBoundaryLM(16),
                            new UniformBoundaryLM(16),
                            LAMBDA_FACTOR);
            fail();
        } catch (IllegalArgumentException e) {
            succeed();
        }
    }

    public
void testUnigram() {
    // smallest legal n-gram order; just verify training succeeds
    TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
    TokenizedLM lm = new TokenizedLM(tf, 1,
                                     new UniformBoundaryLM(16),
                                     new UniformBoundaryLM(16),
                                     LAMBDA_FACTOR);
    lm.train("John Smith");
}

// Order-4 model; same training smoke test as testUnigram.
public void testBiggerGram() {
    TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
    TokenizedLM lm = new TokenizedLM(tf, 4,
                                     new UniformBoundaryLM(16),
                                     new UniformBoundaryLM(16),
                                     LAMBDA_FACTOR);
    lm.train("John Smith");
}

// Chi-squared independence should rank the frequent collocation "a b"
// above pairs that co-occur rarely or never in the training text.
public void testChiSquaredIndependence() {
    TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
    TokenizedLM lm = new TokenizedLM(tf, 3,
                                     new UniformBoundaryLM(16),
                                     new UniformBoundaryLM(16),
                                     LAMBDA_FACTOR);
    lm.train("a b c a b d a b e a b");
    SymbolTable table = lm.symbolTable();
    // exactly the five distinct tokens a..e were seen
    assertEquals(5,table.numSymbols());
    int aI = table.symbolToID("a");
    int bI = table.symbolToID("b");
    int cI = table.symbolToID("c");
    int dI = table.symbolToID("d");
    int eI = table.symbolToID("e");
    assertTrue(aI >= 0);
    assertTrue(bI >= 0);
    assertTrue(cI >= 0);
    assertTrue(dI >= 0);
    assertTrue(eI >= 0);
    // "a b" occurs 4 times in the training text vs. "b c" once
    assertTrue(lm.chiSquaredIndependence(new int[] { aI, bI })
               > lm.chiSquaredIndependence(new int[] { bI, cI }));
    // "c a" occurs once vs. "c e" never
    assertTrue(lm.chiSquaredIndependence(new int[] { cI, aI })
               > lm.chiSquaredIndependence(new int[] { cI, eI }));
}

// Checks log2 estimates against probabilities computed by hand from the
// interpolation formulas; the derivations are spelled out in the comments
// preceding each assertEstimate call.
public void testConstantSubModels()
    throws ClassNotFoundException, IOException {

    TokenizerFactory tf = new IndoEuropeanTokenizerFactory();
    TokenizedLM lm = new TokenizedLM(tf, MAX_NGRAM,
                                     new UniformBoundaryLM(127), // unknown tok
                                     new UniformBoundaryLM(15),  // whitespace
                                     LAMBDA_FACTOR);
    // INITIAL SYMBOL TRIE
    //   2
    //   (EOS) 1
    // P("")
    //   P(EOS|EOS) + Pws("")   [sum in log space = product of probabilities]
    //   P(EOS|EOS) = P(EOS) = lambda() * PML(EOS)
    double lambda_ = 1.0/(1.0 + 4.0*1.0);
    double pml_EOS = 1.0;
    double p_EOS = lambda_ * pml_EOS;
    // presumably 1/16 = uniform over UniformBoundaryLM(15)'s outcomes plus
    // the boundary — TODO confirm against UniformBoundaryLM javadoc
    double pws_ = 1.0/16.0;
    assertEstimate(com.aliasi.util.Math.log2(p_EOS * pws_),
                   lm, "");

    // "a"
    //   P(UNK|EOS) * P(EOS|EOS,UNK) * Ptok("a") * Pws("") * Pws("")
    //   P(UNK|EOS) = P(UNK) = (1 - lambda())
    //   P(EOS|EOS,UNK) = P(EOS|EOS) = P(EOS)
    double p_UNK = 1.0 - lambda_;
    // unknown-token model: char 'a' then boundary, 1/128 each (cf. 127 above)
    double ptok_a = (1.0/128.0) * (1.0/128.0);
    assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_EOS * ptok_a
                                             * pws_ * pws_),
                   lm, "a");

    // P("a b")
    //   = P(UNK|EOS) P(UNK|EOS,UNK) P(EOS|UNK,UNK)
    //     Pws("") Pws("") Pws(" ")
    //     Ptok(a) Ptok(b)
    //   P(UNK|EOS) = P(UNK)
    //   P(UNK|EOS,UNK) = P(UNK|UNK) = P(UNK)
    //   P(EOS|UNK) = P(EOS)
    double ptok_b = ptok_a;
    double pws_s = pws_ * 1.0/16.0;  // whitespace " ": one char plus boundary
    assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_EOS
                                             * pws_ * pws_ * pws_s
                                             * ptok_a * ptok_b),
                   lm, "a b");

    double ptok_c = ptok_b;
    assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_UNK * p_EOS
                                             * pws_ * pws_ * pws_s * pws_s
                                             * ptok_a * ptok_b * ptok_c),
                   lm, "a b c");

    double ptok_d = ptok_b;
    assertEstimate(com.aliasi.util.Math.log2(p_UNK * p_UNK * p_UNK * p_UNK * p_EOS
                                             * pws_ * pws_ * pws_s * pws_s * pws_s
                                             * ptok_a * ptok_b * ptok_c * ptok_d),
                   lm, "a b c d");

    // ============ train on "a" ===================
    // go through same estimates, see above
    lm.train("a");

    // counter trie after training:
    //   EOS 2
    //     a 1
    //       EOS 1
    //   a 1
    //     EOS 1
    // P("")
    //   P(EOS|EOS) + Pws("")
    //   P(EOS|EOS) = lambda(EOS) * PML(EOS|EOS)
    //              + (1-lambda(EOS)) * P(EOS)
    //              = (1-lambda(EOS)) * (lambda() * PML(EOS))
    lambda_ = 3.0/(3.0 + 4.0*2.0);
    pml_EOS = 2.0/3.0;
    p_EOS = lambda_ * pml_EOS;
    double lambda_EOS = 1.0 / (1.0 + 4.0 * 1.0);
    double p_EOS_giv_EOS = (1.0 - lambda_EOS) * p_EOS;
    assertEstimate(com.aliasi.util.Math.log2(p_EOS_giv_EOS * pws_),
                   lm, "");

    // "a"
    //   P(a|EOS) + P(EOS|EOS,a) + Pws("") + Pws("")
    //   P(a|EOS) = lambda(EOS) Pml(a|EOS) + (1-lambda(EOS)) P(a)
    //   P(a) = lambda() * Pml(a)
    //   P(EOS|EOS,a) = lambda(EOS,a) Pml(EOS|EOS,a)
    //                + (1-lambda(EOS,a)) P(EOS|a)
    //
    //   P(EOS|a) = lambda(a) Pml(EOS|a)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -