TokenizedLM.java
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 */
public void train(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs,start,end);
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        if (mDynamicWhitespaceModel != null) {
            String whitespace = tokenizer.nextWhitespace();
            mDynamicWhitespaceModel.train(whitespace);
        }
        // this'll pick up the last whitespace after last token
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        // train underlying token model just once per token
        if (mDynamicUnknownTokenModel != null
            && mSymbolTable.symbolToID(token) < 0) {
            mDynamicUnknownTokenModel.train(token);
        }
        tokIds[i] = mSymbolTable.getOrAddSymbol(token);
    }
    mCounter.incrementSubsequences(tokIds,0,tokIds.length);
    mCounter.decrementUnigram(BOUNDARY_TOKEN);
}

/**
 * This method is a convenience implementation of the
 * <code>TextHandler</code> interface which delegates calls to
 * {@link #train(char[], int, int)}.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param length Length of slice.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 */
public void handle(char[] cs, int start, int length) {
    train(cs,start,start+length);
}

/**
 * Trains the token sequence model, whitespace model (if dynamic)
 * and unknown token model (if dynamic).
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one plus last character in slice.
 * @param count Number of instances of sequence to train.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 * @throws IllegalArgumentException If the count is negative.
 */
public void train(char[] cs, int start, int end, int count) {
    Strings.checkArgsStartEnd(cs,start,end);
    if (count < 0) {
        String msg = "Counts must be non-negative."
            + " Found count=" + count;
        throw new IllegalArgumentException(msg);
    }
    if (count == 0) return;
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        if (mDynamicWhitespaceModel != null) {
            String whitespace = tokenizer.nextWhitespace();
            mDynamicWhitespaceModel.train(whitespace,count);
        }
        // this'll pick up the last whitespace after last token
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        // train underlying token model just once per token
        if (mDynamicUnknownTokenModel != null
            && mSymbolTable.symbolToID(token) < 0) {
            mDynamicUnknownTokenModel.train(token,count);
        }
        tokIds[i] = mSymbolTable.getOrAddSymbol(token);
    }
    mCounter.incrementSubsequences(tokIds,0,tokIds.length,count);
    mCounter.decrementUnigram(BOUNDARY_TOKEN,count);
}
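// Illustrative usage sketch, not part of the original source: drives the
// two train(...) methods above from client code.  Assumes this class is
// LingPipe's com.aliasi.lm.TokenizedLM with its
// TokenizedLM(TokenizerFactory,int) constructor, using the
// IndoEuropeanTokenizerFactory mentioned in the javadoc below (both would
// need to be imported).
static void exampleTrain() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);     // train a single instance
    lm.train(text, 0, text.length, 5);  // train five instances at once
}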
/**
 * This method trains the last token in the sequence given the
 * previous tokens.  See {@link #trainSequence(CharSequence, int)}
 * for more information.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one plus last character in slice.
 * @param count Number of instances of sequence to train.
 * @throws IndexOutOfBoundsException If the indices are out of
 * range for the character array.
 * @throws IllegalArgumentException If the count is negative.
 */
public void trainSequence(char[] cs, int start, int end, int count) {
    Strings.checkArgsStartEnd(cs,start,end);
    if (count < 0) {
        String msg = "Count must be non-negative. Found count=" + count;
        throw new IllegalArgumentException(msg);
    }
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    String[] tokens = tokenizer.tokenize();
    // if there are more tokens than the n-gram order, keep only the final ones
    int len = Math.min(tokens.length,nGramOrder());
    int offset = tokens.length - len;
    int[] tokIds = new int[len];
    for (int i = 0; i < len; ++i)
        tokIds[i] = mSymbolTable.getOrAddSymbol(tokens[i+offset]);
    mCounter.incrementSequence(tokIds,0,len,count);
}

/**
 * This method increments the count of the entire sequence
 * specified.  Note that it does not increment any of the token
 * subsequences and does not train the whitespace or token
 * smoothing models.
 *
 * <p>This method may be used to train a tokenized language model
 * from individual character sequence counts.  Because it does not
 * train the token smoothing models, a pure token model may be
 * constructed by instead calling <code>train(CharSequence,int)</code>
 * on character sequences corresponding to unigrams, which trains
 * token smoothing with character subsequences.
 *
 * <p>For instance, with
 * <code>com.aliasi.tokenizer.IndoEuropeanTokenizerFactory</code>,
 * calling <code>trainSequence("the fast computer",5)</code> would
 * extract three tokens, <code>the</code>, <code>fast</code> and
 * <code>computer</code>, and would increment the count of the
 * three-token sequence, but not any of its subsequences.
 *
 * <p>If the number of tokens is greater than the maximum n-gram
 * length, only the final tokens are trained.  For instance, with
 * an n-gram length of 2 and the Indo-European tokenizer factory,
 * calling <code>trainSequence("a slightly faster computer",93)</code>
 * is equivalent to calling
 * <code>trainSequence("faster computer",93)</code>.
 *
 * <p>All tokens trained are added to the symbol table.  This does
 * not include any initial tokens that are dropped because the
 * maximum n-gram length is too short.
 *
 * @param cSeq Character sequence to train.
 * @param count Number of instances to train.
 * @throws IllegalArgumentException If the count is negative.
 */
public void trainSequence(CharSequence cSeq, int count) {
    char[] cs = Strings.toCharArray(cSeq);
    trainSequence(cs,0,cs.length,count);
}
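// Illustrative sketch, not part of the original source: contrasts
// trainSequence with train, per the javadoc above.  With n-gram order 2,
// only the final bigram of a longer sequence is counted, and no token
// subsequences or smoothing models are touched.  Constructor as assumed
// in the earlier example.
static void exampleTrainSequence() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 2);
    lm.trainSequence("a slightly faster computer", 93);
    lm.trainSequence("faster computer", 93);  // equivalent call per the javadoc
}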
public double log2Estimate(CharSequence cSeq) {
    char[] cs = Strings.toCharArray(cSeq);
    return log2Estimate(cs,0,cs.length);
}

public double log2Estimate(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs,start,end);
    double logEstimate = 0.0;

    // collect tokens, estimate whitespaces
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start);
    ArrayList<String> tokenList = new ArrayList<String>();
    while (true) {
        String whitespace = tokenizer.nextWhitespace();
        logEstimate += mWhitespaceModel.log2Estimate(whitespace);
        String token = tokenizer.nextToken();
        if (token == null) break;
        tokenList.add(token);
    }

    // collect token ids, estimate unknown tokens
    int[] tokIds = new int[tokenList.size()+2];
    tokIds[0] = BOUNDARY_TOKEN;
    tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
        String token = it.next();
        tokIds[i] = mSymbolTable.symbolToID(token);
        if (tokIds[i] < 0) {
            logEstimate += mUnknownTokenModel.log2Estimate(token);
        }
    }

    // estimate token ids, excluding start but including end
    for (int i = 2; i <= tokIds.length; ++i) {
        logEstimate += conditionalLog2TokenEstimate(tokIds,0,i);
    }
    return logEstimate;
}
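// Illustrative sketch, not part of the original source: log2Estimate gives
// the joint log (base 2) probability of the text, including whitespace and
// unknown-token estimates; dividing its negation by the text length yields
// a per-character cross-entropy rate.  Constructor as assumed above.
static void exampleLog2Estimate() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);
    String test = "the slow computer";
    double log2Prob = lm.log2Estimate(test);
    double xEntropyRate = -log2Prob / test.length();  // bits per character
}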
class StringArrayAdapter implements IntArrayHandler {
    StringArrayHandler mHandler;
    public StringArrayAdapter(StringArrayHandler handler) {
        mHandler = handler;
    }
    public void handle(int[] nGram) {
        mHandler.handle(simpleNGramToTokens(nGram));
    }
    String[] simpleNGramToTokens(int[] nGram) {
        String[] tokens = new String[nGram.length];
        for (int i = 0; i < tokens.length; ++i)
            tokens[i] = nGram[i] >= 0
                ? mSymbolTable.idToSymbol(nGram[i])
                : null;
        return tokens;
    }
}

abstract class Collector implements IntArrayHandler {
    final BoundedPriorityQueue<ScoredObject<String[]>> mBPQ;
    Collector(int maxReturned, boolean reverse) {
        Comparator<Scored> comparator = reverse
            ? Scored.REVERSE_SCORE_COMPARATOR
            : Scored.SCORE_COMPARATOR;
        mBPQ = new BoundedPriorityQueue<ScoredObject<String[]>>(comparator,
                                                                maxReturned);
    }
    ScoredObject<String[]>[] nGrams() {
        ScoredObject<String[]>[] result
            = (ScoredObject<String[]>[]) new ScoredObject[mBPQ.size()];
        mBPQ.toArray(result);
        return result;
    }
    public void handle(int[] nGram) {
        // don't include n-grams containing boundary tokens
        for (int i = 0; i < nGram.length; ++i)
            if (nGram[i] < 0) return;
        mBPQ.add(new ScoredObject<String[]>(nGramToTokens(nGram),
                                            scoreNGram(nGram)));
    }
    abstract double scoreNGram(int[] nGram);
}

class FreqTermCollector extends Collector {
    FreqTermCollector(int maxReturned, boolean reverse) {
        super(maxReturned,reverse);
    }
    double scoreNGram(int[] nGram) {
        return mCounter.count(nGram,0,nGram.length);
    }
}

class CollocationCollector extends Collector {
    CollocationCollector(int maxReturned) {
        super(maxReturned,false);
    }
    double scoreNGram(int[] nGram) {
        return chiSquaredIndependence(nGram);
    }
}

class SigTermCollector extends Collector {
    final LanguageModel.Tokenized mBGModel;
    SigTermCollector(int maxReturned, LanguageModel.Tokenized bgModel,
                     boolean reverse) {
        super(maxReturned,reverse);
        mBGModel = bgModel;
    }
    double scoreNGram(int[] nGram) {
        String[] tokens = nGramToTokens(nGram);
        // count of the empty sequence is the total training token count
        int totalSampleCount = mCounter.count(nGram,0,0);
        int sampleCount = mCounter.count(nGram,0,nGram.length);
        double bgProb = mBGModel.tokenProbability(tokens,0,tokens.length);
        return BinomialDistribution.z(bgProb, sampleCount, totalSampleCount);
    }
}

String[] nGramToTokens(int[] nGram) {
    String[] toks = new String[nGram.length];
    for (int i = 0; i < nGram.length; ++i) {
        toks[i] = nGram[i] >= 0
            ? mSymbolTable.idToSymbol(nGram[i])
            : (i == 0) ? "*BEGIN*" : "*END*";
    }
    return toks;
}

public double tokenProbability(String[] tokens, int start, int end) {
    return Math.pow(2.0,tokenLog2Probability(tokens,start,end));
}

public double tokenLog2Probability(String[] tokens, int start, int end) {
    if (start < 0 || end < start || end > tokens.length) {
        String msg = "Indices out of bounds. Found start=" + start
            + " end=" + end + " tokens.length=" + tokens.length;
        throw new IndexOutOfBoundsException(msg);
    }
    double log2Estimate = 0.0;
    int[] tokIds = new int[tokens.length];
    for (int i = start; i < end; ++i) {
        tokIds[i] = mSymbolTable.symbolToID(tokens[i]);
        double conditionalLog2TokenEstimate
            = conditionalLog2TokenEstimate(tokIds,0,i+1);
        if (Double.isInfinite(conditionalLog2TokenEstimate)) {
            // back off to the unknown-token model, weighted by the
            // unigram-model probability of an unseen token
            double extCountD = mCounter.extensionCount(new int[0], 0, 0);
            double numTokensD = mSymbolTable.numSymbols();
            log2Estimate
                += com.aliasi.util.Math.log2(extCountD
                                             / (extCountD + numTokensD));
            log2Estimate += mUnknownTokenModel.log2Estimate(tokens[i]);
        } else {
            log2Estimate += conditionalLog2TokenEstimate;
        }
        if (Double.isInfinite(log2Estimate)) {
            // leftover debugging trace for zero-probability tokens
            System.out.println("tokens[" + i + "]=" + tokens[i]
                               + "\n id=" + tokIds[i]);
        }
    }
    return log2Estimate;
}

/**
 * Returns the log (base 2) probability of the specified tokens in
 * the underlying token n-gram distribution.  This includes the
 * estimation of the actual token for unknown tokens.
 *
 * @param tokens Tokens whose log probability is returned.
 * @return The log (base 2) probability of the tokens.
 */
public double processLog2Probability(String[] tokens) {
    return tokenLog2Probability(tokens,0,tokens.length);
}
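// Illustrative sketch, not part of the original source: scores an explicit
// token sequence.  tokenProbability is simply 2 raised to the value of
// tokenLog2Probability, as defined above.  Constructor as assumed earlier.
static void exampleTokenProbability() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer".toCharArray();
    lm.train(text, 0, text.length);
    String[] toks = { "the", "fast", "computer" };
    double log2p = lm.tokenLog2Probability(toks, 0, toks.length);
    double p = lm.tokenProbability(toks, 0, toks.length);  // == Math.pow(2.0, log2p)
}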
/**
 * Returns an array of collocations in order of confidence that
 * their token sequences are not independent.  The object contained
 * in the returned scored objects will be an instance of
 * <code>String[]</code> containing tokens.  The length of n-gram,
 * minimum count for a result and the maximum number of results
 * returned are all specified.  The confidence ordering is based on
 * the result of Pearson's &chi;<sup>2</sup> independence statistic
 * as computed by {@link #chiSquaredIndependence(int[])}.
 *
 * @param nGram Length of n-grams to search for collocations.
 * @param minCount Minimum count for a returned n-gram.
 * @param maxReturned Maximum number of results returned.
 * @return Array of collocations in confidence order.
 */
public ScoredObject<String[]>[] collocations(int nGram,
                                             int minCount,
                                             int maxReturned) {
    CollocationCollector collector = new CollocationCollector(maxReturned);
    mCounter.handleNGrams(nGram,minCount,collector);
    return collector.nGrams();
}
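// Illustrative sketch, not part of the original source: extracts the
// top-scoring bigram collocations after training.  Assumes LingPipe's
// ScoredObject accessors getObject() and score(), and the constructor
// assumed in the earlier examples.
static void exampleCollocations() {
    TokenizedLM lm = new TokenizedLM(new IndoEuropeanTokenizerFactory(), 3);
    char[] text = "the fast computer beat the slow computer".toCharArray();
    lm.train(text, 0, text.length);
    // bigrams seen at least once, at most ten results, best first
    ScoredObject<String[]>[] top = lm.collocations(2, 1, 10);
    for (ScoredObject<String[]> so : top)
        System.out.println(java.util.Arrays.asList(so.getObject())
                           + " : " + so.score());
}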