// latentdirichletallocation.java
        mDocWords = docWords;
        mDocTopicPrior = docTopicPrior;
        mTopicWordPrior = topicWordPrior;
        mDocTopicCount = docTopicCount;
        mWordTopicCount = wordTopicCount;
        mTopicCount = topicCount;
        mNumChangedTopics = numChangedTopics;
        mNumWords = numWords;
        mNumTokens = numTokens;
    }

    /**
     * Returns the epoch in which this sample was generated.
     *
     * @return The epoch for this sample.
     */
    public int epoch() {
        return mEpoch;
    }

    /**
     * Returns the number of documents on which the sample was
     * based.
     *
     * @return The number of documents for the sample.
     */
    public int numDocuments() {
        return mDocWords.length;
    }

    /**
     * Returns the number of distinct words in the documents on
     * which the sample was based.
     *
     * @return The number of words underlying the model.
     */
    public int numWords() {
        return mNumWords;
    }

    /**
     * Returns the number of tokens in documents on which the
     * sample was based.  Each token is an instance of a
     * particular word.
     */
    public int numTokens() {
        return mNumTokens;
    }

    /**
     * Returns the number of topics for this sample.
     *
     * @return The number of topics for this sample.
     */
    public int numTopics() {
        return mTopicCount.length;
    }

    /**
     * Returns the topic identifier sampled for the specified
     * token position in the specified document.
     *
     * @param doc Identifier for a document.
     * @param token Token position in the specified document.
     * @return The topic assigned to the specified token in this
     * sample.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive), or if the token is not between 0
     * (inclusive) and the number of tokens (exclusive) in the
     * specified document.
     */
    public short topicSample(int doc, int token) {
        return mTopicSample[doc][token];
    }

    /**
     * Returns the word identifier for the specified token position
     * in the specified document.
     *
     * @param doc Identifier for a document.
     * @param token Token position in the specified document.
     * @return The word found at the specified position in the
     * specified document.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive), or if the token is not between 0
     * (inclusive) and the number of tokens (exclusive) in the
     * specified document.
     */
    public int word(int doc, int token) {
        return mDocWords[doc][token];
    }

    /**
     * Returns the uniform Dirichlet concentration hyperparameter
     * <code>α</code> for document distributions over topics
     * from which this sample was produced.
     *
     * @return The document-topic prior.
     */
    public double documentTopicPrior() {
        return mDocTopicPrior;
    }

    /**
     * Returns the uniform Dirichlet concentration hyperparameter
     * <code>β</code> for topic distributions over words from
     * which this sample was produced.
     */
    public double topicWordPrior() {
        return mTopicWordPrior;
    }

    /**
     * Returns the number of times the specified topic was
     * assigned to the specified document in this sample.
     *
     * @param doc Identifier for a document.
     * @param topic Identifier for a topic.
     * @return The count of the topic in the document in this
     * sample.
     * @throws IndexOutOfBoundsException If the document identifier
     * is not between 0 (inclusive) and the number of documents
     * (exclusive) or if the topic identifier is not between 0 (inclusive)
     * and the number of topics (exclusive).
     */
    public int documentTopicCount(int doc, int topic) {
        return mDocTopicCount[doc][topic];
    }

    /**
     * Returns the length of the specified document in tokens.
     *
     * @param doc Identifier for a document.
     * @return The length of the specified document in tokens.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive).
     */
    public int documentLength(int doc) {
        return mDocWords[doc].length;
    }
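    // Hedged usage sketch (not part of the original source): the accessors
    // above expose the raw token-level topic assignments held in this sample.
    // Assuming "sample" is an instance of this class and "doc" is a
    // hypothetical document identifier, the assignments for one document
    // could be dumped as follows:
    //
    //     int doc = 0;
    //     for (int token = 0; token < sample.documentLength(doc); ++token)
    //         System.out.printf("token %d: word=%d topic=%d%n",
    //                           token,
    //                           sample.word(doc, token),
    //                           sample.topicSample(doc, token));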
    /**
     * Returns the number of times tokens for the specified word
     * were assigned to the specified topic.
     *
     * @param topic Identifier for a topic.
     * @param word Identifier for a word.
     * @return The number of tokens of the specified word assigned
     * to the specified topic.
     * @throws IndexOutOfBoundsException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive),
     * or if the word is not between 0 (inclusive) and the number of
     * words (exclusive).
     */
    public int topicWordCount(int topic, int word) {
        return mWordTopicCount[word][topic];
    }

    /**
     * Returns the total number of tokens assigned to the specified
     * topic in this sample.
     *
     * @param topic Identifier for a topic.
     * @return The total number of tokens assigned to the
     * specified topic.
     * @throws IllegalArgumentException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive).
     */
    public int topicCount(int topic) {
        return mTopicCount[topic];
    }

    /**
     * Returns the total number of topic assignments to tokens
     * that changed between the last sample and this one.  Note
     * that this is the last sample in the chain, not the last
     * sample necessarily passed to a handler, because handlers
     * may not be configured to handle every sample.
     *
     * @return The number of topic assignments that changed in this
     * sample relative to the previous sample.
     */
    public int numChangedTopics() {
        return mNumChangedTopics;
    }

    /**
     * Returns the probability estimate for the specified word in
     * the specified topic in this sample.  This value is
     * calculated as a maximum a posteriori estimate computed as
     * described in the class documentation for {@link
     * LatentDirichletAllocation} using the topic assignment
     * counts in this sample and the topic-word prior.
     *
     * @param topic Identifier for a topic.
     * @param word Identifier for a word.
     * @return The probability of generating the specified word in
     * the specified topic.
     * @throws IndexOutOfBoundsException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive),
     * or if the word is not between 0 (inclusive) and the number of
     * words (exclusive).
     */
    public double topicWordProb(int topic, int word) {
        return (topicWordCount(topic,word) + topicWordPrior())
            / (topicCount(topic) + numWords() * topicWordPrior());
    }

    /**
     * Returns the number of times tokens of the specified word
     * appeared in the corpus.
     *
     * @param word Identifier of a word.
     * @return The number of tokens of the word in the corpus.
     * @throws IndexOutOfBoundsException If the word identifier is
     * not between 0 (inclusive) and the number of words
     * (exclusive).
     */
    public int wordCount(int word) {
        int count = 0;
        for (int topic = 0; topic < numTopics(); ++topic)
            count += topicWordCount(topic,word);
        return count;
    }

    /**
     * Returns the estimate of the probability of the topic being
     * assigned to a word in the specified document given the
     * topic assignments in this sample.  This is the maximum a
     * posteriori estimate computed from the topic assignments,
     * as described in the class documentation for {@link
     * LatentDirichletAllocation}, using the topic assignment
     * counts in this sample and the document-topic prior.
     *
     * @param doc Identifier of a document.
     * @param topic Identifier for a topic.
     * @return An estimate of the probability of the topic in the
     * document.
     * @throws IndexOutOfBoundsException If the document identifier
     * is not between 0 (inclusive) and the number of documents
     * (exclusive) or if the topic identifier is not between 0 (inclusive)
     * and the number of topics (exclusive).
     */
    public double documentTopicProb(int doc, int topic) {
        return (documentTopicCount(doc,topic) + documentTopicPrior())
            / (documentLength(doc) + numTopics() * documentTopicPrior());
    }
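    // Hedged worked example (not part of the original source), illustrating
    // the two maximum a posteriori estimates above with made-up counts.
    // Suppose topicWordCount(topic,word) = 5, topicCount(topic) = 100,
    // numWords() = 1000, and topicWordPrior() = 0.01; then
    //
    //     topicWordProb(topic,word) = (5 + 0.01) / (100 + 1000 * 0.01)
    //                               = 5.01 / 110
    //                               ≈ 0.0455
    //
    // Likewise, with documentTopicCount(doc,topic) = 7, documentLength(doc) = 50,
    // numTopics() = 10, and documentTopicPrior() = 0.1:
    //
    //     documentTopicProb(doc,topic) = (7 + 0.1) / (50 + 10 * 0.1)
    //                                  = 7.1 / 51
    //                                  ≈ 0.139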
    /**
     * Returns an estimate of the log (base 2) likelihood of the
     * corpus given the point estimates of topic and document
     * multinomials determined from this sample.
     *
     * <p>This likelihood calculation uses the methods
     * {@link #documentTopicProb(int,int)} and {@link
     * #topicWordProb(int,int)} for estimating likelihoods
     * according to the following formula:
     *
     * <blockquote><pre>
     * corpusLog2Probability()
     *   = Σ<sub>doc,i</sub> log<sub>2</sub> Σ<sub>topic</sub> p(topic|doc) * p(word[doc][i]|topic)
     * </pre></blockquote>
     *
     * <p>Note that this is <i>not</i> the complete corpus likelihood,
     * which requires integrating over possible topic and document
     * multinomials given the priors.
     *
     * @return The log (base 2) likelihood of the training corpus
     * given the document and topic multinomials determined by
     * this sample.
     */
    public double corpusLog2Probability() {
        double corpusLog2Prob = 0.0;
        int numDocs = numDocuments();
        int numTopics = numTopics();
        for (int doc = 0; doc < numDocs; ++doc) {
            int docLength = documentLength(doc);
            for (int token = 0; token < docLength; ++token) {
                int word = word(doc,token);
                double wordProb = 0.0;
                for (int topic = 0; topic < numTopics; ++topic) {
                    double wordTopicProbGivenDoc
                        = topicWordProb(topic,word) * documentTopicProb(doc,topic);
                    wordProb += wordTopicProbGivenDoc;
                }
                // log base 2; java.lang.Math has no log2, so compute it from natural logs
                corpusLog2Prob += Math.log(wordProb) / Math.log(2.0);
            }
        }
        return corpusLog2Prob;
    }

    /**
     * Returns a latent Dirichlet allocation model corresponding
     * to this sample.  The topic-word probabilities are
     * calculated according to {@link #topicWordProb(int,int)},
     * and the document-topic prior is as specified in the call
     * to LDA that produced this sample.
     *
     * @return The