// latentdirichletallocation.java
        mDocWords = docWords;
        mDocTopicPrior = docTopicPrior;
        mTopicWordPrior = topicWordPrior;
        mDocTopicCount = docTopicCount;
        mWordTopicCount = wordTopicCount;
        mTopicCount = topicCount;
        mNumChangedTopics = numChangedTopics;
        mNumWords = numWords;
        mNumTokens = numTokens;
    }

    /**
     * Returns the epoch in which this sample was generated.
     *
     * @return The epoch for this sample.
     */
    public int epoch() {
        return mEpoch;
    }

    /**
     * Returns the number of documents on which the sample was
     * based.
     *
     * @return The number of documents for the sample.
     */
    public int numDocuments() {
        return mDocWords.length;
    }

    /**
     * Returns the number of distinct words in the documents on
     * which the sample was based.
     *
     * @return The number of words underlying the model.
     */
    public int numWords() {
        return mNumWords;
    }

    /**
     * Returns the number of tokens in documents on which the
     * sample was based.  Each token is an instance of a
     * particular word.
     */
    public int numTokens() {
        return mNumTokens;
    }

    /**
     * Returns the number of topics for this sample.
     *
     * @return The number of topics for this sample.
     */
    public int numTopics() {
        return mTopicCount.length;
    }

    /**
     * Returns the topic identifier sampled for the specified
     * token position in the specified document.
     *
     * @param doc Identifier for a document.
     * @param token Token position in the specified document.
     * @return The topic assigned to the specified token in this
     * sample.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive), or if the token is not between 0
     * (inclusive) and the number of tokens (exclusive) in the
     * specified document.
     */
    public short topicSample(int doc, int token) {
        return mTopicSample[doc][token];
    }

    /**
     * Returns the word identifier for the specified token position
     * in the specified document.
     *
     * @param doc Identifier for a document.
     * @param token Token position in the specified document.
     * @return The word found at the specified position in the
     * specified document.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive), or if the token is not between 0
     * (inclusive) and the number of tokens (exclusive) in the
     * specified document.
     */
    public int word(int doc, int token) {
        return mDocWords[doc][token];
    }

    /**
     * Returns the uniform Dirichlet concentration hyperparameter
     * <code>α</code> for document distributions over topics
     * from which this sample was produced.
     *
     * @return The document-topic prior.
     */
    public double documentTopicPrior() {
        return mDocTopicPrior;
    }

    /**
     * Returns the uniform Dirichlet concentration hyperparameter
     * <code>β</code> for topic distributions over words from
     * which this sample was produced.
     */
    public double topicWordPrior() {
        return mTopicWordPrior;
    }

    /**
     * Returns the number of times the specified topic was
     * assigned to the specified document in this sample.
     *
     * @param doc Identifier for a document.
     * @param topic Identifier for a topic.
     * @return The count of the topic in the document in this
     * sample.
     * @throws IndexOutOfBoundsException If the document identifier
     * is not between 0 (inclusive) and the number of documents
     * (exclusive) or if the topic identifier is not between 0 (inclusive)
     * and the number of topics (exclusive).
     */
    public int documentTopicCount(int doc, int topic) {
        return mDocTopicCount[doc][topic];
    }

    /**
     * Returns the length of the specified document in tokens.
     *
     * @param doc Identifier for a document.
     * @return The length of the specified document in tokens.
     * @throws IndexOutOfBoundsException If the document
     * identifier is not between 0 (inclusive) and the number of
     * documents (exclusive).
     */
    public int documentLength(int doc) {
        return mDocWords[doc].length;
    }
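    // Hedged usage sketch (not part of the original source): the accessors
    // above expose the raw token-level topic assignments held in this sample.
    // Assuming "sample" is an instance of this class and "doc" is a
    // hypothetical document identifier, the assignments for one document
    // could be dumped as follows:
    //
    //     int doc = 0;
    //     for (int token = 0; token < sample.documentLength(doc); ++token)
    //         System.out.printf("token %d: word=%d topic=%d%n",
    //                           token,
    //                           sample.word(doc, token),
    //                           sample.topicSample(doc, token));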
    /**
     * Returns the number of times tokens for the specified word
     * were assigned to the specified topic.
     *
     * @param topic Identifier for a topic.
     * @param word Identifier for a word.
     * @return The number of tokens of the specified word assigned
     * to the specified topic.
     * @throws IndexOutOfBoundsException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive),
     * or if the word is not between 0 (inclusive) and the number of
     * words (exclusive).
     */
    public int topicWordCount(int topic, int word) {
        return mWordTopicCount[word][topic];
    }

    /**
     * Returns the total number of tokens assigned to the specified
     * topic in this sample.
     *
     * @param topic Identifier for a topic.
     * @return The total number of tokens assigned to the
     * specified topic.
     * @throws IllegalArgumentException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive).
     */
    public int topicCount(int topic) {
        return mTopicCount[topic];
    }

    /**
     * Returns the total number of topic assignments to tokens
     * that changed between the last sample and this one.  Note
     * that this is the last sample in the chain, not the last
     * sample necessarily passed to a handler, because handlers
     * may not be configured to handle every sample.
     *
     * @return The number of topic assignments that changed in this
     * sample relative to the previous sample.
     */
    public int numChangedTopics() {
        return mNumChangedTopics;
    }

    /**
     * Returns the probability estimate for the specified word in
     * the specified topic in this sample.  This value is
     * calculated as a maximum a posteriori estimate computed as
     * described in the class documentation for {@link
     * LatentDirichletAllocation} using the topic assignment
     * counts in this sample and the topic-word prior.
     *
     * @param topic Identifier for a topic.
     * @param word Identifier for a word.
     * @return The probability of generating the specified word in
     * the specified topic.
     * @throws IndexOutOfBoundsException If the specified topic is
     * not between 0 (inclusive) and the number of topics (exclusive),
     * or if the word is not between 0 (inclusive) and the number of
     * words (exclusive).
     */
    public double topicWordProb(int topic, int word) {
        return (topicWordCount(topic,word) + topicWordPrior())
            / (topicCount(topic) + numWords() * topicWordPrior());
    }

    /**
     * Returns the number of times tokens of the specified word
     * appeared in the corpus.
     *
     * @param word Identifier of a word.
     * @return The number of tokens of the word in the corpus.
     * @throws IndexOutOfBoundsException If the word identifier is
     * not between 0 (inclusive) and the number of words
     * (exclusive).
     */
    public int wordCount(int word) {
        int count = 0;
        for (int topic = 0; topic < numTopics(); ++topic)
            count += topicWordCount(topic,word);
        return count;
    }

    /**
     * Returns the estimate of the probability of the topic being
     * assigned to a word in the specified document given the
     * topic assignments in this sample.  This is the maximum a
     * posteriori estimate computed from the topic assignments,
     * as described in the class documentation for {@link
     * LatentDirichletAllocation}, using the topic assignment
     * counts in this sample and the document-topic prior.
     *
     * @param doc Identifier of a document.
     * @param topic Identifier for a topic.
     * @return An estimate of the probability of the topic in the
     * document.
     * @throws IndexOutOfBoundsException If the document identifier
     * is not between 0 (inclusive) and the number of documents
     * (exclusive) or if the topic identifier is not between 0 (inclusive)
     * and the number of topics (exclusive).
     */
    public double documentTopicProb(int doc, int topic) {
        return (documentTopicCount(doc,topic) + documentTopicPrior())
            / (documentLength(doc) + numTopics() * documentTopicPrior());
    }
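    // Hedged worked example (not part of the original source), illustrating
    // the two maximum a posteriori estimates above with made-up counts.
    // Suppose topicWordCount(topic,word) = 5, topicCount(topic) = 100,
    // numWords() = 1000, and topicWordPrior() = 0.01; then
    //
    //     topicWordProb(topic,word) = (5 + 0.01) / (100 + 1000 * 0.01)
    //                               = 5.01 / 110
    //                               ≈ 0.0455
    //
    // Likewise, with documentTopicCount(doc,topic) = 7, documentLength(doc) = 50,
    // numTopics() = 10, and documentTopicPrior() = 0.1:
    //
    //     documentTopicProb(doc,topic) = (7 + 0.1) / (50 + 10 * 0.1)
    //                                  = 7.1 / 51
    //                                  ≈ 0.139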
    /**
     * Returns an estimate of the log (base 2) likelihood of the
     * corpus given the point estimates of topic and document
     * multinomials determined from this sample.
     *
     * <p>This likelihood calculation uses the methods
     * {@link #documentTopicProb(int,int)} and {@link
     * #topicWordProb(int,int)} for estimating likelihoods
     * according to the following formula:
     *
     * <blockquote><pre>
     * corpusLog2Probability()
     *   = Σ<sub>doc,i</sub> log<sub>2</sub> Σ<sub>topic</sub> p(topic|doc) * p(word[doc][i]|topic)
     * </pre></blockquote>
     *
     * <p>Note that this is <i>not</i> the complete corpus likelihood,
     * which requires integrating over possible topic and document
     * multinomials given the priors.
     *
     * @return The log (base 2) likelihood of the training corpus
     * given the document and topic multinomials determined by
     * this sample.
     */
    public double corpusLog2Probability() {
        double corpusLog2Prob = 0.0;
        int numDocs = numDocuments();
        int numTopics = numTopics();
        for (int doc = 0; doc < numDocs; ++doc) {
            int docLength = documentLength(doc);
            for (int token = 0; token < docLength; ++token) {
                int word = word(doc,token);
                double wordProb = 0.0;
                for (int topic = 0; topic < numTopics; ++topic) {
                    double wordTopicProbGivenDoc
                        = topicWordProb(topic,word) * documentTopicProb(doc,topic);
                    wordProb += wordTopicProbGivenDoc;
                }
                // log base 2; java.lang.Math has no log2, so compute it from natural logs
                corpusLog2Prob += Math.log(wordProb) / Math.log(2.0);
            }
        }
        return corpusLog2Prob;
    }

    /**
     * Returns a latent Dirichlet allocation model corresponding
     * to this sample.  The topic-word probabilities are
     * calculated according to {@link #topicWordProb(int,int)},
     * and the document-topic prior is as specified in the call
     * to LDA that produced this sample.
     *
     * @return The