📄 topicalngrams.java
字号:
/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.
   For further information, see the file `LICENSE' included with this distribution. */

package edu.umass.cs.mallet.base.topics;

import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.util.Random;
import java.util.Arrays;
import java.io.*;

/**
 * Like Latent Dirichlet Allocation, but with integrated phrase discovery.
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 * based on C code by Xuerui Wang.
 */
public class TopicalNGrams {

	int numTopics;                  // T: number of topics
	Alphabet uniAlphabet;           // vocabulary of unigram features
	Alphabet biAlphabet;            // vocabulary of bigram features
	// Dirichlet hyperparameters and their precomputed sums (tAlpha = alpha*T, vBeta = beta*V, vGamma = gamma*V)
	double alpha, beta, gamma, delta, tAlpha, vBeta, vGamma;
	InstanceList ilist;             // containing FeatureSequenceWithBigrams in the data field of each instance
	int[][] topics;                 // {0...T-1}, the topic index, indexed by <document index, sequence index>
	int[][] grams;                  // {0,1}, the bigram status, indexed by <document index, sequence index> TODO: Make this boolean?
	int numTypes;                   // number of unique unigrams
	int numBitypes;                 // number of unique bigrams
	int numTokens;                  // total number of word occurrences
	// "totalNgram"
	int biTokens;                   // total number of tokens currently generated as bigrams (only used for progress messages)
	// "docTopic"
	int[][] docTopicCounts;         // indexed by <document index, topic index>
	// Used to calculate p(x|w,t).  "ngramCount"
	int[][][] typeNgramTopicCounts; // indexed by <feature index, ngram status, topic index>
	// Used to calculate p(w|t) and p(w|t,w), "topicWord" and "topicNgramWord"
	int[][] unitypeTopicCounts;     // indexed by <feature index, topic index>
	int[][] bitypeTopicCounts;      // index by <bifeature index, topic index>
	// "sumWords"
	int[] tokensPerTopic;           // indexed by <topic index>
	// "sumNgramWords"
	int[][] bitokensPerTopic;       // indexed by <feature index, topic index>, where the later is the conditioned word

	/**
	 * Convenience constructor using the default hyperparameters
	 * (alphaSum=50.0, beta=0.1, gamma=0.05, delta=0.2).
	 * @param numberOfTopics number of topics T
	 */
	public TopicalNGrams (int numberOfTopics)
	{
		this (numberOfTopics, 50.0, 0.1, 0.05, 0.2);
	}

	/**
	 * @param numberOfTopics number of topics T
	 * @param alphaSum total Dirichlet mass over topics; per-topic alpha = alphaSum / T
	 * @param beta  Dirichlet prior on per-topic unigram distributions
	 * @param gamma Dirichlet prior on per-topic bigram distributions
	 * @param delta Dirichlet prior on the bigram-status (ngram) indicator
	 */
	public TopicalNGrams (int numberOfTopics, double alphaSum, double beta, double gamma, double delta)
	{
		this.numTopics = numberOfTopics;
		this.alpha = alphaSum / numTopics;
		this.beta = beta;
		this.gamma = gamma;
		this.delta = delta;
	}

	/**
	 * Run Gibbs sampling over the corpus.
	 * Randomly initializes topic and bigram-status assignments for every token,
	 * then performs numIterations sweeps of sampling, optionally printing top
	 * words and serializing the model at fixed intervals.
	 * NOTE(review): assumes documents is non-empty — getInstance(0) is
	 * dereferenced unconditionally to obtain the bigram alphabet.
	 * @param documents instances whose data are FeatureSequenceWithBigrams
	 * @param numIterations number of Gibbs sweeps
	 * @param showTopicsInterval print top words every this many iterations (0 = never)
	 * @param outputModelInterval serialize model every this many iterations (0 = never)
	 * @param outputModelFilename base filename for serialized models ('.' + iteration appended)
	 * @param r random number generator driving the sampler
	 */
	public void estimate (InstanceList documents, int numIterations, int showTopicsInterval,
	                      int outputModelInterval, String outputModelFilename, Random r)
	{
		ilist = documents;
		uniAlphabet = ilist.getDataAlphabet();
		biAlphabet = ((FeatureSequenceWithBigrams)ilist.getInstance(0).getData()).getBiAlphabet();
		numTypes = uniAlphabet.size();
		numBitypes = biAlphabet.size();
		int numDocs = ilist.size();
		topics = new int[numDocs][];
		grams = new int[numDocs][];
		docTopicCounts = new int[numDocs][numTopics];
		typeNgramTopicCounts = new int[numTypes][2][numTopics];
		unitypeTopicCounts = new int[numTypes][numTopics];
		bitypeTopicCounts = new int[numBitypes][numTopics];
		tokensPerTopic = new int[numTopics];
		bitokensPerTopic = new int[numTypes][numTopics];
		tAlpha = alpha * numTopics;
		vBeta = beta * numTypes;
		vGamma = gamma * numTypes;
		long startTime = System.currentTimeMillis();
		// Initialize with random assignments of tokens to topics
		// and finish allocating this.topics and this.tokens
		int topic, gram, seqLen, fi;
		for (int di = 0; di < numDocs; di++) {
			FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData();
			seqLen = fs.getLength();
			numTokens += seqLen;
			topics[di] = new int[seqLen];
			grams[di] = new int[seqLen];
			// Randomly assign tokens to topics
			int prevFi = -1, prevTopic = -1;
			for (int si = 0; si < seqLen; si++) {
				// randomly sample a topic for the word at position si
				topic = r.nextInt(numTopics);
				// if a bigram is allowed at position si, then sample a gram status for it.
				gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
				if (gram != 0) biTokens++;
				topics[di][si] = topic;
				grams[di][si] = gram;
				docTopicCounts[di][topic]++;
				fi = fs.getIndexAtPosition(si);
				// The ngram indicator is conditioned on the PREVIOUS word/topic;
				// skipped for the first token of a document (prevFi == -1).
				if (prevFi != -1)
					typeNgramTopicCounts[prevFi][gram][prevTopic]++;
				if (gram == 0) {
					unitypeTopicCounts[fi][topic]++;
					tokensPerTopic[topic]++;
				} else {
					// gram==1 implies getBiIndexAtPosition(si) != -1, and — presumably —
					// si > 0 so prevFi is valid; TODO confirm FeatureSequenceWithBigrams
					// guarantees position 0 has bi-index -1.
					bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
					bitokensPerTopic[prevFi][topic]++;
				}
				prevFi = fi;
				prevTopic = topic;
			}
		}
		// Gibbs sweeps, with periodic progress/diagnostic output
		for (int iterations = 0; iterations < numIterations; iterations++) {
			sampleTopicsForAllDocs (r);
			if (iterations % 10 == 0) System.out.print (iterations);
			else System.out.print (".");
			System.out.flush();
			if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
				System.out.println ();
				printTopWords (5, false);
			}
			if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
				this.write (new File(outputModelFilename+'.'+iterations));
			}
		}
		System.out.println ("\nTotal time (sec): " + ((System.currentTimeMillis() - startTime)/1000.0));
	}

	/* One iteration of Gibbs sampling, across all documents.
	 */
	private void sampleTopicsForAllDocs (Random r)
	{
		// Scratch buffers shared across documents to avoid per-document allocation.
		double[] uniTopicWeights = new double[numTopics];
		double[] biTopicWeights = new double[numTopics*2];
		// Loop over every word in the corpus
		for (int di = 0; di < topics.length; di++) {
			sampleTopicsForOneDoc ((FeatureSequenceWithBigrams)ilist.getInstance(di).getData(),
			                       topics[di], grams[di], docTopicCounts[di],
			                       uniTopicWeights, biTopicWeights, r);
		}
	}

	/**
	 * Resample the topic (and, where a bigram is possible, the bigram-status
	 * indicator) for every token of one document, updating all count arrays
	 * in place.  For each position the token's current assignment is first
	 * subtracted from the counts, a new assignment is drawn from the
	 * conditional distribution, and the counts are re-incremented.
	 */
	private void sampleTopicsForOneDoc (FeatureSequenceWithBigrams oneDocTokens,
	                                    int[] oneDocTopics, int[] oneDocGrams,
	                                    int[] oneDocTopicCounts, // indexed by topic index
	                                    double[] uniTopicWeights, // length==numTopics
	                                    double[] biTopicWeights, // length==numTopics*2: joint topic/gram sampling
	                                    Random r)
	{
		int[] currentTypeTopicCounts;
		int[] currentBitypeTopicCounts;
		int[] previousBitokensPerTopic;
		int type, bitype, oldGram, nextGram, newGram, oldTopic, newTopic;
		double topicWeightsSum, tw;
		// xxx int docLen = oneDocTokens.length;
		int docLen = oneDocTokens.getLength();
		// Iterate over the positions (words) in the document
		for (int si = 0; si < docLen; si++) {
			type = oneDocTokens.getIndexAtPosition(si);
			bitype = oneDocTokens.getBiIndexAtPosition(si);
			//if (bitype == -1) System.out.println ("biblock "+si+" at "+uniAlphabet.lookupObject(type));
			oldTopic = oneDocTopics[si];
			oldGram = oneDocGrams[si];
			// nextGram is -1 only at the last position; every use below is guarded
			// by (si != docLen-1), so -1 is never used as an array index.
			nextGram = (si == docLen-1) ? -1 : oneDocGrams[si+1];
			//nextGram = (si == docLen-1) ? -1 : (oneDocTokens.getBiIndexAtPosition(si+1) == -1 ? 0 : 1);
			boolean bigramPossible = (bitype != -1);
			// A token currently marked as a bigram must have a valid bi-index.
			assert (!(!bigramPossible && oldGram == 1));
			if (!bigramPossible) {
				// Remove this token from all counts
				oneDocTopicCounts[oldTopic]--;
				tokensPerTopic[oldTopic]--;
				unitypeTopicCounts[type][oldTopic]--;
				if (si != docLen-1) {
					typeNgramTopicCounts[type][nextGram][oldTopic]--;
					assert (typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
				}
				assert (oneDocTopicCounts[oldTopic] >= 0);
				assert (tokensPerTopic[oldTopic] >= 0);
				assert (unitypeTopicCounts[type][oldTopic] >= 0);
				// Build a distribution over topics for this token
				Arrays.fill (uniTopicWeights, 0.0);
				topicWeightsSum = 0;
				currentTypeTopicCounts = unitypeTopicCounts[type];
				for (int ti = 0; ti < numTopics; ti++) {
					tw = ((currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta))
						* ((oneDocTopicCounts[ti] + alpha)); // additional term is constant across all topics
					topicWeightsSum += tw;
					uniTopicWeights[ti] = tw;
				}
				// Sample a topic assignment from this distribution
				newTopic = r.nextDiscrete (uniTopicWeights, topicWeightsSum);
				// Put that new topic into the counts
				oneDocTopics[si] = newTopic;
				oneDocTopicCounts[newTopic]++;
				unitypeTopicCounts[type][newTopic]++;
				tokensPerTopic[newTopic]++;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][newTopic]++;
			} else {
				// Bigram is possible
				// NOTE(review): si-1 is only safe if a bigram is never possible at
				// position 0; presumably FeatureSequenceWithBigrams guarantees
				// getBiIndexAtPosition(0) == -1 — confirm against that class.
				int prevType = oneDocTokens.getIndexAtPosition(si-1);
				int prevTopic = oneDocTopics[si-1];
				// Remove this token from all counts
				oneDocTopicCounts[oldTopic]--;
				typeNgramTopicCounts[prevType][oldGram][prevTopic]--;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][oldTopic]--;
				if (oldGram == 0) {
					unitypeTopicCounts[type][oldTopic]--;
					tokensPerTopic[oldTopic]--;
				} else {
					bitypeTopicCounts[bitype][oldTopic]--;
					bitokensPerTopic[prevType][oldTopic]--;
					biTokens--;
				}
				assert (oneDocTopicCounts[oldTopic] >= 0);
				assert (typeNgramTopicCounts[prevType][oldGram][prevTopic] >= 0);
				assert (si == docLen-1 || typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
				assert (unitypeTopicCounts[type][oldTopic] >= 0);
				assert (tokensPerTopic[oldTopic] >= 0);
				assert (bitypeTopicCounts[bitype][oldTopic] >= 0);
				assert (bitokensPerTopic[prevType][oldTopic] >= 0);
				assert (biTokens >= 0);
				// Build a joint distribution over topics and ngram-status for this token
				Arrays.fill (biTopicWeights, 0.0);
				topicWeightsSum = 0;
				currentTypeTopicCounts = unitypeTopicCounts[type];
				currentBitypeTopicCounts = bitypeTopicCounts[bitype];
				previousBitokensPerTopic = bitokensPerTopic[prevType];
				for (int ti = 0; ti < numTopics; ti++) {
					newTopic = ti << 1; // just using this variable as an index into [ti*2+gram]
					// The unigram outcome
					tw = (currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta)
						* (oneDocTopicCounts[ti] + alpha)
						* (typeNgramTopicCounts[prevType][0][prevTopic] + delta);
					topicWeightsSum += tw;
					biTopicWeights[newTopic] = tw;
					// The bigram outcome
					newTopic++;
					tw = (currentBitypeTopicCounts[ti] + gamma) / (previousBitokensPerTopic[ti] + vGamma)
						* (oneDocTopicCounts[ti] + alpha)
						* (typeNgramTopicCounts[prevType][1][prevTopic] + delta);
					topicWeightsSum += tw;
					biTopicWeights[newTopic] = tw;
				}
				// Sample a topic assignment from this distribution
				newTopic = r.nextDiscrete (biTopicWeights, topicWeightsSum);
				// Decode the joint sample: low bit is the gram status, the rest is the topic.
				newGram = newTopic % 2;
				newTopic /= 2;
				// Put that new topic into the counts
				oneDocTopics[si] = newTopic;
				oneDocGrams[si] = newGram;
				oneDocTopicCounts[newTopic]++;
				typeNgramTopicCounts[prevType][newGram][prevTopic]++;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][newTopic]++;
				if (newGram == 0) {
					unitypeTopicCounts[type][newTopic]++;
					tokensPerTopic[newTopic]++;
				} else {
					bitypeTopicCounts[bitype][newTopic]++;
					// NOTE(review): SOURCE is truncated at this point — the remainder of
					// this else-branch and the closing braces of the method/class (and
					// the printTopWords/write methods referenced by estimate) are not
					// visible in this chunk. Recover the rest from the original file.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -