📄 topicalngrams.java
字号:
/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.
   For further information, see the file `LICENSE' included with this distribution. */

package edu.umass.cs.mallet.base.topics;

import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.util.Random;
import java.util.Arrays;
import java.io.*;

/**
 * Like Latent Dirichlet Allocation, but with integrated phrase discovery.
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 * based on C code by Xuerui Wang.
 */
public class TopicalNGrams {

	int numTopics;                  // T: number of topics
	Alphabet uniAlphabet;           // vocabulary of unigram features
	Alphabet biAlphabet;            // vocabulary of bigram features
	// Dirichlet hyperparameters and their precomputed sums (tAlpha = alpha*T, vBeta = beta*V, vGamma = gamma*V)
	double alpha, beta, gamma, delta, tAlpha, vBeta, vGamma;
	InstanceList ilist;             // containing FeatureSequenceWithBigrams in the data field of each instance
	int[][] topics;                 // {0...T-1}, the topic index, indexed by <document index, sequence index>
	int[][] grams;                  // {0,1}, the bigram status, indexed by <document index, sequence index> TODO: Make this boolean?
	int numTypes;                   // number of unique unigrams
	int numBitypes;                 // number of unique bigrams
	int numTokens;                  // total number of word occurrences
	// "totalNgram"
	int biTokens;                   // total number of tokens currently generated as bigrams (only used for progress messages)
	// "docTopic"
	int[][] docTopicCounts;         // indexed by <document index, topic index>
	// Used to calculate p(x|w,t).  "ngramCount"
	int[][][] typeNgramTopicCounts; // indexed by <feature index, ngram status, topic index>
	// Used to calculate p(w|t) and p(w|t,w), "topicWord" and "topicNgramWord"
	int[][] unitypeTopicCounts;     // indexed by <feature index, topic index>
	int[][] bitypeTopicCounts;      // index by <bifeature index, topic index>
	// "sumWords"
	int[] tokensPerTopic;           // indexed by <topic index>
	// "sumNgramWords"
	int[][] bitokensPerTopic;       // indexed by <feature index, topic index>, where the later is the conditioned word

	/**
	 * Convenience constructor using the default hyperparameters
	 * (alphaSum=50.0, beta=0.1, gamma=0.05, delta=0.2).
	 * @param numberOfTopics number of topics T
	 */
	public TopicalNGrams (int numberOfTopics)
	{
		this (numberOfTopics, 50.0, 0.1, 0.05, 0.2);
	}

	/**
	 * @param numberOfTopics number of topics T
	 * @param alphaSum total Dirichlet mass over topics; per-topic alpha = alphaSum / T
	 * @param beta  Dirichlet prior on per-topic unigram distributions
	 * @param gamma Dirichlet prior on per-topic bigram distributions
	 * @param delta Dirichlet prior on the bigram-status (ngram) indicator
	 */
	public TopicalNGrams (int numberOfTopics, double alphaSum, double beta, double gamma, double delta)
	{
		this.numTopics = numberOfTopics;
		this.alpha = alphaSum / numTopics;
		this.beta = beta;
		this.gamma = gamma;
		this.delta = delta;
	}

	/**
	 * Run Gibbs sampling over the corpus.
	 * Randomly initializes topic and bigram-status assignments for every token,
	 * then performs numIterations sweeps of sampling, optionally printing top
	 * words and serializing the model at fixed intervals.
	 * NOTE(review): assumes documents is non-empty — getInstance(0) is
	 * dereferenced unconditionally to obtain the bigram alphabet.
	 * @param documents instances whose data are FeatureSequenceWithBigrams
	 * @param numIterations number of Gibbs sweeps
	 * @param showTopicsInterval print top words every this many iterations (0 = never)
	 * @param outputModelInterval serialize model every this many iterations (0 = never)
	 * @param outputModelFilename base filename for serialized models ('.' + iteration appended)
	 * @param r random number generator driving the sampler
	 */
	public void estimate (InstanceList documents, int numIterations, int showTopicsInterval,
	                      int outputModelInterval, String outputModelFilename, Random r)
	{
		ilist = documents;
		uniAlphabet = ilist.getDataAlphabet();
		biAlphabet = ((FeatureSequenceWithBigrams)ilist.getInstance(0).getData()).getBiAlphabet();
		numTypes = uniAlphabet.size();
		numBitypes = biAlphabet.size();
		int numDocs = ilist.size();
		topics = new int[numDocs][];
		grams = new int[numDocs][];
		docTopicCounts = new int[numDocs][numTopics];
		typeNgramTopicCounts = new int[numTypes][2][numTopics];
		unitypeTopicCounts = new int[numTypes][numTopics];
		bitypeTopicCounts = new int[numBitypes][numTopics];
		tokensPerTopic = new int[numTopics];
		bitokensPerTopic = new int[numTypes][numTopics];
		tAlpha = alpha * numTopics;
		vBeta = beta * numTypes;
		vGamma = gamma * numTypes;
		long startTime = System.currentTimeMillis();
		// Initialize with random assignments of tokens to topics
		// and finish allocating this.topics and this.tokens
		int topic, gram, seqLen, fi;
		for (int di = 0; di < numDocs; di++) {
			FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData();
			seqLen = fs.getLength();
			numTokens += seqLen;
			topics[di] = new int[seqLen];
			grams[di] = new int[seqLen];
			// Randomly assign tokens to topics
			int prevFi = -1, prevTopic = -1;
			for (int si = 0; si < seqLen; si++) {
				// randomly sample a topic for the word at position si
				topic = r.nextInt(numTopics);
				// if a bigram is allowed at position si, then sample a gram status for it.
				gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
				if (gram != 0) biTokens++;
				topics[di][si] = topic;
				grams[di][si] = gram;
				docTopicCounts[di][topic]++;
				fi = fs.getIndexAtPosition(si);
				// The ngram indicator is conditioned on the PREVIOUS word/topic;
				// skipped for the first token of a document (prevFi == -1).
				if (prevFi != -1)
					typeNgramTopicCounts[prevFi][gram][prevTopic]++;
				if (gram == 0) {
					unitypeTopicCounts[fi][topic]++;
					tokensPerTopic[topic]++;
				} else {
					// gram==1 implies getBiIndexAtPosition(si) != -1, and — presumably —
					// si > 0 so prevFi is valid; TODO confirm FeatureSequenceWithBigrams
					// guarantees position 0 has bi-index -1.
					bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
					bitokensPerTopic[prevFi][topic]++;
				}
				prevFi = fi;
				prevTopic = topic;
			}
		}
		// Gibbs sweeps, with periodic progress/diagnostic output
		for (int iterations = 0; iterations < numIterations; iterations++) {
			sampleTopicsForAllDocs (r);
			if (iterations % 10 == 0) System.out.print (iterations);
			else System.out.print (".");
			System.out.flush();
			if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
				System.out.println ();
				printTopWords (5, false);
			}
			if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
				this.write (new File(outputModelFilename+'.'+iterations));
			}
		}
		System.out.println ("\nTotal time (sec): " + ((System.currentTimeMillis() - startTime)/1000.0));
	}

	/* One iteration of Gibbs sampling, across all documents.
	 */
	private void sampleTopicsForAllDocs (Random r)
	{
		// Scratch buffers shared across documents to avoid per-document allocation.
		double[] uniTopicWeights = new double[numTopics];
		double[] biTopicWeights = new double[numTopics*2];
		// Loop over every word in the corpus
		for (int di = 0; di < topics.length; di++) {
			sampleTopicsForOneDoc ((FeatureSequenceWithBigrams)ilist.getInstance(di).getData(),
			                       topics[di], grams[di], docTopicCounts[di],
			                       uniTopicWeights, biTopicWeights, r);
		}
	}

	/**
	 * Resample the topic (and, where a bigram is possible, the bigram-status
	 * indicator) for every token of one document, updating all count arrays
	 * in place.  For each position the token's current assignment is first
	 * subtracted from the counts, a new assignment is drawn from the
	 * conditional distribution, and the counts are re-incremented.
	 */
	private void sampleTopicsForOneDoc (FeatureSequenceWithBigrams oneDocTokens,
	                                    int[] oneDocTopics, int[] oneDocGrams,
	                                    int[] oneDocTopicCounts, // indexed by topic index
	                                    double[] uniTopicWeights, // length==numTopics
	                                    double[] biTopicWeights, // length==numTopics*2: joint topic/gram sampling
	                                    Random r)
	{
		int[] currentTypeTopicCounts;
		int[] currentBitypeTopicCounts;
		int[] previousBitokensPerTopic;
		int type, bitype, oldGram, nextGram, newGram, oldTopic, newTopic;
		double topicWeightsSum, tw;
		// xxx int docLen = oneDocTokens.length;
		int docLen = oneDocTokens.getLength();
		// Iterate over the positions (words) in the document
		for (int si = 0; si < docLen; si++) {
			type = oneDocTokens.getIndexAtPosition(si);
			bitype = oneDocTokens.getBiIndexAtPosition(si);
			//if (bitype == -1) System.out.println ("biblock "+si+" at "+uniAlphabet.lookupObject(type));
			oldTopic = oneDocTopics[si];
			oldGram = oneDocGrams[si];
			// nextGram is -1 only at the last position; every use below is guarded
			// by (si != docLen-1), so -1 is never used as an array index.
			nextGram = (si == docLen-1) ? -1 : oneDocGrams[si+1];
			//nextGram = (si == docLen-1) ? -1 : (oneDocTokens.getBiIndexAtPosition(si+1) == -1 ? 0 : 1);
			boolean bigramPossible = (bitype != -1);
			// A token currently marked as a bigram must have a valid bi-index.
			assert (!(!bigramPossible && oldGram == 1));
			if (!bigramPossible) {
				// Remove this token from all counts
				oneDocTopicCounts[oldTopic]--;
				tokensPerTopic[oldTopic]--;
				unitypeTopicCounts[type][oldTopic]--;
				if (si != docLen-1) {
					typeNgramTopicCounts[type][nextGram][oldTopic]--;
					assert (typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
				}
				assert (oneDocTopicCounts[oldTopic] >= 0);
				assert (tokensPerTopic[oldTopic] >= 0);
				assert (unitypeTopicCounts[type][oldTopic] >= 0);
				// Build a distribution over topics for this token
				Arrays.fill (uniTopicWeights, 0.0);
				topicWeightsSum = 0;
				currentTypeTopicCounts = unitypeTopicCounts[type];
				for (int ti = 0; ti < numTopics; ti++) {
					tw = ((currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta))
						* ((oneDocTopicCounts[ti] + alpha)); // additional term is constant across all topics
					topicWeightsSum += tw;
					uniTopicWeights[ti] = tw;
				}
				// Sample a topic assignment from this distribution
				newTopic = r.nextDiscrete (uniTopicWeights, topicWeightsSum);
				// Put that new topic into the counts
				oneDocTopics[si] = newTopic;
				oneDocTopicCounts[newTopic]++;
				unitypeTopicCounts[type][newTopic]++;
				tokensPerTopic[newTopic]++;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][newTopic]++;
			} else {
				// Bigram is possible
				// NOTE(review): si-1 is only safe if a bigram is never possible at
				// position 0; presumably FeatureSequenceWithBigrams guarantees
				// getBiIndexAtPosition(0) == -1 — confirm against that class.
				int prevType = oneDocTokens.getIndexAtPosition(si-1);
				int prevTopic = oneDocTopics[si-1];
				// Remove this token from all counts
				oneDocTopicCounts[oldTopic]--;
				typeNgramTopicCounts[prevType][oldGram][prevTopic]--;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][oldTopic]--;
				if (oldGram == 0) {
					unitypeTopicCounts[type][oldTopic]--;
					tokensPerTopic[oldTopic]--;
				} else {
					bitypeTopicCounts[bitype][oldTopic]--;
					bitokensPerTopic[prevType][oldTopic]--;
					biTokens--;
				}
				assert (oneDocTopicCounts[oldTopic] >= 0);
				assert (typeNgramTopicCounts[prevType][oldGram][prevTopic] >= 0);
				assert (si == docLen-1 || typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
				assert (unitypeTopicCounts[type][oldTopic] >= 0);
				assert (tokensPerTopic[oldTopic] >= 0);
				assert (bitypeTopicCounts[bitype][oldTopic] >= 0);
				assert (bitokensPerTopic[prevType][oldTopic] >= 0);
				assert (biTokens >= 0);
				// Build a joint distribution over topics and ngram-status for this token
				Arrays.fill (biTopicWeights, 0.0);
				topicWeightsSum = 0;
				currentTypeTopicCounts = unitypeTopicCounts[type];
				currentBitypeTopicCounts = bitypeTopicCounts[bitype];
				previousBitokensPerTopic = bitokensPerTopic[prevType];
				for (int ti = 0; ti < numTopics; ti++) {
					newTopic = ti << 1; // just using this variable as an index into [ti*2+gram]
					// The unigram outcome
					tw = (currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta)
						* (oneDocTopicCounts[ti] + alpha)
						* (typeNgramTopicCounts[prevType][0][prevTopic] + delta);
					topicWeightsSum += tw;
					biTopicWeights[newTopic] = tw;
					// The bigram outcome
					newTopic++;
					tw = (currentBitypeTopicCounts[ti] + gamma) / (previousBitokensPerTopic[ti] + vGamma)
						* (oneDocTopicCounts[ti] + alpha)
						* (typeNgramTopicCounts[prevType][1][prevTopic] + delta);
					topicWeightsSum += tw;
					biTopicWeights[newTopic] = tw;
				}
				// Sample a topic assignment from this distribution
				newTopic = r.nextDiscrete (biTopicWeights, topicWeightsSum);
				// Decode the joint sample: low bit is the gram status, the rest is the topic.
				newGram = newTopic % 2;
				newTopic /= 2;
				// Put that new topic into the counts
				oneDocTopics[si] = newTopic;
				oneDocGrams[si] = newGram;
				oneDocTopicCounts[newTopic]++;
				typeNgramTopicCounts[prevType][newGram][prevTopic]++;
				if (si != docLen-1)
					typeNgramTopicCounts[type][nextGram][newTopic]++;
				if (newGram == 0) {
					unitypeTopicCounts[type][newTopic]++;
					tokensPerTopic[newTopic]++;
				} else {
					bitypeTopicCounts[bitype][newTopic]++;
					// NOTE(review): SOURCE is truncated at this point — the remainder of
					// this else-branch and the closing braces of the method/class (and
					// the printTopWords/write methods referenced by estimate) are not
					// visible in this chunk. Recover the rest from the original file.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -