⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 topicalngrams.java

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
					bitokensPerTopic[prevType][newTopic]++;					biTokens++;				}			}		}	}	public void printTopWords (int numWords, boolean useNewLines)	{		class WordProb implements Comparable {			int wi; double p;			public WordProb (int wi, double p) { this.wi = wi; this.p = p; }			public final int compareTo (Object o2) {				if (p > ((WordProb)o2).p)					return -1;				else if (p == ((WordProb)o2).p)					return 0;				else return 1;			}		}		for (int ti = 0; ti < numTopics; ti++) {			// Unigrams			WordProb[] wp = new WordProb[numTypes];			for (int wi = 0; wi < numTypes; wi++)				wp[wi] = new WordProb (wi, (double)unitypeTopicCounts[wi][ti]);			Arrays.sort (wp);			if (useNewLines) {				System.out.println ("\nTopic "+ti+" unigrams");				for (int i = 0; i < numWords; i++)					System.out.println (uniAlphabet.lookupObject(wp[i].wi).toString()					                    + " " + wp[i].p/tokensPerTopic[ti]);			} else {				System.out.print ("Topic "+ti+": ");				for (int i = 0; i < numWords; i++)					System.out.print (uniAlphabet.lookupObject(wp[i].wi).toString() + " ");			}			// Bigrams			/*			wp = new WordProb[numBitypes];			int bisum = 0;			for (int wi = 0; wi < numBitypes; wi++) {				wp[wi] = new WordProb (wi, ((double)bitypeTopicCounts[wi][ti]));				bisum += bitypeTopicCounts[wi][ti];			}			Arrays.sort (wp);			if (useNewLines) {				System.out.println ("\nTopic "+ti+" bigrams");				for (int i = 0; i < numWords; i++)					System.out.println (biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/bisum);			} else {				System.out.print ("          ");				for (int i = 0; i < numWords; i++)					System.out.print (biAlphabet.lookupObject(wp[i].wi).toString() + " ");				System.out.println();			}			*/			// Ngrams			AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);			for (int di = 0; di < topics.length; di++) {				FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData();				for (int si = topics[di].length-1; si >= 0; si--) {					if (topics[di][si] == ti && grams[di][si] == 1) {						String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();						while (grams[di][si] == 1 && --si >= 0)							gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString;						afv.add(gramString, 1.0);					}				}			}			//System.out.println ("pre-sorting");			int numNgrams = afv.numLocations();			//System.out.println ("post-sorting "+numNgrams);			wp = new WordProb[numNgrams];			int ngramSum = 0;			for (int loc = 0; loc < numNgrams; loc++) {				wp[loc] = new WordProb (afv.indexAtLocation(loc), afv.valueAtLocation(loc));				ngramSum += wp[loc].p;			}			Arrays.sort (wp);			if (useNewLines) {				System.out.println ("\nTopic "+ti+" bigrams (#unique ngrams="+numNgrams+				                    " ngram count="+Math.round(afv.oneNorm())+")");				for (int i = 0; i < numWords; i++)					System.out.println (afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p/ngramSum);			} else {				System.out.print (" (unique-ngrams="+numNgrams+" ngram-count="+Math.round(afv.oneNorm())+")\n         ");				for (int i = 0; i < numWords; i++)					System.out.print (afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");				System.out.println();			}		}	}  public void printDocumentTopics (File f) throws IOException  {    printDocumentTopics (new PrintWriter (new FileWriter (f)));  }  public void printDocumentTopics (PrintWriter pw)  {    pw.println ("#doc source topic proportions");    int docLen;    for (int di = 0; di < topics.length; di++) {      pw.print (di); pw.print (' ');      docLen = topics[di].length;      for (int ti = 0; ti < numTopics; ti++)        pw.print (((float)docTopicCounts[di][ti])/docLen); pw.print (' ');      pw.println (ilist.getInstance(di).getSource().toString()); pw.print (' ');    }  }  public void printState (File f) throws IOException	{		printState (new PrintWriter (new FileWriter(f)));  }	public void printState (PrintWriter pw)	{		pw.println ("#doc pos typeindex type bigrampossible? topic bigram");		for (int di = 0; di < topics.length; di++) {			FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData();			for (int si = 0; si < topics[di].length; si++) {				int type = fs.getIndexAtPosition(si);				pw.print(di); pw.print(' ');				pw.print(si); pw.print(' ');				pw.print(type); pw.print(' ');				pw.print(uniAlphabet.lookupObject(type)); pw.print(' ');				pw.print(fs.getBiIndexAtPosition(si)==-1 ? 0 : 1); pw.print(' ');				pw.print(topics[di][si]); pw.print(' ');				pw.print(grams[di][si]); pw.println();			}		}	}  public void write (File f) {    try {      ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream(f));      oos.writeObject(this);      oos.close();    }    catch (IOException e) {      System.err.println("Exception writing file " + f + ": " + e);    }  }  // Serialization	private static final long serialVersionUID = 1;	private static final int CURRENT_SERIAL_VERSION = 0;	private static final int NULL_INTEGER = -1;	private void writeIntArray2 (int[][] a, ObjectOutputStream out) throws IOException {		out.writeInt (a.length);		int d2 = a[0].length;		out.writeInt (d2);		for (int i = 0; i < a.length; i++)			for (int j = 0; j < d2; j++)				out.writeInt (a[i][j]);	}	private int[][] readIntArray2 (ObjectInputStream in) throws IOException {		int d1 = in.readInt();		int d2 = in.readInt();		int[][] a = new int[d1][d2];		for (int i = 0; i < d1; i++)			for (int j = 0; j < d2; j++)				a[i][j] = in.readInt();		return a;	}	private void writeObject (ObjectOutputStream out) throws IOException {		out.writeInt (CURRENT_SERIAL_VERSION);		out.writeObject (ilist);		out.writeInt (numTopics);		out.writeDouble (alpha);		out.writeDouble (beta);		out.writeDouble (gamma);		out.writeDouble (delta);		out.writeDouble (tAlpha);		out.writeDouble (vBeta);		out.writeDouble (vGamma);		out.writeInt (numTypes);		out.writeInt (numBitypes);		out.writeInt (numTokens);		out.writeInt (biTokens);		for (int di = 0; di < topics.length; di ++)			for (int si = 0; si < topics[di].length; si++)				out.writeInt (topics[di][si]);		for (int di = 0; di < topics.length; di ++)			for (int si = 0; si < topics[di].length; si++)				out.writeInt (grams[di][si]);		writeIntArray2 (docTopicCounts, out);		for (int fi = 0; fi < numTypes; fi++)			for (int n = 0; n < 2; n++)				for (int ti = 0; ti < numTopics; ti++)					out.writeInt (typeNgramTopicCounts[fi][n][ti]);		writeIntArray2 (unitypeTopicCounts, out);		writeIntArray2 (bitypeTopicCounts, out);		for (int ti = 0; ti < numTopics; ti++)			out.writeInt (tokensPerTopic[ti]);		writeIntArray2 (bitokensPerTopic, out);	}	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {		int featuresLength;		int version = in.readInt ();		ilist = (InstanceList) in.readObject ();		numTopics = in.readInt();		alpha = in.readDouble();		beta = in.readDouble();		gamma = in.readDouble();		delta = in.readDouble();		tAlpha = in.readDouble();		vBeta = in.readDouble();		vGamma = in.readDouble();		numTypes = in.readInt();		numBitypes = in.readInt();		numTokens = in.readInt();		biTokens = in.readInt();		int numDocs = ilist.size();		topics = new int[numDocs][];		grams = new int[numDocs][];		for (int di = 0; di < ilist.size(); di++) {			int docLen = ((FeatureSequence)ilist.getInstance(di).getData()).getLength();			topics[di] = new int[docLen];			for (int si = 0; si < docLen; si++)				topics[di][si] = in.readInt();		}		for (int di = 0; di < ilist.size(); di++) {			int docLen = ((FeatureSequence)ilist.getInstance(di).getData()).getLength();			grams[di] = new int[docLen];			for (int si = 0; si < docLen; si++)				grams[di][si] = in.readInt();		}		docTopicCounts = readIntArray2 (in);		typeNgramTopicCounts = new int[numTypes][2][numTopics];		for (int fi = 0; fi < numTypes; fi++)			for (int n = 0; n < 2; n++)				for (int ti = 0; ti < numTopics; ti++)					typeNgramTopicCounts[fi][n][ti] = in.readInt();		unitypeTopicCounts = readIntArray2 (in);		bitypeTopicCounts = readIntArray2 (in);		tokensPerTopic = new int[numTopics];		for (int ti = 0; ti < numTopics; ti++)			tokensPerTopic[ti] = in.readInt();		bitokensPerTopic = readIntArray2 (in);	}  // Just for testing.  Recommend instead is mallet/bin/vectors2topics  public static void main (String[] args)	{		InstanceList ilist = InstanceList.load (new File(args[0]));		int numIterations = args.length > 1 ? Integer.parseInt(args[1]) : 1000;		int numTopWords = args.length > 2 ? Integer.parseInt(args[2]) : 20;		System.out.println ("Data loaded.");		TopicalNGrams tng = new TopicalNGrams (10);		tng.estimate (ilist, 1000, 50, 0, null, new Random());		tng.printTopWords (60, true);	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -