📄 topicalngrams.java
字号:
bitokensPerTopic[prevType][newTopic]++; biTokens++; } } } } public void printTopWords (int numWords, boolean useNewLines) { class WordProb implements Comparable { int wi; double p; public WordProb (int wi, double p) { this.wi = wi; this.p = p; } public final int compareTo (Object o2) { if (p > ((WordProb)o2).p) return -1; else if (p == ((WordProb)o2).p) return 0; else return 1; } } for (int ti = 0; ti < numTopics; ti++) { // Unigrams WordProb[] wp = new WordProb[numTypes]; for (int wi = 0; wi < numTypes; wi++) wp[wi] = new WordProb (wi, (double)unitypeTopicCounts[wi][ti]); Arrays.sort (wp); if (useNewLines) { System.out.println ("\nTopic "+ti+" unigrams"); for (int i = 0; i < numWords; i++) System.out.println (uniAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/tokensPerTopic[ti]); } else { System.out.print ("Topic "+ti+": "); for (int i = 0; i < numWords; i++) System.out.print (uniAlphabet.lookupObject(wp[i].wi).toString() + " "); } // Bigrams /* wp = new WordProb[numBitypes]; int bisum = 0; for (int wi = 0; wi < numBitypes; wi++) { wp[wi] = new WordProb (wi, ((double)bitypeTopicCounts[wi][ti])); bisum += bitypeTopicCounts[wi][ti]; } Arrays.sort (wp); if (useNewLines) { System.out.println ("\nTopic "+ti+" bigrams"); for (int i = 0; i < numWords; i++) System.out.println (biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/bisum); } else { System.out.print (" "); for (int i = 0; i < numWords; i++) System.out.print (biAlphabet.lookupObject(wp[i].wi).toString() + " "); System.out.println(); } */ // Ngrams AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false); for (int di = 0; di < topics.length; di++) { FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData(); for (int si = topics[di].length-1; si >= 0; si--) { if (topics[di][si] == ti && grams[di][si] == 1) { String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString(); while (grams[di][si] == 1 && --si >= 0) gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString; afv.add(gramString, 1.0); } } } //System.out.println ("pre-sorting"); int numNgrams = afv.numLocations(); //System.out.println ("post-sorting "+numNgrams); wp = new WordProb[numNgrams]; int ngramSum = 0; for (int loc = 0; loc < numNgrams; loc++) { wp[loc] = new WordProb (afv.indexAtLocation(loc), afv.valueAtLocation(loc)); ngramSum += wp[loc].p; } Arrays.sort (wp); if (useNewLines) { System.out.println ("\nTopic "+ti+" bigrams (#unique ngrams="+numNgrams+ " ngram count="+Math.round(afv.oneNorm())+")"); for (int i = 0; i < numWords; i++) System.out.println (afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p/ngramSum); } else { System.out.print (" (unique-ngrams="+numNgrams+" ngram-count="+Math.round(afv.oneNorm())+")\n "); for (int i = 0; i < numWords; i++) System.out.print (afv.getAlphabet().lookupObject(wp[i].wi).toString() + " "); System.out.println(); } } } public void printDocumentTopics (File f) throws IOException { printDocumentTopics (new PrintWriter (new FileWriter (f))); } public void printDocumentTopics (PrintWriter pw) { pw.println ("#doc source topic proportions"); int docLen; for (int di = 0; di < topics.length; di++) { pw.print (di); pw.print (' '); docLen = topics[di].length; for (int ti = 0; ti < numTopics; ti++) pw.print (((float)docTopicCounts[di][ti])/docLen); pw.print (' '); pw.println (ilist.getInstance(di).getSource().toString()); pw.print (' '); } } public void printState (File f) throws IOException { printState (new PrintWriter (new FileWriter(f))); } public void printState (PrintWriter pw) { pw.println ("#doc pos typeindex type bigrampossible? topic bigram"); for (int di = 0; di < topics.length; di++) { FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.getInstance(di).getData(); for (int si = 0; si < topics[di].length; si++) { int type = fs.getIndexAtPosition(si); pw.print(di); pw.print(' '); pw.print(si); pw.print(' '); pw.print(type); pw.print(' '); pw.print(uniAlphabet.lookupObject(type)); pw.print(' '); pw.print(fs.getBiIndexAtPosition(si)==-1 ? 0 : 1); pw.print(' '); pw.print(topics[di][si]); pw.print(' '); pw.print(grams[di][si]); pw.println(); } } } public void write (File f) { try { ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream(f)); oos.writeObject(this); oos.close(); } catch (IOException e) { System.err.println("Exception writing file " + f + ": " + e); } } // Serialization private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 0; private static final int NULL_INTEGER = -1; private void writeIntArray2 (int[][] a, ObjectOutputStream out) throws IOException { out.writeInt (a.length); int d2 = a[0].length; out.writeInt (d2); for (int i = 0; i < a.length; i++) for (int j = 0; j < d2; j++) out.writeInt (a[i][j]); } private int[][] readIntArray2 (ObjectInputStream in) throws IOException { int d1 = in.readInt(); int d2 = in.readInt(); int[][] a = new int[d1][d2]; for (int i = 0; i < d1; i++) for (int j = 0; j < d2; j++) a[i][j] = in.readInt(); return a; } private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt (CURRENT_SERIAL_VERSION); out.writeObject (ilist); out.writeInt (numTopics); out.writeDouble (alpha); out.writeDouble (beta); out.writeDouble (gamma); out.writeDouble (delta); out.writeDouble (tAlpha); out.writeDouble (vBeta); out.writeDouble (vGamma); out.writeInt (numTypes); out.writeInt (numBitypes); out.writeInt (numTokens); out.writeInt (biTokens); for (int di = 0; di < topics.length; di ++) for (int si = 0; si < topics[di].length; si++) out.writeInt (topics[di][si]); for (int di = 0; di < topics.length; di ++) for (int si = 0; si < topics[di].length; si++) out.writeInt (grams[di][si]); writeIntArray2 (docTopicCounts, out); for (int fi = 0; fi < numTypes; fi++) for (int n = 0; n < 2; n++) for (int ti = 0; ti < numTopics; ti++) out.writeInt (typeNgramTopicCounts[fi][n][ti]); writeIntArray2 (unitypeTopicCounts, out); writeIntArray2 (bitypeTopicCounts, out); for (int ti = 0; ti < numTopics; ti++) out.writeInt (tokensPerTopic[ti]); writeIntArray2 (bitokensPerTopic, out); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { int featuresLength; int version = in.readInt (); ilist = (InstanceList) in.readObject (); numTopics = in.readInt(); alpha = in.readDouble(); beta = in.readDouble(); gamma = in.readDouble(); delta = in.readDouble(); tAlpha = in.readDouble(); vBeta = in.readDouble(); vGamma = in.readDouble(); numTypes = in.readInt(); numBitypes = in.readInt(); numTokens = in.readInt(); biTokens = in.readInt(); int numDocs = ilist.size(); topics = new int[numDocs][]; grams = new int[numDocs][]; for (int di = 0; di < ilist.size(); di++) { int docLen = ((FeatureSequence)ilist.getInstance(di).getData()).getLength(); topics[di] = new int[docLen]; for (int si = 0; si < docLen; si++) topics[di][si] = in.readInt(); } for (int di = 0; di < ilist.size(); di++) { int docLen = ((FeatureSequence)ilist.getInstance(di).getData()).getLength(); grams[di] = new int[docLen]; for (int si = 0; si < docLen; si++) grams[di][si] = in.readInt(); } docTopicCounts = readIntArray2 (in); typeNgramTopicCounts = new int[numTypes][2][numTopics]; for (int fi = 0; fi < numTypes; fi++) for (int n = 0; n < 2; n++) for (int ti = 0; ti < numTopics; ti++) typeNgramTopicCounts[fi][n][ti] = in.readInt(); unitypeTopicCounts = readIntArray2 (in); bitypeTopicCounts = readIntArray2 (in); tokensPerTopic = new int[numTopics]; for (int ti = 0; ti < numTopics; ti++) tokensPerTopic[ti] = in.readInt(); bitokensPerTopic = readIntArray2 (in); } // Just for testing. Recommend instead is mallet/bin/vectors2topics public static void main (String[] args) { InstanceList ilist = InstanceList.load (new File(args[0])); int numIterations = args.length > 1 ? Integer.parseInt(args[1]) : 1000; int numTopWords = args.length > 2 ? Integer.parseInt(args[2]) : 20; System.out.println ("Data loaded."); TopicalNGrams tng = new TopicalNGrams (10); tng.estimate (ilist, 1000, 50, 0, null, new Random()); tng.printTopWords (60, true); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -