📄 markov.java
字号:
import java.util.*;import java.io.*;public class Markov { static public void main(String[] args) { new Markov(); } /** * The public constructor does everything: * * 1. reads a tagged input file tagged_text.txt and builds * a list of all possible tags and words * 2. trains a visible Markov model * 3. tests the Markov model by tagging new test sentences * containing only words originally in tagged text file. */ public Markov() { build_words_and_tags(); print_statistics(); train_model(); test_model(); } /** * Read an input file of manually tagged text. Get a sequence of words and * associated tags and alos build sequences of unique words and unique tags. */ public void build_words_and_tags() { try { FileReader fr = new FileReader("tagged_text.txt"); BufferedReader br = new BufferedReader(fr); while (true) { String line = br.readLine(); if (line == null) break; p(line); line = line.trim(); while (true) { int index = line.indexOf(" "); String key; String tag; if (index == -1) { int index2 = line.indexOf("/"); key = line.substring(0, index2).toLowerCase(); tag = line.substring(index2 + 1); } else { String line2 = line.substring(0, index); line = line.substring(index + 1); int index2 = line2.indexOf("/"); key = line2.substring(0, index2).toLowerCase(); tag = line2.substring(index2 + 1); } Vector v = (Vector) lexicon.get(key); if (v == null) { v = new Vector(5); } v.add(tag); lexicon.put(key, v); if (tags.get(tag) == null) { tags.put(tag, new Integer(1)); tagCount++; } else { int old_count = ((Integer)tags.get(tag)).intValue(); tags.put(tag, new Integer(old_count + 1)); } if (words.get(key) == null) { words.put(key, new Integer(wordCount++)); } wordList.add(key); tagList.add(tag); if (index == -1) break; } } Enumeration enum = tags.keys(); while (enum.hasMoreElements()) uniqueTags.add((String)enum.nextElement()); uniqueTagCount = uniqueTags.size(); enum = words.keys(); while (enum.hasMoreElements()) uniqueWords.add((String)enum.nextElement()); uniqueWordCount = uniqueWords.size(); } catch (Exception e) { e.printStackTrace(); } } /** * For debug only: print out statistics of number of unique words and unique tags. */ public void print_statistics() { int word_count = 0; Enumeration enum = lexicon.keys(); while (enum.hasMoreElements()) { word_count++; String key = (String) enum.nextElement(); Vector v = (Vector) lexicon.get(key); p0(key + ":"); for (int i = 0, size = v.size(); i < size; i++) p0(" " + v.elementAt(i)); p(""); } p("\ntotal number of unique words is " + word_count + "\n"); p("wordCount=" + wordCount + ", tagCount=" + tagCount); } /** * This is for debug only: print out training matrices in a CSV type format * so that the matices can be examined in a spreadsheet program for debugging purposes. */ private void WriteCSVfile(Vector rowNames, Vector colNames, float[][] buf, String fileName) { p("tagList.size()="+tagList.size()); try { FileWriter fw = new FileWriter(fileName + ".txt"); PrintWriter bw = new PrintWriter(new BufferedWriter(fw)); // write the first title row: StringBuffer sb = new StringBuffer(500); for (int i = 0, size = colNames.size(); i < size; i++) { sb.append("," + colNames.elementAt(i)); } bw.println(sb.toString()); // loop on remaining rows: for (int i = 0, size = buf.length; i < size; i++) { sb.delete(0, sb.length()); sb.append(rowNames.elementAt(i)); for (int j = 0, size2 = buf[i].length; j < size2; j++) { sb.append("," + buf[i][j]); } bw.println(sb.toString()); } bw.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } /** * Train a Markov model using manually tagged input text */ public void train_model() { // start by filling in the tag to tag transition count matrix: tagToTagTransitionCount = new float[uniqueTagCount][uniqueTagCount]; p("tagCount="+tagCount); p("uniqueTagCount="+uniqueTagCount); for (int i = 0; i < uniqueTagCount; i++) { for (int j = 0; j < uniqueTagCount; j++) { tagToTagTransitionCount[i][j] = 0; } } String tag1 = (String) tagList.elementAt(0); int index1 = uniqueTags.indexOf(tag1); // inefficient! String tag0; int index0; for (int i = 0, size1 = wordList.size() - 1; i < size1; i++) { tag0 = tag1; index0 = index1; tag1 = (String) tagList.elementAt(i + 1); index1 = uniqueTags.indexOf(tag1); // inefficient tagToTagTransitionCount[index0][index1]++; } WriteCSVfile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag"); // now calculate the probabilities of transitioning from tagN to tagM: probabilityTag1ToTag2 = new float[uniqueTagCount][uniqueTagCount]; for (int i = 0; i < uniqueTagCount; i++) { int count = ((Integer)tags.get((String)uniqueTags.elementAt(i))).intValue(); p("tag: " + uniqueTags.elementAt(i) + ", count="+count); for (int j = 0; j < uniqueTagCount; j++) { probabilityTag1ToTag2[i][j] = 0.0001f + tagToTagTransitionCount[i][j] / (float)count; } } WriteCSVfile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "prob_tag_to_tag"); // now calculate the probability of a word, given a preceeding tag: probabilityWordGivenTag = new float[uniqueWordCount][uniqueTagCount]; for (int i = 0; i < uniqueWordCount; i++) { String word = (String)uniqueWords.elementAt(i); for (int j = 0; j < uniqueTagCount; j++) { String tag = (String)uniqueTags.elementAt(j); // note: index of tag is one less than index of emitted word we are testing: int countTagOccurence = ((Integer)tags.get(tag)).intValue(); float wordWithTagOccurence = 0; for (int n=0, sizem1=wordList.size()-1; n<sizem1; n++) { String testWord = (String)wordList.elementAt(n); String testTag = (String)tagList.elementAt(n); if (testWord.equals(word) && testTag.equals(tag)) { wordWithTagOccurence++; } } probabilityWordGivenTag[i][j] = wordWithTagOccurence / (float)countTagOccurence; } } WriteCSVfile(uniqueWords, uniqueTags, probabilityWordGivenTag, "prob_word_given_tag"); } // data for exponential method of evaluating most probable tags for a sequence of words: int [] indices; int [] counts; Vector possibleTags; /** * Increment the class variable indices[] to point to the next possible set of tags * to check. */ boolean incrementIndices(int num) { // uses the global arrays indices and counts for (int i=0; i<num; i++) { if (indices[i] < (counts[i] - 1)) { indices[i] += 1; for (int j=0; j<i; j++) { indices[j] = 0; } return true; } } return false; } /** * For a sequence of words, values of class variable indices[], and class variable * possibleTags, evaluate how well tags rate using equation 10.7 in [Manning/Schutze, 1999] */ float score(Vector words) { // uses global variables float s = 1.0f; int num = words.size(); float prob_tag_i_given_tag_i_minus_1 = 1.0f; for (int i=0; i<num; i++) { System.out.println("words["+i+"]="+words.elementAt(i)); int tag_index = indices[i]; if (i > 0) { Vector v0 = (Vector)possibleTags.elementAt(i - 1); Vector v1 = (Vector)possibleTags.elementAt(i); int index1 = uniqueTags.indexOf(v0.elementAt(indices[i - 1])); int index2 = uniqueTags.indexOf(v1.elementAt(indices[i])); System.out.println("index1="+index1+"[tag: "+uniqueTags.elementAt(index1)+"], index2="+index2 +"[tag: "+uniqueTags.elementAt(index2)+"]"); prob_tag_i_given_tag_i_minus_1 = probabilityTag1ToTag2[index1][index2]; int index3 = uniqueWords.indexOf("" + words.elementAt(i)); float p = probabilityWordGivenTag[index3][index2]; System.out.println("word: " + words.elementAt(i) + ", p="+p); prob_tag_i_given_tag_i_minus_1 *= p; } s *= prob_tag_i_given_tag_i_minus_1; } return s; } /** * Use exponential runtime tagging algorithm (evaluates trained Markov model). * NOTE: do not use this algorithm for long input word sequences - instead, * break up long sequences of text into smaller pieces (i.e., process just * a few sentences at a time). */ public Vector exponential_tagging_algorithm(Vector words) { possibleTags = new Vector(); int num = words.size(); indices = new int[num]; counts = new int[num]; int [] best_indices = new int[num]; for (int i=0; i<num; i++) { indices[i] = 0; counts[i] = 0;} for (int i=0; i<num; i++) { String word = "" + words.elementAt(i); Vector v = (Vector)lexicon.get(word); Vector v2 = new Vector(); // possible tags at index i for (int j=0; j<v.size(); j++) { String tag = "" + v.elementAt(j); if (v2.contains(tag) == false) { v2.add(tag); counts[i]++; } } possibleTags.add(v2); // possible tags at index i System.out.print("^^ word: " + word + ", tag count: " + counts[i] + ", tags: "); for (int j=0; j<v2.size(); j++) System.out.print(" " + v2.elementAt(j)); System.out.println(); } float best_score = -9999; do { System.out.print("Current indices:"); for (int k=0; k<num; k++) System.out.print(" " + indices[k]); System.out.println(); float score = score(words); if (score > best_score) { best_score = score; System.out.println(" ** ** new best score: " + best_score); for (int m=0; m<num; m++) best_indices[m] = indices[m]; } } while (incrementIndices(num)); Vector tags = new Vector(num); for (int i=0; i<num; i++) { Vector v = (Vector)possibleTags.elementAt(i); tags.add(v.elementAt(best_indices[i])); } return tags; } /** * Throw away test method. */ public void test_model() { Vector words = new Vector(); words.add("."); words.add("the"); words.add("dog"); words.add("chased"); words.add("the"); words.add("cat"); words.add("."); words.add("mary"); words.add("went"); words.add("to"); words.add("the"); words.add("river"); words.add("."); words.add("john"); words.add("saw"); words.add("mary"); words.add("bank"); words.add("the"); words.add("airplane"); words.add("."); Vector tags = exponential_tagging_algorithm(words); p(""); for (int i=0; i<words.size()-1; i++) { p(""+words.elementAt(i)+"\t: "+tags.elementAt(i)); } } /** * Utility print method - with line feed */ public void p(String s) { System.out.println(s); } /** * Utility print method - no line feed */ public void p0(String s) { System.out.print(s); } Hashtable lexicon = new Hashtable(); Hashtable tags = new Hashtable(); Hashtable words = new Hashtable(); Vector uniqueTags = new Vector(); Vector uniqueWords = new Vector(); int uniqueTagCount; int uniqueWordCount; //String [] tagNames; int tagCount = 0; // from training text int wordCount = 0; // from training text Vector wordList = new Vector(); // from training text Vector tagList = new Vector(); // from training text float[][] tagToTagTransitionCount; // [num_tag][num_tag] int[][] wordCountByTag; float [][]probabilityTag1ToTag2; float [][]probabilityWordGivenTag;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -