📄 markov.java

📁 人工智能代码JAVA程序,包括神经网络,遗传算法
💻 JAVA
字号:
import java.util.*;import java.io.*;public class Markov {    static public void main(String[] args) {        new Markov();    }    /**     *  The public constructor does everything:     *     *   1. reads a tagged input file tagged_text.txt and builds     *      a list of all possible tags and words     *   2. trains a visible Markov model     *   3. tests the Markov model by tagging new test sentences     *      containing only words originally in tagged text file.     */    public Markov() {        build_words_and_tags();        print_statistics();        train_model();        test_model();    }    /**     *  Read an input file of manually tagged text. Get a sequence of words and     *  associated tags and alos build sequences of unique words and unique tags.     */    public void build_words_and_tags() {        try {            FileReader fr = new FileReader("tagged_text.txt");            BufferedReader br = new BufferedReader(fr);            while (true) {                String line = br.readLine();                if (line == null) break;                p(line);                line = line.trim();                while (true) {                    int index = line.indexOf(" ");                    String key;                    String tag;                    if (index == -1) {                        int index2 = line.indexOf("/");                        key = line.substring(0, index2).toLowerCase();                        tag = line.substring(index2 + 1);                    } else {                        String line2 = line.substring(0, index);                        line = line.substring(index + 1);                        int index2 = line2.indexOf("/");                        key = line2.substring(0, index2).toLowerCase();                        tag = line2.substring(index2 + 1);                    }                    Vector v = (Vector) lexicon.get(key);                    if (v == null) {                        v = new Vector(5);                    }                    v.add(tag);                    lexicon.put(key, v);                    if (tags.get(tag) == null) {                        tags.put(tag, new Integer(1));                        tagCount++;                    } else {                        int old_count = ((Integer)tags.get(tag)).intValue();                        tags.put(tag, new Integer(old_count + 1));                    }                    if (words.get(key) == null) {                        words.put(key, new Integer(wordCount++));                    }                    wordList.add(key);                    tagList.add(tag);                    if (index == -1) break;                }            }            Enumeration enum = tags.keys();            while (enum.hasMoreElements())  uniqueTags.add((String)enum.nextElement());            uniqueTagCount = uniqueTags.size();            enum = words.keys();            while (enum.hasMoreElements())  uniqueWords.add((String)enum.nextElement());            uniqueWordCount = uniqueWords.size();        } catch (Exception e) {            e.printStackTrace();        }    }    /**     *   For debug only: print out statistics of number of unique words and unique tags.     */    public void print_statistics() {        int word_count = 0;        Enumeration enum = lexicon.keys();        while (enum.hasMoreElements()) {            word_count++;            String key = (String) enum.nextElement();            Vector v = (Vector) lexicon.get(key);            p0(key + ":");            for (int i = 0, size = v.size(); i < size; i++) p0(" " + v.elementAt(i));            p("");        }        p("\ntotal number of unique words is " + word_count + "\n");        p("wordCount=" + wordCount + ", tagCount=" + tagCount);    }    /**     *  This is for debug only: print out training matrices in a CSV type format     *  so that the matices can be examined in a spreadsheet program for debugging purposes.     */    private void WriteCSVfile(Vector rowNames, Vector colNames, float[][] buf, String fileName) {        p("tagList.size()="+tagList.size());        try {            FileWriter fw = new FileWriter(fileName + ".txt");            PrintWriter bw = new PrintWriter(new BufferedWriter(fw));            // write the first title row:            StringBuffer sb = new StringBuffer(500);            for (int i = 0, size = colNames.size(); i < size; i++) {                sb.append("," + colNames.elementAt(i));            }            bw.println(sb.toString());            // loop on remaining rows:            for (int i = 0, size = buf.length; i < size; i++) {                sb.delete(0, sb.length());                sb.append(rowNames.elementAt(i));                for (int j = 0, size2 = buf[i].length; j < size2; j++) {                    sb.append("," + buf[i][j]);                }                bw.println(sb.toString());            }            bw.close();        } catch (IOException ioe) {            ioe.printStackTrace();        }    }    /**     *  Train a Markov model using manually tagged input text     */    public void train_model() {        // start by filling in the tag to tag transition count matrix:        tagToTagTransitionCount = new float[uniqueTagCount][uniqueTagCount];        p("tagCount="+tagCount);        p("uniqueTagCount="+uniqueTagCount);        for (int i = 0; i < uniqueTagCount; i++) {            for (int j = 0; j < uniqueTagCount; j++) {                tagToTagTransitionCount[i][j] = 0;            }        }        String tag1 = (String) tagList.elementAt(0);        int index1 = uniqueTags.indexOf(tag1);           // inefficient!        String tag0;        int index0;        for (int i = 0, size1 = wordList.size() - 1; i < size1; i++) {            tag0 = tag1;            index0 = index1;            tag1 = (String) tagList.elementAt(i + 1);            index1 = uniqueTags.indexOf(tag1);           // inefficient            tagToTagTransitionCount[index0][index1]++;        }        WriteCSVfile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag");        // now calculate the probabilities of transitioning from tagN to tagM:        probabilityTag1ToTag2 = new float[uniqueTagCount][uniqueTagCount];        for (int i = 0; i < uniqueTagCount; i++) {            int count = ((Integer)tags.get((String)uniqueTags.elementAt(i))).intValue();            p("tag: " + uniqueTags.elementAt(i) + ", count="+count);            for (int j = 0; j < uniqueTagCount; j++) {                probabilityTag1ToTag2[i][j] = 0.0001f + tagToTagTransitionCount[i][j] / (float)count;            }        }        WriteCSVfile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "prob_tag_to_tag");        // now calculate the probability of a word, given a preceeding tag:        probabilityWordGivenTag = new float[uniqueWordCount][uniqueTagCount];        for (int i = 0; i < uniqueWordCount; i++) {            String word = (String)uniqueWords.elementAt(i);            for (int j = 0; j < uniqueTagCount; j++) {                String tag = (String)uniqueTags.elementAt(j);                // note: index of tag is one less than index of emitted word we are testing:                int countTagOccurence = ((Integer)tags.get(tag)).intValue();                float wordWithTagOccurence = 0;                for (int n=0, sizem1=wordList.size()-1; n<sizem1; n++) {                    String testWord = (String)wordList.elementAt(n);                    String testTag  = (String)tagList.elementAt(n);                    if (testWord.equals(word) && testTag.equals(tag)) {                        wordWithTagOccurence++;                    }                }                probabilityWordGivenTag[i][j] = wordWithTagOccurence / (float)countTagOccurence;            }        }        WriteCSVfile(uniqueWords, uniqueTags, probabilityWordGivenTag, "prob_word_given_tag");    }    // data for exponential method of evaluating most probable tags for a sequence of words:    int [] indices;    int [] counts;    Vector possibleTags;    /**     *  Increment the class variable indices[] to point to the next possible set of tags     *  to check.     */    boolean incrementIndices(int num) { // uses the global arrays indices and counts        for (int i=0; i<num; i++) {            if (indices[i] < (counts[i] - 1)) {                indices[i] += 1;                for (int j=0; j<i; j++) {                    indices[j] = 0;                }                return true;            }        }        return false;    }    /**     *  For a sequence of words, values of class variable indices[], and class variable     *  possibleTags, evaluate how well tags rate using equation 10.7 in [Manning/Schutze, 1999]     */    float score(Vector words) { // uses global variables        float s = 1.0f;        int num = words.size();        float prob_tag_i_given_tag_i_minus_1 = 1.0f;        for (int i=0; i<num; i++) {            System.out.println("words["+i+"]="+words.elementAt(i));            int tag_index = indices[i];            if (i > 0) {                Vector v0 = (Vector)possibleTags.elementAt(i - 1);                Vector v1 = (Vector)possibleTags.elementAt(i);                int index1 = uniqueTags.indexOf(v0.elementAt(indices[i - 1]));                int index2 = uniqueTags.indexOf(v1.elementAt(indices[i]));                System.out.println("index1="+index1+"[tag: "+uniqueTags.elementAt(index1)+"], index2="+index2                            +"[tag: "+uniqueTags.elementAt(index2)+"]");                prob_tag_i_given_tag_i_minus_1 = probabilityTag1ToTag2[index1][index2];                int index3 = uniqueWords.indexOf("" + words.elementAt(i));                float p = probabilityWordGivenTag[index3][index2];                System.out.println("word: " + words.elementAt(i) + ", p="+p);                prob_tag_i_given_tag_i_minus_1 *= p;            }            s *= prob_tag_i_given_tag_i_minus_1;        }        return s;    }    /**     *  Use exponential runtime tagging algorithm (evaluates trained Markov model).     *  NOTE: do not use this algorithm for long input word sequences - instead,     *  break up long sequences of text into smaller pieces (i.e., process just     *  a few sentences at a time).     */    public Vector exponential_tagging_algorithm(Vector words) {        possibleTags = new Vector();        int num = words.size();        indices = new int[num];        counts = new int[num];        int [] best_indices = new int[num];        for (int i=0; i<num; i++) { indices[i] = 0; counts[i] = 0;}        for (int i=0; i<num; i++) {            String word = "" + words.elementAt(i);            Vector v = (Vector)lexicon.get(word);            Vector v2 = new Vector();  // possible tags at index i            for (int j=0; j<v.size(); j++) {                String tag = "" + v.elementAt(j);                if (v2.contains(tag) == false)   { v2.add(tag);  counts[i]++; }            }            possibleTags.add(v2);      // possible tags at index i            System.out.print("^^ word: " + word + ", tag count: " + counts[i] + ", tags: ");            for (int j=0; j<v2.size(); j++) System.out.print(" " + v2.elementAt(j));            System.out.println();        }        float best_score = -9999;        do {            System.out.print("Current indices:");            for (int k=0; k<num; k++) System.out.print(" " + indices[k]);            System.out.println();            float score = score(words);            if (score > best_score) {                best_score = score;                System.out.println(" **  ** new best score: " + best_score);                for (int m=0; m<num; m++) best_indices[m] = indices[m];            }        } while (incrementIndices(num));        Vector tags = new Vector(num);        for (int i=0; i<num; i++) {            Vector v = (Vector)possibleTags.elementAt(i);            tags.add(v.elementAt(best_indices[i]));        }        return tags;    }    /**     *   Throw away test method.     */    public void test_model() {        Vector words = new Vector();     words.add(".");        words.add("the"); words.add("dog"); words.add("chased"); words.add("the"); words.add("cat"); words.add(".");        words.add("mary"); words.add("went"); words.add("to"); words.add("the"); words.add("river"); words.add(".");        words.add("john"); words.add("saw"); words.add("mary"); words.add("bank"); words.add("the");        words.add("airplane"); words.add(".");        Vector tags = exponential_tagging_algorithm(words);        p("");        for (int i=0; i<words.size()-1; i++) {            p(""+words.elementAt(i)+"\t: "+tags.elementAt(i));        }    }    /**     *  Utility print method - with line feed     */    public void p(String s) {        System.out.println(s);    }    /**     *  Utility print method - no line feed     */    public void p0(String s) {        System.out.print(s);    }    Hashtable lexicon = new Hashtable();    Hashtable tags = new Hashtable();    Hashtable words = new Hashtable();    Vector uniqueTags = new Vector();    Vector uniqueWords = new Vector();    int uniqueTagCount;    int uniqueWordCount;    //String [] tagNames;    int tagCount = 0;                  // from training text    int wordCount = 0;                 // from training text    Vector wordList = new Vector();    // from training text    Vector tagList = new Vector();     // from training text    float[][] tagToTagTransitionCount;      // [num_tag][num_tag]    int[][] wordCountByTag;    float [][]probabilityTag1ToTag2;    float [][]probabilityWordGivenTag;}
💿 文件大小 1685 K
👤 上传用户 leeixndong
📂 所属分类 Java编程
🏷️ 相关标签

#JAVA #人工智能 #代码 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -