📄 parserme.java
   * Advances the parse by assigning it POS tags and returns multiple tag sequences.
   * @param p The parse to be tagged.
   * @return Parses with different POS-tag sequence assignments.
   */
  protected Parse[] advanceTags(final Parse p) {
    Parse[] children = p.getChildren();
    String[] words = new String[children.length];
    double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
      words[i] = children[i].toString();
    }
    // get the k best POS-tag sequences for this parse's tokens
    Sequence[] ts = tagger.topKSequences(words);
    if (ts.length == 0) {
      System.err.println("no tag sequence");
    }
    Parse[] newParses = new Parse[ts.length];
    for (int i = 0; i < ts.length; i++) {
      String[] tags = (String[]) ts[i].getOutcomes().toArray(new String[words.length]);
      ts[i].getProbs(probs);
      newParses[i] = (Parse) p.clone(); //copies top level
      if (createDerivationString) newParses[i].getDerivation().append(i).append(".");
      for (int j = 0; j < words.length; j++) {
        Parse word = children[j];
        //System.err.println("inserting tag "+tags[j]);
        double prob = probs[j];
        newParses[i].insert(new Parse(word.getText(), word.getSpan(), tags[j], prob));
        newParses[i].addProb(Math.log(prob));
        //newParses[i].show();
      }
    }
    return newParses;
  }

  /**
   * Removes the punctuation from the specified set of chunks, adds it to the parses
   * adjacent to the punctuation, and returns a new array of parses with the punctuation
   * removed.
   * @param chunks A set of parses.
   * @param punctSet The set of punctuation which is to be removed.
   * @return An array of parses which is a subset of chunks with punctuation removed.
   */
  public static Parse[] collapsePunctuation(Parse[] chunks, Set punctSet) {
    List collapsedParses = new ArrayList(chunks.length);
    int lastNonPunct = -1;
    int nextNonPunct = -1;
    for (int ci = 0, cn = chunks.length; ci < cn; ci++) {
      if (punctSet.contains(chunks[ci].getType())) {
        if (lastNonPunct >= 0) {
          chunks[lastNonPunct].addNextPunctuation(chunks[ci]);
        }
        for (nextNonPunct = ci + 1; nextNonPunct < cn; nextNonPunct++) {
          if (!punctSet.contains(chunks[nextNonPunct].getType())) {
            break;
          }
        }
        if (nextNonPunct < cn) {
          chunks[nextNonPunct].addPreviousPunctuation(chunks[ci]);
        }
      }
      else {
        collapsedParses.add(chunks[ci]);
        lastNonPunct = ci;
      }
    }
    if (collapsedParses.size() == chunks.length) {
      return chunks;
    }
    //System.err.println("collapsedPunctuation: collapsedParses"+collapsedParses);
    return (Parse[]) collapsedParses.toArray(new Parse[collapsedParses.size()]);
  }

  public static GISModel train(opennlp.maxent.EventStream es, int iterations, int cut) throws java.io.IOException {
    return opennlp.maxent.GIS.trainModel(iterations, new TwoPassDataIndexer(es, cut));
  }

  private static boolean lastChild(Parse child, Parse parent, Set punctSet) {
    Parse[] kids = collapsePunctuation(parent.getChildren(), punctSet);
    return (kids[kids.length - 1] == child);
  }

  private static void usage() {
    System.err.println("Usage: ParserME -[dict|tag|chunk|build|check|fun] trainingFile parserModelDirectory [iterations cutoff]");
    System.err.println();
    System.err.println("Training file should be one sentence per line where each line consists of a Penn Treebank Style parse");
    System.err.println("-dict Just build the dictionaries.");
    System.err.println("-tag Just build the tagging model.");
    System.err.println("-chunk Just build the chunking model.");
    System.err.println("-build Just build the build model.");
    System.err.println("-check Just build the check model.");
    System.err.println("-fun Predict function tags.");
  }
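  /*
   * Illustrative sketch, not part of the original source: how collapsePunctuation()
   * above is combined with a sentence's initial chunks, mirroring what
   * buildDictionary() below does. The bracketed parse string is an assumed example,
   * and "rules" stands for a HeadRules instance loaded as in main() further down.
   *
   *   Parse p = Parse.parseParse("(TOP (S (NP (DT The) (NN dog)) (VP (VBD barked)) (. .)))");
   *   p.updateHeads(rules);
   *   Parse[] chunks = ParserEventStream.getInitialChunks(p);
   *   Parse[] collapsed = collapsePunctuation(chunks, rules.getPunctuationTags());
   *
   * Assuming "." is in the punctuation tag set, the trailing punctuation chunk is
   * omitted from the returned array and recorded on the preceding chunk via
   * addNextPunctuation().
   */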
  /**
   * Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
   * @param data The data stream of parses.
   * @param rules The head rules for the parses.
   * @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
   * @return A dictionary object.
   */
  private static MutableDictionary buildDictionary(DataStream data, HeadRules rules, int cutoff) {
    MutableDictionary mdict = new MutableDictionary(cutoff);
    while (data.hasNext()) {
      String parseStr = (String) data.nextToken();
      Parse p = Parse.parseParse(parseStr);
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi = 0; wi < words.length; wi++) {
        words[wi] = pwords[wi].toString();
      }
      mdict.add(words, 1, true);
      //add tri-grams and bi-grams for initial sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p), rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi = 0; wi < cwords.length; wi++) {
        cwords[wi] = chunks[wi].getHead().toString();
      }
      mdict.add(cwords, 3, false);
      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().toString()+" chunks.length="+chunks.length);
        if (lastChild(chunks[ci], chunks[ci].getParent(), rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >= 0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks, ci, chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            //collect up to a 5-token window of chunk heads around the reduction point
            String[] window = new String[5];
            int wi = 0;
            if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].getHead().toString();
            if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].getHead().toString();
            window[wi++] = chunks[ci].getHead().toString();
            if (ci + 1 < chunks.length) window[wi++] = chunks[ci + 1].getHead().toString();
            if (ci + 2 < chunks.length) window[wi++] = chunks[ci + 2].getHead().toString();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi = 0; swi < wi; swi++) {
                subWindow[swi] = window[swi];
              }
              window = subWindow;
            }
            if (window.length >= 3) {
              mdict.add(window, 3, false);
            }
            else if (window.length == 2) {
              mdict.add(window, 2, false);
            }
          }
          ci = reduceStart - 1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    return mdict;
  }

  public static void main(String[] args) throws java.io.IOException {
    if (args.length < 3) {
      usage();
      System.exit(1);
    }
    boolean dict = false;
    boolean tag = false;
    boolean chunk = false;
    boolean build = false;
    boolean check = false;
    boolean fun = false;
    boolean all = true;
    int argIndex = 0;
    //parse command-line flags; specifying any flag disables the default "train everything" behavior
    while (args[argIndex].startsWith("-")) {
      all = false;
      if (args[argIndex].equals("-dict")) {
        dict = true;
      }
      else if (args[argIndex].equals("-tag")) {
        tag = true;
      }
      else if (args[argIndex].equals("-chunk")) {
        chunk = true;
      }
      else if (args[argIndex].equals("-build")) {
        build = true;
      }
      else if (args[argIndex].equals("-check")) {
        check = true;
      }
      else if (args[argIndex].equals("-fun")) {
        fun = true;
      }
      else if (args[argIndex].equals("--")) {
        argIndex++;
        break;
      }
      else {
        System.err.println("Invalid option " + args[argIndex]);
        usage();
        System.exit(1);
      }
      argIndex++;
    }
    java.io.File inFile = new java.io.File(args[argIndex++]);
    String modelDirectory = args[argIndex++];
    HeadRules rules = new opennlp.tools.lang.english.HeadRules(modelDirectory + "/head_rules");
    java.io.File dictFile = new java.io.File(modelDirectory + "/dict.bin.gz");
    java.io.File tagFile = new java.io.File(modelDirectory + "/tag.bin.gz");
    java.io.File chunkFile = new java.io.File(modelDirectory + "/chunk.bin.gz");
    java.io.File buildFile = new java.io.File(modelDirectory + "/build.bin.gz");
    java.io.File checkFile = new java.io.File(modelDirectory + "/check.bin.gz");
    int iterations = 100;
    int cutoff = 5;
    if (args.length > argIndex) {
      iterations = Integer.parseInt(args[argIndex++]);
      cutoff = Integer.parseInt(args[argIndex++]);
    }
    if (fun) {
      Parse.useFunctionTags(true);
    }
    if (dict || all) {
      System.err.println("Building dictionary");
      DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
      MutableDictionary mdict = buildDictionary(data, rules, cutoff);
      System.out.println("Saving the dictionary");
      mdict.persist(dictFile);
    }
    if (tag || all) {
      System.err.println("Training tagger");
      System.err.println("Loading Dictionary");
      Dictionary tridict = new Dictionary(dictFile.toString());
      opennlp.maxent.EventStream tes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.TAG, tridict);
      GISModel tagModel = train(tes, iterations, cutoff);
      System.out.println("Saving the tagger model as: " + tagFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(tagModel, tagFile).persist();
    }
    if (chunk || all) {
      System.err.println("Training chunker");
      opennlp.maxent.EventStream ces = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.CHUNK);
      GISModel chunkModel = train(ces, iterations, cutoff);
      System.out.println("Saving the chunker model as: " + chunkFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(chunkModel, chunkFile).persist();
    }
    if (build || all) {
      System.err.println("Loading Dictionary");
      Dictionary tridict = new Dictionary(dictFile.toString());
      System.err.println("Training builder");
      opennlp.maxent.EventStream bes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.BUILD, tridict);
      GISModel buildModel = train(bes, iterations, cutoff);
      System.out.println("Saving the build model as: " + buildFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(buildModel, buildFile).persist();
    }
    if (check || all) {
      System.err.println("Training checker");
      opennlp.maxent.EventStream kes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.CHECK);
      GISModel checkModel = train(kes, iterations, cutoff);
      System.out.println("Saving the check model as: " + checkFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(checkModel, checkFile).persist();
    }
  }
}
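A minimal sketch of driving the training entry point above programmatically rather than from the command line, following the flags documented in usage(). The class name ParserME is taken from the usage text; the training file name, the model directory, and the fully qualified package in the import are assumed examples rather than values from the original source, and the model directory must already contain the head_rules file that main() loads (plus dict.bin.gz for -tag or -build if -dict is not run in the same invocation).

//A minimal sketch, not part of the original file; file names and package are assumed.
import opennlp.tools.parser.ParserME; //package assumed from the standard OpenNLP tools layout

public class TrainParserModels {
  public static void main(String[] args) throws java.io.IOException {
    //Equivalent to: ParserME -dict -tag -chunk train.parse models/parser 100 5
    ParserME.main(new String[] {
        "-dict", "-tag", "-chunk", //train only these components; omit all flags to train everything
        "train.parse",             //one Penn Treebank style parse per line (assumed file name)
        "models/parser",           //model directory containing head_rules (assumed path)
        "100", "5"                 //iterations and cutoff (the same defaults main() uses)
    });
  }
}

Passing no flags at all (just the training file and the model directory) takes the "all" branch in main() and builds the dictionary plus the tag, chunk, build, and check models in a single run.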