📄 parserme.java
   * Advances the parse by assigning it POS tags and returns multiple tag sequences.
   * @param p The parse to be tagged.
   * @return Parses with different POS-tag sequence assignments.
   */
  protected Parse[] advanceTags(final Parse p) {
    Parse[] children = p.getChildren();
    String[] words = new String[children.length];
    double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
      words[i] = children[i].toString();
    }
    // get the k best POS-tag sequences for this parse's tokens
    Sequence[] ts = tagger.topKSequences(words);
    if (ts.length == 0) {
      System.err.println("no tag sequence");
    }
    Parse[] newParses = new Parse[ts.length];
    for (int i = 0; i < ts.length; i++) {
      String[] tags = (String[]) ts[i].getOutcomes().toArray(new String[words.length]);
      ts[i].getProbs(probs);
      newParses[i] = (Parse) p.clone(); //copies top level
      if (createDerivationString) newParses[i].getDerivation().append(i).append(".");
      for (int j = 0; j < words.length; j++) {
        Parse word = children[j];
        //System.err.println("inserting tag "+tags[j]);
        double prob = probs[j];
        newParses[i].insert(new Parse(word.getText(), word.getSpan(), tags[j], prob));
        newParses[i].addProb(Math.log(prob));
        //newParses[i].show();
      }
    }
    return newParses;
  }

  /**
   * Removes the punctuation from the specified set of chunks, adds it to the parses
   * adjacent to the punctuation, and returns a new array of parses with the punctuation
   * removed.
   * @param chunks A set of parses.
   * @param punctSet The set of punctuation which is to be removed.
   * @return An array of parses which is a subset of chunks with punctuation removed.
   */
  public static Parse[] collapsePunctuation(Parse[] chunks, Set punctSet) {
    List collapsedParses = new ArrayList(chunks.length);
    int lastNonPunct = -1;
    int nextNonPunct = -1;
    for (int ci = 0, cn = chunks.length; ci < cn; ci++) {
      if (punctSet.contains(chunks[ci].getType())) {
        if (lastNonPunct >= 0) {
          chunks[lastNonPunct].addNextPunctuation(chunks[ci]);
        }
        for (nextNonPunct = ci + 1; nextNonPunct < cn; nextNonPunct++) {
          if (!punctSet.contains(chunks[nextNonPunct].getType())) {
            break;
          }
        }
        if (nextNonPunct < cn) {
          chunks[nextNonPunct].addPreviousPunctuation(chunks[ci]);
        }
      }
      else {
        collapsedParses.add(chunks[ci]);
        lastNonPunct = ci;
      }
    }
    if (collapsedParses.size() == chunks.length) {
      return chunks;
    }
    //System.err.println("collapsedPunctuation: collapsedParses"+collapsedParses);
    return (Parse[]) collapsedParses.toArray(new Parse[collapsedParses.size()]);
  }

  public static GISModel train(opennlp.maxent.EventStream es, int iterations, int cut) throws java.io.IOException {
    return opennlp.maxent.GIS.trainModel(iterations, new TwoPassDataIndexer(es, cut));
  }

  private static boolean lastChild(Parse child, Parse parent, Set punctSet) {
    Parse[] kids = collapsePunctuation(parent.getChildren(), punctSet);
    return (kids[kids.length - 1] == child);
  }

  private static void usage() {
    System.err.println("Usage: ParserME -[dict|tag|chunk|build|check|fun] trainingFile parserModelDirectory [iterations cutoff]");
    System.err.println();
    System.err.println("Training file should be one sentence per line where each line consists of a Penn Treebank Style parse");
    System.err.println("-dict Just build the dictionaries.");
    System.err.println("-tag Just build the tagging model.");
    System.err.println("-chunk Just build the chunking model.");
    System.err.println("-build Just build the build model.");
    System.err.println("-check Just build the check model.");
    System.err.println("-fun Predict function tags.");
  }
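  /*
   * Illustrative sketch, not part of the original source: how collapsePunctuation()
   * above is combined with a sentence's initial chunks, mirroring what
   * buildDictionary() below does. The bracketed parse string is an assumed example,
   * and "rules" stands for a HeadRules instance loaded as in main() further down.
   *
   *   Parse p = Parse.parseParse("(TOP (S (NP (DT The) (NN dog)) (VP (VBD barked)) (. .)))");
   *   p.updateHeads(rules);
   *   Parse[] chunks = ParserEventStream.getInitialChunks(p);
   *   Parse[] collapsed = collapsePunctuation(chunks, rules.getPunctuationTags());
   *
   * Assuming "." is in the punctuation tag set, the trailing punctuation chunk is
   * omitted from the returned array and recorded on the preceding chunk via
   * addNextPunctuation().
   */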
  /**
   * Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
   * @param data The data stream of parses.
   * @param rules The head rules for the parses.
   * @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
   * @return A dictionary object.
   */
  private static MutableDictionary buildDictionary(DataStream data, HeadRules rules, int cutoff) {
    MutableDictionary mdict = new MutableDictionary(cutoff);
    while (data.hasNext()) {
      String parseStr = (String) data.nextToken();
      Parse p = Parse.parseParse(parseStr);
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi = 0; wi < words.length; wi++) {
        words[wi] = pwords[wi].toString();
      }
      mdict.add(words, 1, true);
      //add tri-grams and bi-grams for initial sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p), rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi = 0; wi < cwords.length; wi++) {
        cwords[wi] = chunks[wi].getHead().toString();
      }
      mdict.add(cwords, 3, false);
      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().toString()+" chunks.length="+chunks.length);
        if (lastChild(chunks[ci], chunks[ci].getParent(), rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >= 0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks, ci, chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            //collect up to a 5-token window of chunk heads around the reduction point
            String[] window = new String[5];
            int wi = 0;
            if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].getHead().toString();
            if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].getHead().toString();
            window[wi++] = chunks[ci].getHead().toString();
            if (ci + 1 < chunks.length) window[wi++] = chunks[ci + 1].getHead().toString();
            if (ci + 2 < chunks.length) window[wi++] = chunks[ci + 2].getHead().toString();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi = 0; swi < wi; swi++) {
                subWindow[swi] = window[swi];
              }
              window = subWindow;
            }
            if (window.length >= 3) {
              mdict.add(window, 3, false);
            }
            else if (window.length == 2) {
              mdict.add(window, 2, false);
            }
          }
          ci = reduceStart - 1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    return mdict;
  }

  public static void main(String[] args) throws java.io.IOException {
    if (args.length < 3) {
      usage();
      System.exit(1);
    }
    boolean dict = false;
    boolean tag = false;
    boolean chunk = false;
    boolean build = false;
    boolean check = false;
    boolean fun = false;
    boolean all = true;
    int argIndex = 0;
    //parse command-line flags; specifying any flag disables the default "train everything" behavior
    while (args[argIndex].startsWith("-")) {
      all = false;
      if (args[argIndex].equals("-dict")) {
        dict = true;
      }
      else if (args[argIndex].equals("-tag")) {
        tag = true;
      }
      else if (args[argIndex].equals("-chunk")) {
        chunk = true;
      }
      else if (args[argIndex].equals("-build")) {
        build = true;
      }
      else if (args[argIndex].equals("-check")) {
        check = true;
      }
      else if (args[argIndex].equals("-fun")) {
        fun = true;
      }
      else if (args[argIndex].equals("--")) {
        argIndex++;
        break;
      }
      else {
        System.err.println("Invalid option " + args[argIndex]);
        usage();
        System.exit(1);
      }
      argIndex++;
    }
    java.io.File inFile = new java.io.File(args[argIndex++]);
    String modelDirectory = args[argIndex++];
    HeadRules rules = new opennlp.tools.lang.english.HeadRules(modelDirectory + "/head_rules");
    java.io.File dictFile = new java.io.File(modelDirectory + "/dict.bin.gz");
    java.io.File tagFile = new java.io.File(modelDirectory + "/tag.bin.gz");
    java.io.File chunkFile = new java.io.File(modelDirectory + "/chunk.bin.gz");
    java.io.File buildFile = new java.io.File(modelDirectory + "/build.bin.gz");
    java.io.File checkFile = new java.io.File(modelDirectory + "/check.bin.gz");
    int iterations = 100;
    int cutoff = 5;
    if (args.length > argIndex) {
      iterations = Integer.parseInt(args[argIndex++]);
      cutoff = Integer.parseInt(args[argIndex++]);
    }
    if (fun) {
      Parse.useFunctionTags(true);
    }
    if (dict || all) {
      System.err.println("Building dictionary");
      DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
      MutableDictionary mdict = buildDictionary(data, rules, cutoff);
      System.out.println("Saving the dictionary");
      mdict.persist(dictFile);
    }
    if (tag || all) {
      System.err.println("Training tagger");
      System.err.println("Loading Dictionary");
      Dictionary tridict = new Dictionary(dictFile.toString());
      opennlp.maxent.EventStream tes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.TAG, tridict);
      GISModel tagModel = train(tes, iterations, cutoff);
      System.out.println("Saving the tagger model as: " + tagFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(tagModel, tagFile).persist();
    }
    if (chunk || all) {
      System.err.println("Training chunker");
      opennlp.maxent.EventStream ces = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.CHUNK);
      GISModel chunkModel = train(ces, iterations, cutoff);
      System.out.println("Saving the chunker model as: " + chunkFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(chunkModel, chunkFile).persist();
    }
    if (build || all) {
      System.err.println("Loading Dictionary");
      Dictionary tridict = new Dictionary(dictFile.toString());
      System.err.println("Training builder");
      opennlp.maxent.EventStream bes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.BUILD, tridict);
      GISModel buildModel = train(bes, iterations, cutoff);
      System.out.println("Saving the build model as: " + buildFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(buildModel, buildFile).persist();
    }
    if (check || all) {
      System.err.println("Training checker");
      opennlp.maxent.EventStream kes = new ParserEventStream(new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile)), rules, EventTypeEnum.CHECK);
      GISModel checkModel = train(kes, iterations, cutoff);
      System.out.println("Saving the check model as: " + checkFile);
      new opennlp.maxent.io.SuffixSensitiveGISModelWriter(checkModel, checkFile).persist();
    }
  }
}
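A minimal sketch of driving the training entry point above programmatically rather than from the command line, following the flags documented in usage(). The class name ParserME is taken from the usage text; the training file name, the model directory, and the fully qualified package in the import are assumed examples rather than values from the original source, and the model directory must already contain the head_rules file that main() loads (plus dict.bin.gz for -tag or -build if -dict is not run in the same invocation).

//A minimal sketch, not part of the original file; file names and package are assumed.
import opennlp.tools.parser.ParserME; //package assumed from the standard OpenNLP tools layout

public class TrainParserModels {
  public static void main(String[] args) throws java.io.IOException {
    //Equivalent to: ParserME -dict -tag -chunk train.parse models/parser 100 5
    ParserME.main(new String[] {
        "-dict", "-tag", "-chunk", //train only these components; omit all flags to train everything
        "train.parse",             //one Penn Treebank style parse per line (assumed file name)
        "models/parser",           //model directory containing head_rules (assumed path)
        "100", "5"                 //iterations and cutoff (the same defaults main() uses)
    });
  }
}

Passing no flags at all (just the training file and the model directory) takes the "all" branch in main() and builds the dictionary plus the tag, chunk, build, and check models in a single run.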