📄 parserme.java

📁 自然语言处理领域的一个开发包
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
      return new Parse[] {(Parse) parses.first()};    }    else {      List topParses = new ArrayList(numParses);      while(!parses.isEmpty() && topParses.size() < numParses) {        Parse tp = (Parse) parses.first();        topParses.add(tp);        parses.remove(tp);      }      return (Parse[]) topParses.toArray(new Parse[topParses.size()]);    }  }    /**   * Assigns parent references for the specified parse so that they   * are consistent with the children references.   * @param p The parse whose parent references need to be assigned.     */  public static void setParents(Parse p) {    Parse[] children = p.getChildren();    for (int ci = 0; ci < children.length; ci++) {      children[ci].setParent(p);      setParents(children[ci]);    }  }  /**   * Returns a parse for the specified parse of tokens.   * @param tokens The root node of a flat parse containing only tokens.    * @return A full parse of the specified tokens or the flat chunks of the tokens if a fullparse could not be found.   */  public Parse parse(Parse tokens) {    Parse p = parse(tokens,1)[0];    setParents(p);    return p;  }  /**   * Adds the "TOP" node to the specified parse.   * @param p The complete parse.   */  protected void advanceTop(Parse p) {    buildModel.eval(buildContextGenerator.getContext(p.getChildren(), 0), bprobs);    p.addProb(Math.log(bprobs[topStartIndex]));    checkModel.eval(checkContextGenerator.getContext(p.getChildren(), TOP_NODE, 0, 0), cprobs);    p.addProb(Math.log(cprobs[completeIndex]));    p.setType(TOP_NODE);  }  /**   * Determines the mapping between the specified index into the specified parses without punctuation to    * the coresponding index into the specified parses.   * @param index An index into the parses without punctuation.   * @param nonPunctParses The parses without punctuation.   * @param parses The parses wit punctuation.   * @return An index into the specified parses which coresponds to the same node the specified index   * into the parses with punctuation.   */  private int mapParseIndex(int index, Parse[] nonPunctParses, Parse[] parses) {    int parseIndex = index;    while (parses[parseIndex] != nonPunctParses[index]) {      parseIndex++;    }    return parseIndex;  }  /** Advances the specified parse and returns the an array advanced parses whose probability accounts for   * more than the speicficed amount of probability mass, Q.   * @param p The parse to advance.   * @param Q The amount of probability mass that should be accounted for by the advanced parses.    */  protected Parse[] advanceParses(final Parse p, double Q) {    double q = 1 - Q;    /** The closest previous node which has been labeled as a start node. */    Parse lastStartNode = null;    /** The index of the closest previous node which has been labeled as a start node. */    int lastStartIndex = -1;    /** The type of the closest previous node which has been labeled as a start node. */    String lastStartType = null;    /** The index of the node which will be labeled in this iteration of advancing the parse. */    int advanceNodeIndex;    /** The node which will be labeled in this iteration of advancing the parse. */    Parse advanceNode=null;    Parse[] originalChildren = p.getChildren();    Parse[] children = collapsePunctuation(originalChildren,punctSet);    int numNodes = children.length;    if (numNodes == 0) {      return null;    }    //determines which node needs to be labeled and prior labels.    for (advanceNodeIndex = 0; advanceNodeIndex < numNodes; advanceNodeIndex++) {      advanceNode = children[advanceNodeIndex];      if (advanceNode.getLabel() == null) {        break;      }      else if (startTypeMap.containsKey(advanceNode.getLabel())) {        lastStartType = (String) startTypeMap.get(advanceNode.getLabel());        lastStartNode = advanceNode;        lastStartIndex = advanceNodeIndex;        //System.err.println("lastStart "+i+" "+lastStart.label+" "+lastStart.prob);      }    }    int originalAdvanceIndex = mapParseIndex(advanceNodeIndex,children,originalChildren);    List newParsesList = new ArrayList(buildModel.getNumOutcomes());    //call build    buildModel.eval(buildContextGenerator.getContext(children, advanceNodeIndex), bprobs);    double bprobSum = 0;    while (bprobSum < Q) {      /** The largest unadvanced labeling. */       int max = 0;      for (int pi = 1; pi < bprobs.length; pi++) { //for each build outcome        if (bprobs[pi] > bprobs[max]) {          max = pi;        }      }      if (bprobs[max] == 0) {        break;      }      double bprob = bprobs[max];      bprobs[max] = 0; //zero out so new max can be found      bprobSum += bprob;      String tag = buildModel.getOutcome(max);      //System.out.println("trying "+tag+" "+bprobSum+" lst="+lst);      if (max == topStartIndex) { // can't have top until complete        continue;      }      //System.err.println(i+" "+tag+" "+bprob);      if (startTypeMap.containsKey(tag)) { //update last start        lastStartIndex = advanceNodeIndex;        lastStartNode = advanceNode;        lastStartType = (String) startTypeMap.get(tag);      }      else if (contTypeMap.containsKey(tag)) {        if (lastStartNode == null || !lastStartType.equals(contTypeMap.get(tag))) {          continue; //Cont must match previous start or continue        }      }      Parse newParse1 = (Parse) p.clone(); //clone parse      if (createDerivationString) newParse1.getDerivation().append(max).append("-");            newParse1.setChild(originalAdvanceIndex,tag); //replace constituent labeled      newParse1.addProb(Math.log(bprob));      //check      //String[] context = checkContextGenerator.getContext(newParse1.getChildren(), lastStartType, lastStartIndex, advanceNodeIndex);      checkModel.eval(checkContextGenerator.getContext(collapsePunctuation(newParse1.getChildren(),punctSet), lastStartType, lastStartIndex, advanceNodeIndex), cprobs);      //System.out.println("check "+lastStartType+" "+cprobs[completeIndex]+" "+cprobs[incompleteIndex]+" "+tag+" "+java.util.Arrays.asList(context));      Parse newParse2 = newParse1;      if (cprobs[completeIndex] > q) { //make sure a reduce is likely        newParse2 = (Parse) newParse1.clone();        if (createDerivationString) newParse2.getDerivation().append(1).append(".");        newParse2.addProb(Math.log(cprobs[1]));        Parse[] cons = new Parse[advanceNodeIndex - lastStartIndex + 1];        boolean flat = true;        //first        cons[0] = lastStartNode;        if (!cons[0].getType().equals(cons[0].getHead().getType())) {          flat = false;        }        //last        cons[advanceNodeIndex - lastStartIndex] = advanceNode;        if (flat && !cons[advanceNodeIndex - lastStartIndex].getType().equals(cons[advanceNodeIndex - lastStartIndex].getHead().getType())) {          flat = false;        }        //middle        for (int ci = 1; ci < advanceNodeIndex - lastStartIndex; ci++) {          cons[ci] = children[ci + lastStartIndex];          if (flat && !cons[ci].getType().equals(cons[ci].getHead().getType())) {            flat = false;          }        }        if (!flat) { //flat chunks are done by chunker          if (lastStartIndex == 0 && advanceNodeIndex == numNodes-1) { //check for top node to include end and begining punctuation            //System.err.println("ParserME.advanceParses: reducing entire span: "+new Span(lastStartNode.getSpan().getStart(), advanceNode.getSpan().getEnd())+" "+lastStartType+" "+java.util.Arrays.asList(children));            newParse2.insert(new Parse(p.getText(), p.getSpan(), lastStartType, cprobs[1], headRules.getHead(cons, lastStartType)));          }          else {            newParse2.insert(new Parse(p.getText(), new Span(lastStartNode.getSpan().getStart(), advanceNode.getSpan().getEnd()), lastStartType, cprobs[1], headRules.getHead(cons, lastStartType)));          }          newParsesList.add(newParse2);        }      }      if (cprobs[incompleteIndex] > q) { //make sure a shift is likly        if (createDerivationString) newParse1.getDerivation().append(0).append(".");        if (advanceNodeIndex != numNodes - 1) { //can't shift last element          newParse1.addProb(Math.log(cprobs[0]));          newParsesList.add(newParse1);        }      }    }    Parse[] newParses = new Parse[newParsesList.size()];    newParsesList.toArray(newParses);    return newParses;  }  /**   * Reutrns the top chunk sequences for the specified parse.   * @param p A pos-tag assigned parse.   * @return The top chunk assignments to the specified parse.   */  protected Parse[] advanceChunks(final Parse p, double minChunkScore) {    // chunk    Parse[] children = p.getChildren();    String words[] = new String[children.length];    String ptags[] = new String[words.length];    double probs[] = new double[words.length];    Parse sp = null;    for (int i = 0, il = children.length; i < il; i++) {      sp = children[i];      words[i] = sp.getHead().toString();      ptags[i] = sp.getType();    }    //System.err.println("adjusted mcs = "+(minChunkScore-p.getProb()));    Sequence[] cs = chunker.topKSequences(words, ptags,minChunkScore-p.getProb());    Parse[] newParses = new Parse[cs.length];    for (int si = 0, sl = cs.length; si < sl; si++) {      newParses[si] = (Parse) p.clone(); //copies top level      if (createDerivationString) newParses[si].getDerivation().append(si).append(".");      String[] tags = (String[]) cs[si].getOutcomes().toArray(new String[words.length]);      cs[si].getProbs(probs);      int start = -1;      int end = 0;      String type = null;      //System.err.print("sequence "+si+" ");      for (int j = 0; j <= tags.length; j++) {        //if (j != tags.length) {System.err.println(words[j]+" "+ptags[j]+" "+tags[j]+" "+probs.get(j));}        if (j != tags.length) {          newParses[si].addProb(Math.log(probs[j]));        }        if (j != tags.length && tags[j].startsWith(CONT)) { // if continue just update end chunking tag don't use contTypeMap          end = j;        }        else { //make previous constituent if it exists          if (type != null) {            //System.err.println("inserting tag "+tags[j]);            Parse p1 = p.getChildren()[start];            Parse p2 = p.getChildren()[end];            //System.err.println("Putting "+type+" at "+start+","+end+" "+newParses[si].prob);            Parse[] cons = new Parse[end - start + 1];            cons[0] = p1;            //cons[0].label="Start-"+type;            if (end - start != 0) {              cons[end - start] = p2;              //cons[end-start].label="Cont-"+type;              for (int ci = 1; ci < end - start; ci++) {                cons[ci] = p.getChildren()[ci + start];                //cons[ci].label="Cont-"+type;              }            }            newParses[si].insert(new Parse(p1.getText(), new Span(p1.getSpan().getStart(), p2.getSpan().getEnd()), type, 1, headRules.getHead(cons, type)));          }          if (j != tags.length) { //update for new constituent            if (tags[j].startsWith(START)) { // don't use startTypeMap these are chunk tags              type = tags[j].substring(START.length());              start = j;              end = j;            }            else { // other               type = null;            }          }        }      }      //newParses[si].show();System.out.println();    }    return newParses;  }  /**
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -