📄 mentionpairiterator.java
字号:
/* Copyright (C) 2002 Dept. of Computer Science, Univ. of Massachusetts, Amherst This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This program toolkit free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For more details see the GNU General Public License and the file README-LEGAL. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *//** @author Ben Wellner */package edu.umass.cs.mallet.projects.seg_plus_coref.anaphora;import org.jdom.input.*;import org.xml.sax.*;import org.xml.sax.ext.LexicalHandler;import org.xml.sax.helpers.DefaultHandler;import org.xml.sax.helpers.XMLReaderFactory;import org.jdom.Document;import org.jdom.JDOMException;import org.jdom.*;import org.jdom.output.XMLOutputter;import java.util.*;import java.util.regex.Pattern;import java.util.regex.Matcher;import java.io.File;import java.io.*;import java.net.URI;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.projects.seg_plus_coref.anaphora.*;public class MentionPairIterator extends AbstractPipeInputIterator{ public static final int NP_LOOKBACK = 20000; // need to change this to look at sentences FileIterator fileIterator; public static final String [] pronouns = new String[] {"He", "he", "Him", "him", "His", "his", "She", "she", "Her", "her", "hers", "it", "It", "its", "Its", "itself", "himself", "herself"}; /* public static final String [] pronouns = new String[] {"He", "he", "Him", "him", "They", "they", "I", "We", "we", "Our", "our","His", "his", "She", "she", "Her", "her", "hers", "it", "their", "It", "Their", "its", "Its", "itself", "himself", "herself", "ourselves", "themselves"}; */ public static final String [] malePronouns = new String[] {"He", "he", "Him", "him", "His", "his", "himself"}; public static final String [] femalePronouns = new String[] {"She","she","Her","her","hers","herself"}; public static final int pronounsSize = 18; public static final int numMalePronouns = 7; public static final int numFemalePronouns = 6; int refIndex = 0; SAXBuilder builder = null; Document currentDocument; // JDOM structure MalletDocument malletDocument; // Mallet document structure, optimized for access DocumentMentionPairIterator docNodePairIterator; java.util.Vector allDocuments; File targetDocPath; boolean positiveAntecedent = false; String sourceType = ""; boolean addNullAntecedent; int numberOfReferents = 0; boolean includeProperNouns = false; boolean includeEverything = false; List filters; ArrayList nodePairArray; public MentionPairIterator (FileIterator fi, String sourceType) { this (fi, sourceType, true); } public MentionPairIterator (FileIterator fi, String sourceType, boolean addNullAntecedent, boolean includeNNPs, boolean includeAll, List filters) { this.includeProperNouns = includeNNPs; this.sourceType = sourceType; this.addNullAntecedent = addNullAntecedent; this.includeEverything = includeAll; this.filters = filters; constructAux(fi, sourceType); } public MentionPairIterator (FileIterator fi, String sourceType, boolean addNullAntecedent, boolean includeNNPs) { this.includeProperNouns = includeNNPs; this.sourceType = sourceType; this.addNullAntecedent = addNullAntecedent; constructAux(fi, sourceType); } public MentionPairIterator (FileIterator fi, String sourceType, boolean addNullAntecedent) { this.sourceType = sourceType; this.addNullAntecedent = addNullAntecedent; constructAux(fi, sourceType); } private void constructAux (FileIterator fi, String sourceType) { fileIterator = fi; builder = new SAXBuilder(); allDocuments = new java.util.Vector(); if ((currentDocument == null) && fileIterator.hasNext()) { try { File openDocPath = (File)fileIterator.nextInstance().getData(); targetDocPath = new File(((String)openDocPath.getAbsolutePath()).concat(".sys")); currentDocument = builder.build(openDocPath); malletDocument = new MalletDocument(currentDocument, sourceType); allDocuments.add(currentDocument); docNodePairIterator = new DocumentMentionPairIterator(); } catch (JDOMException e) { e.printStackTrace(); } } } public int getNumReferents () { return numberOfReferents; } public java.util.Vector getAllDocuments () { return allDocuments; } public MentionPair getNextMentionPairFromDocument() { if (docNodePairIterator.hasNext()) return (MentionPair)docNodePairIterator.next(); else return null; } public MentionPair getNextMentionPair () { MentionPair pair = null; File openDocPath = null; while (((pair = (MentionPair)docNodePairIterator.next()) == null) && currentDocument != null) { if (fileIterator.hasNext()) { try { openDocPath = (File)fileIterator.nextInstance().getData(); targetDocPath = new File(((String)openDocPath.getAbsolutePath()).concat(".sys")); currentDocument = builder.build(openDocPath); malletDocument = new MalletDocument (currentDocument, sourceType); } catch (JDOMException e) { e.printStackTrace(); } docNodePairIterator = new DocumentMentionPairIterator(); } } return pair; } public Instance nextInstance() { MentionPair pair = null; String target = null; URI nodePairURI = null; Mention referent = null; if (this.hasNext()) { pair = (MentionPair)getNextMentionPair(); if (pair.nullPair()) { if (!positiveAntecedent) { target = "yes"; } else { target = "no"; } positiveAntecedent = false; } else { String tval = null; tval = docNodePairIterator.getTargetValue(pair); if (tval != null) { target = new String("yes"); pair.setEntityReference(tval); } else { pair.setEntityReference(null); target = new String("no"); } if (target.equals("yes")) { positiveAntecedent = true; } } try { nodePairURI = new URI("nodePairURI"); } catch (Exception e) { e.printStackTrace(); } } // include pair as the "source" of this pipe object because we want this to carry // into the feature vector return new Instance (pair, target, nodePairURI, null); } // Iterator methods public boolean hasNext() { return (docNodePairIterator.hasNext() || fileIterator.hasNext()); } public void remove () { throw new UnsupportedOperationException(); } public class DocumentMentionPairIterator implements Iterator { int pairCount; int currentIndex; public DocumentMentionPairIterator () { java.util.Vector candPatterns = new java.util.Vector(); nodePairArray = new ArrayList (); //should parameterize this: //candPatterns.add(Pattern.compile("[A-Z]*NAME[A-Z]*")); //candPatterns.add(Pattern.compile("PRO[A-Z]*")); if (sourceType.equals("TB")) { candPatterns.add(Pattern.compile("NP.*")); // for TreeBank candPatterns.add(Pattern.compile("lex")); } else if (sourceType.equals("MUC")) { candPatterns.add(Pattern.compile("NG")); // for the MUC data candPatterns.add(Pattern.compile("COREF")); // for the MUC data } fillNodePairArray (candPatterns); nodePairArray = filterPairs(); pairCount = nodePairArray.size(); System.out.println("Pair array size = " + pairCount); currentIndex = 0; } private ArrayList filterPairs () { ArrayList newList = new ArrayList(); Iterator i = filters.iterator(); while (i.hasNext()) { Filter filter = (Filter)i.next(); Iterator i2 = nodePairArray.iterator(); while (i2.hasNext()) { MentionPair pair = (MentionPair)i2.next(); if (!filter.filters(pair)) { /* if (pair.getAntecedent() != null) System.out.println("Pair: " + pair.getAntecedent().getString() + " " + pair.getReferent().getString()); else System.out.println("Pair: " + "NULL" + " " + pair.getReferent().getString()); */ newList.add(pair); } } } return newList; } private boolean compatible (Element n, java.util.Vector patterns) { if (sourceType.equals("TB")) { List children = n.getChildren(); Iterator iter = children.iterator(); Pattern p1 = Pattern.compile("NP.*");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -