📄 citation.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Ben Wellner
 */

package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;

import com.wcohen.secondstring.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.graphs.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;
//import edu.umass.cs.mallet.users.hay.canopy.Util;
//import edu.umass.cs.mallet.users.hay.canopy.NGramAnalyzer;
//import edu.umass.cs.mallet.users.hay.canopy.DateAnalyzer;
import salvo.jesus.graph.*;
import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.classify.*;
import edu.umass.cs.mallet.base.pipe.*;
import edu.umass.cs.mallet.base.pipe.iterator.*;
import edu.umass.cs.mallet.base.util.*;
import edu.umass.cs.mallet.base.fst.CRF;
//import edu.umass.cs.mallet.base.fst.CRF3;
import edu.umass.cs.mallet.base.fst.Transducer;
import java.util.*;
import java.util.Arrays;
import java.lang.*;
import java.io.*;

/**
   Objects of this class represent citations (in the citation domain).
   This object essentially maintains all of the field values for the
   citation in separate field slots.  This will make feature extraction
   easier and much more efficient.  Previously, we were essentially
   re-parsing the citation in each stage of the pipeline.
*/public class Citation { // segmented fields public final static String author = "author"; public final static String title = "title"; public final static String booktitle = "booktitle"; public final static String date = "date"; public final static String pages = "pages"; public final static String publisher = "publisher"; public final static String location = "location"; public final static String journal = "journal"; public final static String volume = "volume"; public final static String venueCluster = "venue_cluster"; public final static String venueVolume = "venue_vol"; public final static String tech = "tech"; public final static String note = "note"; public final static String institution = "institution"; public final static String other = "O"; public final static String editor = "editor"; // *new* fields for coreference public final static String type = "type"; // booktitle and/or journal public final static String citation = "citation"; public final static String year = "year"; public final static String authors = "authors"; // list of authors public final static String venue = "venue"; // booktitle and/or journal // list of valid fields that Pipes should use private static List corefFieldsList; public final static String[] corefFields = new String[] { author, title, pages, volume, booktitle, journal, //location, tech, institution, editor, publisher, venue, year//, type, venueCluster, venueVolume }; private final static String [] POSSIBLE_FIELDS = new String[]{author,title,booktitle,date,year,pages,publisher, location,journal,volume,note,institution, tech, venueCluster, venueVolume}; private Map fields; // key: fieldname -> value: String field value private Map fieldTokens; private Map fieldTokensAsSets; private static CitationNormalizer normalizer; private String rawstring; private String origstring; private String underlyingString; private ArrayList normalizedAuthors; private Object label; private int index; private ArrayList nBest; private double 
viterbiScore; // score of the selected viterbi path private double confidenceScore; public String getRawstring() { return rawstring; } public String getUnderlyingString () { return underlyingString; } public Citation (String s) { rawstring = s.toLowerCase(); origstring = s; underlyingString = SGMLStringOperation.removeSGMLTags (s); corefFieldsList = Arrays.asList(corefFields); normalizer = new CitationNormalizer(); fields = new HashMap(); fieldTokens = new HashMap(); fieldTokensAsSets = new HashMap(); parseCitation(s); } public Citation (String s, Object label, int index) { this(s); this.label = label; this.index = index; } public Citation (String s, Object label, int index, IEInterface ieInterface, int n) { this (s, label, index, ieInterface, n, 0); } public Citation (String s, Object label, int index, IEInterface ieInterface) { this (s, label, index, ieInterface, 1); } public Citation (String s, Object label, int index, IEInterface ieInterface, int n, int nthToUse) { this(s, label, index); //assert(n >= 1); // use the CRF to parse the string Instance instance; instance = new Instance(s, null, new Integer(index), null, ieInterface.pipe); Transducer.ViterbiPath_NBest viterbiP_NBest; CRF crf = ieInterface.crf; viterbiP_NBest = crf.viterbiPath_NBest((Sequence) instance.getData(), n);//n-best list //double [] confidenceScores = viterbiP_NBest.confidenceNBest(); Sequence[] sequence_temp = viterbiP_NBest.outputNBest(); TokenSequence tokenSequence = (TokenSequence) (instance.getSource()); this.nBest = new ArrayList(); for (int i=0; i < n; i++) { String crfStr = ieInterface.printResultInFormat(true, sequence_temp[i], tokenSequence); if (i == nthToUse) { // this MAY not be the top viterbi path for // certain experiments viterbiScore = ieInterface.InstanceAccuracy (sequence_temp[i], (Sequence)instance.getTarget(), instance); crfStr = crfStr.toLowerCase(); // downcase now this.rawstring = crfStr; parseCitation(crfStr); //System.out.println("score on " + i + " path is: " + 
viterbiScore); } Citation none = new Citation(crfStr, label, index); none.setConfidenceScore(1.0); //none.setConfidenceScore (confidenceScores[i]); nBest.add(none); } } public double getConfidenceScore() { return confidenceScore; } public void setConfidenceScore (double s) { this.confidenceScore = s; } public double getScore() { return viterbiScore; } private void parseCitation(String s) { // set fields setField(author, normalizer.norm(normalizer.getAlphaOnly(SGMLStringOperation.locateAndConcatFields(author, s)))); setField(title, normalizer.norm(SGMLStringOperation.locateAndConcatFields(title, s))); setField(pages, normalizer.getNumericOnly(SGMLStringOperation.locateAndConcatFields(pages, s))); setField(volume, normalizer.getNumericOnly(SGMLStringOperation.locateAndConcatFields(volume, s))); setField(year, normalizer.getFourDigitString(SGMLStringOperation.locateAndConcatFields(date, s))); setField(publisher, normalizer.norm(SGMLStringOperation.locateAndConcatFields(publisher, s))); setField(note, normalizer.norm(SGMLStringOperation.locateAndConcatFields(note, s))); setField(booktitle, normalizer.norm(SGMLStringOperation.locateAndConcatFields(booktitle, s))); setField(journal, normalizer.norm(SGMLStringOperation.locateAndConcatFields(journal, s))); setField(tech, normalizer.norm(SGMLStringOperation.locateAndConcatFields(tech, s))); setAuthors(); // List of authors setVenue(); // = journal and/or booktitle setType(); setField(citation, normalizer.norm(SGMLStringOperation.removeSGMLTags(s))); } public String getField(String fieldName) { //assert (corefFieldsList.contains(fieldName)); String s = (String)fields.get(fieldName); return (s==null) ? 
"" : s; } public List getFieldTokens(String fieldName) { assert (corefFieldsList.contains(fieldName)); return (List)fieldTokens.get(fieldName); } public boolean hasField(String fieldName) { assert (corefFieldsList.contains(fieldName)); String value = getField(fieldName); if (value == null) { return false; } else if (value.length() == 0) { return false; } return true; } private void setField(String fieldName, String fieldValue) { //assert (corefFieldsList.contains(fieldName)); fields.put(fieldName, fieldValue); fieldTokens.put(fieldName, normalizer.getTokens(fieldValue)); fieldTokensAsSets.put(fieldName,normalizer.getTokensAsSet(fieldValue)); } public Set getFieldTokensAsSet(String fieldName) { assert (corefFieldsList.contains(fieldName)); return (Set)fieldTokensAsSets.get(fieldName); } private void setAuthors() { normalizedAuthors = new ArrayList(); List authors = SGMLStringOperation.locateFields(author, rawstring); for (int i = 0; i < authors.size(); i++) { String authorStr = (String) authors.get(i); normalizedAuthors.add(normalizer.norm(authorStr)); } } public List getAuthors() { return normalizedAuthors; } public int getNumAuthors() { return normalizedAuthors.size(); } private void setVenue() { String booktitleStr = null; String journalStr = null; String techStr = null; booktitleStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(booktitle, rawstring)); journalStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(journal, rawstring)); techStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(tech,rawstring)); HashMap venueAttributes = new HashMap(); if (booktitleStr.equals("") && techStr.equals("")) { venueAttributes = SGMLStringOperation.locateAttributes (journal, rawstring); fields.put(venue, journalStr); } else if (journalStr.equals("") && techStr.equals("")) { fields.put(venue, booktitleStr); venueAttributes = SGMLStringOperation.locateAttributes (booktitle, rawstring); } else if (booktitleStr.equals("") && techStr.equals("")) 
{ fields.put(venue, techStr); } else { fields.put(venue, journalStr + booktitleStr + techStr); } if (venueAttributes.size() != 0) { fields.put (venueCluster, venueAttributes.get (venueCluster).toString()); fields.put (venueVolume, venueAttributes.get (venueVolume).toString()); } } public Object getLabel() { return label; } public int getIndex() { return index; } public List getAllStringsWrapped() { return getAllStringsWrapped(corefFields); } public List getAllStringsWrapped(String[] fields) { ArrayList allStrings = new ArrayList(); for (int i = 0; i < fields.length; i++) { String fieldName = fields[i]; allStrings.add(new StringWrapper(getField(fieldName))); } return allStrings; } public List getNBest() { return nBest; } public Citation getNthBest(int i) { if (nBest.size() < (i-1)) return null; return (Citation)nBest.get(i); } public Citation() {} public String getOrigString() { return origstring; } public String getString() { return rawstring; } public void setString (String s) { this.rawstring = s; } public Map getFields() { return fields; } public void setField(Object key, Object val) { fields.put(key, val); } public String print () { Integer i = new Integer(getIndex()); return i.toString(); } public String[] getPossibleFields() { return POSSIBLE_FIELDS; } public boolean isConferencePaper () { String booktitle = getField("booktitle"); if (booktitle.matches(".*proc\\..*") || booktitle.matches(".*proceedings.*") || booktitle.matches(".*workshop.*") || booktitle.matches(".*conference.*")) return true; return false; } public boolean isJournalPaper() { String journal = getField("journal"); if (journal.length() > 0) return true; String volume = getField("volume"); if (volume.length() > 0) return true; return false; } public boolean isTechPaper() { String journal = getField("journal"); if (journal.length() > 0) return false; String volume = getField("volume"); if (volume.length() > 0) return false; String tech = getField("tech"); if (tech.length() > 0) return true; 
return false; } private void setType() { String btitle = null; String jnl = null; String tch = null; btitle = (String)fields.get(this.booktitle); jnl = (String)fields.get(this.journal); tch = (String)fields.get(this.tech); assert(btitle != null); assert(jnl != null); assert(tch != null); String type = ""; if (isConferencePaper()) { type = "conference"; } else if (isJournalPaper()) { type = "journal"; } else if (isTechPaper()) { type = "tech"; } else { type = "uncertain"; } setField(this.type, type); } public String toString () { List alls = getAllStringsWrapped(); String ret = "Original string: " + origstring + "-->Fields: " + fields; return ret; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -