citation.java
来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 592 行 · 第 1/2 页
JAVA
592 行
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/** @author Ben Wellner */

package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;

import com.wcohen.secondstring.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.graphs.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;
//import edu.umass.cs.mallet.users.hay.canopy.Util;
//import edu.umass.cs.mallet.users.hay.canopy.NGramAnalyzer;
//import edu.umass.cs.mallet.users.hay.canopy.DateAnalyzer;
import salvo.jesus.graph.*;
import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.classify.*;
import edu.umass.cs.mallet.base.pipe.*;
import edu.umass.cs.mallet.base.pipe.iterator.*;
import edu.umass.cs.mallet.base.util.*;
import edu.umass.cs.mallet.base.fst.CRF;
import edu.umass.cs.mallet.base.fst.Transducer;
import java.util.*;
import java.util.Arrays;
import java.lang.*;
import java.io.*;

/**
   Objects of this class represent citations (in the citation domain).
   This object essentially maintains all of the field values for the
   citation in separate field slots.  This makes feature extraction
   easier and much more efficient.  Previously, we were essentially
   re-parsing the citation in each stage of the pipeline.
*/
public class Citation {

  // ---- Names of the segmented fields ----
  public static final String author = "author";
  public static final String title = "title";
  public static final String booktitle = "booktitle";
  public static final String date = "date";
  public static final String pages = "pages";
  public static final String publisher = "publisher";
  public static final String address = "address";
  public static final String journal = "journal";
  public static final String conference = "conference";
  public static final String volume = "volume";

  // Names looked up as attributes of the "meta" tag when a citation is parsed.
  public static final String paperID = "reference_no";
  public static final String paperCluster = "cluster_no";

  // Venue-related names; their use is not visible in this part of the file.
  public static final String venueID = "venue_no";
  public static final String venueCluster = "venue_cluster";
  public static final String venueVolume = "venue_vol";

  public static final String tech = "tech";
  public static final String note = "note";
  public static final String institution = "institution";
  public static final String other = "O";
  public static final String editor = "editor";

  // ---- *New* fields introduced for coreference ----
  // NOTE(review): the original trailing comment on `type` read
  // "booktitle and/or journal", the same as on `venue` -- confirm its meaning.
  public static final String type = "type";
  public static final String citation = "citation";
  public static final String authors = "authors";  // list of authors
  public static final String venue = "venue";      // booktitle and/or journal

  // List view of the valid fields that Pipes should use (assigned by the constructor).
  private static List corefFieldsList;

  // The valid fields that Pipes should use.
  public static final String[] corefFields = {
    address, author, authors, booktitle, citation, conference, date,
    editor, institution, journal, note, other,
    pages, publisher,
    title, tech, type,
    venue, venueVolume, volume
  };

  // Same entries as corefFields plus paperCluster, paperID, venueCluster and venueID.
  private static final String[] POSSIBLE_FIELDS = {
    address, author, authors, booktitle, citation, conference, date,
    editor, institution, journal, note, other,
    pages, paperCluster, paperID, publisher,
    title, tech, type,
    venue, venueCluster, venueID, venueVolume, volume
  };

  // key: field name -> value: String field value
  private Map fields;
  // key: field name -> value: result of normalizer.getTokens() on the field value
  private Map fieldTokens;
  // key: field name -> value: result of normalizer.getTokensAsSet() on the field value
  private Map fieldTokensAsSets;
private static CitationNormalizer normalizer; private String rawstring; private String origstring; private String underlyingString; private ArrayList normalizedAuthors; private Object label; private int index; private ArrayList nBest; private double viterbiScore; // score of the selected viterbi path private double confidenceScore; public String getRawstring() { return rawstring; } public String getUnderlyingString () { return underlyingString; } public Citation (String s) { rawstring = s.toLowerCase(); origstring = s; // xxx removes abstract from underlying string String noabstract = s.replaceAll("<abstract>[^<]+<\\/abstract>", " "); underlyingString = SGMLStringOperation.removeSGMLTags (noabstract); corefFieldsList = Arrays.asList(corefFields); normalizer = new CitationNormalizer(); fields = new HashMap(); fieldTokens = new HashMap(); fieldTokensAsSets = new HashMap(); parseCitation(s); } public Citation (String s, Object label, int index) { this(s); this.label = label; this.index = index; } public Citation (String s, Object label, int index, IEInterface ieInterface, int n) { this (s, label, index, ieInterface, n, 0); } public Citation (String s, Object label, int index, IEInterface ieInterface) { this (s, label, index, ieInterface, 1); } public Citation (String s, Object label, int index, IEInterface ieInterface, int n, int nthToUse) { this(s, label, index); //assert(n >= 1); // use the CRF to parse the string Instance instance; instance = new Instance(s, null, new Integer(index), null, ieInterface.pipe); Transducer.ViterbiPath_NBest viterbiP_NBest; CRF crf = ieInterface.crf; viterbiP_NBest = crf.viterbiPath_NBest((Sequence) instance.getData(), n);//n-best list //double [] confidenceScores = viterbiP_NBest.confidenceNBest(); Sequence[] sequence_temp = viterbiP_NBest.outputNBest(); TokenSequence tokenSequence = (TokenSequence) (instance.getSource()); this.nBest = new ArrayList(); for (int i=0; i < n; i++) { String crfStr = ieInterface.printResultInFormat(true, 
sequence_temp[i], tokenSequence); if (i == nthToUse) { // this MAY not be the top viterbi path for // certain experiments viterbiScore = ieInterface.InstanceAccuracy (sequence_temp[i], (Sequence)instance.getTarget(), instance); crfStr = crfStr.toLowerCase(); // downcase now this.rawstring = crfStr; parseCitation(crfStr); //System.out.println("score on " + i + " path is: " + viterbiScore); } Citation none = new Citation(crfStr, label, index); none.setConfidenceScore(1.0); //none.setConfidenceScore (confidenceScores[i]); nBest.add(none); } } public double getConfidenceScore() { return confidenceScore; } public void setConfidenceScore (double s) { this.confidenceScore = s; } public double getScore() { return viterbiScore; } private void parseCitation(String s) { // set fields setField(address, normalizer.norm(SGMLStringOperation.locateAndConcatFields(address, s))); setField(author, normalizer.norm(normalizer.getAlphaOnly(SGMLStringOperation.locateAndConcatFields(author, s)))); setField(authors, normalizer.norm(normalizer.getAlphaOnly(SGMLStringOperation.locateAndConcatFields(authors, s)))); setField(booktitle, normalizer.norm(SGMLStringOperation.locateAndConcatFields(booktitle, s)));// setField(citation, normalizer.norm(SGMLStringOperation.removeSGMLTags(s))); setField(citation, normalizer.norm(s)); setField(conference, normalizer.norm(SGMLStringOperation.locateAndConcatFields(conference, s))); setField(date, normalizer.getFourDigitString(SGMLStringOperation.locateAndConcatFields(date, s))); setField(editor, normalizer.norm(normalizer.getAlphaOnly(SGMLStringOperation.locateAndConcatFields(editor, s)))); setField(institution, normalizer.norm(SGMLStringOperation.locateAndConcatFields(institution, s))); setField(journal, normalizer.norm(SGMLStringOperation.locateAndConcatFields(journal, s))); setField(note, normalizer.norm(SGMLStringOperation.locateAndConcatFields(note, s))); setField(other, normalizer.norm(SGMLStringOperation.locateAndConcatFields(other, s))); 
setField(pages, normalizer.getNumericOnly(SGMLStringOperation.locateAndConcatFields(pages, s))); setField(publisher, normalizer.norm(SGMLStringOperation.locateAndConcatFields(publisher, s))); setField(title, normalizer.norm(SGMLStringOperation.locateAndConcatFields(title, s))); setField(tech, normalizer.norm(SGMLStringOperation.locateAndConcatFields(tech, s))); setField(volume, normalizer.getNumericOnly(SGMLStringOperation.locateAndConcatFields(volume, s))); setAuthors(); // List of authors setVenue(); // = journal and/or booktitle setType(); HashMap clusterAttributes = SGMLStringOperation.locateAttributes ("meta", s); String pcid = (String)clusterAttributes.get (paperCluster); String pid = (String)clusterAttributes.get (paperID); if (pcid == null) throw new IllegalArgumentException ("Paper has no cluster: " + s); if (pid == null) throw new IllegalArgumentException ("Paper has no id: " + s); setField (paperCluster, pcid); setField (paperID, pid); } public String getField(String fieldName) { //assert (corefFieldsList.contains(fieldName)); String s = (String)fields.get(fieldName); return (s==null) ? 
"" : s; } public List getFieldTokens(String fieldName) { assert (corefFieldsList.contains(fieldName)); return (List)fieldTokens.get(fieldName); } public boolean hasField(String fieldName) { assert (corefFieldsList.contains(fieldName)); String value = getField(fieldName); if (value == null) { return false; } else if (value.length() == 0) { return false; } return true; } private void setField(String fieldName, String fieldValue) { //assert (corefFieldsList.contains(fieldName)); fields.put(fieldName, fieldValue); fieldTokens.put(fieldName, normalizer.getTokens(fieldValue)); fieldTokensAsSets.put(fieldName,normalizer.getTokensAsSet(fieldValue)); } public Set getFieldTokensAsSet(String fieldName) { assert (corefFieldsList.contains(fieldName)); return (Set)fieldTokensAsSets.get(fieldName); } private void setAuthors() { normalizedAuthors = new ArrayList(); List authors = SGMLStringOperation.locateFields(author, rawstring); for (int i = 0; i < authors.size(); i++) { String authorStr = (String) authors.get(i); normalizedAuthors.add(normalizer.authorNorm(authorStr)); } } public List getAuthors() { return normalizedAuthors; } public int getNumAuthors() { return normalizedAuthors.size(); } private void setVenue() { String booktitleStr = null; String journalStr = null; String techStr = null; booktitleStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(booktitle, rawstring)); journalStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(journal, rawstring)); techStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(tech,rawstring)); String confStr = normalizer.norm(SGMLStringOperation.locateAndConcatFields(conference,rawstring)); HashMap venueAttributes = new HashMap(); if (booktitleStr.equals("") && techStr.equals("") && confStr.equals("")) { venueAttributes = SGMLStringOperation.locateAttributes (journal, rawstring); fields.put(venue, journalStr); } else if (journalStr.equals("") && techStr.equals("") && confStr.equals("")) { fields.put(venue, 
booktitleStr);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?