⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 authorpipe.java

📁 这是 MALLET 工具包的一部分 Java 实现，里面有许多内容，请大家慢慢琢磨。
💻 JAVA
字号:
package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import com.wcohen.secondstring.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.util.*;import java.util.*;import java.lang.*;import java.io.*;public class AuthorPipe extends Pipe {	StringDistance distMetric;	Levenstein nw;		public AuthorPipe (StringDistance distMetric) {		this.distMetric = distMetric;		this.nw = new Levenstein();	}	private double normNW (String s1, String s2) {		return 1 - (Math.abs(nw.score(s1,s2))/(double)(s1.length() + s2.length()));	}	private void listOfStringsIntersection (List l1, List l2, int type) {		List toRemoveList = new ArrayList();		for (int i=0; i < l1.size(); i++) {			int toRemove = -1;			int j = 0;			String s = (String)l1.get(i);			//StringWrapper sw = new StringWrapper(s);			for (j=0; j < l2.size(); j++) {				if (type == 1) {					//if (normNW(s,(String)l2.get(j)) > 0.9) {					if (s.equals((String)l2.get(j))) {						toRemove = j;						toRemoveList.add(new Integer(i));						break;					}				} else if (type == 2) {					if (s.startsWith((String)l2.get(j))) {						toRemove = j;						toRemoveList.add(new Integer(i));						break;					}				}			}			if (toRemove > -1)				l2.remove(toRemove);		}		for (int k=0; k < toRemoveList.size(); k++) {			int rem = (int)((Integer)toRemoveList.get(k)).intValue();			l1.remove(rem-k);		}				}	private void removeStops (String s) {		s.replaceAll("and"," ");		s.replaceAll("&"," ");	}	private void parseAuthorName (String author, Collection longTokens, Collection shortTokens) {		// first remove punctuation		String s1;		String a1 = new String (author);		CharSequenceLexer l1 = new CharSequenceLexer ();		a1.replaceAll("[,.;]"," "); // replace all punctuation with space		l1.setCharSequence(a1);		while (l1.hasNext()) {			s1 = (String)l1.next();			if (s1.length() > 1) {				longTokens.add(s1);			} else if 
(s1.length() > 0) {				shortTokens.add(s1);			}		}		//System.out.println(" author longs: " + longTokens);		//System.out.println(" author shorts: " + shortTokens);			}	private boolean containsEtAl (List l) {		if (l.size() == 2) {			for (int i=0; i < l.size(); i++) {				if (!(((String)l.get(i)).equals("et") ||							((String)l.get(i)).equals("al")))					return false;			}			return true;		} else {			return false;		}	}	public void extraPipe (List a1, List a2) {	}	public Instance pipe (Instance carrier) {		NodePair pair = (NodePair)carrier.getData();		Citation c1 = (Citation)pair.getObject1();		Citation c2 = (Citation)pair.getObject2();		List a1 = c1.getAuthors();		List a2 = c2.getAuthors();		List longs1 = new ArrayList();		List shorts1 = new ArrayList();		List longs2 = new ArrayList();		List shorts2 = new ArrayList();		/*		if (!(a1.size() == a2.size()))			pair.setFeatureValue("DiffNumAuthors", 1.0);		*/		Iterator i1 = a1.iterator();		while (i1.hasNext()) {			parseAuthorName ((String)i1.next(), longs1, shorts1);		}		Iterator i2 = a2.iterator();		while (i2.hasNext()) {			parseAuthorName ((String)i2.next(), longs2, shorts2);		}		int initialsize = longs1.size()+longs2.size()+shorts1.size()+shorts2.size();		//System.out.println(" l1: " + longs1 + " l2: " + longs2);		//System.out.println ("longs BEFORE: " + longs1);		//System.out.println ("longs BEFORE: " + longs2);		listOfStringsIntersection (longs1, longs2, 1);		listOfStringsIntersection (shorts1, shorts2, 1);		listOfStringsIntersection (longs1, shorts2, 2);		listOfStringsIntersection (longs2, shorts1, 2);		// set some features based on how this all happened		/*		if (initialsize > 0 && longs2.isEmpty() && longs1.isEmpty()) {			pair.setFeatureValue("AuthorLongNamesMatch", 1.0);			if (shorts2.isEmpty() && shorts2.isEmpty()) {				pair.setFeatureValue("AuthorNamesMatch", 1.0);			}		} else if (initialsize > 0 && shorts2.isEmpty() && shorts2.isEmpty()) {			//pair.setFeatureValue("AuthorShortNamesMatch", 1.0);		}		*/		// et al is 
all that remains in one of them		if (containsEtAl(longs1) || containsEtAl(longs2)) {			System.out.println(">>>> ET AL <<<");			longs1 = new ArrayList(); // reset these in this case			longs2 = new ArrayList();		}		// this is intersection/union		//System.out.println("initial size: " + initialsize);		int topLong = longs1.size() + longs2.size();		int topShort =  shorts1.size() + shorts2.size();		if (initialsize > 0) {			if (true) {				double overlap = 1 - ((double)(topLong+(topShort*0.3))/(double)initialsize);				if (overlap > 0.7)					pair.setFeatureValue("AuthorOverlapHIGH", overlap );				else if (overlap > 0.4)					pair.setFeatureValue("AuthorOverlapMED", overlap );				else if (overlap > 0.1)					pair.setFeatureValue("AuthorOverlapLOW", overlap );				if (overlap <= 0.1)					pair.setFeatureValue("AuthorsDoNotMatch", 1.0);			}						if (false) {			double overlapLong = 1 - ((double)topLong/(double)initialsize);			double overlapShort = 1 - ((double)topShort/(double)initialsize);			if ( overlapLong > 0.7 )				pair.setFeatureValue("AuthorOverlapLongHigh", 1.0 );			if ( overlapShort > 0.7)				pair.setFeatureValue("AuthorOverlapShortHigh", 1.0 );			if ( overlapLong > 0.45 && overlapLong < 0.7)				pair.setFeatureValue("AuthorOverlapLongLow", 1.0 );			if ( overlapShort > 0.45 && overlapLong < 0.7)				pair.setFeatureValue("AuthorOverlapShortLow", 1.0 );			if ( overlapLong <= 0.45)				pair.setFeatureValue("AuthorOverlapLongNone", 1.0 );			if ( overlapShort <= 0.45)				pair.setFeatureValue("AuthorOverlapShortNone", 1.0 );			}					}		if (false) {			double authorSimilarity = 0;			if (a1.size() == a2.size()) {				for (int i=0; i < a2.size(); i++) {					authorSimilarity += distMetric.score((String)a1.get(i), (String)a2.get(i));				}				// average of each author similarity				if (a2.size() > 0) {					authorSimilarity = authorSimilarity/a2.size();					pair.setFeatureValue("AuthorSimilaritySoftTFIDF", authorSimilarity);				}			}		}		return carrier;			}}																

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -