⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 person.java

📁 常用机器学习算法,java编写源代码,内含常用分类算法,包括说明文档
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**   Keeps a person's names, logins, domains, etc.
 *   @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
*/

package edu.umass.cs.mallet.projects.dex.types;

import edu.umass.cs.mallet.projects.dex.web.*;
import edu.umass.cs.mallet.base.types.AugmentableFeatureVector;
import edu.umass.cs.mallet.base.types.RankedFeatureVector;
import edu.umass.cs.mallet.base.types.Alphabet;

import java.util.*;
import java.io.*;
import java.util.regex.*;
import edu.umass.cs.mallet.base.types.StringKernel;

public class Person implements Serializable, Comparable {
	/**
	 * Creates an empty person: no names, logins, domains, pages or links,
	 * a fresh alphabet used by both the key-word and context-model vectors,
	 * an occurrence count of 1 and an id of -1.
	 */
	public Person() {
		// Scalar bookkeeping.
		this.id = -1;
		this.numberOfOccurrences = 1;
		this.processedForWebPages = false;
		this.processedForContactInformation = false;

		// Identity lists.
		this.names = new Vector();
		this.logins = new Vector();
		this.domains = new Vector();

		// The alphabet must exist before the vectors that are built on it.
		this.alphabet = new Alphabet();
		this.keyWords = new AugmentableFeatureVector (this.alphabet, 1, false);
		this.contextModel = new AugmentableFeatureVector (this.alphabet, 1, false);
		this.topKeyWords = new ArrayList();

		// Pages, contact details and link structure.
		this.pages = new Vector ();
		this.contextPages = new Vector ();
		this.contactRecord = new ContactRecord();
		this.emailLinks = new HashMap();
		this.inLinks = new HashSet();
		this.outLinks = new HashSet();
	}
    
	/**
	 * Creates a person from a single name, login and domain.
	 * Every non-null argument is wrapped in a CountedString and appended to
	 * the matching list; null arguments are skipped.
	 *
	 * @param name   a name of the person, or null
	 * @param login  a login of the person, or null
	 * @param domain a domain of the person, or null
	 */
	public Person(String name, String login, String domain) {
		this();
		if (name != null) {
			this.names.add(new CountedString(name));
		}
		if (login != null) {
			this.logins.add(new CountedString(login));
		}
		if (domain != null) {
			this.domains.add(new CountedString(domain));
		}
	}
	
	/**
	 * Creates a person from existing name/login/domain lists and feature vectors.
	 * Null arguments are ignored and leave the corresponding default in place.
	 * The elements of the supplied lists are added to this person's own lists;
	 * the supplied feature vectors are adopted as-is (not copied).
	 *
	 * After construction the key-word vector, the context-model vector and
	 * this person's alphabet all refer to the same alphabet: when only one of
	 * the two vectors is supplied, the other (still empty) vector is rebuilt
	 * on the supplied vector's alphabet.
	 *
	 * @param names_        elements to add to the name list, or null
	 * @param logins_       elements to add to the login list, or null
	 * @param domains_      elements to add to the domain list, or null
	 * @param keyWords_     key-word vector to adopt, or null
	 * @param contextModel_ context-model vector to adopt, or null
	 * @throws IllegalArgumentException if both vectors are given but are built
	 *         on different alphabets
	 */
	public Person(Vector names_, Vector logins_, Vector domains_,
								AugmentableFeatureVector keyWords_, AugmentableFeatureVector contextModel_) {
		this();
		// Reject inconsistent input before any state is replaced.
		if (keyWords_ != null && contextModel_ != null
				&& keyWords_.getAlphabet() != contextModel_.getAlphabet())
			throw new IllegalArgumentException ("feature vector dictionaries don't match.");

		if (names_ != null)
			names.addAll(names_);
		if (logins_ != null)
			logins.addAll(logins_);
		if (domains_ != null)
			domains.addAll(domains_);

		if (keyWords_ != null) {
			keyWords = keyWords_;
			this.alphabet = keyWords_.getAlphabet();
		}
		if (contextModel_ != null) {
			contextModel = contextModel_;
			this.alphabet = contextModel_.getAlphabet();
		}

		// this() built both vectors on a private alphabet. If only one vector
		// was supplied, the other one would keep that private alphabet while
		// this.alphabet points at the supplied one; rebuild it (it is still
		// empty) so that all three agree.
		if (keyWords_ != null && contextModel_ == null)
			contextModel = new AugmentableFeatureVector (this.alphabet, 1, false);
		if (contextModel_ != null && keyWords_ == null)
			keyWords = new AugmentableFeatureVector (this.alphabet, 1, false);
	}
	
	/**
	 * Returns an iterator over this person's pages.
	 *
	 * @return iterator over the page list
	 */
	public Iterator pageIterator() {
		Iterator it = pages.iterator();
		return it;
	}
	
	/**
	 * Appends a page to this person's page list.
	 *
	 * @param webPage the page to append
	 */
	public void addPage(WebPage webPage) {
		pages.addElement(webPage);
	}
	
	/**
	 * Returns the stored occurrence count.
	 *
	 * @return the current value of the occurrence counter
	 */
	public int getNumberOfOccurrences() {
		return this.numberOfOccurrences;
	}
	
	/**
	 * Replaces this person's contact record.
	 *
	 * @param c the new contact record
	 */
	public void setContactRecord (ContactRecord c) {
		contactRecord = c;
	}
	
	/**
	 * Returns this person's contact record.
	 *
	 * @return the stored contact record
	 */
	public ContactRecord getContactRecord () {
		return contactRecord;
	}

	/**
	 * Replaces the alphabet reference held by this person.
	 * Only the reference is updated; the existing key-word and context-model
	 * vectors are not touched here.
	 *
	 * @param a the new alphabet
	 */
	public void setAlphabet (Alphabet a) {
		alphabet = a;
	}
	
	/**
	 * Returns the set of in-links. The internal set itself is returned, not a copy.
	 *
	 * @return the in-link set
	 */
	public HashSet getInLinks() {
		return inLinks;
	}
	
	/**
	 * Records an in-link, stored as a boxed Integer.
	 *
	 * @param i the value to add to the in-link set
	 */
	public void addInLink (int i) {
		Integer boxed = new Integer (i);
		inLinks.add (boxed);
	}
	
	/**
	 * Records an out-link, stored as a boxed Integer.
	 *
	 * @param i the value to add to the out-link set
	 */
	public void addOutLink (int i) {
		Integer boxed = new Integer (i);
		outLinks.add (boxed);
	}
	
	/**
	 * Removes an out-link if it is present.
	 *
	 * @param index the value to remove from the out-link set
	 */
	public void removeOutLink (int index) {
		Integer boxed = new Integer (index);
		this.outLinks.remove (boxed);
	}

	/**
	 * Returns this person's id.
	 *
	 * @return the stored id
	 */
	public int getId () {
		return id;
	}

	/**
	 * Sets this person's id.
	 *
	 * @param i the new id
	 */
	public void setId (int i) {
		id = i;
	}
	
	/**
	 * Returns the first entry of the name list (the whole stored string,
	 * not a given name).
	 *
	 * @return the string of the first name entry, or null if there are no names
	 */
	public String getFirstName() {
		if (names.isEmpty())
			return null;
		CountedString first = (CountedString) names.firstElement();
		return first.str;
	}
	
	/**
	 * Tells whether any stored name is judged by sameName() to be the same
	 * as the given one.
	 *
	 * @param name1 the name to look for
	 * @return true as soon as one stored name matches, false otherwise
	 */
	public boolean findName(String name1) {
		int idx = 0;
		while (idx < names.size()) {
			CountedString candidate = (CountedString) names.elementAt(idx);
			if (sameName(name1, candidate.str))
				return true;
			idx++;
		}
		return false;
	}
	
	/**
	 * Guesses the surname from the stored names.
	 * Names are examined in list order. A name is skipped if it contains no
	 * space, or if it splits into more than three space-separated words (the
	 * surname position is then considered unreliable). The last word of the
	 * first name that is not skipped is returned.
	 *
	 * @return the guessed surname, or null if no stored name qualifies
	 */
	public String getSurname() {
		for (int i = 0; i < names.size(); i++) {
			String name = ((CountedString) names.elementAt(i)).str;
			// A single-word name carries no surname information.
			if (name.indexOf(' ') < 0)
				continue;
			String[] words = name.split(" ");
			// A name made only of spaces splits into zero words; indexing
			// words[words.length - 1] would then throw, so skip it.
			if (words.length == 0)
				continue;
			// If name is long we cannot confidently specify where surname is
			if (words.length > 3)
				continue;
			return words[words.length - 1];
		}
		return null;
	}
	
	/**
	 * Tells whether a vector of CountedString elements has an element whose
	 * equals() accepts the given string.
	 *
	 * @param vec vector whose elements are all CountedString
	 * @param str the string to look for
	 * @return true if some element reports equality with str
	 */
	public static boolean contains(Vector vec, String str) {
		int pos = 0;
		while (pos < vec.size()) {
			CountedString entry = (CountedString) vec.elementAt(pos);
			if (entry.equals(str))
				return true;
			pos++;
		}
		return false;
	}
	
	/**
	 * Tells whether a vector of Integer elements has an element equal to g.
	 *
	 * @param vec vector whose elements are all Integer
	 * @param g   the value to look for
	 * @return true if some element equals g
	 */
	public static boolean contains(Vector vec, Integer g) {
		int pos = 0;
		while (pos < vec.size()) {
			Integer entry = (Integer) vec.elementAt(pos);
			if (entry.equals(g))
				return true;
			pos++;
		}
		return false;
	}
	/**
	 * Exchanges the elements at positions ind and ind + 1.
	 * The element at ind + 1 must be a CountedString.
	 *
	 * @param vec the vector to modify in place
	 * @param ind index of the first of the two adjacent elements
	 */
	public static void swapNeighbors(Vector vec, int ind){
		CountedString upper = (CountedString) vec.elementAt(ind + 1);
		// Insert a second reference in front of the lower element ...
		vec.insertElementAt(upper, ind);
		// ... then drop the original one, which has shifted to ind + 2.
		vec.removeElementAt(ind + 2);
	}
	
	/**
	 * Increments the counter of the first element matching str and then
	 * walks towards the front of the vector, swapping any adjacent pair whose
	 * later element has the larger counter. Does nothing if no element matches.
	 *
	 * @param vec vector of CountedString elements, modified in place
	 * @param str the string whose counter is to be incremented
	 */
	public static void incrementCounterAndSort(Vector vec, String str) {
		// Locate the first matching entry.
		int hit = -1;
		for (int i = 0; i < vec.size() && hit < 0; i++) {
			if (((CountedString) vec.elementAt(i)).equals(str))
				hit = i;
		}
		if (hit < 0)
			return;

		((CountedString) vec.elementAt(hit)).cnt++;

		// Bubble towards index 0.
		int j = hit - 1;
		while (j >= 0) {
			CountedString later = (CountedString) vec.elementAt(j + 1);
			CountedString earlier = (CountedString) vec.elementAt(j);
			if (later.cnt > earlier.cnt)
				swapNeighbors(vec, j);
			j--;
		}
	}
	
	/**
	 * Heuristically decides whether two space-separated names denote the
	 * same person.
	 *
	 * Each word of name1 is paired with at most one not-yet-used word of
	 * name2, either because the two words are identical, or because one of
	 * them is a single character, neither is the last word of its name, and
	 * their first characters agree. The names are considered the same when
	 * more than one word was paired, at least one identical pair involved a
	 * word longer than one character located in the second half of name2,
	 * and each name has fewer than twice as many words as were paired.
	 *
	 * @param name1 first name, words separated by single spaces
	 * @param name2 second name, words separated by single spaces
	 * @return true if the names are judged to be the same
	 */
	public static boolean sameName(String name1, String name2) {
		String[] left = name1.split(" ");
		String[] right = name2.split(" ");
		int paired = 0;
		boolean longMatchInSecondHalf = false;

		for (int i = 0; i < left.length; i++) {
			String word = left[i];
			boolean wordIsNotLast = i < left.length - 1;
			for (int j = 0; j < right.length; j++) {
				String other = right[j];
				boolean identical = word.equals(other);
				// An initial may stand for a first/middle name, never for the last word.
				boolean initialMatch = !identical
					&& (word.length() == 1 || other.length() == 1)
					&& wordIsNotLast && j < right.length - 1
					&& word.regionMatches(0, other, 0, 1);
				if (!identical && !initialMatch)
					continue;

				paired++;
				right[j] = ""; // consume the word so it cannot be paired twice
				if (identical && word.length() > 1 && 2 * j >= right.length)
					longMatchInSecondHalf = true;
				break;
			}
		}

		// At least two words paired (one long identical word among them), and
		// in each name the unpaired words are outnumbered by the paired ones.
		return longMatchInSecondHalf
			&& paired > 1
			&& left.length < 2 * paired
			&& right.length < 2 * paired;
	}
	
	/**
	 * Merges the strings of v2 into v1, in place.
	 * For every element of v2 with a non-null string: if v1 has no matching
	 * element, a new CountedString for that string is appended to v1;
	 * otherwise the matching element's counter is incremented and v1 is
	 * re-ordered via incrementCounterAndSort().
	 *
	 * @param v1 destination vector of CountedString elements (modified)
	 * @param v2 source vector of CountedString elements
	 * @return v1
	 */
	public static Vector mergeVectors(Vector v1, Vector v2) {
		int pos = 0;
		while (pos < v2.size()) {
			String str = ((CountedString) v2.elementAt(pos)).str;
			pos++;
			if (str == null) {
				continue;
			}
			if (contains(v1, str)) {
				incrementCounterAndSort(v1, str);
			} else {
				v1.add(new CountedString(str));
			}
		}
		return v1;
	}
	
	/**
	 * Normalises a URL in two steps:
	 * first, everything below a "/~login" component is cut off; then a
	 * trailing ".word" suffix (such as ".html") is removed.
	 * The suffix rule is purely textual, so it also applies to a URL that
	 * ends in a host name, e.g. "http://example.com" becomes "http://example".
	 *
	 * @param url the URL to clean
	 * @return the cleaned URL
	 */
	public static String cleanURL(String url) {
		// don't look into hierarchy under ~login
		Matcher underLogin = Pattern.compile("^(.+\\/\\~[^\\/]+)\\/.*$").matcher(url);
		String shortened = underLogin.matches() ? underLogin.group(1) : url;

		// remove extension such as .html
		Matcher extension = Pattern.compile("^(.+)\\.\\w+$").matcher(shortened);
		return extension.matches() ? extension.group(1) : shortened;
	}
	
	/**
	 * Tells whether one of this person's logins or names shows up in a URL.
	 *
	 * The URL is first passed through cleanURL() and split on '/' and '~'.
	 * Each resulting field is lower-cased; fields shorter than 2 or longer
	 * than 30 characters are ignored. A field is a hit when
	 * - it equals a login, or one of the parts of a login split on '.' or '_';
	 * - it equals one of the space-separated words of a name;
	 * - it is longer than 3 characters and its string-kernel value against a
	 *   whole name exceeds 0.75;
	 * - it and a name word are both longer than 3 characters and their
	 *   string-kernel value exceeds 0.75.
	 *
	 * NOTE(review): fields are lower-cased but logins and names are compared
	 * as stored — presumably they are already lower-case; confirm with callers.
	 *
	 * @param url        the URL to inspect
	 * @param threadName not used by the active code (only referenced by
	 *                   debug output that has since been removed)
	 * @return true on the first hit, false if no field matches
	 */
	public boolean isLoginOrNameInURL(String url, String threadName) {
		final double STRING_KERNEL_THRESHOLD = 0.75;
		StringKernel sk = new StringKernel();

		// Compile each pattern once per call instead of once per login/name
		// for every URL field.
		Pattern urlSeparators = Pattern.compile("[\\/\\~]");
		Pattern loginSeparators = Pattern.compile("[\\._]");
		Pattern nameSeparator = Pattern.compile(" ");

		String[] fields = urlSeparators.split(cleanURL(url));
		for (int i = 0; i < fields.length; i++) {
			String field = fields[i].toLowerCase();
			// Too short to be meaningful, or too long to be a login/name.
			if (field.length() < 2 || field.length() > 30)
				continue;

			for (int j = 0; j < logins.size(); j++) {
				String login = ((CountedString) logins.elementAt(j)).str;
				if (login.equals(field))
					return true;
				String[] parts = loginSeparators.split(login);
				for (int k = 0; k < parts.length; k++)
					if (parts[k].equals(field))
						return true;
			}

			for (int j = 0; j < names.size(); j++) {
				String name = ((CountedString) names.elementAt(j)).str;
				double value = sk.K(name, field);
				if (value > STRING_KERNEL_THRESHOLD && field.length() > 3)
					return true;
				String[] words = nameSeparator.split(name);
				for (int k = 0; k < words.length; k++) {
					if (words[k].equals(field))
						return true;
					value = sk.K(words[k], field);
					// Fuzzy matches are only trusted for reasonably long strings.
					if (value > STRING_KERNEL_THRESHOLD &&
						 field.length() > 3 && words[k].length() > 3)
						return true;
				}
			}
		}
		return false;
	}
	
	/**
	 * Tells whether the given login is among this person's logins.
	 *
	 * @param login the login to look for
	 * @return true if the login list contains it
	 */
	public boolean findLogin(String login) {
		boolean known = contains(this.logins, login);
		return known;
	}
	
	/**
	 * Tells whether the other person's logins overlap with this person.
	 * A login of the other person counts as overlapping when this person has
	 * the same login, or when the login contains '_' or '.' and, with the
	 * first such character replaced by a space, it intersects this person's
	 * names according to namesIntersect().
	 *
	 * @param person the person whose logins are examined
	 * @return true on the first overlapping login, false otherwise
	 */
	public boolean loginsIntersect(Person person) {
		Pattern separator = Pattern.compile("[_\\.]");
		int idx = 0;
		while (idx < person.logins.size()) {
			String login = ((CountedString) person.logins.elementAt(idx)).str;
			idx++;
			if (contains(logins, login))
				return true;
			Matcher sep = separator.matcher(login);
			if (sep.find()) {
				// Treat "first.last" / "first_last" as the name "first last".
				Person asName = new Person(sep.replaceFirst(" "), "", "");
				if (namesIntersect(asName))
					return true;
			}
		}
		return false;
	}
	
	/**
	 * Looser name comparison used for coreference resolution.
	 * Two names are similar when they are equal ignoring case, or when both
	 * have at least two whitespace-separated words, their first words start
	 * with the same character and their last words are equal ignoring case.
	 * A message is written to System.err when the second rule fires.
	 *
	 * @param n1 first name
	 * @param n2 second name
	 * @return true if the names are considered similar
	 */
	private boolean similarNames (String n1, String n2) {
		if (n1.equalsIgnoreCase (n2))
			return true;
		// Trim before splitting: with leading whitespace, split("\\s+")
		// yields an empty first token and charAt(0) below would throw.
		String[] parts = n1.trim().split ("\\s+");
		String[] newParts = n2.trim().split ("\\s+");
		if (parts.length < 2 || newParts.length < 2)
			return false;
		String fname = parts[0];
		String lname = parts[parts.length-1];
		String newfname = newParts[0];
		String newlname = newParts[newParts.length-1];
		if (fname.charAt(0) == newfname.charAt(0) && lname.equalsIgnoreCase(newlname)) {
			System.err.println ("RESOLVING COREFERENCE " + n1 + " <-> " + n2);
			return true;
		}
		return false;
	}
	
	/**
	 * Tells whether the other person's names overlap with this person.
	 * A name of the other person counts as overlapping when it matches one
	 * of this person's names according to sameName() or similarNames(), or
	 * when it contains a space and, with every space replaced by '.' or by
	 * '_', it equals one of this person's logins.
	 *
	 * @param person the person whose names are examined
	 * @return true on the first overlapping name, false otherwise
	 */
	public boolean namesIntersect(Person person) {
		for (int i = 0; i < person.names.size(); i++) {
			String theirs = ((CountedString) person.names.elementAt(i)).str;

			for (int j = 0; j < names.size(); j++) {
				String ours = ((CountedString) names.elementAt(j)).str;
				if (sameName(theirs, ours) || similarNames (theirs, ours))
					return true;
			}

			// Single-word names cannot be turned into a first.last style login.
			if (theirs.indexOf(' ') < 0)
				continue;

			String dotted = theirs.replace(' ', '.');      // look for login firstName.lastName
			String underscored = theirs.replace(' ', '_'); // look for login firstName_lastName
			if (contains(logins, dotted) || contains(logins, underscored))
				return true;
		}
		return false;
	}
	
	/**
	 * Adds a single key word; shorthand for addKeyWords() with a one-element array.
	 *
	 * @param word the key word to add
	 */
	public void addKeyWord (String word) {
		String[] single = { word };
		addKeyWords (single);
	}
	
	/**
	 * Adds every given word to the key-word vector with value 1.0.
	 * If the key-word vector is currently null it is created on this
	 * person's alphabet first, so the words are not lost.
	 *
	 * @param _words the key words to add
	 */
	public void addKeyWords (String[] _words) {
		// addWords() cannot create the vector on our behalf: assigning to its
		// parameter would not reach this field and the words would be dropped.
		if (keyWords == null)
			keyWords = new AugmentableFeatureVector (this.alphabet);
		addWords (_words, keyWords);
	}

	/**
	 * Adds a single word to the context model with weight 1.0.
	 *
	 * @param word the word to add
	 */
	public void addWordToContextModel (String word) {
		final double unitWeight = 1.0;
		addWordToContextModel (word, unitWeight);
	}

	/**
	 * Adds a single weighted word to the context model; shorthand for
	 * addWordsToContextModel() with one-element arrays.
	 *
	 * @param word the word to add
	 * @param val  the weight to add for the word
	 */
	public void addWordToContextModel (String word, double val) {
		String[] singleWord = { word };
		double[] singleWeight = { val };
		addWordsToContextModel (singleWord, singleWeight);
	}

	/**
	 * Adds every given word to the context model with value 1.0.
	 * If the context model is currently null it is created on this person's
	 * alphabet first, so the words are not lost.
	 *
	 * @param _words the words to add
	 */
	public void addWordsToContextModel (String[] _words) {
		// addWords() cannot create the vector on our behalf: assigning to its
		// parameter would not reach this field and the words would be dropped.
		if (contextModel == null)
			contextModel = new AugmentableFeatureVector (this.alphabet);
		addWords (_words, contextModel);
	}
	
	/**
	 * Adds every given word to the context model with the matching weight.
	 * If the context model is currently null it is created on this person's
	 * alphabet first, so the words are not lost.
	 *
	 * @param _words  the words to add
	 * @param weights one weight per word, in the same order
	 * @throws IllegalArgumentException if the two arrays differ in length
	 */
	public void addWordsToContextModel (String[] _words, double[] weights) {
		// addWords() cannot create the vector on our behalf: assigning to its
		// parameter would not reach this field and the words would be dropped.
		if (contextModel == null)
			contextModel = new AugmentableFeatureVector (this.alphabet);
		addWords (_words, contextModel, weights);
	}

	/**
	 * Adds every string of toadd to the given vector with value 1.0.
	 *
	 * NOTE(review): when words is null a temporary vector is created and
	 * filled, but it is only bound to the local parameter, so the caller
	 * never sees it. Pass a non-null vector if the words must be retained.
	 *
	 * @param toadd the strings to add
	 * @param words the vector to add them to
	 */
	public void addWords (String[] toadd, AugmentableFeatureVector words) {
		AugmentableFeatureVector target = words;
		if (target == null) {
			target = new AugmentableFeatureVector (this.alphabet);
		}
		int i = 0;
		while (i < toadd.length) {
			target.add (toadd[i], 1.0);
			i++;
		}
	}

	/**
	 * Adds every string of toadd to the given vector with the weight at the
	 * same position of the weights array.
	 *
	 * NOTE(review): when words is null a temporary vector is created and
	 * filled, but it is only bound to the local parameter, so the caller
	 * never sees it. Pass a non-null vector if the words must be retained.
	 *
	 * @param toadd   the strings to add
	 * @param words   the vector to add them to
	 * @param weights one weight per string
	 * @throws IllegalArgumentException if toadd and weights differ in length
	 */
	public void addWords (String[] toadd, AugmentableFeatureVector words, double[] weights) {
		AugmentableFeatureVector target = words;
		if (target == null) {
			target = new AugmentableFeatureVector (this.alphabet);
		}
		if (weights.length != toadd.length) {
			throw new IllegalArgumentException ("tadd.size: " + toadd.length + ", weights.size: " + weights.length);
		}
		int i = 0;
		while (i < toadd.length) {
			target.add (toadd[i], weights[i]);
			i++;
		}
	}

	/**
	 * Zeroes the key-word values of all stop words.
	 * The stop words are loaded into a scratch vector built on this person's
	 * alphabet, and for every index present in that scratch vector the
	 * corresponding key-word value is set to 0.0.
	 *
	 * NOTE(review): building the scratch vector presumably registers unseen
	 * stop words in the shared alphabet, and setValue() is called even for
	 * indices the key-word vector may not contain — confirm both are intended.
	 *
	 * @param stop set of stop-word strings
	 */
	public void stopKeyWords(HashSet stop) {
		String[] stopWords = (String[]) stop.toArray(new String[stop.size()]);
		AugmentableFeatureVector scratch = new AugmentableFeatureVector (this.alphabet);
		addWords (stopWords, scratch);
		int loc = 0;
		while (loc < scratch.numLocations()) {
			int featureIndex = scratch.indexAtLocation (loc);
			keyWords.setValue (featureIndex, 0.0);
			loc++;
		}
	}
	
	/**
	 * Removes the first page whose fileName equals the given one.
	 * Does nothing if no page matches.
	 *
	 * @param fileName file name of the page to remove
	 */
	public void removePage(String fileName) {
		int idx = 0;
		while (idx < pages.size()) {
			WebPage candidate = (WebPage) pages.elementAt(idx);
			if (candidate.fileName.equals(fileName)) {
				pages.removeElementAt(idx);
				return; // only the first match is removed
			}
			idx++;
		}
	}


	/**
	 * Adds value to the e-mail link counter stored under name, starting the
	 * counter at value when there is no entry yet.
	 *
	 * @param name  key of the counter
	 * @param value amount to add
	 */
	public void addEmailLink(String name, int value) {
		Integer previous = (Integer) emailLinks.get(name);
		int total = (previous == null) ? value : previous.intValue() + value;
		emailLinks.put(name, new Integer(total));
	}
	
	/**
	 * Counts one e-mail link to the given person, keyed by the value of
	 * p.getFirstName(). Nothing is recorded when that value is null.
	 *
	 * @param p the person linked to
	 */
	public void addEmailLink(Person p) {
		String key = p.getFirstName();
		if (key != null) {
			addEmailLink(key, 1);
		}
	}
	
	/**
	 * Computes the cosine between the given words (each with value 1.0, built
	 * on this person's alphabet) and the context model: their dot product
	 * divided by the product of their two-norms.
	 *
	 * @param model the words to compare against the context model
	 * @return the cosine, or 0.0 when model or the context model is null or
	 *         when either vector has a two-norm of zero
	 */
	public double calculateCosineWithContextModel(String[] model) {
		if (model == null || contextModel == null) {
			return 0.0;
		}
		AugmentableFeatureVector query = new AugmentableFeatureVector (this.alphabet);
		addWords (model, query);

		double queryNorm = query.twoNorm();
		double contextNorm = contextModel.twoNorm();
		// Guard against division by zero.
		boolean degenerate = (queryNorm == 0 || contextNorm == 0);
		if (degenerate) {
			return 0.0;
		}
		double dot = query.dotProduct (contextModel);
		return dot / (queryNorm*contextNorm);
	}
	

	/**
	 * Builds a vector from the given words (each with value 1.0, on this
	 * person's alphabet) and delegates to the feature-vector overload of
	 * calculateCosineWithKeyWords.
	 *
	 * @param model the words to compare against the key words
	 * @return 0.0 when model or the key-word vector is null, otherwise the
	 *         value returned by the overload
	 */
	public double calculateCosineWithKeyWords (String[] model) {
		if (model == null || keyWords == null) {
			return 0.0;
		}
		AugmentableFeatureVector query = new AugmentableFeatureVector (this.alphabet);
		addWords (model, query);
		double cosine = calculateCosineWithKeyWords (query);
		return cosine;
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -