📄 person.java
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** Keeps a person's names, logins, domains, etc.
* @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
*/
package edu.umass.cs.mallet.projects.dex.types;
import edu.umass.cs.mallet.projects.dex.web.*;
import edu.umass.cs.mallet.base.types.AugmentableFeatureVector;
import edu.umass.cs.mallet.base.types.RankedFeatureVector;
import edu.umass.cs.mallet.base.types.Alphabet;
import java.util.*;
import java.io.*;
import java.util.regex.*;
import edu.umass.cs.mallet.base.types.StringKernel;
public class Person implements Serializable, Comparable {
/**
 * Creates an empty person: no names, logins, domains, pages or links,
 * an id of -1 and an occurrence count of 1.
 */
public Person() {
    // Identity and bookkeeping flags.
    id = -1;
    numberOfOccurrences = 1;
    processedForWebPages = false;
    processedForContactInformation = false;

    // Name, login and domain lists start out empty.
    names = new Vector();
    logins = new Vector();
    domains = new Vector();

    // Both feature vectors are built over the same fresh alphabet.
    alphabet = new Alphabet();
    keyWords = new AugmentableFeatureVector(alphabet, 1, false);
    contextModel = new AugmentableFeatureVector(alphabet, 1, false);
    topKeyWords = new ArrayList();

    // Pages, contact data and link structures.
    pages = new Vector();
    contextPages = new Vector();
    contactRecord = new ContactRecord();
    emailLinks = new HashMap();
    inLinks = new HashSet();
    outLinks = new HashSet();
}
/**
 * Creates a person seeded with at most one name, one login and one domain.
 * Each non-null argument is wrapped in a new CountedString and appended to
 * the corresponding list; null arguments are skipped.
 *
 * @param name   initial name, or null for none
 * @param login  initial login, or null for none
 * @param domain initial domain, or null for none
 */
public Person(String name, String login, String domain) {
    this();
    if (name != null) {
        names.addElement(new CountedString(name));
    }
    if (login != null) {
        logins.addElement(new CountedString(login));
    }
    if (domain != null) {
        domains.addElement(new CountedString(domain));
    }
}
/**
 * Creates a person from existing lists and feature vectors. Any argument may
 * be null, in which case the default set up by the no-arg constructor is kept.
 * The elements of the given vectors are appended to this person's own lists;
 * the feature vectors are adopted as-is, and this person's alphabet becomes
 * the alphabet of the adopted vector(s).
 *
 * @param names_        elements to append to the names list, or null
 * @param logins_       elements to append to the logins list, or null
 * @param domains_      elements to append to the domains list, or null
 * @param keyWords_     keyword vector to adopt, or null
 * @param contextModel_ context-model vector to adopt, or null
 * @throws IllegalArgumentException if both feature vectors are given but do
 *         not share the very same alphabet instance
 */
public Person(Vector names_, Vector logins_, Vector domains_,
              AugmentableFeatureVector keyWords_, AugmentableFeatureVector contextModel_) {
    this();

    if (names_ != null) {
        names.addAll(names_);
    }
    if (logins_ != null) {
        logins.addAll(logins_);
    }
    if (domains_ != null) {
        domains.addAll(domains_);
    }

    boolean hasKeyWords = (keyWords_ != null);
    if (hasKeyWords) {
        keyWords = keyWords_;
        alphabet = keyWords_.getAlphabet();
    }

    if (contextModel_ == null) {
        return;
    }
    contextModel = contextModel_;
    // Identity comparison: both vectors must be built over one shared alphabet object.
    if (hasKeyWords && keyWords_.getAlphabet() != contextModel_.getAlphabet()) {
        throw new IllegalArgumentException ("feature vector dictionaries don't match.");
    }
    alphabet = contextModel_.getAlphabet();
}
/**
 * @return an iterator over this person's pages list
 */
public Iterator pageIterator() {
    Iterator it = pages.iterator();
    return it;
}
/**
 * Appends a page to this person's pages list.
 *
 * @param webPage the page to append
 */
public void addPage(WebPage webPage) {
    pages.add(webPage);
}
/**
 * @return the current value of the occurrence counter
 */
public int getNumberOfOccurrences() {
    return this.numberOfOccurrences;
}
/**
 * Replaces this person's contact record.
 *
 * @param c the new contact record (stored as given, not copied)
 */
public void setContactRecord(ContactRecord c) {
    contactRecord = c;
}
/**
 * @return this person's contact record (the stored instance, not a copy)
 */
public ContactRecord getContactRecord() {
    return contactRecord;
}
/**
 * Replaces the alphabet reference held by this person. Only the field is
 * reassigned; existing feature vectors are not touched by this call.
 *
 * @param a the new alphabet
 */
public void setAlphabet(Alphabet a) {
    alphabet = a;
}
/**
 * @return the live set of incoming-link indices (the internal set itself,
 *         so changes made by the caller are visible to this person)
 */
public HashSet getInLinks() {
    return inLinks;
}
/**
 * Records an incoming link.
 *
 * @param i index to add, boxed as an Integer, to the in-links set
 */
public void addInLink(int i) {
    Integer boxed = new Integer(i);
    inLinks.add(boxed);
}
/**
 * Records an outgoing link.
 *
 * @param i index to add, boxed as an Integer, to the out-links set
 */
public void addOutLink(int i) {
    Integer boxed = new Integer(i);
    outLinks.add(boxed);
}
/**
 * Removes an outgoing link if present; does nothing otherwise.
 *
 * @param index index to remove from the out-links set
 */
public void removeOutLink(int index) {
    Integer boxed = new Integer(index);
    this.outLinks.remove(boxed);
}
/**
 * @return this person's id
 */
public int getId() {
    return id;
}
/**
 * @param i the new id for this person
 */
public void setId(int i) {
    id = i;
}
/**
 * Returns the string of the first entry in the names list. Despite the
 * method name, this is the first stored name, not a given name.
 *
 * @return the first stored name, or null when the names list is empty
 */
public String getFirstName() {
    if (names.isEmpty()) {
        return null;
    }
    CountedString head = (CountedString) names.firstElement();
    return head.str;
}
/**
 * Checks whether any stored name is considered the same as the given name
 * according to sameName.
 *
 * @param name1 the name to look for
 * @return true as soon as one stored name matches, false if none does
 */
public boolean findName(String name1) {
    int idx = 0;
    while (idx < names.size()) {
        CountedString stored = (CountedString) names.elementAt(idx);
        if (sameName(name1, stored.str)) {
            return true;
        }
        idx++;
    }
    return false;
}
/**
 * Guesses the surname from the stored names. The names are scanned in order
 * and the last space-separated word of the first usable name is returned.
 * A name is usable when it contains a space and splits into one to three words.
 *
 * @return the guessed surname, or null when no stored name is usable
 */
public String getSurname() {
    for (int i = 0; i < names.size(); i++) {
        String name = ((CountedString) names.elementAt(i)).str;
        // Single-word names carry no separable surname.
        if (name.indexOf(' ') < 0) {
            continue;
        }
        String[] words = name.split(" ");
        // split() drops trailing empty strings, so a name made only of spaces
        // yields an empty array; skip it instead of indexing words[-1].
        if (words.length == 0) {
            continue;
        }
        // If name is long we cannot confidently specify where surname is
        if (words.length > 3) {
            continue;
        }
        return words[words.length - 1];
    }
    return null;
}
/**
 * Tells whether a vector of CountedString entries has an entry that reports
 * itself equal to the given string (via CountedString.equals).
 *
 * @param vec vector whose elements are all CountedString instances
 * @param str the string to look for
 * @return true if some entry equals str, false otherwise
 */
public static boolean contains(Vector vec, String str) {
    int idx = 0;
    while (idx < vec.size()) {
        CountedString entry = (CountedString) vec.elementAt(idx);
        if (entry.equals(str)) {
            return true;
        }
        idx++;
    }
    return false;
}
/**
 * Tells whether a vector of Integer entries holds a value equal to g.
 *
 * @param vec vector whose elements are all Integer instances
 * @param g   the value to look for
 * @return true if some entry equals g, false otherwise
 */
public static boolean contains(Vector vec, Integer g) {
    int idx = 0;
    while (idx < vec.size()) {
        Integer candidate = (Integer) vec.elementAt(idx);
        if (candidate.equals(g)) {
            return true;
        }
        idx++;
    }
    return false;
}
/**
 * Exchanges the elements at positions ind and ind + 1.
 *
 * @param vec vector to modify; the element at ind + 1 must be a CountedString
 * @param ind index of the lower of the two neighbouring positions
 */
public static void swapNeighbors(Vector vec, int ind) {
    CountedString upper = (CountedString) vec.elementAt(ind + 1);
    Object lower = vec.elementAt(ind);
    vec.setElementAt(upper, ind);
    vec.setElementAt(lower, ind + 1);
}
/**
 * Increments the counter of the first entry equal to str and then moves that
 * entry towards the front while its count exceeds that of its predecessor.
 * Does nothing when no entry equals str.
 *
 * @param vec vector of CountedString entries
 * @param str the string whose counter should be incremented
 */
public static void incrementCounterAndSort(Vector vec, String str) {
    // Locate the first matching entry.
    int hit = -1;
    for (int pos = 0; pos < vec.size() && hit < 0; pos++) {
        if (((CountedString) vec.elementAt(pos)).equals(str)) {
            hit = pos;
        }
    }
    if (hit < 0) {
        return;
    }

    ((CountedString) vec.elementAt(hit)).cnt++;

    // Walk towards the front, swapping wherever a pair is out of order.
    for (int lower = hit - 1; lower >= 0; lower--) {
        CountedString above = (CountedString) vec.elementAt(lower + 1);
        CountedString below = (CountedString) vec.elementAt(lower);
        if (above.cnt > below.cnt) {
            swapNeighbors(vec, lower);
        }
    }
}
/**
 * Heuristically decides whether two space-separated names denote the same person.
 *
 * Each word of name1 is paired with at most one not-yet-used word of name2.
 * Two words pair up if they are identical, or if one of them is a single
 * letter (an initial), neither is the last word of its name, and their first
 * characters agree. The names are the same when at least two words pair up,
 * at least one identical pair is longer than one character and lies in the
 * second half of name2, and both names have fewer than twice as many words as
 * there are pairs.
 *
 * Empty tokens produced by consecutive spaces never take part in a pairing.
 * A separate "consumed" flag array is used so that such an empty token cannot
 * be mistaken for an already-used word of name2.
 *
 * @param name1 first name, words separated by single spaces
 * @param name2 second name, words separated by single spaces
 * @return true if the names are judged to be the same
 */
public static boolean sameName(String name1, String name2) {
    String[] words1 = name1.split(" ");
    String[] words2 = name2.split(" ");
    boolean[] consumed = new boolean[words2.length];
    int matches = 0;
    boolean longWordMatch = false;

    for (int i = 0; i < words1.length; i++) {
        String w1 = words1[i];
        if (w1.length() == 0) {
            continue; // artifact of consecutive spaces, not a real word
        }
        for (int j = 0; j < words2.length; j++) {
            String w2 = words2[j];
            if (consumed[j] || w2.length() == 0) {
                continue;
            }
            if (w1.equals(w2)) {
                matches++;
                consumed[j] = true;
                // at least one long word matches in second half of the name
                if (w1.length() > 1 && 2 * j >= words2.length) {
                    longWordMatch = true;
                }
                break;
            }
            // If one of words is an initial, and both are not last name
            // and the initial matches one of the first names
            boolean eitherIsInitial = w1.length() == 1 || w2.length() == 1;
            boolean neitherIsLast = i < words1.length - 1 && j < words2.length - 1;
            if (eitherIsInitial && neitherIsLast && w1.charAt(0) == w2.charAt(0)) {
                matches++;
                consumed[j] = true;
                break;
            }
        }
    }

    // At least two words should be identical (one long word among them)
    // and number of non-matched words is less than number of matched words
    return longWordMatch && matches > 1
            && words1.length < 2 * matches && words2.length < 2 * matches;
}
/**
 * Merges the strings of v2 into v1, modifying v1 in place. A string not yet
 * present in v1 is appended as a new CountedString; a string already present
 * has its counter incremented and is re-sorted. Entries of v2 whose string is
 * null are ignored.
 *
 * @param v1 destination vector of CountedString entries (modified)
 * @param v2 source vector of CountedString entries (only read)
 * @return v1, the same instance that was passed in
 */
public static Vector mergeVectors(Vector v1, Vector v2) {
    for (int idx = 0; idx < v2.size(); idx++) {
        String incoming = ((CountedString) v2.elementAt(idx)).str;
        if (incoming == null) {
            continue;
        }
        if (contains(v1, incoming)) {
            incrementCounterAndSort(v1, incoming);
        } else {
            v1.add(new CountedString(incoming));
        }
    }
    return v1;
}
/**
 * Normalizes a URL in two steps, each applied only if its pattern matches the
 * whole string: first everything below a "/~login" component is cut off, then
 * a trailing ".extension" made of word characters is removed.
 *
 * @param url the URL to clean
 * @return the cleaned URL, or the input unchanged when neither step applies
 */
public static String cleanURL(String url) {
    String[] rules = {
        "^(.+\\/\\~[^\\/]+)\\/.*$",   // don't look into hierarchy under ~login
        "^(.+)\\.\\w+$"               // remove extension such as .html
    };
    String cleaned = url;
    for (int r = 0; r < rules.length; r++) {
        Matcher whole = Pattern.compile(rules[r]).matcher(cleaned);
        if (whole.matches()) {
            cleaned = whole.group(1);
        }
    }
    return cleaned;
}
/**
 * Checks whether some path component of a URL looks like one of this person's
 * logins or names.
 *
 * The URL is cleaned with cleanURL, split on '/' and '~', and each component
 * is lower-cased. Components shorter than 2 or longer than 30 characters are
 * ignored. A component matches when
 * - it equals a login, or one of the parts of a login split on '.' or '_';
 * - it equals one of the space-separated words of a name;
 * - it is longer than 3 characters and its string-kernel value against a whole
 *   name exceeds 0.75; or
 * - it and a name word are both longer than 3 characters and their
 *   string-kernel value exceeds 0.75.
 *
 * @param url        the URL to inspect
 * @param threadName not used by the active code (only referenced by
 *                   commented-out debug output in the original)
 * @return true on the first match, false if nothing matches
 */
public boolean isLoginOrNameInURL(String url, String threadName) {
    final double threshold = 0.75;
    StringKernel kernel = new StringKernel();

    String cleaned = cleanURL(url);
    final Pattern pathSeparators = Pattern.compile("[\\/\\~]");
    final Pattern loginSeparators = Pattern.compile("[\\._]");
    final Pattern nameSeparator = Pattern.compile(" ");

    String[] segments = pathSeparators.split(cleaned);
    for (int s = 0; s < segments.length; s++) {
        String field = segments[s].toLowerCase();
        int fieldLength = field.length();
        if (fieldLength < 2 || fieldLength > 30) {
            continue; // too short or too long to be meaningful
        }
        boolean fieldIsLong = fieldLength > 3;

        // Exact match against a login or one of its parts.
        for (int l = 0; l < logins.size(); l++) {
            String login = ((CountedString) logins.elementAt(l)).str;
            if (login.equals(field)) {
                return true;
            }
            String[] loginParts = loginSeparators.split(login);
            for (int p = 0; p < loginParts.length; p++) {
                if (loginParts[p].equals(field)) {
                    return true;
                }
            }
        }

        // Exact or string-kernel match against a name or one of its words.
        for (int n = 0; n < names.size(); n++) {
            String name = ((CountedString) names.elementAt(n)).str;
            double wholeNameScore = kernel.K(name, field);
            if (wholeNameScore > threshold && fieldIsLong) {
                return true;
            }
            String[] nameWords = nameSeparator.split(name);
            for (int w = 0; w < nameWords.length; w++) {
                String word = nameWords[w];
                if (word.equals(field)) {
                    return true;
                }
                double wordScore = kernel.K(word, field);
                if (wordScore > threshold && fieldIsLong && word.length() > 3) {
                    return true;
                }
            }
        }
    }
    return false;
}
/**
 * @param login the login to look for
 * @return true if this person's logins list contains the given login
 */
public boolean findLogin(String login) {
    boolean known = contains(logins, login);
    return known;
}
/**
 * Checks whether any login of the other person relates to this person: either
 * the login itself is among this person's logins, or the login contains a '.'
 * or '_' and, with the first such separator replaced by a space, it intersects
 * with this person's names (see namesIntersect).
 *
 * @param person the other person whose logins are examined
 * @return true on the first hit, false otherwise
 */
public boolean loginsIntersect(Person person) {
    final Pattern separator = Pattern.compile("[_\\.]");
    for (int idx = 0; idx < person.logins.size(); idx++) {
        String theirLogin = ((CountedString) person.logins.elementAt(idx)).str;
        if (contains(logins, theirLogin)) {
            return true;
        }
        Matcher sep = separator.matcher(theirLogin);
        if (!sep.find()) {
            continue;
        }
        // Treat "first.last" / "first_last" as the name "first last".
        Person loginAsName = new Person(sep.replaceFirst(" "), "", "");
        if (namesIntersect(loginAsName)) {
            return true;
        }
    }
    return false;
}
/**
 * Looser name comparison used for coreference resolution. Two names are
 * similar when they are equal ignoring case, or when both have at least two
 * whitespace-separated words, their first words start with the same character
 * and their last words are equal ignoring case. The second case logs a
 * message to System.err.
 *
 * Surrounding whitespace is trimmed before splitting, so a name with leading
 * whitespace no longer yields an empty first token (which made charAt(0) throw).
 *
 * @param n1 first name
 * @param n2 second name
 * @return true if the names are considered similar
 */
private boolean similarNames (String n1, String n2) {
    if (n1.equalsIgnoreCase (n2)) {
        return true;
    }
    // After trim() every token produced by the split is non-empty.
    String[] firstParts = n1.trim().split ("\\s+");
    String[] secondParts = n2.trim().split ("\\s+");
    if (firstParts.length < 2 || secondParts.length < 2) {
        return false;
    }
    String fname = firstParts[0];
    String lname = firstParts[firstParts.length - 1];
    String newfname = secondParts[0];
    String newlname = secondParts[secondParts.length - 1];
    if (fname.charAt(0) == newfname.charAt(0) && lname.equalsIgnoreCase(newlname)) {
        System.err.println ("RESOLVING COREFERENCE " + n1 + " <-> " + n2);
        return true;
    }
    return false;
}
/**
 * Checks whether any name of the other person relates to this person: either
 * it is the same as or similar to one of this person's names, or it contains a
 * space and, with every space replaced by '.' or by '_', it equals one of this
 * person's logins.
 *
 * @param person the other person whose names are examined
 * @return true on the first hit, false otherwise
 */
public boolean namesIntersect(Person person) {
    for (int p = 0; p < person.names.size(); p++) {
        String theirs = ((CountedString) person.names.elementAt(p)).str;

        for (int m = 0; m < names.size(); m++) {
            String mine = ((CountedString) names.elementAt(m)).str;
            if (sameName(theirs, mine) || similarNames (theirs, mine)) {
                return true;
            }
        }

        // Single-word names cannot form a firstName.lastName style login.
        if (theirs.indexOf(' ') < 0) {
            continue;
        }
        // look for login firstName.lastName
        if (contains(logins, theirs.replace(' ', '.'))) {
            return true;
        }
        // look for login firstName_lastName
        if (contains(logins, theirs.replace(' ', '_'))) {
            return true;
        }
    }
    return false;
}
/**
 * Adds a single keyword by delegating to addKeyWords with a one-element array.
 *
 * @param word the keyword to add
 */
public void addKeyWord(String word) {
    String[] single = { word };
    addKeyWords(single);
}
/**
 * Adds each of the given words to the keyword vector with weight 1.0.
 *
 * If the keyword vector is currently null it is created first. Without that,
 * addWords would fill a temporary vector local to its own call and the words
 * would be lost.
 *
 * @param _words the keywords to add
 */
public void addKeyWords (String[] _words) {
    if (keyWords == null) {
        keyWords = new AugmentableFeatureVector (alphabet, 1, false);
    }
    addWords (_words, keyWords);
}
/**
 * Adds a single word to the context model with weight 1.0.
 *
 * @param word the word to add
 */
public void addWordToContextModel(String word) {
    final double unitWeight = 1.0;
    addWordToContextModel(word, unitWeight);
}
/**
 * Adds a single weighted word to the context model by delegating to the
 * array-based overload with one-element arrays.
 *
 * @param word the word to add
 * @param val  the weight to add for that word
 */
public void addWordToContextModel(String word, double val) {
    String[] oneWord = { word };
    double[] oneWeight = { val };
    addWordsToContextModel(oneWord, oneWeight);
}
/**
 * Adds each of the given words to the context model with weight 1.0.
 *
 * If the context model is currently null it is created first. Without that,
 * addWords would fill a temporary vector local to its own call and the words
 * would be lost.
 *
 * @param _words the words to add
 */
public void addWordsToContextModel (String[] _words) {
    if (contextModel == null) {
        contextModel = new AugmentableFeatureVector (alphabet, 1, false);
    }
    addWords (_words, contextModel);
}
/**
 * Adds each of the given words to the context model with its matching weight.
 *
 * If the context model is currently null it is created first. Without that,
 * addWords would fill a temporary vector local to its own call and the words
 * would be lost.
 *
 * @param _words  the words to add
 * @param weights one weight per word, in the same order
 * @throws IllegalArgumentException if the two arrays differ in length
 *         (raised by addWords)
 */
public void addWordsToContextModel (String[] _words, double[] weights) {
    if (contextModel == null) {
        contextModel = new AugmentableFeatureVector (alphabet, 1, false);
    }
    addWords (_words, contextModel, weights);
}
/**
 * Adds every string in toadd to the given feature vector with weight 1.0.
 *
 * NOTE: when words is null, a fresh vector over this person's alphabet is
 * created and filled, but it is only reachable inside this call; the caller
 * never sees it.
 *
 * @param toadd the strings to add
 * @param words the vector to add them to, or null (see note)
 */
public void addWords(String[] toadd, AugmentableFeatureVector words) {
    AugmentableFeatureVector target =
            (words != null) ? words : new AugmentableFeatureVector (this.alphabet);
    int k = 0;
    while (k < toadd.length) {
        target.add (toadd[k], 1.0);
        k++;
    }
}
/**
 * Adds every string in toadd to the given feature vector, using the weight at
 * the same position in weights.
 *
 * NOTE: when words is null, a fresh vector over this person's alphabet is
 * created and filled, but it is only reachable inside this call; the caller
 * never sees it.
 *
 * @param toadd   the strings to add
 * @param words   the vector to add them to, or null (see note)
 * @param weights one weight per string, in the same order
 * @throws IllegalArgumentException if toadd and weights differ in length
 */
public void addWords(String[] toadd, AugmentableFeatureVector words, double[] weights) {
    AugmentableFeatureVector target =
            (words != null) ? words : new AugmentableFeatureVector (this.alphabet);
    boolean sameLength = (weights.length == toadd.length);
    if (!sameLength) {
        throw new IllegalArgumentException ("tadd.size: " + toadd.length + ", weights.size: " + weights.length);
    }
    int k = 0;
    while (k < toadd.length) {
        target.add (toadd[k], weights[k]);
        k++;
    }
}
/**
 * Neutralizes stop words in the keyword vector: every stop word's value in
 * keyWords is set to 0.0. The stop words are first added to a temporary
 * feature vector over this person's alphabet to obtain their indices.
 *
 * @param stop set of stop words; its elements must be Strings
 */
public void stopKeyWords(HashSet stop) {
    String[] stopWords = (String[]) stop.toArray(new String[]{});
    AugmentableFeatureVector stopVector = new AugmentableFeatureVector (this.alphabet);
    addWords (stopWords, stopVector);

    int location = 0;
    while (location < stopVector.numLocations()) {
        int featureIndex = stopVector.indexAtLocation (location);
        keyWords.setValue (featureIndex, 0.0);
        location++;
    }
}
/**
 * Removes the first page whose fileName equals the given one. Later pages
 * with the same file name, if any, are left in place.
 *
 * @param fileName file name of the page to remove
 */
public void removePage(String fileName) {
    int idx = 0;
    while (idx < pages.size()) {
        WebPage candidate = (WebPage) pages.elementAt(idx);
        if (candidate.fileName.equals(fileName)) {
            pages.removeElementAt(idx);
            return;
        }
        idx++;
    }
}
/**
 * Adds value to the email-link counter stored under name, starting the
 * counter at value when the name has no entry yet.
 *
 * @param name  key of the counter in the emailLinks map
 * @param value amount to add
 */
public void addEmailLink(String name, int value) {
    Integer previous = (Integer) emailLinks.get(name);
    int total = (previous == null) ? value : previous.intValue() + value;
    emailLinks.put(name, new Integer(total));
}
/**
 * Counts one email link to the given person, keyed by that person's first
 * stored name. Nothing happens when the person has no name.
 *
 * @param p the person linked to
 */
public void addEmailLink(Person p) {
    String key = p.getFirstName();
    if (key != null) {
        addEmailLink(key, 1);
    }
}
/**
 * Computes the cosine between the given bag of words (each word weighted 1.0)
 * and this person's context model.
 *
 * @param model the words to compare against the context model
 * @return dot product divided by the product of the two norms; 0.0 when
 *         model or the context model is null, or when either norm is zero
 */
public double calculateCosineWithContextModel(String[] model) {
    if (model == null || this.contextModel == null) {
        return 0.0;
    }
    AugmentableFeatureVector query = new AugmentableFeatureVector (this.alphabet);
    addWords (model, query);

    double queryNorm = query.twoNorm();
    double contextNorm = contextModel.twoNorm();
    // Guard against division by zero for empty vectors.
    boolean degenerate = (queryNorm == 0 || contextNorm == 0);
    if (degenerate) {
        return 0.0;
    }
    double dot = query.dotProduct (contextModel);
    return dot / (queryNorm * contextNorm);
}
/**
 * Builds a feature vector from the given words (each weighted 1.0) and
 * delegates to the feature-vector overload of calculateCosineWithKeyWords.
 *
 * @param model the words to compare against the keyword vector
 * @return the value returned by the overload; 0.0 when model or the keyword
 *         vector is null
 */
public double calculateCosineWithKeyWords(String[] model) {
    boolean nothingToCompare = (model == null || this.keyWords == null);
    if (nothingToCompare) {
        return 0.0;
    }
    AugmentableFeatureVector query = new AugmentableFeatureVector (this.alphabet);
    addWords (model, query);
    return calculateCosineWithKeyWords (query);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -