📄 emailpeopleextractor.java
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** Extracts people from email messages and resolves coreferent mentions.
* @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
*/
package edu.umass.cs.mallet.projects.dex.ie;
import edu.umass.cs.mallet.projects.dex.types.*;
import edu.umass.cs.mallet.base.util.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.util.logging.*;
public class EmailPeopleExtractor {
private static Logger logger = MalletLogger.getLogger(EmailPeopleExtractor.class.getName());
public EmailPeopleExtractor(File dir, HashSet stopWords) {
people = new People();
this.stopWords = stopWords;
processDir(dir);
people.writeToFile(new File("disambig_log.txt"));
}
/**
 * Returns the people collection accumulated by this extractor.
 *
 * @return the same <code>People</code> instance that the constructor created
 */
public People getPeople() {
    return people;
}
/** An e-mail address delimited on both sides by a character that cannot be part of an address. */
private static final Pattern NAME_EMBEDDED_EMAIL =
    Pattern.compile("[^\\w\\.\\-][\\w\\.\\-]+\\@[\\w\\.\\-]+[^\\w\\.\\-]");
/** Any single character that is not a letter, digit or underscore. */
private static final Pattern NAME_NON_WORD_CHAR = Pattern.compile("\\W");
/** One or more consecutive spaces. */
private static final Pattern NAME_SPACE_RUN = Pattern.compile(" +");
/** Filler words that sometimes trail a display name. */
private static final Pattern NAME_EXTRA_WORDS = Pattern.compile(" (e mail|mailto)");

/**
 * Normalizes the display-name part of a correspondent entry.
 * <p>
 * Steps, in order: e-mail addresses embedded between two non-address
 * characters are blanked out; every non-word character becomes a space;
 * leading and trailing spaces are dropped; runs of spaces collapse to a
 * single space; the fragments " e mail" and " mailto" are removed.
 *
 * @param name raw name text; may be <code>null</code>
 * @return the cleaned name, or <code>null</code> when <code>name</code> is
 *         <code>null</code> or contains no word characters at all
 */
public String cleanName(String name) {
    if (name == null) {
        return null;
    }
    // Remove email address out of name
    name = NAME_EMBEDDED_EMAIL.matcher(name).replaceAll(" ");
    // Replace all non-word characters; only [A-Za-z0-9_] and spaces remain afterwards,
    // so trim() strips exactly the leading and trailing spaces.
    name = NAME_NON_WORD_CHAR.matcher(name).replaceAll(" ").trim();
    if (name.length() == 0) {
        // Nothing usable is left. Empty input is treated the same as blank input.
        return null;
    }
    // decrease number of spaces between words to one space
    name = NAME_SPACE_RUN.matcher(name).replaceAll(" ");
    // remove extra words
    return NAME_EXTRA_WORDS.matcher(name).replaceAll("");
}
/** Punctuation that surrounds names and addresses in correspondent headers. */
private static final Pattern PERSON_PUNCTUATION = Pattern.compile("[\\<\\>\\\"\\'\\,]");
/**
 * Optional name text (ending in a space), then login, optional '+' signs,
 * '@', domain and optional trailing spaces. The name group is written as
 * <code>(.* )?</code>: a nested <code>(.* )*</code> accepts exactly the same
 * strings but backtracks exponentially on entries without an address.
 */
private static final Pattern PERSON_NAME_LOGIN_DOMAIN =
    Pattern.compile("^(.* )?([\\w\\.\\-]+)\\+*\\@([\\w\\.\\-]+) *$");

/**
 * Parses one correspondent entry (for example
 * <code>John Doe &lt;jd@example.com&gt;</code>) and, when it ends in an
 * e-mail address, registers the person with <code>peopleInMessage</code>.
 * <p>
 * The entry is lower-cased and the characters <code>&lt; &gt; " ' ,</code>
 * are replaced by spaces before matching. Text preceding the address is
 * passed through {@link #cleanName(String)}; the name handed to
 * <code>PersonInMessage</code> is <code>null</code> when no such text exists.
 * Entries that do not match are reported at <code>FINE</code> level and
 * otherwise ignored.
 *
 * @param person          raw text of a single correspondent
 * @param peopleInMessage collector that receives the parsed person
 * @param weight          weight passed on to the created <code>PersonInMessage</code>
 */
public void getLoginAndName(String person,
                            PeopleInMessage peopleInMessage,
                            double weight) {
    person = person.toLowerCase();
    person = PERSON_PUNCTUATION.matcher(person).replaceAll(" ");
    Matcher mat = PERSON_NAME_LOGIN_DOMAIN.matcher(person);
    if (!mat.matches()) {
        logger.fine("No match in person " + person + ".");
        return;
    }
    String name = mat.group(1);   // null when the entry is a bare address
    String login = mat.group(2);
    String domain = mat.group(3);
    if (name != null) {
        name = cleanName(name);
    }
    peopleInMessage.addPerson(new PersonInMessage(name, login, domain, weight));
}
/**
 * Maps a correspondent header name to the weight given to people listed
 * under it. The comparison is case-sensitive and expects lower-case input.
 *
 * @param prefix header name such as "from", "to", "cc" or "bcc"
 * @return 2 for "from", 1 for "to", 0.5 for "cc" and "bcc", 0 otherwise
 */
public double getWeight(String prefix) {
    double weight;
    if (prefix.equals("from")) {
        weight = 2;
    } else if (prefix.equals("to")) {
        weight = 1;
    } else if (prefix.equals("cc") || prefix.equals("bcc")) {
        weight = 0.5;
    } else {
        weight = 0;
    }
    return weight;
}
/**
 * Adds the words of a body line to the context model of
 * <code>peopleInMessage</code>.
 * <p>
 * A line longer than 40 characters that contains no space is skipped
 * entirely (heuristic for encoded MIME attachment data). Otherwise the line
 * is split on runs of non-word characters, and each lower-cased, non-empty
 * token that is not a stop word is added.
 *
 * @param line            one line of message text
 * @param peopleInMessage collector whose context model receives the words
 */
public void getWordsFromLine(String line, PeopleInMessage peopleInMessage) {
    // remove MIME attachments
    boolean containsSpace = Pattern.compile(" ").matcher(line).find();
    if (!containsSpace && line.length() > 40) {
        return;
    }
    String[] tokens = Pattern.compile("\\W+").split(line);
    for (int k = 0; k < tokens.length; k++) {
        String token = tokens[k].toLowerCase();
        if (token.length() > 0 && !stopWords.contains(token)) {
            peopleInMessage.addWordToContextModel(token);
        }
    }
}
/**
 * Handles a single line of a message.
 * <p>
 * Correspondent lines (From/To/Cc/Bcc and their continuation lines) are
 * split on commas and each entry is handed to
 * {@link #getLoginAndName(String, PeopleInMessage, double)}. Other header
 * lines are dropped, except Subject, which is treated like body text. Body
 * lines contribute their words to the context model. Whenever the nested
 * message level reported by <code>lineProcessor</code> rises above the value
 * it had on entry, <code>peopleInMessage.reduceWeights()</code> is called first.
 *
 * @param line            the line to process
 * @param peopleInMessage collector for people and context words of the message
 * @param lineProcessor   per-message line classifier; its state is updated
 */
public void processLine(String line,
                        PeopleInMessage peopleInMessage,
                        LineProcessor lineProcessor) {
    final double levelOnEntry = lineProcessor.getNestedMessageLevel();

    // The classifier calls mutate state, so the original short-circuit order is kept.
    boolean correspondent = lineProcessor.nextCorrespondentLine(line);
    if (!correspondent) {
        correspondent = lineProcessor.isCorrespondentLine(line);
    }

    if (!correspondent) {
        // Remove header lines (except for Subject)
        boolean header = lineProcessor.nextHeaderLine(line);
        if (!header) {
            header = lineProcessor.isHeaderLine(line);
        }
        if (header) {
            return;
        }
        if (lineProcessor.updateNestedMessageLevel(line) > levelOnEntry) {
            peopleInMessage.reduceWeights();
        }
        getWordsFromLine(line, peopleInMessage);
        return;
    }

    if (lineProcessor.getNestedMessageLevel() > levelOnEntry) {
        peopleInMessage.reduceWeights();
    }
    String prefix = lineProcessor.getPrefix();
    String stripped = lineProcessor.getLineWithoutPrefix(line);
    String entries = lineProcessor.removeCommasInBrackets(stripped);
    String[] peopleInLine = Pattern.compile("\\,").split(entries);
    for (int k = 0; k < peopleInLine.length; k++) {
        getLoginAndName(peopleInLine[k], peopleInMessage, getWeight(prefix));
    }
}
/**
 * Reads one message file line by line, extracts its people and merges them
 * into this extractor's <code>people</code> collection.
 * <p>
 * Reading stops at the end of the file or at the first line for which
 * <code>LineProcessor.isLastLine</code> is true. An empty file is processed
 * as a message with no lines. The reader is closed even when processing a
 * line fails. Any <code>IOException</code> is reported on
 * <code>System.err</code> and the file is skipped.
 *
 * @param file the message file to process
 */
public void processFile(File file) {
    try {
        PeopleInMessage peopleInMessage = new PeopleInMessage();
        LineProcessor lineProcessor = new LineProcessor();
        BufferedReader in = new BufferedReader(new FileReader(file));
        try {
            // readLine() returns null at end of stream, including on an empty file.
            String line = in.readLine();
            while (line != null && !lineProcessor.isLastLine(line)) {
                processLine(line, peopleInMessage, lineProcessor);
                line = in.readLine();
            }
        } finally {
            in.close();
        }
        People simplePeople = peopleInMessage.getSimplePeople();
        simplePeople.buildEmailLinks();
        people.addAll(simplePeople);
    } catch (IOException e) {
        // Covers open, read and close failures alike.
        System.err.print("Cannot open file ");
        System.err.println(file.getName());
    }
}
/**
 * Recursively processes every regular file below <code>dir</code> with
 * {@link #processFile(File)}.
 * <p>
 * If <code>dir</code> cannot be listed (it is not a directory, or an I/O
 * error occurs), a message is printed on <code>System.err</code> and the
 * call returns without processing anything.
 *
 * @param dir directory to traverse
 */
public void processDir(File dir) {
    File[] files = dir.listFiles();
    if (files == null) {
        // listFiles() returns null instead of throwing when the path is not a readable directory.
        System.err.print("Cannot list directory ");
        System.err.println(dir.getName());
        return;
    }
    for (int i = 0; i < files.length; i++) {
        if (files[i].isFile()) {
            processFile(files[i]);
        } else if (files[i].isDirectory()) {
            processDir(files[i]);
        }
    }
}
//Inner classes
/**
 * Stateful classifier for the lines of a single e-mail message.
 * <p>
 * It remembers whether the previous line was a header line or a
 * correspondent (From/To/Cc/Bcc) line so that continuation lines, which
 * start with whitespace, can be attributed to it. It also remembers the
 * prefix of the last recognized header and a "nested message level" that
 * grows with each From header and with the depth of '&gt;' quoting.
 * One instance is meant to be fed the lines of one message in order.
 */
public class LineProcessor {
    // Patterns are compiled once per instance instead of once per processed line.
    private final Pattern blankLinePattern = Pattern.compile("^\\s*$");
    private final Pattern htmlContentTypePattern =
        Pattern.compile("^Content-Type\\: .*\\/html");
    private final Pattern headerPattern =
        Pattern.compile("^((\\> )*|(\\>+ ))([\\w\\-]+)\\: ");
    private final Pattern correspondentPattern =
        Pattern.compile("^((\\> )*|(\\>+ ))(from|to|cc|bcc)\\: ",
                        Pattern.CASE_INSENSITIVE);
    private final Pattern leadingWhitespacePattern = Pattern.compile("^\\s");
    private final Pattern doubleQuotePattern = Pattern.compile("\\\"");
    private final Pattern commaPattern = Pattern.compile("\\,");
    private final Pattern quoteMarkersPattern = Pattern.compile("^([\\> ]+)[^\\> ]");

    /** Creates a processor in its initial state: no header seen, level 0.5, empty prefix. */
    public LineProcessor() {
        headerLine = false;
        correspondentLine = false;
        nestedMessageLevel = 0.5;
        prefix = "";
    }

    /** Returns true when the line is empty or consists only of whitespace. */
    public boolean isEmptyLine(String line) {
        return blankLinePattern.matcher(line).matches();
    }

    /**
     * Returns true when the line starts with a <code>Content-Type: </code>
     * header whose value contains <code>/html</code>.
     */
    public boolean isLastLine(String line) {
        return htmlContentTypePattern.matcher(line).lookingAt();
    }

    /**
     * Tests whether the line starts with a header of the form
     * <code>Name: </code>, optionally preceded by '&gt;' quote markers.
     * On a match the lower-cased header name is stored as the current
     * prefix. A Subject header stores its prefix but yields false and leaves
     * the header flag untouched; any other header sets the flag and yields true.
     */
    public boolean isHeaderLine(String line) {
        Matcher mat = headerPattern.matcher(line);
        if (!mat.lookingAt()) {
            return false;
        }
        prefix = mat.group(4).toLowerCase();
        if (prefix.equals("subject")) {
            return false;
        }
        headerLine = true;
        return true;
    }

    /**
     * Tests whether the line starts with a From, To, Cc or Bcc header
     * (case-insensitive), optionally preceded by '&gt;' quote markers.
     * On a match the correspondent flag is set, the lower-cased header name
     * becomes the current prefix, and a From header doubles the nested
     * message level.
     */
    public boolean isCorrespondentLine(String line) {
        Matcher mat = correspondentPattern.matcher(line);
        if (!mat.lookingAt()) {
            return false;
        }
        correspondentLine = true;
        prefix = mat.group(4).toLowerCase();
        if (prefix.equals("from")) {
            nestedMessageLevel = 2 * nestedMessageLevel;
        }
        return true;
    }

    /**
     * Tests whether the line continues a header line. A line starting with
     * whitespace yields the current header flag; any other line clears the
     * flag and yields false.
     */
    public boolean nextHeaderLine(String line) {
        if (leadingWhitespacePattern.matcher(line).lookingAt()) {
            return headerLine;
        }
        headerLine = false;
        return false;
    }

    /**
     * Tests whether the line continues a correspondent line. A line starting
     * with whitespace yields the current correspondent flag; any other line
     * clears the flag and yields false.
     */
    public boolean nextCorrespondentLine(String line) {
        if (leadingWhitespacePattern.matcher(line).lookingAt()) {
            return correspondentLine;
        }
        correspondentLine = false;
        return false;
    }

    /** Returns the current header flag. */
    public boolean wasHeaderLine() {
        return headerLine;
    }

    /** Returns the current correspondent flag. */
    public boolean wasCorrespondentLine() {
        return correspondentLine;
    }

    /**
     * Strips a leading From/To/Cc/Bcc header (including quote markers and the
     * trailing ": ") from the line. Lines without such a header are returned unchanged.
     */
    public String getLineWithoutPrefix(String line) {
        Matcher mat = correspondentPattern.matcher(line);
        if (mat.lookingAt()) {
            line = mat.replaceAll("");
        }
        return line;
    }

    /** Returns the lower-cased name of the last header recognized, or "" if none. */
    public String getPrefix() {
        return prefix;
    }

    /** Returns the current nested message level. */
    public double getNestedMessageLevel() {
        return nestedMessageLevel;
    }

    /**
     * Replaces commas that appear inside double-quoted sections with spaces,
     * so that a later split on commas does not break a quoted name apart.
     * A line without any double quote is returned unchanged. Otherwise the
     * quotes themselves are dropped and every piece between them is
     * re-joined with a single space in front of it.
     */
    public String removeCommasInBrackets(String line) {
        if (line.indexOf('"') < 0) {
            return line;
        }
        String[] chunks = doubleQuotePattern.split(line);
        StringBuffer rebuilt = new StringBuffer();
        for (int i = 0; i < chunks.length; i++) {
            String chunk = chunks[i];
            // Odd-numbered pieces are the ones that were enclosed in quotes.
            if (i % 2 == 1) {
                chunk = commaPattern.matcher(chunk).replaceAll(" ");
            }
            rebuilt.append(' ').append(chunk);
        }
        return rebuilt.toString();
    }

    /**
     * Raises the nested message level according to the quoting depth of the
     * line. When the line begins with a run of '&gt;' and space characters
     * followed by some other character, the candidate level is 2 raised to
     * the number of '&gt;' characters in that run; the stored level is
     * replaced only if the candidate is larger.
     *
     * @return the nested message level after the update
     */
    public double updateNestedMessageLevel(String line) {
        Matcher mat = quoteMarkersPattern.matcher(line);
        if (mat.lookingAt()) {
            String markers = mat.group(1);
            // Accumulate in a double: an int would wrap around from 31 markers on.
            double level = 1;
            for (int i = 0; i < markers.length(); i++) {
                if (markers.charAt(i) == '>') {
                    level = 2 * level;
                }
            }
            if (nestedMessageLevel < level) {
                nestedMessageLevel = level;
            }
        }
        return nestedMessageLevel;
    }

    /** True while the most recent line(s) belong to a non-Subject header. */
    public boolean headerLine;
    /** True while the most recent line(s) belong to a From/To/Cc/Bcc header. */
    public boolean correspondentLine;
    /** Starts at 0.5; doubled by each From header and raised by quoting depth. */
    public double nestedMessageLevel;
    /** Lower-cased name of the last header recognized. */
    public String prefix;
}
public People people;
public HashSet stopWords;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -