⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 emailpeopleextractor.java

📁 常用机器学习算法,java编写源代码,内含常用分类算法,包括说明文档
💻 JAVA
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**  Extracts people from email messages and resolves coreferent mentions.
 *   @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
*/
package edu.umass.cs.mallet.projects.dex.ie;

import edu.umass.cs.mallet.projects.dex.types.*;
import edu.umass.cs.mallet.base.util.*;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import java.util.logging.*;

/**
 * Walks a directory tree of email message files and, for every file, collects
 * the correspondents found on From/To/Cc/Bcc lines together with the words of
 * the remaining (non-header) text. The per-message results are accumulated in
 * a single {@link People} collection available through {@link #getPeople()}.
 *
 * <p>Note that all the work happens in the constructor, which calls the public
 * (overridable) {@link #processDir(File)}; subclasses overriding it will be
 * invoked before their own constructor body has run.
 */
public class EmailPeopleExtractor {

	private static final Logger logger = MalletLogger.getLogger(EmailPeopleExtractor.class.getName());

	// Compiled Pattern objects are immutable, so each expression is compiled
	// once here instead of on every processed line.

	/** An email address surrounded by characters that cannot belong to it. */
	private static final Pattern EMAIL_IN_NAME =
		Pattern.compile("[^\\w\\.\\-][\\w\\.\\-]+\\@[\\w\\.\\-]+[^\\w\\.\\-]");
	/** A single character outside [a-zA-Z_0-9]. */
	private static final Pattern NON_WORD_CHAR = Pattern.compile("\\W");
	/** Captures the text between leading and trailing spaces in group 1. */
	private static final Pattern SURROUNDING_SPACES = Pattern.compile("^ *([^ ](.*[^ ])*) *$");
	/** One or more consecutive spaces. */
	private static final Pattern SPACE_RUN = Pattern.compile(" +");
	/** Filler words that are stripped from a cleaned name. */
	private static final Pattern EXTRA_WORDS = Pattern.compile(" (e mail|mailto)");
	/** Punctuation that is blanked out before a correspondent is parsed. */
	private static final Pattern PERSON_PUNCTUATION = Pattern.compile("[\\<\\>\\\"\\'\\,]");
	/**
	 * Optional display name (group 1), login (group 2) and domain (group 3).
	 * The display name is a single optional group rather than a repeated
	 * "(.* )*": the match and the captured groups are the same, but nested
	 * quantifiers backtrack exponentially on long inputs that contain many
	 * spaces and no address.
	 */
	private static final Pattern NAME_LOGIN_DOMAIN =
		Pattern.compile("^(.* )?([\\w\\.\\-]+)\\+*\\@([\\w\\.\\-]+) *$");
	/** Separator used to split a text line into words. */
	private static final Pattern NON_WORD_RUN = Pattern.compile("\\W+");
	/** Separator between correspondents on one line. */
	private static final Pattern COMMA = Pattern.compile("\\,");
	/** A line made only of whitespace. */
	private static final Pattern BLANK_LINE = Pattern.compile("^\\s*$");
	/** The Content-Type line of an HTML part; processing of a file stops there. */
	private static final Pattern HTML_CONTENT_TYPE = Pattern.compile("^Content-Type\\: .*\\/html");
	/** Any "Name: " header, optionally preceded by '>' quote markers; name in group 4. */
	private static final Pattern HEADER_PREFIX =
		Pattern.compile("^((\\> )*|(\\>+ ))([\\w\\-]+)\\: ");
	/** A from/to/cc/bcc header, optionally preceded by '>' quote markers; name in group 4. */
	private static final Pattern CORRESPONDENT_PREFIX =
		Pattern.compile("^((\\> )*|(\\>+ ))(from|to|cc|bcc)\\: ", Pattern.CASE_INSENSITIVE);
	/** A line that starts with whitespace, i.e. a continuation of the previous header. */
	private static final Pattern LEADING_WHITESPACE = Pattern.compile("^\\s");
	/** A double quote character. */
	private static final Pattern DOUBLE_QUOTE = Pattern.compile("\\\"");
	/** Leading run of '>' and spaces (group 1) followed by any other character. */
	private static final Pattern QUOTE_MARKERS = Pattern.compile("^([\\> ]+)[^\\> ]");

	/** People accumulated over all processed files. */
	public People people;
	/** Lower-case words that are not added to a message's context model. */
	public HashSet stopWords;

	/**
	 * Processes every file below <code>dir</code> and then passes
	 * <code>new File("disambig_log.txt")</code> to {@link People#writeToFile}.
	 *
	 * @param dir       root directory of the messages to process
	 * @param stopWords lower-case words to skip when collecting context words
	 */
	public EmailPeopleExtractor(File dir, HashSet stopWords) {
		people = new People();
		this.stopWords = stopWords;
		processDir(dir);
		people.writeToFile(new File("disambig_log.txt"));
	}

	/** @return the people accumulated so far */
	public People getPeople() {
		return this.people;
	}

	/**
	 * Normalizes a display name: embedded email addresses and every non-word
	 * character become spaces, surrounding spaces are dropped, runs of spaces
	 * collapse to one, and the filler words " e mail" / " mailto" are removed.
	 *
	 * @param name raw display name; <code>null</code> is returned unchanged
	 * @return the cleaned name, or <code>null</code> when splitting on spaces
	 *         yields no words (for example when only spaces are left)
	 */
	public String cleanName(String name) {
		if (name == null) {
			return null;
		}
		// Remove email address out of name
		name = EMAIL_IN_NAME.matcher(name).replaceAll(" ");
		// remove all non-literals out of name
		name = NON_WORD_CHAR.matcher(name).replaceAll(" ");
		// remove spaces from the beginning and end
		Matcher trimmed = SURROUNDING_SPACES.matcher(name);
		if (trimmed.matches()) {
			name = trimmed.group(1);
		}
		// decrease number of spaces between words to one space
		String[] words = SPACE_RUN.split(name);
		if (words.length == 0) {
			return null;
		}
		StringBuffer joined = new StringBuffer(words[0]);
		for (int i = 1; i < words.length; i++) {
			joined.append(' ').append(words[i]);
		}
		// remove extra words
		return EXTRA_WORDS.matcher(joined.toString()).replaceAll("");
	}

	/**
	 * Parses one correspondent of the form "[name ]login@domain" and, when it
	 * matches, adds a {@link PersonInMessage} to <code>peopleInMessage</code>.
	 * Text that does not match is only reported at FINE log level.
	 *
	 * @param person          raw correspondent text; <code>null</code> is ignored
	 * @param peopleInMessage collector for the current message
	 * @param weight          weight given to the person (see {@link #getWeight})
	 */
	public void getLoginAndName(String person,
								PeopleInMessage peopleInMessage,
								double weight) {
		if (person == null) {
			return;
		}
		// A fixed locale keeps ASCII letters ASCII; the default locale may not
		// (e.g. Turkish maps 'I' to a dotless i, which \w does not match).
		person = person.toLowerCase(Locale.ENGLISH);
		person = PERSON_PUNCTUATION.matcher(person).replaceAll(" ");
		Matcher mat = NAME_LOGIN_DOMAIN.matcher(person);
		if (!mat.matches()) {
			logger.fine("No match in person " + person + ".");
			return;
		}
		String name = mat.group(1);
		String login = mat.group(2);
		String domain = mat.group(3);
		if (name != null) {
			name = cleanName(name);
		}
		peopleInMessage.addPerson(new PersonInMessage(name, login, domain, weight));
	}

	/**
	 * Maps a lower-case header name to the weight of the people listed in it.
	 *
	 * @param prefix header name; <code>null</code> is treated as unknown
	 * @return 2 for "from", 1 for "to", 0.5 for "cc" and "bcc", 0 otherwise
	 */
	public double getWeight(String prefix) {
		if ("from".equals(prefix)) {
			return 2;
		}
		if ("to".equals(prefix)) {
			return 1;
		}
		if ("cc".equals(prefix) || "bcc".equals(prefix)) {
			return 0.5;
		}
		return 0;
	}

	/**
	 * Adds the lower-cased words of a text line to the context model of
	 * <code>peopleInMessage</code>, skipping empty tokens and stop words.
	 *
	 * @param line            text line of the message
	 * @param peopleInMessage collector for the current message
	 */
	public void getWordsFromLine(String line, PeopleInMessage peopleInMessage) {
		// remove MIME attachments: a long line without a single space is
		// taken to be encoded data rather than text
		if (line.indexOf(' ') < 0 && line.length() > 40) {
			return;
		}
		String[] words = NON_WORD_RUN.split(line);
		for (int i = 0; i < words.length; i++) {
			String word = words[i].toLowerCase(Locale.ENGLISH);
			if (word.length() == 0) {
				continue;
			}
			if (stopWords != null && stopWords.contains(word)) {
				continue;
			}
			peopleInMessage.addWordToContextModel(word);
		}
	}

	/**
	 * Processes one line of a message. Correspondent lines (and their
	 * continuation lines) are split on commas and each part is handed to
	 * {@link #getLoginAndName}; other header lines except Subject are dropped;
	 * everything else goes to {@link #getWordsFromLine}. Whenever the nested
	 * message level tracked by <code>lineProcessor</code> grows,
	 * <code>peopleInMessage.reduceWeights()</code> is called first.
	 *
	 * @param line            the line to process
	 * @param peopleInMessage collector for the current message
	 * @param lineProcessor   per-message parsing state
	 */
	public void processLine(String line,
							PeopleInMessage peopleInMessage,
							LineProcessor lineProcessor) {
		double oldNestedMessageLevel = lineProcessor.getNestedMessageLevel();
		// Both calls update lineProcessor's state; their order and the
		// short-circuit evaluation are significant.
		if (lineProcessor.nextCorrespondentLine(line) ||
			lineProcessor.isCorrespondentLine(line)) {
			if (lineProcessor.getNestedMessageLevel() > oldNestedMessageLevel) {
				peopleInMessage.reduceWeights();
			}
			double weight = getWeight(lineProcessor.getPrefix());
			line = lineProcessor.getLineWithoutPrefix(line);
			line = lineProcessor.removeCommasInBrackets(line);
			String[] peopleInLine = COMMA.split(line);
			for (int i = 0; i < peopleInLine.length; i++) {
				getLoginAndName(peopleInLine[i], peopleInMessage, weight);
			}
			return;
		}
		// Remove header lines (except for Subject)
		if (lineProcessor.nextHeaderLine(line) ||
			lineProcessor.isHeaderLine(line)) {
			return;
		}
		if (lineProcessor.updateNestedMessageLevel(line) > oldNestedMessageLevel) {
			peopleInMessage.reduceWeights();
		}
		getWordsFromLine(line, peopleInMessage);
	}

	/**
	 * Reads one message file line by line until the end of the file or the
	 * first HTML Content-Type line, then adds the people found in it to
	 * {@link #people}. An empty file contributes an empty message. I/O errors
	 * are reported on <code>System.err</code> and the file is skipped.
	 *
	 * @param file the message file to read
	 */
	public void processFile(File file) {
		BufferedReader in = null;
		try {
			PeopleInMessage peopleInMessage = new PeopleInMessage();
			LineProcessor lineProcessor = new LineProcessor();
			in = new BufferedReader(new FileReader(file));
			// readLine() returns null at end of stream, including for an empty file
			String line = in.readLine();
			while (line != null && !lineProcessor.isLastLine(line)) {
				processLine(line, peopleInMessage, lineProcessor);
				line = in.readLine();
			}
			People simplePeople = peopleInMessage.getSimplePeople();
			simplePeople.buildEmailLinks();
			people.addAll(simplePeople);
		} catch (IOException e) {
			System.err.print("Cannot open file ");
			System.err.println(file.getName());
			logger.log(Level.FINE, "I/O error while processing " + file, e);
		} finally {
			// Release the file handle even when reading or processing failed.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					logger.log(Level.FINE, "Cannot close " + file, e);
				}
			}
		}
	}

	/**
	 * Recursively processes every regular file below <code>dir</code>.
	 * A path that cannot be listed is logged and skipped.
	 *
	 * @param dir directory to walk
	 */
	public void processDir(File dir) {
		// listFiles() returns null when dir is not a directory or cannot be read
		File[] files = dir.listFiles();
		if (files == null) {
			logger.warning("Cannot list directory " + dir + ".");
			return;
		}
		for (int i = 0; i < files.length; i++) {
			if (files[i].isFile()) {
				processFile(files[i]);
			} else if (files[i].isDirectory()) {
				processDir(files[i]);
			}
		}
	}

	//Inner classes

	/**
	 * Parsing state for one message: whether the previous line was a header
	 * or correspondent line (so that indented continuation lines can be
	 * attributed to it), the last header name seen, and the nested message
	 * level. The level starts at 0.5, is doubled by every "from" line and is
	 * raised to 2^(number of '>' markers) by quoted text lines.
	 */
	public class LineProcessor {
		/** True while the current header (other than Subject) may continue. */
		public boolean headerLine;
		/** True while the current from/to/cc/bcc header may continue. */
		public boolean correspondentLine;
		/** Current nested message level; never decreases. */
		public double nestedMessageLevel;
		/** Lower-case name of the last header matched. */
		public String prefix;

		public LineProcessor() {
			headerLine = false;
			correspondentLine = false;
			nestedMessageLevel = 0.5;
			prefix = "";
		}

		/** @return true when the line contains nothing but whitespace */
		public boolean isEmptyLine(String line) {
			return BLANK_LINE.matcher(line).matches();
		}

		/** @return true when the line starts an HTML part ("Content-Type: .../html") */
		public boolean isLastLine(String line) {
			return HTML_CONTENT_TYPE.matcher(line).lookingAt();
		}

		/**
		 * Tests for a "Name: " header line and records its lower-case name as
		 * the current prefix. Subject lines update the prefix but are reported
		 * as non-header so that their text is still processed.
		 *
		 * @return true for any header line other than Subject
		 */
		public boolean isHeaderLine(String line) {
			Matcher mat = HEADER_PREFIX.matcher(line);
			if (!mat.lookingAt()) {
				return false;
			}
			prefix = mat.group(4).toLowerCase(Locale.ENGLISH);
			if ("subject".equals(prefix)) {
				return false;
			}
			headerLine = true;
			return true;
		}

		/**
		 * Tests for a from/to/cc/bcc line (case-insensitive) and records its
		 * lower-case name as the current prefix. A "from" line doubles the
		 * nested message level.
		 *
		 * @return true when the line is a correspondent line
		 */
		public boolean isCorrespondentLine(String line) {
			Matcher mat = CORRESPONDENT_PREFIX.matcher(line);
			if (!mat.lookingAt()) {
				return false;
			}
			correspondentLine = true;
			prefix = mat.group(4).toLowerCase(Locale.ENGLISH);
			if ("from".equals(prefix)) {
				nestedMessageLevel = 2 * nestedMessageLevel;
			}
			return true;
		}

		/**
		 * @return true when the line starts with whitespace and the previous
		 *         line belonged to a header; any other line ends the header
		 */
		public boolean nextHeaderLine(String line) {
			if (LEADING_WHITESPACE.matcher(line).lookingAt()) {
				return headerLine;
			}
			headerLine = false;
			return false;
		}

		/**
		 * @return true when the line starts with whitespace and the previous
		 *         line belonged to a correspondent header; any other line ends it
		 */
		public boolean nextCorrespondentLine(String line) {
			if (LEADING_WHITESPACE.matcher(line).lookingAt()) {
				return correspondentLine;
			}
			correspondentLine = false;
			return false;
		}

		public boolean wasHeaderLine() {
			return headerLine;
		}

		public boolean wasCorrespondentLine() {
			return correspondentLine;
		}

		/**
		 * @return the line with a leading from/to/cc/bcc prefix (and its quote
		 *         markers) removed, or the line unchanged when it has none
		 */
		public String getLineWithoutPrefix(String line) {
			Matcher mat = CORRESPONDENT_PREFIX.matcher(line);
			if (mat.lookingAt()) {
				// The pattern is anchored at the start, so only this one
				// occurrence can ever be removed.
				line = line.substring(mat.end());
			}
			return line;
		}

		public String getPrefix() {
			return prefix;
		}

		public double getNestedMessageLevel() {
			return nestedMessageLevel;
		}

		/**
		 * Replaces commas inside double-quoted sections by spaces so that a
		 * quoted "Last, First" name is not split into two correspondents.
		 * When the line contains quotes, the quote characters themselves are
		 * dropped and every piece is re-joined with a leading space; text
		 * after an unterminated quote is treated as quoted.
		 *
		 * @return the rewritten line, or the line unchanged when it has no quotes
		 */
		public String removeCommasInBrackets(String line) {
			if (line.indexOf('"') < 0) {
				return line;
			}
			String[] chunks = DOUBLE_QUOTE.split(line);
			StringBuffer rebuilt = new StringBuffer();
			for (int i = 0; i < chunks.length; i++) {
				String chunk = chunks[i];
				// odd-numbered chunks lie between an opening and a closing quote
				if (i % 2 == 1) {
					chunk = chunk.replace(',', ' ');
				}
				rebuilt.append(' ').append(chunk);
			}
			return rebuilt.toString();
		}

		/**
		 * Raises the nested message level to 2^k, where k is the number of
		 * '>' characters in the line's leading run of '>' and spaces, when
		 * that is higher than the current level.
		 *
		 * @return the (possibly updated) nested message level
		 */
		public double updateNestedMessageLevel(String line) {
			Matcher mat = QUOTE_MARKERS.matcher(line);
			if (mat.lookingAt()) {
				String markers = mat.group(1);
				// double, not int: an int would overflow at 31 quote markers
				double counter = 1;
				for (int i = 0; i < markers.length(); i++) {
					if (markers.charAt(i) == '>') {
						counter = 2 * counter;
					}
				}
				if (nestedMessageLevel < counter) {
					nestedMessageLevel = counter;
				}
			}
			return nestedMessageLevel;
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -