📄 contactrecordextractor.java

📁 这是一个matlab的java实现。里面有许多内容。请大家慢慢琢磨。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. *//**    @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */package edu.umass.cs.mallet.projects.dex.ie;import edu.umass.cs.mallet.projects.dex.types.*;import edu.umass.cs.mallet.projects.dex.web.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.fst.confidence.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.util.*;import java.util.logging.*;import java.util.Iterator;import java.util.HashSet;import java.util.ArrayList;import java.util.Vector;import java.util.regex.Pattern;import java.util.regex.Matcher;import java.io.*;/**	 Extracts the contact information from the homepages of a list of	 {@link Person} objects. */public class ContactRecordExtractor {	private static Logger logger = Logger.getLogger(ContactRecordExtractor.class.getName());	/** CRF to extract contact info */	private CRF4 crf;	/** processing pipe for html document*/	private SerialPipes inputPipe;	/** Tags for the beginning and within states of each field*/	private Object[] startTags;	private Object[] inTags;	/** String for background state */	private String bg = "O";		/** estimates the confidence of predicted fields */	private ConstrainedForwardBackwardConfidenceEstimator confidenceEstimator = null;	/** file name of CRF object file */	File crfFile;	/** perform confidence prediction? 
*/	boolean confidencePrediction;	/** filename to print VCF output*/	File vcfFile;	File htmlFile;	/** how often to dump VCF data*/	int peopleBetweenVCFPrint = 20;	/** stop list for keywords */	HashSet stopList;	public static String newline = System.getProperty ("line.separator");		 	public ContactRecordExtractor (File _crfFile, File _vcfFile, File _htmlFile, HashSet _stopList) {		this (_crfFile, _vcfFile, _htmlFile,  _stopList, false);	} 	public ContactRecordExtractor (File _crfFile, File _vcfFile, File _htmlFile,																 HashSet _stopList, boolean _confidencePrediction) {		this.crfFile = _crfFile;		this.vcfFile = _vcfFile;		this.htmlFile = _htmlFile;		this.stopList = _stopList;		this.confidencePrediction = _confidencePrediction;				readCRF ();		HashSet allowed = new HashSet ();		fillAllowedTags (allowed);		startTags = prepend ("B-", allowed.toArray());		inTags = prepend ("I-", allowed.toArray());		if (confidencePrediction)			confidenceEstimator = new ConstrainedForwardBackwardConfidenceEstimator (crf);						}		/** Reads CRF object file */ 	private void readCRF () {		logger.info ("Reading in CRF in object file " + crfFile + " for ContactRecordExtractor...");		try {			ObjectInputStream ois = new ObjectInputStream(new FileInputStream(crfFile));						crf = (CRF4) ois.readObject();			ois.close();			logger.info ("Read CRF successfully!");		}		catch (IOException e) {			System.err.println("Exception reading file: " + e);			System.exit(-1);		}		catch (ClassNotFoundException cnfe) {			System.err.println("Cound not find class reading in object: " + cnfe);			System.exit(-2);		}		this.inputPipe = (SerialPipes)crf.getInputPipe();		crf.getInputAlphabet().stopGrowth();	}	public People extractContactRecordsFor (People people) {		return extractContactRecordsFor (people, 0);	}	public void test (File fin, File fout) {		Instance inst = new Instance (fileToStringBuffer (fin).toString(),																	null, null, null, this.inputPipe);		FeatureVectorSequence fvs = 
(FeatureVectorSequence) inst.getData (inputPipe);		Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); 		printLabeledFile (ls, inst, fout.getAbsolutePath()); 	}	public void eval (File fin, File fout) {		Instance inst = new Instance (fileToStringBuffer (fin).toString(),																	null, null, null, this.inputPipe);		InstanceList ilist = new InstanceList (inst.getPipe ());		ilist.add (inst);				MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator (startTags, inTags);		eval.test (crf, ilist, "testing", null);		FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData (inputPipe);		Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); 		printLabeledFile (ls, inst, fout.getAbsolutePath()); 	}	private StringBuffer fileToStringBuffer (File f) {		StringBuffer sb = new StringBuffer();		try {			BufferedReader rd = new BufferedReader (new FileReader (f));			String line = "";			boolean inTag = false;			while ((line = rd.readLine()) != null) {				line = line.replaceAll ("<br>", newline);				line = line.replaceAll ("&nbsp;", " ");				// fsm to parse html				for (int ci=0; ci < line.length(); ci++) {					if (line.charAt (ci) == '<') 						inTag = true;												else if (line.charAt (ci) == '>') 						inTag = false;					else if (!inTag) {						sb.append (line.charAt (ci));					}				}				sb.append ("\n");																	}			rd.close();		}		catch (IOException e) {			System.err.println("Exception reaiding file: " + e);		}		return sb;	}		/** Extracts ContactRecords and associated People from web documents	 * stored in each Person. Also adds the words of each page to each	 * Person, to be used later for associating keywords with this	 * person.	 
* @param people list of Person objects	 * @param startingIndex first index into People to extract info for	 * @return People augmented with extracted information	 */	public People extractContactRecordsFor (People people, int startingIndex) {				CharSequenceLexer lexer = new CharSequenceLexer (CharSequenceLexer.LEX_WORDS);		Iterator piter = people.iterator();		int i=0;		while (piter.hasNext()) {						// dump VCF data for intermediary results			if (i % peopleBetweenVCFPrint == 0 && vcfFile != null && htmlFile != null) {				logger.info ("printing intermediate VCF data to " + vcfFile +					" and HTML data to " + htmlFile);				InformationGain ig = new InformationGain (people, "key_"+vcfFile );				people.writeHTML (htmlFile);				people.writeVCF (vcfFile);			}			i++;			Person p = (Person)piter.next();			// check and set flags for whether (1) pages have been found for			// this person, and (2) info has been extracted from these pages			if (p.processedForContactInformation) {				continue;			}			//if (p.processedForWebPages)			//	p.processedForContactInformation = true;						Iterator iter = p.pageIterator();			ContactRecord cr = new ContactRecord();			int pagei = 0;			while (iter.hasNext()) {				WebPage webPage = (WebPage) iter.next();				String fname = webPage.fileName;				logger.info ("Extracting contact records for " + fname);				StringBuffer sb = fileToStringBuffer (new File (fname));				/** create Instance from web page run through feature extraction pipes */				Instance inst = new Instance (sb.toString(), null, null, null, this.inputPipe); 				FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData (inputPipe);				Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); // predicted extractions			 	// add extracted info to ContactRecord				ArrayList names = getNames (ls, inst, pagei);				cr.addNames (names);			 	HashSet contactRecordIndices = getContactRecordIndices (ls, inst);			 	SegmentIterator segIter = new SegmentIterator ((Sequence)fvs, ls, ls, startTags, 
inTags);				augmentContactRecord (cr, segIter, inst, contactRecordIndices);				printLabeledFile (ls, inst, htmlFileToLabelFile (fname));				addWords (p, ls, inst, lexer, true);				pagei++; 			}			if (cr.numberFields() > 0) {								p.setContactRecord (cr);		 		System.err.println (cr);			}			else {				System.err.println (">>No contact info found for person: " + p.getFirstName());				if (p.getContactRecord().size() != 0)					throw new IllegalArgumentException ("ContactRecord should be size 0, instead it's\n" + p.getContactRecord());			} 	 	}		return people;	}		private void addWords (Person p, Sequence ls, Instance inst, CharSequenceLexer lexer, boolean bigram) {		TokenSequence input = (TokenSequence)inst.getSource();		String prevWord = null;		for (int i=0; i < input.size(); i++) {			if (!(ls.get(i).equals (bg))) { // exclude contact tokens				prevWord = null;				continue;			}			lexer.setCharSequence (input.getToken(i).getText());			while (lexer.hasNext()) {				String s = (String)lexer.next();				//if (s.equals("ENDLINE") || s.matches(".*[0-9].*")) {				if (s.matches(".*[0-9].*") || stopList.contains (s.toLowerCase())) {					prevWord = null;					continue;				}				else if (s.equals ("ENDLINE")) {					continue;				}				s = s.toLowerCase();				if (bigram && prevWord != null)					p.addKeyWord (prevWord + " " + s);				//else p.addKeyWord (s);				//				p.addWord (s, 1.0);				prevWord = s;			}		}	}
12 下一页
💿 文件大小 5351 K
👤 上传用户 wait2010
📂 所属分类 matlab例程
🏷️ 相关标签

#matlab #java #家
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -