📄 contactrecordextractor.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. *//** @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */package edu.umass.cs.mallet.projects.dex.ie;import edu.umass.cs.mallet.projects.dex.types.*;import edu.umass.cs.mallet.projects.dex.web.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.fst.confidence.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.util.*;import java.util.logging.*;import java.util.Iterator;import java.util.HashSet;import java.util.ArrayList;import java.util.Vector;import java.util.regex.Pattern;import java.util.regex.Matcher;import java.io.*;/** Extracts the contact information from the homepages of a list of {@link Person} objects. */public class ContactRecordExtractor { private static Logger logger = Logger.getLogger(ContactRecordExtractor.class.getName()); /** CRF to extract contact info */ private CRF4 crf; /** processing pipe for html document*/ private SerialPipes inputPipe; /** Tags for the beginning and within states of each field*/ private Object[] startTags; private Object[] inTags; /** String for background state */ private String bg = "O"; /** estimates the confidence of predicted fields */ private ConstrainedForwardBackwardConfidenceEstimator confidenceEstimator = null; /** file name of CRF object file */ File crfFile; /** perform confidence prediction? */ boolean confidencePrediction; /** filename to print VCF output*/ File vcfFile; File htmlFile; /** how often to dump VCF data*/ int peopleBetweenVCFPrint = 20; /** stop list for keywords */ HashSet stopList; public static String newline = System.getProperty ("line.separator"); public ContactRecordExtractor (File _crfFile, File _vcfFile, File _htmlFile, HashSet _stopList) { this (_crfFile, _vcfFile, _htmlFile, _stopList, false); } public ContactRecordExtractor (File _crfFile, File _vcfFile, File _htmlFile, HashSet _stopList, boolean _confidencePrediction) { this.crfFile = _crfFile; this.vcfFile = _vcfFile; this.htmlFile = _htmlFile; this.stopList = _stopList; this.confidencePrediction = _confidencePrediction; readCRF (); HashSet allowed = new HashSet (); fillAllowedTags (allowed); startTags = prepend ("B-", allowed.toArray()); inTags = prepend ("I-", allowed.toArray()); if (confidencePrediction) confidenceEstimator = new ConstrainedForwardBackwardConfidenceEstimator (crf); } /** Reads CRF object file */ private void readCRF () { logger.info ("Reading in CRF in object file " + crfFile + " for ContactRecordExtractor..."); try { ObjectInputStream ois = new ObjectInputStream(new FileInputStream(crfFile)); crf = (CRF4) ois.readObject(); ois.close(); logger.info ("Read CRF successfully!"); } catch (IOException e) { System.err.println("Exception reading file: " + e); System.exit(-1); } catch (ClassNotFoundException cnfe) { System.err.println("Cound not find class reading in object: " + cnfe); System.exit(-2); } this.inputPipe = (SerialPipes)crf.getInputPipe(); crf.getInputAlphabet().stopGrowth(); } public People extractContactRecordsFor (People people) { return extractContactRecordsFor (people, 0); } public void test (File fin, File fout) { Instance inst = new Instance (fileToStringBuffer (fin).toString(), null, null, null, this.inputPipe); FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData (inputPipe); Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); printLabeledFile (ls, inst, fout.getAbsolutePath()); } public void eval (File fin, File fout) { Instance inst = new Instance (fileToStringBuffer (fin).toString(), null, null, null, this.inputPipe); InstanceList ilist = new InstanceList (inst.getPipe ()); ilist.add (inst); MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator (startTags, inTags); eval.test (crf, ilist, "testing", null); FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData (inputPipe); Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); printLabeledFile (ls, inst, fout.getAbsolutePath()); } private StringBuffer fileToStringBuffer (File f) { StringBuffer sb = new StringBuffer(); try { BufferedReader rd = new BufferedReader (new FileReader (f)); String line = ""; boolean inTag = false; while ((line = rd.readLine()) != null) { line = line.replaceAll ("<br>", newline); line = line.replaceAll (" ", " "); // fsm to parse html for (int ci=0; ci < line.length(); ci++) { if (line.charAt (ci) == '<') inTag = true; else if (line.charAt (ci) == '>') inTag = false; else if (!inTag) { sb.append (line.charAt (ci)); } } sb.append ("\n"); } rd.close(); } catch (IOException e) { System.err.println("Exception reaiding file: " + e); } return sb; } /** Extracts ContactRecords and associated People from web documents * stored in each Person. Also adds the words of each page to each * Person, to be used later for associating keywords with this * person. * @param people list of Person objects * @param startingIndex first index into People to extract info for * @return People augmented with extracted information */ public People extractContactRecordsFor (People people, int startingIndex) { CharSequenceLexer lexer = new CharSequenceLexer (CharSequenceLexer.LEX_WORDS); Iterator piter = people.iterator(); int i=0; while (piter.hasNext()) { // dump VCF data for intermediary results if (i % peopleBetweenVCFPrint == 0 && vcfFile != null && htmlFile != null) { logger.info ("printing intermediate VCF data to " + vcfFile + " and HTML data to " + htmlFile); InformationGain ig = new InformationGain (people, "key_"+vcfFile ); people.writeHTML (htmlFile); people.writeVCF (vcfFile); } i++; Person p = (Person)piter.next(); // check and set flags for whether (1) pages have been found for // this person, and (2) info has been extracted from these pages if (p.processedForContactInformation) { continue; } //if (p.processedForWebPages) // p.processedForContactInformation = true; Iterator iter = p.pageIterator(); ContactRecord cr = new ContactRecord(); int pagei = 0; while (iter.hasNext()) { WebPage webPage = (WebPage) iter.next(); String fname = webPage.fileName; logger.info ("Extracting contact records for " + fname); StringBuffer sb = fileToStringBuffer (new File (fname)); /** create Instance from web page run through feature extraction pipes */ Instance inst = new Instance (sb.toString(), null, null, null, this.inputPipe); FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData (inputPipe); Sequence ls = (Sequence)crf.viterbiPath (fvs).output(); // predicted extractions // add extracted info to ContactRecord ArrayList names = getNames (ls, inst, pagei); cr.addNames (names); HashSet contactRecordIndices = getContactRecordIndices (ls, inst); SegmentIterator segIter = new SegmentIterator ((Sequence)fvs, ls, ls, startTags, inTags); augmentContactRecord (cr, segIter, inst, contactRecordIndices); printLabeledFile (ls, inst, htmlFileToLabelFile (fname)); addWords (p, ls, inst, lexer, true); pagei++; } if (cr.numberFields() > 0) { p.setContactRecord (cr); System.err.println (cr); } else { System.err.println (">>No contact info found for person: " + p.getFirstName()); if (p.getContactRecord().size() != 0) throw new IllegalArgumentException ("ContactRecord should be size 0, instead it's\n" + p.getContactRecord()); } } return people; } private void addWords (Person p, Sequence ls, Instance inst, CharSequenceLexer lexer, boolean bigram) { TokenSequence input = (TokenSequence)inst.getSource(); String prevWord = null; for (int i=0; i < input.size(); i++) { if (!(ls.get(i).equals (bg))) { // exclude contact tokens prevWord = null; continue; } lexer.setCharSequence (input.getToken(i).getText()); while (lexer.hasNext()) { String s = (String)lexer.next(); //if (s.equals("ENDLINE") || s.matches(".*[0-9].*")) { if (s.matches(".*[0-9].*") || stopList.contains (s.toLowerCase())) { prevWord = null; continue; } else if (s.equals ("ENDLINE")) { continue; } s = s.toLowerCase(); if (bigram && prevWord != null) p.addKeyWord (prevWord + " " + s); //else p.addKeyWord (s); // p.addWord (s, 1.0); prevWord = s; } } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -