📄 contactrecordextractor.java
字号:
static private String htmlFileToLabelFile (String f) { return f + ".tagged"; } private void printLabeledFile (Sequence ls, Instance inst, String fname) { try { FileOutputStream ostream = new FileOutputStream (new File (fname)); PrintWriter pw = new PrintWriter (ostream); TokenSequence input = (TokenSequence) inst.getSource(); for (int i=0; i < input.size(); i++) { String term = input.getToken(i).getText(); if (term.equals ("ENDLINE")) pw.println (""); else pw.print (term + " (" + ls.get (i) + ") "); } pw.flush(); ostream.close(); } catch (IOException e) { System.err.println("Exception reading file: " + e); } } /** Get follwong names: first middle last, first last*/ private ArrayList getNames (Sequence ls, Instance inst, int pagei) { ArrayList names = new ArrayList (); TokenSequence input = (TokenSequence) inst.getSource(); for (int i=0; i < input.size(); i++) { String s = input.getToken(i).getText(); String l = ls.get(i).toString(); if (l.equalsIgnoreCase ("B-FirstName")) { if (i+2 < input.size() && ls.get(i+1).toString().equals ("B-MiddleName") && ls.get(i+2).toString().equals ("B-LastName")) { // John W. Smith String name = input.getToken(i).getText() + " " + input.getToken(i+1).getText() + " " + input.getToken(i+2).getText(); addName (name, names, pagei); } else if (i+1 < input.size() && ls.get(i+1).toString().equals ("B-LastName")) { // John Smith String name = input.getToken(i).getText() + " " + input.getToken(i+1).getText(); addName (name, names, pagei); } } else if (l.equalsIgnoreCase ("B-LastName")) { if (i+2 < input.size() && input.get (i+1).equals (",") && ls.get (i+2).equals ("B-FirstName")) { if (i+3 < input.size() && ls.get(i+3).equals ("B-MiddleName")) { // Smith, John W. String name = input.getToken(i+2).getText() + " " + input.getToken(i+3).getText() + " " + input.getToken(i); addName (name, names, pagei); } else { // Smith, John String name = input.getToken(i+2).getText() + " " + input.getToken(i); addName (name, names, pagei); } } } } return names; } private boolean addName (String n, ArrayList names, int pagei) { if (n.matches (".*[0-9].*")) return false; names.add (new CountedString (cleanName (n), pagei)); return true; } private String cleanName (String n) { n = n.replaceAll ("\\p{Punct}", " "); n = n.trim(); n = n.replaceAll ("\\s{2,}", " "); return n.toLowerCase(); } /** Print the viterbi predictions for this TokenSequence */ private void printViterbi (Sequence ls, TokenSequence source, Sequence target) { for (int i=0; i < source.size(); i++) System.err.println ("("+ ls.get (i) + ") (" + target.get (i) + ") " + source.getToken(i).getText() + "\n" + source.get (i)); } // define contact record to be a series of tagged fields with no // more than "bgThresh" consecutive background labels and at least "recordThresh" different fields. private HashSet getContactRecordIndices (Sequence ls, Instance inst) { HashSet h = new HashSet (); TokenSequence input = (TokenSequence) inst.getSource(); int numbg = 0; int startRecord = -1; int bgThresh = 15; int recordThresh = 5; HashSet seenTags = new HashSet (); for (int i=0; i < input.size(); i++) { String label = ls.get(i).toString(); if (label.equals (bg)) { // O if (!(input.getToken(i).getText().equals ("ENDLINE"))) numbg++; if (numbg > bgThresh && seenTags.size() < recordThresh) { // flush this incomplete record numbg = 0; startRecord = -1; seenTags = new HashSet (); //System.err.println ("Flushing incomplete record."); } // include current record else if (numbg > bgThresh && (seenTags.size() >= recordThresh || seenTags.contains ("B-PostalCode"))) { if (startRecord == -1) throw new IllegalArgumentException ("Adding record without setting startRecord."); h = addIntegers (startRecord, i-1, h); numbg = 0; //System.err.println ("Adding record: " + getStringFromIndices (startRecord, i-1, input)); startRecord = -1; seenTags = new HashSet (); return h; } } else if (label.startsWith ("B-")) { // B if (startRecord == -1) { // beginning of record //System.err.println ("Adding first label " + label); numbg = 0; startRecord = i; seenTags = new HashSet (); seenTags.add (label); } else { seenTags.add (label); //System.err.println ("Adding subsequent label " + label); } } else if (!label.startsWith ("I-")) // I throw new IllegalArgumentException ("Invalid label: " + label); } // print last record if available if (seenTags.size() > recordThresh || seenTags.contains ("B-PostalCode")) { if (startRecord == -1) throw new IllegalArgumentException ("Adding record without setting startRecord."); h = addIntegers (startRecord, input.size()-1, h); //System.err.println ("Adding last record: " + getStringFromIndices (startRecord, input.size()-1, input)); } return h; } private HashSet addIntegers (int from, int to, HashSet h) { //System.err.println ("Adding integers from " + from + " to " + to); for (int i=from; i <= to; i++) h.add (new Integer (i)); return h; } private String getStringFromIndices (int from, int to, TokenSequence ts) { String ret = ""; for (int i=from; i <= to; i++) { ret += ts.getToken(i).getText() + " "; } return ret; } private String getStringFromSegment (Segment seg, TokenSequence ts) { String ret = ""; for (int ii=seg.getStart(); ii <= seg.getEnd(); ii++) { ret += ts.getToken(ii).getText() + " "; } return ret; } private ContactRecord augmentContactRecord (ContactRecord cr, SegmentIterator segIter, Instance inst, HashSet contactIndices) { int si=0; while (segIter.hasNext()) { Segment seg = segIter.nextSegment(); if (!seg.getStartTag().equals (this.bg)) { String fieldValue = ""; if (!contactIndices.contains (new Integer (seg.getStart()))) continue; if (confidencePrediction) logger.info ("Confidence: " + confidenceEstimator.estimateConfidenceFor (seg)); TokenSequence ts = (TokenSequence) inst.getSource (); for (int ii=seg.getStart(); ii <= seg.getEnd(); ii++) { fieldValue += ts.getToken(ii).getText() + " "; } cr.setFieldValue (getFieldFromLabel ((String)seg.getStartTag()), fieldValue); } } return cr; } private static String getFieldFromLabel (String l) { return l.substring (l.indexOf("-")+1, l.length()); } private static void fillAllowedTags (HashSet h) { h.add ("FirstName"); h.add ("MiddleName"); h.add ("Nickname"); h.add ("nickname"); h.add ("Suffix"); h.add ("LastName"); h.add ("Title"); h.add ("JobTitle"); h.add ("CompanyName"); h.add ("Department"); h.add ("AddressLine"); h.add ("City1"); h.add ("City2"); h.add ("State"); h.add ("Country"); h.add ("PostalCode"); h.add ("HomePhoneNumber"); h.add ("FaxNumber"); h.add ("CompanyPhoneNumber"); h.add ("DirectPhoneNumber"); h.add ("MobilePhoneNumber"); h.add ("PagerNumber"); h.add ("WebPageURL"); h.add ("Email"); h.add ("InstantMessagingAddress"); h.add ("VoiceMail"); } private static String[] prepend (String prefix, Object[] a) { String[] ret = new String[a.length]; for (int i=0; i < a.length; i++) ret[i] = prefix + (String)a[i]; return ret; } static CommandOption.File crfFileOption = new CommandOption.File (ContactRecordExtractor.class, "crf-file", "FILE", true, new File ("/usr/col/tmp1/culotta/mallet/exp/address/crf.obj"), "CRF object file to perform contact record extraction", null); static CommandOption.Boolean confidencePredictionOption = new CommandOption.Boolean (ContactRecordExtractor.class, "confidence-prediction", "true|false", false, false, "predict confidence of each extracted field?", null); static CommandOption.Boolean evalOption = new CommandOption.Boolean (ContactRecordExtractor.class, "eval", "true|false", false, false, "evaluate performance on labeled input file?", null); static CommandOption.File inputOption = new CommandOption.File (ContactRecordExtractor.class, "input", "FILE", true, null, "file to extract from", null); static CommandOption.File outputOption = new CommandOption.File (ContactRecordExtractor.class, "output", "FILE", true, null, "file to print extractions", null); static final CommandOption.List commandOptions = new CommandOption.List ( "Extract contact information from text/html.", new CommandOption[] { crfFileOption, confidencePredictionOption, inputOption, outputOption, evalOption, }); public static void main (String[] args) { commandOptions.process (args); ContactRecordExtractor cre = new ContactRecordExtractor (crfFileOption.value, null, null, null, confidencePredictionOption.value); if (evalOption.value) cre.eval (inputOption.value, outputOption.value); else cre.test (inputOption.value, outputOption.value); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -