⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 contactrecordextractor.java

📁 常用机器学习算法,java编写源代码,内含常用分类算法,包括说明文档
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
	/** Returns the name of the labeled-output file for <code>f</code>: the same path with ".tagged" appended. */
	static private String htmlFileToLabelFile (String f) {
		return f + ".tagged";
	}

	/**
	 * Writes the tokens of the instance's source to the file <code>fname</code>,
	 * each followed by its predicted label in the form "token (label) ".
	 * A token whose text is "ENDLINE" terminates the current output line
	 * instead of being printed.
	 *
	 * I/O failures are reported on System.err and not propagated.
	 *
	 * @param ls    predicted labels, indexed in parallel with the tokens
	 * @param inst  instance whose source is the TokenSequence to print
	 * @param fname path of the file to (over)write
	 */
	private void printLabeledFile (Sequence ls, Instance inst, String fname) {
		PrintWriter pw = null;
		try {
			pw = new PrintWriter (new FileOutputStream (new File (fname)));
			TokenSequence input = (TokenSequence) inst.getSource();
			for (int i=0; i < input.size(); i++) {
				String term = input.getToken(i).getText();
				if (term.equals ("ENDLINE"))
					pw.println ("");
				else
					pw.print (term + " (" + ls.get (i) + ") ");
			}
			pw.flush();
		}
		catch (IOException e) {
			System.err.println("Exception writing file " + fname + ": " + e);
		}
		finally {
			// Closing the writer also closes the underlying file stream, and
			// doing it here releases the file even if the loop above throws.
			if (pw != null)
				pw.close();
		}
	}

	/**
	 * Collects person names found in the labeled token sequence. Recognized
	 * label patterns are:
	 * first middle last, first last, "last , first middle" and "last , first".
	 * Each name is passed through {@link #addName}, so the returned list
	 * holds CountedString objects tagged with <code>pagei</code>.
	 *
	 * @param ls    predicted labels, indexed in parallel with the tokens
	 * @param inst  instance whose source is the TokenSequence
	 * @param pagei page index stored with every extracted name
	 * @return list of extracted names (possibly empty)
	 */
	private ArrayList getNames (Sequence ls, Instance inst, int pagei) {
		ArrayList names = new ArrayList ();
		TokenSequence input = (TokenSequence) inst.getSource();
		for (int i=0; i < input.size(); i++) {
			String l = ls.get(i).toString();
			if (l.equalsIgnoreCase ("B-FirstName")) {
				if (i+2 < input.size() && ls.get(i+1).toString().equals ("B-MiddleName") &&
						ls.get(i+2).toString().equals ("B-LastName")) { // John W. Smith
					String name = input.getToken(i).getText() + " " + input.getToken(i+1).getText() + " " +
												input.getToken(i+2).getText();
					addName (name, names, pagei);
				}
				else if (i+1 < input.size() &&
								 ls.get(i+1).toString().equals ("B-LastName")) { // John Smith
					String name = input.getToken(i).getText() + " " + input.getToken(i+1).getText();
					addName (name, names, pagei);
				}
			}
			else if (l.equalsIgnoreCase ("B-LastName")) {
				// Compare token TEXT and label STRINGS here, exactly as the
				// first-name branch does; comparing the token or label objects
				// themselves against a String never matches.
				if (i+2 < input.size() && input.getToken(i+1).getText().equals (",") &&
						ls.get(i+2).toString().equals ("B-FirstName")) {
					String last = input.getToken(i).getText();
					String first = input.getToken(i+2).getText();
					if (i+3 < input.size() && ls.get(i+3).toString().equals ("B-MiddleName")) { // Smith, John W.
						String name = first + " " + input.getToken(i+3).getText() + " " + last;
						addName (name, names, pagei);
					}
					else { // Smith, John
						String name = first + " " + last;
						addName (name, names, pagei);
					}
				}
			}
		}
		return names;
	}

	/**
	 * Adds the cleaned form of <code>n</code> to <code>names</code> as a
	 * CountedString tagged with <code>pagei</code>, unless the raw name
	 * contains a digit.
	 *
	 * @return true if the name was added, false if it was rejected
	 */
	private boolean addName (String n, ArrayList names, int pagei) {
		if (n.matches (".*[0-9].*"))
			return false;
		names.add (new CountedString (cleanName (n), pagei));
		return true;
	}

	/**
	 * Normalizes a name: punctuation becomes spaces, surrounding whitespace
	 * is trimmed, runs of whitespace collapse to one space, and the result
	 * is lower-cased.
	 */
	private String cleanName (String n) {
		n = n.replaceAll ("\\p{Punct}", " ");
		n = n.trim();
		n = n.replaceAll ("\\s{2,}", " ");
		return n.toLowerCase();
	}

	/**
	 * Print the viterbi predictions for this TokenSequence: one entry per
	 * token on System.err with the predicted label, the target label, the
	 * token text and, on the following line, the token object itself.
	 */
	private void printViterbi (Sequence ls, TokenSequence source, Sequence target) {
		for (int i=0; i < source.size(); i++)
			System.err.println ("("+ ls.get (i) + ") (" + target.get (i) + ") " + source.getToken(i).getText() + "\n" + source.get (i));
	}

	/**
	 * Returns the token indices (as Integer objects) that belong to a
	 * contact record.
	 *
	 * A contact record is a series of tagged fields with no more than
	 * "bgThresh" background labels and at least "recordThresh" different
	 * fields. "ENDLINE" tokens are not counted as background.
	 *
	 * NOTE(review): the method returns as soon as the first complete record
	 * is closed inside the loop, so at most one record's indices are ever
	 * returned. This is kept as-is; confirm it is intended.
	 *
	 * NOTE(review): numbg is reset only when a record starts or is flushed,
	 * so it counts all background tokens since the record began rather than
	 * strictly consecutive ones. Kept as-is; confirm against intent.
	 *
	 * @param ls   predicted labels, indexed in parallel with the tokens
	 * @param inst instance whose source is the TokenSequence
	 * @return set of Integer token indices covered by the record (possibly empty)
	 * @throws IllegalArgumentException if a label is neither the background
	 *         label nor prefixed with "B-" or "I-"
	 */
	private HashSet getContactRecordIndices (Sequence ls, Instance inst) {
		HashSet h = new HashSet ();
		TokenSequence input = (TokenSequence) inst.getSource();
		int numbg = 0;
		int startRecord = -1;
		int bgThresh = 15;
		int recordThresh = 5;
		HashSet seenTags = new HashSet ();

		for (int i=0; i < input.size(); i++) {
			String label = ls.get(i).toString();
			if (label.equals (bg)) {	// O
				if (!(input.getToken(i).getText().equals ("ENDLINE")))
					numbg++;
				if (numbg > bgThresh) {
					if (seenTags.size() < recordThresh) { // flush this incomplete record
						numbg = 0;
						startRecord = -1;
						seenTags = new HashSet ();
						//System.err.println ("Flushing incomplete record.");
					}
					else { // include current record
						// NOTE(review): the original condition here also tested for a
						// "B-PostalCode" tag, but that test could never decide the
						// outcome because records with too few tags are flushed above.
						if (startRecord == -1)
							throw new IllegalArgumentException ("Adding record without setting startRecord.");
						//System.err.println ("Adding record: " + getStringFromIndices (startRecord, i-1, input));
						return addIntegers (startRecord, i-1, h);
					}
				}
			}
			else if (label.startsWith ("B-")) { // B
				if (startRecord == -1) { // beginning of record
					//System.err.println ("Adding first label " + label);
					numbg = 0;
					startRecord = i;
					seenTags = new HashSet ();
				}
				//System.err.println ("Adding label " + label);
				seenTags.add (label);
			}
			else if (!label.startsWith ("I-")) // I
				throw new IllegalArgumentException ("Invalid label: " + label);
		}
		// Add the last record if available. Uses ">=" so that the trailing
		// record is held to the same "at least recordThresh" rule as a record
		// closed inside the loop.
		if (seenTags.size() >= recordThresh || seenTags.contains ("B-PostalCode")) {
			if (startRecord == -1)
				throw new IllegalArgumentException ("Adding record without setting startRecord.");
			h = addIntegers (startRecord, input.size()-1, h);
			//System.err.println ("Adding last record: " + getStringFromIndices (startRecord, input.size()-1, input));
		}
		return h;
	}

	/** Adds every integer in [from, to] (inclusive) to <code>h</code> as an Integer and returns <code>h</code>. */
	private HashSet addIntegers (int from, int to, HashSet h) {
		//System.err.println ("Adding integers from " + from + " to " + to);
		for (int i=from; i <= to; i++)
			h.add (new Integer (i));
		return h;
	}

	/**
	 * Joins the text of tokens <code>from</code>..<code>to</code> (inclusive),
	 * each followed by a single space. Returns "" when from &gt; to.
	 */
	private String getStringFromIndices (int from, int to, TokenSequence ts) {
		StringBuffer ret = new StringBuffer ();
		for (int i=from; i <= to; i++) {
			ret.append (ts.getToken(i).getText()).append (" ");
		}
		return ret.toString();
	}

	/** Joins the text of the tokens covered by <code>seg</code> (start to end, inclusive), each followed by a space. */
	private String getStringFromSegment (Segment seg, TokenSequence ts) {
		return getStringFromIndices (seg.getStart(), seg.getEnd(), ts);
	}

	/**
	 * Copies the text of every non-background segment that starts inside the
	 * contact record into <code>cr</code>. The field name is the segment's
	 * start tag with its prefix up to the first "-" removed. When confidence
	 * prediction is enabled, the estimated confidence of each copied segment
	 * is logged.
	 *
	 * @param cr             record to fill; also the return value
	 * @param segIter        iterator over the segments to consider
	 * @param inst           instance whose source is the TokenSequence
	 * @param contactIndices Integer token indices belonging to the record
	 * @return <code>cr</code>
	 */
	private ContactRecord augmentContactRecord (ContactRecord cr, SegmentIterator segIter,
																							Instance inst, HashSet contactIndices) {
		while (segIter.hasNext()) {
			Segment seg = segIter.nextSegment();
			if (seg.getStartTag().equals (this.bg))
				continue;
			// Only segments that begin inside the contact record are used.
			if (!contactIndices.contains (new Integer (seg.getStart())))
				continue;
			if (confidencePrediction)
				logger.info ("Confidence: " + confidenceEstimator.estimateConfidenceFor (seg));
			TokenSequence ts = (TokenSequence) inst.getSource ();
			String fieldValue = getStringFromSegment (seg, ts);
			cr.setFieldValue (getFieldFromLabel ((String)seg.getStartTag()), fieldValue);
		}
		return cr;
	}

	/**
	 * Strips the prefix up to and including the first "-" from a label,
	 * e.g. "B-City1" gives "City1". A label without "-" is returned unchanged.
	 */
	private static String getFieldFromLabel (String l) {
		return l.substring (l.indexOf("-")+1);
	}

	/** Adds the field names accepted by the extractor to <code>h</code>. */
	private static void fillAllowedTags (HashSet h) {
		String[] tags = {
			"FirstName", "MiddleName", "Nickname", "nickname", "Suffix", "LastName",
			"Title", "JobTitle", "CompanyName", "Department",
			"AddressLine", "City1", "City2", "State", "Country", "PostalCode",
			"HomePhoneNumber", "FaxNumber", "CompanyPhoneNumber", "DirectPhoneNumber",
			"MobilePhoneNumber", "PagerNumber",
			"WebPageURL", "Email", "InstantMessagingAddress", "VoiceMail"
		};
		for (int i=0; i < tags.length; i++)
			h.add (tags[i]);
	}

	/**
	 * Returns a new array in which every element of <code>a</code> (each must
	 * be a String) is prefixed with <code>prefix</code>.
	 */
	private static String[] prepend (String prefix, Object[] a) {
		String[] ret = new String[a.length];
		for (int i=0; i < a.length; i++)
			ret[i] = prefix + (String)a[i];
		return ret;
	}

	// Command-line options.

	static CommandOption.File crfFileOption = new CommandOption.File
	(ContactRecordExtractor.class, "crf-file", "FILE", true, new File ("/usr/col/tmp1/culotta/mallet/exp/address/crf.obj"), "CRF object file to perform contact record extraction", null);

	static CommandOption.Boolean confidencePredictionOption = new CommandOption.Boolean
	(ContactRecordExtractor.class, "confidence-prediction", "true|false", false, false, "predict confidence of each extracted field?", null);

	static CommandOption.Boolean evalOption = new CommandOption.Boolean
	(ContactRecordExtractor.class, "eval", "true|false", false, false, "evaluate performance on labeled input file?", null);

	static CommandOption.File inputOption = new CommandOption.File
	(ContactRecordExtractor.class, "input", "FILE", true, null, "file to extract from", null);

	static CommandOption.File outputOption = new CommandOption.File
	(ContactRecordExtractor.class, "output", "FILE", true, null, "file to print extractions", null);

	static final CommandOption.List commandOptions =
		new CommandOption.List (
			"Extract contact information from text/html.",
			new CommandOption[] {
				crfFileOption,
				confidencePredictionOption,
				inputOption,
				outputOption,
				evalOption,
			});

	/**
	 * Command-line entry point: processes the options, builds an extractor
	 * from the CRF file option, then runs eval on the input/output files when
	 * the "eval" option is set and test otherwise.
	 */
	public static void main (String[] args) {
		commandOptions.process (args);
		ContactRecordExtractor cre = new ContactRecordExtractor (crfFileOption.value, null, null, null, confidencePredictionOption.value);
		if (evalOption.value)
			cre.eval (inputOption.value, outputOption.value);
		else
			cre.test (inputOption.value, outputOption.value);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -