⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 datacruncher.java

📁 CRF1.2
💻 JAVA
字号:
package iitb.Segment;

import java.io.*;
import java.util.*;
import iitb.CRF.*;

/**
 * A single training sequence made of labelled segments, each segment being
 * an array of tokens.  Besides the per-segment view it keeps a flattened
 * per-token view (label, owning segment, offset inside the segment) so the
 * record can also be walked token by token.
 *
 * @author Sunita Sarawagi
 *
 */
class DCTrainRecord implements TrainRecord {
    /** Label of each segment. */
    int[] ls;
    /** Tokens of each segment; {@code _tokens[i]} belongs to {@code ls[i]}. */
    String[][] _tokens;
    /** Label of each token in the flattened sequence. */
    int[] labelsPerToken;
    /** For each flattened token: index of its segment / offset inside that segment. */
    int[] snum, spos;

    /**
     * @param ts   label of every segment
     * @param toks tokens of every segment; must have at least {@code ts.length} rows
     */
    DCTrainRecord(int[] ts, String[][] toks) {
        ls = ts;
        _tokens = toks;
        int len = 0;
        for (int i = 0; i < ls.length; i++) {
            len += _tokens[i].length;
        }
        labelsPerToken = new int[len];
        snum = new int[len];
        spos = new int[len];
        int pos = 0;
        for (int i = 0; i < ls.length; i++) {
            for (int p = 0; p < _tokens[i].length; p++) {
                spos[pos] = p;
                snum[pos] = i;
                labelsPerToken[pos++] = ls[i];
            }
        }
    }

    /** Returns the per-segment label array (the internal array, not a copy). */
    public int[] labels() {
        return ls;
    }

    /** Overwrites the label of token {@code i}. */
    public void set_y(int i, int l) {
        labelsPerToken[i] = l;
    } // not applicable for training data.

    /** Returns the number of tokens in the flattened sequence. */
    public int length() {
        return labelsPerToken.length;
    }

    /** Returns token {@code i} of the flattened sequence (a {@code String}). */
    public Object x(int i) {
        return _tokens[snum[i]][spos[i]];
    }

    /** Returns the label of token {@code i} of the flattened sequence. */
    public int y(int i) {
        return labelsPerToken[i];
    }

    /** Returns the number of segments. */
    public int numSegments() {
        return ls.length;
    }

    /** Returns the number of segments carrying label {@code l}. */
    public int numSegments(int l) {
        int sz = 0;
        for (int i = 0; i < ls.length; i++) {
            if (ls[i] == l) {
                sz++;
            }
        }
        return sz;
    }

    /** Returns the tokens of segment number {@code snum}. */
    public String[] tokens(int snum) {
        return _tokens[snum];
    }

    /**
     * Returns the tokens of the {@code p}-th (0-based) segment labelled
     * {@code l}, or {@code null} when there is no such segment.
     */
    public String[] tokens(int l, int p) {
        int pos = 0;
        for (int i = 0; i < ls.length; i++) {
            if (ls[i] == l) {
                if (pos == p) {
                    return _tokens[i];
                }
                pos++;
            }
        }
        return null;
    }

    /* (non-Javadoc)
     * @see iitb.CRF.SegmentDataSequence#getSegmentEnd(int)
     */
    public int getSegmentEnd(int segmentStart) {
        // The end is found purely from per-token labels, so two adjacent
        // segments with the same label are reported as one run.
        for (int i = segmentStart + 1; i < length(); i++) {
            if (y(i) != y(segmentStart)) {
                return i - 1;
            }
        }
        return length() - 1;
    }

    /* (non-Javadoc)
     * @see iitb.CRF.SegmentDataSequence#setSegment(int, int, int)
     */
    public void setSegment(int segmentStart, int segmentEnd, int y) {
        for (int i = segmentStart; i <= segmentEnd; i++) {
            set_y(i, y);
        }
    }
}

/**
 * In-memory collection of training records with a simple forward cursor.
 * Call {@link #startScan()} before iterating.
 */
class DCTrainData implements TrainData {
    /** The records; every element is a {@code TrainRecord}. */
    Vector trainRecs;
    /** Index of the record the next call to {@link #nextRecord()} returns. */
    int pos;

    DCTrainData(Vector trs) {
        trainRecs = trs;
    }

    /** Returns the number of records. */
    public int size() {
        return trainRecs.size();
    }

    /** Rewinds the cursor to the first record. */
    public void startScan() {
        pos = 0;
    }

    /**
     * Returns the record under the cursor and advances the cursor.  No
     * bounds check is made here; guard calls with {@link #hasMoreRecords()}.
     */
    public TrainRecord nextRecord() {
        return (TrainRecord) trainRecs.elementAt(pos++);
    }

    /** Returns {@code true} while the cursor has not passed the last record. */
    public boolean hasMoreRecords() {
        return (pos < size());
    }

    /** Same as {@link #hasMoreRecords()}. */
    public boolean hasNext() {
        return hasMoreRecords();
    }

    /** Same as {@link #nextRecord()}. */
    public DataSequence next() {
        return nextRecord();
    }
}

/**
 * Streams the records of {@code <file>.raw}, one line per record, as arrays
 * of lower-cased tokens.  Any I/O failure is reported on standard output and
 * terminates the JVM with status -1.
 */
class TestData {
    BufferedReader rin;
    String line;
    /** Scratch buffer reused between calls to {@link #nextRecord()}. */
    String seq[];
    String fname;
    String delimit, impDelimit;

    /**
     * @param file        path of the raw file without the ".raw" extension
     * @param delimitP    characters that separate tokens
     * @param impDelimitP delimiter characters that are kept as tokens
     * @param grpDelimit  not used
     */
    TestData(String file, String delimitP, String impDelimitP, String grpDelimit) {
        try {
            fname = file;
            rin = new BufferedReader(new FileReader(file + ".raw"));
            delimit = delimitP;
            impDelimit = impDelimitP;
        } catch (IOException e) {
            System.out.println("I/O Error" + e);
            System.exit(-1);
        }
    }

    /** Reopens the raw file so that reading starts again from its first line. */
    void startScan() {
        // Release the reader opened by the constructor or a previous scan;
        // without this every rescan leaked a file handle.
        if (rin != null) {
            try {
                rin.close();
            } catch (IOException ignored) {
                // The old reader is being discarded; a failed close must
                // not prevent the rescan.
            }
        }
        try {
            rin = new BufferedReader(new FileReader(fname + ".raw"));
        } catch (IOException e) {
            System.out.println("I/O Error" + e);
            System.exit(-1);
        }
    }

    /** Token grouping is disabled; always returns {@code null}. */
    int[] groupedTokens() {
        /*
        if (grp == null)
            return null;
        return grp.groupingArray(seq.length);
        */
        return null;
    }

    /**
     * Reads the next line and splits it into tokens.  The line is
     * lower-cased; a piece is kept when it is not a delimiter or when it is
     * listed in {@code impDelimit}.
     *
     * @return the tokens of the next line, or {@code null} at end of file
     *         (the reader is closed at that point)
     */
    String[] nextRecord() {
        try {
            if ((line = rin.readLine()) != null) {
                StringTokenizer tok = new StringTokenizer(line.toLowerCase(), delimit, true);
                int len = tok.countTokens();
                if ((seq == null) || (seq.length < len)) {
                    seq = new String[len];
                }
                int count = 0;
                for (int i = 0; i < len; i++) {
                    String tokStr = tok.nextToken();
                    if (delimit.indexOf(tokStr) == -1 || impDelimit.indexOf(tokStr) != -1) {
                        seq[count++] = new String(tokStr);
                    }
                }
                String aseq[] = new String[count];
                System.arraycopy(seq, 0, aseq, 0, count);
                return aseq;
            } else {
                rin.close();
                return null;
            }
        } catch (IOException e) {
            System.out.println("I/O Error" + e);
            System.exit(-1);
        }
        return null;
    }
}

/**
 * Writes labelled records to {@code <outfile>.tagged}.  The original text of
 * each record is re-read from {@code <rawfile>.raw} so that the output keeps
 * the raw casing and delimiters.  Each run of equally labelled tokens is
 * written on its own line followed by the tag delimiter and the label; a
 * blank line ends every record.  I/O failures are reported on standard error
 * and terminate the JVM with status -1.
 */
class TestDataWrite {
    PrintWriter out;
    BufferedReader rin;
    /** Text of the segment currently being assembled. */
    String outputBuffer;
    String rawLine;
    String delimit, tagDelimit, impDelimit;
    LabelMap labelmap;

    /**
     * @param outfile     output path without the ".tagged" extension
     * @param rawfile     raw input path without the ".raw" extension
     * @param delimitP    characters that separate tokens
     * @param tagDelimitP text written between a segment and its label
     * @param impDelimitP delimiter characters that count as tokens
     * @param linfo       label map whose {@code revMap} is applied to every label written
     */
    TestDataWrite(String outfile, String rawfile, String delimitP, String tagDelimitP,
                  String impDelimitP, LabelMap linfo) {
        try {
            labelmap = linfo;
            out = new PrintWriter(new FileOutputStream(outfile + ".tagged"));
            rin = new BufferedReader(new FileReader(rawfile + ".raw"));
            outputBuffer = "";
            delimit = delimitP;
            tagDelimit = tagDelimitP;
            impDelimit = impDelimitP;
        } catch (IOException e) {
            System.err.println("I/O Error" + e);
            System.exit(-1);
        }
    }

    /**
     * Appends to {@link #outputBuffer} every consecutive piece starting at
     * {@code ptr} that is a delimiter but not an important one.
     *
     * @return index of the first piece that was not copied
     */
    private int copyIgnorableDelimiters(String[] tokArr, int ptr) {
        while (ptr < tokArr.length
               && delimit.indexOf(tokArr[ptr]) != -1
               && impDelimit.indexOf(tokArr[ptr]) == -1) {
            outputBuffer = outputBuffer + tokArr[ptr];
            ptr++;
        }
        return ptr;
    }

    /**
     * Consumes the next raw line and writes it split into labelled segments.
     *
     * @param tok    label of each token of the record
     * @param tokLen number of leading entries of {@code tok} to use
     */
    void writeRecord(int[] tok, int tokLen) {
        try {
            rawLine = rin.readLine();
            if (rawLine == null) {
                // Previously surfaced as a NullPointerException inside StringTokenizer.
                throw new EOFException("raw file has fewer lines than records written");
            }
            StringTokenizer rawTok = new StringTokenizer(rawLine, delimit, true);
            String tokArr[] = new String[rawTok.countTokens()];
            for (int j = 0; j < tokArr.length; j++) {
                tokArr[j] = rawTok.nextToken();
            }
            if (tokLen <= 0) {
                // Nothing to label: the raw line is consumed to stay in step
                // with the input and only the record terminator is written.
                out.println();
                return;
            }
            int ptr = 0;
            int t = tok[0];
            // The extra iteration at j == tokLen flushes the last segment.
            for (int j = 0; j <= tokLen; j++) {
                if ((j < tokLen) && (t == tok[j])) {
                    // Same label as the open segment: copy the token together
                    // with the ignorable delimiters around it.
                    ptr = copyIgnorableDelimiters(tokArr, ptr);
                    if (ptr < tokArr.length) {
                        outputBuffer = outputBuffer + tokArr[ptr];
                        ptr++;
                    }
                    ptr = copyIgnorableDelimiters(tokArr, ptr);
                } else {
                    // Label changed (or end of record).  If the segment ends
                    // in an opening bracket, possibly surrounded by spaces,
                    // give that tail back so it starts the next segment.
                    int revScanPtr = outputBuffer.length() - 1;
                    int goBackPtr = 0;
                    boolean foundOpenChar = false;
                    // revScanPtr >= 0 keeps the scan inside the buffer when it
                    // is empty or made up only of spaces and opening brackets.
                    while (revScanPtr >= 0
                           && (outputBuffer.charAt(revScanPtr) == ' '
                               || outputBuffer.charAt(revScanPtr) == '('
                               || outputBuffer.charAt(revScanPtr) == '{'
                               || outputBuffer.charAt(revScanPtr) == '[')) {
                        char currChar = outputBuffer.charAt(revScanPtr);
                        if (impDelimit.indexOf(currChar) != -1) {
                            break;
                        }
                        if (currChar == '{' || currChar == '[' || currChar == '(') {
                            foundOpenChar = true;
                        }
                        revScanPtr--;
                        goBackPtr++;
                    }
                    if (foundOpenChar) {
                        outputBuffer = outputBuffer.substring(0, revScanPtr + 1);
                        // NOTE(review): rewinds one piece per character, which
                        // assumes each rewound character was its own
                        // delimiter piece - confirm against the delimiter set.
                        ptr -= goBackPtr;
                    }
                    outputBuffer = outputBuffer + tagDelimit + labelmap.revMap(t);
                    out.println(outputBuffer);
                    outputBuffer = "";
                    if (j < tokLen) {
                        // Start the next segment and revisit token j under its own label.
                        t = tok[j];
                        j--;
                    }
                }
            }
            out.println();
        } catch (IOException e) {
            System.err.println("I/O Error" + e);
            System.exit(-1);
        }
    }

    /** Closes the output file and the raw input file. */
    void close() {
        try {
            // Close the writer first so buffered output reaches the file
            // even when closing the reader fails and the JVM exits.
            out.close();
            rin.close();
        } catch (IOException e) {
            System.err.println("I/O Error" + e);
            System.exit(-1);
        }
    }
}

/**
 * Static helpers that read and write the ".raw" and ".tagged" files.
 */
public class DataCruncher {

    /** Returns whether a tokenizer piece is kept: a non-delimiter, or an important delimiter. */
    private static boolean isToken(String piece, String delimit, String impDelimit) {
        return delimit.indexOf(piece) == -1 || impDelimit.indexOf(piece) != -1;
    }

    /** Closes {@code r} if it is non-null, ignoring any failure. */
    private static void closeQuietly(Reader r) {
        if (r != null) {
            try {
                r.close();
            } catch (IOException ignored) {
                // Only used for readers that have been fully consumed or
                // abandoned; a failed close cannot lose data.
            }
        }
    }

    /**
     * Splits {@code text} into lower-cased tokens.
     *
     * @param text       text to split
     * @param delimit    characters that separate tokens
     * @param impDelimit delimiter characters that are kept as tokens
     * @return the tokens in order of appearance; empty when there are none
     */
    static String[] getTokenList(String text, String delimit, String impDelimit) {
        String lowered = text.toLowerCase();
        // First pass counts the tokens so that the array can be sized exactly.
        StringTokenizer textTok = new StringTokenizer(lowered, delimit, true);
        int tlen = 0;
        while (textTok.hasMoreTokens()) {
            if (isToken(textTok.nextToken(), delimit, impDelimit)) {
                tlen++;
            }
        }
        String[] cArray = new String[tlen];
        tlen = 0;
        textTok = new StringTokenizer(lowered, delimit, true);
        while (textTok.hasMoreTokens()) {
            String tokStr = textTok.nextToken();
            if (isToken(tokStr, delimit, impDelimit)) {
                cArray[tlen++] = tokStr;
            }
        }
        return cArray;
    }

    /**
     * Reads one record in the variable-column format: one segment per line,
     * written as text, tag delimiter, integer label.  The record ends at end
     * of file or at the first line that has fewer than two
     * {@code tagDelimit}-separated parts.
     *
     * @param numLabels  not used
     * @param tin        tagged input, positioned at the start of a record
     * @param tagDelimit characters separating the text from its label
     * @param delimit    characters that separate tokens inside the text
     * @param impDelimit delimiter characters that are kept as tokens
     * @param t          receives the label of each segment; no bounds check is made
     * @param cArray     receives the tokens of each segment; no bounds check is made
     * @return the number of segments read
     * @throws IOException if reading fails
     */
    static int readRowVarCol(int numLabels, BufferedReader tin, String tagDelimit, String delimit,
                             String impDelimit, int[] t, String[][] cArray) throws IOException {
        int ptr = 0;
        while (true) {
            String line = tin.readLine();
            StringTokenizer firstSplit = null;
            if (line != null) {
                firstSplit = new StringTokenizer(line.toLowerCase(), tagDelimit);
            }
            if ((line == null) || (firstSplit.countTokens() < 2)) {
                // Empty Line
                return ptr;
            }
            String w = firstSplit.nextToken();
            int label = Integer.parseInt(firstSplit.nextToken());
            // A warning about repeated tags within a record used to be
            // issued here; it is disabled.
            t[ptr] = label;
            cArray[ptr++] = getTokenList(w, delimit, impDelimit);
        }
    }

    /**
     * Reads one record in the fixed-column format: a single line whose
     * columns are separated by {@code tagDelimit}, the i-th column carrying
     * label {@code labels[i]}.  Empty columns, and columns whose label is
     * outside {@code 1..numLabels}, are skipped.
     *
     * @param numLabels  largest label that is accepted
     * @param tin        tagged input, positioned at the start of a record line
     * @param tagDelimit characters separating the columns
     * @param delimit    characters that separate tokens inside a column
     * @param impDelimit delimiter characters that are kept as tokens
     * @param t          receives the label of each segment; no bounds check is made
     * @param cArray     receives the tokens of each segment; no bounds check is made
     * @param labels     label of every column, as returned by {@link #readHeaderInfo}
     * @param rawTok     not used
     * @return the number of segments read; 0 at end of file
     * @throws IOException if reading fails
     */
    static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit, String delimit,
                               String impDelimit, int[] t, String[][] cArray, int labels[],
                               StringTokenizer rawTok) throws IOException {
        String line = tin.readLine();
        if (line == null) {
            return 0;
        }
        // Delimiters are returned as pieces so that empty columns can be detected.
        StringTokenizer firstSplit = new StringTokenizer(line.toLowerCase(), tagDelimit, true);
        int ptr = 0;
        for (int i = 0; (i < labels.length) && firstSplit.hasMoreTokens(); i++) {
            int label = labels[i];
            String w = firstSplit.nextToken();
            if (tagDelimit.indexOf(w) != -1) {
                // A delimiter where a value was expected: column i is empty.
                continue;
            } else {
                if (firstSplit.hasMoreTokens()) {
                    // read past the delimiter.
                    firstSplit.nextToken();
                }
            }
            if ((label > 0) && (label <= numLabels)) {
                t[ptr] = label;
                cArray[ptr++] = getTokenList(w, delimit, impDelimit);
            }
        }
        return ptr;
    }

    /**
     * Detects the fixed-column format.  When the first line starts with
     * "fixed-column-format" (in any letter case) the following line is
     * parsed as the integer labels of the columns.  Otherwise the reader is
     * reset to where it was on entry.
     *
     * @param numLabels  size of the returned array; entries for which the
     *                   label line has no value stay 0
     * @param tin        tagged input positioned at its first line; must support mark/reset
     * @param tagDelimit characters separating the labels on the label line
     * @return the label of each column, or {@code null} when the file is
     *         not in fixed-column format
     * @throws IOException if the file is empty, if the label line is
     *         missing, or if reading fails
     */
    static int[] readHeaderInfo(int numLabels, BufferedReader tin, String tagDelimit) throws IOException {
        tin.mark(1000);
        String line = tin.readLine();
        if (line == null) {
            throw new IOException("Header row not present in tagged file");
        }
        // System.out.println(line);
        if (!line.toLowerCase().startsWith("fixed-column-format")) {
            tin.reset();
            return null;
        }
        line = tin.readLine();
        if (line == null) {
            // Previously surfaced as a NullPointerException inside StringTokenizer.
            throw new IOException("Column label row not present in tagged file");
        }
        StringTokenizer firstSplit = new StringTokenizer(line, tagDelimit);
        int labels[] = new int[numLabels];
        for (int i = 0; (i < numLabels) && firstSplit.hasMoreTokens();) {
            labels[i++] = Integer.parseInt(firstSplit.nextToken());
        }
        return labels;
    }

    /**
     * Loads training data from {@code <tfile>.tagged}, reading one tagged
     * record for every line of {@code <rfile>.raw}.  The raw line is used
     * only to size the working buffers.  Reading stops at the end of the raw
     * file or at the first tagged record without segments.  Every label is
     * passed through {@code labelMap.map} before it is stored.  I/O failures
     * are reported on standard error and terminate the JVM with status -1.
     *
     * @param numLabels  number of labels, forwarded to the row readers
     * @param tfile      tagged file path without the ".tagged" extension
     * @param rfile      raw file path without the ".raw" extension
     * @param delimit    characters that separate tokens
     * @param tagDelimit characters separating text from label (or columns)
     * @param impDelimit delimiter characters that are kept as tokens
     * @param labelMap   mapping applied to every label read from the file
     * @return the records read
     */
    public static TrainData readTagged(int numLabels, String tfile, String rfile, String delimit,
                                       String tagDelimit, String impDelimit, LabelMap labelMap) {
        BufferedReader tin = null;
        BufferedReader rin = null;
        try {
            Vector td = new Vector();
            tin = new BufferedReader(new FileReader(tfile + ".tagged"));
            rin = new BufferedReader(new FileReader(rfile + ".raw"));
            int t[] = new int[0];
            String cArray[][] = new String[0][0];
            // read list of columns in the header of the tag file
            int[] labels = readHeaderInfo(numLabels, tin, tagDelimit);
            boolean fixedColFormat = (labels != null);
            String rawLine;
            while ((rawLine = rin.readLine()) != null) {
                StringTokenizer rawTok = new StringTokenizer(rawLine, delimit, true);
                int len = rawTok.countTokens();
                if (len > t.length) {
                    // Grow the scratch buffers to the largest raw line seen so far.
                    t = new int[len];
                    cArray = new String[len][0];
                }
                int ptr;
                if (fixedColFormat) {
                    ptr = readRowFixedCol(numLabels, tin, tagDelimit, delimit, impDelimit,
                                          t, cArray, labels, rawTok);
                } else {
                    ptr = readRowVarCol(numLabels, tin, tagDelimit, delimit, impDelimit, t, cArray);
                }
                if (ptr == 0) {
                    break;
                }
                int at[] = new int[ptr];
                String[][] c = new String[ptr][0];
                for (int i = 0; i < ptr; i++) {
                    at[i] = labelMap.map(t[i]);
                    c[i] = cArray[i];
                }
                td.add(new DCTrainRecord(at, c));
            }
            return new DCTrainData(td);
        } catch (IOException e) {
            System.err.println("I/O Error" + e);
            System.exit(-1);
        } finally {
            // Both files have been read into memory (or abandoned) by now;
            // they were never closed before.
            closeQuietly(tin);
            closeQuietly(rin);
        }
        return null;
    }

    /**
     * Reads {@code <file>.raw} and appends the tokens of every line to
     * {@code data} as a {@code String[]}.  I/O failures are reported on
     * standard output and terminate the JVM with status -1.
     *
     * @param data       receives one {@code String[]} per line
     * @param file       raw file path without the ".raw" extension
     * @param delimit    characters that separate tokens
     * @param impDelimit delimiter characters that are kept as tokens
     */
    public static void readRaw(Vector data, String file, String delimit, String impDelimit) {
        try {
            BufferedReader rin = new BufferedReader(new FileReader(file + ".raw"));
            String line;
            while ((line = rin.readLine()) != null) {
                data.add(getTokenList(line, delimit, impDelimit));
            }
            rin.close();
        } catch (IOException e) {
            System.out.println("I/O Error" + e);
            System.exit(-1);
        }
    }

    /**
     * Rebuilds {@code <file>.raw} from {@code <file>.tagged}.  The text part
     * of every tagged line is appended, preceded by a space, to the current
     * raw line; a tagged line with fewer than two
     * {@code tagDelimit}-separated parts ends the raw line.  Whatever has
     * been collected at end of file is written as a final line.  I/O
     * failures are reported on standard output and terminate the JVM with
     * status -1.
     *
     * @param file       path shared by both files, without extension
     * @param tagDelimit characters separating text from label
     */
    public static void createRaw(String file, String tagDelimit) {
        try {
            BufferedReader in = new BufferedReader(new FileReader(file + ".tagged"));
            PrintWriter out = new PrintWriter(new FileOutputStream(file + ".raw"));
            String line;
            String rawLine = "";
            while ((line = in.readLine()) != null) {
                StringTokenizer t = new StringTokenizer(line, tagDelimit);
                if (t.countTokens() < 2) {
                    out.println(rawLine);
                    rawLine = "";
                } else {
                    rawLine = rawLine + " " + t.nextToken();
                }
            }
            out.println(rawLine);
            in.close();
            out.close();
        } catch (IOException e) {
            System.out.println("I/O Error" + e);
            System.exit(-1);
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -