📄 computeupperbound1.java
字号:
} if(strs2 != null) for(int k=0; k<strs2.length; k++){ ArrayList namelist2 = LastName(strs2[k]); String tempStr = ""; for(int j=0; j<namelist2.size(); j++){ tempStr += (String)namelist2.get(j); if( j<namelist2.size()-1){ tempStr += " "; } } strs2[k] = tempStr; } }*/ String str1 = ""; if(strs1 != null) for(int j=0; j<strs1.length; j++){ str1 += strs1[j]; if( j<strs1.length - 1 ) str1 += " "; } String str2 = ""; if(strs2 != null) for(int j=0; j<strs2.length; j++){ str2 += strs2[j]; if( j<strs2.length - 1 ) str2 += " "; } distTemp = tagWeight[i] * computeStringDistance(str1, str2);// distTemp = Math.abs(distTemp); totalWeight += tagWeight[i]; dist += distTemp; if(distTemp != 0){ usedNumFields ++; }// if(startTags[i].equals("<author>")) System.out.println(startTags[i] + ": " + str1 + " : " + str2 + " : " + distTemp + " : " + usedNumFields + " : " + dist); }// dist /= usedNumFields;// dist /= totalWeight; System.out.println(dist + " : " + usedNumFields);// dist = Math.exp(dist); return dist; }

    /**
     * Extracts one surname per author from a free-text author list.
     *
     * <p>Processing steps, in order:
     * <ol>
     *   <li>single-character initials followed by a period (" J.", " J .", or a leading
     *       "J." / "J .") are removed;</li>
     *   <li>the whole word "and" is turned into a comma so it acts as a separator;</li>
     *   <li>a trailing period is dropped;</li>
     *   <li>the text is split on commas and, for every non-empty piece, the last
     *       space-separated token is taken as the surname.</li>
     * </ol>
     *
     * @param ss author text such as {@code "J. Smith, A. Jones and B. Lee"}; may be null
     * @return the surnames (Strings) in order of appearance; empty, never null, when
     *         {@code ss} is null or contains no names
     */
    protected ArrayList LastName(String ss)
    {
        ArrayList names = new ArrayList();
        if (ss == null) {
            return names;
        }

        // Remove initials in their four spellings: " J.", " J .", leading "J.", leading "J .".
        String cleaned = ss.replaceAll(" \\w\\.", "");
        cleaned = cleaned.replaceAll("\\s\\w\\s\\.", "");
        cleaned = cleaned.replaceAll("^\\w\\.", "");
        cleaned = cleaned.replaceAll("^\\w\\s\\.", "");
        // The word boundary keeps names that merely start with "and" (e.g. "andrews")
        // intact; without it they would be cut in two at the inserted comma.
        cleaned = cleaned.replaceAll(" and\\b", " ,");
        cleaned = cleaned.replaceAll("\\.$", "");

        String[] authors = cleaned.split(",");
        for (int i = 0; i < authors.length; i++) {
            String author = authors[i].replaceAll("^\\s+|\\s+$", "");
            String[] tokens = author.split(" ");
            // Whether the author has one, two or more tokens, the surname is the last one.
            String lastName = tokens[tokens.length - 1];
            if (!lastName.equals("")) {
                names.add(lastName);
            }
        }
        return names;
    }
//
// protected String locateFields(String startTag, String endTag, String string)
    /**
     * Collects the contents of every {@code startTag ... endTag} field found in
     * {@code string}, scanning left to right.
     *
     * <p>The character immediately before each end tag is not included in the returned
     * text (presumably the blank that separates the last token from the closing tag —
     * confirm against the SGML writer). An empty field yields the empty string.
     *
     * @param startTag opening tag to search for
     * @param endTag   closing tag to search for
     * @param string   text to scan; must not be null
     * @return the field contents in order of appearance, or {@code null} when no
     *         complete field is present
     */
    protected String[] locateFields(String startTag, String endTag, String string)
    {
        int indexStart = string.indexOf(startTag);
        int indexEnd = string.indexOf(endTag, indexStart);
        if (indexStart == -1 || indexEnd == -1) {
            return null;
        }

        ArrayList fields = new ArrayList();
        while (indexStart != -1 && indexEnd != -1) {
            int contentStart = indexStart + startTag.length();
            // Clamp the end so that an empty field ("<t></t>") produces "" instead of a
            // StringIndexOutOfBoundsException from substring(start, start - 1).
            int contentEnd = Math.max(contentStart, indexEnd - 1);
            fields.add(string.substring(contentStart, contentEnd));

            indexStart = string.indexOf(startTag, indexEnd);
            indexEnd = string.indexOf(endTag, indexStart);
        }
        return (String[]) fields.toArray(new String[fields.size()]);
    }

    /**
     * Scores two field strings case-insensitively.
     *
     * @param str1 first string; must not be null
     * @param str2 second string; must not be null
     * @return {@code default_Ignore_Dist} when both strings are empty,
     *         {@code default_Max_Dist} when exactly one is empty, otherwise
     *         {@code nw.score} of the lower-cased strings
     */
    protected double computeStringDistance(String str1, String str2)
    {
        str1 = str1.toLowerCase();
        str2 = str2.toLowerCase();

        boolean firstEmpty = str1.length() == 0;
        boolean secondEmpty = str2.length() == 0;
        if (firstEmpty && secondEmpty) {
            return default_Ignore_Dist;
        }
        if (firstEmpty || secondEmpty) {
            return default_Max_Dist;
        }
        return nw.score(str1, str2);
    }

    /**
     * Renders both labelings through {@code ieInterface.printResultInFormat} and
     * returns {@code computeSGMLObjDistance} of the two renderings. The renderings and
     * the score are also printed to standard output.
     *
     * @param sequence1 label sequence for {@code instance1}
     * @param sequence2 label sequence for {@code instance2}
     * @param instance1 instance whose source is cast to a {@code TokenSequence}
     * @param instance2 instance whose source is cast to a {@code TokenSequence}
     * @return the one-directional score of rendering 1 against rendering 2
     */
    public double PairSimilarity(Sequence sequence1, Sequence sequence2, Instance instance1, Instance instance2)
    {
        TokenSequence tokenSequence1 = (TokenSequence) instance1.getSource();
        TokenSequence tokenSequence2 = (TokenSequence) instance2.getSource();

        String str1 = ieInterface.printResultInFormat(true, sequence1, tokenSequence1);
        String str2 = ieInterface.printResultInFormat(true, sequence2, tokenSequence2);

        // Only the (str1, str2) direction is scored; no symmetric average is taken.
        double sim = computeSGMLObjDistance(str1, str2);

        System.out.println(str1 + "\n" + str2 + " : " + sim + "\n");
        return sim;
    }

    /**
     * Decodes every file under {@code inputDir} (or {@code inputDir} itself when it is
     * a plain file) with the per-file {@code viterbiCRF} overload and prints token-level
     * evaluation figures: overall accuracy, whole-instance accuracy, per-label
     * precision / recall / F1, and a confusion matrix.
     *
     * <p>Tokens consisting of a single punctuation character from {@code ,.;:?!()*}
     * are left out of all counts. The fields {@code instancelist}, {@code optimalViterbi}
     * and {@code tokenSequence} are overwritten as a side effect.
     *
     * @param inputDir  path of a directory or of a single file
     * @param sgml      forwarded unchanged to the per-file overload
     * @param seperator forwarded unchanged to the per-file overload
     * @param N         forwarded unchanged to the per-file overload
     */
    public void viterbiCRF(String inputDir, boolean sgml, String seperator, int N)
    {
        Alphabet targets = (this.pipe).getTargetAlphabet();
        assert(targets != null);

        System.out.println("target size: " + targets.size());
        System.out.print ("State labels:");
        for (int i = 0; i < targets.size(); i++)
            System.out.print (" " + targets.lookupObject(i));
        System.out.println ("");

        // Counters for the performance report.
        int numCorrectTokens = 0, totalTokens = 0;
        int numCorrectWholeInstance = 0;
        int totalInstanceNum = 0;
        int[] numTrueSegments = new int[targets.size()];
        int[] numPredictedSegments = new int[targets.size()];
        int[] numCorrectSegments = new int[targets.size()];
        int[][] matrixEntry = new int[targets.size()][targets.size()];

        String PUNT = "[,\\.;:?!()*]";
        Pattern puntPattern = Pattern.compile(PUNT);
        boolean ignorePunct = true;

        instancelist = new InstanceList (pipe);
        optimalViterbi = new ArrayList();

        System.out.println(inputDir);

        // A plain file is scored through the same loop as directory entries, so the
        // report below is populated for it as well (otherwise every ratio is 0/0).
        File file = new File(inputDir);
        ArrayList fileList;
        if (file.isFile()) {
            fileList = new ArrayList();
            fileList.add(file);
        }
        else {
            FileIterator fileIter = new FileIterator (inputDir);
            fileList = fileIter.getFileArray();
        }

        for (int i = 0; i < fileList.size(); i++) {
            file = (File) fileList.get(i);
            viterbiCRF(file, sgml, seperator, N);

            totalInstanceNum += instancelist.size();
            for (int k = 0; k < instancelist.size(); k++) {
                Instance instance = instancelist.getInstance(k);
                boolean wholeInstanceCorrect = true;

                Sequence trueSequence = (Sequence) instance.getTarget();
                tokenSequence = (TokenSequence) instance.getSource();
                Sequence predictedSequence = (Sequence) optimalViterbi.get(k);

                for (int j = 0; j < trueSequence.size(); j++) {
                    String tokenStr = tokenSequence.getToken(j).getText();
                    if (puntPattern.matcher(tokenStr).matches() && ignorePunct) {
                        continue; // punctuation tokens are not evaluated
                    }

                    totalTokens++;

                    int trueIndex = targets.lookupIndex(trueSequence.get(j));
                    numTrueSegments[trueIndex]++;

                    int predIndex = targets.lookupIndex(predictedSequence.get(j));
                    numPredictedSegments[predIndex]++;

                    matrixEntry[trueIndex][predIndex]++;

                    if (predIndex == trueIndex) {
                        numCorrectTokens++;
                        numCorrectSegments[trueIndex]++;
                    }
                    else {
                        wholeInstanceCorrect = false;
                    }
                }

                if (wholeInstanceCorrect) numCorrectWholeInstance++;
            }
        }

        // Print out the performance.
        double accuracy = (double) numCorrectTokens / totalTokens;
        System.out.println ("\n" +" accuracy=" + numCorrectTokens +"/"+ totalTokens + " = " +accuracy);

        double wholeInstanceAccuracy = (double) numCorrectWholeInstance / totalInstanceNum;
        System.out.println ("Whole instance accuracy = " + numCorrectWholeInstance + "/" + totalInstanceNum + " = " + wholeInstanceAccuracy);

        System.out.println("targets size = " + targets.size());
        System.out.print ("State labels:");
        for (int i = 0; i < targets.size(); i++)
            System.out.print (" " + targets.lookupObject(i));
        System.out.println ("");

        for (int t = 0; t < targets.size(); t++) {
            // A label that was never predicted (or never true) counts as precision (recall) 1.
            double precision = numPredictedSegments[t] == 0 ? 1 : ((double) numCorrectSegments[t]) / numPredictedSegments[t];
            double recall = numTrueSegments[t] == 0 ? 1 : ((double) numCorrectSegments[t]) / numTrueSegments[t];
            double f1 = recall + precision == 0.0 ? 0.0 : (2.0 * recall * precision) / (recall + precision);
            // Share of tokens on which "is label t" agrees between truth and prediction.
            double accuracy_individual = (double) (totalTokens - numPredictedSegments[t] - numTrueSegments[t] + 2 * numCorrectSegments[t]) / totalTokens;

            System.out.println (targets.lookupObject(t) + " precision="+precision+" recall="+recall+" f1="+f1 + " accuracy=" + accuracy_individual);
            System.out.println ("segments true="+numTrueSegments[t]+" pred="+numPredictedSegments[t]+" correct="+numCorrectSegments[t]+" misses="+(numTrueSegments[t]-numCorrectSegments[t])+" alarms="+(numPredictedSegments[t]-numCorrectSegments[t]) + "\n");
        }

        System.out.println("\n Confusion Matrix (row: true label, col: predicted label)");
        System.out.print("\t");
        for (int t = 0; t < targets.size(); t++) {
            System.out.print(targets.lookupObject(t) + "\t");
        }
        System.out.println();
        for (int t = 0; t < targets.size(); t++) {
            System.out.print(targets.lookupObject(t) + "\t");
            for (int tt = 0; tt < targets.size(); tt++) {
                System.out.print(matrixEntry[t][tt] + "\t");
            }
            System.out.println();
        }
    }

    /**
     * Convenience overload: same as {@code viterbiCRF(inputDir, true, N)}.
     *
     * @param inputDir path of a directory or of a single file
     * @param N        forwarded unchanged
     */
    public void viterbiCRF(String inputDir, int N)
    {
        viterbiCRF(inputDir, true, N);
    }

    /**
     * Convenience overload that supplies this object's {@code seperator} field.
     *
     * @param inputDir path of a directory or of a single file
     * @param sgml     forwarded unchanged
     * @param N        forwarded unchanged
     */
    public void viterbiCRF(String inputDir, boolean sgml, int N)
    {
        viterbiCRF(inputDir, sgml, seperator, N);
    }

    /**
     * Constructs a {@code ComputeUpperBound1} for a model file and calls
     * {@code loadCRF()} on it.
     *
     * @param args optional; {@code args[0]} is the model file path. When absent the
     *             path {@code /tmp/wellner/crfs/CRF_face} is used.
     */
    public static void main (String[] args)
    {
        String modelPath = (args != null && args.length > 0) ? args[0] : "/tmp/wellner/crfs/CRF_face";
        File f = new File (modelPath);
        ComputeUpperBound1 c = new ComputeUpperBound1 (f);
        c.loadCRF();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -