📄 shoeboxparser.java
字号:
// remember the annot id with the highest index part String idString = aRec.getAnnotationId(); long annotIndex = Long.valueOf(idString.substring( ANN_ID_PREFIX.length())).longValue(); // get rid of prefix part if (annotIndex > highestIndex) { highestIndex = annotIndex; highestAnnot = aRec; } } } if (highestIndex >= 0) { // make new begin timeslot // set end of highest existing annot to this begin long beginTSId = tsId++; annRec.setBeginTimeSlotId(TS_ID_PREFIX + Long.toString(beginTSId)); String oldEndTSId = highestAnnot.getEndTimeSlotId(); highestAnnot.setEndTimeSlotId(TS_ID_PREFIX + Long.toString(beginTSId)); updateChildAnnot(highestAnnot, oldEndTSId); // store timeslots long[] begin = { beginTSId, TimeSlot.TIME_UNALIGNED }; timeSlots.add(begin); // find index of aRec's begin ts in timeOrder String beginId = highestAnnot.getBeginTimeSlotId(); String beginNo = ""; int index = timeOrder.size(); if (beginId != null) { beginNo = highestAnnot.getBeginTimeSlotId().substring(TS_ID_PREFIX.length()); for (int i = 0; i < timeOrder.size(); i++) { long[] ts = (long[]) timeOrder.get(i); if (ts[0] == new Integer(beginNo).intValue()) { index = i; break; } } } if (index > (timeOrder.size() - 1)) { timeOrder.add(begin); } else { timeOrder.add(index + 1, begin); } } else { // first, connect to parent begin if (parentRec != null) { annRec.setBeginTimeSlotId(parentRec.getBeginTimeSlotId()); } } // set end to end of parent if (parentRec != null) { annRec.setEndTimeSlotId(parentRec.getEndTimeSlotId()); } } /** * Recursively update the end timeslot id of annotations referring to this annotation referring to the same * old end time slot. 
* * @param par the annotatiom that has been modified * @param oldEndTSId the old timeslot id */ private void updateChildAnnot(AnnotationRecord par, String oldEndTSId) { AnnotationRecord aRec = null; Iterator annIter = annotationRecords.iterator(); while (annIter.hasNext()) { aRec = (AnnotationRecord) annIter.next(); if ((aRec.getReferredAnnotId() == par.getAnnotationId()) && (aRec.getEndTimeSlotId() == oldEndTSId)) { aRec.setEndTimeSlotId(par.getEndTimeSlotId()); updateChildAnnot(aRec, oldEndTSId); return; } } } /** * Word boundaries without offset * * @param s given String * * @return index of last of contiguous whitespace */ private final ArrayList wbound(String s, boolean utf8) { return wbound(s, utf8, 0); } /** * Get the wordboundaries from given String. Wordboundaries are the * positions of all white space. If white space is followed by white * space, the last position is used. * * @param s given String * @param offset to add to all wordboundaries * * @return ArrayList of wordboundaries */ private final ArrayList wbound(String s, boolean utf8, int offset) { //System.out.println(""+val+ " ---- entry"); ArrayList result = new ArrayList(); result.add(new Integer(offset)); ArrayList idx = indexesOf(s.trim(), utf8, ' '); //ArrayList idx = indexesOf(s, utf8, ' '); //System.out.println(""+v1+ " ---- indexes of"); idx = lastIntInRow(idx); idx = addToAllIntegers(idx, offset + 1); //System.out.println(""+v1+ " ---- last in row"); result.addAll(idx); // hb, 2-9-04: added +1 because rest of code assumes space between // word beginnings /* Integer lastBound = (Integer) result.lastElement(); if (lastBound.intValue() == s.length() + offset - 1) { result.remove(lastBound); result.add(new Integer(s.length() + offset - 2)); }*/ // this is now done in indexesOf, because the values are based on bytes positions // String.getBytes(charset).length can be different from String.length() //if (!utf8) { result.add(new Integer(s.length() + 1 + offset)); //} // ending on ws result = 
lastIntInRow(result); //System.out.println(""+wordboundaries+ " ---- result"); return result; } /** * Returns the list of all integers i where myself.indexOf(i) is true. * * @param myself 'this' String * @param lookingfor the String you look for * * @return ArrayList of Integers. */ private final static ArrayList indexesOf(String myself, String lookingfor) { ArrayList result = new ArrayList(); int i = myself.indexOf(lookingfor, 0); while (i != -1) { result.add(new Integer(i)); i = myself.indexOf(lookingfor, i + 1); } return result; } /** * DOCUMENT ME! * * @param myself DOCUMENT ME! * @param utf8 DOCUMENT ME! * @param lookingfor DOCUMENT ME! * * @return DOCUMENT ME! */ private final static ArrayList indexesOf(String myself, boolean utf8, char lookingfor) { ArrayList result = new ArrayList(); try { // bytes are needed for proper counting, since alignment // is on basis of bytes. byte[] bytes = null; if (utf8) { bytes = myself.getBytes("UTF-8"); } else { bytes = myself.getBytes("ISO-8859-1"); } for (int i = 0; i < bytes.length; i++) { if (bytes[i] == lookingfor) { result.add(new Integer(i)); } } // temp add the las index if it is not yet in there //if (utf8 && result.size() > 0 && ((Integer)result.get(result.size() - 1)).intValue() != bytes.length - 1) { // result.add(new Integer(bytes.length - 1)); //} } catch (Exception e) { e.printStackTrace(); } return result; } /** * DOCUMENT ME! 
*
* <p>The list is walked once. A value is copied to the result when the value after it
* is not exactly one higher; the final value is always copied. An empty list gives
* an empty result.
*
* @param in ArrayList of ordered Integers
*
* @return ArrayList of ordered Integers, only last in row
*/
private final static ArrayList lastIntInRow(ArrayList in) {
    ArrayList kept = new ArrayList();
    Integer previous = null;
    int previousValue = 0;

    for (int pos = 0; pos < in.size(); pos++) {
        Integer current = (Integer) in.get(pos);
        int currentValue = current.intValue();

        // a gap after 'previous' means 'previous' closed its run
        if ((previous != null) && ((previousValue + 1) != currentValue)) {
            kept.add(previous);
        }

        previous = current;
        previousValue = currentValue;
    }

    if (previous != null) {
        kept.add(previous);
    }

    return kept;
}

/**
 * Builds a copy of a list in which the offset has been added to every Integer.
 * Elements that are not Integers are copied as they are.
 *
 * @param myself 'this' ArrayList
 * @param offset the offset you want to add
 *
 * @return new ArrayList of Objects, the given list is not changed
 */
private final static ArrayList addToAllIntegers(ArrayList myself, int offset) {
    ArrayList shifted = new ArrayList();

    for (Iterator elements = myself.iterator(); elements.hasNext();) {
        Object element = elements.next();

        if (!(element instanceof Integer)) {
            shifted.add(element);

            continue;
        }

        shifted.add(new Integer(((Integer) element).intValue() + offset));
    }

    return shifted;
}

/**
 * DOCUMENT ME!
 *
 * @param val DOCUMENT ME!
 * @param wb DOCUMENT ME!
 * @param wc DOCUMENT ME!
 *
 * @return DOCUMENT ME!
 */
private final String snap(String val, ArrayList wb, int wc, boolean trim,
    boolean utf8) {
    //logger.log(Level.FINE, "-- snap (" + val+ ", " + wb+ ", " + wc);
    String result = "";
    int b = 0;
    int e = 0;

    if (wc < wb.size()) {
        b = ((Integer) wb.get(wc)).intValue();
    }

    if (wc < (wb.size() - 1)) {
        e = ((Integer) wb.get(wc + 1)).intValue();
    }

    if (!utf8 && (val.length() < e)) {
        e = val.length();
    }

    // HB, 3 nov 04, hack to fix improper shoebox alignment pattern
    // ... woi Bia teri...
    // ... woi teri...
    // ... mother across...
if (!utf8) { if (val.charAt(e - 1) != ' ') { // take with previous word if (wc < (wb.size() - 2)) { e = ((Integer) wb.get(wc + 2)).intValue(); } } if (val.length() < e) { e = val.length(); } if (val.length() < b) { b = val.length(); } if ((b > 0) && (val.charAt(b - 1) != ' ')) { // ignore, if taken with previous word b = e; } } // toolbox stores interlinearization on basis of byte position // This causes a problem in case of UTF-8 encodings of more than // 1 byte. // Correct b and e to fix this if (utf8) { char[] chars = new char[val.length()]; val.getChars(0, val.length(), chars, 0); for (int i = 0; i < chars.length; i++) { char ch = chars[i]; if ((ch == '\u0000') || ((ch >= '\u0080') && (ch <= '\u07ff'))) { // 2 bytes if (i < b) { b--; } if (i <= e) { e--; } } else if ((ch >= '\u0800') && (ch <= '\uffff')) { // 3 bytes if (i < b) { b -= 2; } if (i <= e) { e -= 2; } } if (i > e) { break; } } } if (b > e) { System.out.println("Val: " + val); System.out.println("b > e: " + b + " - " + e + " l: " + val.length()); e = val.length();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -