📄 shoeboxparser.java
字号:
// MK's code assumes that interlinear tiers are handled before additional 'symbolic // association' tiers. Therefore, reorder by putting the first interlinear line under the // interlinearRootMarker first in vdcs // iterate over 'to' names. If 'from' value is interlinearRootMarker and 'to' is element // of 'from' (has children), then make it topTierName /* Enumeration toEnum = typfile.tofromHash.keys(); while (toEnum.hasMoreElements()) { String toName = (String) toEnum.nextElement(); if ( (((String) typfile.tofromHash.get(toName)).equals(rootMarker)) && (typfile.tofromHash.containsValue(toName))) { topTierName = toName; break; } } */ topTierName = rootMarker; Vector vdcs = childs(rootMarker); if (topTierName != "") { int maxDepth = 0; Vector reorderedChildren = new Vector(); Iterator vdcsIter = vdcs.iterator(); while (vdcsIter.hasNext()) { String n = (String) vdcsIter.next(); // hb, 19-4-05 DefaultMutableTreeNode childNode = sbxfile.getLabelNode(n); if (childNode.getDepth() > maxDepth) { reorderedChildren.add(0, n); maxDepth = childNode.getDepth(); } else { reorderedChildren.add(n); } /* if (n.equals(topTierName)) { reorderedChildren.add(0, n); } else { reorderedChildren.add(n); } */ } vdcs = reorderedChildren; } // Annotation has to be split up by words int wordcounter = 0; boolean hasMoreWords = true; while (hasMoreWords) { // logger.log(Level.INFO, "-- create Block for word " + wordcounter); Enumeration dcs = vdcs.elements(); // logger.log(Level.INFO, "== vdcs=" + vdcs); //hasMoreWords = createChildsInBlock(annRec, dcs, row, null, wordcounter); // HS may 06 new implementation after change in ShoeboxArray: Unicode tiers are converted there hasMoreWords = createChildrenInBlock(annRec, dcs, row, null, wordcounter); wordcounter += 1; } } /** * Recursively find word boundaries and create annotations. * * @param par the parent annotation * @param brothers enumeration of sibling * @param row the current row or block or record index * @param wordboundaries the wordboundaries of the parent * @param wordcount the (current) index in the list of boundaries * @return true as long as there are more siblings to process * @throws Exception ?? */ private boolean createChildrenInBlock(AnnotationRecord par, Enumeration brothers, int row, ArrayList wordboundaries, int wordcount) throws Exception { boolean result = true; if (!brothers.hasMoreElements()) { // logger.log(Level.INFO, "== ending recursion"); return false; } String name = (String) brothers.nextElement(); String spk = sbxfile.getSpeaker(row); // logger.log(Level.INFO, // "== createChildsInBlock(" + par.getValue() + ", '" + name + "', " + // row + ", " + wordboundaries + ", " + wordcount); ArrayList mywordboundaries = null; String val = sbxfile.getCell(name, row); if ((val == null) || (val.length() == 0)) { // skip this value, but there might be more brothers return createChildrenInBlock(par, brothers, row, wordboundaries, wordcount); } if ((simpleConverter != null) && typfile.isIPAtier(name)) { val = simpleConverter.toUnicode(val); } Vector vdcs = childs(name); Enumeration dcs = vdcs.elements(); boolean iHaveKids = dcs.hasMoreElements(); // HS March 2007: if the marker has no children but is the first element in a Parse procedure // (subdivision) treat it as if it had children? if (!iHaveKids && typfile.getInterlinearTierMarkers().contains(name)) { iHaveKids = true; } if ((wordboundaries == null) && !iHaveKids) { // append only once if (wordcount > 0) { return false; } // logger.log(Level.INFO, "== ("+val.substring(0, 8)+") is hanging childless under \\ref "); // logger.log(Level.INFO, // "== (" + val + ") is hanging childless under \\ref "); } else { // tier is not under ref, kids or not // use the wordcounter if (wordboundaries == null) { // ref is my parent and I have kids, I set the wordboundaries and the wordcounter. mywordboundaries = wordbounds(val); result = wordcount < (mywordboundaries.size() - 2); } else { // my parent set some wordboundaries // I have to get the right word mywordboundaries = wordboundaries; // If I have (inner )wordboundaries myself, // I have to ignore parents bounds and set a new ones. String xval = snapWord(val, mywordboundaries, wordcount, true); int index = ((Integer) mywordboundaries.get(wordcount)).intValue(); // pad xval with spaces until endIndex (hb, 3 sept 04) int endIndex = ((Integer) mywordboundaries.get(wordcount + 1)).intValue(); int xvalLength = xval.length(); for (int i = 0; i < (endIndex - index - xvalLength - 1); i++) { xval += " "; } ArrayList xmywordboundaries = wordbounds(xval, index); // logger.log(Level.INFO, // " (" + xval + ") " + index + "/" + // xmywordboundaries); if (xmywordboundaries.size() > 2) { ///////////////////////////////// // recursion over trees within words (bern -e) int ww_wordcount = 0; boolean ww_hasMoreWords = true; while (ww_hasMoreWords) { String ww_val = snapWord(val, xmywordboundaries, ww_wordcount, true); // logger.log(Level.INFO, // " ........... '" + ww_val + "', of " + // ww_wordcount); // logger.log(Level.INFO, // " brothers.... '" + vdcs); AnnotationRecord annRec = new AnnotationRecord(); annRec.setAnnotationId(ANN_ID_PREFIX + annotId++); annRec.setValue(ww_val); if ((typfile.procedureTypeHash.get(name) != null) && (typfile.procedureTypeHash.get(name).equals("TimeSubdivision") || typfile.procedureTypeHash.get(name).equals("IncludedIn"))) { // alignable annot annRec.setAnnotationType(AnnotationRecord.ALIGNABLE); annRec.setReferredAnnotId(par.getAnnotationId()); // used to create and connect timeslots // NOTE: order of these 3 statements important annotRecordToTierMap.put(annRec, name + "@" + spk); createAndConnectTimeSlots(annRec); annotationRecords.add(annRec); } else { // ref annotation annRec.setAnnotationType(AnnotationRecord.REFERENCE); annRec.setReferredAnnotId(par.getAnnotationId()); // NOTE: order of these 3 statements important annotRecordToTierMap.put(annRec, name + "@" + spk); fillInPrevAnnotRef(annRec); annotationRecords.add(annRec); } if (!participantOrder.contains(spk)) { participantOrder.add(spk); } tierNameSet.add(name + "@" + spk); // logger.log(Level.INFO, "ww "); createChildrenInBlock(annRec, vdcs.elements(), row, xmywordboundaries, ww_wordcount); ww_hasMoreWords = ww_wordcount < (xmywordboundaries.size() - 2); ww_wordcount += 1; } return createChildrenInBlock(par, brothers, row, wordboundaries, wordcount); ////////////////////////////////////////////// } } // snap sentence to word val = snapWord(val, mywordboundaries, wordcount, true); } AnnotationRecord aRec = new AnnotationRecord(); aRec.setAnnotationId(ANN_ID_PREFIX + annotId++); aRec.setValue(val); if ((typfile.procedureTypeHash.get(name) != null) && (typfile.procedureTypeHash.get(name).equals("TimeSubdivision") || typfile.procedureTypeHash.get(name).equals("IncludedIn"))) { // alignable annot aRec.setAnnotationType(AnnotationRecord.ALIGNABLE); aRec.setReferredAnnotId(par.getAnnotationId()); // used to create and connect timeslots // NOTE: order of these 3 statements important annotRecordToTierMap.put(aRec, name + "@" + spk); createAndConnectTimeSlots(aRec); annotationRecords.add(aRec); } else { // ref annot aRec.setAnnotationType(AnnotationRecord.REFERENCE); aRec.setReferredAnnotId(par.getAnnotationId()); // NOTE: order of next 3 statements important annotRecordToTierMap.put(aRec, name + "@" + spk); fillInPrevAnnotRef(aRec); annotationRecords.add(aRec); } if (!participantOrder.contains(spk)) { participantOrder.add(spk); } tierNameSet.add(name + "@" + spk); // System.out.println("added annot: " + me.getValue()); if (iHaveKids) { createChildrenInBlock(aRec, dcs, row, mywordboundaries, wordcount); } createChildrenInBlock(par, brothers, row, wordboundaries, wordcount); return result; } /** * DOCUMENT ME! * * @param s DOCUMENT ME! * * @return DOCUMENT ME! */ private final ArrayList wordbounds(String s) { return wordbounds(s, 0); } /** * Get the wordboundaries from given String. Wordboundaries are the * positions of all white space. If white space is followed by white * space, the last position is used. * * @param s given String * @param offset to add to all wordboundaries * * @return ArrayList of wordboundaries */ private final ArrayList wordbounds(String s, int offset) { //System.out.println(""+val+ " ---- entry"); ArrayList result = new ArrayList(); result.add(new Integer(offset)); ArrayList idx = indicesOf(s.trim(), ' '); //ArrayList idx = indexesOf(s, utf8, ' '); //System.out.println(""+v1+ " ---- indexes of"); idx = lastIntInRow(idx); idx = addToAllIntegers(idx, offset + 1); //System.out.println(""+v1+ " ---- last in row"); result.addAll(idx); // hb, 2-9-04: added +1 because rest of code assumes space between // word beginnings // String.getBytes(charset).length can be different from String.length() result.add(new Integer(s.length() + 1 + offset)); // ending on ws result = lastIntInRow(result); //System.out.println(""+wordboundaries+ " ---- result"); return result; } /** * Returna a list with all indices of a certain char. * @param val the string * @param lookingfor the character to find in the string * @return a list of indices */ private final ArrayList indicesOf(String val, char lookingfor) { ArrayList result = new ArrayList(); try { char[] chars = new char[val.length()]; val.getChars(0, val.length(), chars, 0); for (int i = 0; i < chars.length; i++) { if (chars[i] == lookingfor) { result.add(new Integer(i)); } } } catch (Exception e) { e.printStackTrace(); } return result; } /** * Extract a word from the input string, based on word boundaries and index. * * May 2006: unproper alignment was fixed in previous version in non utf-8 markers. * This distinction is no longer made (see ShoeboxArray): * @param val the complete line * @param wb the boundary indices * @param wc the index into the boundary list * @param trim whether or not to trim the result * @return the extracted word */ private final String snapWord(String val, ArrayList wb, int wc, boolean trim) { //logger.log(Level.FINE, "-- snap (" + val+ ", " + wb+ ", " + wc); String result = ""; int b = 0; int e = 0; if (wc < wb.size()) { b = ((Integer) wb.get(wc)).intValue(); } if (wc < (wb.size() - 1)) { e = ((Integer) wb.get(wc + 1)).intValue(); } if (val.length() < e) { e = val.length(); } // HB, 3 nov 04, hack to fix improper shoebox alignment pattern // ... woi Bia teri... // ... woi teri... // ... mother across... if (fixImproperAlign) { if (val.charAt(e - 1) != ' ') { // take with previous word
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -