📄 shoeboxarray.java
字号:
nlFound = false; } int numOfSpaces = maxLengthInInterlinearBlock - l; String spaces = ""; for (int i = 0; i < numOfSpaces; i++) { spaces += " "; } String newC = ""; if (nlFound) { newC = c.substring(0, l) + spaces + " " + c.substring(l + 1); } else { newC = c.substring(0, l) + spaces; } shoeboxArray[lblIndex][currentIndexBlock] = newC; // store in array } } // decrement all lineCounts Iterator markerIter4 = interlinearTierMarkers.iterator(); while (markerIter4.hasNext()) { String lbl = (String) markerIter4.next(); int newCount = ((Integer) (lineCounts.get(lbl))).intValue() - 1; if (newCount < 0) { newCount = 0; } Integer i = new Integer(newCount); lineCounts.put(lbl, i); // decr count } } /* Used for preparation (counting) and storing. */ private final void readSbx() throws Exception { String line = null; String utf8Line = null; /* A shoebox file may contain 8byte characters from custom fonts. Treating it as isolatin-1 may introduce character errors! */ Reader filereader = null; Reader utf8FileReader = null; MediaDescriptor mediaDescriptor = null; mediaDescriptors = new ArrayList(); // boolean useDedicatedCharacterset = false; // if (useDedicatedCharacterset) { // InputStream fis = new FileInputStream(file); // filereader = new InputStreamReader(fis, "DedicatedCharacterset"); // } else { // use the default encoding // filereader = new FileReader(file); filereader = new InputStreamReader(new FileInputStream(file), "ISO-8859-1"); utf8FileReader = new InputStreamReader(new FileInputStream(file), "UTF-8"); // } // explicit performance care: buffering the filereader BufferedReader br = new BufferedReader(filereader); BufferedReader utf8Br = new BufferedReader(utf8FileReader); String label = null; prevLabel = null; String content = null; int linenumber = 0; while ((line = br.readLine()) != null) { utf8Line = utf8Br.readLine(); prevLabel = label; // if unicode tier, substitute line with utf8Line StringTokenizer t = new StringTokenizer(line); if (t.hasMoreTokens()) { label 
= t.nextToken(); // the first word if ((label.length() > 1) && (!label.startsWith("\\") || ((label.charAt(0) == '\\') && ((label.charAt(1) == ' ') || (label.charAt(1) == '\t'))))) { label = prevLabel; } if (typFile.isUnicodeTier(label)) { line = utf8Line; if (!isShoeboxArrayPreparation && interlinearTierMarkers.contains(label)) { line = decodeToolboxUnicode(line); } } } linenumber++; line = line.trim(); // logger.log(Level.FINE, " ..." + line); if (linenumber == 1) { // HS 06-2006 extended the test with support for files with the Unicode Byte Order Mark, // \ufeff if ((line.startsWith("\\_sh v4.0")) || (line.startsWith("\\_sh v3.0")) || (utf8Line.startsWith("\ufeff\\_sh v3.0") || utf8Line.startsWith("\ufeff\\_sh v4.0"))) { shoeboxheader = line; // last token is database type, store in ShoeboxTypFile String dbType = ""; while (t.hasMoreTokens()) { dbType = t.nextToken(); } if (!dbType.equals("")) { typFile.setDatabaseType(dbType); } continue; } else { throw new Exception( "A shoebox file must begin with '\\_sh v4.0' or '\\_sh v3.0', found " + line + "!"); } } if (line.length() == 0) { // skip white lines continue; } if (line.startsWith("\\_") && (currentIndexBlock == -1)) { // add to header shoeboxheader = shoeboxheader + "\n" + line; continue; } if (!line.startsWith("\\")) { /* when a line does not start with a label, this is an error. If we are not in strict mode, we assume that the preceding line is continued. 
*/ if (strict1) { throw new Exception("tier without leading label \"" + line + "\""); } //else nevertested if (currentIndexLabel < 0) { throw new Exception( "There is no tier where I can append \"" + line + "\" to!"); } if (isShoeboxArrayPreparation) { continue; } // hacky append String oldContent = shoeboxArray[currentIndexLabel][currentIndexBlock]; //if (oldContent.length() == 0) { // HS 06-2006 only throw exception when there is no old contents if (oldContent == null) { throw new Exception( "There is no tier where I can append \"" + line + "\" to!"); } // concatenate, do not mark the point of concatenation, // fix error silently. shoeboxArray[currentIndexLabel][currentIndexBlock] = oldContent + " " + line; content = ""; // ?? // logger.log(Level.FINE, " appended (" + line + ")"); continue; } else if (!isShoeboxArrayPreparation && ((line.length() == 1) || (line.charAt(1) == ' ') || (line.charAt(1) == '\t'))) { //HS june 2006: allow a single backslash to be part of the content of a marker // append if (shoeboxArray[currentIndexLabel][currentIndexBlock] != null) { shoeboxArray[currentIndexLabel][currentIndexBlock] = shoeboxArray[currentIndexLabel][currentIndexBlock] + " " + line; content = ""; } continue; } // tokenize the shoebox line into label and content { StringTokenizer xxx = new StringTokenizer(line); //prevLabel = label; label = xxx.nextToken(); // the first word storeLabelInOrder(label, prevLabel); // label contains leading backslash! 
} content = ""; if (line.length() > label.length()) { content = (line.substring(label.length() + 1)); } // strip trailing spaces, if any if (content.length() > 0) { int lastNonSpaceIndex = content.length() - 1; while (content.charAt(lastNonSpaceIndex) == ' ') { lastNonSpaceIndex--; } if ((lastNonSpaceIndex < (content.length() - 1)) && (lastNonSpaceIndex >= 0)) { content = content.substring(0, lastNonSpaceIndex); } } if (line.startsWith(ShoeboxEncoder.elanMediaURLLabel)) { if (mediaDescriptor != null) { mediaDescriptors.add(mediaDescriptor); } mediaDescriptor = new MediaDescriptor(content, null); continue; } if (line.startsWith(ShoeboxEncoder.elanMediaMIMELabel)) { if (mediaDescriptor != null) { mediaDescriptor.mimeType = content; } continue; } if (line.startsWith(ShoeboxEncoder.elanMediaExtractedLabel)) { if (mediaDescriptor != null) { mediaDescriptor.extractedFrom = content; } continue; } if (line.startsWith(ShoeboxEncoder.elanMediaOriginLabel)) { if (mediaDescriptor != null) { mediaDescriptor.timeOrigin = new Long(content).longValue(); } continue; } store_label(label); if (label.equals(label_ref)) { // lastlabel = label_ref; prepare_or_finish_block(); // HB, 31 jul 02, reset lineCounts Iterator markerIter = interlinearTierMarkers.iterator(); while (markerIter.hasNext()) { lineCounts.put(markerIter.next(), new Integer(0)); } } store_label_and_content(label, content); } // add last pending mediaDescriptor, if present if (mediaDescriptor != null) { mediaDescriptors.add(mediaDescriptor); } prepare_or_finish_block(); //checkIfCompletelyAligned(); br.close(); filereader.close(); isShoeboxArrayPreparation = false; // only once } /* private void checkIfCompletelyAligned() { // if completelyUnaligned is false there is at least one time set. // For the moment time alignment must be complete, otherwise imported file is // to be treated as completely unaligned. 
// TEMPORARY: method can be removed when proper dealing with partial time alignment
           // on top level tiers is implemented
           if (!completelyUnaligned && !isShoeboxArrayPreparation) {
               treatAsUnaligned = false;
               // check t0's for value -1
               int x = labelList.indexOf(ShoeboxEncoder.elanBeginLabel);
               String[] beginStrings = shoeboxArray[x];
               for (int i = 0; i < beginStrings.length; i++) {
                   if (beginStrings[i].equals("-1")) {
                       treatAsUnaligned = true;
                       break;
                   }
               }
               // check t1's for value -1, only if treatAsUnaligned isn't already true
               if (!treatAsUnaligned) {
                   x = labelList.indexOf(ShoeboxEncoder.elanEndLabel);
                   String[] endStrings = shoeboxArray[x];
                   for (int j = 0; j < endStrings.length; j++) {
                       if (endStrings[j].equals("-1")) {
                           treatAsUnaligned = true;
                           break;
                       }
                   }
               }
           }
       }
     */

    /**
     * Returns the first label, in the order delivered by {@code getLabels()}, that
     * is not one of the ELAN bookkeeping labels (begin, end, participant, ELAN,
     * block start), is not a key of {@code typFile.tofromHash} and has a non-null
     * cell in the given row.
     *
     * @param row the index of the block to inspect
     *
     * @return the label found, or the empty string if no label qualifies
     */
    public String getRootMarkerForBlock(int row) {
        String result = "";

        Enumeration en = getLabels();

        while (en.hasMoreElements()) {
            String lbl = (String) en.nextElement();

            if (lbl.equals(ShoeboxEncoder.elanBeginLabel) ||
                    lbl.equals(ShoeboxEncoder.elanEndLabel) ||
                    lbl.equals(ShoeboxEncoder.elanParticipantLabel) ||
                    lbl.equals(ShoeboxEncoder.elanELANLabel) ||
                    lbl.equals(ShoeboxEncoder.elanBlockStart)) {
                continue;
            }

            if (!typFile.tofromHash.containsKey(lbl) &&
                    (getCell(lbl, row) != null)) {
                result = lbl;

                break;
            }
        }

        return result;
    }

    /**
     * Walks over all "block" elements of a WAC document and their "tier"
     * descendants. During the preparation pass the block counter is incremented and
     * every tier name is registered with {@code store_label}; otherwise the tier's
     * text is written with {@code overwriteContent}. After each block
     * {@code prepare_or_finish_block()} is called.
     *
     * @param doc the parsed WAC file
     *
     * @throws Exception declared for the helper methods called here
     */
    private final void readWac(Document doc) throws Exception {
        NodeList blockList = doc.getElementsByTagName("block");

        for (int i = 0; i < blockList.getLength(); i++) {
            Element blockElement = (Element) blockList.item(i);

            if (isShoeboxArrayPreparation) {
                maxIndexBlocks += 1;
            }

            NodeList tierList = blockElement.getElementsByTagName("tier");

            for (int j = 0; j < tierList.getLength(); j++) {
                Element tierElement = (Element) tierList.item(j);

                //MK:02/11/29 the sad tale of standard procedures: WAC tiernames must follow sbx \-convention...
                String tierName = "\\" + tierElement.getAttribute("name");

                // An empty tier element has no child node; the previous code threw a
                // NullPointerException on it. Treat it as empty content instead.
                org.w3c.dom.Node firstChild = tierElement.getFirstChild();
                String tierValue = (firstChild != null)
                    ? firstChild.getNodeValue() : "";

                if (isShoeboxArrayPreparation) {
                    store_label(tierName);

                    // System.out.println(i+"/"+j+") " + tierName + ": " + tierValue);
                } else {
                    overwriteContent(tierName, i, tierValue);
                }
            }

            prepare_or_finish_block();
        }
    }

    /**
     * Toolbox uses 2 or more bytes for certain characters on Unicode markers and
     * stores interlinearization on the basis of byte position, which causes a
     * problem for UTF-8 encodings of more than 1 byte.
     * The interlinear alignment based on whitespace characters is corrected here by
     * adding extra space characters in between words: after the space that ends a
     * word, one space is inserted for every byte the word occupies in UTF-8 beyond
     * its number of Java chars. This way the alignment corresponds to the alignment
     * in ISO Latin markers.
     *
     * <p>Padding owed to a final word that is not followed by a space is dropped;
     * the previous implementation returned it as trailing U+0000 characters.</p>
     *
     * @param value the original Toolbox unicode encoded String
     *
     * @return the modified string, or null if value is null
     */
    private String decodeToolboxUnicode(String value) {
        if (value == null) {
            return value;
        }

        int length = value.length();
        char[] chars = value.toCharArray();

        // first count how many chars can be added at most
        int maxPadding = 0;

        for (int i = 0; i < length; i++) {
            maxPadding += extraUtf8Bytes(chars[i]);
        }

        char[] resChars = new char[length + maxPadding];
        int pending = 0; // padding owed for the word currently being copied
        int k = 0; // number of chars written to resChars

        for (int i = 0; i < length; i++) {
            char cc = chars[i];
            resChars[k++] = cc;

            if (cc == ' ') {
                // end of a word: pay out the padding collected for it
                for (; pending > 0; pending--) {
                    resChars[k++] = ' ';
                }
            } else {
                pending += extraUtf8Bytes(cc);
            }
        }

        // only the part actually written; the rest of the array is still U+0000
        return new String(resChars, 0, k);
    }

    /**
     * Returns how many bytes the given char contributes to a UTF-8 encoding beyond
     * the one position it occupies in a Java String.
     *
     * @param cc the char to examine
     *
     * @return 0, 1 or 2
     */
    private int extraUtf8Bytes(char cc) {
        // NOTE(review): U+0000 is a single byte in standard UTF-8 (two bytes only in
        // Java's modified UTF-8); counted as 2 bytes here as in the original code.
        if ((cc == '\u0000') || ((cc >= '\u0080') && (cc <= '\u07ff'))) { // 2 bytes

            return 1;
        }

        if ((cc >= 0xD800) && (cc <= 0xDFFF)) {
            // one half of a surrogate pair: the pair is 2 chars and 4 bytes in UTF-8,
            // so each half accounts for 1 extra byte (it was counted as 2 before)
            return 1;
        }

        if (cc >= '\u0800') { // 3 bytes

            return 2;
        }

        return 0;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -