📄 dobesparser.java
字号:
/* * File: DobesParser.java * Project: MPI Linguistic Application * Date: 02 May 2007 * * Copyright (C) 2001-2007 Max Planck Institute for Psycholinguistics * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package mpi.eudico.server.corpora.clomimpl.dobes;import mpi.eudico.server.util.ServerConfiguration;import org.xml.sax.AttributeList;import org.xml.sax.HandlerBase;import org.xml.sax.InputSource;import org.xml.sax.SAXParseException;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.util.Collections;import java.util.Hashtable;import java.util.Iterator;import java.util.Vector;import javax.xml.parsers.SAXParser;import javax.xml.parsers.SAXParserFactory;/** * A Parser for DOBES minimal format compliant XML files. MAYBE THIS CLASS MUST * BE MADE THREAD SAFE BY ADDING SOME SYNCHRONIZED BLOCKS OR BY GIVING UP THE * SINGLETON PATTERN. * * @author Hennie Brugman * @version 6-Apr-2001 */public class DobesParser extends HandlerBase { private static DobesParser parser; /** The DOBES minimal XML file is parsed. */ private final Float ORDERED_KEYS_KEY = new Float(0.12345); private boolean verbose; private SAXParser saxParser; private String lastParsed; private String currentFileName; private File xmlFile; private boolean parseError; private Hashtable tiers; private String currentTierId; private String currentAnnotationId; private String currentSpeakerId; private String currentStart; private String currentEnd; private String content; /** * Private constructor for DobesParser because the Singleton pattern is * applied here. */ private DobesParser() { try { SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(true); saxParser = factory.newSAXParser(); lastParsed = ""; verbose = false; } catch (Exception e) { e.printStackTrace(); } } /** * The instance method returns the single incarnation of DobesParser to the * caller. * * @return DOCUMENT ME! */ public static DobesParser Instance() { if (parser == null) { parser = new DobesParser(); } return parser; } /** * Returns the names of the Tiers that are present in the Transcription * file * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public Vector getTierNames(String fileName) { // make sure that the correct file has been parsed if (!lastParsed.equals(fileName)) { parse(fileName); } Vector tierNames = new Vector(tiers.keySet()); Collections.sort(tierNames); return tierNames; } /** * Returns a Vector with the Annotations for this Tier. Each * AnnotationRecord contains begin time, end time and text values * * @param tierName DOCUMENT ME! * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public Vector getAnnotationsFor(String tierName, String fileName) { // make sure that the correct file has been parsed if (!lastParsed.equals(fileName)) { parse(fileName); } long start = System.currentTimeMillis(); Vector annotationVector = new Vector(); // get the tags from the tiers Hashtable Hashtable annotations = (Hashtable) tiers.get(tierName); // get an iterator that iterates over the tags in the right order. Iterator iter = ((Vector) annotations.get(ORDERED_KEYS_KEY)).iterator(); while (iter.hasNext()) { Vector annotation = (Vector) annotations.get(iter.next()); annotationVector.add(annotation); } long duration = System.currentTimeMillis() - start; // System.out.println("Extracting Annotations took " + duration + " milli seconds"); return annotationVector; } /** * Parses a DOBES-minimal compliant xml file. * * @param fileName the DOBES-minimal compliant xml file that must be * parsed. */ private void parse(String fileName) { long start = System.currentTimeMillis(); try { // System.out.println("Parse : " + fileName); // System.out.println("Free memory : " + Runtime.getRuntime().freeMemory()); // only parse the same file once if (lastParsed.equals(fileName)) { return; } tiers = new Hashtable(); // parse the file xmlFile = new File(fileName); lastParsed = fileName; currentFileName = fileName; saxParser.parse(xmlFile, this); } catch (Exception e) { printErrorLocationInfo("Fatal(?) Error! " + e.getMessage()); } long duration = System.currentTimeMillis() - start; // System.out.println("Parsing took " + duration + " milli seconds"); } /** * HandlerBase method */ public void startDocument() { parseError = false; } /** * HandlerBase method */ public void endDocument() { } /** * HandlerBase method * * @param name DOCUMENT ME! * @param attributes DOCUMENT ME! */ public void startElement(String name, AttributeList attributes) { content = null; if (name.equals("HEADER")) { // implement when dealing with MediaObject } else if (name.equals("CHUNK")) { currentSpeakerId = attributes.getValue("SPEAKER"); currentStart = attributes.getValue("START"); currentEnd = attributes.getValue("END"); /* currentTierId = attributes.getValue("SPEAKER"); // First check whether this tier already exists if (!tiers.containsKey(currentTierId)) { // create an entry in the tiers Hashtable that can hold the Tag values for the new Tier tiers.put(currentTierId, new Hashtable()); // put the Vector with the ordered key info in the Hashtable ((Hashtable) tiers.get(currentTierId)).put(ORDERED_KEYS_KEY, new Vector()); } // The id attribute contains the Annotation identifier, start time is used as id. currentAnnotationId = attributes.getValue("START");
// create new "AnnotationRecord" and add to annotations Hashtable for current tier ((Hashtable) tiers.get(currentTierId)).put(currentAnnotationId, new Vector()); ((Vector)((Hashtable) tiers.get(currentTierId)).get(ORDERED_KEYS_KEY)).add(currentAnnotationId); // add start and end times to this AnnotationRecord ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(attributes.getValue("START")); ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(attributes.getValue("END")); */ } else if (name.equals("RENDERED_TEXT") || name.equals("TRANSLATION")) { currentTierId = currentSpeakerId; if (name.equals("RENDERED_TEXT")) { currentTierId += "-RT"; } else { currentTierId += "-TR"; } // First check whether this tier already exists if (!tiers.containsKey(currentTierId)) { // create an entry in the tiers Hashtable that can hold the Annotation values for the new Tier tiers.put(currentTierId, new Hashtable()); // put the Vector with the ordered key info in the Hashtable ((Hashtable) tiers.get(currentTierId)).put(ORDERED_KEYS_KEY, new Vector()); } // The id attribute contains the Annotation identifier, start time is used as id. currentAnnotationId = currentStart; // create new "AnnotationRecord" and add to annotations Hashtable for current tier ((Hashtable) tiers.get(currentTierId)).put(currentAnnotationId, new Vector()); ((Vector) ((Hashtable) tiers.get(currentTierId)).get(ORDERED_KEYS_KEY)).add(currentAnnotationId); // add start and end times to this AnnotationRecord ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(currentStart); ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(currentEnd); } } //startElement /** * HandlerBase method * * @param name DOCUMENT ME! */ public void endElement(String name) { if (name.equals("RENDERED_TEXT") || name.equals("TRANSLATION")) { ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(content); } } /** * HandlerBase method * * @param buf DOCUMENT ME! * @param start DOCUMENT ME! * @param length DOCUMENT ME! */ public void characters(char[] buf, int start, int length) { if (content == null) { content = removeWhiteSpace(buf, start, length); } else { content += removeWhiteSpace(buf, start, length); } } /** * HandlerBase method * * @param publicId DOCUMENT ME! * @param systemId DOCUMENT ME! * * @return DOCUMENT ME! */ public InputSource resolveEntity(String publicId, String systemId) { InputSource inputSource = null; try { // Open an InputSource to a DOBES-minimal DTD // The location of the dtd defs is under the corpus directory in the path dobes/dtd. if (systemId.endsWith(".dtd")) { int to = systemId.indexOf(".dtd") + 4; int from = systemId.lastIndexOf('/', to) + 1; String fileName = ServerConfiguration.CORPUS_DIRECTORY + File.separator + "dobes" + File.separator + "dtd" + File.separator + systemId.substring(from, to); // inputSource = new InputSource(new FileInputStream(fileName)); // inputSource = new InputSource(StringUtil.openEncodedFile("UTF-8", fileName)); inputSource = new InputSource(new InputStreamReader( new FileInputStream(fileName), "UTF8")); } } catch (Exception e) { e.printStackTrace(); } return inputSource; } /** * HandlerBase method * * @param e DOCUMENT ME! */ public void error(SAXParseException e) { printErrorLocationInfo("Parse error " + e.getMessage()); parseError = true; } /** * DOCUMENT ME! * * @param e DOCUMENT ME! */ public void fatalError(SAXParseException e) { printErrorLocationInfo("Fatal Parse Error " + e.getMessage()); parseError = true; } private String removeWhiteSpace(char[] buf, int start, int length) { int from = start; int to = start + length; /* for (int i = start; i < start + length; i++) { if (buf[i] == ' ' || buf[i] == '\t') { from++; } else { to = from; for (int j = from; j < start + length; j++) { if (buf[j] != ' ' && buf[i] != '\t') { to++; } else { break; } } break; } }
*/ return new String(buf, from, to - from); } private void println(String s) { if (verbose) { System.out.println(s); } } private void printErrorLocationInfo(String message) { System.out.println(message); System.out.println("Exception for " + currentFileName); System.out.println("Tier id " + currentTierId); System.out.println("Annotation id " + currentAnnotationId); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -