📄 dobesparser.java

📁 编辑视频文件
💻 JAVA
字号:
/* * File:     DobesParser.java * Project:  MPI Linguistic Application * Date:     02 May 2007 * * Copyright (C) 2001-2007  Max Planck Institute for Psycholinguistics * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package mpi.eudico.server.corpora.clomimpl.dobes;import mpi.eudico.server.util.ServerConfiguration;import org.xml.sax.AttributeList;import org.xml.sax.HandlerBase;import org.xml.sax.InputSource;import org.xml.sax.SAXParseException;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.util.Collections;import java.util.Hashtable;import java.util.Iterator;import java.util.Vector;import javax.xml.parsers.SAXParser;import javax.xml.parsers.SAXParserFactory;/** * A Parser for DOBES minimal format compliant XML files. MAYBE THIS CLASS MUST * BE MADE THREAD SAFE BY ADDING SOME SYNCHRONIZED BLOCKS OR BY GIVING UP THE * SINGLETON PATTERN. * * @author Hennie Brugman * @version 6-Apr-2001 */public class DobesParser extends HandlerBase {    private static DobesParser parser;    /** The DOBES minimal XML file is parsed. */    private final Float ORDERED_KEYS_KEY = new Float(0.12345);    private boolean verbose;    private SAXParser saxParser;    private String lastParsed;    private String currentFileName;    private File xmlFile;    private boolean parseError;    private Hashtable tiers;    private String currentTierId;    private String currentAnnotationId;    private String currentSpeakerId;    private String currentStart;    private String currentEnd;    private String content;    /**     * Private constructor for DobesParser because the Singleton pattern is     * applied here.     */    private DobesParser() {        try {            SAXParserFactory factory = SAXParserFactory.newInstance();            factory.setValidating(true);            saxParser = factory.newSAXParser();            lastParsed = "";            verbose = false;        } catch (Exception e) {            e.printStackTrace();        }    }    /**     * The instance method returns the single incarnation of DobesParser to the     * caller.     *     * @return DOCUMENT ME!     */    public static DobesParser Instance() {        if (parser == null) {            parser = new DobesParser();        }        return parser;    }    /**     * Returns the names of the Tiers that are present in the Transcription     * file     *     * @param fileName DOCUMENT ME!     *     * @return DOCUMENT ME!     */    public Vector getTierNames(String fileName) {        // make sure that the correct file has been parsed        if (!lastParsed.equals(fileName)) {            parse(fileName);        }        Vector tierNames = new Vector(tiers.keySet());        Collections.sort(tierNames);        return tierNames;    }    /**     * Returns a Vector with the Annotations for this Tier. Each     * AnnotationRecord contains begin time, end time and text values     *     * @param tierName DOCUMENT ME!     * @param fileName DOCUMENT ME!     *     * @return DOCUMENT ME!     */    public Vector getAnnotationsFor(String tierName, String fileName) {        // make sure that the correct file has been parsed        if (!lastParsed.equals(fileName)) {            parse(fileName);        }        long start = System.currentTimeMillis();        Vector annotationVector = new Vector();        // get the tags from the tiers Hashtable        Hashtable annotations = (Hashtable) tiers.get(tierName);        // get an iterator that iterates over the tags in the right order.        Iterator iter = ((Vector) annotations.get(ORDERED_KEYS_KEY)).iterator();        while (iter.hasNext()) {            Vector annotation = (Vector) annotations.get(iter.next());            annotationVector.add(annotation);        }        long duration = System.currentTimeMillis() - start;        //	System.out.println("Extracting Annotations took " + duration + " milli seconds");        return annotationVector;    }    /**     * Parses a DOBES-minimal compliant xml file.     *     * @param fileName the DOBES-minimal compliant xml file that must be     *        parsed.     */    private void parse(String fileName) {        long start = System.currentTimeMillis();        try {            //		System.out.println("Parse : " + fileName);            //		System.out.println("Free memory : " + Runtime.getRuntime().freeMemory());            // only parse the same file once            if (lastParsed.equals(fileName)) {                return;            }            tiers = new Hashtable();            // parse the file            xmlFile = new File(fileName);            lastParsed = fileName;            currentFileName = fileName;            saxParser.parse(xmlFile, this);        } catch (Exception e) {            printErrorLocationInfo("Fatal(?) Error! " + e.getMessage());        }        long duration = System.currentTimeMillis() - start;        //	System.out.println("Parsing took " + duration + " milli seconds");    }    /**     * HandlerBase method     */    public void startDocument() {        parseError = false;    }    /**     * HandlerBase method     */    public void endDocument() {    }    /**     * HandlerBase method     *     * @param name DOCUMENT ME!     * @param attributes DOCUMENT ME!     */    public void startElement(String name, AttributeList attributes) {        content = null;        if (name.equals("HEADER")) {            // implement when dealing with MediaObject        } else if (name.equals("CHUNK")) {            currentSpeakerId = attributes.getValue("SPEAKER");            currentStart = attributes.getValue("START");            currentEnd = attributes.getValue("END");            /*            currentTierId = attributes.getValue("SPEAKER");                           // First check whether this tier already exists                           if (!tiers.containsKey(currentTierId)) {                               // create an entry in the tiers Hashtable that can hold the Tag values for the new Tier                               tiers.put(currentTierId, new Hashtable());                               // put the Vector with the ordered key info in the Hashtable                               ((Hashtable) tiers.get(currentTierId)).put(ORDERED_KEYS_KEY, new Vector());                           }                           // The id attribute contains the Annotation identifier, start time is used as id.                           currentAnnotationId = attributes.getValue("START");               
            // create new "AnnotationRecord" and add to annotations Hashtable for current tier                           ((Hashtable) tiers.get(currentTierId)).put(currentAnnotationId, new Vector());                           ((Vector)((Hashtable) tiers.get(currentTierId)).get(ORDERED_KEYS_KEY)).add(currentAnnotationId);                           // add start and end times to this AnnotationRecord                           ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(attributes.getValue("START"));                           ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(attributes.getValue("END"));             */        } else if (name.equals("RENDERED_TEXT") || name.equals("TRANSLATION")) {            currentTierId = currentSpeakerId;            if (name.equals("RENDERED_TEXT")) {                currentTierId += "-RT";            } else {                currentTierId += "-TR";            }            // First check whether this tier already exists            if (!tiers.containsKey(currentTierId)) {                // create an entry in the tiers Hashtable that can hold the Annotation values for the new Tier                tiers.put(currentTierId, new Hashtable());                // put the Vector with the ordered key info in the Hashtable                ((Hashtable) tiers.get(currentTierId)).put(ORDERED_KEYS_KEY,                    new Vector());            }            // The id attribute contains the Annotation identifier, start time is used as id.            currentAnnotationId = currentStart;            // create new "AnnotationRecord" and add to annotations Hashtable for current tier            ((Hashtable) tiers.get(currentTierId)).put(currentAnnotationId,                new Vector());            ((Vector) ((Hashtable) tiers.get(currentTierId)).get(ORDERED_KEYS_KEY)).add(currentAnnotationId);            // add start and end times to this AnnotationRecord            ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(currentStart);            ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(currentEnd);        }    }    //startElement    /**     * HandlerBase method     *     * @param name DOCUMENT ME!     */    public void endElement(String name) {        if (name.equals("RENDERED_TEXT") || name.equals("TRANSLATION")) {            ((Vector) ((Hashtable) tiers.get(currentTierId)).get(currentAnnotationId)).add(content);        }    }    /**     * HandlerBase method     *     * @param buf DOCUMENT ME!     * @param start DOCUMENT ME!     * @param length DOCUMENT ME!     */    public void characters(char[] buf, int start, int length) {        if (content == null) {            content = removeWhiteSpace(buf, start, length);        } else {            content += removeWhiteSpace(buf, start, length);        }    }    /**     * HandlerBase method     *     * @param publicId DOCUMENT ME!     * @param systemId DOCUMENT ME!     *     * @return DOCUMENT ME!     */    public InputSource resolveEntity(String publicId, String systemId) {        InputSource inputSource = null;        try {            // Open an InputSource to a DOBES-minimal DTD            // The location of the dtd defs is under the corpus directory in the path dobes/dtd.            if (systemId.endsWith(".dtd")) {                int to = systemId.indexOf(".dtd") + 4;                int from = systemId.lastIndexOf('/', to) + 1;                String fileName = ServerConfiguration.CORPUS_DIRECTORY +                    File.separator + "dobes" + File.separator + "dtd" +                    File.separator + systemId.substring(from, to);                //	inputSource = new InputSource(new FileInputStream(fileName));                //	inputSource = new InputSource(StringUtil.openEncodedFile("UTF-8", fileName));                inputSource = new InputSource(new InputStreamReader(                            new FileInputStream(fileName), "UTF8"));            }        } catch (Exception e) {            e.printStackTrace();        }        return inputSource;    }    /**     * HandlerBase method     *     * @param e DOCUMENT ME!     */    public void error(SAXParseException e) {        printErrorLocationInfo("Parse error " + e.getMessage());        parseError = true;    }    /**     * DOCUMENT ME!     *     * @param e DOCUMENT ME!     */    public void fatalError(SAXParseException e) {        printErrorLocationInfo("Fatal Parse Error " + e.getMessage());        parseError = true;    }    private String removeWhiteSpace(char[] buf, int start, int length) {        int from = start;        int to = start + length;        /*           for (int i = start; i  < start + length; i++) {               if (buf[i] == ' ' || buf[i] == '\t') {                   from++;               }               else {                   to = from;                   for (int j = from; j < start + length; j++) {                       if  (buf[j] != ' ' && buf[i] != '\t') {                           to++;                       }                       else {                           break;                       }                   }                   break;               }           }           
*/        return new String(buf, from, to - from);    }    private void println(String s) {        if (verbose) {            System.out.println(s);        }    }    private void printErrorLocationInfo(String message) {        System.out.println(message);        System.out.println("Exception for " + currentFileName);        System.out.println("Tier id " + currentTierId);        System.out.println("Annotation id " + currentAnnotationId);    }}
💿 文件大小 23621 K
👤 上传用户 ccuading
📂 所属分类 Java编程
🏷️ 相关标签

#编辑 #视频
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -