📄 eaf24parser.java
字号:
/* * File: EAF24Parser.java * Project: MPI Linguistic Application * Date: 02 May 2007 * * Copyright (C) 2001-2007 Max Planck Institute for Psycholinguistics * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package mpi.eudico.server.corpora.clomimpl.dobes;import mpi.eudico.server.corpora.clom.TimeSlot;import mpi.eudico.server.corpora.clomimpl.abstr.LinkedFileDescriptor;import mpi.eudico.server.corpora.clomimpl.abstr.MediaDescriptor;import mpi.eudico.server.corpora.clomimpl.abstr.Parser;import mpi.eudico.server.corpora.clomimpl.abstr.PropertyImpl;import org.xml.sax.Attributes;import org.xml.sax.ContentHandler;import org.xml.sax.ErrorHandler;import org.xml.sax.InputSource;import org.xml.sax.Locator;import org.xml.sax.SAXException;import org.xml.sax.SAXParseException;import org.xml.sax.XMLReader;import org.xml.sax.helpers.XMLReaderFactory;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.Locale;/** * A (SAX2) Parser for Elan Annotation Format (EAF) compliant XML files. 
*
 * @author Hennie Brugman
 * @author Han Sloetjes
 * @version 1-Dec-2003
 * @version jun 2004 addition of ControlledVocabularies
 * @version sep 2005 the constructor is now public giving up the singleton pattern
 * the path parameter of all getter methods can be removed in the next parser version
 * (replace by a public parse(String path) method)
 * @version Feb 2006 support for LinkedFileDescriptors and for stereotype
 * Included In is added. For compatibility reasons the filename parameter to the getters is maintained.
 * @version Dec 2006 element PROPERTY has been added to the HEADER element, attribute
 * ANNOTATOR has been added to element TIER
 */
public class EAF24Parser extends Parser {
    /**
     * Verbosity flag, initially off.
     * NOTE(review): not read anywhere in the visible part of this file.
     */
    private boolean verbose = false;

    /** The SAX reader that is created and configured in the constructor and used for parsing. */
    private XMLReader reader;

    /** stores tiername - tierrecord pairs */
    private final HashMap tierMap = new HashMap();

    /** a map with tiername - ArrayList with Annotation Records pairs */
    private final HashMap tiers = new HashMap();

    /** The tier names, kept in a list so that their order is preserved; returned by getTierNames. */
    private final ArrayList tierNames = new ArrayList();

    /** The linguistic types collected from the file; returned by getLinguisticTypes. */
    private final ArrayList linguisticTypes = new ArrayList();

    /** The Locale objects collected from the file; searched by getDefaultLanguageOf. */
    private final ArrayList locales = new ArrayList();

    /**
     * The time slots collected from the file; returned by getTimeSlots.
     * NOTE(review): presumably keyed by time slot id (see timeOrder below) - confirm
     * against the content handler.
     */
    private final HashMap timeSlots = new HashMap();

    /** stores the ControlledVocabulary objects by their ID */
    private final HashMap controlledVocabularies = new HashMap();

    /** The document properties (PropertyImpl objects); returned by getTranscriptionProperties. */
    private final ArrayList docProperties = new ArrayList();

    /** stores the time slots ordered by id */
    private final ArrayList timeOrder = new ArrayList(); // since a HashMap is not ordered, all time_slot_ids have to be stored in order separately.
private String mediaFile; private ArrayList mediaDescriptors = new ArrayList(); private ArrayList linkedFileDescriptors = new ArrayList(); private String svgFile; private String author; private String currentTierId; private String currentAnnotationId; private AnnotationRecord currentAnnRecord; private String currentCVId; private CVEntryRecord currentEntryRecord; private String content = ""; private String lastParsed = ""; private String currentFileName; private PropertyImpl currentProperty; private boolean parseError; /** * Constructor, creates a new XMLReader * */ public EAF24Parser() { try { reader = XMLReaderFactory.createXMLReader( "org.apache.xerces.parsers.SAXParser"); reader.setFeature("http://xml.org/sax/features/namespaces", true); reader.setFeature("http://xml.org/sax/features/validation", true); reader.setFeature("http://apache.org/xml/features/validation/schema", true); reader.setFeature("http://apache.org/xml/features/validation/dynamic", true); reader.setProperty("http://java.sun.com/xml/jaxp/properties/schemaSource", this.getClass().getResource("/mpi/eudico/resources/EAFv2.4.xsd") .openStream()); //reader.setProperty("http://apache.org/xml/properties/schema/external-noNamespaceSchemaLocation", // "http://www.mpi.nl/tools/elan/EAFv2.4.xsd"); reader.setContentHandler(new EAFContentHandler()); //reader.setErrorHandler(new EAFErrorHandler()); } catch (SAXException se) { se.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } } /** * For backward compatibility; not used anymore * * @param fileName the eaf filename, parameter also for historic reasons * * @return media file name */ public String getMediaFile(String fileName) { parse(fileName); return mediaFile; } /** * Returns the media descriptors * * @param fileName the eaf filename, parameter also for historic reasons * * @return the media descriptors */ public ArrayList getMediaDescriptors(String fileName) { parse(fileName); return mediaDescriptors; } /** * Returns the linked file 
descriptors * * @param fileName the eaf file name, for historic reasons * * @return a list of linked file descriptors */ public ArrayList getLinkedFileDescriptors(String fileName) { parse(fileName); return linkedFileDescriptors; } /** * DOCUMENT ME! * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public String getSVGFile(String fileName) { parse(fileName); return svgFile; } /** * DOCUMENT ME! * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public String getAuthor(String fileName) { parse(fileName); return author; } /** * Returns a list of PropertyImpl objects that have been retrieved from the eaf. * * @see mpi.eudico.server.corpora.clomimpl.abstr.Parser#getTranscriptionProperties(java.lang.String) */ public ArrayList getTranscriptionProperties(String fileName) { parse(fileName); return docProperties; } /** * DOCUMENT ME! * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public ArrayList getLinguisticTypes(String fileName) { parse(fileName); return linguisticTypes; } /** * DOCUMENT ME! * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public ArrayList getTimeOrder(String fileName) { parse(fileName); return timeOrder; } /** * DOCUMENT ME! * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! */ public HashMap getTimeSlots(String fileName) { parse(fileName); return timeSlots; } /** * Returns a Hastable of ArrayLists with the cv id's as keys.<br> * Each ArrayList can contain one String, the description and an * unknown number of CVEntryRecords. * * @param fileName the eaf filename * * @return a Hastable of ArrayLists with the cv id's as keys */ public HashMap getControlledVocabularies(String fileName) { parse(fileName); return controlledVocabularies; } /** * Returns the names of the Tiers that are present in the Transcription * file * * @param fileName DOCUMENT ME! * * @return DOCUMENT ME! 
*/
    public ArrayList getTierNames(String fileName) {
        parse(fileName);

        return tierNames;
    }

    /**
     * Looks up the participant attribute of a tier.
     * The tier record is not used in TranscriptionStore yet.
     *
     * @param tierName name of the tier
     * @param fileName the eaf
     *
     * @return the participant, or an empty string when the tier is unknown
     *         or has no participant
     */
    public String getParticipantOf(String tierName, String fileName) {
        parse(fileName);

        final TierRecord record = (TierRecord) tierMap.get(tierName);

        if (record == null) {
            return "";
        }

        final String participant = record.getParticipant();

        return (participant != null) ? participant : "";
    }

    /**
     * Looks up the annotator attribute of a tier.
     * The tier record is not used in TranscriptionStore yet.
     *
     * @param tierName name of the tier
     * @param fileName the eaf
     *
     * @return the annotator of the tier, or an empty string when the tier is
     *         unknown or has no annotator
     */
    public String getAnnotatorOf(String tierName, String fileName) {
        parse(fileName);

        final TierRecord record = (TierRecord) tierMap.get(tierName);

        if (record == null) {
            return "";
        }

        final String annotator = record.getAnnotator();

        return (annotator != null) ? annotator : "";
    }

    /**
     * Looks up the name of the linguistic type of a tier.
     * The tier record is not used in TranscriptionStore yet.
     *
     * @param tierName the name of the tier
     * @param fileName the eaf
     *
     * @return the name of the type, or an empty string when the tier is
     *         unknown or has no linguistic type
     */
    public String getLinguisticTypeIDOf(String tierName, String fileName) {
        parse(fileName);

        final TierRecord record = (TierRecord) tierMap.get(tierName);

        if (record == null) {
            return "";
        }

        final String typeName = record.getLinguisticType();

        return (typeName != null) ? typeName : "";
    }

    /**
     * Returns the Locale object for a tier.
* * @param tierName the name of the tier * @param fileName the eaf * * @return the default Locale object */ public Locale getDefaultLanguageOf(String tierName, String fileName) { parse(fileName); Locale resultLoc = null; String localeId = null; if (tierMap.get(tierName) != null) { localeId = ((TierRecord) tierMap.get(tierName)).getDefaultLocale(); } Iterator locIter = locales.iterator(); while (locIter.hasNext()) { Locale l = (Locale) locIter.next(); if (l.getLanguage().equals(localeId)) { resultLoc = l; } } return resultLoc; } /** * Returns the name of the parent tier, if any. * * @param tierName the name of the tier * @param fileName the eaf * * @return the name of the parent tier, or null */ public String getParentNameOf(String tierName, String fileName) { parse(fileName); if (tierMap.get(tierName) != null) { return ((TierRecord) tierMap.get(tierName)).getParentTier(); } return null; } /** * Returns a ArrayList with the Annotations for this Tier. Each * AnnotationRecord contains begin time, end time and text values * * @param tierName the name of the tier * @param fileName the eaf * * @return ArrayList of AnnotationRecord objects for the tier */ public ArrayList getAnnotationsOf(String tierName, String fileName) { // make sure that the correct file has been parsed parse(fileName); return (ArrayList) tiers.get(tierName); } /** * Parses a EAF v2.4 (or <) xml file. * * @param fileName the EAF v2.4 xml file that must be parsed. 
*/ private void parse(String fileName) { //long start = System.currentTimeMillis(); // System.out.println("Parse : " + fileName); // System.out.println("Free memory : " + Runtime.getRuntime().freeMemory()); // only parse the same file once if (lastParsed.equals(fileName)) { return; } // (re)set everything to null for each parse tiers.clear(); tierNames.clear(); // HB, 2-1-02, to store name IN ORDER //tierAttributes.clear(); mediaFile = ""; linguisticTypes.clear(); locales.clear(); timeSlots.clear(); timeOrder.clear(); mediaDescriptors.clear(); linkedFileDescriptors.clear(); controlledVocabularies.clear(); // parse the file lastParsed = fileName; currentFileName = fileName; try { reader.parse(fileName); } catch (SAXException e) { System.out.println("Parsing error: " + e.getMessage()); // the SAX parser can have difficulties with certain characters in // the filepath: try to create an InputSource for the parser // HS Mar 2007: depending on Xerces version a SAXException or an IOException
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -