📄 muc6chunkparser.java
字号:
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.corpus.parsers;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.ChunkingImpl;

import com.aliasi.corpus.ChunkHandler;
import com.aliasi.corpus.XMLParser;

import com.aliasi.xml.DelegatingHandler;

import java.util.Iterator;
import java.util.ArrayList;

import org.xml.sax.Attributes;

import org.xml.sax.helpers.DefaultHandler;

/**
 * A <code>Muc6ChunkParser</code> parses MUC6-formatted named-entity
 * corpora in XML.
 *
 * <h3>SGML to XML Munging</h3>
 *
 * <p>Because the MUC corpora are formatted using SGML, we employed a
 * program to munge the actual data by replacing unknown entity
 * references with simple equivalents, as follows:
 *
 * <ul>
 * <li> <code>&amp;MD;</code> is replaced with a pair of dashes
 * (<code>--</code>)</li>
 * <li> <code>&amp;LR;</code>, <code>&amp;UR;</code>, <code>&amp;QR;</code>
 * and <code>&amp;QC;</code> are removed</li>
 * <li><code>&amp;AMP;</code> is replaced with <code>&amp;amp;</code>.</li>
 * </ul>
 *
 * We also added a DTD declaration with the UTF-8 character format
 * (the original data is all in the ASCII range, 0-127).  Finally,
 * we removed <code>STORYID</code> and <code>SLUG</code> elements
 * and all of their content.
 *
 * <h3>Corpus Format Requirements</h3>
 *
 * <p>The data files must be well-formed XML, as an XML parser is used
 * to parse them.  Training is restricted to the sentence
 * (<code>s</code>) elements, the entities in which are wrapped in
 * <code>ENAMEX</code> elements.  The only requirement for this format
 * is that it is organized by sentence with named entities marked with
 * the <code>ENAMEX</code> element, as in:
 *
 * <blockquote><pre>...
 * &lt;s&gt; After 20 years of pushing labor proposals to
 * overhaul the nation's health-care system, &lt;ENAMEX
 * TYPE="PERSON"&gt;Bert Seidman&lt;/ENAMEX&gt; of &lt;ENAMEX
 * TYPE="ORGANIZATION"&gt;the AFL-CIO&lt;/ENAMEX&gt; is finding interest from
 * an unlikely quarter: big business. &lt;/s&gt;
 * ...</pre></blockquote>
 *
 * <p>Any other containing elements, such as the paragraph
 * (<code>p</code>) elements in the MUC6 data, will be ignored.  There
 * should be no additional element markup within the <code>s</code>
 * elements other than the <code>ENAMEX</code> elements.  These
 * <code>ENAMEX</code> elements must have an attribute
 * <code>TYPE</code> whose value is the entity type of the element.
 * For most of the chunkers, extra whitespace does not matter; the
 * extra whitespace above is courtesy of the original corpus.
 *
 * @author  Bob Carpenter
 * @version 3.1.2
 * @since   LingPipe2.2
 */
public class Muc6ChunkParser extends XMLParser<ChunkHandler> {

    /** Qualified name of the element treated as a sentence. */
    String mSentenceTag = "s";  // default for MUC6

    /**
     * Construct a MUC6 chunk parser with no handler specified.
     */
    public Muc6ChunkParser() {
        super();
    }

    /**
     * Construct a MUC6 chunk parser with the specified chunk handler.
     *
     * @param handler Chunk handler for the parser.
     */
    public Muc6ChunkParser(ChunkHandler handler) {
        super(handler);
    }

    /**
     * Returns a freshly constructed SAX handler that forwards one
     * chunking per sentence element to this parser's current chunk
     * handler.  The sentence tag is read when the handler is
     * constructed.
     *
     * @return New SAX handler bound to the current chunk handler and
     * sentence tag.
     */
    protected DefaultHandler getXMLHandler() {
        return new MucHandler(getHandler());
    }

    /**
     * Sets the value of the sentence tag to be the specified value.
     * Only elements within sentences will be picked up by the parser.
     *
     * <p>The tag is read when an XML handler is created by {@link
     * #getXMLHandler()}, so a change made here does not affect a
     * handler that has already been created.
     *
     * @param tag Tag marking sentence elements.
     */
    public void setSentenceTag(String tag) {
        mSentenceTag = tag;
    }

    /**
     * Top-level SAX handler.  It registers a single {@link
     * SentenceHandler} as the delegate for the enclosing parser's
     * sentence tag and passes the chunking accumulated by that
     * delegate to the chunk handler whenever the delegate finishes.
     */
    class MucHandler extends DelegatingHandler {

        /** Receiver of one chunking per completed sentence. */
        ChunkHandler mChunkHandler;

        /** Delegate that accumulates the text and chunks of a sentence. */
        SentenceHandler mSentHandler;

        /**
         * Construct a handler reporting to the specified chunk handler.
         *
         * @param chunkHandler Handler to receive sentence chunkings.
         */
        MucHandler(ChunkHandler chunkHandler) {
            mChunkHandler = chunkHandler;
            mSentHandler = new SentenceHandler();
            // The sentence tag is captured here, once per handler.
            setDelegate(mSentenceTag,mSentHandler);
        }

        /**
         * Builds the chunking held by the sentence handler and hands
         * it to the chunk handler.  Neither argument is inspected
         * because only one delegate is ever registered.
         *
         * @param qName Qualified name of the finished element (unused).
         * @param handler Delegate that finished (unused).
         */
        public void finishDelegate(String qName, DefaultHandler handler) {
            Chunking chunking = mSentHandler.getChunking();
            mChunkHandler.handle(chunking);
        }
    }

    /**
     * SAX handler for the content of one sentence element.  Character
     * data is appended to a buffer, and each <code>ENAMEX</code>
     * element is recorded as a chunk whose offsets are positions in
     * that buffer.
     */
    static class SentenceHandler extends DefaultHandler {

        /** Text accumulated since the last call to {@link #startDocument()}. */
        StringBuffer mBuf;

        /** Value of the <code>TYPE</code> attribute of the open entity. */
        String mType;

        /** Buffer length when the open entity started. */
        int mStart;

        /** Buffer length when the most recent entity ended. */
        int mEnd;

        /** Chunks recorded since the last call to {@link #startDocument()}. */
        final ArrayList<Chunk> mChunkList = new ArrayList<Chunk>();

        /**
         * Construct a sentence handler with no accumulated state.
         */
        SentenceHandler() {
            /* do nothing */
        }

        /**
         * Resets the text buffer and discards any recorded chunks.
         * Presumably invoked by the delegating handler each time a
         * sentence element begins -- confirm against
         * <code>DelegatingHandler</code>.
         */
        @Override
        public void startDocument() {
            mBuf = new StringBuffer();
            mChunkList.clear();
        }

        /**
         * Records the type and start offset of an <code>ENAMEX</code>
         * element; every other element is ignored.
         *
         * @param uri Namespace URI (unused).
         * @param localName Local name (unused).
         * @param qName Qualified name of the element.
         * @param attributes Attributes of the element.
         */
        @Override
        public void startElement(String uri, String localName,
                                 String qName, Attributes attributes) {
            if (!"ENAMEX".equals(qName)) return;
            mType = attributes.getValue("TYPE");
            mStart = mBuf.length();
        }

        /**
         * Closes the open <code>ENAMEX</code> element by recording a
         * chunk running from its start offset to the current end of
         * the buffer; every other element is ignored.
         *
         * @param uri Namespace URI (unused).
         * @param localName Local name (unused).
         * @param qName Qualified name of the element.
         */
        @Override
        public void endElement(String uri, String localName, String qName) {
            if (!"ENAMEX".equals(qName)) return;
            mEnd = mBuf.length();
            // Final argument is a constant 0 for every chunk
            // (presumably the chunk score -- confirm in ChunkFactory).
            Chunk chunk = ChunkFactory.createChunk(mStart,mEnd,mType,0);
            mChunkList.add(chunk);
        }

        /**
         * Appends the specified slice of characters to the text buffer.
         *
         * @param cs Underlying characters.
         * @param start Index of first character in the slice.
         * @param length Number of characters in the slice.
         */
        @Override
        public void characters(char[] cs, int start, int length) {
            mBuf.append(cs,start,length);
        }

        /**
         * Returns a new chunking over the accumulated text containing
         * every chunk recorded so far, in the order they were closed.
         *
         * @return Chunking for the text and entities seen since the
         * last reset.
         */
        public Chunking getChunking() {
            ChunkingImpl chunking = new ChunkingImpl(mBuf);
            for (Chunk chunk : mChunkList)
                chunking.add(chunk);
            return chunking;
        }
    }

}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -