📄 geniasentenceparser.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.corpus.parsers;import com.aliasi.chunk.Chunk;import com.aliasi.chunk.ChunkFactory;import com.aliasi.chunk.ChunkingImpl;import com.aliasi.corpus.ChunkHandler;import com.aliasi.corpus.Handler;import com.aliasi.corpus.XMLParser;import com.aliasi.sentences.SentenceChunker;import com.aliasi.xml.DelegatingHandler;import com.aliasi.xml.DelegateHandler;import com.aliasi.xml.TextAccumulatorHandler;import java.util.ArrayList;import java.util.List;import org.xml.sax.SAXException;import org.xml.sax.helpers.DefaultHandler;/** * A <code>GeniaSentenceParser</code> provides a chunk parser for the * XML version of the GENIA corpus. The type assigned to sentence * chunks is the constant {@link SentenceChunker#SENTENCE_CHUNK_TYPE}. * It only returns the sentences from citation abstracts, not * sentences in citation titles. * * <P>The following example is drawn from the initial part of the merged * 3.02 version of the GENIA corpus (with some content ellided and replaced * by ellipses (<code>...</code>, but all spaces/linebreaks left as is): * * <blockquote><table border='1' cellpadding='5'><tr><td><pre><set><article><articleinfo><bibliomisc>MEDLINE:95369245</bibliomisc></articleinfo><title><sentence>...</sentence></title><abstract><sentence><w c="NN">Activation</w> <w c="IN">of</w> <w c="DT">the</w> <cons lex="CD28_surface_receptor" sem="G#protein_family_or_group"><cons lex="CD28" sem="G#protein_molecule"><w c="NN">CD28</w></cons> <w c="NN">surface</w> <w c="NN">receptor</w></cons> <w c="VBZ">provides</w> <w c="DT">a</w> <w c="JJ">major</w> <w c="JJ">costimulatory</w> <w c="NN">signal</w> <w c="IN">for</w> <cons lex="T_cell_activation" sem="G#other_name"><w c="NN">T</w> <w c="NN">cell</w> <w c="NN">activation</w></cons> <w c="VBG">resulting</w> <w c="IN">in</w> <w c="VBN">enhanced</w> <w c="NN">production</w> <w c="IN">of</w> <cons lex="interleukin-2" sem="G#protein_molecule"><w c="NN">interleukin-2</w></cons> <w c="(">(</w><cons lex="IL-2" sem="G#protein_molecule"><w c="NN">IL-2</w></cons><w c=")">)</w> <w c="CC">and</w> <cons lex="cell_proliferation" sem="G#other_name"><w c="NN">cell</w> <w c="NN">proliferation</w></cons><w c=".">.</w></sentence><sentence>...</sentence>... * </pre></td></tr></table></blockquote> * * All that is required is to pull all of the text content (including * informative spaces) from the sentence elements. * * <P>The GENIA corpus is available free of charge from: * * <UL> * * <LI><a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/" * >GENIA Project Home Page</a> * </UL> * * @author Bob Carpenter * @version 2.1.1 * @since LingPipe2.1.1 */public class GeniaSentenceParser extends XMLParser { /** * Construct a GENIA sentence chunk parser with no designated chunk * handler. Chunk handlers may be later set using the method * {@link #setHandler(Handler)}. * * @throws SAXException If there is an error configuring the * SAX XML reader required for parsing. */ public GeniaSentenceParser() throws SAXException { super(); } /** * Construct a GENIA sentence chunk parser with the specified * chunk handler. * * @param handler The chunk handler used to process sentences * found by this parser. * @throws SAXException If there is an error configuring the * SAX XML reader required for parsing. */ public GeniaSentenceParser(ChunkHandler handler) throws SAXException { super(handler); } /** * Returns the embedded XML handler. This method implements * the required method for the abstract superclass {@link XMLParser}. * * @return The XML handler for this class. */ protected DefaultHandler getXMLHandler() { return new SetHandler(getChunkHandler()); } /** * Sets the handler to the specified chunk handler. If the handler * is not a chunk handler, an illegal argument exception will be * raised. * * @param handler New chunk handler. * @throws IllegalArgumentException If the handler is not a chunk * handler. */ public void setHandler(Handler handler) { if (!(handler instanceof ChunkHandler)) { String msg = "Handler must be a chunk handler." + " Found handler with class=" + handler.getClass(); throw new IllegalArgumentException(msg); } super.setHandler(handler); } /** * Returns the chunk handler for this sentence parser. The result * will be the same as calling the superclass method {@link * #getHandler()}, but the result in this case is cast to type * <code>ChunkHandler</code>. * * @return The chunk handler for this sentence parser. */ public ChunkHandler getChunkHandler() { return (ChunkHandler) getHandler(); } /** * The tag used for sentence elements in GENIA, namely * <code>sentence</code>. */ public static final String GENIA_SENTENCE_ELT = "sentence"; /** * The tag used for abstract elements in GENIA, namely * <code>abstract</code>. */ public static final String GENIA_ABSTRACT_ELT = "abstract"; private static class SetHandler extends DelegatingHandler { final ChunkHandler mChunkHandler; final AbstractHandler mAbstractHandler; SetHandler(ChunkHandler chunkHandler) { mChunkHandler = chunkHandler; mAbstractHandler = new AbstractHandler(this); setDelegate(GENIA_ABSTRACT_ELT,mAbstractHandler); } public void finishDelegate(String qName, DefaultHandler delegate) { if (qName.equals(GENIA_ABSTRACT_ELT)) { handleSentenceTexts(mAbstractHandler.getSentenceTexts()); } } void handleSentenceTexts(List texts) { StringBuffer sb = new StringBuffer(); int numChunks = texts.size(); int[] lengths = new int[numChunks]; for (int i = 0; i< numChunks; i++) { if (i > 0) sb.append(" "); String text = (String)texts.get(i); sb.append(text); lengths[i] = text.length(); } char[] cs = sb.toString().toCharArray(); int offset = 0; ChunkingImpl chunking = new ChunkingImpl(cs,0,cs.length); for (int i = 0; i< numChunks; i++) { Chunk chunk = ChunkFactory .createChunk(offset,offset+lengths[i], SentenceChunker.SENTENCE_CHUNK_TYPE); chunking.add(chunk); offset += lengths[i]+1; } mChunkHandler.handle(chunking); } } private static class AbstractHandler extends DelegateHandler { final ArrayList mSentTexts = new ArrayList(); final TextAccumulatorHandler mSentenceHandler = new TextAccumulatorHandler(); public AbstractHandler(DelegatingHandler parent) { super(parent); setDelegate(GENIA_SENTENCE_ELT, mSentenceHandler); } public void startDocument() { mSentTexts.clear(); } public void finishDelegate(String qName, DefaultHandler delegate) { if (qName.equals(GENIA_SENTENCE_ELT)) { String text = mSentenceHandler.getText().trim(); if (text.length() > 0) mSentTexts.add(text); } } List getSentenceTexts() { return mSentTexts; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -