📄 sentencechunker.java
字号:
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.sentences;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingImpl;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.Strings;

import java.util.ArrayList;

/**
 * The <code>SentenceChunker</code> class uses a
 * <code>SentenceModel</code> to implement sentence detection through
 * the <code>chunk.Chunker</code> interface.  A sentence chunker is
 * constructed from a tokenizer factory and a sentence model.  The
 * tokenizer factory creates tokens that it sends to the sentence
 * model.  The types of the chunks produced are given by the
 * constant {@link #SENTENCE_CHUNK_TYPE}.
 *
 * <P>The tokens and whitespaces returned by the tokenizer are
 * concatenated to form the underlying text slice of the chunks
 * returned by the chunker.  Thus a tokenizer like the stop list
 * tokenizer or Porter stemmer tokenizer will create a character slice
 * that does not match the input.  A whitespace-normalizing tokenizer
 * filter can be used, for example, to produce normalized text for the
 * basis of the chunks.
 *
 * @author  Mitzi Morris
 * @version 2.1
 * @since   LingPipe2.1
 */
public class SentenceChunker implements Chunker {

    private final TokenizerFactory mTokenizerFactory;
    private final SentenceModel mSentenceModel;

    /**
     * Construct a sentence chunker from the specified tokenizer
     * factory and sentence model.
     *
     * @param tf Tokenizer factory for chunker.
     * @param sm Sentence model for chunker.
     */
    public SentenceChunker(TokenizerFactory tf, SentenceModel sm) {
        mTokenizerFactory = tf;
        mSentenceModel = sm;
    }

    /**
     * Return the chunking derived from the underlying sentence model
     * over the tokenization of the specified character sequence.
     * The sequence is copied to a character array and the work is
     * delegated to {@link #chunk(char[],int,int)} over the whole
     * array.
     *
     * <P><i>Warning:</i> As described in the class documentation
     * above, a tokenizer factory that produces tokenizers that do not
     * reproduce the original sequence may cause the underlying
     * character slice for the chunks to differ from the slice
     * provided as an argument.
     *
     * @param cSeq Character sequence underlying the slice.
     * @return The sentence chunking of the specified character
     * sequence.
     */
    public Chunking chunk(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return chunk(cs, 0, cs.length);
    }

    /**
     * Return the chunking derived from the underlying sentence model
     * over the tokenization of the specified character slice.  See
     * {@link #chunk(CharSequence)} for more information.
     *
     * <P>Chunk offsets are computed by summing the lengths of the
     * whitespace and token strings handed back by the tokenizer, so
     * they count characters from the beginning of the tokenized
     * text.  One chunk of type {@link #SENTENCE_CHUNK_TYPE} is added
     * for each index returned by the sentence model, in the order
     * the model returns them.  If the tokenizer produces no tokens,
     * the sentence model is not consulted and an empty chunking is
     * returned.
     *
     * @param cs Underlying character sequence.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in the slice.
     * @return The sentence chunking of the specified character slice.
     */
    public Chunking chunk(char[] cs, int start, int end) {
        ArrayList<String> tokenList = new ArrayList<String>();
        ArrayList<String> whiteList = new ArrayList<String>();
        Tokenizer tokenizer
            = mTokenizerFactory.tokenizer(cs, start, end - start);
        tokenizer.tokenize(tokenList, whiteList);

        ChunkingImpl chunking = new ChunkingImpl(cs, start, end);
        if (tokenList.isEmpty()) {
            return chunking;
        }

        String[] tokens = tokenList.toArray(new String[tokenList.size()]);
        String[] whites = whiteList.toArray(new String[whiteList.size()]);

        // whites[i] is read as the whitespace before tokens[i] and
        // whites[i+1] as the whitespace after it; this assumes the
        // tokenizer returned one more whitespace than tokens.
        int firstTokenStart = whites[0].length();

        // tokenEnds[i] is the offset one past the last character of
        // tokens[i] in the concatenated whitespace/token text.
        int[] tokenEnds = new int[tokens.length];
        int pos = firstTokenStart;
        for (int i = 0; i < tokens.length; ++i) {
            pos += tokens[i].length();
            tokenEnds[i] = pos;
            pos += whites[i + 1].length();
        }

        // Each entry is used below as the index of the token that
        // ends a sentence.
        int[] sentenceBoundaries
            = mSentenceModel.boundaryIndices(tokens, whites);

        int sentenceStart = firstTokenStart;
        for (int i = 0; i < sentenceBoundaries.length; ++i) {
            int endTokIdx = sentenceBoundaries[i];
            int sentenceEnd = tokenEnds[endTokIdx];
            Chunk chunk
                = ChunkFactory.createChunk(sentenceStart, sentenceEnd,
                                           SENTENCE_CHUNK_TYPE);
            chunking.add(chunk);
            // The next sentence begins after the whitespace that
            // follows the boundary token.
            sentenceStart = sentenceEnd + whites[endTokIdx + 1].length();
        }
        return chunking;
    }

    /**
     * The type assigned to sentence chunks, namely
     * <code>"S"</code>.
     */
    public static final String SENTENCE_CHUNK_TYPE = "S";

}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -