📄 medlinesentencemodel.java
字号:
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.sentences;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * A <code>MedlineSentenceModel</code> is a heuristic sentence model
 * designed for operating over biomedical research abstracts as found
 * in MEDLINE.
 *
 * <P>The MEDLINE model assumes that parentheses are balanced as
 * defined in the class documentation for {@link
 * HeuristicSentenceModel}.  It also assumes the final token is a
 * sentence boundary, overriding any other possible checks.  This is
 * set because there are many truncated MEDLINE abstracts, and this
 * ensures that every token falls within a sentence in the result.
 *
 * <P>The sets required by the superclass constructor {@link
 * HeuristicSentenceModel#HeuristicSentenceModel(Set,Set,Set,boolean,boolean)}
 * determine which tokens are possible sentence stops, which are
 * disallowed before stops, and which are disallowed as starts.  The
 * three sets passed by this class are:
 *
 * <blockquote>
 * <table border='0' cellpadding='10'>
 * <tr>
 * <td width='33%' valign='top'>
 *   <table border="1" cellpadding="3" width='100%'>
 *   <tr><td><b>Possible Stops</b></td></tr>
 *   <tr><td><code><b>.</b></code></td></tr>
 *   <tr><td><code><b>..</b></code></td></tr>
 *   <tr><td><code><b>!</b></code></td></tr>
 *   <tr><td><code><b>?</b></code></td></tr>
 *   </table>
 * </td>
 *
 * <td width='33%' valign='top'>
 *   <table border="1" cellpadding="3" width='100%'>
 *   <tr><td><b>Impossible Penultimates</b></td></tr>
 *   <tr><td><i>common abbreviations</i>
 *           (<code>Bros No al vs etc Fig</code>)</td></tr>
 *   <tr><td><i>professional honorifics</i>
 *           (<code>Dr Prof PhD MD</code>)</td></tr>
 *   <tr><td><i>corporate designators</i>
 *           (<code>Co Corp Inc</code>)</td></tr>
 *   <tr><td><i>month abbreviations, except</i> <code>Jun</code></td></tr>
 *   <tr><td><i>location suffixes</i> (<code>St</code>)</td></tr>
 *   <tr><td><i>times</i> (<code>AM PM</code>)</td></tr>
 *   </table>
 * </td>
 *
 * <td width='33%' valign='top'>
 *   <table border="1" cellpadding="3" width='100%'>
 *   <tr><td><b>Impossible Sentence Starts</b></td></tr>
 *   <tr><td><code><b>.</b></code></td></tr>
 *   <tr><td><code><b>!</b></code></td></tr>
 *   <tr><td><code><b>?</b></code></td></tr>
 *   <tr><td><i>close parens, brackets, braces</i></td></tr>
 *   <tr><td><code><b>&gt;</b></code></td></tr>
 *   <tr><td><code><b>&lt;</b></code></td></tr>
 *   <tr><td><code><b>;</b></code></td></tr>
 *   <tr><td><code><b>:</b></code></td></tr>
 *   <tr><td><code><b>-</b></code></td></tr>
 *   <tr><td><code><b>--</b></code></td></tr>
 *   <tr><td><code><b>---</b></code></td></tr>
 *   <tr><td><code><b>%</b></code></td></tr>
 *   </table>
 * </td>
 * </tr>
 * </table>
 * </blockquote>
 *
 * <P>This class overrides the default implementation of the possible
 * start token method to allow a sentence start to be any sequence of
 * tokens uninterrupted by spaces that contains an uppercase letter,
 * a digit, or one of a small set of whitelisted lowercase tokens.
 * This behavior is described with examples in its implementing
 * method's documentation: {@link
 * #possibleStart(String[],String[],int,int)}.
 *
 * @author  Mitzi Morris
 * @version 2.1
 * @since   LingPipe2.1
 */
public class MedlineSentenceModel extends HeuristicSentenceModel {

    /**
     * Construct a MEDLINE sentence model.
     */
    public MedlineSentenceModel() {
        // Flags: +force final stop, +balance parens
        super(POSSIBLE_STOPS,
              IMPOSSIBLE_PENULTIMATES,
              IMPOSSIBLE_SENTENCE_STARTS,
              true,
              true);
    }

    /**
     * Return <code>true</code> if the specified start index can
     * be a sentence start in the specified array of tokens and
     * whitespaces running up to the end token.
     *
     * <P>For MEDLINE, this implementation returns <code>true</code>
     * if the sequence of contiguous tokens starting with the
     * specified token contains an uppercase or digit character, or
     * contains one of the whitelisted lowercase tokens
     * (<code>alpha</code>, <code>beta</code>, <code>gamma</code>,
     * <code>delta</code>, <code>c</code>, and the lowercase roman
     * numerals <code>i</code> through <code>x</code>).  Each
     * token is considered, beginning with the specified start token
     * and continuing through all tokens that are not separated by
     * non-empty whitespace, up to the token with the end index minus
     * one.  If any of the tokens qualifies, then the result is
     * <code>true</code>.  Otherwise, the result is <code>false</code>.
     *
     * <P>For example, if the first token is <code>"Therefore"</code>,
     * then it can be a sentence start because it contains the
     * uppercase letter <code>"T"</code>.  Similarly, the token
     * <code>"pH"</code> can be a sentence start, as can
     * <code>"p53"</code>, because they contain the characters
     * <code>"H"</code> and <code>"5"</code> respectively.  If the
     * underlying sequence is <code>" correlation. p-53 was..."</code>,
     * then the array of tokens and whitespaces is:
     *
     * <blockquote>
     * <table border='1' cellpadding='5'>
     * <tr><td><i>Index</i></td>
     *     <td><i>Whitespace</i></td>
     *     <td><i>Token</i></td></tr>
     * <tr><td>0</td>
     *     <td><code>" "</code></td>
     *     <td><code>correlation</code></td></tr>
     * <tr><td>1</td>
     *     <td><code>""</code></td>
     *     <td><code>.</code></td></tr>
     * <tr><td>2</td>
     *     <td><code>" "</code></td>
     *     <td bgcolor='#CCCCFF'><code>p</code></td></tr>
     * <tr><td>3</td>
     *     <td bgcolor='yellow'><code>""</code></td>
     *     <td bgcolor='#CCCCFF'><code>-</code></td></tr>
     * <tr><td>4</td>
     *     <td bgcolor='yellow'><code>""</code></td>
     *     <td bgcolor='#CCCCFF'><code>53</code></td></tr>
     * <tr><td>5</td>
     *     <td><code>" "</code></td>
     *     <td><code>was</code></td></tr>
     * <tr><td>6</td>
     *     <td><code>" "</code></td>
     *     <td>...</td></tr>
     * <tr><td colspan='3'>Tokenization of:
     *     <code>" correlation. p-53 was ..."</code></td></tr>
     * </table>
     * </blockquote>
     *
     * Here, <code>"p"</code> is a valid sentence start token even
     * though it is only a single lowercase character, because it is
     * joined with no intervening whitespace to the hyphen
     * (<code>-</code>) and then to <code>"53"</code>, which contains
     * a digit.  By way of contrast, the first token
     * <code>"and"</code> in the sequence <code>"and Foo"</code>
     * can't start a sentence because it is separated from the
     * following token by a non-empty whitespace.
     *
     * <P>Recall that the whitespace with the same index as a token
     * precedes the token; this method therefore reads
     * <code>whitespaces[i+1]</code> for each token index
     * <code>i</code> it inspects.
     *
     * @param tokens Array of tokens to check.
     * @param whitespaces Array of whitespaces to check.
     * @param start Index of first token to check.
     * @param end Index one past the last token to check (exclusive).
     * @return <code>true</code> if the token at the start index may
     * begin a sentence.
     */
    protected boolean possibleStart(String[] tokens, String[] whitespaces,
                                    int start, int end) {
        for (int i = start; i < end; i++) {
            String token = tokens[i];
            if (LOWERCASE_STARTS.contains(token)
                || containsDigitOrUpper(token)) {
                return true;
            }
            // whitespaces[i + 1] is the gap after tokens[i]; a non-empty
            // gap ends the contiguous run without a qualifying token.
            if (whitespaces[i + 1].length() > 0) {
                return false;
            }
        }
        return false;
    }

    /**
     * Returns <code>true</code> if any character of the specified
     * token is an uppercase letter or a digit.
     *
     * @param token Character sequence to scan.
     * @return <code>true</code> if the token contains at least one
     * uppercase or digit character; <code>false</code> otherwise,
     * including for the empty sequence.
     */
    private static boolean containsDigitOrUpper(CharSequence token) {
        int len = token.length();
        for (int i = 0; i < len; i++) {
            char c = token.charAt(i);
            if (Character.isUpperCase(c) || Character.isDigit(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds a mutable string set holding the specified elements.
     *
     * @param elements Tokens to place in the set.
     * @return A new hash set containing the elements.
     */
    private static Set<String> newSet(String... elements) {
        return new HashSet<String>(Arrays.asList(elements));
    }

    /** Tokens that may end a sentence. */
    private static final Set<String> POSSIBLE_STOPS = newSet(
        ".",
        "..",  // abbrev + stop occurs
        "!",
        "?"
    );

    /** Tokens that may not immediately precede a sentence stop. */
    private static final Set<String> IMPOSSIBLE_PENULTIMATES = newSet(
        // Common Abbrevs
        "Bros",
        "No",  // too common ??
        "al",
        "vs",
        "etc",
        "Fig", // thanks to MM

        // Professional Honorifics
        "Dr",
        "Prof",
        "PhD",
        "MD",

        // Corporate Designators
        "Co",
        "Corp",
        "Inc",

        // Month Abbrevs
        // "Jun" is deliberately left out; common term: c-jun.
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "Jul",
        "Aug",
        "Sep",
        "Sept",
        "Oct",
        "Nov",
        "Dec",

        // Location Suffixes
        "St",

        // times
        "AM",
        "PM"
    );

    /** Tokens that may not begin a sentence. */
    private static final Set<String> IMPOSSIBLE_SENTENCE_STARTS = newSet(
        ")",
        "]",
        "}",
        ">",
        "<",
        ".",
        "!",
        "?",
        ":",
        ";",
        "-",
        "--",
        "---",
        "%"
    );

    /**
     * Lowercase tokens that qualify a contiguous token run as a
     * possible sentence start even without an uppercase or digit
     * character.
     */
    private static final Set<String> LOWERCASE_STARTS = newSet(
        "alpha",
        "beta",
        "gamma",
        "delta",
        "c",   // c-jun, c-myc etc.
        "i",
        "ii",
        "iii",
        "iv",
        "v",
        "vi",
        "vii",
        "viii",
        "ix",
        "x"
    );
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -