📄 indoeuropeansentencemodel.java
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.sentences;import java.util.HashSet;/** * An <code>IndoEuropeanSentenceModel</code> is a heuristic sentence model * designed primarily for English. Whether or not it balances * parentheses or forces the last token to be a boundary may be * set in the constructor. It uses the default implementation of * possible sentence starts and the following token sets: * * <blockquote> * <table border='0' cellpadding='20'> * <tr> * * <td valign='top' width='33%'> * <table border="1" cellpadding="3" width='100%'> * <tr><td><b>Possible Stops</b></td></tr> * <tr><td><code><b>.</b></code></td></tr> * <tr><td><code><b>..</b></code></td></tr> * <tr><td><code><b>!</b></code></td></tr> * <tr><td><code><b>?</b></code></td></tr> * <tr><td><code><b>"</b></code></td></tr> * <tr><td><code><b>''</b></code></td></tr> * <tr><td><code><b>).</b></code></td></tr> * <tr><td><code><b>&raquo;</b></code></td></tr> * <tr><td><code><b>&gt;&gt;</b></code></td></tr> * </table> * </td> * * <td valign='top' width='33%'> * <table border="1" cellpadding="3" width='100%'> * <tr><td><b>Impossible Penultimates</b></td></tr> * <tr><td><i>any single letter</i></td></tr> * <tr><td><i>personal and professional titles, ranks, etc.</i></td></tr> * <tr><td><i>commas, colon, and quotes</i></td></tr> * <tr><td><i>common abbreviations</i></td></tr> * <tr><td><i>directions</i></td></tr> * <tr><td><i>corporate designators</i></td></tr> * <tr><td><i>times, months, etc.</i></td></tr> * <tr><td><i>U.S. 
political parties</i></td></tr> * <tr><td><i>U.S. states (not ME or IN)</i></td></tr> * <tr><td><i>shipping terms</i></td></tr> * <tr><td><i>address abbreviations</i></td></tr> * </table> * </td> * * <td valign='top' width='33%'> * <table border="1" cellpadding="3" width='100%'> * <tr><td><b>Impossible Starts</b></td></tr> * <tr><td><i>possible stops (see above)</i></td></tr> * <tr><td><i>close parentheses</i></td></tr> * <tr><td><code><b>,</b></code></td></tr> * <tr><td><code><b>;</b></code></td></tr> * <tr><td><code><b>:</b></code></td></tr> * <tr><td><code><b>-</b></code></td></tr> * <tr><td><code><b>--</b></code></td></tr> * <tr><td><code><b>---</b></code></td></tr> * <tr><td><code><b>%</b></code></td></tr> * </table> * </td> * * </tr> * </table> * </blockquote> * * Note that all of these sets are case insensitive. * * @author Bob Carpenter * @version 3.4.0 * @since LingPipe1.0 */public class IndoEuropeanSentenceModel extends HeuristicSentenceModel { /** * Construct an Indo-European sentence model that does * not force the final token to be a stop and does not * balance parentheses. */ public IndoEuropeanSentenceModel() { this(false,false); } /** * Construct an Indo-European sentence model that forces final * tokens and balances parentheses according to the specified * flags. * * @param forceFinalToken Whether the final token is always a * sentence stop. * @param balanceParentheses Whether sentences can stop if not all * open parentheses have been closed. 
* */ public IndoEuropeanSentenceModel(boolean forceFinalToken, boolean balanceParentheses) { super(POSSIBLE_STOPS, IMPOSSIBLE_PENULTIMATES, IMPOSSIBLE_STARTS, forceFinalToken,balanceParentheses); } private static final HashSet POSSIBLE_STOPS = new HashSet(); static { POSSIBLE_STOPS.add("."); POSSIBLE_STOPS.add(".."); // abbrev + stop occurs POSSIBLE_STOPS.add("!"); POSSIBLE_STOPS.add("?"); POSSIBLE_STOPS.add("\""); POSSIBLE_STOPS.add("''"); POSSIBLE_STOPS.add(")."); POSSIBLE_STOPS.add("\u00BB"); // french close quote POSSIBLE_STOPS.add(">>"); // french close quote } private static final HashSet IMPOSSIBLE_PENULTIMATES = new HashSet(); static { // Non abbreviations which shouldn't be penultimate IMPOSSIBLE_PENULTIMATES.add(","); IMPOSSIBLE_PENULTIMATES.add(":"); IMPOSSIBLE_PENULTIMATES.add("''"); // Single letters; typically middle initials or parts of acronyms IMPOSSIBLE_PENULTIMATES.add("A"); IMPOSSIBLE_PENULTIMATES.add("B"); IMPOSSIBLE_PENULTIMATES.add("C"); IMPOSSIBLE_PENULTIMATES.add("D"); IMPOSSIBLE_PENULTIMATES.add("E"); IMPOSSIBLE_PENULTIMATES.add("F"); IMPOSSIBLE_PENULTIMATES.add("G"); IMPOSSIBLE_PENULTIMATES.add("H"); IMPOSSIBLE_PENULTIMATES.add("I"); IMPOSSIBLE_PENULTIMATES.add("J"); IMPOSSIBLE_PENULTIMATES.add("K"); IMPOSSIBLE_PENULTIMATES.add("L"); IMPOSSIBLE_PENULTIMATES.add("M"); IMPOSSIBLE_PENULTIMATES.add("N"); IMPOSSIBLE_PENULTIMATES.add("O"); IMPOSSIBLE_PENULTIMATES.add("P"); IMPOSSIBLE_PENULTIMATES.add("Q"); IMPOSSIBLE_PENULTIMATES.add("R"); IMPOSSIBLE_PENULTIMATES.add("S"); IMPOSSIBLE_PENULTIMATES.add("T"); IMPOSSIBLE_PENULTIMATES.add("U"); IMPOSSIBLE_PENULTIMATES.add("V"); IMPOSSIBLE_PENULTIMATES.add("W"); IMPOSSIBLE_PENULTIMATES.add("X"); IMPOSSIBLE_PENULTIMATES.add("Y"); IMPOSSIBLE_PENULTIMATES.add("Z"); // Common Abbrevs IMPOSSIBLE_PENULTIMATES.add("Bros"); IMPOSSIBLE_PENULTIMATES.add("No"); // too common ?? 
IMPOSSIBLE_PENULTIMATES.add("vs"); IMPOSSIBLE_PENULTIMATES.add("etc"); IMPOSSIBLE_PENULTIMATES.add("Fig"); // thanks to MM // French Abbrevs: IMPOSSIBLE_PENULTIMATES.add("T\u00E9l"); // e + accent aigu IMPOSSIBLE_PENULTIMATES.add("t\u00E9l"); // Directional Abbrevs IMPOSSIBLE_PENULTIMATES.add("NE"); IMPOSSIBLE_PENULTIMATES.add("N.E"); IMPOSSIBLE_PENULTIMATES.add("NW"); IMPOSSIBLE_PENULTIMATES.add("N.W"); IMPOSSIBLE_PENULTIMATES.add("SE"); IMPOSSIBLE_PENULTIMATES.add("S.E"); IMPOSSIBLE_PENULTIMATES.add("SW"); IMPOSSIBLE_PENULTIMATES.add("S.W"); // Personal Honorifics IMPOSSIBLE_PENULTIMATES.add("Mr"); IMPOSSIBLE_PENULTIMATES.add("Mrs"); IMPOSSIBLE_PENULTIMATES.add("Ms"); IMPOSSIBLE_PENULTIMATES.add("MM"); IMPOSSIBLE_PENULTIMATES.add("Mssrs"); IMPOSSIBLE_PENULTIMATES.add("Messrs"); // Professional Honorifics IMPOSSIBLE_PENULTIMATES.add("Dr"); IMPOSSIBLE_PENULTIMATES.add("Gov"); IMPOSSIBLE_PENULTIMATES.add("Hon"); IMPOSSIBLE_PENULTIMATES.add("Rev"); IMPOSSIBLE_PENULTIMATES.add("Pres"); IMPOSSIBLE_PENULTIMATES.add("Prof"); IMPOSSIBLE_PENULTIMATES.add("Ph.D"); IMPOSSIBLE_PENULTIMATES.add("Ph"); IMPOSSIBLE_PENULTIMATES.add("Rep"); IMPOSSIBLE_PENULTIMATES.add("Reps"); IMPOSSIBLE_PENULTIMATES.add("Rev"); IMPOSSIBLE_PENULTIMATES.add("Sen"); IMPOSSIBLE_PENULTIMATES.add("Sens"); // Name Suffixes IMPOSSIBLE_PENULTIMATES.add("Jr"); IMPOSSIBLE_PENULTIMATES.add("Sr"); // Military Ranks IMPOSSIBLE_PENULTIMATES.add("PFC"); IMPOSSIBLE_PENULTIMATES.add("Cpl"); IMPOSSIBLE_PENULTIMATES.add("Sgt"); IMPOSSIBLE_PENULTIMATES.add("Lt"); IMPOSSIBLE_PENULTIMATES.add("Lieut"); IMPOSSIBLE_PENULTIMATES.add("Capt"); IMPOSSIBLE_PENULTIMATES.add("Cpt"); IMPOSSIBLE_PENULTIMATES.add("Maj"); IMPOSSIBLE_PENULTIMATES.add("Gen"); IMPOSSIBLE_PENULTIMATES.add("Col"); IMPOSSIBLE_PENULTIMATES.add("Cmdr"); IMPOSSIBLE_PENULTIMATES.add("Adm"); IMPOSSIBLE_PENULTIMATES.add("Col"); // Corporate Designators IMPOSSIBLE_PENULTIMATES.add("Co"); IMPOSSIBLE_PENULTIMATES.add("Corp"); IMPOSSIBLE_PENULTIMATES.add("Inc"); 
IMPOSSIBLE_PENULTIMATES.add("Ltd"); // Month Abbrevs IMPOSSIBLE_PENULTIMATES.add("Jan"); IMPOSSIBLE_PENULTIMATES.add("Feb"); IMPOSSIBLE_PENULTIMATES.add("Mar"); IMPOSSIBLE_PENULTIMATES.add("Apr"); IMPOSSIBLE_PENULTIMATES.add("Jun"); IMPOSSIBLE_PENULTIMATES.add("Jul"); IMPOSSIBLE_PENULTIMATES.add("Aug"); IMPOSSIBLE_PENULTIMATES.add("Sep"); IMPOSSIBLE_PENULTIMATES.add("Sept"); IMPOSSIBLE_PENULTIMATES.add("Oct"); IMPOSSIBLE_PENULTIMATES.add("Nov"); IMPOSSIBLE_PENULTIMATES.add("Dec"); // Location Suffixes IMPOSSIBLE_PENULTIMATES.add("St"); // Political Parties IMPOSSIBLE_PENULTIMATES.add("Rep"); IMPOSSIBLE_PENULTIMATES.add("Dem"); // Politicians IMPOSSIBLE_PENULTIMATES.add("Atty"); // State Names - Post Office // Source: http://www.usps.com/ncsc/lookups/usps_abbreviations.html#states IMPOSSIBLE_PENULTIMATES.add("AL"); IMPOSSIBLE_PENULTIMATES.add("AK"); IMPOSSIBLE_PENULTIMATES.add("AS"); IMPOSSIBLE_PENULTIMATES.add("AZ"); IMPOSSIBLE_PENULTIMATES.add("AR"); IMPOSSIBLE_PENULTIMATES.add("CA"); IMPOSSIBLE_PENULTIMATES.add("CO"); IMPOSSIBLE_PENULTIMATES.add("CT"); IMPOSSIBLE_PENULTIMATES.add("DE"); IMPOSSIBLE_PENULTIMATES.add("DC"); IMPOSSIBLE_PENULTIMATES.add("FM"); IMPOSSIBLE_PENULTIMATES.add("FL"); IMPOSSIBLE_PENULTIMATES.add("GA"); IMPOSSIBLE_PENULTIMATES.add("GU"); IMPOSSIBLE_PENULTIMATES.add("HI"); IMPOSSIBLE_PENULTIMATES.add("ID"); IMPOSSIBLE_PENULTIMATES.add("IL"); // IMPOSSIBLE_PENULTIMATES.add("IN"); too common IMPOSSIBLE_PENULTIMATES.add("IA"); IMPOSSIBLE_PENULTIMATES.add("KS"); IMPOSSIBLE_PENULTIMATES.add("KY"); IMPOSSIBLE_PENULTIMATES.add("LA"); // IMPOSSIBLE_PENULTIMATES.add("ME"); too common IMPOSSIBLE_PENULTIMATES.add("MH"); IMPOSSIBLE_PENULTIMATES.add("MD"); IMPOSSIBLE_PENULTIMATES.add("MA"); IMPOSSIBLE_PENULTIMATES.add("MI"); IMPOSSIBLE_PENULTIMATES.add("MN"); IMPOSSIBLE_PENULTIMATES.add("MS"); IMPOSSIBLE_PENULTIMATES.add("MO"); IMPOSSIBLE_PENULTIMATES.add("MT"); IMPOSSIBLE_PENULTIMATES.add("NE"); IMPOSSIBLE_PENULTIMATES.add("NV"); 
IMPOSSIBLE_PENULTIMATES.add("NH"); IMPOSSIBLE_PENULTIMATES.add("NJ");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -