stopwordlist.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 556 行

JAVA
556
字号
/*
 * @(#)StopWordList.java	18/01/2005
 *
 * Copyright 2003, 2004 EveryMail Pty Ltd. All rights reserved.
 * Use is subject to license terms.
 */
package org.jasen.core.token;

import java.util.Arrays;
import java.util.Locale;

/**
 * <P>
 * 	Maintains a hard-coded list of English stopwords and a case-insensitive
 * 	membership test over that list.
 * </P>
 * <P>
 * 	The list is sorted once when the class is initialized so that
 * 	{@link #isStopWord(String)} can use a binary search.
 * </P>
 * @author Jason Polites
 */
public final class StopWordList
{

    /**
     * The stop words, all in lower case.  The array is sorted (natural
     * String order) by the static initializer below.
     * <P>
     * The array is public and therefore mutable; {@link #isStopWord(String)}
     * searches this very array, so callers must not modify its contents
     * (doing so without re-sorting breaks the binary search).
     * </P>
     */
    public static final String[] STOP_WORDS = {
        "able",
        "about",
        "above",
        "according",
        "accordingly",
        "across",
        "actually",
        "after",
        "afterwards",
        "again",
        "against",
        "ain't",
        "all",
        "allow",
        "allows",
        "almost",
        "alone",
        "along",
        "already",
        "also",
        "although",
        "always",
        "among",
        "amongst",
        "and",
        "another",
        "any",
        "anybody",
        "anyhow",
        "anyone",
        "anything",
        "anyway",
        "anyways",
        "anywhere",
        "apart",
        "appear",
        "appreciate",
        "appropriate",
        "are",
        "aren't",
        "around",
        "aside",
        "ask",
        "asking",
        "associated",
        "available",
        "away",
        "awfully",
        "be",
        "became",
        "because",
        "become",
        "becomes",
        "becoming",
        "been",
        "before",
        "beforehand",
        "behind",
        "being",
        "believe",
        "below",
        "beside",
        "besides",
        "best",
        "better",
        "between",
        "beyond",
        "both",
        "brief",
        "but",
        "by",
        "c'mon",
        "came",
        "can",
        "can't",
        "cannot",
        "cant",
        "cause",
        "causes",
        "certain",
        "certainly",
        "changes",
        "clearly",
        "com",
        "come",
        "comes",
        "concerning",
        "consequently",
        "consider",
        "considering",
        "contain",
        "containing",
        "contains",
        "corresponding",
        "could",
        "couldn't",
        "course",
        "currently",
        "definitely",
        "described",
        "despite",
        "did",
        "didn't",
        "different",
        "does",
        "doesn't",
        "doing",
        "don't",
        "done",
        "down",
        "downwards",
        "during",
        "each",
        "edu",
        "eight",
        "either",
        "else",
        "elsewhere",
        "enough",
        "entirely",
        "especially",
        "etc",
        "even",
        "ever",
        "every",
        "everybody",
        "everyone",
        "everything",
        "everywhere",
        "exactly",
        "example",
        "except",
        "far",
        "few",
        "fifth",
        "first",
        "five",
        "followed",
        "following",
        "follows",
        "for",
        "former",
        "formerly",
        "forth",
        "four",
        "from",
        "further",
        "furthermore",
        "get",
        "gets",
        "getting",
        "given",
        "gives",
        "goes",
        "going",
        "gone",
        "got",
        "gotten",
        "greetings",
        "had",
        "hadn't",
        "happens",
        "hardly",
        "has",
        "hasn't",
        "have",
        "haven't",
        "having",
        "he's",
        "hello",
        "help",
        "hence",
        "her",
        "here",
        "here's",
        "hereafter",
        "hereby",
        "herein",
        "hereupon",
        "hers",
        "herself",
        "him",
        "himself",
        "his",
        "hither",
        "hopefully",
        "how",
        "howbeit",
        "however",
        "i'd",
        "i'll",
        "i'm",
        "i've",
        "ignored",
        "immediate",
        "inasmuch",
        "inc",
        "indeed",
        "indicate",
        "indicated",
        "indicates",
        "inner",
        "insofar",
        "instead",
        "into",
        "inward",
        "isn't",
        "it'd",
        "it'll",
        "it's",
        "its",
        "itself",
        "just",
        "keep",
        "keeps",
        "kept",
        "know",
        "known",
        "knows",
        "last",
        "lately",
        "later",
        "latter",
        "latterly",
        "least",
        "less",
        "lest",
        "let",
        "let's",
        "like",
        "liked",
        "likely",
        "little",
        "look",
        "looking",
        "looks",
        "ltd",
        "mainly",
        "many",
        "may",
        "maybe",
        "mean",
        "meanwhile",
        "merely",
        "might",
        "more",
        "moreover",
        "most",
        "mostly",
        "much",
        "must",
        "myself",
        "name",
        "namely",
        "nd",
        "near",
        "nearly",
        "necessary",
        "need",
        "needs",
        "neither",
        "never",
        "nevertheless",
        "new",
        "next",
        "nine",
        "nobody",
        "non",
        "none",
        "noone",
        "nor",
        "normally",
        "not",
        "nothing",
        "novel",
        "now",
        "nowhere",
        "obviously",
        "off",
        "often",
        "okay",
        "old",
        "once",
        "one",
        "ones",
        "only",
        "onto",
        "other",
        "others",
        "otherwise",
        "ought",
        "our",
        "ours",
        "ourselves",
        "out",
        "outside",
        "over",
        "overall",
        "own",
        "particular",
        "particularly",
        "per",
        "perhaps",
        "placed",
        "please",
        "plus",
        "possible",
        "presumably",
        "probably",
        "provides",
        "que",
        "quite",
        "rather",
        "really",
        "reasonably",
        "regarding",
        "regardless",
        "regards",
        "relatively",
        "respectively",
        "right",
        "said",
        "same",
        "saw",
        "say",
        "saying",
        "says",
        "second",
        "secondly",
        "see",
        "seeing",
        "seem",
        "seemed",
        "seeming",
        "seems",
        "seen",
        "self",
        "selves",
        "sensible",
        "sent",
        "serious",
        "seriously",
        "seven",
        "several",
        "shall",
        "she",
        "should",
        "shouldn't",
        "since",
        "six",
        "some",
        "somebody",
        "somehow",
        "someone",
        "something",
        "sometime",
        "sometimes",
        "somewhat",
        "somewhere",
        "soon",
        "sorry",
        "specified",
        "specify",
        "specifying",
        "still",
        "sub",
        "such",
        "sup",
        "sure",
        "t's",
        "take",
        "taken",
        "tell",
        "tends",
        "than",
        "thank",
        "thanks",
        "thanx",
        "that",
        "that's",
        "thats",
        "the",
        "their",
        "theirs",
        "them",
        "themselves",
        "then",
        "thence",
        "there",
        "there's",
        "thereafter",
        "thereby",
        "therefore",
        "therein",
        "theres",
        "thereupon",
        "these",
        "they",
        "they'd",
        "they'll",
        "they're",
        "they've",
        "think",
        "third",
        "this",
        "thorough",
        "thoroughly",
        "those",
        "though",
        "three",
        "through",
        "throughout",
        "thru",
        "thus",
        "to",
        "together",
        "too",
        "took",
        "toward",
        "towards",
        "tried",
        "tries",
        "truly",
        "try",
        "trying",
        "twice",
        "two",
        "un",
        "under",
        "unfortunately",
        "unless",
        "unlikely",
        "until",
        "unto",
        "up",
        "upon",
        "us",
        "use",
        "used",
        "useful",
        "uses",
        "using",
        "usually",
        "uucp",
        "value",
        "various",
        "very",
        "via",
        "viz",
        "vs",
        "want",
        "wants",
        "was",
        "wasn't",
        "way",
        "we",
        "we'd",
        "we'll",
        "we're",
        "we've",
        "welcome",
        "well",
        "went",
        "were",
        "weren't",
        "what",
        "what's",
        "whatever",
        "when",
        "whence",
        "whenever",
        "where",
        "where's",
        "whereafter",
        "whereas",
        "whereby",
        "wherein",
        "whereupon",
        "wherever",
        "whether",
        "which",
        "while",
        "whither",
        "who",
        "who's",
        "whoever",
        "whole",
        "whom",
        "whose",
        "why",
        "will",
        "willing",
        "wish",
        "with",
        "within",
        "without",
        "won't",
        "wonder",
        "would",
        "wouldn't",
        "yes",
        "yet",
        "you",
        "you'd",
        "you'll",
        "you're",
        "you've",
        "your",
        "yours",
        "yourself",
        "yourselves",
        "zero"
    };

    static {
        // Binary search requires sorted input; sort once at class load so the
        // lookup never depends on the order in which the literal was typed.
        Arrays.sort(STOP_WORDS);
    }

    /**
     * Determines if the given word is a recognizable stop word.
     *
     * @param str The string to test.  It is converted to lower case in this
     * method to perform the test; the conversion uses English casing rules
     * so the result does not depend on the JVM's default locale (e.g. under a
     * Turkish locale "I" would otherwise lower-case to a dotless "i" and
     * never match).  May be null.
     * @return True if the word is a stop word, false otherwise (including
     * when <code>str</code> is null)
     */
    public static final boolean isStopWord(String str) {
        if (str == null) {
            return false;
        }

        // binarySearch returns a non-negative index only when the key is found.
        return Arrays.binarySearch(STOP_WORDS, str.toLowerCase(Locale.ENGLISH)) >= 0;
    }

    /**
     * Not instantiable: this class only exposes static members.
     */
    private StopWordList() {
    }

}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?