stopwordfilter.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 751 行 · 第 1/2 页

JAVA
751
字号
        "or",        "other",        "others",        "otherwise",        "ought",        "our",        "ours",        "ourselves",        "out",        "outside",        "over",        "overall",        "own",        "p",        "particular",        "particularly",        "per",        "perhaps",        "placed",        "please",        "plus",        "possible",        "presumably",        "probably",        "provides",        "q",        "que",        "quite",        "qv",        "r",        "rather",        "rd",        "re",        "really",        "reasonably",        "regarding",        "regardless",        "regards",        "relatively",        "respectively",        "right",        "s",        "said",        "same",        "saw",        "say",        "saying",        "says",        "second",        "secondly",        "see",        "seeing",        "seem",        "seemed",        "seeming",        "seems",        "seen",        "self",        "selves",        "sensible",        "sent",        "serious",        "seriously",        "seven",        "several",        "shall",        "she",        "should",        "shouldn't",        "since",        "six",        "so",        "some",        "somebody",        "somehow",        "someone",        "something",        "sometime",        "sometimes",        "somewhat",        "somewhere",        "soon",        "sorry",        "specified",        "specify",        "specifying",        "still",        "sub",        "such",        "sup",        "sure",        "t",        "t's",        "take",        "taken",        "tell",        "tends",        "th",        "than",        "thank",        "thanks",        "thanx",        "that",        "that's",        "thats",        "the",        "their",        "theirs",        "them",        "themselves",        "then",        "thence",        "there",        "there's",        "thereafter",        "thereby",        "therefore",        "therein",        "theres",        "thereupon",  
      "these",        "they",        "they'd",        "they'll",        "they're",        "they've",        "think",        "third",        "this",        "thorough",        "thoroughly",        "those",        "though",        "three",        "through",        "throughout",        "thru",        "thus",        "to",        "together",        "too",        "took",        "toward",        "towards",        "tried",        "tries",        "truly",        "try",        "trying",        "twice",        "two",        "u",        "un",        "under",        "unfortunately",        "unless",        "unlikely",        "until",        "unto",        "up",        "upon",        "us",        "use",        "used",        "useful",        "uses",        "using",        "usually",        "uucp",        "v",        "value",        "various",        "very",        "via",        "viz",        "vs",        "w",        "want",        "wants",        "was",        "wasn't",        "way",        "we",        "we'd",        "we'll",        "we're",        "we've",        "welcome",        "well",        "went",        "were",        "weren't",        "what",        "what's",        "whatever",        "when",        "whence",        "whenever",        "where",        "where's",        "whereafter",        "whereas",        "whereby",        "wherein",        "whereupon",        "wherever",        "whether",        "which",        "while",        "whither",        "who",        "who's",        "whoever",        "whole",        "whom",        "whose",        "why",        "will",        "willing",        "wish",        "with",        "within",        "without",        "won't",        "wonder",        "would",        "wouldn't",        "x",        "y",        "yes",        "yet",        "you",        "you'd",        "you'll",        "you're",        "you've",        "your",        "yours",        "yourself",        "yourselves",        "z",        "zero",    };    /** The stop list. 
*/
    protected HashSet m_aStopList;

    /** True if input is lowercase only. */
    protected boolean m_bLowerCaseOnly;

    ////// WEKA specific. //////

    /** True if the default stop list is skipped. */
    protected boolean m_bSkipDefault;

    /** The option string for stop list files (file names separated by ':'). */
    protected String m_strFiles;

    ////// Ends WEKA specific. //////

    /**
     * Creates a stop word filter and populates its stop list from the
     * given options.
     *
     * <p>Recognized options (see {@link #listOptions()}):
     * <ul>
     *   <li><code>-e</code>: skip the default stop list;</li>
     *   <li><code>-f &lt;str&gt;[:&lt;str&gt;...]</code>: colon-separated
     *       stop list files, each loaded through {@link #addFile(String)};</li>
     *   <li><code>-w</code>: input tokens are already lowercase, so
     *       {@link #apply(String)} compares them as-is.</li>
     * </ul>
     *
     * @param ts       The TextSource object (not used by this constructor).
     * @param options  The option strings to parse.
     * @throws Exception if option parsing fails or a stop list file
     *                   cannot be read.
     */
    public StopWordFilter(TextSource ts, String[] options) throws Exception {
        Pattern patSep;
        String[] aFiles;

        m_aStopList = new HashSet();

        ////// WEKA specific. //////
        m_bSkipDefault = Utils.getFlag('e', options);
        if (!m_bSkipDefault) {
            for (int i = 0; i < m_aDefStopList.length; ++i) {
                m_aStopList.add(m_aDefStopList[i]);
            }
        }

        m_strFiles = Utils.getOption('f', options);
        if (m_strFiles.length() > 0) {
            patSep = Pattern.compile(":");
            aFiles = patSep.split(m_strFiles);
            for (int i = 0; i < aFiles.length; ++i) {
                addFile(aFiles[i]);
            }
        }

        m_bLowerCaseOnly = Utils.getFlag('w', options);
    }

    /** Lazily compiled pattern that captures the first word of a line. */
    protected Pattern m_patChomp = null;

    /**
     * Add the words specified in a given file to the stop list.
     * Stop words are listed in separate lines.  If there are multiple
     * words on the same line, then only the first word will be read.
     * Stop words are converted to lowercase before being put in the
     * list.  Leading and trailing whitespace and empty lines are
     * ignored.
     *
     * <p>The file is always closed before this method returns, whether
     * it completes normally or throws.
     *
     * @param strFileName  The name of the stop list file.
     * @throws IOException if the file cannot be opened or read.
     */
    protected void addFile(String strFileName) throws IOException {
        BufferedReader in;
        String str;
        Matcher matChomp;

        if (m_patChomp == null) {
            m_patChomp = Pattern.compile("^\\s*(\\S*)");
        }

        in = new BufferedReader(new FileReader(strFileName));
        try {
            str = in.readLine();
            while (str != null) {
                matChomp = m_patChomp.matcher(str);
                // Both quantifiers may match the empty string, so this
                // always succeeds and group(1) is always available.
                matChomp.lookingAt();
                str = matChomp.group(1);
                if (str.length() > 0) {
                    m_aStopList.add(str.toLowerCase());
                }
                str = in.readLine();
            }
        } finally {
            // Release the file handle even when reading fails.
            in.close();
        }
    }

    /**
     * Tosses tokens that appear in the stop list.  Unless the filter
     * was configured with the lowercase-only flag, the token is
     * converted to lowercase before the lookup; with that flag set the
     * token is looked up exactly as given.
     *
     * @param strToken  The input token
     * @return <code>null</code> if the input token appears in the
     * stop list; the token itself if otherwise.
     */
    public String apply(String strToken) {
        if (m_bLowerCaseOnly) {
            if (m_aStopList.contains(strToken)) {
                return null;
            }
        } else {
            if (m_aStopList.contains(strToken.toLowerCase())) {
                return null;
            }
        }
        return strToken;
    }

    ////// WEKA specific. //////

    /**
     * Describes the command-line options understood by this filter.
     *
     * @return a collection of <code>Option</code> objects for the
     * <code>-w</code>, <code>-e</code> and <code>-f</code> options.
     */
    public static Collection listOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add(new Option("\tStopWordFilter: " +
                             "Set if input is guanranteed lowercase",
                             "w", 0, "-w"));
        aOpts.add(new Option("\tStopWordFilter: " +
                             "Skip default SMART stop list",
                             "e", 0, "-e"));
        aOpts.add(new Option("\tStopWordFilter: Stop list files " +
                             "(default empty)",
                             "f", 1, "-f <str>[:<str>...]"));
        return aOpts;
    }

    /**
     * Returns the current settings of this filter as option strings.
     *
     * @return a collection of strings: <code>"-w"</code> and/or
     * <code>"-e"</code> when the corresponding flag is set, followed by
     * <code>"-f"</code> and the stop list file string when one was given.
     */
    public Collection getOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        if (m_bLowerCaseOnly) {
            aOpts.add("-w");
        }
        if (m_bSkipDefault) {
            aOpts.add("-e");
        }
        if (m_strFiles.length() > 0) {
            aOpts.add("-f");
            aOpts.add(m_strFiles);
        }
        return aOpts;
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?