⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stopwordfilter.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        "or",        "other",        "others",        "otherwise",        "ought",        "our",        "ours",        "ourselves",        "out",        "outside",        "over",        "overall",        "own",        "p",        "particular",        "particularly",        "per",        "perhaps",        "placed",        "please",        "plus",        "possible",        "presumably",        "probably",        "provides",        "q",        "que",        "quite",        "qv",        "r",        "rather",        "rd",        "re",        "really",        "reasonably",        "regarding",        "regardless",        "regards",        "relatively",        "respectively",        "right",        "s",        "said",        "same",        "saw",        "say",        "saying",        "says",        "second",        "secondly",        "see",        "seeing",        "seem",        "seemed",        "seeming",        "seems",        "seen",        "self",        "selves",        "sensible",        "sent",        "serious",        "seriously",        "seven",        "several",        "shall",        "she",        "should",        "shouldn't",        "since",        "six",        "so",        "some",        "somebody",        "somehow",        "someone",        "something",        "sometime",        "sometimes",        "somewhat",        "somewhere",        "soon",        "sorry",        "specified",        "specify",        "specifying",        "still",        "sub",        "such",        "sup",        "sure",        "t",        "t's",        "take",        "taken",        "tell",        "tends",        "th",        "than",        "thank",        "thanks",        "thanx",        "that",        "that's",        "thats",        "the",        "their",        "theirs",        "them",        "themselves",        "then",        "thence",        "there",        "there's",        "thereafter",        "thereby",        "therefore",        "therein",        "theres",        "thereupon",  
      "these",        "they",        "they'd",        "they'll",        "they're",        "they've",        "think",        "third",        "this",        "thorough",        "thoroughly",        "those",        "though",        "three",        "through",        "throughout",        "thru",        "thus",        "to",        "together",        "too",        "took",        "toward",        "towards",        "tried",        "tries",        "truly",        "try",        "trying",        "twice",        "two",        "u",        "un",        "under",        "unfortunately",        "unless",        "unlikely",        "until",        "unto",        "up",        "upon",        "us",        "use",        "used",        "useful",        "uses",        "using",        "usually",        "uucp",        "v",        "value",        "various",        "very",        "via",        "viz",        "vs",        "w",        "want",        "wants",        "was",        "wasn't",        "way",        "we",        "we'd",        "we'll",        "we're",        "we've",        "welcome",        "well",        "went",        "were",        "weren't",        "what",        "what's",        "whatever",        "when",        "whence",        "whenever",        "where",        "where's",        "whereafter",        "whereas",        "whereby",        "wherein",        "whereupon",        "wherever",        "whether",        "which",        "while",        "whither",        "who",        "who's",        "whoever",        "whole",        "whom",        "whose",        "why",        "will",        "willing",        "wish",        "with",        "within",        "without",        "won't",        "wonder",        "would",        "wouldn't",        "x",        "y",        "yes",        "yet",        "you",        "you'd",        "you'll",        "you're",        "you've",        "your",        "yours",        "yourself",        "yourselves",        "z",        "zero",    };    /** The stop list. 
*/    protected HashSet m_aStopList;    /** True if input is lowercase only. */    protected boolean m_bLowerCaseOnly;    ////// WEKA specific. //////    /** True if the default stop list is skipped. */    protected boolean m_bSkipDefault;    /** The option string for stop list files. */    protected String m_strFiles;    ////// Ends WEKA specific. //////    /**     * Creates a stop word filter.     *     * @param ts  The TextSource object.     */    public StopWordFilter(TextSource ts, String[] options) throws Exception {        Pattern patSep;        String[] aFiles;        m_aStopList = new HashSet();        ////// WEKA specific. //////        m_bSkipDefault = Utils.getFlag('e', options);        if (!m_bSkipDefault)            for (int i = 0; i < m_aDefStopList.length; ++i)                m_aStopList.add(m_aDefStopList[i]);        m_strFiles = Utils.getOption('f', options);        if (m_strFiles.length() > 0) {            patSep = Pattern.compile(":");            aFiles = patSep.split(m_strFiles);            for (int i = 0; i < aFiles.length; ++i)                addFile(aFiles[i]);        }        m_bLowerCaseOnly = Utils.getFlag('w', options);    }    protected Pattern m_patChomp = null;    /**     * Add the words specified in a given file to the stop list.     * Stop words are listed in separate lines.  If there are multiple     * words on the same line, then only the first word will be read.     * Stop words are converted to lowercase before being put in the     * list.  Leading and trailing whitespace and empty lines are     * ignored.     *     * @param strFileName  The name of the stop list file.     
*/
    protected void addFile(String strFileName) throws IOException {
        BufferedReader in;
        String str;
        Matcher matChomp;

        // Compile the first-word pattern once and reuse it for every file.
        if (m_patChomp == null)
            m_patChomp = Pattern.compile("^\\s*(\\S*)");

        in = new BufferedReader(new FileReader(strFileName));
        try {
            str = in.readLine();
            while (str != null) {
                // The pattern matches any line (possibly with an empty
                // group), so group(1) is always available after lookingAt().
                matChomp = m_patChomp.matcher(str);
                matChomp.lookingAt();
                str = matChomp.group(1);
                if (str.length() > 0)
                    m_aStopList.add(str.toLowerCase());
                str = in.readLine();
            }
        } finally {
            // Release the file handle whether or not reading succeeded.
            in.close();
        }
    }

    /**
     * Tosses tokens that appear in the stop list.  When the filter was
     * configured for lowercase-only input the token is looked up as is;
     * otherwise it is lowercased for the lookup.
     *
     * @param strToken  The input token
     * @return <code>null</code> if the input token appears in the
     * stop list; the token itself if otherwise.
     */
    public String apply(String strToken) {
        String strKey = m_bLowerCaseOnly ? strToken : strToken.toLowerCase();

        return m_aStopList.contains(strKey) ? null : strToken;
    }

    ////// WEKA specific. //////

    /**
     * Describes the options understood by this filter.
     *
     * @return A collection of <code>Option</code> objects, one each for
     * <code>-w</code>, <code>-e</code> and <code>-f</code>.
     */
    public static Collection listOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add(new Option("\tStopWordFilter: " +
                             "Set if input is guanranteed lowercase",
                             "w", 0, "-w"));
        aOpts.add(new Option("\tStopWordFilter: " +
                             "Skip default SMART stop list",
                             "e", 0, "-e"));
        aOpts.add(new Option("\tStopWordFilter: Stop list files " +
                             "(default empty)",
                             "f", 1, "-f <str>[:<str>...]"));
        return aOpts;
    }

    /**
     * Reports the current settings of this filter as option strings.
     *
     * @return A collection of strings: <code>-w</code> and/or
     * <code>-e</code> when the corresponding flag is set, and
     * <code>-f</code> followed by the file list when that list is
     * not empty.
     */
    public Collection getOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        if (m_bLowerCaseOnly)
            aOpts.add("-w"); // ??
        if (m_bSkipDefault)
            aOpts.add("-e"); // ??
        if (m_strFiles.length() > 0) {
            aOpts.add("-f");
            aOpts.add(m_strFiles);
        }
        return aOpts;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -