wvtool.java

来自「一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家」· Java 代码 · 共 458 行 · 第 1/2 页

JAVA
458
字号
     * 
     * @param input the input list
     * @param config the configuration
     * @param wordList a word list (possibly containing document and class frequencies).
     * @throws WVToolException if a document cannot be processed and errors are not skipped
     */
    public void createVectors(WVTInputList input, WVTConfiguration config, WVTWordList wordList) throws WVToolException {

        // Set up the word list for vector creation
        wordList.setAppendWords(false);
        wordList.setUpdateOnlyCurrent(true);

        // Obtain the list of all documents to consider
        Iterator inList = input.getEntries();

        // Go through the list, one document at a time
        while (inList.hasNext()) {

            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();

            // The loader is scoped to the current document and starts out as
            // null, so the error handler below can tell whether a loader was
            // actually obtained for THIS document. It must never close the
            // loader of a previous document or dereference null.
            WVTDocumentLoader loader = null;

            try {

                // Initialize all required components for this document
                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                WVTInputFilter infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                WVTCharConverter charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                WVTTokenizer tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                WVTWordFilter wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                WVTStemmer stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);
                WVTVectorCreator vectorCreator = (WVTVectorCreator) config.getComponentForStep(WVTConfiguration.STEP_VECTOR_CREATION, d);
                WVTOutputFilter outputFilter = (WVTOutputFilter) config.getComponentForStep(WVTConfiguration.STEP_OUTPUT, d);

                // Process the document: load -> plain text -> char conversion
                // -> tokenization -> word filtering -> stemming
                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);

                // Register every resulting token with the word list
                while (tokens.hasMoreTokens()) {
                    wordList.addWordOccurance(tokens.nextToken());
                }

                // Create the vector from the counts of the current document
                // and hand it to the output filter
                outputFilter.write(vectorCreator.createVector(wordList.getFrequenciesForCurrentDocument(), wordList.getTermCountForCurrentDocument(), wordList, d));

                wordList.closeDocument(d);
                loader.close(d);

            } catch (WVToolException e) {

                // If an error occurs add it to the error log
                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);

                // Close the input stream for this document, but only if a
                // loader was obtained; the loader lookup itself may have failed
                if (loader != null) {
                    loader.close(d);
                }

                // If errors should not be skipped, throw an exception
                if (!skipErrors) {
                    throw new WVToolException("Problems processing document " + d.getSourceName(), e);
                }

                // otherwise do nothing and proceed with the next document

            }

        }

    }

    /**
     * Builds the word vector for a single text that is already held in memory.
     *
     * The text is passed through the char mapper, tokenizer, word filter and stemmer that the
     * configuration supplies for the given document info; the resulting tokens are registered
     * with the word list and the configured vector creator produces the vector. The loader,
     * input filter and output steps of the configuration are not used by this method.
     *
     * @param text the text to vectorize
     * @param d the document information handed to every processing step
     * @param config the configuration from which the processing components are obtained
     * @param wordList the word list that receives the word occurrences and is passed to the vector creator
     * @return the created vector; may be {@code null} if processing failed before a vector was
     *         created and errors are being skipped
     * @throws WVToolException if processing fails and errors are not being skipped
     */
    public WVTWordVector createVector(String text, WVTDocumentInfo d, WVTConfiguration config, WVTWordList wordList) throws WVToolException {

        // Prepare the word list for vector creation
        wordList.setAppendWords(false);
        wordList.setUpdateOnlyCurrent(true);

        // Stays null until the vector creator has produced a vector
        WVTWordVector vector = null;

        try {

            // Resolve every component up front, before any processing starts
            final WVTCharConverter converterStep = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
            final WVTTokenizer tokenizerStep = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
            final WVTWordFilter filterStep = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
            final WVTStemmer stemmerStep = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);
            final WVTVectorCreator creatorStep = (WVTVectorCreator) config.getComponentForStep(WVTConfiguration.STEP_VECTOR_CREATION, d);

            // Run the text through the pipeline, innermost call first:
            // char conversion -> tokenization -> word filtering -> stemming
            final StringReader source = new StringReader(text);
            final TokenEnumeration stemmedTokens =
                stemmerStep.stem(
                    filterStep.filter(
                        tokenizerStep.tokenize(
                            converterStep.convertChars(source, d),
                            d),
                        d),
                    d);

            // Register each token with the word list
            while (stemmedTokens.hasMoreTokens()) {
                wordList.addWordOccurance(stemmedTokens.nextToken());
            }

            // Create the vector from the counts collected for this document
            vector = creatorStep.createVector(
                wordList.getFrequenciesForCurrentDocument(),
                wordList.getTermCountForCurrentDocument(),
                wordList,
                d);

            wordList.closeDocument(d);

        } catch (WVToolException e) {

            // Always record the problem in the global log
            WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);

            // When errors are skipped, hand back whatever has been produced so far
            if (skipErrors) {
                return vector;
            }

            throw new WVToolException("Problems processing document " + d.getSourceName(), e);

        }

        return vector;
    }

    /**
     * Creates a single word vector from a String, using a fresh configuration whose
     * vector creation step is set to TF/IDF weighting.
     *
     * The text is processed with an empty document info (all four constructor arguments
     * are empty strings) and delegated to
     * {@code createVector(String, WVTDocumentInfo, WVTConfiguration, WVTWordList)}.
     *
     * @param text the underlying text
     * @param wordList a word list (for IDF)
     * @return the word vector returned by the delegate method
     * @throws WVToolException if the delegate method reports a processing failure
     */
    public WVTWordVector createVector(String text, WVTWordList wordList) throws WVToolException {

        // Fresh configuration with the vector creation step overridden to TF/IDF
        final WVTConfiguration tfidfConfig = new WVTConfiguration();
        final WVTConfigurationFact tfidfRule = new WVTConfigurationFact(new TFIDF());
        tfidfConfig.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, tfidfRule);

        // The text has no real source, so an empty document info is used
        final WVTDocumentInfo anonymousDocument = new WVTDocumentInfo("", "", "", "");

        return createVector(text, anonymousDocument, tfidfConfig, wordList);

    }

    /**
     * Process the specified documents using the configured steps and send all encountered words
     * to a listener class. This method can be used to implement specialized applications that
     * merely use the preprocessing steps of the tool instead of using the vectorization functions.
     *
     * For every document the listener's {@code openNewDocument} is called first; afterwards
     * {@code processWord} is called for every token that comes out of the loader, input filter,
     * char mapper, tokenizer, word filter and stemmer steps.
     *
     * @param input the input list
     * @param config the configuration
     * @param listener a call back class that is invoked on every processed document and word
     * @throws WVToolException if a document cannot be processed and errors are not being skipped
     */
    public void iterateWords(WVTInputList input, WVTConfiguration config, WVToolWordListener listener) throws WVToolException {

        // Obtain the list of all documents to consider
        Iterator inList = input.getEntries();

        // Go through the list, one document at a time
        while (inList.hasNext()) {

            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();
            listener.openNewDocument(d);

            // The loader is scoped to the current document and starts out as
            // null, so the error handler below can tell whether a loader was
            // actually obtained for THIS document. It must never close the
            // loader of a previous document or dereference null.
            WVTDocumentLoader loader = null;

            try {

                // Initialize all required components for this document
                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                WVTInputFilter infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                WVTCharConverter charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                WVTTokenizer tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                WVTWordFilter wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                WVTStemmer stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);

                // Process the document: load -> plain text -> char conversion
                // -> tokenization -> word filtering -> stemming
                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);

                // Forward every resulting token to the listener
                while (tokens.hasMoreTokens()) {
                    listener.processWord(tokens.nextToken());
                }

                loader.close(d);

            } catch (WVToolException e) {

                // If an error occurs add it to the error log
                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);

                // Close the input stream for this document, but only if a
                // loader was obtained; the loader lookup itself may have failed
                if (loader != null) {
                    loader.close(d);
                }

                // If errors should not be skipped, throw an exception
                if (!skipErrors) {
                    throw new WVToolException("Problems processing document " + d.getSourceName(), e);
                }

                // otherwise do nothing and proceed with the next document

            }

        }
    }

}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?