wvtool.java
来自「一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家」· Java 代码 · 共 458 行 · 第 1/2 页
JAVA
458 行
*
* @param input the input list
* @param config the configuration
* @param wordList a word list (possibly containing document and class frequencies).
* @throws WVToolException if a document cannot be processed and errors are not being skipped
*/
public void createVectors(WVTInputList input, WVTConfiguration config, WVTWordList wordList) throws WVToolException {
    // Runs every document of the input list through the configured steps
    // (load, input filter, char mapping, tokenizing, word filtering, stemming),
    // counts the resulting tokens in the word list, and hands the vector created
    // from those counts to the configured output filter.

    // Set up the word list properly
    wordList.setAppendWords(false);
    wordList.setUpdateOnlyCurrent(true);

    // Obtain an expanded list of all documents to consider
    Iterator inList = input.getEntries();

    // Go through the list
    while (inList.hasNext()) {
        WVTDocumentInfo d = (WVTDocumentInfo) inList.next();

        // The loader is scoped to a single document and starts out as null, so the
        // error handler below can tell whether a loader was actually obtained for
        // THIS document. Previously the variable lived outside the loop, so a failed
        // lookup led to closing the previous document's loader, or to a
        // NullPointerException on the first document.
        WVTDocumentLoader loader = null;
        try {
            // Initialize all required components for this document
            loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
            WVTInputFilter infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
            WVTCharConverter charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
            WVTTokenizer tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
            WVTWordFilter wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
            WVTStemmer stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);
            WVTVectorCreator vectorCreator = (WVTVectorCreator) config.getComponentForStep(WVTConfiguration.STEP_VECTOR_CREATION, d);
            WVTOutputFilter outputFilter = (WVTOutputFilter) config.getComponentForStep(WVTConfiguration.STEP_OUTPUT, d);

            // Process the document: each step wraps the result of the previous one
            TokenEnumeration tokens = stemmer.stem(
                    wordFilter.filter(
                            tokenizer.tokenize(
                                    charConverter.convertChars(
                                            infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);
            while (tokens.hasMoreTokens()) {
                wordList.addWordOccurance(tokens.nextToken());
            }

            // Create the vector from the counts of the current document and emit it
            outputFilter.write(vectorCreator.createVector(wordList.getFrequenciesForCurrentDocument(), wordList.getTermCountForCurrentDocument(), wordList, d));
            wordList.closeDocument(d);
            loader.close(d);
        } catch (WVToolException e) {
            // If an error occurs add it to the error log
            WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);
            // Close the input stream for this document, if one was ever obtained
            if (loader != null) {
                loader.close(d);
            }
            // NOTE(review): wordList.closeDocument(d) is not reached on failure; confirm
            // whether counts gathered for a failed document can carry over into the next one.
            // If errors should not be skipped, throw an exception
            if (!skipErrors) {
                throw new WVToolException("Problems processing document " + d.getSourceName(), e);
            }
            // otherwise do nothing and proceed with the next document
        }
    }
}
/**
* Create a single word vector.
*
* @param text the underlying text
* @param d information about the text
* @param config the configuration to use (though it will be only partly used)
* @param wordList the word list to use
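* @return the created word vector; may be {@code null} if processing failed and errors are being skipped
* @throws WVToolException if the text cannot be processed and errors are not being skipped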
*/
public WVTWordVector createVector(String text, WVTDocumentInfo d, WVTConfiguration config, WVTWordList wordList) throws WVToolException {
    // Prepare the word list before any tokens are counted
    wordList.setAppendWords(false);
    wordList.setUpdateOnlyCurrent(true);

    // Stays null unless the vector creation step completes
    WVTWordVector vector = null;
    try {
        // Look up the component responsible for each step of this document.
        // No loader or input filter is needed because the text is already in memory.
        final WVTCharConverter charStep = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
        final WVTTokenizer tokenStep = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
        final WVTWordFilter filterStep = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
        final WVTStemmer stemStep = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);
        final WVTVectorCreator creatorStep = (WVTVectorCreator) config.getComponentForStep(WVTConfiguration.STEP_VECTOR_CREATION, d);

        // Chain the steps: char mapping -> tokenizing -> word filtering -> stemming
        final TokenEnumeration stemmed = stemStep.stem(
                filterStep.filter(
                        tokenStep.tokenize(
                                charStep.convertChars(new StringReader(text), d), d), d), d);

        // Count every token that survives the pipeline
        while (stemmed.hasMoreTokens()) {
            wordList.addWordOccurance(stemmed.nextToken());
        }

        // Build the vector from the counts collected for the current document
        vector = creatorStep.createVector(
                wordList.getFrequenciesForCurrentDocument(),
                wordList.getTermCountForCurrentDocument(),
                wordList,
                d);
        wordList.closeDocument(d);
    } catch (WVToolException e) {
        final String message = "Problems processing document " + d.getSourceName();
        WVToolLogger.getGlobalLogger().logException(message, e);
        // Unless errors are skipped, report the failure to the caller
        if (!skipErrors) {
            throw new WVToolException("Problems processing document " + d.getSourceName(), e);
        }
        // otherwise fall through and return whatever has been produced so far
    }
    return vector;
}
/**
* Create an individual word vector from a String using TF/IDF weights and standard configuration.
*
* @param text the underlying text
* @param wordList a wordlist (for IDF)
* @return a WVTWordVector
* @throws WVToolException if the text cannot be processed and errors are not being skipped
*/
public WVTWordVector createVector(String text, WVTWordList wordList) throws WVToolException {
    // Fresh configuration in which only the vector creation step is set explicitly (to TF/IDF)
    final WVTConfiguration tfidfConfig = new WVTConfiguration();
    final WVTConfigurationFact tfidfRule = new WVTConfigurationFact(new TFIDF());
    tfidfConfig.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, tfidfRule);

    // The text has no backing document, so the document info is built from empty strings
    final WVTDocumentInfo blankInfo = new WVTDocumentInfo("", "", "", "");

    // Delegate to the fully parameterized variant
    return createVector(text, blankInfo, tfidfConfig, wordList);
}
/**
* Process the specified documents using the configured steps and send all encountered words to a listener class. This method can be used to implement specialized applications that merely use the preprocessing steps of the tool instead of using the vectorization functions.
*
* @param input the input list
* @param config the configuration
* @param listener a call back class that is invoked on every processed document and word
* @throws WVToolException
*/
public void iterateWords(WVTInputList input, WVTConfiguration config, WVToolWordListener listener) throws WVToolException {
    // Runs every document of the input list through the configured preprocessing
    // steps (load, input filter, char mapping, tokenizing, word filtering, stemming)
    // and reports each document and each resulting token to the listener.

    // Obtain an expanded list of all documents to consider
    Iterator inList = input.getEntries();

    // Go through the list
    while (inList.hasNext()) {
        WVTDocumentInfo d = (WVTDocumentInfo) inList.next();
        listener.openNewDocument(d);

        // The loader is scoped to a single document and starts out as null, so the
        // error handler below can tell whether a loader was actually obtained for
        // THIS document. Previously the variable lived outside the loop, so a failed
        // lookup led to closing the previous document's loader, or to a
        // NullPointerException on the first document.
        WVTDocumentLoader loader = null;
        try {
            // Initialize all required components for this document
            loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
            WVTInputFilter infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
            WVTCharConverter charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
            WVTTokenizer tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
            WVTWordFilter wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
            WVTStemmer stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);

            // Process the document: each step wraps the result of the previous one
            TokenEnumeration tokens = stemmer.stem(
                    wordFilter.filter(
                            tokenizer.tokenize(
                                    charConverter.convertChars(
                                            infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);
            while (tokens.hasMoreTokens()) {
                listener.processWord(tokens.nextToken());
            }
            loader.close(d);
        } catch (WVToolException e) {
            // If an error occurs add it to the error log
            WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);
            // Close the input stream for this document, if one was ever obtained
            if (loader != null) {
                loader.close(d);
            }
            // If errors should not be skipped, throw an exception
            if (!skipErrors) {
                throw new WVToolException("Problems processing document " + d.getSourceName(), e);
            }
            // otherwise do nothing and proceed with the next document
        }
    }
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?