// File: wvtmanager.java
// (Residue from the web code viewer this file was copied from: "Font size:" control.)
package text_category;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.Calendar;
import java.util.List;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.config.WVTConfigurationFact;
import edu.udo.cs.wvtool.generic.inputfilter.SimpleTagIgnoringReader;
import edu.udo.cs.wvtool.generic.inputfilter.WVTInputFilter;
import edu.udo.cs.wvtool.generic.output.WordVectorWriter;
import edu.udo.cs.wvtool.generic.stemmer.DummyStemmer;
import edu.udo.cs.wvtool.generic.stemmer.WVTStemmer;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.generic.vectorcreation.TFIDF;
import edu.udo.cs.wvtool.generic.wordfilter.DummyWordFilter;
import edu.udo.cs.wvtool.generic.wordfilter.WVTWordFilter;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTFileInputList;
import edu.udo.cs.wvtool.main.WVTWordVector;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.wordlist.WVTWordList;
/**
 * Runs the WVTool word-vector pipeline over a fixed training corpus and
 * classifies a query document with the project's {@code KNN} class.
 *
 * <p>The word list and the training vectors are cached on disk: each is loaded
 * from its cache file when the loader returns a non-null result and is
 * otherwise rebuilt from the corpus and written back to that file.
 *
 * <p>All locations are hard-coded absolute Windows paths (see the constants
 * below), so the class only works on a machine laid out the same way.
 */
public class WVTManager {

    /** Reports pipeline failures that are translated into a {@code null} result. */
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(WVTManager.class.getName());

    /** Root directory of the training corpus; one sub-directory per class. */
    private static final String CORPUS_DIR = "D://Program Files//heritrix-1.12.1//jobs//";

    /** Directory holding the cached word list / vectors and the query vector dump. */
    private static final String CACHE_DIR =
            "D://Program Files//Apache Software Foundation//Tomcat 5.5//webapps//text_category//";

    private static final String WORD_LIST_FILE = CACHE_DIR + "wordlist.txt";
    private static final String VECTORS_FILE = CACHE_DIR + "wv.txt";
    private static final String QUERY_VECTOR_FILE = CACHE_DIR + "wv1.txt";

    /** Sample page that is currently classified on every call. */
    private static final String QUERY_DOCUMENT = CORPUS_DIR + "test//sports.htm";

    /** Corpus sub-directories; the array index is the class label passed to WVTool. */
    private static final String[] CATEGORIES = {
        "auto", "edu", "ent", "finance", "house",
        "mil", "olympics", "politics", "sports", "tech"
    };

    /**
     * Builds (or loads) the word list and training vectors, vectorises the
     * query document and classifies it.
     *
     * <p>NOTE(review): {@code url} and {@code content} are not used. The query
     * vector is always built from the hard-coded {@link #QUERY_DOCUMENT}; the
     * call that would use the arguments is left commented out below, as in the
     * original. Confirm whether that was a debugging leftover.
     *
     * @param url     currently ignored
     * @param content currently ignored
     * @return the list returned by {@code KNN.LazyLearning}, or {@code null}
     *         when any pipeline step yields {@code null} or throws (the
     *         exception is logged, not propagated)
     */
    public List category(String url, String content) {
        try {
            long startMillis = System.currentTimeMillis();

            WVTool wvt = new WVTool(false);
            WVTConfiguration config = buildConfiguration();
            WVTFileInputList list = buildTrainingInputs();

            WVTWordList wordList = loadOrCreateWordList(wvt, list, config);
            if (wordList == null) {
                return null;
            }

            WVTWordVector[] vectors = loadOrCreateVectors(wvt, list, config, wordList);
            if (vectors == null) {
                return null;
            }

            // Presumably a null text argument makes this (project-modified) WVTool
            // read the document from its path -- confirm against the WVTool build in use.
            WVTDocumentInfo d = new WVTDocumentInfo(QUERY_DOCUMENT, "html", "", "chinese");
            WVTWordVector q = wvt.createVector(null, d, config, wordList);
            //WVTDocumentInfo d = new WVTDocumentInfo(url, "html", "", "chinese");
            //WVTWordVector q = wvt.createVector(content, d, config, wordList);
            if (q == null) {
                return null;
            }

            writeQueryVector(q);

            KNN knn = new KNN();
            List result = knn.LazyLearning(q, vectors, list.getNumClasses());
            System.out.println("using time: " + (System.currentTimeMillis() - startMillis) + "ms");
            return result;
        } catch (Exception e) {
            // Callers rely on null meaning "no result", so keep that contract,
            // but record the cause instead of discarding it silently.
            LOG.log(java.util.logging.Level.SEVERE, "Text categorisation failed", e);
            return null;
        }
    }

    // The helpers below declare "throws Exception" because the checked exception
    // types of the WVTool API are not imported in this file; category() is the
    // single place where failures are handled.

    /** Creates the WVTool configuration: tag-ignoring input, no stemming/filtering, Chinese tokenizer. */
    private static WVTConfiguration buildConfiguration() throws Exception {
        WVTConfiguration config = new WVTConfiguration();
        WVTInputFilter inputFilter = new SimpleTagIgnoringReader();
        WVTStemmer stemmer = new DummyStemmer();
        WVTWordFilter wordFilter = new DummyWordFilter();
        WVTTokenizer tokenizer = new ChineseTokenizer();
        config.setConfigurationRule(WVTConfiguration.STEP_INPUT_FILTER, new WVTConfigurationFact(inputFilter));
        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));
        config.setConfigurationRule(WVTConfiguration.STEP_WORDFILTER, new WVTConfigurationFact(wordFilter));
        config.setConfigurationRule(WVTConfiguration.STEP_TOKENIZER, new WVTConfigurationFact(tokenizer));
        return config;
    }

    /** Registers one corpus directory per entry of {@link #CATEGORIES}, labelled with its index. */
    private static WVTFileInputList buildTrainingInputs() throws Exception {
        WVTFileInputList list = new WVTFileInputList(CATEGORIES.length);
        for (int label = 0; label < CATEGORIES.length; label++) {
            list.addEntry(new WVTDocumentInfo(CORPUS_DIR + CATEGORIES[label], "html", "", "chinese", label));
        }
        return list;
    }

    /** Loads the cached word list, or builds it from the corpus and writes the cache file. */
    private static WVTWordList loadOrCreateWordList(WVTool wvt, WVTFileInputList list,
            WVTConfiguration config) throws Exception {
        WVTWordList wordList = wvt.createWordListFromFile(WORD_LIST_FILE);
        if (wordList != null) {
            return wordList;
        }
        wordList = wvt.createWordList(list, config);
        if (wordList == null) {
            return null;
        }
        FileWriter out = new FileWriter(WORD_LIST_FILE);
        try {
            wordList.store(out);
        } finally {
            // Closing also flushes; closing an already closed Writer has no effect.
            out.close();
        }
        return wordList;
    }

    /**
     * Loads the cached training vectors, or creates them with TFIDF weighting
     * while writing them to the cache file. In the rebuild case this adds the
     * output and vector-creation rules to {@code config}.
     */
    private static WVTWordVector[] loadOrCreateVectors(WVTool wvt, WVTFileInputList list,
            WVTConfiguration config, WVTWordList wordList) throws Exception {
        WVTWordVector[] vectors = wvt.createVectorsFromFile(VECTORS_FILE);
        if (vectors != null) {
            return vectors;
        }
        FileWriter outFile = new FileWriter(VECTORS_FILE);
        try {
            WordVectorWriter wvw = new WordVectorWriter(outFile, true);
            try {
                config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));
                config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION,
                        new WVTConfigurationFact(new TFIDF()));
                vectors = wvt.createVectors(list, config, wordList, null);
            } finally {
                wvw.close();
            }
        } finally {
            outFile.close();
        }
        return vectors;
    }

    /** Dumps the query vector to {@link #QUERY_VECTOR_FILE}. */
    private static void writeQueryVector(WVTWordVector q) throws Exception {
        FileWriter outFile = new FileWriter(QUERY_VECTOR_FILE);
        try {
            WordVectorWriter writer = new WordVectorWriter(outFile, true);
            try {
                writer.write(q);
            } finally {
                writer.close();
            }
        } finally {
            outFile.close();
        }
    }

    /** Runs one classification with empty arguments (they are ignored by {@link #category}). */
    public static void main(String[] args) {
        WVTManager wm = new WVTManager();
        wm.category("", "");
    }
}
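// Residue from the web code viewer this file was copied from (keyboard shortcut help):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -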