⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 wvtmanager.java

📁 中文自动分类。使用spider抓取网络信息
💻 JAVA
字号:
package text_category;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.Calendar;
import java.util.List;

import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.config.WVTConfigurationFact;
import edu.udo.cs.wvtool.generic.inputfilter.SimpleTagIgnoringReader;
import edu.udo.cs.wvtool.generic.inputfilter.WVTInputFilter;
import edu.udo.cs.wvtool.generic.output.WordVectorWriter;
import edu.udo.cs.wvtool.generic.stemmer.DummyStemmer;
import edu.udo.cs.wvtool.generic.stemmer.WVTStemmer;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.generic.vectorcreation.TFIDF;
import edu.udo.cs.wvtool.generic.wordfilter.DummyWordFilter;
import edu.udo.cs.wvtool.generic.wordfilter.WVTWordFilter;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTFileInputList;
import edu.udo.cs.wvtool.main.WVTWordVector;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.wordlist.WVTWordList;

/**
 * Classifies a document into one of a fixed set of categories using the
 * WVTool word-vector pipeline and a k-nearest-neighbour learner ({@code KNN}).
 *
 * <p>The word list and the training vectors are cached in text files; when a
 * cache file cannot be loaded it is rebuilt from the training corpus and
 * written back to disk.
 *
 * <p>All locations are absolute, hard-coded paths (see the constants below).
 */
public class WVTManager {

	/** Directory that holds one sub-directory of crawled HTML per category. */
	private static final String CORPUS_DIR = "D://Program Files//heritrix-1.12.1//jobs//";

	/** Directory that holds the cached word list and word-vector files. */
	private static final String CACHE_DIR = "D://Program Files//Apache Software Foundation//Tomcat 5.5//webapps//text_category//";

	private static final String WORD_LIST_FILE = CACHE_DIR + "wordlist.txt";
	private static final String VECTORS_FILE = CACHE_DIR + "wv.txt";
	private static final String QUERY_VECTOR_FILE = CACHE_DIR + "wv1.txt";

	/** Document that is currently classified on every call (see {@link #category}). */
	private static final String QUERY_DOCUMENT = CORPUS_DIR + "test//sports.htm";

	/** Category sub-directories; the array index is the class label passed to WVTool. */
	private static final String[] CATEGORY_DIRS = {
		"auto", "edu", "ent", "finance", "house",
		"mil", "olympics", "politics", "sports", "tech"
	};

	/**
	 * Runs the classification pipeline and returns the learner's result.
	 *
	 * <p>NOTE(review): {@code url} and {@code content} are currently ignored;
	 * the method always classifies the fixed document {@link #QUERY_DOCUMENT}.
	 * The original source carried a commented-out variant that built the query
	 * vector from {@code content} instead - confirm which one is intended.
	 *
	 * @param url     address of the document to classify (currently unused)
	 * @param content text of the document to classify (currently unused)
	 * @return whatever {@code KNN.LazyLearning} returns, or {@code null} when
	 *         the word list, the training vectors or the query vector could
	 *         not be obtained, or when any exception occurs
	 */
	public List category(String url, String content){
		long startMillis = System.currentTimeMillis();
		try
		{
			WVTool wvt = new WVTool(false);
			WVTConfiguration config = createConfiguration();
			WVTFileInputList trainingList = createTrainingList();

			WVTWordList wordList = loadOrCreateWordList(wvt, trainingList, config);
			if (wordList == null)
			{
				return null;
			}

			WVTWordVector[] vectors = loadOrCreateVectors(wvt, trainingList, config, wordList);
			if (vectors == null)
			{
				return null;
			}

			// NOTE(review): the vector-creation and output rules are added to
			// config only when the training vectors had to be rebuilt, so the
			// query vector below may be created under a different configuration
			// depending on cache state - confirm this is intended.
			WVTDocumentInfo d = new WVTDocumentInfo(QUERY_DOCUMENT, "html", "", "chinese");
			WVTWordVector q = wvt.createVector(null, d, config, wordList);
			if (q == null)
			{
				return null;
			}
			storeQueryVector(q);

			KNN knn = new KNN();
			List result = knn.LazyLearning(q, vectors, trainingList.getNumClasses());

			System.out.println("using time: " + (System.currentTimeMillis() - startMillis) + "ms");

			return result;
		}catch (Exception e)
		{
			// Callers rely on the null-on-failure contract, so keep it, but
			// report the cause instead of discarding it silently.
			System.err.println("WVTManager.category failed: " + e);
			e.printStackTrace();
			return null;
		}
	}

	/** Builds the WVTool configuration: tag-ignoring reader, Chinese tokenizer, no stemming, no word filtering. */
	private static WVTConfiguration createConfiguration()
	{
		WVTConfiguration config = new WVTConfiguration();

		WVTStemmer stemmer = new DummyStemmer();
		WVTInputFilter inputFilter = new SimpleTagIgnoringReader();
		WVTWordFilter wordFilter = new DummyWordFilter();
		WVTTokenizer tokenizer = new ChineseTokenizer();

		config.setConfigurationRule(WVTConfiguration.STEP_INPUT_FILTER, new WVTConfigurationFact(inputFilter));
		config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));
		config.setConfigurationRule(WVTConfiguration.STEP_WORDFILTER, new WVTConfigurationFact(wordFilter));
		config.setConfigurationRule(WVTConfiguration.STEP_TOKENIZER, new WVTConfigurationFact(tokenizer));
		return config;
	}

	/** Builds the training input list with one entry per category directory, labelled by array index. */
	private static WVTFileInputList createTrainingList()
	{
		WVTFileInputList list = new WVTFileInputList(CATEGORY_DIRS.length);
		for (int label = 0; label < CATEGORY_DIRS.length; label++)
		{
			list.addEntry(new WVTDocumentInfo(CORPUS_DIR + CATEGORY_DIRS[label], "html", "", "chinese", label));
		}
		return list;
	}

	/** Loads the cached word list, or builds it from the training list and writes it to the cache file. */
	private static WVTWordList loadOrCreateWordList(WVTool wvt, WVTFileInputList trainingList, WVTConfiguration config) throws Exception
	{
		WVTWordList wordList = wvt.createWordListFromFile(WORD_LIST_FILE);
		if (wordList != null)
		{
			return wordList;
		}

		wordList = wvt.createWordList(trainingList, config);
		if (wordList == null)
		{
			// Nothing to store; do not create or truncate the cache file.
			return null;
		}

		FileWriter out = new FileWriter(WORD_LIST_FILE);
		try
		{
			wordList.store(out);
		}
		finally
		{
			// Closing an already-closed FileWriter is a no-op, so this is safe
			// whether or not store() closes the writer itself.
			out.close();
		}
		return wordList;
	}

	/** Loads the cached training vectors, or creates TFIDF vectors from the training list and writes them to the cache file. */
	private static WVTWordVector[] loadOrCreateVectors(WVTool wvt, WVTFileInputList trainingList, WVTConfiguration config, WVTWordList wordList) throws Exception
	{
		WVTWordVector[] vectors = wvt.createVectorsFromFile(VECTORS_FILE);
		if (vectors != null)
		{
			return vectors;
		}

		FileWriter out = new FileWriter(VECTORS_FILE);
		try
		{
			WordVectorWriter writer = new WordVectorWriter(out, true);

			config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(writer));
			config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

			vectors = wvt.createVectors(trainingList, config, wordList, null);

			writer.close();
		}
		finally
		{
			// Release the file handle even when vector creation fails.
			out.close();
		}
		return vectors;
	}

	/** Writes the query vector to its output file. */
	private static void storeQueryVector(WVTWordVector query) throws Exception
	{
		FileWriter out = new FileWriter(QUERY_VECTOR_FILE);
		try
		{
			WordVectorWriter writer = new WordVectorWriter(out, true);
			writer.write(query);
			writer.close();
		}
		finally
		{
			// Release the file handle even when writing fails.
			out.close();
		}
	}

	/**
	 * Command-line entry point: runs one classification with empty arguments.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)
	{
		WVTManager wm = new WVTManager();
		wm.category("", "");
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -