⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 articleclassifierimpl.java

📁 一个简单的kNN算法实现
💻 JAVA
字号:
package article.service.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import article.entity.Article;
import article.entity.Category;

/**
 * Assigns articles to categories by comparing term-frequency vectors.
 *
 * <p>The text of the sample articles of every category, and the text of every
 * article to classify, is indexed into an in-memory Lucene index analysed by
 * {@link PaodingAnalyzer}. The term vectors read back from those indexes are
 * then compared pairwise with {@link #caculateVector(Map, Map)}; an article is
 * assigned to a category when the resulting angle is below the configured
 * threshold.
 *
 * @author ahuaxuan(aaron zhang)
 * @since 2008-2-18
 * @version $Id$
 */
public class ArticleClassifierImpl {

	private static final Log logger = LogFactory.getLog(ArticleClassifierImpl.class);

	/** Value returned by {@link #caculateVector(Map, Map)} when no angle can be computed. */
	private static final double NO_MATCH_ANGLE = 20;

	/** Category terms are kept by {@link #filterVectorMap(Map)} only if their frequency exceeds this value. */
	private static final int CATEGORY_TERM_FREQUENCY_FLOOR = 3;

	/** Category terms are kept by {@link #filterVectorMap(Map)} only if they are longer than this value. */
	private static final int CATEGORY_TERM_LENGTH_FLOOR = 1;

	/** Exclusive upper bound on the angle (in radians) for an article to be assigned to a category. */
	private double vectorGene = 2;

	/**
	 * Classifies the given articles against the given categories.
	 *
	 * @param categoryList categories, each carrying its sample articles
	 * @param articleList articles to classify
	 * @return a map from category id to the ids of the articles assigned to it;
	 *         an empty map if classification fails for any reason (the failure
	 *         is logged, not rethrown)
	 */
	public Map<String, List<String>> matchArticle(List<Category> categoryList, List<Article> articleList) {
		try {
			Map<String, Map<String, Integer>> classVector = getClassVector(categoryList);
			Map<String, Map<String, Integer>> articleVector = getArticleVector(articleList);

			return analyse(articleVector, classVector);
		} catch (Exception e) {
			// Deliberate best effort: callers get an empty result instead of an exception.
			logger.error("Failed to classify articles against categories", e);
			return Collections.emptyMap();
		}
	}

	/**
	 * Compares every article vector with every (filtered) category vector.
	 *
	 * @param articleVectorMap term-frequency vectors keyed by article id
	 * @param categoryVectorMap term-frequency vectors keyed by category id
	 * @return a map from category id to the ids of all articles whose angle to
	 *         that category is below the threshold; every category id is
	 *         present, possibly with an empty list
	 */
	protected Map<String, List<String>> analyse(Map<String, Map<String, Integer>> articleVectorMap, Map<String, Map<String, Integer>> categoryVectorMap) {

		Map<String, List<String>> result = new HashMap<String, List<String>>();
		boolean debugEnabled = logger.isDebugEnabled();

		for (Entry<String, Map<String, Integer>> categoryEntry : categoryVectorMap.entrySet()) {
			// The filtered vector depends on the category only, so build it once per category.
			Map<String, Integer> categoryVector = filterVectorMap(categoryEntry.getValue());

			List<String> matchedArticleIds = new ArrayList<String>();
			Map<String, String> matchedAngles = new HashMap<String, String>();

			for (Entry<String, Map<String, Integer>> articleEntry : articleVectorMap.entrySet()) {
				double acos = caculateVector(articleEntry.getValue(), categoryVector);
				if (acos < vectorGene) {
					matchedArticleIds.add(articleEntry.getKey());
					if (debugEnabled) {
						matchedAngles.put(articleEntry.getKey(), String.valueOf(acos));
					}
				}
			}

			if (debugEnabled) {
				logger.debug(new StringBuilder().append("++++++++++++ ").append("article vector informations of category which id is ")
												.append(categoryEntry.getKey()).append(" ++++++++"));
				for (Entry<String, String> e : matchedAngles.entrySet()) {
					logger.debug(new StringBuilder().append("articleId=").append(e.getKey())
													.append("---------").append("acos value=").append(e.getValue()));
				}
			}

			result.put(categoryEntry.getKey(), matchedArticleIds);
		}
		return result;
	}

	/**
	 * Builds one accumulated term-frequency vector per category from the
	 * content of the category's sample articles.
	 *
	 * @param categoryList categories to index; may be {@code null} or empty
	 * @return term-frequency vectors keyed by category id; an empty map when
	 *         {@code categoryList} is {@code null} or empty
	 * @throws Exception if the in-memory index cannot be written or read
	 */
	protected Map<String, Map<String, Integer>> getClassVector(List<Category> categoryList) throws Exception {

		if (categoryList == null || categoryList.size() == 0) {
			if (logger.isDebugEnabled()) {
				logger.debug("The list of new categoryList which should be classified is null or size = 0");
			}
			return Collections.emptyMap();
		}

		Map<String, Map<String, Integer>> categoryMap = new HashMap<String, Map<String, Integer>>();

		Directory ramDir = new RAMDirectory();
		IndexWriter writer = new IndexWriter(ramDir, new PaodingAnalyzer(), true);
		try {
			for (Category cRc : categoryList) {
				for (Article item : cRc.getArticleList()) {
					Document doc = new Document();
					doc.add(new Field("description", item.getContent(), Field.Store.NO,
							Field.Index.TOKENIZED, TermVector.YES));
					doc.add(new Field("category", cRc.getId().toString(), Field.Store.YES, Field.Index.NO));
					writer.addDocument(doc);
				}
			}
		} finally {
			// Always release the writer, even when adding a document fails.
			writer.close();
		}

		if (logger.isDebugEnabled()) {
			logger.debug("Generate the index in the memory, the size of categoryList list is " + categoryList.size());
		}

		buildContentVectors(ramDir, categoryMap, "category", "description");
		return categoryMap;
	}

	/**
	 * Builds one term-frequency vector per article from the article text.
	 *
	 * @param articleList articles to index; may be {@code null} or empty
	 * @return term-frequency vectors keyed by article id; an empty map when
	 *         {@code articleList} is {@code null} or empty
	 * @throws Exception if the in-memory index cannot be written or read
	 */
	protected Map<String, Map<String, Integer>> getArticleVector(List<Article> articleList) throws Exception {
		if (articleList == null || articleList.size() == 0) {
			if (logger.isDebugEnabled()) {
				logger.debug("The list of articles which should be classified is null or size = 0");
			}
			// Nothing to index; also avoids a NullPointerException on a null list.
			return Collections.emptyMap();
		}

		Map<String, Map<String, Integer>> articleMap = new HashMap<String, Map<String, Integer>>();

		Directory articleRamDir = new RAMDirectory();
		IndexWriter writer = new IndexWriter(articleRamDir, new PaodingAnalyzer(), true);
		try {
			for (Article article : articleList) {
				Document doc = new Document();
				doc.add(new Field("articleId", article.getId(),
						Field.Store.YES, Field.Index.NO));
				doc.add(new Field("description", article.getText(), Field.Store.NO, Field.Index.TOKENIZED, TermVector.YES));
				writer.addDocument(doc);
			}
		} finally {
			// close() writes out all pending documents, so no separate flush is needed.
			writer.close();
		}

		buildContentVectors(articleRamDir, articleMap, "articleId", "description");
		return articleMap;
	}

	/**
	 * Reads the term vectors of every live document in the index and adds
	 * them to {@code contentMap}, grouped by the stored value of {@code key}.
	 * Documents sharing the same key value are accumulated into one vector.
	 *
	 * @param ramDir index to read
	 * @param contentMap receives the vectors, keyed by the value of {@code key}
	 * @param key name of the stored field whose value identifies the vector
	 * @param fieldName name of the field whose term vector is read
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException if the index cannot be read
	 */
	protected void buildContentVectors(Directory ramDir, Map<String, Map<String, Integer>> contentMap, String key, String fieldName) throws CorruptIndexException, IOException {
		IndexReader reader = IndexReader.open(ramDir);
		try {
			// Document numbers run up to maxDoc(); numDocs() excludes deleted
			// documents and would cut the scan short if the index has deletions.
			int maxDoc = reader.maxDoc();

			for (int docId = 0; docId < maxDoc; docId++) {
				if (reader.isDeleted(docId)) {
					continue;
				}

				Document doc = reader.document(docId);
				String vectorKey = doc.getField(key).stringValue();

				// The key is registered even when the document has no term vector.
				Map<String, Integer> vectorMap = contentMap.get(vectorKey);
				if (vectorMap == null) {
					vectorMap = new TreeMap<String, Integer>();
					contentMap.put(vectorKey, vectorMap);
				}

				TermFreqVector termFreqVector = reader.getTermFreqVector(docId, fieldName);
				if (termFreqVector != null) {
					addTermFreqToMap(vectorMap, termFreqVector);
				}
			}
		} finally {
			reader.close();
		}
	}

	/**
	 * Adds the frequencies of a document's terms to an accumulated vector.
	 *
	 * @param vectorMap accumulated term frequencies; updated in place
	 * @param termFreqv terms and frequencies of one document field
	 */
	protected void addTermFreqToMap(Map<String, Integer> vectorMap, TermFreqVector termFreqv) {
		String[] terms = termFreqv.getTerms();
		int[] freqs = termFreqv.getTermFrequencies();

		for (int i = 0; i < terms.length; i++) {
			Integer current = vectorMap.get(terms[i]);
			int total = (current == null) ? freqs[i] : current.intValue() + freqs[i];
			vectorMap.put(terms[i], Integer.valueOf(total));
		}
	}

	/**
	 * Returns a copy of the given vector reduced to its significant terms:
	 * terms with a frequency greater than 3 that are longer than one
	 * character and are not purely numeric.
	 *
	 * @param map term-frequency vector to filter; not modified
	 * @return a new map containing only the retained terms
	 */
	protected Map<String, Integer> filterVectorMap(Map<String, Integer> map) {
		Map<String, Integer> vectorMap = new HashMap<String, Integer>();

		for (Entry<String, Integer> entry : map.entrySet()) {
			String term = entry.getKey();
			boolean frequentEnough = entry.getValue() > CATEGORY_TERM_FREQUENCY_FLOOR;
			if (frequentEnough && !StringUtils.isNumeric(term) && term.length() > CATEGORY_TERM_LENGTH_FLOOR) {
				vectorMap.put(term, entry.getValue());
			}
		}
		return vectorMap;
	}

	/**
	 * Computes the angle between an article and a category.
	 *
	 * <p>Every term of the article counts as 1 (the article's own frequencies
	 * are not used); the category contributes its frequency for each of those
	 * terms, or 0 when it does not contain the term. The result is the arc
	 * cosine of {@code sum(f) / (sqrt(sum(f * f)) * sqrt(n))}, where {@code f}
	 * ranges over those category frequencies and {@code n} is the number of
	 * article terms.
	 *
	 * @param articleVectorMap term-frequency vector of the article
	 * @param classVectorMap term-frequency vector of the category
	 * @return the angle in radians, {@code 0} meaning the closest match; or
	 *         {@code 20} when either argument is {@code null} or the article
	 *         shares no term with the category
	 */
	public double caculateVector(Map<String, Integer> articleVectorMap, Map<String, Integer> classVectorMap) {
		if (articleVectorMap == null || classVectorMap == null) {
			if (logger.isDebugEnabled()) {
				logger.debug("itemVectorMap or classVectorMap is null");
			}

			return NO_MATCH_ANGLE;
		}

		// long accumulators: squared frequencies overflow an int quickly.
		long dotProduct = 0;
		long sumOfSquares = 0;

		for (String word : articleVectorMap.keySet()) {
			Integer categoryWordFreq = classVectorMap.get(word);
			if (categoryWordFreq != null) {
				long freq = categoryWordFreq.intValue();
				dotProduct += freq;
				sumOfSquares += freq * freq;
			}
		}

		if (sumOfSquares == 0) {
			// No shared term (or an empty article): the ratio would be 0/0, i.e. NaN.
			return NO_MATCH_ANGLE;
		}

		// One square root over the product loses less precision than multiplying
		// two square roots, which can land just below the exact value.
		double denominator = Math.sqrt((double) sumOfSquares * articleVectorMap.size());
		double ratio = dotProduct / denominator;

		// Math.acos returns NaN outside [-1, 1]; clamp away any remaining rounding error
		// so that a perfect match yields 0 instead of being discarded.
		return Math.acos(Math.max(-1.0, Math.min(1.0, ratio)));
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -