📄 knn.java
字号:
package edu.ustc.cs.classfier;
import edu.ustc.cs.process.Corpus;
import edu.ustc.cs.structer.WordVector;
import edu.ustc.cs.structer.CategoryResult;
import java.util.*;
/**
 * k-nearest-neighbour text classifier that compares documents by the cosine
 * similarity of their word-weight vectors.
 *
 * <p>Originally created by dolphin on 2008-11-30.
 */
public class KNN {
    /** Upper bound on the number of nearest training documents consulted per classification. */
    private final int k = 20;
    /** Number of leading positions of the class-score array that are ranked after scoring. */
    private final int topN = 1;
    /** Per-class accumulated similarity of the most recent {@link #LazyLearning} call (ranked in place). */
    private double[] classSim = null;
    /** Class index mapped to class label, numbered in the iteration order of {@code Corpus.classLabel}. */
    private Map<Integer, String> indexmap = null;

    /**
     * Builds the class-index to class-label table from {@code Corpus.classLabel},
     * assigning consecutive indices starting at 0.
     */
    public KNN() {
        indexmap = new HashMap<Integer, String>();
        Iterator<?> it = Corpus.classLabel.iterator();
        int classID = 0;
        while (it.hasNext()) {
            indexmap.put(classID, (String) it.next());
            ++classID;
        }
    }

    /**
     * Returns the Euclidean length of a word vector, i.e. the square root of
     * the sum of its squared weights.
     *
     * @param v vector whose word map holds numeric weights
     * @return the vector length; 0 when the word map is empty
     */
    public double QuarForVector(WordVector v) {
        return norm(v.getWordMap());
    }

    /**
     * Classifies {@code v} against the training documents and stores the winning
     * class index on {@code v}'s document info.
     *
     * <p>The similarity between {@code v} and every training vector is the cosine
     * of their word-weight maps. The similarities of the (at most) {@code k} most
     * similar training documents are summed per class, and the class with the
     * largest sum wins; on equal sums the lowest class index wins. When
     * {@code vectors} is empty every class scores 0 and class 0 is assigned.
     *
     * <p>Neither the {@code vectors} array nor this classifier's {@code k} is
     * modified by the call.
     *
     * @param v          document to classify; the call is a no-op when {@code null}
     * @param vectors    training documents; the call is a no-op when {@code null}
     * @param numClasses number of classes; class values reported by the training
     *                   documents are used as indices into an array of this size
     * @throws IllegalArgumentException if {@code numClasses} is not positive
     */
    public void LazyLearning(WordVector v, WordVector[] vectors, int numClasses) {
        if (v == null || vectors == null) {
            return;
        }
        if (numClasses <= 0) {
            throw new IllegalArgumentException("numClasses must be positive: " + numClasses);
        }

        classSim = new double[numClasses]; // Java zero-initialises new arrays
        // Local bound: the configured k must not shrink because one call saw few documents.
        final int neighbours = Math.min(k, vectors.length);

        // The query's map and length are the same for every comparison, so compute them once.
        Map<?, ?> queryWeights = v.getWordMap();
        double queryNorm = norm(queryWeights);

        // sim[i] is the similarity of the training document at vectors[order[i]].
        double[] sim = new double[vectors.length];
        int[] order = new int[vectors.length];
        for (int i = 0; i < sim.length; i++) {
            order[i] = i;
            sim[i] = cosine(queryWeights, queryNorm, vectors[i].getWordMap());
        }

        // Partial selection sort: move the 'neighbours' largest similarities to the front.
        // An index array is permuted instead of the caller's vectors array.
        for (int i = 0; i < neighbours; i++) {
            for (int j = i + 1; j < sim.length; j++) {
                if (sim[j] > sim[i]) {
                    double simTemp = sim[i];
                    sim[i] = sim[j];
                    sim[j] = simTemp;
                    int orderTemp = order[i];
                    order[i] = order[j];
                    order[j] = orderTemp;
                }
            }
        }

        // Each of the nearest neighbours votes for its class with its similarity.
        for (int i = 0; i < neighbours; i++) {
            int classId = vectors[order[i]].getDocumentInfo().getClassValue();
            classSim[classId] += sim[i];
        }

        // Rank the class scores so that the first 'ranked' slots hold the best classes.
        int[] index = new int[classSim.length];
        for (int i = 0; i < index.length; i++) {
            index[i] = i;
        }
        final int ranked = Math.min(topN, classSim.length);
        for (int i = 0; i < ranked; i++) {
            for (int j = i + 1; j < classSim.length; j++) {
                if (classSim[j] > classSim[i]) {
                    double scoreTemp = classSim[i];
                    classSim[i] = classSim[j];
                    classSim[j] = scoreTemp;
                    int indexTemp = index[i];
                    index[i] = index[j];
                    index[j] = indexTemp;
                }
            }
        }

        // Setter name is spelled exactly as the document-info API declares it.
        v.getDocumentInfo().setClassVaue(index[0]);
    }

    /** Returns the square root of the sum of the squared numeric values of {@code weights}. */
    private static double norm(Map<?, ?> weights) {
        double sumOfSquares = 0;
        for (Object value : weights.values()) {
            double weight = ((Number) value).doubleValue();
            sumOfSquares += weight * weight;
        }
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Returns the cosine similarity of two word-weight maps, or 0 when either
     * has zero length (the quotient would otherwise be 0/0, i.e. NaN).
     */
    private static double cosine(Map<?, ?> queryWeights, double queryNorm, Map<?, ?> docWeights) {
        double dot = 0;
        for (Map.Entry<?, ?> entry : queryWeights.entrySet()) {
            if (docWeights.containsKey(entry.getKey())) {
                double queryWeight = ((Number) entry.getValue()).doubleValue();
                double docWeight = ((Number) docWeights.get(entry.getKey())).doubleValue();
                dot += queryWeight * docWeight;
            }
        }
        double denominator = queryNorm * norm(docWeights);
        if (denominator == 0.0) {
            return 0.0;
        }
        return dot / denominator;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -