📄 docfrequencyselector.java
字号:
package dragon.ir.classification.featureselection;import dragon.ir.classification.DocClassSet;import dragon.ir.index.*;import dragon.matrix.*;import java.util.ArrayList;/** * <p>Unsupervised Feature Selector which exclude features with its document frequency less than a given threshold</p> * <p>Please refer the paper below for details of the algorithm.<br> * Yang, Y. and Pedersen, J.O., "A comparative study on feature selection in text categorization," * In Proceedings of International Conference on Machine Learning, 1997, pp. 412-420. * </p> * <p>Copyright: Copyright (c) 2005</p> * <p>Company: IST, Drexel University</p> * @author Davis Zhou * @version 1.0 */public class DocFrequencySelector extends AbstractFeatureSelector implements java.io.Serializable { private static final long serialVersionUID = 1L; private int minDocFrequency; public DocFrequencySelector(int minDocFrequency) { this.minDocFrequency =minDocFrequency; } protected int[] getSelectedFeatures(SparseMatrix doctermMatrix, DocClassSet trainingSet){ ArrayList list; int[] featureMap; int i,termNum; featureMap=getTermDocFrequency(doctermMatrix,trainingSet); termNum=featureMap.length; list=new ArrayList(termNum); for(i=0;i<termNum;i++){ if(featureMap[i]>=minDocFrequency) list.add(new Integer(i)); } featureMap=new int[list.size()]; for(i=0;i<featureMap.length;i++) featureMap[i]=((Integer)list.get(i)).intValue(); return featureMap; } protected int[] getSelectedFeatures(IndexReader indexReader, DocClassSet trainingSet){ IntDenseMatrix termDistri; ArrayList list; IRTerm curTerm; int[] featureMap; int i,termNum; termDistri=getTermDistribution(indexReader,trainingSet); termNum=termDistri.columns(); list=new ArrayList(termNum); for(i=0;i<termNum;i++){ if(termDistri.getColumnSum(i)<=0) continue; curTerm=indexReader.getIRTerm(i); if(curTerm.getDocFrequency()>=minDocFrequency) list.add(curTerm); } featureMap=new int[list.size()]; for(i=0;i<featureMap.length;i++) featureMap[i]=((IRTerm)list.get(i)).getIndex(); return featureMap; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -