/* adaboost_common.cpp */
{
cat[m - 1] = 1;
}
randomForest *zer_array = new randomForest(1,1);
if (missingValueTrain == 1){
/* GIVE CLASS WEIGHTS */
for (j = 1; j <= 2; ++j) {
classwt[j - 1] = (float)1.;
}
/************************************************************/
/* There are a number of ways of replacing missing values.  */
/************************************************************/
/******************************************************************************************/
/* This method begins by doing a rough and inaccurate fill-in of the missing values.     */
/* It then does a forest run and computes proximities. If x(m,n) is a missing continuous */
/* value, its fill is estimated as an average over the non-missing values of the mth     */
/* variable, weighted by the proximities between the nth case and the non-missing cases. */
/* If it is a missing categorical variable, it is replaced by the most frequent          */
/* non-missing value, where frequency is weighted by proximity (see the sketch below).   */
/******************************************************************************************/
if (TrainMissingDataProcedure == 1){
zer_array->roughfix(x_train, v, ncase, &mdim, &nsample, xts, &maxcat, cat, &code, nrcat, &maxcat, fill);
}
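/*
   Illustrative sketch only (an assumption, not the roughfix routine called above): proximity-
   weighted imputation of one continuous variable. The names proximityFill, xcol, prox,
   nsample and missingCode are hypothetical and do not exist in this codebase.

   void proximityFill(float* xcol, const float* prox, int nsample, float missingCode)
   {
       for (int n = 0; n < nsample; ++n) {
           if (xcol[n] != missingCode) continue;           // only fill missing entries
           float num = 0.f, den = 0.f;
           for (int j = 0; j < nsample; ++j) {
               if (j == n || xcol[j] == missingCode) continue;
               float w = prox[n * nsample + j];            // proximity between cases n and j
               num += w * xcol[j];
               den += w;
           }
           if (den > 0.f) xcol[n] = num / den;             // proximity-weighted average fill
       }
   }
*/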
/********************************************************************************/
/* K-nearest neighbor method to estimate values for the missing expression data.*/
/********************************************************************************/
if (TrainMissingDataProcedure == 2){
zer_array->Knearestneighborfix(x_train, ncase, &mdim, &nsample, cat, &code);
}
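/*
   Illustrative sketch only (an assumption, not the Knearestneighborfix routine called above):
   k-nearest-neighbour imputation of one missing entry x[target][m], assuming <vector>,
   <algorithm> and <utility> are available. All names here are hypothetical.

   float knnFill(float** x, int mdim, int nsample, int target, int m, int k, float missingCode)
   {
       std::vector< std::pair<float,float> > cand;          // (distance, observed value) pairs
       for (int n = 0; n < nsample; ++n) {
           if (n == target || x[n][m] == missingCode) continue;
           float d = 0.f;
           for (int v = 0; v < mdim; ++v)                   // distance over jointly observed variables
               if (v != m && x[n][v] != missingCode && x[target][v] != missingCode)
                   d += (x[n][v] - x[target][v]) * (x[n][v] - x[target][v]);
           cand.push_back(std::make_pair(d, x[n][m]));
       }
       std::sort(cand.begin(), cand.end());                 // nearest candidates first
       float sum = 0.f; int used = 0;
       for (int i = 0; i < (int)cand.size() && i < k; ++i) { sum += cand[i].second; ++used; }
       return used ? sum / used : missingCode;              // average of the k nearest observed values
   }
*/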
/****************************************************************************************/
/* The Ignore procedure simply discards the missing data and returns a smaller matrix  */
/* without the -999 placeholder values (see the sketch below).                         */
/****************************************************************************************/
if (TrainMissingDataProcedure == 3){
zer_array->Ignorefix(x_train,v, ncase, &mdim, &nsample, cat, &code, ncolumn);
}
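/*
   Illustrative sketch only (an assumption, not the Ignorefix routine called above): compact a
   row-major matrix by dropping every case (row) that still contains the -999 missing code.
   The names dropMissingRows, x, mdim and nsample are hypothetical.

   int dropMissingRows(float* x, int mdim, int nsample, float missingCode)
   {
       int kept = 0;
       for (int n = 0; n < nsample; ++n) {
           bool complete = true;
           for (int m = 0; m < mdim; ++m)
               if (x[n * mdim + m] == missingCode) { complete = false; break; }
           if (complete) {
               for (int m = 0; m < mdim; ++m)               // copy the complete case downwards
                   x[kept * mdim + m] = x[n * mdim + m];
               ++kept;
           }
       }
       return kept;                                          // the new, smaller number of cases
   }
*/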
/****************************/
/* COUNT CLASS POPULATIONS */
/****************************/
zer_array->zerv(nc, &nclass);
for (n = 1; n <= nsample_train; ++n) {
if (cl[n - 1] < 1 || cl[n - 1] > 2) {
if (com_pro.show_action)
printm("error in class label");
}
++nc[cl[n - 1] - 1];
}
}
/****************************************************************/
/* Subroutines/functions that zero the new arrays, e.g. countts, counttr, ...  */
/****************************************************************/
zer_array->zermr(countts, &nclass, &ntest);
zer_array->zerm(counttr, &nclass, &nsample);
zer_array->zerv(out, &nsample);
zer_array->zervr(tgini, &mdim);
zer_array->zerv(msum, &mdim);
zer_array->zermd(prox, &value_one, &value_one);
/***************************************************************************************/
/* The function does a number of things:                                               */
/* 1. Counts how many times each class appears in the class column, e.g. for          */
/*    satimage.tra class 1 appears 38 times, class 2 appears 436 times, ...           */
/* 2. Divides each class count (e.g. 38) by the number of rows (e.g. 2296).           */
/* 3. Normalizes the results into the per-case weight array wtt; a class with no      */
/*    occurrences leaves its wtt entries at 0.00 (see the sketch below).              */
/***************************************************************************************/
if (missingValueTrain == 0){
zer_array->prep(cl, &nsample, &nclass, &ipi, pi, pid, nc, wtt);
}
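/*
   Illustrative sketch only (an assumption, not the prep routine called above): count the class
   populations, turn them into class priors, and spread a normalized weight over the cases.
   The names cl, nc, pi and wtt follow the surrounding code, but makeCaseWeights and its body
   are a guess at the intent described in the comment, not the actual implementation.

   void makeCaseWeights(const int* cl, int nsample, int nclass, int* nc, float* pi, float* wtt)
   {
       for (int j = 0; j < nclass; ++j) nc[j] = 0;
       for (int n = 0; n < nsample; ++n) ++nc[cl[n] - 1];       // class populations (labels are 1-based)

       for (int j = 0; j < nclass; ++j)
           pi[j] = (float)nc[j] / (float)nsample;               // class priors, e.g. 38 / 2296

       for (int n = 0; n < nsample; ++n)                        // equal normalized weight for every case
           wtt[n] = (nc[cl[n] - 1] > 0) ? 1.f / (float)nsample  // of an observed class, 0.00 otherwise
                                        : 0.f;
   }
*/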
/***********************************************************************************/
/* submakea constructs the mdim x nsample int array a. If there are fewer than    */
/* 32,000 cases, this can be declared int*2, otherwise int*4. For each            */
/* numerical variable with values x(m,n), n=1,...,nsample, the x-values are sorted*/
/* from lowest to highest. Denote these by xs(m,n). Then a(m,n) is the case       */
/* number in which xs(m,n) occurs. The b matrix is also constructed here.         */
/* If the mth variable is categorical, then a(m,n) is the category of the nth     */
/* case (see the sketch below).                                                    */
/***********************************************************************************/
zer_array->makea(x_train,&mdim,&nsample,cat,isort,v,at,b,&mdim,ncolumn);
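/*
   Illustrative sketch only (an assumption, not the makea routine called above): for one
   numerical variable, sort its values and record in a(.) the case number that supplies the
   nth smallest value (an argsort), assuming <vector>, <algorithm> and <utility> are available.
   The name argsortVariable is hypothetical.

   void argsortVariable(const float* xm, int* a, int nsample)
   {
       std::vector< std::pair<float,int> > order(nsample);
       for (int n = 0; n < nsample; ++n) order[n] = std::make_pair(xm[n], n);
       std::sort(order.begin(), order.end());                   // sort the x-values, keeping case numbers
       for (int n = 0; n < nsample; ++n)
           a[n] = order[n].second + 1;                          // 1-based case number of the nth smallest value
   }
*/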
if (ntest > 1)
{
for (m = 1; m <= 36; ++m)
{
cat[m - 1]=1;
}
for (j = 1; j <= 6; ++j)
{
pid[j - 1]=0.000000;
}
}
/*************/
/* START RUN */
/*************/
/************************************************/
/* Build new array by zerv, zerm .. Subroutines */
/************************************************/
zer_array->zerv(nodestatus, &nrnodes);
zer_array->zerm(treemap, &value_two, &nrnodes);
zer_array->zervr(xbestsplit, &nrnodes);
zer_array->zerv(nodeclass, &nrnodes);
zer_array->zerv(jin, &nsample);
zer_array->zervr(tclasspop, &nclass);
zer_array->zervr(win, &nsample);
zer_array->zerv(bestvar, &nrnodes);
if (missingValueTrain == 1){
for (n = 1; n <= nsample; ++n) {
/**************************************************************/
/* k is a random case index: k = (int)(rrand() * nsample) + 1 */
/**************************************************************/
float ret_val=0;
zer_array->rrand(&ntest,&ret_val);
k= (ret_val * nsample) + 1;
if (k >= nsample){
zer_array->rrand(&ntest,&ret_val);
k= (ret_val * nsample) + 1;
}
/************************************************************/
/* The array "win" accumulates the class weight of case k. */
/************************************************************/
win[k - 1] += classwt[cl[k - 1] - 1];
/**************************************************/
/* The array "jin" is also incremented at index k */
/**************************************************/
++jin[k - 1];
/*********************************************************************************************/
/* Using k we look up the class label in the data file, e.g. k=300 gives cl[k-1]=3 and      */
/* k=46 gives cl[k-1]=7, and we update the array "tclasspop": the entry for that class is   */
/* incremented by its class weight, and stays 0 for classes that are never sampled.         */
/*********************************************************************************************/
tclasspop[cl[k - 1] - 1] += classwt[cl[k - 1] - 1];
}
}
if (missingValueTrain == 0){
for (n = 1; n <= nsample; ++n)
{
/****************************/
/* k is a random case index */
/****************************/
float ret_val=0;
zer_array->rrand(&ntest,&ret_val);
k= (ret_val * nsample) + 1;
//k = (int) (rrand(&ntest) * 2296) + 1;
if (k >= nsample){
zer_array->rrand(&ntest,&ret_val);
k= (ret_val * nsample) + 1;
}
/*********************************************************************************************/
/* Using k we look up the class label in the data file, e.g. k=300 gives cl[k-1]=3 and      */
/* k=46 gives cl[k-1]=7, and we update the array "tclasspop": the entry for that class is   */
/* incremented by the case weight wtt[k-1], and stays 0 for classes that are never sampled. */
/*********************************************************************************************/
/*********************************************************************************************/
tclasspop[cl[k - 1] - 1] += wtt[k - 1];
/***************************************************************/
/* The array "win" is incremented by the case weight wtt[k-1] */
/***************************************************************/
win[k - 1] += wtt[k - 1];
/*************************************************************/
/* The array "jin" is set to 1 at index k (case k is in-bag) */
/*************************************************************/
jin[k - 1] = 1;
}
}
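/*
   Illustrative sketch only (an assumption, a compact restatement of the sampling loop above):
   draw a bootstrap sample of nsample cases with replacement and accumulate the in-bag flags
   (jin), in-bag weights (win) and the sampled class populations (tclasspop). The names
   bootstrapSample and uniform01 are hypothetical; uniform01() stands in for rrand().

   void bootstrapSample(const int* cl, const float* wtt, int nsample, int nclass,
                        int* jin, float* win, float* tclasspop, float (*uniform01)())
   {
       for (int j = 0; j < nclass; ++j) tclasspop[j] = 0.f;
       for (int n = 0; n < nsample; ++n) { jin[n] = 0; win[n] = 0.f; }

       for (int n = 0; n < nsample; ++n) {
           int k = (int)(uniform01() * nsample);               // random case index, 0..nsample-1
           if (k >= nsample) k = nsample - 1;                  // guard against uniform01() returning 1.0
           jin[k] = 1;                                         // case k is in the bag
           win[k] += wtt[k];                                   // accumulate its weight
           tclasspop[cl[k] - 1] += wtt[k];                     // and its class's population weight
       }
   }
*/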
/***************************************************************************************************/
/* The function/subroutine builds a new array holding the location of each x-value before the     */
/* sort, e.g. for column 1, before the sort: v[1909]=40, after the sort: v[1]=40 with isort=1909. */
/* In other words, this subroutine copies from k[m1 + n1 * k_dim1] to j[m1 + n1 * j_dim1],        */
/* giving a new "j" array: j[]=1909, j[]=1576, ...                                                 */
/***************************************************************************************************/
zer_array->eqm(a, at, &mdim, &nsample,ncolumn);
/*********************************************************************************************************/
/* The function/subroutine updates the "a" array, e.g. a[37]=2005 -> a[37]=1909, a[73]=530 -> a[73]=1576 */
/*********************************************************************************************************/
zer_array->moda(a, &nuse, &nsample, &mdim, cat, &ntest, ncase, jin, ta,ncolumn);
/***********************************************************************************************/
/* Buildtree consists of repeated calls to two subroutines, Findbestsplit and Movedata. */
/* Findbestsplit does just that--it finds the best split of the current */
/* node. Movedata moves the data in the split node right and left so that the data */
/* corresponding to each child node is contiguous. */
/* The buildtree bookkeeping is different from that in Friedman's original CART program. */
/* ncur is the total number of nodes to date. nodestatus(k)=1 if the kth node has been split. */
/* nodestatus(k)=2 if the node exists but has not yet been split, and =-1 if the node is */
/* terminal. A node is terminal if its size is below a threshold value, or if it is all */
/* one class, or if all the x-values are equal. If the current node k is split, then its */
/* children are numbered ncur+1 (left), and ncur+2(right), ncur increases to ncur+2 and */
/* the next node to be split is numbered k+1. When no more nodes can be split, buildtree */
/* returns to the main program. */
/***********************************************************************************************/
zer_array->buildtree(a, b, cl, cat, &mdim, &nsample, &nclass, treemap, bestvar,
bestsplit, bestsplitnext, tgini, nodestatus, nodepop, nodestart, classpop, tclasspop, tclasscat, ta, &nrnodes,
idmove, &ntest, ncase, parent, jin, &nclass, iv, nodeclass, &ndbigtree, win, wr, wc, wl, &mdim, &nuse);
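/*
   Illustrative sketch only (an assumption, not the buildtree routine called above): the node
   bookkeeping described in the comment. ncur is the index of the last node created so far;
   nodestatus[k] is 2 while node k is waiting to be split, 1 once it has been split, and -1
   when it is terminal. isTerminal and splitNode are hypothetical callbacks.

   void growTree(int nrnodes, int* nodestatus,
                 bool (*isTerminal)(int node), void (*splitNode)(int parent, int left, int right))
   {
       int ncur = 0;                                  // node 0 is the root
       nodestatus[0] = 2;                             // the root exists but has not been split yet
       for (int k = 0; k <= ncur; ++k) {
           if (nodestatus[k] != 2) continue;          // already split or terminal
           if (isTerminal(k) || ncur + 2 >= nrnodes) {// too small, pure, constant x, or out of room
               nodestatus[k] = -1;
               continue;
           }
           splitNode(k, ncur + 1, ncur + 2);          // children are numbered ncur+1 (left), ncur+2 (right)
           nodestatus[k] = 1;                         // node k has been split
           nodestatus[ncur + 1] = 2;                  // both children exist and wait to be split
           nodestatus[ncur + 2] = 2;
           ncur += 2;
       }
   }
*/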
/*************************************************************************************/
/* This subroutine takes the splits on numerical variables and translates them */
/* back into x-values. It also unpacks each categorical split into a 32-dimensional */
/* vector with components of zero or one--a one indicates that the corresponding */
/* category goes left in the split. */
/*************************************************************************************/
zer_array->xtranslate(x_train, &mdim, &nrnodes, &nsample, bestvar, bestsplit, bestsplitnext, xbestsplit,
nodestatus, cat, &ndbigtree);
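/*
   Illustrative sketch only (an assumption, not the xtranslate routine called above): unpack an
   integer-coded categorical split into a 32-component 0/1 vector; a 1 in position c means that
   category c+1 goes to the left child. The name unpackCategoricalSplit is hypothetical.

   void unpackCategoricalSplit(unsigned int packedSplit, int goesLeft[32])
   {
       for (int c = 0; c < 32; ++c)
           goesLeft[c] = (int)((packedSplit >> c) & 1u);   // bit c set => category c+1 goes left
   }
*/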
/************************************************/
/* GET OUT-OF-BAG ESTIMATES FOR TEST HYPOTHESIS */
/************************************************/
ntest = test_max_docs;
if (missingValueTest == 1)
{
/******************************************************************************************/
/* There are a number of ways of replacing missing values.                                */
/* This method begins by doing a rough and inaccurate fill-in of the missing values.     */
/* It then does a forest run and computes proximities. If x(m,n) is a missing continuous */
/* value, its fill is estimated as an average over the non-missing values of the mth     */
/* variable, weighted by the proximities between the nth case and the non-missing cases. */
/* If it is a missing categorical variable, it is replaced by the most frequent          */
/* non-missing value, where frequency is weighted by proximity.                          */
/******************************************************************************************/
if (TestMissingDataProcedure == 1){
zer_array->roughfix(x_test, v, ncase, &mdim, &nsample_test, xts, &maxcat, cat, &code, nrcat, &maxcat, fill);
}
/********************************************************************************/
/* K-nearest neighbor method to estimate values for the missing expression data.*/
/********************************************************************************/
if (TestMissingDataProcedure == 2){
zer_array->Knearestneighborfix(x_test, ncase, &mdim, &nsample_test, cat, &code);
}
/****************************************************************************************/
/* The Ignore procedure simply discards the missing data and returns a smaller matrix  */
/* without the -999 placeholder values.                                                */
/****************************************************************************************/
if (TestMissingDataProcedure == 3){
zer_array->Ignorefix(x_test,v, ncase, &mdim, &nsample_test, cat, &code, ncolumn);
}
}
zer_array->testreebag(x_test, &ntest, &mdim, treemap, nodestatus, xbestsplit, cbestsplit,
bestvar, nodeclass, &nrnodes, &ndbigtree, cat, &nclass, jts, nodex, &value_one);
for (int kkk=0; kkk <= ntest; ++kkk)
{
if (jts[kkk] == 2 ){
jts[kkk]=0;
}
}
/* Put the test hypothesis (jts) into the test_error_array Matrix */
Matrix* get_train_array = new Matrix(1,1);
get_train_array->copyArrayToMatrix(step,value_one,jts,*test_error_array);
/* GET OUT-OF-BAG ESTIMATES FOR TRAIN HYPOTHESIS */
zer_array->testreebag(x_train, &nsample, &mdim, treemap, nodestatus, xbestsplit, cbestsplit,
bestvar, nodeclass, &nrnodes, &ndbigtree, cat, &nclass, jtr, nodex, &ntest);
for (int kkkk=0; kkkk <= nsample; ++kkkk)
{
if (jtr[kkkk] == 2 ){
jtr[kkkk]=0;
}
}
/* Put the train hypothesis (jtr) into the train_error_array Matrix */
get_train_array->copyArrayToMatrix(step,value_one,jtr,*train_error_array);
delete get_train_array;
delete zer_array;
return (0);
}
/*************************************************************************************/
/* function [errorTrain,errorTest]=getError(boost,train,train_label,test,test_label) */
/* disp('run getError'); */
/* d=size(boost); */
/* num=size(train); */
/* prediction=zeros(num(1),1); */
/* % getting the train error */
/* for h=1:d(1) */
/* prediction=prediction-log(boost(h,1))*(train(:,boost(h,2))>=boost(h,3)); */
/* end */
/* temp=-sum(log(boost(:,1)))/2; */
/* errorTrain=sum(abs((train_label>=5)-(prediction>=temp)))/num(1); */
/* prediction=zeros(1000,1); */
/* % getting the test error */
/* for h=1:d(1) */
/* prediction=prediction-log(boost(h,1))*(test(:,boost(h,2))>=boost(h,3)); */
/* end */
/* errorTest=sum(abs((test_label>=5)-(prediction>=temp)))/1000; */
/*************************************************************************************/
int getError(Matrix* boost,
Matrix* train_hypothesis,
Matrix* train_matrix,
Matrix* train_label_matrix,
Matrix* test_hypothesis,
Matrix* test_matrix,
Matrix* test_label_matrix,
Matrix* errorTrain,
Matrix* errorTest,
Matrix* tmp_test_hypothesis,
long train_max_words_doc,
long train_max_docs,
int step,
int cycles,
int sign)
{
if (com_pro.show_action)
printm("run getError");
double value_one = 1;
double value_two = 2;
// d=size(boost);
int boost_row;
int boost_col;
Matrix* size = new Matrix(step,step);
size->matrixSize(&boost_row,&boost_col,*boost);
// num=size(train_matrix);
int train_row;
int train_col;
size->matrixSize(&train_row,&train_col,*train_matrix);
// num=size(test_matrix);
int test_row;
int test_col;
size->matrixSize(&test_row,&test_col,*test_matrix);
if (cycles < 100){
step=train_row-1;
}