// me_reg.cpp
#include "ME_REG.h"
// standard headers used directly in this file
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <vector>
using std::vector;
using std::cout;
using std::endl;
/*********************************************************
//
// Vector and matrix multiplication functions.
//
**********************************************************/
inline double vector_self_dot_prod(const vector<double>& v)
{
double dp=0;
int i;
for (i=0; i<v.size(); i++)
{
dp += v[i]*v[i];
}
/* Alternative blocked summation (disabled):
i=0;
while (i<v.size())
{
const int end_idx = (i + 1000>v.size() ? v.size() : i+1000);
int j;
double tmp=0;
for (j=i; j<end_idx; j++)
tmp += v[j]*v[j];
dp+=tmp;
i=end_idx;
}*/
return dp;
}
inline double vector_dot_prod(const vector<double>& v1, const vector<double>& v2 )
{
double dp=0;
int i;
if (v1.size() != v2.size())
{
printf("mismatch in vector dimensions!\n");
exit(1);
}
for (i=0; i<v1.size(); i++)
dp += v1[i]*v2[i];
/* Alternative blocked summation (disabled):
i=0;
while (i<v1.size())
{
const int end_idx = (i + 1000>v1.size() ? v1.size() : i+1000);
int j;
double tmp=0;
for (j=i; j<end_idx; j++)
tmp += v1[j]*v2[j];
dp+=tmp;
i=end_idx;
}*/
return dp;
}
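/*
The disabled while-loops above are blocked variants of the same dot products:
they sum in chunks of 1000 through a local accumulator, which can reduce
floating-point rounding error on long vectors. They are kept for reference only.
*/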
/****************************************************************
Returns the result of multiplying a row vector by a matrix. The matrix
should come as a vector of rows.
*****************************************************************/
inline void row_times_matrix(const vector<double>& v,
const vector< vector<double> >& m,
vector<double>& res)
{
const int len=m[0].size();
const int v_len=v.size();
int i;
if (v_len != m.size())
{
printf("mismatch in vector and matrix dimensions!\n");
exit(1);
}
res.clear(); // resize alone would leave stale values if res already had size len
res.resize(len,0);
for (i=0; i<len; i++)
{
double tmp=0;
int j;
for (j=0; j<v_len; j++)
tmp += v[j]*m[j][i];
res[i]+=tmp;
}
}
void ME_Regression_Model::print_weights_c()
{
int i;
printf("weights[%d]={\n",num_features);
for (i=0; i<num_features-1; i++)
{
printf("%g , ",f_weights[i]);
if ( i>0 && i % 6 == 0)
printf("\n");
}
printf("%g};\n",f_weights[num_features-1]);
}
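// fpair ties a sample index to one feature's value in that sample; the
// (disabled) train_IIS2 below builds one fpair list per selected feature.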
struct fpair {
fpair(int i, value_t v) : sam_idx(i), val(v) {}
int sam_idx;
value_t val;
};
/*********************************************************************************
// Trains the model's weights using IIS (Improved Iterative Scaling).
// Either specify the number of iterations, or the minimal improvement in log
// likelihood (min ratio of improvement/log_likelihood).
// NOTE: the body below is commented out, so this function is currently a no-op.
*************************************************************************************/
void ME_Regression_Model::train_IIS2(const ME_Regression_DataSet& ds,
int max_iterations,
double min_log_like_improve,
vector<int>& feature_idxs)
{
vector< vector< double > > e_sums; // exp of weighted sums exp(\sum_i lambda_i f_i(x,y)); |e_sums|=k*n (k classes, n samples)
vector<double> e_totals; // per-sample totals \sum_y e_sums(x,y); |e_totals|=n (n samples)
/* Disabled implementation, kept for reference:
int i,j;
const int num_samples = ds.num_samples;
num_classes=ds.num_classes;
num_features=ds.num_features;
bool use_ratio=(min_log_like_improve>0);
double log_like=-9.999E99;
// initialize weights
f_weights.clear();
f_weights.resize(num_features,0);
// initialize sums and totals
e_sums.resize(num_classes);
for (j=0; j<num_classes; j++)
e_sums[j].resize(ds.num_samples,1.0);
e_totals.resize(ds.num_samples,num_classes);
// make lists of fpairs
vector<int> selected_feature_pos;
selected_feature_pos.resize(num_features,NEG_INF);
for (i=0; i<feature_idxs.size(); i++)
selected_feature_pos[feature_idxs[i]]=i; // map feature idx -> position in feature_idxs
vector< vector<fpair> > fpairs;
fpairs.resize(feature_idxs.size());
for (i=0; i<ds.samples.size(); i++)
{
const vector<fval>& fvals = ds.samples[i].f_vals;
int j;
for (j=0; j<fvals.size(); j++)
{
const int pos = selected_feature_pos[fvals[j].f_idx];
if (pos>=0 && fvals[j].val != 0)
fpairs[pos].push_back(fpair(i,fvals[j].val));
}
}
// calc empirical feature values (only for class 0!)
vector<double> emp_f_vals;
emp_f_vals.resize(num_features,0);
for (i=0; i<num_samples; i++)
{
if (ds.samples[i].label != 0)
continue;
const vector<fval>& fvals = ds.samples[i].f_vals;
const double& sam_weight = ds.samples[i].weight;
int f;
for (f=0; f<fvals.size(); f++)
{
const int f_idx = fvals[f].f_idx;
emp_f_vals[f_idx]+=sam_weight * fvals[f].val;
}
}
for (t_iterations=0; t_iterations<max_iterations; t_iterations++)
{
int f;
// printf("ITER: %d\n",it+1);
for (f=0; f<feature_idxs.size(); f++)
{
const int f_idx = feature_idxs[f];
vector<double> pf;
int i;
int counter=0;
double delta=0; // the delta_i to update the weights
double exp_model_f=0; // p(f)
double abs_diff=0.0; // p(f)-p~(f)
vector<double> all_pxy;
// fill pf = p(y|x)*f_i(x,y) - loop on all pairs
pf.clear();
pf.resize(ds.num_samples,0);
for (i=0; i<fpairs[f].size(); i++)
{
const int sam_idx = fpairs[f][i].sam_idx;
const value_t val = fpairs[f][i].val;
const double e = e_sums[0][sam_idx];
double prob = e/(1.0+e);
pf[sam_idx]=prob*val;
exp_model_f+=pf[sam_idx];
}
// calc diff between p(f) and p~(f)
double emp=emp_f_vals[f_idx];
abs_diff=fabs(exp_model_f - emp);
if (abs_diff < 1E-12)
abs_diff=1E-12;
// iterate until at least 0.9 of the distance to p~(f) is covered (at most 10 Newton steps)
while (counter == 0 || (fabs(exp_model_f - emp) > 0.1*abs_diff && counter<10))
{
int i;
// calculate g(a) and g'(a) for Newton's method
double g=-emp_f_vals[f_idx];
double g_prime=0;
for (i=0; i<ds.num_samples; i++)
{
int label=ds.samples[i].label;
double f_sharp=ds.samples[i].f_sharp[label];
double g_val= pf[i] * exp(delta*f_sharp);
g+= g_val;
g_prime += g_val * f_sharp;
}
delta -= g/g_prime;
// calculate new exp_model_f
exp_model_f = 0;
for (i=0; i<ds.num_samples; i++)
exp_model_f+=pf[i]*exp(delta*ds.samples[i].f_sharp[ds.samples[i].label]);
counter++;
}
// update weight (don't go to extremes where numerical stability is an issue)
if (f_weights[f_idx]<700 && f_weights[f_idx]>-700)
{
int i,j;
f_weights[f_idx]+=delta;
// update sums and totals
for (i=0; i<ds.num_samples; i++)
{
e_totals[i]=0;
for (j=0; j<num_classes; j++)
{
double diff= exp(delta * ds.samples[i].f_vals[j][f_idx]);
e_sums[j][i]*= diff;
e_totals[i]+=e_sums[j][i];
}
}
}
}
if (use_ratio)
{
double new_log_like=log_likelihood(ds);
if (t_iterations>0 && ((log_like-new_log_like)/new_log_like) < min_log_like_improve)
break;
log_like=new_log_like;
// printf("%f\n",log_likelihood(ds));
}
}*/
}
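/*********************************************************************************
// Since the IIS body above is disabled, here is a minimal sketch of the inner
// update it performs for a single feature. The helper toy_iis_delta and its
// arguments are illustrative only (they are not part of the model's API): it
// solves g(delta) = sum_i pf_i * exp(delta * f#_i) - p~(f) = 0 by Newton's
// method, mirroring the g/g_prime loop in the disabled code above.
*************************************************************************************/
static double toy_iis_delta(const vector<double>& pf, // p(0|x_i)*f(x_i) per sample
const vector<double>& f_sharp, // total feature mass f#(x_i) per sample
double emp_f, // empirical expectation p~(f)
int max_newton_steps=10)
{
double delta=0.0;
int step;
for (step=0; step<max_newton_steps; step++)
{
double g=-emp_f;
double g_prime=0.0;
int i;
for (i=0; i<(int)pf.size(); i++)
{
const double g_val = pf[i]*exp(delta*f_sharp[i]);
g += g_val;
g_prime += g_val*f_sharp[i];
}
if (fabs(g)<1E-12 || g_prime==0.0)
break;
delta -= g/g_prime; // Newton step on g(delta)
}
return delta;
}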
/*********************************************************************/
//
// Logistic Regression Functions
//
/*********************************************************************/
/*********************************************************************
Calculates the vector of first derivatives (the gradient) for the current lambdas.
This function assumes that the values of p are already computed.
**********************************************************************/
void ME_Regression_Model::calc_neg_first_derivative(const ME_Regression_DataSet& ds,
const vector<double>& p,
vector<double>& vals) const
{
int i;
vals.clear();
vals.resize(num_features,0);
for (i=0; i<ds.num_samples; i++)
{
int f;
const int label = ds.samples[i].label;
const ME_Regression_Sample& sam = ds.samples[i];
const double& sam_prob = p[i];
if (label == 0)
{
double one_minus_p = 1 - sam_prob;
for (f=0; f<sam.f_vals.size(); f++)
{
const int& f_idx = sam.f_vals[f].f_idx;
vals[f_idx] += sam.weight * ( sam.f_vals[f].val * one_minus_p);
}
}
else
{
for (f=0; f<sam.f_vals.size(); f++)
{
const int& f_idx = sam.f_vals[f].f_idx;
vals[f_idx] -= sam.weight * ( sam.f_vals[f].val * sam_prob);
}
}
}
}
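/*
For the binary case implemented above, with p_i = p(0|x_i) and sample weight w_i,
the entry for feature f is the gradient of the log likelihood:
vals[f] = sum_{i: y_i=0} w_i*f(x_i)*(1-p_i) - sum_{i: y_i!=0} w_i*f(x_i)*p_i
i.e. the empirical expectation of f minus its expectation under the model.
*/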
/******************************************************************
Calculates the Hessian matrix for the current lambdas.
This function assumes that the values of p are already computed.
*******************************************************************/
void ME_Regression_Model::calc_Hesian2(const ME_Regression_DataSet& ds,
const vector<double>& p,
vector< vector<double> >& hes) const
{
int i;
const int num_features=ds.num_features;
hes.resize(num_features);
for (i=0; i<num_features; i++)
{
hes[i].clear();
hes[i].resize(num_features,0);
}
for (i=0; i<ds.num_samples; i++)
{
int f;
double p_times_one_minus_p = p[i] * ( 1.0 - p[i]);
const ME_Regression_Sample& sam = ds.samples[i];
for (f=0; f<sam.f_vals.size(); f++)
{
const int& f_idx = sam.f_vals[f].f_idx;
int g;
for (g=f; g<sam.f_vals.size(); g++)
{
const int &g_idx = sam.f_vals[g].f_idx;
hes[f_idx][g_idx] += sam.weight * sam.f_vals[f].val *
sam.f_vals[g].val * p_times_one_minus_p;
}
}
}
// fill symmetric values
int g;
for (g=0; g<ds.num_features; g++)
{
int f;
for (f=g+1 ; f<ds.num_features; f++)
hes[f][g]=hes[g][f];
}
}
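/*
The entry accumulated above for features f and g is the binary-case Hessian
of the negative log likelihood:
hes[f][g] = sum_i w_i*f(x_i)*g(x_i)*p_i*(1-p_i)
The main loop fills only the upper triangle; the final loop mirrors it down.
*/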
/************************************************************************
// calculates p(0|x) for all samples
// p[i] is the probability of class 0 for sample i (0..n-1)
*************************************************************************/
void ME_Regression_Model::calc_p_for_all_samples(const ME_Regression_DataSet& ds,
vector<double>& p) const
{
int i;
const int num_samples = ds.num_samples;
if ((int)p.size() != num_samples)
{
p.resize(num_samples);
}
for (i=0; i<num_samples; i++)
p[i] = this->p_y_given_x(0,ds.samples[i]);
}
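/*
In this binary model p_y_given_x(0,sam) follows the logistic form
p(0|x) = exp(sum_f lambda_f*f(x)) / (1 + exp(sum_f lambda_f*f(x)))
(features fire for class 0), matching the e/(1+e) expression in the disabled
IIS code above. The implementation of p_y_given_x itself lives elsewhere.
*/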
/*************************************************************
Calculates the log likelihood of the dataset
**************************************************************/
double ME_Regression_Model::log_likelihood(const ME_Regression_DataSet& ds) const
{
double ll=0;
int i;
for (i=0; i<ds.num_samples; i++)
{
double p=p_y_given_x(ds.samples[i].label,ds.samples[i]);
ll+=ds.samples[i].weight*log(p);
}
// cout << "Likelihood: " << ll << endl;
return ll;
}
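/*
i.e. ll = sum_i w_i * log p(y_i|x_i), the weighted log likelihood that both
training routines try to maximize.
*/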
/***********************************************************************************
Trains the ME model using the conjugate gradient (CG) method.
Returns false if there was no convergence (numerical stability issues).
************************************************************************************/
bool ME_Regression_Model::train_cg(const ME_Regression_DataSet& ds,
int max_iterations,
double epsilon,
int reset_rounds)
{
int i,k,f,n;
double delta_old=0, delta_new=0, delta_zero=0, eps_sqr_delta_zero=0, prev_likelihood=0, org_likelihood=0;
vector<double> r,d,p;
has_weights=false;
num_features = ds.num_features;
num_classes = ds.num_classes;
i=0;
k=0;
n=num_features;
if (reset_rounds == 0)
reset_rounds = n-1;
vector<double> prev_good_weights;
bool moderate_advance = false;
prev_good_weights.clear();
RESET_ROUNDS:
printf("Max iterations: %d reset CG after %d rounds\n",max_iterations,reset_rounds);
// initialize weights (use previous values if weights already set)
if (prev_good_weights.size() == n)
{
f_weights = prev_good_weights;
moderate_advance = true;
}
else
{
f_weights.clear();
f_weights.resize(n,0);
}
calc_p_for_all_samples(ds,p);
calc_neg_first_derivative(ds,p,r);
d=r;
delta_new=vector_self_dot_prod(r);
delta_zero=delta_new;
eps_sqr_delta_zero = epsilon * epsilon * delta_zero;
while (i<max_iterations && delta_new>eps_sqr_delta_zero)
{
int j=0,j_max = 10;
double delta_d = vector_self_dot_prod(d);
double newton_eps_sqr = 0.001;
double alpha,beta;
vector< vector<double> > hes;
vector<double> temp;
prev_good_weights = f_weights;
bool extra_newton_raphson_iter = false;
do
{
double top,denominator;
int f;
calc_Hesian2(ds,p,hes);
top = vector_dot_prod(r,d);
row_times_matrix(d,hes,temp);
denominator = vector_dot_prod(temp,d);
alpha = top / denominator; // the minus is in r (which is the negative first derivative)
if (moderate_advance)
alpha *= 0.25;
if (! (alpha == alpha)) // alpha != alpha only when alpha is NaN
{
cout << "Alpha NAN!" << endl;
exit(1);
}
// x = x + alpha*d - just update the weights
for (f=0; f<num_features; f++)
f_weights[f] += alpha * d[f];
// recalculate the first derivative and p and fp
calc_p_for_all_samples(ds,p);
calc_neg_first_derivative(ds,p,r);
j++;
extra_newton_raphson_iter = (j<j_max && alpha * alpha * delta_d > newton_eps_sqr);
}
while(extra_newton_raphson_iter);
delta_old = delta_new;
delta_new = vector_self_dot_prod(r);
beta = delta_new / delta_old;
if (! (beta == beta)) // beta != beta only when beta is NaN
{
cout << "Beta NAN!" << endl;
exit(1);
}
for (f=0; f<num_features; f++)
d[f] = r[f] + beta * d[f];
k++;
if (k == reset_rounds || vector_dot_prod(r,d) <= 0)
{
d=r;
k=0;
}
// check if we are having a log likelihood problem
// can happen because of numerical stability problems
double likelihood = log_likelihood(ds);
if (i>0)
{
double like_diff = prev_likelihood - likelihood;
if (like_diff >0)
{
cout << i << "\t" << "diff: " << like_diff << endl;
cout << i << "\t" << "prev: " << prev_likelihood << endl;
cout << i << "\t" << "curr: " << likelihood << endl;
if (fabs(like_diff/prev_likelihood)>0.0001)
{
reset_rounds = reset_rounds /2;
if (reset_rounds<1)
{
// maybe try using IIS for a few rounds, and use those
// weights as a starting position for the CG...
printf("Couldn't train model!\n");
f_weights.clear();
f_weights.resize(num_features,0);
return false;
}
if (i>25 || likelihood/org_likelihood<0.5)
{
printf("Found likelihood decrease, not training this model further!\n");
break;
}
printf("Likelihood decrease detected, reducing reset rounds to %d\n",reset_rounds);
i=0;
goto RESET_ROUNDS;
}
}
else
moderate_advance=false;
// means we have a problem
if (likelihood < -9E200)
{
// maybe try using IIS for a few rounds, and use those
// weights as a starting position for the CG...
f_weights.clear();
f_weights.resize(num_features,0);
printf("Couldn't train model!\n");
return false;
}
}
else
org_likelihood = likelihood;
if (i % 10 == 0)
printf("%d NR iterations: %d Log-likelihood: %.8f\n",i,j,likelihood);
prev_likelihood = likelihood;
i++;
}
has_weights=true;
return true;
}
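/*
A minimal usage sketch, wrapped in #if 0 because it assumes a dataset-building
step this file does not show; example_training_run is illustrative only.
*/
#if 0
void example_training_run(const ME_Regression_DataSet& ds)
{
ME_Regression_Model model;
// up to 200 CG iterations, epsilon 1E-4, reset_rounds 0 -> defaults to n-1
if (! model.train_cg(ds,200,1.0E-4,0))
printf("CG training failed, weights were reset to 0.\n");
else
printf("final log likelihood: %f\n",model.log_likelihood(ds));
model.print_weights_c();
}
#endif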