
rankboost.cpp

MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectra.
		{
			cout << "Error: coudln't open out_file for wrtiting: " << buff << endl;
			exit(1);
		}

		sprintf(buff,"%s_stats.txt",report_prefix);
		stat_file_stream.open(buff,ios::out);
		if (! stat_file_stream.is_open() || ! stat_file_stream.good())
		{
			cout << "Error: coudln't open test stats for wrtiting: " << buff << endl;
			exit(1);
		}
	
		sprintf(buff,"%s_train_res.txt",report_prefix);
		train_res.open(buff,ios::out);
		if (! train_res.is_open() || ! train_res.good())
		{
			cout << "Error: coudln't open train res for wrtiing: " << buff << endl;
			exit(1);
		}

		if (test_ds)
		{
			sprintf(buff,"%s_test_res.txt",report_prefix);
			test_res.open(buff,ios::out);
			if (! test_res.is_open() || ! test_res.good())
			{
				cout << "Error: coudln't open test res for writing: " << buff << endl;
				exit(1);
			}
		}
		use_cout = false;
	}

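	// route all reporting either to cout or to the report files opened above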
	ostream&  out_stream  = (use_cout ? cout : out_file_stream);
	ostream&  stat_stream = (use_cout ? cout : stat_file_stream);

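	// D is the working boosting distribution (D0 holds its initial value);
	// max_D_for_normal_updates, when used, presumably caps how far a single
	// weight may grow under a regular update (see compute_max_weights_for_normal_update)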
	vector<weight_t> D,D0, max_D_for_normal_updates;
	vector<weight_t> *p_max_D_for_normal_updates = NULL;

	
	training_ds.compute_initial_distribution(D0);

	if (training_ds.get_max_ratio_for_regular_update()<9999)
	{
		training_ds.compute_max_weights_for_normal_update(D0,max_D_for_normal_updates);
		p_max_D_for_normal_updates = &max_D_for_normal_updates;
	}

	real_first_updates.resize(num_real_features,NEG_INF);
	real_update_counts.resize(num_real_features,0);
	binary_update_counts.resize(num_binary_features,0);
	D=D0;

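	// Z_prod accumulates the per-round normalization factors Z_t; in standard
	// RankBoost the product of the Z_t upper-bounds the ranking loss on the training pairs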
	double Z_prod = 1.0;

	total_default_weight =0;

	best_test_error  = 1.0;
	best_train_error = 1.0;
	
	this->ind_was_initialized = true;

	time_t start_time = time(NULL);

	cout << "Running boosting for at most " << max_num_rounds << " iterations..." << endl;
	int t;
	for (t=1; t<=max_num_rounds; t++)
	{
		current_round = t;

		// determine how often we report progress and evaluate the training/test error
		int report_freq = 1; 
		if (t>10)     report_freq=10;
		if (t>500)    report_freq=50;
		if (t>1000)   report_freq=100;
		if (t>5000)   report_freq=500;
		if (t>10000)  report_freq=1000;
		if (t>100000) report_freq=5000;
		if (t>500000) report_freq=10000;

		const int feature_report_rounds = 10000;

		bool report_this_round = ((t % report_freq) == 0);
		
		bool use_double_theta = true;

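		// on most rounds restrict the weak learners to features that already have
		// non-zero weight (cheaper); every 5th/10th/25th/100th round, depending on t,
		// all features are scanned again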
		bool ind_only_non_zero_features = false;
		if (t>100    && t%5)   ind_only_non_zero_features = true;
		if (t>1000   && t%10)  ind_only_non_zero_features = true;
		if (t>10000  && t%25)  ind_only_non_zero_features = true;
		if (t>100000 && t%100) ind_only_non_zero_features = true;

		
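		// candidates proposed by the binary and real-valued weak learners;
		// the one with the larger |r| (weighted pair correlation) is applied below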
		int best_binary_idx=-1;
		double best_binary_r = 0;

		int real_feature_idx=-1;
		int real_theta_start_idx = -1;
		int real_theta_end_idx   = -1;
		int real_q_def = -1;
		double real_r = 0;

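		// per-sample potentials derived from the current distribution D; the weak
		// learners score candidate features against these rather than looping over
		// all pairs directly (assumption based on the calc_potentials interface)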
		vector<weight_t> potentials;

		training_ds.calc_potentials(D,potentials);

		if (num_binary_features>0)
			binary_weak_learn(potentials,training_ds,best_binary_idx,best_binary_r);

		if (num_real_features>0)
		{
			if (use_double_theta)
			{
				real_weak_learn_double_theta(potentials, training_ds, real_feature_idx,
					real_theta_start_idx, real_theta_end_idx,
					real_q_def, real_r, ind_only_non_zero_features );
			}
			else
				real_weak_learn(potentials, training_ds, real_feature_idx,
					real_theta_start_idx,  real_q_def, real_r, ind_only_non_zero_features );
		}

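		// r == 0 for both learners means no feature separates the remaining pairs,
		// so boosting has converged and the loop can stop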
		if (real_r == 0 && best_binary_r == 0)
		{
			if (t<5)
			{
				cout << "Error: model converged to quickly, there is a problem with the feature values!" << endl;
				exit(1);
			}
			break;
		}
		
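		// apply the stronger of the two candidates using the standard RankBoost weight
		// alpha = 0.5 * ln((1+r)/(1-r)); Z is the normalization factor of the
		// resulting distribution update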
		double Z=1.0;
		if (fabs(real_r)>fabs(best_binary_r))
		{
			const weight_t alpha = 0.5 * log((1+real_r)/(1-real_r));
			const float theta_start = real_limits[real_feature_idx][real_theta_start_idx-1];
			const float theta_end = ( (real_theta_end_idx>0 &&
									  real_theta_end_idx<real_limits[real_feature_idx].size()) 
									  ? real_limits[real_feature_idx][real_theta_end_idx] :
									  POS_INF);
			
			Z=training_ds.update_distribution_according_to_real_feature(real_feature_idx, 
				theta_start, theta_end, real_q_def,alpha, D , p_max_D_for_normal_updates, false);

			update_model_weights_for_real_feature(alpha, real_feature_idx, real_q_def,
				real_theta_start_idx, real_theta_end_idx);

			if (real_first_updates[real_feature_idx]<0)
				real_first_updates[real_feature_idx]=t;
		}	
		else
		{
			const weight_t alpha = 0.5 * log((1+best_binary_r)/(1-best_binary_r));

			Z=training_ds.update_dsitribution_according_to_binary_feature(best_binary_idx, alpha, D);
			
			update_model_weights_for_binary_feature(best_binary_idx, alpha);
		}
		Z_prod *= Z;

		// This part is for the feature reports; not really part of the training
		if (running_feature_report_idx>=0 && 
			real_feature_idx == running_feature_report_idx)
		{
			const int num_updates = real_update_counts[running_feature_report_idx];
			int sqr=1;
			while (sqr<num_updates)
				sqr*=2;

			if (sqr == num_updates ) // output only for 1,2,4,8,... updates
			{
				if (! flist_stream.is_open())
				{
					char buff[512];
					sprintf(buff,"%s_fprog_%d.txt",report_prefix,running_feature_report_idx);
					flist_stream.open(buff,ios::out);
					if (! flist_stream.is_open() || ! flist_stream.good())
					{
						cout << "Error: coudln't open flist for writing: " << buff << endl;
						exit(1);
					}
				}
				ouput_importance_ranked_feature_list(training_ds,flist_stream,running_feature_report_idx,t);
				cout << t << "\t" << "feature report: " << num_updates << endl;
			}
		}


		// all reporting and error evaluation is done in this block
		if (report_this_round)
		{
			time_t current_time = time(NULL);
			double total_time = current_time - start_time;
			clock_t start_test = clock();

			int num_tested_peptides_in_train=0;
			int num_tested_peptides_in_test=0;

			int *ptr_train_num = (t<=1 ? &num_tested_peptides_in_train : NULL);
			int *ptr_test_num  = (t<=1 ? &num_tested_peptides_in_test : NULL);

			vector<peak_rank_stat> train_peak_stats, test_peak_stats;

			train_error = calc_prediction_error(training_ds, train_peak_stats, 
								test_tag3_filter_val, ptr_train_num);
			test_error  = 1.0;
			if (test_ds)
				test_error = calc_prediction_error(*test_ds, test_peak_stats, 
									test_tag3_filter_val, ptr_test_num);

			cout << "Round\t" << t << "\t#rf " << non_zero_real_idxs.size() << "\ttime: " << total_time << " secs.\t" << setprecision(5) << "[ " <<
				setprecision(5) << train_error << " " << test_error << "]" << endl;

			// these should only be outputted for the first round
			if (ptr_train_num)
			{
				out_stream << "ERRORS MEASURED FOR TAG3 VAL " << test_tag3_filter_val << endl;
				out_stream << "TRAIN ERRORS FROM " << *ptr_train_num << endl;
				if (report_prefix)
				{
					stat_stream << "ERRORS MEASURED FOR TAG3 VAL " << test_tag3_filter_val << endl;
					stat_stream << "TRAIN ERRORS FROM " << *ptr_train_num << endl;
				}
			}

			if (ptr_test_num)
			{
				out_stream << "TEST ERRORS FROM " << *ptr_test_num << endl;
				if (report_prefix)
					stat_stream << "TEST ERRORS FROM " << *ptr_test_num << endl;
			}
		
			out_stream << setprecision(7);
			out_stream << "Round " << t << "\t" << setprecision(7) << fixed  << (int)total_time << "\t";

			
			if (num_binary_features>0)
				out_stream << "Act bin " << non_zero_binary_idxs.size() << "/" << num_binary_features ;
			if (num_real_features>0)
				out_stream << " Act real " << non_zero_real_idxs.size() << "/" << num_real_features << endl;

			if (num_binary_features>0)
			{
				out_stream << "Best BINARY feature: " << best_binary_idx << " " << binary_feature_names[best_binary_idx] <<
					"    r: " << best_binary_r << endl;
			}

			if (num_real_features>0)
			{
				out_stream << "Best REAL feature  : " << real_feature_idx << " " << real_feature_names[real_feature_idx] << 
				    "   theta: " << real_theta_start_idx << "-" <<
					real_theta_end_idx << "  r: " <<  real_r << endl;
			}

			clock_t end_test = clock();
			double test_time = (end_test-start_test)/(double)CLOCKS_PER_SEC;

			out_stream << setprecision(6);
			out_stream << "train: " << train_error;
			if (test_ds)
				out_stream << "\ttest: " << test_error;
			out_stream << "\tZ_prod = " << Z_prod << "\t" << "(" << 
					test_time << ")" << endl;

			out_stream << endl;

			// full stats
			if (report_prefix)
			{
			
				stat_stream << fixed << setprecision(6);
				stat_stream << t << "\t" << (int)total_time << "\t" << non_zero_binary_idxs.size() <<
					"\t" << non_zero_real_idxs.size() << "\t" << train_error << "\t";

				int stat_size = train_peak_stats.size();
				if (test_ds)
				{
					stat_stream << test_error << "\t";
					if (test_peak_stats.size()<train_peak_stats.size())
						stat_size = test_peak_stats.size();
				}
				stat_stream << endl;

				// also to cout
				out_stream << setprecision(4) << t << "\t" << fixed << (int)total_time << "\t" << 
					non_zero_binary_idxs.size() << "\t" << non_zero_real_idxs.size() << 
					"\t" << train_error << "\t";

				if (test_ds)
					out_stream << test_error << "\t";
				out_stream << endl << endl;

				// peak stats
				train_res << t <<"\t" << setprecision(6) << train_error << "\t" << stat_size;
				int j;
				for (j=0; j<stat_size; j++)
					train_res << "\t" << setprecision(4) << train_peak_stats[j].precent_predicted_correctly;

				for (j=0; j<stat_size; j++)
					train_res << "\t" << setprecision(4) << train_peak_stats[j].avg_precited_rank + 1.0 << "\t"
							 <<	setprecision(5) << train_peak_stats[j].sd_predicted_rank;
				train_res << endl;

				if (test_ds)
				{
					test_res << t <<"\t" << setprecision(6) << test_error << "\t" << stat_size;
					int j;
					for (j=0; j<stat_size; j++)
						test_res << "\t" << setprecision(4) << test_peak_stats[j].precent_predicted_correctly;

					for (j=0; j<stat_size; j++)
						test_res << "\t" << setprecision(4) << test_peak_stats[j].avg_precited_rank + 1.0 << "\t"
								 <<	setprecision(5) << test_peak_stats[j].sd_predicted_rank;
					test_res << endl;
				}

				if (t % 100 == 0)
				{
					stat_stream.flush();
					out_stream.flush();	
				}

				if (0 && top_misclassified_pairs)
				{
					get_top_misclassified_pairs(training_ds,D,D0,*top_misclassified_pairs);
					print_top_misclassified_pairs(training_ds,D,D0,10,out_stream);
				}
			}

			// check if this is the best round
			// output the current model file
			if (test_error<best_test_error && ((t % 100 == 0) || (t<100 && t % 10 == 0)))
			{
				set_best_model_parameters_to_current_parameters();

				// write the model. Since compressing messes it up, we will copy the model
				// to a new one and write that one
				if (t % 100 == 0 && report_prefix)
				{
					RankBoostModel copy_of_model = *this;

					char name_buff[512];
					sprintf(name_buff,"%s_model.txt",report_prefix);
					ofstream model_stream(name_buff);
					if (! model_stream.is_open() || ! model_stream.good())
					{
						cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
						exit(1);
					}

					if (model_header_strings)
					{
						int i;
						for (i=0; i<model_header_strings->size(); i++)
							model_stream << model_header_strings->at(i);
					}

					copy_of_model.write_rankboost_model(model_stream);
					model_stream.close();

				
					sprintf(name_buff,"%s_feature_list.txt",report_prefix);
					ofstream feature_stream(name_buff);
					if (! feature_stream.is_open() || ! feature_stream.good())
					{
						cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
						exit(1);
					}
					copy_of_model.ouput_importance_ranked_feature_list(training_ds,feature_stream);
					feature_stream.close();
				}
			}
			
			// if the best test error was reached long enough ago (t is at least stop_ratio
			// times the best round) and the current test error is still worse, we can stop
			if (t>100)
			{
				double stop_ratio = (t>100000 ? 1.3 : 2.0);
				if (t<1000)
					stop_ratio = 3.0;
				if(t<20000)
					stop_ratio = 2.5;

				if (max_num_rounds<1000000 &&  // if it is 1000000 then let it run...
					t/(double)best_round_idx>=stop_ratio && 
					best_test_error<test_error)
				{
					out_stream << "TERMINATING AT ROUND " << t << ", NO PROGRESS IN TEST ERROR SINCE ROUND " << 
						best_round_idx << endl << endl;
					out_stream << fixed << setprecision(6) << "Current test error: " << test_error << ", best test error " << best_test_error << endl;
					break;
				}
			}

			
			// check signal
			if (stop_signal_file)
			{
				ifstream signal_stream(stop_signal_file);
				if (signal_stream.is_open())
				{
					out_stream << endl << "TERMINATED BECAUSE STOP FILE WAS DETECTED!" << endl;
					out_stream << "( " << stop_signal_file << " )" << endl;
					break;
				}
			}
		}
	}

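	// training finished (or was stopped early): restore the best parameters seen
	// during training and report the final train/test errors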
	set_current_model_parameters_to_best_parameters();

	vector<peak_rank_stat> dummy_stats;
	train_error = calc_prediction_error(training_ds, dummy_stats, test_tag3_filter_val);
	test_error  = 1.0;
	if (test_ds)
		test_error = calc_prediction_error(*test_ds, dummy_stats, test_tag3_filter_val);

	out_stream << setprecision(6) << fixed << "FINAL ERRORS:" << endl;
	out_stream << "train:\t" << train_error << endl;
	out_stream << "test:\t" << test_error << endl;

	if (top_misclassified_pairs)
		get_top_misclassified_pairs(training_ds,D,D0,*top_misclassified_pairs);

	if (out_file_stream.is_open())
		out_file_stream.close();

	if (stat_file_stream.is_open())
		stat_file_stream.close();

	if (train_res.is_open())
		train_res.close();

	if (test_res.is_open())
		test_res.close();

	if (flist_stream.is_open())
		flist_stream.close();

	ind_was_initialized = true;

	cout << "Finished training boost model (" << t-1 << " rounds)" << endl;
