📄 peakrankmodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 4 页
字号:
			cout << "Error: session AAs don't include: " << org_aas[i] << endl;
			exit(1);
		}
		model_aas[i]=session_aas_to_model_aas[org_aas[i]];
		if (model_aas[i]<0)
		{
			cout << "Error: couldn't find a rank model conversion for amino acid " << org_aas[i] << endl;
			exit(1);
		}
	}
}









void PeakRankModel::set_binary_feature_names()
{
	const int num_aas = this->get_num_model_aas();
	const string dis_labels[]={"-3","-2","-1","+1","+2","+3"};
	const int num_dis_labels = 6;

	int i;

//	cout << "Initialzing binary features for " << num_aas << " amino acids " << endl
//	for (i=0; i<num_aas; i++)
//		cout << i << "\t" << model_aa_labels[i] << endl;

	this->binary_feature_names.clear();
	
	for (i=0; i<num_dis_labels; i++)
	{
		const string dis_label = dis_labels[i];
		int aa;
		for (aa =0; aa<num_aas; aa++)
		{
			string fname = "IND CUT HAS " + model_aa_labels[aa] + " AT " + dis_label;
		//	binary_feature_names.push_back(fname);
		}
	}

/*	for (i=0; i<num_aas; i++)
	{
		int j;
		for (j=0; j<num_aas; j++)
		{
			string fname = "IND CUT IS BETWEEN " + model_aa_labels[i] + " - " + model_aa_labels[j];
			binary_feature_names.push_back(fname);
		}
	} */



	binary_feature_names.push_back("IND CUT AT N+1");
	binary_feature_names.push_back("IND CUT AT C-1");

/*	binary_feature_names.push_back("IND CUT AT N+1");
	binary_feature_names.push_back("IND CUT AT N+2");
	binary_feature_names.push_back("IND CUT AT N+3");
	binary_feature_names.push_back("IND CUT AT N+4");
	binary_feature_names.push_back("IND CUT AT N+5");

	binary_feature_names.push_back("IND CUT AT C-1");
	binary_feature_names.push_back("IND CUT AT C-2");
	binary_feature_names.push_back("IND CUT AT C-3");
	binary_feature_names.push_back("IND CUT AT C-4");
	binary_feature_names.push_back("IND CUT AT C-5");*/

	for (i=0; i<num_aas; i++)
	{
		string fname = "IND CUT AT +1, N AA IS " + model_aa_labels[i];
		binary_feature_names.push_back(fname);
	}

	for (i=0; i<num_aas; i++)
	{
		string fname = "IND CUT AT -1, C AA IS " + model_aa_labels[i];
		binary_feature_names.push_back(fname);
	}



/*	binary_feature_names.push_back("IND CUT LESS THAN MID PEPTIDE");
	binary_feature_names.push_back("IND CUT MORE THAN MID PEPTIDE");
	binary_feature_names.push_back("IND CUT OVER CHARGE LESS THAN MID OBS");
	binary_feature_names.push_back("IND CUT OVER CHARGE MORE THAN MID OBS");*/
}

void PeakRankModel::set_real_feature_names()
{
	const int num_aas = this->get_num_model_aas();
	int i;

	real_feature_names.clear();

	
	const string prefixes[]={"BEFORE MID, N SIDE RKH ","AFTER MID, N SIDE RKH ",
							 "BEFORE MID, C SIDE RKH ","AFTER MID, C SIDE RKH "};
	

	int c;
	for (c=0 ; c< num_RKH_combos; c++)
		real_feature_names.push_back("DIS MIN, N SIDE RHK " + combos[c]);
	for (c=0 ; c< num_RKH_combos; c++)
		real_feature_names.push_back("DIS MIN, C SIDE RHK " + combos[c]);
	for (c=0 ; c< num_RKH_combos; c++)
		real_feature_names.push_back("DIS MAX, N SIDE RHK " + combos[c]);
	for (c=0 ; c< num_RKH_combos; c++)
		real_feature_names.push_back("DIS MAX, C SIDE RHK " + combos[c]);

	int p;
	for (p=0; p<4; p++)
	{
		int c;
		for (c=0 ; c< num_RKH_combos; c++)
		{
			int aa;
			for (aa=0; aa<num_aas; aa++)
			{
				string fname = prefixes[p] + combos[c] + ", AA N of cut " + this->model_aa_labels[aa];
				real_feature_names.push_back(fname);
			}
		}

		for (c=0 ; c< num_RKH_combos; c++)
		{
			int aa;
			for (aa=0; aa<num_aas; aa++)
			{
				string fname = prefixes[p] + combos[c] + ", AA C of cut " + this->model_aa_labels[aa];
				real_feature_names.push_back(fname);
			}
		}
	}

/*	real_feature_names.push_back("CUT MASS PROPORTION (LESS THAN MID)");
	real_feature_names.push_back("CUT MASS PROPORTION (MORE THAN MID)");

	real_feature_names.push_back("CUT MASS OVER CHARGE PROPORTION (LESS THAN MID OBS)");
	real_feature_names.push_back("CUT MASS OVER CHARGE ABS DIFF (LESS THAN MID OBS)");

	real_feature_names.push_back("CUT MASS OVER CHARGE PROPORTION (MORE THAN MID OBS)");
	real_feature_names.push_back("CUT MASS OVER CHARGE ABS DIFF (MORE THAN MID OBS)"); */

	for (i=0; i<num_aas; i++)
		real_feature_names.push_back("# N SIDE " + model_aa_labels[i]);
	
	real_feature_names.push_back("# N SIDE BASIC AMINO ACIDS");

	for (i=0; i<num_aas; i++)
		real_feature_names.push_back("# C SIDE " + model_aa_labels[i]);

	real_feature_names.push_back("# C SIDE BASIC AMINO ACIDS");

	
}


struct basicity_pair {
	bool operator< (const basicity_pair& other) const
	{
		return basicity_score<other.basicity_score;
	}

	float basicity_score;
	int   idx;
};

/****************************************************************************
// reoprts sizes of partitioned training sets
// partitioned according to charge / size_idx / mobility
// if path is given, creates the training files
*****************************************************************************/
void PeakRankModel::partition_training_samples(
						const vector< vector<TrainingPeptide> >& all_tps,
						char *file_path_prefix,
						char *test_path_prefix,
						int   minimal_size,
						float prop_ts) const
{
	if (file_path_prefix)
		cout << "TRAIN: " << file_path_prefix << endl;
	if (test_path_prefix)
		cout << "TEST : " << test_path_prefix << endl;

	cout << endl << endl;
	cout << "Training partiotioning: " << endl;
	cout << "----------------------- " << endl << endl;

	int charge;
	for (charge=1; charge<all_tps.size(); charge++)
	{
		if (all_tps[charge].size()==0)
			continue;

		cout << endl << "**************************************" << endl;
		cout << endl << "CHARGE " << charge << " total " << all_tps[charge].size() << " peptides..." << endl;

		const int num_size_idxs = this->size_thresholds[charge].size()+1;
		vector< vector<int> > pep_counts;
		vector< vector< vector<int> > > pep_idxs;

		pep_counts.resize(num_size_idxs);
		pep_idxs.resize(num_size_idxs);
		int size_idx;
		for (size_idx=0; size_idx<num_size_idxs; size_idx++)
		{
			pep_counts[size_idx].resize(4,0);
			pep_idxs[size_idx].resize(4);
		}


		int i;
		for (i=0; i<all_tps[charge].size(); i++)
		{
			const TrainingPeptide& tp = all_tps[charge][i];
			const int size_idx = this->get_size_group(charge,tp.pm_with_19);
			pep_counts[size_idx][0]++;
			pep_counts[size_idx][tp.mobility]++;
			pep_idxs[size_idx][tp.mobility].push_back(i);
		}

		for (size_idx=0; size_idx<num_size_idxs; size_idx++)
		{
			cout << size_idx << "\t";
			if (pep_counts[size_idx][0]>0)
			{
				cout << pep_counts[size_idx][0] << "\t" << fixed << setprecision(4) << 
					pep_counts[size_idx][0] / (double)all_tps[charge].size() << "\t";
			}
			else
				cout << "  -\t  -\t";

			int m;
			for (m=MOBILE; m<=NONMOBILE; m++)
				if (pep_counts[size_idx][m]>0)
				{
					cout << pep_counts[size_idx][m];
					if (pep_counts[size_idx][m]<minimal_size)
						cout << " *";
					cout << "\t" << pep_counts[size_idx][m]/(double)pep_counts[size_idx][0] << "\t";
				}
				else
					cout << "  -\t 0\t";

			cout << endl;
		}
		cout << endl;

		if (! file_path_prefix)
			continue;

		
		for (size_idx=0; size_idx<num_size_idxs; size_idx++)
		{
			int m;
			for (m=MOBILE; m<=NONMOBILE; m++)
			{
				vector<int> idxs_to_write = pep_idxs[size_idx][m];

				cout << endl;
				cout << "set " << charge << " " << size_idx << " " << m << endl;
			
				
				// try adding samples from adjacent size_idxs
				if (0 && idxs_to_write.size()<minimal_size && size_idx>0)
				{
					const vector<int>& prev_size_idxs = pep_idxs[size_idx-1][m];
					int i;

					// sort samples according to size
					vector<basicity_pair> pairs;
					pairs.clear();
					
					for (i=0; i<prev_size_idxs.size(); i++)
					{
						basicity_pair bp;
						bp.idx = prev_size_idxs[i];
						bp.basicity_score = all_tps[charge][prev_size_idxs[i]].pm_with_19;
						pairs.push_back(bp);
					}
					sort(pairs.begin(),pairs.end());

					for (i=pairs.size()-1; i>=0; i--)
					{
						if (idxs_to_write.size() == minimal_size)
							break;
						idxs_to_write.push_back(pairs[i].idx);
					//	cout << setprecision(2) << all_tps[charge][pairs[i].idx].pm_with_19 << " " <<
					//		all_tps[charge][pairs[i].idx].mobility << " ";
						
					}
					cout << endl;
					cout << " .. took " << pairs.size()-i << " samples from size " << size_idx-1 << endl;
				}

				// try adding samples from adjacent size_idxs
				if (0 && idxs_to_write.size()<minimal_size && size_idx<num_size_idxs-1)
				{
					const vector<int>& next_size_idxs = pep_idxs[size_idx+1][m];
					int i;
						// sort samples according to size
					vector<basicity_pair> pairs;
					pairs.clear();
					
					for (i=0; i<next_size_idxs.size(); i++)
					{
						basicity_pair bp;
						bp.idx = next_size_idxs[i];
						bp.basicity_score = all_tps[charge][next_size_idxs[i]].pm_with_19;
						pairs.push_back(bp);
					}
					sort(pairs.begin(),pairs.end());

					for (i=0; i<pairs.size(); i++)
					{
						if (idxs_to_write.size() == minimal_size)
							break;
						idxs_to_write.push_back(pairs[i].idx);
					//	cout << setprecision(2) << all_tps[charge][pairs[i].idx].pm_with_19 << " " <<
					//		all_tps[charge][pairs[i].idx].mobility << " ";
					}
					cout << endl;
					cout << " .. took " << i << " samples from size " << size_idx+1 << endl;
				}

				// try adding samples from another mobility state
				// choose most appropriate samepls (in terms of basicity)
				if (idxs_to_write.size()<minimal_size)
				{

					int mobility_to_take_from=0;

					if (m==MOBILE || m == NONMOBILE)
					{
						mobility_to_take_from=PARTIALLYMOBILE;
					}
					else if (pep_idxs[size_idx][MOBILE].size()>
							 pep_idxs[size_idx][NONMOBILE].size())
					{
						mobility_to_take_from = MOBILE;
					}
					else
						mobility_to_take_from = NONMOBILE;

					if (idxs_to_write.size() + pep_counts[size_idx][mobility_to_take_from] < minimal_size)
					{
						cout << "Error: insufficient number of training samples for charge " <<
							charge << " size idx " << size_idx << endl;
						continue;
					} 

					vector<basicity_pair> pairs;
					int i;
					pairs.clear();
					
					for (i=0; i<pep_idxs[size_idx][mobility_to_take_from].size(); i++)
					{
						basicity_pair bp;
						bp.idx = pep_idxs[size_idx][mobility_to_take_from][i];
						bp.basicity_score = all_tps[charge][bp.idx].get_basicity_score();
						pairs.push_back(bp);
					}

					sort(pairs.begin(),pairs.end());

					if (mobility_to_take_from>m)
					{
						// take least basic peptides
						int i;
						for (i=0; i<pairs.size() && idxs_to_write.size()<minimal_size; i++)
							idxs_to_write.push_back(pairs[i].idx);

						cout << " .. took " << i << " samples mobility " << mobility_to_take_from << endl;
					}
					else
					{
						// take most basic peptides
						int i;
						for (i=pairs.size()-1; i>=0 && idxs_to_write.size()<minimal_size; i--)
						{
							idxs_to_write.push_back(pairs[i].idx);
						//	cout << pairs[i].basicity_score << " ";
						}
						cout << endl;

						cout << " .. took " << pairs.size()-i << " samples mobility " << mobility_to_take_from << endl;
					}
				}

				sort(idxs_to_write.begin(),idxs_to_write.end());

				if (idxs_to_write.size()<minimal_size)
				{
					cout << "Error: could not add enough samples to create set for charge " << charge << " size " << size_idx << " mobility " << m << endl;
					exit(1);
				}
				
				
				if (! test_path_prefix)
				{
					char fname[256];
					sprintf(fname,"%s_%d_%d_%d.txt",file_path_prefix,charge,size_idx,m);

					ofstream ofs(fname);
					ofs << idxs_to_write.size() << endl;
					for (i=0; i<idxs_to_write.size(); i++)
						all_tps[charge][idxs_to_write[i]].write_to_stream(ofs);
					ofs.close();

					cout << "Wrote " << idxs_to_write.size() << " to " << fname << endl;
				}
				else
				{
					char fname[256], ts_name[256];
					const int num_test = int(prop_ts * (float)idxs_to_write.size());
					vector<int>  ts_idxs;
					vector<bool> ts_ind;
					choose_k_from_n(num_test,idxs_to_write.size(),ts_idxs);
					ts_ind.resize(idxs_to_write.size(),false);
					int i;
					for (i=0; i<ts_idxs.size(); i++)
						ts_ind[ts_idxs[i]]=true;

					sprintf(fname,"%s_%d_%d_%d.txt",file_path_prefix,charge,size_idx,m);
					ofstream ofs(fname);
					ofs << idxs_to_write.size() - num_test<< endl;
					for (i=0; i<idxs_to_write.size(); i++)
						if (! ts_ind[i])
							all_tps[charge][idxs_to_write[i]].write_to_stream(ofs);
					ofs.close();

					cout << "Wrote " << idxs_to_write.size() - num_test << " to " << fname << endl;

					sprintf(ts_name,"%s_%d_%d_%d.txt",test_path_prefix,charge,size_idx,m);
					ofstream ofs_test(ts_name);
					ofs_test << num_test<< endl;
					for (i=0; i<idxs_to_write.size(); i++)
						if (ts_ind[i])
							all_tps[charge][idxs_to_write[i]].write_to_stream(ofs_test);
					ofs_test.close();


					cout << "Wrote " << num_test << " to " << ts_name << endl;

				}

				// find mix/max sizes and min/max basicity scores
				mass_t min_pm_with_19 = 1000000.0;
				mass_t max_pm_with_19 = 0.0;
				float min_basicity_score = 10000.0;
				float max_basicity_score = -1.0;

				for (i=0; i<idxs_to_write.size(); i++)
				{
					const TrainingPeptide& tp = all_tps[charge][idxs_to_write[i]];
					const float bs = tp.get_basicity_score();

					if (tp.pm_with_19>max_pm_with_19)
						max_pm_with_19 = tp.pm_with_19;

					if (tp.pm_with_19<min_pm_with_19)
						min_pm_with_19 = tp.pm_with_19;

					if (bs<min_basicity_score)
						min_basicity_score = bs;

					if (bs>max_basicity_score)
						max_basicity_score = bs;
				}
				cout << "pm ranges: " << setprecision(2) << min_pm_with_19 << " - " << max_pm_with_19 << endl;
				cout << "basicity : " << min_basicity_score << " - " << max_basicity_score << endl;

				cout << "-------------------------------------" << endl;
			}
		}
	}
}


/***************************************************************************
Reads the sample tps into the dataset and creats sample for a given 
fragment type idx
****************************************************************************/
void PeakRankModel::read_training_peptides_into_rank_boost_dataset(
										int frag_type_idx,
										int spec_charge,
										const vector<TrainingPeptide>& sample_tps,
										RankBoostDataset& rank_ds,
										vector<float>& peak_intens,
										vector<PeakStart>& peak_starts,
										vector<float>& max_annotated_intens) const
{
	const vector<mass_t>& aa2mass = config->get_aa2mass();
	rank_ds.clear();
	peak_intens.clear();
	peak_starts.clear();
	max_annotated_intens.clear();

	int i;
	for (i=0; i<sample_tps.size(); i++)
	{
		const TrainingPeptide& tp = sample_tps[i];
		const int frag_pos = tp.get_frag_idx_pos(frag_type_idx);

		if (frag_pos<0)
			continue;

		PeakStart ps;

		ps.peptide_sample_idx = i;
		ps.peak_start_idx=rank_ds.get_num_samples();
		ps.num_peaks=0;

		// scan all cuts for max inten
		float max_ann_inten=0;
		int f;
		for (f=0; f<tp.intens.size(); f++)
		{
			int c;
			for (c=1; c<tp.intens[f].size()-1; c++)
				if (tp.intens[f][c]>max_ann_inten)
					max_ann_inten=tp.intens[f][c];
		}

		if (max_ann_inten<=0)
			continue;
		
		const vector<float>& intens = tp.intens[frag_pos];
		vector<int> cut_ranks;

		// gives integer values to cut idxs according to intensity
		calc_cut_ranks(intens,cut_ranks);

		mass_t c_term_mass = tp.n_mass;
		int cut_idx;
		for (cut_idx=1; cut_idx<intens.size(); cut_idx++)
			c_term_mass+=aa2mass[tp.amino_acids[cut_idx-1]];

		mass_t cut_mass=tp.n_mass;
		for (cut_idx=1; cut_idx<intens.size(); cut_idx++)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -