📄 edgemodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#include "EdgeModel.h"
#include "PrmGraph.h"
#include "AnnotatedSpectrum.h"
#include "Model.h"
#include "ME_REG.h"

// parameters for all regional models (magic numbers)
float EdgeModel::weight_single_state_score;
float EdgeModel::weight_single_offset_score;
float EdgeModel::weight_multi_state_score;
float EdgeModel::weight_multi_offset_score;
float EdgeModel::weight_transfer_score;
float EdgeModel::weight_combo_score;
float EdgeModel::multi_aa_penalty;
float EdgeModel::bad_pair_penalty;
float EdgeModel::problematic_pair_penalty;

vector<float> EdgeModel::tol_limits;


int RegionalEdgeModel::calc_tol_bin_idx(const Breakage* n_break, const Breakage *c_break,
						 mass_t exp_edge_mass) const
{
	mass_t total_offset=0;
	int num_offsets=0;
	int i;

	for (i=0; i<strong_frag_idxs.size(); i++)
	{
		const int n_pos = n_break->get_position_of_frag_idx(strong_frag_idxs[i]);
		const int c_pos = c_break->get_position_of_frag_idx(strong_frag_idxs[i]);
		if (n_pos<0 || c_pos<0)
			continue;

		const mass_t n_mass = n_break->fragments[n_pos].mass;
		const mass_t c_mass = c_break->fragments[c_pos].mass;
		const mass_t offset = fabs(fabs((n_mass-c_mass)*strong_frag_charges[i])-exp_edge_mass);
		total_offset+=offset;
		num_offsets++;
	}

	total_offset *= one_over_tolerance;
		
	return EdgeModel::get_tol_bin_idx((float)total_offset);
}



void RegionalEdgeModel::set_params(Config *config, int c, int s, int r)
{
	charge = c;
	region_idx = r;
	size_idx = s;

	const RegionalFragments& rf = config->get_regional_fragments(c,s,r);
	strong_frag_idxs = rf.get_strong_frag_type_idxs();
	num_states = 1;

	strong_frag_charges.resize(strong_frag_idxs.size());
	int i;
	for (i=0; i<strong_frag_idxs.size(); i++)
	{
		strong_frag_charges[i]=config->get_fragment(strong_frag_idxs[i]).charge;
		num_states *=2;
	}

	one_over_tolerance = 1.0/config->get_tolerance();
}


void RegionalEdgeModel::train_regional_edge_model(void *model_ptr, const FileManager& fm, const FileSet& fs)
{
	Model  *model = (Model *)model_ptr;
	Config *config = model->get_config();
	const vector<mass_t>& aa2mass = config->get_aa2mass();
	const vector<int>& org_aa = config->get_org_aa();
	const mass_t tolerance = config->get_tolerance();
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	const int num_limits = EdgeModel::get_num_tol_limits();

	bool verbose = true;

	vector<double> good_single_state_counts, bad_single_state_counts;
	vector< vector< double > > good_single_off_counts, bad_single_off_counts;
	vector<double> good_multi_state_counts, bad_multi_state_counts;
	vector< vector< double > > good_multi_off_counts, bad_multi_off_counts;

	vector< vector< double > > good_trans_counts, bad_trans_counts;

	vector< vector< double > > good_aa_combo_counts, bad_aa_combo_counts;

	good_single_state_counts.resize(num_states,2.0);
	bad_single_state_counts.resize(num_states,2.0);
	good_multi_state_counts.resize(num_states,2.0);
	bad_multi_state_counts.resize(num_states,2.0);

	int i;
	good_single_off_counts.resize(num_states);
	bad_single_off_counts.resize(num_states);
	good_multi_off_counts.resize(num_states);
	bad_multi_off_counts.resize(num_states);
	for (i=0; i<num_states; i++)
	{
		good_single_off_counts[i].resize(num_limits,2.0);
		bad_single_off_counts[i].resize(num_limits,2.0);
		good_multi_off_counts[i].resize(num_limits,2.0);
		bad_multi_off_counts[i].resize(num_limits,2.0);
	}

	good_trans_counts.resize(num_states);
	bad_trans_counts.resize(num_states);
	for (i=0; i<num_states; i++)
	{
		good_trans_counts[i].resize(num_states,5.0);
		bad_trans_counts[i].resize(num_states,5.0);
	}

	good_aa_combo_counts.resize(Val+1);
	bad_aa_combo_counts.resize(Val+1);
	for (i=0; i<=Val; i++)
	{
		good_aa_combo_counts[i].resize(Val+1,1.0);
		bad_aa_combo_counts[i].resize(Val+1,1.0);
	}

	vector<QCPeak> peaks;
	peaks.resize(10000);
	BasicSpecReader bsr;

	if (verbose)
	{
		cout << endl;
		cout << "Computing Edge Model for " << charge << " " << size_idx << " " << region_idx << endl << endl;
	}


	double num_good_combos=0;
	double num_bad_combos=0;
	int sc;
	for (sc=0; sc<all_ssf.size(); sc++)
	{
		SingleSpectrumFile *ssf = all_ssf[sc];
		const Peptide& pep = ssf->peptide;
		const vector<int> pep_aas = pep.get_amino_acids();
		const mass_t pm_with_19 = pep.get_mass_with_19();
		BasicSpectrum bs;
						
		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
		bs.num_peaks = num_peaks;
		bs.peaks = &peaks[0];
		bs.ssf = ssf;

		AnnotatedSpectrum as;
		as.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
		as.set_peptide(pep);
		as.annotate_spectrum(pm_with_19,true);

		PrmGraph prm;
		prm.create_graph_from_spectrum(model,&as,pm_with_19,charge);
		const int num_nodes = prm.get_num_nodes();
		vector<int> good_node_inds;
		vector<mass_t> correct_break_masses;
		pep.calc_expected_breakage_masses(config,correct_break_masses);
		good_node_inds.resize(num_nodes,0);
			
		int i;
		for (i=0; i<correct_break_masses.size(); i++)
		{
			int max_node_idx = prm.get_max_score_node(correct_break_masses[i],tolerance);
			if (max_node_idx>=0)
				good_node_inds[max_node_idx]=1;
	
			max_node_idx = prm.get_max_score_node(pm_with_19 - correct_break_masses[i],tolerance);
			if (max_node_idx>=0)
				good_node_inds[max_node_idx]=2;	
		}

		const vector<Node>& nodes = prm.get_nodes();
		const vector<MultiEdge>& edges = prm.get_multi_edges();

		// collect edge data
		const vector<MultiEdge>& multi_edges = prm.get_multi_edges();
		for (i=0; i<multi_edges.size(); i++)
		{
			const MultiEdge& edge = multi_edges[i];
			if (prm.get_node(edge.n_idx).breakage.region_idx != region_idx)
				continue;

			// state offset data
			bool good_edge=false;
			if (good_node_inds[edge.n_idx] == 1 && good_node_inds[edge.c_idx] == 1)
			{
				good_edge=true;
			}
			else if (good_node_inds[edge.n_idx] == 0 && good_node_inds[edge.c_idx] == 0)
			{
				good_edge=false;
			}
			else
				continue;

			const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
			const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;

			if (! n_break ||  ! c_break )
				continue;

			const int n_state = calc_state(n_break);
			const int c_state = calc_state(c_break);

	//		if (n_state==0 || c_state == 0)
	//			continue;

			const int intersec_state = this->calc_intersec_state(n_state,c_state);

			if (good_edge)
			{
				if (edge.num_aa==1)
				{
					good_single_state_counts[intersec_state]++;
				}
				else
					good_multi_state_counts[intersec_state]++;
			}
			else
				if (edge.num_aa==1)
				{	
					bad_single_state_counts[intersec_state]++;
				}
				else
					bad_multi_state_counts[intersec_state]++;

			int v;
			for (v=0; v<edge.variant_ptrs.size(); v++)
			{
				int *v_ptr = edge.variant_ptrs[v];
				const int num_aa = *v_ptr++;

				mass_t exp_edge_mass=0;
				int j;
				for (j=0; j<num_aa; j++)
					exp_edge_mass+=aa2mass[v_ptr[j]];

				const int offset_bin = this->calc_tol_bin_idx(n_break,c_break,exp_edge_mass);
				if (good_edge)
				{
					int j;
					for (j=0; j<correct_break_masses.size(); j++)
						if (fabs(correct_break_masses[j]-n_break->mass)<1.0)
								break;
					if (j>=correct_break_masses.size()-1)
					{
						continue;
					}
					else 
						if (  (num_aa == 2 && (v_ptr[0] != pep_aas[j] || v_ptr[1] != pep_aas[j+1])) ||
							  (num_aa == 1 && (v_ptr[0] != pep_aas[j]) )  )
							continue;

					if (num_aa==1)
					{
						good_single_off_counts[intersec_state][offset_bin]++;
					}
					else
						good_multi_off_counts[intersec_state][offset_bin]++;

					good_trans_counts[n_state][c_state]++;
				}
				else
				{
					if (num_aa==1)
					{
						bad_single_off_counts[intersec_state][offset_bin]++;
					}
					else
						bad_multi_off_counts[intersec_state][offset_bin]++;

					bad_trans_counts[n_state][c_state]++;
				}

				// fill reg samples
				if (num_aa==2)
				{
					const int n_aa = v_ptr[0];
					const int c_aa = v_ptr[1];
					// make sure variant is good
					if (good_edge)
					{
						good_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
						num_good_combos++;
					}
					else
					{
						bad_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
						num_bad_combos++;
					}
				}
			}
		}

		if (sc>0 && sc %500 == 0)
		{
			cout << sc << "/" << all_ssf.size() << " ..." << endl;
		}
	}

	// compute probs
	double n_good_single_state_counts=0, n_bad_single_state_counts=0;
	double n_good_multi_state_counts=0, n_bad_multi_state_counts=0;
	double n_good_trans_counts=0, n_bad_trans_counts=0;
	vector<double> n_good_single_off_counts,n_bad_single_off_counts;
	vector<double> n_good_multi_off_counts, n_bad_multi_off_counts;
	
	int j;
	for (i=0; i<good_single_state_counts.size(); i++)
	{
		n_good_single_state_counts+=good_single_state_counts[i];
		n_good_multi_state_counts+=good_multi_state_counts[i];
	}

	for (i=0; i<bad_single_state_counts.size(); i++)
	{
		n_bad_single_state_counts+=bad_single_state_counts[i];
		n_bad_multi_state_counts+=bad_multi_state_counts[i];
	}

	n_good_single_off_counts.resize(num_states,0);
	n_bad_single_off_counts.resize(num_states,0);
	n_good_multi_off_counts.resize(num_states,0);
	n_bad_multi_off_counts.resize(num_states,0);
	for (i=0; i<num_states; i++)
		for (j=0; j<num_limits; j++)
		{
			n_good_single_off_counts[i]+=good_single_off_counts[i][j];
			n_bad_single_off_counts[i]+=bad_single_off_counts[i][j];
			n_good_multi_off_counts[i]+=good_multi_off_counts[i][j];
			n_bad_multi_off_counts[i]+=bad_multi_off_counts[i][j];
		}
	
	n_good_trans_counts=0;
	n_bad_trans_counts=0;
	for (i=0; i<num_states; i++)
		for (j=0; j<num_states; j++)
		{
			n_good_trans_counts+=good_trans_counts[i][j];
			n_bad_trans_counts+=bad_trans_counts[i][j];
		}

	log_odds_state.resize(num_states);
	multi_log_odds_state.resize(num_states);
	for (i=0; i<num_states; i++)
	{
		log_odds_state[i]=log(good_single_state_counts[i]/n_good_single_state_counts)-
		log(bad_single_state_counts[i]/n_bad_single_state_counts);

		multi_log_odds_state[i]=log(good_multi_state_counts[i]/n_good_multi_state_counts)-
		log(bad_multi_state_counts[i]/n_bad_multi_state_counts);
	}

	log_odds_offset.resize(num_states);
	multi_log_odds_offset.resize(num_states);
	log_odds_transfer.resize(num_states);
	for (i=0; i<num_states; i++)
	{
		log_odds_offset[i].resize(num_limits);
		multi_log_odds_offset[i].resize(num_limits);
		log_odds_transfer[i].resize(num_states);

		int j;
		for (j=0; j<num_limits; j++)
		{
			log_odds_offset[i][j]=log(good_single_off_counts[i][j]/n_good_single_off_counts[i])-
				log(bad_single_off_counts[i][j]/n_bad_single_off_counts[i]);
			if (j>0 && log_odds_offset[i][j]>log_odds_offset[i][j-1])
				log_odds_offset[i][j]=log_odds_offset[i][j-1];

			multi_log_odds_offset[i][j]=log(good_multi_off_counts[i][j]/n_good_multi_off_counts[i])-
				log(bad_multi_off_counts[i][j]/n_bad_multi_off_counts[i]);
			if (j>0 && multi_log_odds_offset[i][j]>multi_log_odds_offset[i][j-1])
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -