⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 edgemodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:

	// resize according to threhsolds, assume that the last mass is POS_INF
	init_edge_model_defaults();
	regional_edge_models.resize(size_thresholds.size());

	int c;
	for (c=0; c<size_thresholds.size(); c++)
		if (size_thresholds[c].size()>0)
			regional_edge_models[c].resize(size_thresholds[c].size());


	for (c=0; c<regional_edge_models.size(); c++)
	{
		if (specific_charge>0 && c != specific_charge)
			continue;

		int s;
		for (s=0; s<regional_edge_models[c].size(); s++)
		{
			const int num_regions = config->get_num_regions(c,s);
			regional_edge_models[c][s].resize(num_regions,NULL);

			// train all regions
			FileSet fs;
			mass_t min_mz=0;
			mass_t max_mz = 10000;
			if (s>0)
				min_mz = (size_thresholds[c][s-1]+c-1)/(mass_t)c;
			if (s< size_thresholds[c].size())
				max_mz = (size_thresholds[c][s]+c-1)/(mass_t)c;

			fs.select_files_in_mz_range(fm,min_mz,max_mz,c);

			if (fs.get_total_spectra()<300)
			{
				cout << "Warning: not enough spectra for charge " << c << " size " << s << ", not training edge models!" << endl;
				continue;
			}
			fs.randomly_reduce_ssfs(20000);

			// set region frag_idxs
			int r;
			for (r=0; r<num_regions; r++)
			{
				regional_edge_models[c][s][r] = new RegionalEdgeModel;
				regional_edge_models[c][s][r]->set_params(config,c,s,r);
				regional_edge_models[c][s][r]->train_regional_edge_model(model,fm,fs);
			}
		}
	}

	// train double combo model
	if (1)
	{
		FileSet fs;
		fs.select_all_files(fm);
		fs.randomly_reduce_ssfs(250000);

		Model  *model = (Model *)model_ptr;
		Config *config = model->get_config();
		const mass_t tolerance = config->get_tolerance();
		const vector<mass_t>& aa2mass = config->get_aa2mass();
		const vector<int>& org_aa = config->get_org_aa();
		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();

	

		bool verbose = true;

		vector< vector< double > > good_aa_combo_counts, bad_aa_combo_counts;

		int i;
		good_aa_combo_counts.resize(Val+1);
		bad_aa_combo_counts.resize(Val+1);
		for (i=0; i<=Val; i++)
		{
			good_aa_combo_counts[i].resize(Val+1,1.0);
			bad_aa_combo_counts[i].resize(Val+1,1.0);
		}

		vector<QCPeak> peaks;
		peaks.resize(10000);
		BasicSpecReader bsr;

		if (verbose)
		{
			cout << endl;
			cout << "Computing double edge model..." << endl;
		}

		double num_good_combos=0;
		double num_bad_combos=0;
		int sc;
		for (sc=0; sc<all_ssf.size(); sc++)
		{
			SingleSpectrumFile *ssf = all_ssf[sc];
			const Peptide& pep = ssf->peptide;
			const vector<int> pep_aas = pep.get_amino_acids();
			const mass_t pm_with_19 = pep.get_mass_with_19();
			BasicSpectrum bs;
							
			const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
			bs.num_peaks = num_peaks;
			bs.peaks = &peaks[0];
			bs.ssf = ssf;

			AnnotatedSpectrum as;
			as.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
			as.set_peptide(pep);
			as.annotate_spectrum(pm_with_19,true);

			PrmGraph prm;
			prm.create_graph_from_spectrum(model,&as,pm_with_19,ssf->charge);
			const int num_nodes = prm.get_num_nodes();
			vector<int> good_node_inds;
			vector<mass_t> correct_break_masses;
			pep.calc_expected_breakage_masses(config,correct_break_masses);
			good_node_inds.resize(num_nodes,0);
				
			int i;
			for (i=0; i<correct_break_masses.size(); i++)
			{
				int max_node_idx = prm.get_max_score_node(correct_break_masses[i],tolerance);
				if (max_node_idx>=0)
					good_node_inds[max_node_idx]=1;
		
				max_node_idx = prm.get_max_score_node(pm_with_19 - correct_break_masses[i],tolerance);
				if (max_node_idx>=0)
					good_node_inds[max_node_idx]=2;	
			}

			const vector<Node>& nodes = prm.get_nodes();
			const vector<MultiEdge>& edges = prm.get_multi_edges();

			// collect edge data
			const vector<MultiEdge>& multi_edges = prm.get_multi_edges();
			for (i=0; i<multi_edges.size(); i++)
			{
				const MultiEdge& edge = multi_edges[i];
				if (edge.num_aa !=2)
					continue;

				// state offset data
				bool good_edge=false;
				if (good_node_inds[edge.n_idx] == 1 && good_node_inds[edge.c_idx] == 1)
				{
					good_edge=true;
				}
				else if (good_node_inds[edge.n_idx] == 0 && good_node_inds[edge.c_idx] == 0)
				{
					good_edge=false;
				}
				else
					continue;

				const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
				const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;

				if (! n_break ||  ! c_break )
					continue;

				int v;
				for (v=0; v<edge.variant_ptrs.size(); v++)
				{
					int *v_ptr = edge.variant_ptrs[v];
					const int num_aa = *v_ptr++;

					if (good_edge)
					{
						int j;
						for (j=0; j<correct_break_masses.size(); j++)
							if (fabs(correct_break_masses[j]-n_break->mass)<1.0)
									break;
						if (j>=correct_break_masses.size()-1)
						{
							continue;
						}
						else 
							if ( num_aa == 2 && (v_ptr[0] != pep_aas[j] || v_ptr[1] != pep_aas[j+1])) 
								continue;
					}

					const int n_aa = v_ptr[0];
					const int c_aa = v_ptr[1];

					// make sure variant is good
					if (good_edge)
					{
						good_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
						num_good_combos++;
					}
					else
					{
						bad_aa_combo_counts[org_aa[n_aa]][org_aa[c_aa]]++;
						num_bad_combos++;
					}
				}
			}

			if (sc>0 && sc %1000 == 0)
			{
				cout << sc << "/" << all_ssf.size() << " ..." << endl;
			}
		}
	

		cout << "Combo table:" << endl;
		double_combo_scores.resize(Val+1);


		for (i=Ala; i<=Val; i++)
		{
			double_combo_scores[i].resize(Val+1,0);
			if (i==Ile)
				continue;
			int j;
			for (j=Ala; j<=Val; j++)
				if (j!=Ile)
					double_combo_scores[i][j]=log(good_aa_combo_counts[i][j]/num_good_combos)-
											  log(bad_aa_combo_counts[i][j]/num_bad_combos);

		}

		const vector<string>& aa2label = config->get_aa2label();
		cout << fixed << setprecision(1) << setw(5);
		cout << "      ";
		for (i=Ala; i<=Val; i++)
			cout << aa2label[i] << "     ";
		cout << endl;

		for (i=Ala; i<=Val; i++)
		{
			cout << aa2label[i] << "     ";
			int j;
			for (j=Ala; j<=Val; j++)
			{
				printf("%.1f  ",double_combo_scores[i][j]);
				if (double_combo_scores[i][j]>=0)
					printf(" ");
			}
			cout << endl;
		}	
	}

	ind_was_initialized = true;
}



void EdgeModel::init_edge_model_defaults()
{
	weight_single_state_score  = 1.0;
	weight_single_offset_score = 1.0;
	weight_multi_state_score   = 1.0;
	weight_multi_offset_score  = 1.0;
	weight_transfer_score      = 1.0;
	weight_combo_score         = 3.0;
	multi_aa_penalty           = -8.0;
	bad_pair_penalty		   = -5.0;
	problematic_pair_penalty   = -2.0;

	tol_limits.clear();
	tol_limits.push_back(0.1);
	tol_limits.push_back(0.25);
	tol_limits.push_back(0.5);
	tol_limits.push_back(0.9);
	tol_limits.push_back(POS_INF);

	ind_was_initialized = false;
}

// assigns probs and scores to graph edges
void EdgeModel::score_graph_edges(PrmGraph& prm) const
{
	if (! this->ind_was_initialized)
		return;

	const vector<int>& org_aa = config->get_org_aa();
	const vector<mass_t>& aa2mass = config->get_aa2mass();
	const vector< vector< bool > >& allowed_double_edge = config->get_allowed_double_edge();
	const vector< vector< bool > >& double_edge_with_same_mass_as_single = config->get_double_edge_with_same_mass_as_single();
	const int charge = prm.charge;
	const int size_idx = prm.size_idx;

	const vector<Node>& nodes = prm.nodes;
	const int num_edges = prm.multi_edges.size();
	int i;
	for (i=0; i<num_edges; i++)
	{
		MultiEdge& edge = prm.multi_edges[i];
		const Breakage *n_break = &prm.get_node(edge.n_idx).breakage;
		const Breakage *c_break = &prm.get_node(edge.c_idx).breakage;
		const int region_idx = (n_break ? n_break->region_idx : 0);
		const RegionalEdgeModel* rem = regional_edge_models[charge][size_idx][region_idx];
		if (! rem)
		{
			cout << "Error: no edge model for " << charge << " " << size_idx << " " << region_idx << "!" << endl;
			exit(1);
		}

		const int n_state = (n_break ? rem->calc_state(n_break) : 0);
		const int c_state = (c_break ? rem->calc_state(c_break) : 0);		
		const int intersec_state = rem->calc_intersec_state(n_state,c_state);

		const bool add_only_positive_state_scores = (edge.n_idx == 0 || edge.c_idx == nodes.size()-1 || 
												     nodes[edge.c_idx].type == NODE_DIGEST);
		
		edge.max_variant_score = NEG_INF;

		int v;
		for (v=0; v<edge.variant_ptrs.size(); v++)
		{
			int *v_ptr = edge.variant_ptrs[v];
			const int num_aa = *v_ptr++;

			float edge_score = 0;

			// this portion needs to be trained for each charge/size/region
			if (rem->has_values)
			{
				mass_t exp_edge_mass=0;
				int j;
				for (j=0; j<num_aa; j++)
					exp_edge_mass+=aa2mass[v_ptr[j]];

				const int offset_bin = rem->calc_tol_bin_idx(n_break,c_break,exp_edge_mass);
			
				if (num_aa == 1)
				{
					const float add_score = weight_single_state_score * rem->log_odds_state[intersec_state] +
											weight_single_offset_score * rem->log_odds_offset[intersec_state][offset_bin];

					if (! add_only_positive_state_scores || add_score>0)
						edge_score += add_score;
					
				}
				else
				{
					edge_score += multi_aa_penalty;
					if (num_aa>2)
						edge_score += (num_aa-2)*0.5*multi_aa_penalty;

					const float add_score = weight_multi_state_score * rem->multi_log_odds_state[intersec_state] +
											weight_multi_offset_score * rem->multi_log_odds_offset[intersec_state][offset_bin];

					if (! add_only_positive_state_scores || add_score>0)
						edge_score += add_score;
				}
				edge_score += weight_transfer_score * rem->log_odds_transfer[n_state][c_state];
			}

			// this is in the EdgeModel, common to all charge/size/regions
			if (num_aa>1)
			{
				const int n_aa = v_ptr[0];
				const int c_aa = v_ptr[num_aa-1];
				const int org_n_aa = org_aa[n_aa];
				const int org_c_aa = org_aa[c_aa];
				edge_score += weight_combo_score * double_combo_scores[org_n_aa][org_c_aa];

				if (! allowed_double_edge[org_n_aa][org_c_aa])
					edge_score += bad_pair_penalty;

				if (double_edge_with_same_mass_as_single[org_n_aa][org_c_aa])
					edge_score += problematic_pair_penalty;
			}

			edge.variant_scores[v] += edge_score;

			if (edge.variant_scores[v]>edge.max_variant_score)
				edge.max_variant_score = edge.variant_scores[v];
		}
	}	
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -