discretepeakmodel.cpp

MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectra [...]

C++ (page 1 of 2)
#include "DiscretePeakModel.h"

void DiscretePeakModel::clone_charge_model(int source_charge, int target_charge)
{
	config.clone_regional_fragment_sets(source_charge, target_charge);

	if (regional_models.size()<=target_charge)
		regional_models.resize(target_charge+1);

	regional_models[target_charge] = regional_models[source_charge];
}
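
// --- Illustrative usage sketch (not part of the original source) ---
// A model trained for one precursor charge can be reused for another, e.g.
//   model.clone_charge_model(2, 3);   // copy the charge-2 tables to charge 3
// (the charge values here are made-up examples).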


/********************************************************************
Learns all different table models, and writes them to the resource
directory.
*********************************************************************/
void DiscretePeakModel::train_score_model(const char *name, const FileManager& fm, 
		 int charge, int size_idx, int region_idx)
{
	int c;

	for (c=0; c<=fm.get_max_charge(); c++)
	{
		if (charge>0 && charge != c)
			continue;

		if (fm.get_num_spectra(c) < 1)
			continue;

		learn_parent_dependencies(fm,c);

		learn_q_rand(fm,c);

		print_report(c,0,1);
	}


	convert_probs_to_scores();
	// write score model
	
	write_score_model(name);

	// trains and writes edge models
//	edge_model.train_all_edge_models(fm,this);

//	this->write_model();
//	this->read_model(name);

//	edge_model.read_edge_models(&config,(char *)config.get_model_name().c_str());
	// trains and writes tag models 
//	tag_model.train_models(fm,this);
}
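
// --- Illustrative usage sketch (not part of the original source) ---
// One plausible way a caller could drive training, assuming the FileManager
// has already been loaded with annotated training spectra (its loading API is
// not shown in this file). Passing charge<=0 trains tables for every charge
// that has spectra; the model name is made up, and size_idx/region_idx are
// passed as -1 purely as placeholder values.
#if 0
static void example_train(const FileManager& fm)
{
	DiscretePeakModel model;
	model.train_score_model("example_model", fm, 0, -1, -1);
}
#endif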







// learns the probabilities of randomly observing peaks
void DiscretePeakModel::learn_q_rand(const FileManager& fm, int charge)
{
	int i;
	FileSet fs;

	const vector<FragmentType>& all_fragments = config.get_all_fragments();
	const int max_frag_type_idx = config.get_all_fragments().size();
	
	fs.select_files(fm,0,2500,-1,-1,charge);

	vector<double> noise_probs;
	vector<double> num_files_with_rank_level; // to make an unbiased calculation of q_rand


	noise_probs.resize(level_thresholds.size(),0.001);
	num_files_with_rank_level.resize(level_thresholds.size()+1,0);

	for (i=0; i<fs.get_total_spectra(); i++)
	{
		AnnotatedSpectrum as;

		fs.get_next_spectrum(fm,&config,&as);
		this->init_model_for_scoring_spectrum(&as);

		int spec_rank = this->get_lowest_level_in_spectrum(&as);
		int j;
		for (j=0; j<=spec_rank; j++)
			num_files_with_rank_level[j]++;

		as.annotate_spectrum(as.get_true_mass_with_19());

		mass_t max_peak_mass = as.get_max_peak_mass();
		if (max_peak_mass > as.get_org_pm_with_19())
			max_peak_mass = as.get_org_pm_with_19();

		mass_t peak_area = max_peak_mass - as.get_min_peak_mass();

		// add annotations to the peak statistics only from the breakages 1 to n-1
		double bin_prob = config.get_pm_tolerance()*2 / peak_area;
		int b;
		vector<Breakage>& breakages = as.get_non_const_breakages();
		const vector<Peak>& peaks = as.get_peaks();
		vector<bool> used_peaks;
		used_peaks.resize(peaks.size(),false);
		for (b=1; b<breakages.size()-1; b++)
		{
			this->set_breakage_peak_levels(&breakages[b]);
			int f;
			for (f=0; f<breakages[b].fragments.size(); f++)
			{
				const int f_idx = breakages[b].fragments[f].frag_type_idx;
				const int p_idx = breakages[b].fragments[f].peak_idx;
				used_peaks[p_idx]=true;
			}
		}

		// add unused peaks to noise statistics
		const vector< vector<PeakAnnotation> >& peak_annotations = as.get_peak_annotations();

		int p_idx;
		for (p_idx =0; p_idx<peaks.size(); p_idx++)
		{
			if (! used_peaks[p_idx] && peak_annotations[p_idx].size() == 0)
			{
				noise_probs[get_peak_level(p_idx)]+=bin_prob;
			}
		}
	}

	double inten_prob = 0;
	for (i=1; i<noise_probs.size(); i++)
	{
		if (num_files_with_rank_level[i]>0)
		{
			noise_probs[i] /= num_files_with_rank_level[i];
			inten_prob += noise_probs[i];
		}
	}
	noise_probs[0] = 1.0 - inten_prob;

	// correct frag_probs to account for noise (some of the annotated peaks might be noise)

	// calculate q values
	q_rand = noise_probs;
}
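
// --- Illustrative sketch (not part of the original source) ---
// Rehearses the q_rand bookkeeping above on simplified inputs: every
// unannotated peak contributes a window of width 2*pm_tolerance, divided by
// the usable m/z range, to the noise probability of its intensity level;
// each level is then averaged over the spectra that reach it, and level 0
// ("no peak") absorbs the remaining mass so the distribution sums to 1.
// All names below (ExampleNoisePeak, example_q_rand) are hypothetical.
#include <vector>

struct ExampleNoisePeak { int level; };  // stand-in for an unannotated peak

static std::vector<double> example_q_rand(double pm_tolerance,
										  double peak_area,
										  const std::vector<ExampleNoisePeak>& noise_peaks,
										  const std::vector<double>& num_spectra_with_level)
{
	std::vector<double> q(num_spectra_with_level.size(), 0.0);
	const double bin_prob = pm_tolerance * 2.0 / peak_area;

	for (size_t i = 0; i < noise_peaks.size(); i++)
		q[noise_peaks[i].level] += bin_prob;          // accumulate noise mass per level

	double inten_prob = 0.0;
	for (size_t r = 1; r < q.size(); r++)
		if (num_spectra_with_level[r] > 0)
		{
			q[r] /= num_spectra_with_level[r];        // average over contributing spectra
			inten_prob += q[r];
		}

	q[0] = 1.0 - inten_prob;                          // "no peak" probability closes the distribution
	return q;
}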




void DiscretePeakModel::print_frag_probs(const vector<int>& frag_type_idxs, 
										 int charge, int size, int region, ostream& os) const
{
	const vector<FragmentType>& all_fragments = config.get_all_fragments();

	int f;
	int len=15;
	os << "Rank     " << "q_rand   ";
	for (f=0; f<frag_type_idxs.size(); f++)
	{
		int f_idx = frag_type_idxs[f];
		int sl = (len - all_fragments[f_idx].label.length())/2;
		os << setw(sl) << left << " " << setw(len -sl) << left << all_fragments[f_idx].label;
	}
	os << endl;
	os << "                  ";
	for (f=0; f<frag_type_idxs.size(); f++)
		os << " prob   score  ";

	os<< endl;


//	regional_models[2][0][1].independent_frag_tables[0].

//	const vector< vector<double> >& f_probs = regional_models[charge][size][region].independent_frag_tables;

	vector<double> totals;
	totals.resize(frag_type_idxs.size(),0);
	double total_rand=0;
	int r;
	for (r=0; r<level_thresholds.size(); r++)
	{
		if (r==0)
		{
			os << setw(8) << left << "no peak";
		}
		else if (level_thresholds[r]-level_thresholds[r-1]==1)
		{
			os << setw(8) << left << level_thresholds[r];
		}
		else
		{
			os << setw(3) << left << level_thresholds[r-1]+1 << "-" << setw(3) << right << level_thresholds[r] << " ";
		}
		os << " " << setprecision(5) << left << q_rand[r] << "   " ;
		total_rand+=q_rand[r];
		for (f=0; f<frag_type_idxs.size(); f++)
		{
			int f_idx = frag_type_idxs[f];
			score_t frag_prob = regional_models[charge][size][region].independent_frag_tables[f_idx].get_score_prob_from_idx(r); 
			os << setprecision(4) << frag_prob << "  ";
			os << setw(5) << setprecision(2) << log(frag_prob / q_rand[r]) << "  ";
			totals[f]+= frag_prob;
		}
		os << endl;
		if (r == 0)
			os << endl;
	}

	os << endl;
	os << "Toatal:  ";
	os << setw(10) << left << setprecision(4) << total_rand;
	for (f=0; f<frag_type_idxs.size(); f++)
	{
		os << setw(len) << left << setprecision(4) << totals[f];
	}
	os << endl;
}
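
// --- Illustrative sketch (not part of the original source) ---
// The "score" column printed above is a log-likelihood ratio: the probability
// of observing a peak at a given intensity level when the fragment is present,
// divided by the probability q_rand of observing such a peak at random.
// example_level_score is a hypothetical standalone version of that formula.
#include <cmath>

static double example_level_score(double frag_prob, double q_rand_level)
{
	// Positive when the peak is better explained by the fragment model than
	// by random noise, negative otherwise.
	return std::log(frag_prob / q_rand_level);
}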


void DiscretePeakModel::print_report(int charge, int size, int region,
									 ostream& os) const
{
	const vector<int> frag_type_idxs = config.get_regional_fragment_type_idxs(charge,size,region);

	int i;
	for (i=0; i<frag_type_idxs.size(); i+=4)
	{
		vector<int> f_idxs;
		f_idxs.clear();
		int f;
		for (f=i; f<i+4 && f<frag_type_idxs.size(); f++)
			f_idxs.push_back(frag_type_idxs[f]);

//		this->print_frag_probs(f_idxs, charge, size, region, os);
		os << endl << endl;
	}

	
}



struct idx_dkl {
	bool operator< ( const idx_dkl& other) const
	{
		return dkl_sum > other.dkl_sum;
	}

	int idx;
	double dkl_sum;
};
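
// --- Illustrative sketch (not part of the original source) ---
// idx_dkl sorts in descending order of dkl_sum, so after std::sort the
// candidate parent tables with the largest divergence from the independent
// model come first. example_kl_divergence is a hypothetical helper showing
// how such a dkl_sum could be computed from two discrete distributions.
#include <algorithm>
#include <cmath>
#include <vector>

static double example_kl_divergence(const std::vector<double>& p,  // conditional distribution
									const std::vector<double>& q)   // independent (marginal) distribution
{
	double dkl = 0.0;
	for (size_t i = 0; i < p.size() && i < q.size(); i++)
		if (p[i] > 0 && q[i] > 0)
			dkl += p[i] * std::log(p[i] / q[i]);
	return dkl;
}

// Usage sketch: fill a vector<idx_dkl> with candidate indices and their
// dkl_sum values, then sort(v.begin(), v.end()) to put the highest-DKL
// candidates first.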

/*************************************************************************
// creates models allowing each fragment to have up to two parents
// (chooses the ones that give the largest difference in probability,
// as measured by the KL divergence, DKL). Parent fragments must appear
// before the current fragment in the order of the regional fragment set
*************************************************************************/
void DiscretePeakModel::learn_parent_dependencies(const FileManager& fm, int charge)
{
	FileSet fs;
	const vector< vector< vector< RegionalFragments > > >& regional_fragments = config.get_regional_fragment_sets();

	fs.select_files(fm,0,1E6,-1,-1,charge);


	// init regional models
	if (regional_models.size() < charge+1)
		regional_models.resize(charge+1);

	// This holds all the possible tables (all combinations of up to 2 parents)
	// for each fragment in each region
	vector< vector< vector< vector< FragProbTable > > > > all_tables; // size,region,frag,table

	int size_idx;
	regional_models[charge].resize(regional_fragments[charge].size());
	all_tables.resize(regional_fragments[charge].size());
	for (size_idx=0; size_idx<regional_fragments[charge].size(); size_idx++)
	{
		int region_idx;
		regional_models[charge][size_idx].resize(regional_fragments[charge][size_idx].size());
		all_tables[size_idx].resize(regional_fragments[charge][size_idx].size());

		cout << "SIZE " << size_idx << endl;

		for (region_idx=0; region_idx<regional_fragments[charge][size_idx].size(); region_idx++)
		{
			const vector<int>& frag_type_idxs = regional_fragments[charge][size_idx][region_idx].get_frag_type_idxs();
			regional_models[charge][size_idx][region_idx].frag_type_idxs = frag_type_idxs;
			regional_models[charge][size_idx][region_idx].independent_frag_tables.resize(frag_type_idxs.size());
			regional_models[charge][size_idx][region_idx].single_breakage_tables.resize(frag_type_idxs.size());

			all_tables[size_idx][region_idx].resize(frag_type_idxs.size());

			cout << "  REGION " << region_idx << endl;

			// create all possible tables for a given fragment (parents are set
			// according to their order in frag_type_idxs)
			int f;
			for (f=0; f<frag_type_idxs.size(); f++)
			{
				all_tables[size_idx][region_idx][f].clear();

				cout << "     FRAG " << config.get_fragment(frag_type_idxs[f]).label << endl;

				// first add table with no parents
				FragProbTable fpt;
				vector<int> fields,num_vals;
				fields.resize(5,-1);
				fields[0]=frag_type_idxs[f];
				num_vals.resize(5,0);
				num_vals[0]= num_peak_levels;

				fpt.init_fields(&config,fields,num_vals);
				fpt.init_counts();
				 
				all_tables[size_idx][region_idx][f].push_back(fpt);

				// add all tables with one parent
				if (f<1)
					continue;

				int p;
				for (p=0; p<f; p++)
				{
					FragProbTable fpt;
					vector<int> fields,num_vals;
					fields.resize(5,-1);
					fields[0]=frag_type_idxs[f];
					fields[1]=frag_type_idxs[p];
					num_vals.resize(5,0);
					num_vals[0]= num_peak_levels;
					num_vals[1] = num_peak_levels; // parent takes full peak levels, not just binary present/absent

					fpt.init_fields(&config,fields,num_vals);
					fpt.init_counts();
					 
					all_tables[size_idx][region_idx][f].push_back(fpt);
				}

				// add all tables with two parents
				// NOTE: the guard below is commented out, so this continue is
				// unconditional and the two-parent tables that follow are never built.
			//	if (f<2)
					continue;

				int p1,p2;
				for (p1=0; p1<f; p1++)
					for (p2=0; p2<f; p2++)
					{
						if (p1 == p2)
							continue;

						FragProbTable fpt;
						vector<int> fields,num_vals;
						fields.resize(5,-1);
						fields[0]=frag_type_idxs[f];
						fields[1]=frag_type_idxs[p1];
						fields[2]=frag_type_idxs[p2];
						num_vals.resize(5,0);
						num_vals[0]= num_peak_levels;
						num_vals[1] = num_peak_levels; // first parent takes full peak levels
						num_vals[2] = 2;               // second parent takes binary present/absent values

						if (p1>p2 && num_vals[1] == num_vals[2])
							continue;

						fpt.init_fields(&config,fields,num_vals);
						fpt.init_counts();
						 
						all_tables[size_idx][region_idx][f].push_back(fpt);
					}
			}
		}

	}


	// Add instances to all tables
	int counter=0;
	while(1)
	{
		AnnotatedSpectrum as;
		if (! fs.get_next_spectrum(fm,&config,&as))
			break;

	//	if (counter>=300)
	//		break;

		as.annotate_spectrum(as.get_true_mass_with_19());
		const int size_idx = as.get_size_idx();
		init_model_for_scoring_spectrum(&as);

		cout << counter++ << " " << as.get_file_name() << endl;
		

		int b;
		vector<Breakage>& breakages = as.get_non_const_breakages();
		for (b=0; b<breakages.size(); b++)
		{
			set_breakage_peak_levels(&breakages[b]);
			int region_idx = config.calc_region_idx(breakages[b].mass,
				as.get_true_mass_with_19(), as.get_charge(), as.get_min_peak_mass(),as.get_max_peak_mass());
			
			// add an instance to each table
			cout << region_idx;
			const vector<int>& frag_type_idxs = regional_models[charge][size_idx][region_idx].frag_type_idxs;
			int f,t;

			// only add instances if the expected peak position is within the min/max range
			for (f=0; f<frag_type_idxs.size(); f++)
			{
				mass_t exp_frag_mass = config.get_fragment(frag_type_idxs[f]).calc_expected_mass(breakages[b].mass,as.get_true_mass_with_19());
				if (exp_frag_mass < as.get_min_peak_mass() || exp_frag_mass > as.get_max_peak_mass())
					continue;
