📄 fragmentselection.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#include "FragmentSelection.h"


/*********************************************************************
Adds the counts for peaks around a breakage to their respective bins

**********************************************************************/
void add_offset_counts_arround_mass(vector<int>& counts, 
									Spectrum *spec,
									mass_t min_mass, 
									mass_t max_mass, 
									mass_t bin_coef, 
									mass_t break_mass,
									int charge)
{
	const mass_t low_range = (break_mass + min_mass+1)/(mass_t)charge;
	const mass_t high_range = (break_mass + max_mass-1)/(mass_t)charge;
	const PeakRange pr = spec->get_peaks_in_range(low_range, high_range);
	
	if (pr.num_peaks<=0)
		return;

	// add counts
	int p_idx;
	for (p_idx = pr.low_idx; p_idx<=pr.high_idx; p_idx++)
	{
		if (spec->get_peak_iso_level(p_idx)>0)
		{
			continue;
		}

		mass_t peak_mass = spec->get_peak_mass(p_idx);
		mass_t b_mass = break_mass/(mass_t)charge;
		int bin = (int)((peak_mass - b_mass - min_mass)*bin_coef);

		if (bin>4 && bin<counts.size()-5)
		{
			counts[bin]+=10;
			counts[bin-1]+=9;
			counts[bin+1]+=9;
			counts[bin-2]+=6;
			counts[bin+2]+=6;
			counts[bin-3]+=4;
			counts[bin+3]+=4;
			counts[bin-4]+=2;
			counts[bin+4]+=2;
		}
	}
}


/**************************************************************************
Selects the bins that have the highest counts. If a bin is chosen, then
the bins near it are removed from further selection.
***************************************************************************/
void select_fragments_from_bins(vector<int>& counts, 
								FragmentTypeSet& fts, 
								int max_num_frags, 
								int charge, 
								int orientation, 
								mass_t min_offset_mass, 
								mass_t bin_coef, 
								mass_t tolerance)
{
	// select top max_num_frags fragment bins
	int erase_bins_size = (int)((bin_coef * 0.75)/charge);
	int i;
	for (i=0; i<max_num_frags; i++)
	{
		int j;
		int max_count =0;
		int max_idx = -1;
		for (j=0; j<counts.size(); j++)
			if (counts[j]>max_count)
			{
				max_count = counts[j];
				max_idx = j;
			}

		if (max_idx<0)
			break;

		FragmentType ft;
		ft.charge=charge;
		ft.orientation = orientation;
		ft.offset = min_offset_mass + max_idx / bin_coef;
		ft.prob = max_count; // for now use the probability
		ft.make_frag_label(tolerance);

		cout << max_idx << "\t" << counts[max_idx] << "\t" << ft.label << endl;

		fts.add_fragment_type(ft);
	
		for (j=0; j<=erase_bins_size; j++)
		{
			counts[max_idx+j] = -1;
			counts[max_idx-j] = -1;
		}
	}
}


/***********************************************************************
Checks for each selected fragment what is the true count of fragments.
This is done by annotaing the highest count fragments first (and thus
removing fragments that are due to previous/next amino acid peaks).
This function also corrects the offset from the rounded-off value of
the bin to the average of the peak offsets.

If the selected fragment looks like an isotopic peak of a previously 
chosen fragment, it is removed from the list.
************************************************************************/
void calculate_true_fragment_probabilities(
							 const FileManager& fm, 
							 Config *config,
							 FragmentTypeSet& fts,  
							 float min_prob)
{
	const mass_t small_tolerance = config->get_tolerance() * 0.5;
	const int num_frags = fts.get_num_fragments();
	FileSet fs;
	vector< vector<double> > true_offsets;
	vector< vector<double> > total_breakages; // counts how many breakages are relevant for a charge and the fragment
	vector< vector<double> > charge_counts;
	vector< vector<int> > spec_counts;
	int max_charge = fm.get_max_charge();
	int c;

	total_breakages.resize(max_charge+1);
	spec_counts.resize(max_charge+1);
	charge_counts.resize(max_charge+1);
	true_offsets.resize(max_charge+1);
	for (c=0; c<=max_charge; c++)
	{
		charge_counts[c].resize(num_frags,0);
		total_breakages[c].resize(num_frags,0);
		spec_counts[c].resize(num_frags,0);
		true_offsets[c].resize(num_frags,0);
	}


	fts.sort_fragments_according_to_probs();
	fs.select_all_files(fm);

//	int f;
//	for (f=0; f<num_frags; f++)
//		fts.get_non_const_fragment(f).spec_count =0;

	while(1)
	{
		Spectrum s;
		vector<mass_t> break_masses;
		vector<bool> used_peaks;     // flag to indicate if peaks were already used by another fragment
		mass_t true_mass_with_19,true_mass;
		vector<int> spec_ind;

		if (! fs.get_next_spectrum(fm,config,&s))
			break;
		
		s.init_spectrum();
		s.get_peptide().calc_expected_breakage_masses(config,break_masses);
		const int charge = s.get_charge();

		true_mass=s.get_peptide().get_mass();
		true_mass_with_19 =  true_mass + MASS_OHHH;
		used_peaks.resize(s.get_num_peaks(),false);
		spec_ind.resize(num_frags,0);
	
		// loop on fragments first, so high count fragments get precedence over
		// low count fragments that are actually due to b/y ions of previous or
		// next amino acids
		int f;
		for (f=0; f<num_frags; f++)
		{
			const FragmentType& frag = fts.get_fragment(f);
			if (frag.charge>charge)
				continue;

			int nb=0;
			int b;
			for (b=1; b<break_masses.size()-1; b++)
			{
				const mass_t break_mass = break_masses[b];
				const mass_t exp_mass = frag.calc_expected_mass(break_mass,true_mass_with_19);
				if (exp_mass>2000)
					continue;

				if (exp_mass<s.get_peak(0).mass)
					continue;

				nb++;
				const int p_idx = s.get_max_inten_peak(exp_mass,small_tolerance);
				if (p_idx>=0 && ! used_peaks[p_idx] && s.get_peak_iso_level(p_idx)==0)
				{
					charge_counts[charge][f]++;
					used_peaks[p_idx]=true;
					mass_t base_mass = ((frag.orientation == PREFIX) ? break_mass : true_mass - break_mass);
					base_mass /= frag.charge;

					true_offsets[charge][f] += (s.get_peak_mass(p_idx)-base_mass);

					spec_ind[f]=1;
				}
			}
			total_breakages[charge][f]+= nb;
		}

		for (f=0; f<num_frags; f++)
			spec_counts[charge][f] += spec_ind[f];
	}

	int f;
	for (f=0; f<num_frags; f++)
	{
		FragmentType& frag = fts.get_non_const_fragment(f);

		// find the highest probability for this fragments (amongst all charges)
		frag.prob = -1;
		int c;
		for (c=0; c<=max_charge; c++)
		{
			if (charge_counts[c][f]>0)
			{
				float prob = charge_counts[c][f] / total_breakages[c][f];
				if (prob>frag.prob)
				{
					frag.prob = prob;
					frag.spec_count = spec_counts[c][f];
					frag.offset  = true_offsets[c][f] / charge_counts[c][f];
				}

			}
		}
		frag.make_frag_label(config->get_tolerance());
	}


	// remove fragments that are isotopic peaks of previously selected fragments
	// (e.g. p+2 which is basicly b+1), this doesn't affect -NH3 losses which
	// might appear as an isotope of -H2O

	fts.remove_isotopic_fragments(config->get_tolerance());

}






bool cmp( int a, int b ) { return a > b; }		

/***********************************************************************
Uses the offset count method to determine which fragments are most likely
to occur.
************************************************************************/
void select_frags_using_frag_offset_counts(const FileManager& fm, 
										   Config *config,
										   FragmentTypeSet& final_frags,
										   float min_frag_prob)
{
	FileSet fs;
	FragmentTypeSet fts;
	const mass_t min_offset_mass = -50;
	const mass_t max_offset_mass = 50;
	const mass_t tolerance = config->get_tolerance();
	const mass_t bin_size = tolerance * 0.1;
	const mass_t bin_coef = 1.0 / bin_size;
	const int count_size = (int)((max_offset_mass - min_offset_mass + 1) / bin_size);
	vector< vector<int> > prefix_counts, suffix_counts; // charge, bin_idx
	int i,c;


	fs.select_files(fm,0,10000,-1,-1,0);

	int max_charge = fm.get_max_charge();
	
	prefix_counts.resize(max_charge+1);
	suffix_counts.resize(max_charge+1);
	for (c=1; c<=max_charge; c++)
	{
		prefix_counts[c].resize(count_size,0);
		suffix_counts[c].resize(count_size,0);
	}

	
	int spectra_used=0;
	while(1)
	{
		Spectrum s;
		vector<mass_t> break_masses;
		mass_t true_mass;

		if (! fs.get_next_spectrum(fm,config,&s))
			break;

		spectra_used++;
		s.init_spectrum();
	//	cout << s.get_file_name() << endl;
	
		s.get_peptide().calc_expected_breakage_masses(config,break_masses);
		true_mass = s.get_peptide().get_mass();

		int b;
		for (b=1; b<break_masses.size()-1; b++)
		{
			int c;
			mass_t break_mass = break_masses[b];
			for (c=1; c<=max_charge; c++)
			{
				add_offset_counts_arround_mass(prefix_counts[c], &s,
					min_offset_mass, max_offset_mass, bin_coef, break_mass,c);

				add_offset_counts_arround_mass(suffix_counts[c], &s,
					min_offset_mass, max_offset_mass, bin_coef, (true_mass - break_mass),c);
			}
		}
	//	cout << endl;
	}



	cout << "Using: " << spectra_used << " spectra for offset counts..." << endl;

	// select 30 top fragments becasue many are likely to be caused by previous/next
	// amino acids and will be later removed
	for (c=0; c<=max_charge; c++)
	{
		select_fragments_from_bins(prefix_counts[c],fts,20,c,PREFIX,min_offset_mass,bin_coef,tolerance);
		select_fragments_from_bins(suffix_counts[c],fts,20,c,SUFFIX,min_offset_mass,bin_coef,tolerance);
	}


	cout << "BEFORE: " << fts.get_fragments().size() << endl;
	calculate_true_fragment_probabilities(fm,config,fts, min_frag_prob);
	cout << "AFTER:  " << fts.get_fragments().size() << endl;

	final_frags.clear_set();

	cout << "Fragments selected from spectra:" << endl;
	for (i=0; i<fts.get_num_fragments(); i++)
	{
		const FragmentType& frag = fts.get_fragment(i);
		if (frag.prob >= min_frag_prob)
		{
			final_frags.add_fragment_type(frag);
			cout << left << setw(3) << i << right << setw(5) << frag.spec_count << " ";
			cout << setw(6) << setprecision(3) << right << frag.prob << " ";
			frag.write_fragment(cout);
		}
	}
	cout << endl;
}

// returns the probablility of observing a the different types of
// fragments for different charges/sizes/regions
void collect_probs_of_known_fragments(const FileManager& fm, Config *config,
				vector< vector< vector<double> > >& frag_probs, // charge , size, region, frag_idx
				vector< vector< vector<double> > >& in_range_counts,
				vector< vector< vector<int> > >& spec_counts,
				double &avg_rand, int &num_files_used, int charge, bool verbose)
{
	bool very_verbose=false;
	FileSet fs;
	const vector < vector< vector< RegionalFragments> > >& regional_fragment_sets = 
														config->get_regional_fragment_sets();

	const vector<FragmentType>& all_fragments = config->get_all_fragments();

	vector<int> num_regions;

	int i;

	if (charge<1)
	{
		cout << "Error: charge must be > 1" << endl;
		exit(1);
	}

	fs.select_files(fm,0,5000,-1,-1,charge);

	num_files_used = fs.get_total_spectra();
	

	// resize the frag_probs
	frag_probs.resize(regional_fragment_sets[charge].size());
	in_range_counts.resize(regional_fragment_sets[charge].size());
	spec_counts.resize(regional_fragment_sets[charge].size());
	num_regions.resize(regional_fragment_sets[charge].size(),0);

	int j;
	for (j=0; j<regional_fragment_sets[charge].size(); j++)
	{
		int k;
		frag_probs[j].resize(regional_fragment_sets[charge][j].size());
		in_range_counts[j].resize(regional_fragment_sets[charge][j].size());
		spec_counts[j].resize(regional_fragment_sets[charge][j].size());
		num_regions[j] = regional_fragment_sets[charge][j].size();

		for (k=0; k<regional_fragment_sets[charge][j].size(); k++)
		{
			frag_probs[j][k].resize(all_fragments.size(),0);
			in_range_counts[j][k].resize(all_fragments.size(),0);
			spec_counts[j][k].resize(all_fragments.size(),0);

//				cout << "SIZE " << frag_probs[j][k].size() << endl;
		}
	}
	


	// fill in the stats
	avg_rand=0;
	for (i=0; i<fs.get_total_spectra(); i++)
	{
		int j;
		AnnotatedSpectrum as;
		fs.get_next_spectrum(fm,config,&as);

		if (very_verbose)
		{
			cout << i << "  " << as.get_file_name();
		}
		else if (verbose && i>0 && i % 1000 == 0)
		{
			cout << i << "/" << fs.get_total_spectra() << endl;
		}


		as.annotate_spectrum(as.get_true_mass_with_19());

	//	as.print_annotated();
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -