⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 model.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 2 页
字号:


	if (end_stage<=6)
	{
		write_model();
		exit(0);
	}


	write_model();
	exit(0);
}


/*********************************************************************
// this function performs the entire training process of the model
**********************************************************************/
void Model::full_train_model(const char *name, const FileManager& fm, 
							 mass_t initial_tolerance)
{
	config.set_tolerances(initial_tolerance);

	model_name = name;
	config.set_model_name(string(name));

	int charge;
	for (charge = fm.get_min_charge(); charge<= fm.get_max_charge(); charge++)
	{
		vector<mass_t> spectra_masses;
		FileSet fs;
		fs.select_all_files(fm);
		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
		int i;
		for (i=0; i<all_ssf.size(); i++)
			if (all_ssf[i]->charge == charge)
				spectra_masses.push_back(all_ssf[i]->org_pm_with_19);

		config.set_size_thresholds_according_to_set_of_masses(charge,spectra_masses);
	}


	cout << "Selecting fragments ..." << endl;
	select_fragments(name,fm,15,0.01);


	// find the tolerance and precursor mass tolerance

	cout << "Calculating precursor mass tolerance..." << endl;

	float pm_tol = calc_parent_mass_tolerance_distribution(this, fm, 0.95);

	cout << "Calculating fragment mass tolerance..." << endl;

	float tol    = calc_tolerance_distribution(this, fm , initial_tolerance*1.2,0.96);


	config.set_pm_tolerance(pm_tol);

	if (pm_tol <0.000001)
	{
		pm_tol = tol;
	}

	if (pm_tol<tol)
	{
		config.set_tolerance(tol+pm_tol);
	}
	else
		config.set_tolerance(tol);

	if (config.get_tolerance()<0.03)
	{
		config.set_terminal_score(12.0);
		config.set_max_edge_length(3);
	}
	else if (config.get_tolerance()<0.1)
	{
		config.set_terminal_score(8.0);
		config.set_max_edge_length(2);
	}
	else
	{
		config.set_terminal_score(5.0);
		config.set_max_edge_length(2);
	}

	write_model();

	init_score_model();

	train_score_model(name,fm,0);

	write_model();
}



/******************************************************************************
	This model selects the fragment types that will take part in the models.
	The fragments are selected according to the offset frequency function.
	If there isn't a suffcient number of spectra from the desired charge,
	the fragment selection is skipped.
*******************************************************************************/
bool Model::select_fragments(const char *name, 
							 const FileManager& fm, 
							 int   max_num_frags, 
							 float min_prob)
{
	FragmentTypeSet fragment_types;

	int c;
	cout << "Training set consists of:" << endl;
	for (c=fm.get_min_charge(); c<=fm.get_max_charge(); c++)
		cout << "Charge " << c <<"  " << fm.get_num_spectra(c) << " spectra."<< endl;
	cout<<endl;

	// select potential fragment type using the fragment offset test
	select_frags_using_frag_offset_counts(fm,&config, fragment_types, min_prob);

	// add these fragments to the existing set
	config.add_fragment_types(fragment_types);

	for (c=1; c<=fm.get_max_charge(); c++)
	{
	
		// check that there is a minimal number of files...
		int num_charge_spectra = fm.get_num_spectra(c);

		if (num_charge_spectra<MINIMAL_NUMBER_SPECTRA_FOR_FRAGMENT_SELECTION)
			continue;

		config.init_regional_fragment_set_defaults(0,c);

		select_regional_fragments(fm,&config,c,true);

		config.select_fragments_in_sets(1.5,max_num_frags);

		// select strong, combos...
		int max_num_combos = max_num_frags > 0 ? max_num_frags : 2;
		if (max_num_combos>3)
			max_num_combos = 3;
		
		config.select_strong_fragments(c,0.5,3);

		select_frag_combos(fm,&config,c,max_num_combos);
	}

	string fragments_file = config.get_resource_dir() + "/" + string(name) + "_fragments.txt";
	ofstream os(fragments_file.c_str(),ios::out);
	config.print_fragments(os);
	config.set_fragments_file(fragments_file);
	os.close();

	string regional_fragment_sets_file = config.get_resource_dir() + "/" + string(name) + "_fragment_sets.txt";
	os.open(regional_fragment_sets_file.c_str(),ios::out);
	config.print_regional_fragment_sets(os);
	config.set_regional_fragment_sets_file(regional_fragment_sets_file);
	os.close();

	return true;
}




// determines the tolerance for which *cuttoff_prob* of the abundant fragments
// are caught
mass_t calc_tolerance_distribution(Model *model, 
								   const FileManager& fm, 
								   mass_t max_tolerance,
								   float cutoff_prob)
{
	FileSet fs;
	Config *config = model->get_config();
	FragmentTypeSet frags;
	vector<string> file_list;

	fs.select_all_files(fm);

	vector<int> test_frag_idxs;
	test_frag_idxs.clear();
	
	if (config->get_strong_type1_idx()>=0)
		test_frag_idxs.push_back(config->get_strong_type1_idx());

	if (config->get_strong_type2_idx()>=0)
		test_frag_idxs.push_back(config->get_strong_type2_idx());

	if (test_frag_idxs.size()==0)
	{
		cout << endl <<"Warning: no strong fragments selected, using maximal tolerance!!!" << endl << endl;
		return max_tolerance;
	}

	mass_t tol_increment = max_tolerance * 0.05;
	vector<float> offset_bins;
	vector<float> offsets;

	offset_bins.resize(41,0);

	int total_frag_count=0;
	while(1)
	{
		Spectrum s;
		vector<mass_t> break_masses;
		mass_t true_mass_with_19,true_mass;
	
		if (! fs.get_next_spectrum(fm,config,&s))
			break;
		
		s.init_spectrum();
		s.get_peptide().calc_expected_breakage_masses(config,break_masses);
		true_mass=s.get_peptide().get_mass();
		true_mass_with_19 =  true_mass + MASS_OHHH;

		if (break_masses.size()<3)
			continue;		
	
		// loop on fragments first, so high count fragments get precedence over
		// low count fragments that are actually due to b/y ions of previous or
		// next amino acids
		int f;
		for (f=0; f<test_frag_idxs.size(); f++)
		{
			const FragmentType& frag = config->get_fragment(test_frag_idxs[f]);
			int b;

			
			for (b=1; b<break_masses.size()-1; b++)
			{
				mass_t break_mass = break_masses[b];

				const mass_t exp_mass = frag.calc_expected_mass(break_mass,true_mass_with_19);
				const int p_idx = s.get_max_inten_peak(exp_mass,max_tolerance);

				if (p_idx>=0)
				{
					
					total_frag_count++;
					mass_t offset =  s.get_peak_mass(p_idx) - exp_mass;

					int bin_idx = 20 + (int)((offset / max_tolerance)*20);
					if (bin_idx<0)
						bin_idx=0;
					if (bin_idx>40)
						bin_idx=40;

					offset_bins[bin_idx]++;
					offsets.push_back(offset);
				}
			}
		}
	}

	int i;
	cout << "bin histogram: " << endl;
	for (i=0; i<=40; i++)
		cout << setprecision(4) << (20-i)*tol_increment << " " << 
			    offset_bins[i]/total_frag_count << endl;

	// find the offset that keeps the desired proportion of fragments
	sort(offsets.begin(),offsets.end());
	int count=0;
	int target_count = (int)((1.0 - cutoff_prob)*total_frag_count);
	int left_idx=0;
	int right_idx=offsets.size()-1;
	mass_t cutoff_offset=-1;
	while (count<target_count)
	{
		if (fabs(offsets[left_idx])>offsets[right_idx])
		{
			left_idx++;
		}
		else
			right_idx--;

		if (++count == target_count)
		{
			if (fabs(offsets[left_idx])>fabs(offsets[right_idx]))
			{
				cutoff_offset = fabs(offsets[left_idx]); 
			}
			else
				cutoff_offset = fabs(offsets[right_idx]);

			break;
		}
	}

	cout << "offset for " << cutoff_prob << " is " << cutoff_offset << endl;
	return cutoff_offset;
}


// determines the parent mass tolerance for which *cuttoff_prob* of the abundant fragments
// are caught
mass_t calc_parent_mass_tolerance_distribution(Model *model,  const FileManager& fm, 
											   float cutoff_prob)
{
	FileSet fs;
	Config *config = model->get_config();
	FragmentTypeSet frags;
	
	fs.select_all_files(fm);
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();

	vector<float> offsets;

	int total_frag_count=0;
	int i;
	for (i=0; i<all_ssf.size(); i++)
	{
		SingleSpectrumFile *ssf = all_ssf[i];
		if (ssf->peptide.get_num_aas()<3)
			continue;

		total_frag_count++;

		ssf->pm_with_19 = ssf->charge * ssf->m_over_z - MASS_PROTON * (ssf->charge -1 );
		mass_t offset =  ssf->pm_with_19  - ssf->peptide.get_mass_with_19();
	//	cout << setprecision(3) << offset << " " << ssf->charge << " " << ssf->peptide.as_string(config) << endl;
		offsets.push_back(offset);
	}



	// find the offset that keeps the desired proportion of fragments
	sort(offsets.begin(),offsets.end());
	int count=0;
	int target_count = (int)((1.0 - cutoff_prob)*total_frag_count);
	int left_idx=0;
	int right_idx=offsets.size()-1;
	mass_t cutoff_offset=-1;
	while (count<target_count)
	{
		if (fabs(offsets[left_idx])>offsets[right_idx])
		{
			left_idx++;
		}
		else
			right_idx--;

		if (++count == target_count)
		{
			if (fabs(offsets[left_idx])>fabs(offsets[right_idx]))
			{
				cutoff_offset = fabs(offsets[left_idx]); 
			}
			else
				cutoff_offset = fabs(offsets[right_idx]);

			break;
		}
	}

	cout << "Parent mass offset for " << setprecision(4) << cutoff_prob << " is " << cutoff_offset << endl;
	return cutoff_offset;
}







⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -