⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pmc_rank.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 4 页
字号:
#include "PMCSQS.h"
#include "auxfun.h"


bool PMCSQS_Scorer::read_pmc_rank_models(Config *_config, char *file_name)
{
	config = _config;

	string path = config->get_resource_dir() + "/" + file_name;
	ifstream in_stream(path.c_str(),ios::in);
	if (! in_stream.good())
	{
		cout << "Warning: couldn't open pmc rank model for reading: " << path << endl;
		return false;
	}


	char buff[512];
	int num_charges=-1;

	in_stream.getline(buff,256);
	istringstream iss1(buff);

	frag_pair_sum_offset=NEG_INF;
	bin_increment=NEG_INF;
	iss1 >> bin_increment >> this->frag_pair_sum_offset;
	if (frag_pair_sum_offset==NEG_INF || bin_increment == NEG_INF)
	{
		cout << "Error in pmc model file!" << endl;
		exit(1);
	}

	in_stream.getline(buff,256);
	istringstream iss(buff);

	iss >> num_charges;
	max_model_charge=num_charges-1;
	
	pmc_rank_models.resize(num_charges);
	pmc_rank_mass_thresholds.resize(num_charges);
	pmc_charge_mz_biases.resize(num_charges);


	int i;
	for (i=0; i<num_charges; i++)
	{
		in_stream.getline(buff,256);
		istringstream iss(buff);
		int num_threshes=0;
		iss >> num_threshes;
		
		pmc_rank_mass_thresholds[i].resize(num_threshes,NEG_INF);
		int j;
		for (j=0; j<num_threshes; j++)
			iss >> pmc_rank_mass_thresholds[i][j];
	}

	for (i=0; i<num_charges; i++)
	{
		in_stream.getline(buff,256);
		istringstream iss(buff);
		int num_biases=0;
		iss >> num_biases;
		
		pmc_charge_mz_biases[i].resize(num_biases,NEG_INF);
		int j;
		for (j=0; j<num_biases; j++)
			iss >> pmc_charge_mz_biases[i][j];
	}
	
	// read Boost models
	for (i=0; i<num_charges; i++)
	{
		in_stream.getline(buff,256);
		istringstream iss(buff);

		int num_models=-1;
		iss >> num_models;

		if (num_models<0)
		{
			cout << "Error: bad parsing of PMCR model file!" << endl;
			exit(0);
		}
		pmc_rank_models[i].resize(num_models,NULL);

		int j;
		for (j=0; j<num_models; j++)
		{
			pmc_rank_models[i][j]=new RankBoostModel;
			pmc_rank_models[i][j]->read_rankboost_model(in_stream);
		}
		
	}
	in_stream.close();

//	this->write_pmc_rank_models("XXX.txt");

	this->ind_initialized_pmcr = true;
	return true;
}


void PMCSQS_Scorer::write_pmc_rank_models(const char *path) const
{
	ofstream out_stream(path,ios::out);
	if (! out_stream.good())
	{
		cout << "Error: couldn't open pmc model for writing: " << path << endl;
		exit(1);
	}

	out_stream << this->bin_increment << " " << this->frag_pair_sum_offset << endl;
	out_stream << this->pmc_rank_mass_thresholds.size() << endl;
	out_stream << setprecision(3) << fixed;

	int i;
	for (i=0; i<this->pmc_rank_mass_thresholds.size(); i++)
	{
		out_stream << pmc_rank_mass_thresholds[i].size();
		int j;
		for (j=0; j<pmc_rank_mass_thresholds[i].size(); j++)
			out_stream << " " << pmc_rank_mass_thresholds[i][j];
		out_stream << endl;
	}

	
	for (i=0; i<this->pmc_charge_mz_biases.size(); i++)
	{
		out_stream << pmc_charge_mz_biases[i].size();
		int j;
		for (j=0; j<pmc_charge_mz_biases[i].size(); j++)
			out_stream << " " << pmc_charge_mz_biases[i][j];
		out_stream << endl;
	}

	// write RankBoost models
	for (i=0; i<pmc_rank_models.size(); i++)
	{
		int j;
		
		if (pmc_rank_models[i].size()==1 && ! pmc_rank_models[i][0])
		{
			out_stream << 0 << endl;
			continue;
		}

		out_stream << pmc_rank_models[i].size() << endl;
		for (j=0; j<pmc_rank_models[i].size(); j++)
		{
			if (pmc_rank_models[i][j])
			{
				pmc_rank_models[i][j]->write_rankboost_model(out_stream,true);
			}
			else
			{
				cout << "Error: non intialized rank pmc model!" << endl;
				exit(1);
			}
		}
	}
	
	out_stream.close();
}


void PMCSQS_Scorer::set_pmc_mass_thresholds(int option)
{
	if (option==0)
	{
		pmc_rank_mass_thresholds = config->get_size_thresholds();
		int i;
		for (i=0; i<pmc_rank_mass_thresholds.size(); i++)
		{
			if (pmc_rank_mass_thresholds[i].size()>0)
			{
				if (pmc_rank_mass_thresholds[i][pmc_rank_mass_thresholds[i].size()-1]>10000)
					pmc_rank_mass_thresholds[i].pop_back();
			}
		}
	}

	if (option==1)
	{
		pmc_rank_mass_thresholds.resize(4);
		pmc_rank_mass_thresholds[1].push_back(1150.0);
		pmc_rank_mass_thresholds[1].push_back(1400.0);
 
		pmc_rank_mass_thresholds[2].push_back(1100.0);
		pmc_rank_mass_thresholds[2].push_back(1300.0);
		pmc_rank_mass_thresholds[2].push_back(1600.0);
		pmc_rank_mass_thresholds[2].push_back(1900.0);
		pmc_rank_mass_thresholds[2].push_back(2400.0);

		pmc_rank_mass_thresholds[3].push_back(1950.0);
		pmc_rank_mass_thresholds[3].push_back(2450.0);
		pmc_rank_mass_thresholds[3].push_back(3000.0);
	}

	if (option==2)
	{
		pmc_rank_mass_thresholds.resize(4);
		
		pmc_rank_mass_thresholds[2].push_back(1300.0);
	
		pmc_rank_mass_thresholds[2].push_back(1900.0);
	}
}



void convert_ME_to_RankBoostSample(const ME_Regression_Sample& me,
								   RankBoostSample& rbs)
{
	rbs.clear();
	int i;
	for (i=0; i<me.f_vals.size(); i++)
		rbs.add_real_feature(me.f_vals[i].f_idx,me.f_vals[i].val);
}


/******************************************************************************
Train PMC models from positive example files
*******************************************************************************/
void PMCSQS_Scorer::train_pmc_rank_models(Config *config, const FileManager& fm, 
										  int sel_charge, bool overwrite)
{	
	const bool sample_diagnostic = false;
	const vector<int>& spectra_counts = fm.get_spectra_counts();
	
	max_model_charge=0;

	int charge;
	for (charge=1; charge<spectra_counts.size(); charge++)
	{
		if (spectra_counts[charge]>=MIN_SPECTRA_FOR_PMCSQS_MODEL)
			max_model_charge=charge;
	}

	const int max_to_read_per_file = 40000;
	
	vector<string> real_names;
	init_PMC_feature_names(real_names);


	// try and read existing pmc model, otherwise init a new one
	string pmc_path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
	ifstream model_stream(pmc_path.c_str());
	if (model_stream.is_open() && model_stream.good())
	{
		model_stream.close();
		string pmcr_name = config->get_model_name() + "_PMCR.txt";
		const char *path = pmc_path.c_str();
		this->read_pmc_rank_models(config,(char *)pmcr_name.c_str());
	}
	else
	{
		set_pmc_mass_thresholds();
	
		this->set_frag_pair_sum_offset(MASS_PROTON); // b+y - PM+19
		this->set_bin_increment(0.1);
		pmc_rank_models.resize(pmc_rank_mass_thresholds.size());
		pmc_charge_mz_biases.resize(pmc_rank_mass_thresholds.size());
	}
	
	const double prop_train = 0.5;


	// It is assumed that the mass thresholds were set according to the training data
	// (this is done manually with values encoded in the set_mass_threhsolds function)
	for (charge=1; charge<=max_model_charge; charge++)
	{
		if (sel_charge>0 && charge != sel_charge)
			continue;

		const int num_sizes = pmc_rank_mass_thresholds[charge].size();
		pmc_rank_models[charge].resize(num_sizes+1,NULL);
		pmc_charge_mz_biases[charge].resize(num_sizes+1,0);

		
		int size_idx;
		for (size_idx=0; size_idx<=num_sizes; size_idx++)
		{
			if (pmc_rank_models[charge][size_idx] && ! overwrite)
				continue;

			vector<SingleSpectrumFile *> test_ssfs;
			BasicSpecReader bsr;
			static QCPeak peaks[5000];
			RankBoostDataset train_ds, test_ds, pos_ds, neg_ds;

			mass_t min_mass =0;
			mass_t max_mass = POS_INF;

			if (size_idx>0)
				min_mass = pmc_rank_mass_thresholds[charge][size_idx-1];

			if (size_idx<num_sizes)
				max_mass = pmc_rank_mass_thresholds[charge][size_idx];

			// these ranges are given according to pm_with_19
			// so files should be selected through select_files and not
			// select_file_in_mz_range
			FileSet fs;		
			fs.select_files(fm,min_mass,max_mass,-1,-1,charge);

			if (fs.get_total_spectra()<500)
				continue;

			
			int num_groups_in_train=0;
			int num_groups_in_test=0;

			cout << "TRAINING charge " << charge << " size " << size_idx << "  (" <<
				min_mass << "-" << max_mass << ")" << endl;

			fs.randomly_reduce_ssfs(max_to_read_per_file);
			const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
			const int num_samples = all_ssf.size();
			
			// first find the bias in number of bins between the true m/z bin and
			// the optimal m/z bin
			vector<bool> skipped_idxs;
			skipped_idxs.resize(num_samples,false);
			int skipped_bad_mz=0;
			mass_t total_bias=0;
			int i;
			for (i=0; i<num_samples; i++)
			{
				SingleSpectrumFile* ssf = all_ssf[i];
				BasicSpectrum bs;
			
				bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
				bs.peaks = peaks;
				bs.ssf = ssf;

				ssf->peptide.calc_mass(config);
				
				const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;

				if (fabs(true_mz - bs.ssf->m_over_z)>2.5)
				{
					//cout << setprecision(2) << true_mz << " <---> " << bs.ssf->m_over_z << " skipping" << endl;
					skipped_bad_mz++;
					skipped_idxs[i]=true;
					continue;
				} 

				init_for_current_spec(config,bs);
				calculate_curr_spec_pmc_values(bs, bin_increment);

				// find the true_mz_bin_idx
				
				const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
				int true_mz_bin_idx=0;
				while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
					true_mz_bin_idx++;

				if (true_mz_bin_idx == pmc_stats.size())
					true_mz_bin_idx--;

				if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
					true_mz_bin_idx--;

				int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);

				if (opt_bin_idx <=0 || opt_bin_idx == pmc_stats.size()-1)
				{
					skipped_bad_mz++;
					skipped_idxs[i]=true;
					continue;
				}

				total_bias += (pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z);

				if (fabs(pmc_stats[opt_bin_idx].m_over_z - pmc_stats[true_mz_bin_idx].m_over_z)>4.0)
				{
					cout << "opt bin: " << opt_bin_idx << " (" << pmc_stats[opt_bin_idx].m_over_z << ")  ";
					cout << "tru bin: " << true_mz_bin_idx << " ("<< pmc_stats[true_mz_bin_idx].m_over_z << ")" << endl;
				}
			} 

			mass_t mz_bias = total_bias / (mass_t)(num_samples-skipped_bad_mz);
			pmc_charge_mz_biases[charge][size_idx]=mz_bias;

			cout << "m/z bias: " << setprecision(4) << mz_bias << endl;
			cout << "skipped " << skipped_bad_mz << "/" << num_samples <<
				"  because of m/z more than 2.5 away from observed..." << endl; 

		//	pmc_charge_mz_biases[charge][size_idx] = 0;

			for (i=0; i<num_samples; i++)
			{
				if (skipped_idxs[i])
					continue;

				SingleSpectrumFile* ssf = all_ssf[i];
				BasicSpectrum bs;
			
				bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
				bs.peaks = peaks;
				bs.ssf = ssf;
				const mass_t true_mz = (ssf->peptide.get_mass()+MASS_H2O+(mass_t)charge)/(mass_t)charge;

				init_for_current_spec(config,bs);
				calculate_curr_spec_pmc_values(bs, bin_increment);

				// find the true_mz_bin_idx
				
				const vector<PMCRankStats>& pmc_stats = curr_spec_rank_pmc_tables[charge];
				int true_mz_bin_idx=0;
				while (true_mz_bin_idx<pmc_stats.size() && pmc_stats[true_mz_bin_idx].m_over_z<true_mz)
					true_mz_bin_idx++;

				if (true_mz_bin_idx == pmc_stats.size())
					true_mz_bin_idx--;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -