📄 denovoranktrain.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
		exit(1);
	}

	if (peak_model->get_feature_set_type() <= 2)
	{
		cout << "Error: training function intended for combined peak model!" << endl;
		exit(1);
	}

	vector<bool>   file_indicators;
	PeptideSetMap   psm;

	if (peptide_strings)
		peptide_strings->clear();

	double start_time = time(NULL);

	// read sample peptide sequences
	create_complete_dbh_set_map(config,mgf_list,db_dir,correct_dir,
		charge,size_idx,psm, file_indicators);

	// Read previous rerank path
	DeNovoRankScorer previous_rerank_model;
	if (rerank_path)
	{

		cout << "Read previous rerank model, type " << previous_rerank_model.get_model_type() << endl;
		cout << "Option not supported anymore!" << endl;
		exit(1);
	}

	const float ratio_db = 1.0 - ratio_denovo;
	const int num_db_pairs     = (int)(max_num_pairs * ratio_db);
	const int num_denovo_pairs = (int)(max_num_pairs * ratio_denovo);

	// read spectra
	FileManager fm;
	FileSet	    fs;
	BasicSpecReader bsr;
	QCPeak		peaks[2000];

	fm.init_from_list_file(model->get_config(),mgf_list.c_str(),file_indicators);
	fs.select_all_files(fm);

	const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();

	int i;
	int num_spectra_read=0;

	cout << endl << "Read " << all_ssfs.size() << " headers" << endl << endl;
	cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;
	cout << "Using following proportions for training data:" << endl;
	cout << ratio_denovo   << " pairs of correct, incorrect full dnv (same spectrum)" << endl;
	cout << ratio_db	   << " pairs of correct, incorrect db hits (same spectrum)" << endl;
	
	// first select most abundant fragments, use first 1000 ssfs for statistics
	vector<int> frag_counts;
	frag_counts.resize(config->get_all_fragments().size(),0);
	for (i=0; i<all_ssfs.size() && i<1000; i++)
	{
		MGF_single *ssf = (MGF_single *)all_ssfs[i];
		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		BasicSpectrum     bs;
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;

		vector< vector<float> > ann_intens;
		vector< vector<mass_t> > ann_masses;
		AnnotatedSpectrum as;
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		as.set_peptide(ssf->peptide);
		as.annotate_spectrum(ssf->peptide.get_mass_with_19(),true);
		as.extract_annotated_intens_and_masses(ann_intens,ann_masses);

		int f;
		for (f=0; f<ann_intens.size(); f++)
		{
			int j;
			for (j=0; j<ann_intens[f].size(); j++)
				if (ann_intens[f][j]>0)
					frag_counts[f]++;
		}
	}


	vector<int> ppp_frag_type_idxs;
	ppp_frag_type_idxs.clear();

	cout << "Using a combined peak model" << endl;
	cout << "Selecting peaks of the paritally mobile model" << endl;

	int mobility;
	for (mobility=MOBILE+1; mobility<NONMOBILE; mobility++)
	{
		const vector<int>& frag_idxs = peak_model->get_model_ptr(charge,size_idx,mobility)->get_fragment_type_idxs();
		int f;
		for (f=0; f<frag_idxs.size(); f++)
		{
			if (ppp_frag_type_idxs.size() == max_num_ppp_frags)
				break;

			int j;
			for (j=0; j<ppp_frag_type_idxs.size(); j++)
				if (ppp_frag_type_idxs[j] == frag_idxs[f])
					break;
			if (j==ppp_frag_type_idxs.size())
				ppp_frag_type_idxs.push_back(frag_idxs[f]);
		}
	}
	int f;
	for (f=0; f<ppp_frag_type_idxs.size(); f++)
		cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
	cout << endl;
	

	sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());

	DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
	part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
		
	int num_groups_in_train=0;
	int num_groups_in_test=0;
	int num_train_pairs=0;

	ofstream test_scan_stream;
	if (test_scan_file)
	{
		test_scan_stream.open(test_scan_file);
		if (! test_scan_stream.is_open())
		{
			cout << "Error: couldn't open test stream!" << endl;
			exit(1);
		}
	}
	
	int set_counter=0;
	for (i=0; i<all_ssfs.size(); i++)
	{
		PeptideSetMap::const_iterator it;
		MGF_single *ssf = (MGF_single *)all_ssfs[i];
		scan_pair key(ssf->file_idx,ssf->idx_in_file);

		it = psm.find(key);
		if (it == psm.end())
			continue;

		set_counter++;
	}

	cout << set_counter << " sets available." << endl;
	int num_db_peptides_per_set = (int)(0.5 + (float)num_db_pairs/set_counter);
	if (num_db_peptides_per_set>4)
		num_db_peptides_per_set=4;
	if (num_db_peptides_per_set<1)
		num_db_peptides_per_set=1;

	int db_pairs = num_db_peptides_per_set * set_counter;

	int num_denovo_peptides_per_set = (int)(0.5 + (float)(max_num_pairs-db_pairs)/set_counter);
	if (num_denovo_peptides_per_set<1)
		num_denovo_peptides_per_set=1;
	if (num_denovo_peptides_per_set>25)
		num_denovo_peptides_per_set=25;

	cout << "NUM DB PER SET     : " << num_db_peptides_per_set << endl;
	cout << "NUM DE NOVO PER SET: " << num_denovo_peptides_per_set << endl;


	static PrmGraph *prm_ptr=NULL;
	static vector<PrmGraph *> prm_ptrs;
	static vector<SeqPath>    seqpath_solutions;

	int num_rand=0;
	int num_first=0;

	// Generate various types of samples from spectra
	int ssf_idx;
	for (ssf_idx=0; ssf_idx<all_ssfs.size(); ssf_idx++)
	{
		MGF_single *ssf = (MGF_single *)all_ssfs[ssf_idx];
		const mass_t true_mass_with_19 = ssf->peptide.get_mass_with_19();
		const Peptide& correct_peptide = ssf->peptide;
		PeptideSetMap::const_iterator it;
		
		scan_pair key(ssf->file_idx,ssf->idx_in_file);

	//	if (num_groups_in_train>800)
	//		break;

		it = psm.find(key);
		if (it == psm.end())
			continue;

		const vector<int>& aas = ssf->peptide.get_amino_acids();
		int a;
		for (a=0; a<aas.size(); a++)
			if (aas[a]>Val)
				break;
		if (a<aas.size())
			continue;

		const PeptideSet& set = (*it).second;
		BasicSpectrum     bs;
		AnnotatedSpectrum as;
		vector<PmcSqsChargeRes> pmc_sqs_res;

		if (fabs(set.correct_sol.pep.get_mass_with_19() - ssf->peptide.get_mass_with_19())>0.001)
		{
			cout << "Error: mismatch between set peptide and peptide in file:" << endl;
			cout << "Set : " << set.correct_sol.pep.as_string(config) << "\t" << set.correct_sol.pep.get_mass_with_19() <<endl;
			cout << "Spec: " << ssf->peptide.as_string(config) << "\t" << ssf->peptide.get_mass_with_19() << endl;
		}
		
		int charge1=0,charge2=0;
		mass_t mz1=0,mz2=0;
		float prob1=0,prob2=0;

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;

		// calc corrected pm_with_19, if it is good use it, otherwise, use a value with +- U[0,toleance/2]
		model->get_best_mz_charge(config,bs,&mz1,&charge1,&prob1,&mz2,&charge2,&prob2,&pmc_sqs_res);
		mass_t pm_with_19=NEG_INF;
		bool good_first = true;
		const mass_t corr1_pm_with_19 = mz1*charge1 - MASS_PROTON*(charge1-1);
		const mass_t corr2_pm_with_19 = mz2*charge2 - MASS_PROTON*(charge2-1);
		if (fabs(corr2_pm_with_19-true_mass_with_19)<tolerance_diff)
		{
			pm_with_19 = corr2_pm_with_19;
			good_first=false;
		}

		if (fabs(corr1_pm_with_19-true_mass_with_19)<tolerance_diff)
		{
			pm_with_19 = corr1_pm_with_19;
		}
		else
			good_first=false;
		
		bool bad_pm=false;
	
		if (pm_with_19<0) // use a random value
		{
			double r=my_random();
			mass_t offset = r*r*tolerance_diff;
			if (my_random()<0.5)
				offset *= -1;
			pm_with_19 = true_mass_with_19 + offset;
			bad_pm=true;
		}


		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		as.set_corrected_pm_with_19(pm_with_19);
	
		const bool add_to_train = (my_random() <= train_ratio);
		RankBoostDataset& rds = (add_to_train ? train_ds : test_ds);

		if (test_scan_file && ! add_to_train)
			test_scan_stream << ssf->file_idx << " " << ssf->idx_in_file << " " <<
			ssf->peptide.get_num_aas() << endl;
	
		bool first=true;
		int corr_sam_idx=-1;
		int group_idx=-1;

		if (add_to_train)
		{
			group_idx = num_groups_in_train++;
		}
		else
		{
			group_idx = num_groups_in_test++;
		}

		RankBoostSample corr_rbs;
		fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
		corr_sam_idx = rds.get_num_samples();
		corr_rbs.group_idx=group_idx;
		corr_rbs.tag1=NEG_INF;
		corr_rbs.float_tag1 = set.correct_sol.pm_with_19;

		if (peptide_strings && add_to_train)
		{
			corr_rbs.tag1 = peptide_strings->size();
			peptide_strings->push_back(set.correct_sol.pep.as_string(config));
		}

		corr_rbs.rank_in_group=0;
		rds.add_sample(corr_rbs);

		if (print_sams)
		{
			float score;
			corr_rbs.get_feature_val(117,&score);
			cout << endl << ssf_idx << "\t"<< set.correct_sol.pep.as_string(config) << "\t" << score << endl;
		}

		// add db samples
		int j;
		for (j=0; j<set.incorrect_sols.size() && j<num_db_peptides_per_set; j++)
		{
			const int type = set.incorrect_sols[j].type;
			if (type != 1)
				break;
			
			if (are_same_upto_NDILQK(set.correct_sol.pep,set.incorrect_sols[j].pep))
				continue;

			RankBoostSample bad_rbs;
			fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res, 
									bad_rbs, size_idx);

			int bad_sam_idx=rds.get_num_samples();

			bad_rbs.group_idx = group_idx;
			bad_rbs.rank_in_group=(j == 0 ? 1 : 2);

			if (peptide_strings && add_to_train)
			{		
				bad_rbs.tag1=peptide_strings->size();	
				peptide_strings->push_back(set.incorrect_sols[j].pep.as_string(config));
			}

			bad_rbs.tag2=set.incorrect_sols[j].type;
			bad_rbs.tag3=set.incorrect_sols[j].num_correct_aas;
			bad_rbs.float_tag1 = set.correct_sol.pm_with_19;

			rds.add_sample(bad_rbs);
			rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[j].type); 
			if (add_to_train)
				num_train_pairs++;

			if (print_sams)
			{
				float score;
				bad_rbs.get_feature_val(117,&score);
				cout << j << "\t" <<  
					set.incorrect_sols[j].pep.as_string(config) << "\t" << score << endl;
			}
		}
	
		// add denovo samples (if the rerank model exists, rerank them)
		if (num_denovo_peptides_per_set>0)
		{
			vector<mass_t> pms_with_19;
			vector<int> charges;
			pms_with_19.push_back(pm_with_19);
			charges.push_back(charge);

		/*	if (my_random()<0.75)
			{
				pms_with_19.push_back(pm_with_19-MASS_PROTON);
				charges.push_back(charge);
			}

			if (my_random()<0.5)
			{
				pms_with_19.push_back(pm_with_19-2*MASS_PROTON);
				charges.push_back(charge);
			}

			if (my_random()<0.15)
			{
				pms_with_19.push_back(pm_with_19+MASS_PROTON);
				charges.push_back(charge);
			}

			if (my_random()<0.1)
			{
				pms_with_19.push_back(pm_with_19+2*MASS_PROTON);
				charges.push_back(charge);
			} */

			if (prm_ptrs.size()<pms_with_19.size())
				prm_ptrs.resize(pms_with_19.size(),NULL);
			
			generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(prm_ptrs,
				model,&as,true,300,6,14,pms_with_19,charges,seqpath_solutions);

		//	generate_denovo_solutions_with_good_start_end_idxs(prm_ptr,model,&as,true,
		//								pm_with_19,charge,300,6,14,seqpath_solutions);

	
			int j;
			for (j=0; j<seqpath_solutions.size(); j++)
			{
				const int num_correct_aas = seqpath_solutions[j].get_num_correct_aas(correct_peptide,config);
				if (num_correct_aas==seqpath_solutions[j].get_num_aa())
				{
					seqpath_solutions[j]=seqpath_solutions[seqpath_solutions.size()-1];
					seqpath_solutions.pop_back();
				}
			}
		
			vector<PeptideSolution> pep_solutions;
			if (seqpath_solutions.size()>0)
			{
				// create peptide solutions
				pep_solutions.resize(seqpath_solutions.size());
				int s_idx;
				for (s_idx=0; s_idx<seqpath_solutions.size(); s_idx++)
				{
					convert_seq_path_to_peptide_soluition_and_fill_in_aas(config,
						correct_peptide, seqpath_solutions[s_idx],pep_solutions[s_idx]);
				}
			}
			else
				continue;


			vector<int> pep_sol_idxs;
			pep_sol_idxs.clear();
		
			vector<score_pair> rerank_scores;
			// re-order the de novo solutions according to previous model
			if (0 || rerank_path)
			{
				
				previous_rerank_model.score_complete_sequences(pep_solutions,ssf,peaks,
									  bs.num_peaks, rerank_scores, size_idx);
				
				sort(rerank_scores.begin(),rerank_scores.end());
				pep_sol_idxs.push_back(0); // include top-scoring de novo in any case
				pep_sol_idxs.push_back(1);

				const int num_top = num_denovo_peptides_per_set/2+1;
				int j;
				for (j=0; j<num_top; j++)
					pep_sol_idxs.push_back(rerank_scores[j].idx);
				
				const int num_left = num_denovo_peptides_per_set - pep_sol_idxs.size();

				for (j=0; j<num_left; j++)
				{
					int p = int(my_random()*(rerank_scores.size()-pep_sol_idxs.size()))+pep_sol_idxs.size();
					pep_sol_idxs.push_back(rerank_scores[p].idx);
				}
				sort(pep_sol_idxs.begin(),pep_sol_idxs.end());
			}
			else // use original order
			{
				const int num_top = int(num_denovo_peptides_per_set*0.6)+1;
				int j;
				for (j=0; j<num_top; j++)
					pep_sol_idxs.push_back(j);
				
				const int num_left = num_denovo_peptides_per_set - pep_sol_idxs.size();
				for (j=0; j<num_left; j++)
				{
					int p = int(my_random()*(pep_solutions.size()-pep_sol_idxs.size()))+pep_sol_idxs.size();
					pep_sol_idxs.push_back(p);
				}
			}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -