📄 denovoranktrain.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
		{
			int k;
			for (k=0; k<num_den_available; k++)
				den_idxs.push_back(den_start_idx+k);
		}

		for (j=0; j<den_idxs.size(); j++)
		{
			const int sam_idx = den_idxs[j];
			const int type = set.incorrect_sols[sam_idx].type;
			if (type != 2)
			{
				cout << "Error: adding non denovo type: " << set.incorrect_sols[sam_idx].type << endl;
				cout << "sam idx: " << sam_idx << "  out of " << set.incorrect_sols.size() << endl;
				int k;
				for (k=0; k<set.incorrect_sols.size(); k++)
				{
					cout << k << "\t" << set.incorrect_sols[k].type << endl;
				}
				exit(1);
			}

			RankBoostSample bad_rbs;
			fill_complete_peptide_rbs(set.incorrect_sols[sam_idx], peaks, num_peaks, as, 
				pmc_sqs_res, bad_rbs, size_idx);

			const int bad_sam_idx=rds.get_num_samples();

			bad_rbs.group_idx = group_idx;
			bad_rbs.rank_in_group=2;

			if (peptide_strings && add_to_train)
			{		
				bad_rbs.tag1=peptide_strings->size();	
				peptide_strings->push_back(set.incorrect_sols[sam_idx].pep.as_string(config));

			}

			// give higher ranked db mismatches a stronger weight;
			bad_rbs.tag2=set.incorrect_sols[sam_idx].type;
			bad_rbs.tag3=set.incorrect_sols[sam_idx].num_correct_aas;

			rds.add_sample(bad_rbs);
			rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[sam_idx].type);

			if (add_to_train)
				num_train_pairs++;
		}
		num_spectra_read++;

		if ((model_type == 2 && num_spectra_read % 200 == 0) ||
		    (model_type == 0 && num_spectra_read % 1000 == 0) )
		{
			cout << "processed " << num_spectra_read << "/" << set_counter << " spectra (";
			double curr_time = time(NULL);
			cout << curr_time - start_time << " secs.)" << endl;
		}
	}

	if (test_scan_file)
		test_scan_stream.close();

	
	const int total_num_cross = max_num_pairs - train_ds.get_phi_support().size() - test_ds.get_phi_support().size();
	const int num_train_db_cross = (int)(total_num_cross * train_ratio);
	const int num_test_db_cross  = total_num_cross - num_train_db_cross;

	if (ratio_db_cross>0)
	{
		cout  << "Adding " << num_train_db_cross << " db cross samples to train" <<endl;
		// add cross db samples to train
		if (num_train_db_cross>0) 
		{
			const vector<RankBoostSample>& samples = train_ds.get_samples();
			vector<SamplePairWeight>& phi_support  = train_ds.get_non_const_phi_support();
			
			int i;
			for (i=0; i<num_train_db_cross; i++)
			{
				const int corr_idx = (int)(num_train_pairs * my_random());
				int bad_idx=-1;
				
				while (bad_idx<0 || phi_support[bad_idx].tag != SOL_INCORRECT_DB) 
				//	samples[phi_support[bad_idx].idx1].rank_in_group != 1) // cross db with strongest bad db hit
					bad_idx = (int)(num_train_pairs * my_random());
			
				SamplePairWeight new_pair = phi_support[bad_idx];
				new_pair.idx2 = phi_support[corr_idx].idx2;
				new_pair.tag = SOL_INCORRECT_DB_CROSS;
				new_pair.weight=1.0;
				phi_support.push_back(new_pair);
			}
			re_weight_complete_denovo_phi_support(phi_support, ratio_db, ratio_denovo, ratio_db_cross);
		}

		cout  << "Adding " << num_test_db_cross << " db cross samples to test" <<endl;
		// add cross db samples to test
		if (num_test_db_cross>0)
		{
			const vector<RankBoostSample>& samples = test_ds.get_samples();
			vector<SamplePairWeight>& phi_support  = test_ds.get_non_const_phi_support();
			const int org_support_size = phi_support.size();
			for (i=0; i<num_test_db_cross; i++)
			{
				const int corr_idx = (int)(org_support_size * my_random());
				int bad_idx=-1;
				
				while (bad_idx<0 || phi_support[bad_idx].tag != SOL_INCORRECT_DB )
				//	samples[phi_support[bad_idx].idx1].rank_in_group != 1) // cross db with strongest bad db hit
					bad_idx = (int)(org_support_size * my_random());

				SamplePairWeight new_pair = phi_support[bad_idx];
				new_pair.idx2 = phi_support[corr_idx].idx2;
				new_pair.tag = SOL_INCORRECT_DB_CROSS;
				new_pair.weight=1.0;
				phi_support.push_back(new_pair);
			}
			re_weight_complete_denovo_phi_support(phi_support,ratio_db,ratio_denovo,ratio_db_cross);
		}
	}


	train_ds.set_num_groups(num_groups_in_train);

	test_ds.set_num_groups(num_groups_in_test);
			
	cout << "computing phi weights..." << endl;
	train_ds.compute_total_phi_weight();
	
	cout << "initializing potential lists..." << endl;
	train_ds.initialize_potenital_lists();
	
	cout << "initializing real feature table..." << endl;
	train_ds.initialzie_real_feature_table(part_model->get_feature_names().size());

	cout << "Processed " << num_spectra_read << " spectra" << endl;
	cout << num_groups_in_train << " sets in training set (" << train_ds.get_phi_support().size() << " pairs)" << endl;
	cout << num_groups_in_test  << " sets in test set     (" << test_ds.get_phi_support().size() << " pairs)" << endl << endl;
}



bool cmp_rerank_score(const SeqPath& a, const SeqPath& b)
{
	return (a.rerank_score>b.rerank_score);
}


void DeNovoRankScorer::select_sample_pairs(const vector<SeqPath>& solutions, 
										   const vector<int>& corr_idxs,
										   const vector<int>& bad_idxs, 
										   vector<weight_pair>& sample_pairs, 
										   int num_pairs) const
{
	const int num_iters=3*num_pairs;
	const double log_bad_size = log((double)bad_idxs.size());
	sample_pairs.clear();
	float total_w = 0;

	int i;
	for (i=0; i<num_iters && sample_pairs.size()<num_pairs; i++)
	{
		int corr_pos = 0;
		int bad_pos = i;

		if (i>2 && corr_idxs.size()>0)
		{
			double r = my_random();
			if ((r<0.4 && solutions[0].org_rank>0) || r<0.15)
			{
				r=0;
			}
			else
			{
				r = my_random();
			}
			corr_pos = int(r*(double)corr_idxs.size()); 
		}

		const int corr_idx = corr_idxs[corr_pos];
		const int rank_corr = solutions[corr_idx].org_rank;
		
		if (! (i<3 && rank_corr < 25))
		{
			double log_corr_rank = log(1.0 + rank_corr);
			double min_log = 0;
			if (rank_corr>150)
				min_log = log_corr_rank - 3.5;
			
			double max_u = exp(min_log+(log_bad_size+1-min_log) * my_random());
			if (max_u>(double)bad_idxs.size())
				max_u=(double)bad_idxs.size();

			double min_u = exp(min_log)-1;
			
			bad_pos = (int)(min_u + my_random()*(max_u-min_u));
		}

	/*	double min_log,max_log;
		if (rank_corr<=5)
		{
			min_log=0;
			max_log=2.75;
		} 
		else if (rank_corr<=25)
		{
			min_log=0;
			max_log=3.25;
		}
		else if (rank_corr<500)
		{
			min_log=log_corr_rank-3.0;
			max_log=log_corr_rank;
		}
		else
		{
			min_log=log_corr_rank-3.0;
			max_log=log_corr_rank-1.0;
		}

		
		if (max_log>log_bad_size)
			max_log = log_bad_size;

		if (min_log<0)
			min_log=0;

		int bad_pos=i;
		if (rank_corr<20 && i<4)
		{
			bad_pos = i;
			if (i==3)
				bad_pos+=(int)(my_random()*3);
		}
		else
		{
			if (my_random()<0.8)
			{
				bad_pos = (int)(exp(min_log + (my_random()*(max_log-min_log))))-1;
			}
			else
			{
				double m = (log_corr_rank < log_bad_size-1.5 ? log_corr_rank : log_bad_size -1.5);
				bad_pos = (int)(exp(m+ (my_random()*(log_bad_size-m))))-1;
			}
		} */
		
		if (bad_pos<0)
			bad_pos=0;
		if (bad_pos>=bad_idxs.size())
			continue;

		const int bad_idx  = bad_idxs[bad_pos];
		int j;
		for (j=0; j<sample_pairs.size(); j++)
			if (sample_pairs[j].idx_bad == bad_idx && sample_pairs[j].idx_corr == corr_idx)
				break;

		if (j<sample_pairs.size())
			continue;

		
		int rank_bad  = solutions[bad_idx].org_rank;
	//	float w = (rank_bad<rank_corr ? 1.0 : 1.0 + 0.333*(log((double)(1+rank_bad)) - log((double)(1+rank_corr))));
		float w=1.0;
		sample_pairs.push_back(weight_pair(corr_idx,bad_idx,w));
		total_w += w;
	//	cout << "Added " << sample_pairs.size() << " :\t" << corr_idx << "\t" << bad_idx << "\t" << w << endl;
	}
//	cout << endl;

	if (total_w>0)
	{
		float mult_val = (float)(sample_pairs.size())/total_w;
		for (i=0; i<sample_pairs.size(); i++)
			sample_pairs[i].weight*=mult_val;
	}
}






/*************************************************************************************
Creates the training data for models of de novo predictions (6-14).
These sequences do not have to reach the N- or C-terminals
**************************************************************************************/
void DeNovoRankScorer::create_training_data_for_partial_denovo_ranking(
				const string& mgf_list,
				const double train_ratio,
				const int max_num_pairs,
				const int charge,
				const int size_idx,
				const float penalty_for_bad_aa,
				RankBoostDataset& train_ds, 
				RankBoostDataset& test_ds,
				char *test_scan_file,
				int   length_limit)
{	
	PeakRankModel *&peak_model = peak_prediction_models[model_type];
	const int max_num_ppp_frags = 4;
	const int max_num_pairs_per_spec = 80;
	const mass_t tolerance_diff = 0.275;

	if (model_type !=1 && model_type !=3)
	{
		cout << "Error: this training function is only intended for partial de novo samples!" << endl;
		cout << "Need to set model type to 1  or 3, not " << model_type << endl;
		exit(1);
	} 
	

	train_ds.clear();
	test_ds.clear();

	Config *config = model->get_config();

	config->set_use_spectrum_charge(1);

	const vector< vector<mass_t> >& size_thresholds = peak_model->get_size_thresholds();

	mass_t min_mz=0;
	mass_t max_mz = 10000;
	if (size_idx>0)
		min_mz = (size_thresholds[charge][size_idx-1]+charge-1)/(mass_t)charge;
	if (size_idx< size_thresholds[charge].size())
		max_mz = (size_thresholds[charge][size_idx]+charge-1)/(mass_t)charge;

	int num_solutions= (length_limit<=8 ? 1000 :  2000);

	if (model_type == 3 && model_length<10)
	{
		if (model_length<= 4)
			num_solutions= 100;
		if (model_length == 5)
			num_solutions = 250;
		if (model_length > 5)
			num_solutions = 750;
	}

	if (length_limit>15)
		length_limit=15;

	if (charge >2 && length_limit>10)
		length_limit=10;


	cout << endl << "Model type " << model_type << ", model length " << model_length << endl;
	cout << "Length limit : " << length_limit << endl;
	cout << "Num solutions : " << num_solutions << endl;
//	max_mz = min_mz + 30;
	
	cout << "Charge " << charge << " size " << size_idx << endl;
	cout << "Min m/z " << setprecision(2) << fixed << min_mz  << "  Max m/z " << max_mz << endl;

	double start_time = time(NULL);

	// read spectra
	FileManager fm;
	FileSet	    fs;
	BasicSpecReader bsr;
	QCPeak		peaks[4000];

	fm.init_from_list_file(model->get_config(),mgf_list.c_str());
	fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
	fs.randomly_reduce_ssfs(36000);
	
	int num_pairs_per_spec = (int)(max_num_pairs/(float)fs.get_total_spectra() + 0.5);
	if (num_pairs_per_spec>max_num_pairs_per_spec)
		num_pairs_per_spec=max_num_pairs_per_spec;

	cout << "Read " << fs.get_total_spectra() << " spectra headers" << endl;
	if (fs.get_total_spectra()<10)
	{
		cout << "Error: not enough spectra to train!" << endl;
		exit(1);
	}

	
	cout << "Training with " << num_pairs_per_spec << " pairs per specturm.." << endl;
	cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;

	vector<int> ppp_frag_type_idxs;
	ppp_frag_type_idxs.clear();

	cout << "Using a combined peak model" << endl;
	cout << "Selecting peaks of the paritally mobile model" << endl;

	ppp_frag_type_idxs = peak_model->get_model_ptr(charge,size_idx,PARTIALLYMOBILE)->get_fragment_type_idxs();
	int f;
	for (f=0; f<ppp_frag_type_idxs.size(); f++)
		cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
	cout << endl;

	sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());
	DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
	part_model->init_features(model_type, charge,size_idx,ppp_frag_type_idxs,config);
		
	int num_groups_in_train=0;
	int num_groups_in_test=0;
	int num_train_pairs=0;

	ofstream test_scan_stream;
	if (test_scan_file)
	{
		test_scan_stream.open(test_scan_file);
		if (! test_scan_stream.is_open())
		{
			cout << "Error: couldn't open test stream!" << endl;
			exit(1);
		}
	}
	
	
	static vector<PrmGraph *> prm_ptrs;
	static vector<SeqPath> solutions;
	const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
	int spec_idx;
	int num_spectra_read=0;

	int num_without_sol_in_graph=0;
	int num_without_correct_sol =0;
	int num_possible=0;

	config->set_use_spectrum_charge(1);

	for (spec_idx=0; spec_idx<all_ssfs.size(); spec_idx++)
	{
		MGF_single *ssf = (MGF_single *)all_ssfs[spec_idx];
		const Peptide& full_pep = ssf->peptide;
		const mass_t true_mass_with_19 = full_pep.get_mass_with_19();
		BasicSpectrum     bs;
		AnnotatedSpectrum as;
		PrmGraph prm;

		const vector<int>& aas = ssf->peptide.get_amino_acids();
		int a;
		for (a=0; a<aas.size(); a++)
			if (aas[a]>Val)
				break;
		if (a<aas.size())
			continue;

		num_possible++;
//
//		if (spec_idx>2000)
//			break;

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		
		prm.create_graph_from_spectrum(model,
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -