📄 denovoranktrain.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:

			sort(pep_sol_idxs.begin(),pep_sol_idxs.end());

			// add samples
			for (j=0; j<pep_sol_idxs.size(); j++)
			{
				const int pep_sol_idx = pep_sol_idxs[j];

				if (are_same_upto_NDILQK(set.correct_sol.pep,pep_solutions[pep_sol_idx].pep))
					continue;
				
				RankBoostSample bad_rbs;
				fill_complete_peptide_rbs(pep_solutions[pep_sol_idx], peaks, num_peaks, as, 
						pmc_sqs_res, bad_rbs, size_idx);

				const int bad_sam_idx=rds.get_num_samples();

				bad_rbs.group_idx = group_idx;
				bad_rbs.rank_in_group=2;

				if (peptide_strings && add_to_train)
				{		
					bad_rbs.tag1=peptide_strings->size();	
					peptide_strings->push_back(pep_solutions[pep_sol_idx].pep.as_string(config));
				}

				// give higher ranked db mismatches a stronger weight;
				bad_rbs.tag2= 2; // full de novo
				bad_rbs.float_tag1 = set.correct_sol.pm_with_19;
				
				rds.add_sample(bad_rbs);
				rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, 2);
	
				if (add_to_train)
					num_train_pairs++;

				if (print_sams)
				{
					float score;
					bad_rbs.get_feature_val(117,&score);
					cout << j << "\t" << pep_sol_idx << "\t" << 
						pep_solutions[pep_sol_idx].pep.as_string(config) << "\t" << score << endl;
				}
			}
		}

		if (bad_pm)
			num_rand++;

		if (good_first)
			num_first++;
		
		int num_denovo_to_add=0;
	
		num_spectra_read++;

		if ((model_type == 2 && num_spectra_read % 200 == 0) ||
		    (model_type == 0 && num_spectra_read % 1000 == 0) )
		{
			cout << "processed " << num_spectra_read << "/" << set_counter << " spectra (";
			double curr_time = time(NULL);
			cout << curr_time - start_time << " secs.)\t" << setprecision(3) << fixed << " pm acc: " <<
				1.0 - num_rand/(float)num_spectra_read <<"\tfirst: " << num_first/(float)num_spectra_read<<  endl;
		}
	}

	if (test_scan_file)
		test_scan_stream.close();

	train_ds.set_num_groups(num_groups_in_train);

	test_ds.set_num_groups(num_groups_in_test);
			
	cout << "computing phi weights..." << endl;
	train_ds.compute_total_phi_weight();
	
	cout << "initializing potential lists..." << endl;
	train_ds.initialize_potenital_lists();
	
	cout << "initializing real feature table..." << endl;
	train_ds.initialzie_real_feature_table(part_model->get_feature_names().size());

	cout << "Processed " << num_spectra_read << " spectra" << endl;
	cout << num_groups_in_train << " sets in training set (" << train_ds.get_phi_support().size() << " pairs)" << endl;
	cout << num_groups_in_test  << " sets in test set     (" << test_ds.get_phi_support().size() << " pairs)" << endl << endl;
}


/*************************************************************************************
Creates the training data for models of complete predictions, i.e., it is assumed that
all peptides being scores start at mass 0 and end at the c-terminal. Therefore, we can
use the peptides mass as the spectrum's pm_with_19.
**************************************************************************************/
void DeNovoRankScorer::create_training_data_for_complete_sequence_ranking(
				const string& db_dir,
				const string& correct_dir,
				const string& denovo_dir,
				const string& mgf_list,
				const double train_ratio,
				const int max_num_pairs,
				const int charge,
				const int size_idx,
				RankBoostDataset& train_ds, 
				RankBoostDataset& test_ds,
				vector<string>* peptide_strings,
				char *test_scan_file,
				float ratio_db,
				float ratio_denovo,
				float ratio_db_cross)
{	
	const int max_num_ppp_frags = 4;
	if (model_type !=0 && model_type != 2)
	{
		cout << "Error: this training function is only intended for full sequence samples!" << endl;
		cout << "Need to set model type to 0  or 2, not " << model_type << endl;
		exit(1);
	}

	if (ratio_db<=0)
		ratio_db=0;

	if (ratio_denovo<0)
		ratio_denovo=0;

	if (ratio_db_cross<0)
		ratio_db_cross=0;

	vector<bool>   file_indicators;
	PeptideSetMap	 psm;

	Config *config = model->get_config();

	if (peptide_strings)
		peptide_strings->clear();

	double start_time = time(NULL);

	// read sample peptide sequences
	create_complete_denovo_set_map(config,mgf_list,db_dir,correct_dir,
		denovo_dir,charge,size_idx,psm, file_indicators);

	// Read previous rerank path
	if (model_type == 2)
		ratio_db_cross = 0;


	const int total_num_pairs=assign_denovo_weights_to_sets(psm, ratio_db, ratio_denovo);
	const float total_ratio = ratio_db + ratio_denovo+ratio_db_cross;
	const int num_db_pairs     = (int)(max_num_pairs * (ratio_db/total_ratio));
	const int num_denovo_pairs = (int)(max_num_pairs * (ratio_denovo/total_ratio));

	// read spectra
	FileManager fm;
	FileSet	    fs;
	BasicSpecReader bsr;
	QCPeak		peaks[2000];

	fm.init_from_list_file(model->get_config(),mgf_list.c_str(),file_indicators);
	fs.select_all_files(fm);

	const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();

	int i;
	int num_spectra_read=0;

	cout << endl << "Read " << all_ssfs.size() << " headers" << endl << endl;
	cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;
	cout << "Using following proportions for training data:" << endl;
	cout << ratio_db << " pairs of correct, incorrect db hits (same spectrum)" << endl;
	cout << ratio_denovo   << " pairs of correct, incorrect full dnv (same spectrum)" << endl;
	cout << ratio_db_cross << " pairs of correct, incorrect db hits (different spectra)" << endl;
	
	// first select most abundant fragments, use first 1000 ssfs for statistics
	vector<int> frag_counts;
	frag_counts.resize(config->get_all_fragments().size(),0);
	for (i=0; i<all_ssfs.size() && i<1000; i++)
	{
		MGF_single *ssf = (MGF_single *)all_ssfs[i];
		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		BasicSpectrum     bs;
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;

		vector< vector<float> > ann_intens;
		vector< vector<mass_t> > ann_masses;
		AnnotatedSpectrum as;
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		as.set_peptide(ssf->peptide);
		as.annotate_spectrum(ssf->peptide.get_mass_with_19(),true);
		as.extract_annotated_intens_and_masses(ann_intens,ann_masses);

		int f;
		for (f=0; f<ann_intens.size(); f++)
		{
			int j;
			for (j=0; j<ann_intens[f].size(); j++)
				if (ann_intens[f][j]>0)
					frag_counts[f]++;
		}
	}


	vector<int> ppp_frag_type_idxs;
	ppp_frag_type_idxs.clear();

	// The peak prediction fragments are used elsewhere too
	// select them here unless they are provided by the PeakRankModel
	// (if it is a combined model)
	PeakRankModel *&peak_model = peak_prediction_models[model_type];
	if (peak_model->get_feature_set_type() <= 2)
	{
		cout << "Selecting frag models:" << endl;
		
		for (i=0; i<max_num_ppp_frags; i++)
		{
			int max_frag=-1;
			int max_count=-1;
			int f;
			for (f=0; f<frag_counts.size(); f++)
				if (frag_counts[f]>max_count && 
					peak_model->has_intialized_models(charge,size_idx,f))
				{
					max_count = frag_counts[f];
					max_frag=f;
				}

			if (max_count<0)
			{
				cout << "Warning: not enough frag models initialized for charge " << charge <<
					"  size " << size_idx << endl;
				break;
			}
			ppp_frag_type_idxs.push_back(max_frag);
			cout << max_frag << "\t" << config->get_fragment(max_frag).label << "\t" << frag_counts[max_frag] << endl;
			frag_counts[max_frag]=-1;
		}
	}
	else
	{
		cout << "Using a combined peak model" << endl;
		cout << "Selecting peaks of the paritally mobile model" << endl;

		int mobility;
		for (mobility=MOBILE+1; mobility<NONMOBILE; mobility++)
		{
			const vector<int>& frag_idxs = peak_model->get_model_ptr(charge,size_idx,mobility)->get_fragment_type_idxs();
			int f;
			for (f=0; f<frag_idxs.size(); f++)
			{
				if (ppp_frag_type_idxs.size() == max_num_ppp_frags)
					break;

				int j;
				for (j=0; j<ppp_frag_type_idxs.size(); j++)
					if (ppp_frag_type_idxs[j] == frag_idxs[f])
						break;
				if (j==ppp_frag_type_idxs.size())
					ppp_frag_type_idxs.push_back(frag_idxs[f]);
			}
		}
		int f;
		for (f=0; f<ppp_frag_type_idxs.size(); f++)
			cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
		cout << endl;
	}

	sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());

	DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
	part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
		
	int num_groups_in_train=0;
	int num_groups_in_test=0;
	int num_train_pairs=0;

	ofstream test_scan_stream;
	if (test_scan_file)
	{
		test_scan_stream.open(test_scan_file);
		if (! test_scan_stream.is_open())
		{
			cout << "Error: couldn't open test stream!" << endl;
			exit(1);
		}
	}
	
	int set_counter=0;
	for (i=0; i<all_ssfs.size(); i++)
	{
		PeptideSetMap::const_iterator it;
		MGF_single *ssf = (MGF_single *)all_ssfs[i];
		scan_pair key(ssf->file_idx,ssf->idx_in_file);

		it = psm.find(key);
		if (it == psm.end())
			continue;

		set_counter++;
	}

	cout << set_counter << " sets available." << endl;
	int num_db_peptides_per_set = (int)(0.5 + (float)num_db_pairs/set_counter);
	if (num_db_peptides_per_set>7)
		num_db_peptides_per_set=7;
	if (num_db_peptides_per_set<1)
		num_db_peptides_per_set=1;
	if (ratio_db_cross>0 && num_db_peptides_per_set<4)
		num_db_peptides_per_set=4;

	int num_denovo_peptides_per_set = (int)(0.75 + (float)num_denovo_pairs/set_counter);
	if (num_denovo_peptides_per_set<1)
		num_denovo_peptides_per_set=1;
	if (num_denovo_peptides_per_set>10)
		num_denovo_peptides_per_set=10;

	const int org_num_denovo_peptides_per_set = num_denovo_peptides_per_set;

	cout << "NUM DB PER SET     : " << num_db_peptides_per_set << endl;
	cout << "NUM DE NOVO PER SET: " << num_denovo_peptides_per_set << endl;

	static vector<PrmGraph *> prm_ptrs;
	static vector<SeqPath>    seqpath_solutions;

	// Generate various types of samples from spectra
	for (i=0; i<all_ssfs.size(); i++)
	{
		PeptideSetMap::const_iterator it;
		MGF_single *ssf = (MGF_single *)all_ssfs[i];

		// remove samples with PTMs
		const vector<int>& aas = ssf->peptide.get_amino_acids();
		int j;
		for (j=0; j<aas.size(); j++)
			if (aas[j]>Val)
				break;
		if (j<aas.size())
			continue;

		scan_pair key(ssf->file_idx,ssf->idx_in_file);

		it = psm.find(key);
		if (it == psm.end())
			continue;

		const PeptideSet& set = (*it).second;
		BasicSpectrum     bs;
		AnnotatedSpectrum as;
		vector<PmcSqsChargeRes> pmc_sqs_res;

		int charge1=0,charge2=0;
		mass_t mz1=0,mz2=0;
		float prob1=0,prob2=0;

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;

		model->get_best_mz_charge(config,bs, &mz1, &charge1, &prob1,
								  &mz2, &charge2, &prob2, &pmc_sqs_res);

		const mass_t mass_with_19 = mz1*charge1 - (charge1-1.0);
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		as.set_corrected_pm_with_19(mass_with_19);
	
		const bool add_to_train = (my_random() <= train_ratio);
		RankBoostDataset& rds = (add_to_train ? train_ds : test_ds);

		if (test_scan_file && ! add_to_train)
			test_scan_stream << ssf->file_idx << " " << ssf->idx_in_file << " " <<
			ssf->peptide.get_num_aas() << endl;
	
		bool first=true;
		int corr_sam_idx=-1;
		int group_idx=-1;

		if (add_to_train)
		{
			group_idx = num_groups_in_train++;
		}
		else
		{
			group_idx = num_groups_in_test++;
		}

		RankBoostSample corr_rbs;
		fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
		corr_sam_idx = rds.get_num_samples();
		corr_rbs.group_idx=group_idx;
		corr_rbs.tag1=NEG_INF;
		corr_rbs.float_tag1 = set.correct_sol.pm_with_19;

		if (peptide_strings && add_to_train)
		{
			corr_rbs.tag1 = peptide_strings->size();
			peptide_strings->push_back(set.correct_sol.pep.as_string(config));
		}

		corr_rbs.rank_in_group=0;
		rds.add_sample(corr_rbs);

		// add db samples
		for (j=0; j<set.incorrect_sols.size() && j<num_db_peptides_per_set; j++)
		{
			const int type = set.incorrect_sols[j].type;
			if (type != 1)
				break;
			
			RankBoostSample bad_rbs;
			fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res, 
									bad_rbs, size_idx);

			int bad_sam_idx=rds.get_num_samples();

			bad_rbs.group_idx = group_idx;
			bad_rbs.rank_in_group=(j == 0 ? 1 : 2);

			if (peptide_strings && add_to_train)
			{		
				bad_rbs.tag1=peptide_strings->size();	
				peptide_strings->push_back(set.incorrect_sols[j].pep.as_string(config));
			}

			// give higher ranked db mismatches a stronger weight;

			bad_rbs.tag2=set.incorrect_sols[j].type;
			bad_rbs.tag3=set.incorrect_sols[j].num_correct_aas;
			bad_rbs.float_tag1 = set.correct_sol.pm_with_19;

			rds.add_sample(bad_rbs);
			rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[j].type); 
			
			if (add_to_train)
				num_train_pairs++;
		}
		int num_db_added=j;

		while (j<set.incorrect_sols.size() && set.incorrect_sols[j].type==1)
			j++;

		const int den_start_idx = j;
		
		// add the rerank samples
		int num_denovo_to_add = org_num_denovo_peptides_per_set;

		// add precomputed denovo samples
		int num_den_available = set.incorrect_sols.size() - den_start_idx;
		vector<int> den_idxs;
		den_idxs.clear();
		if (num_den_available>num_denovo_to_add)
		{
			choose_k_from_n(num_denovo_to_add,num_den_available,den_idxs);
			int k;
			for (k=0; k<num_denovo_to_add; k++)
				den_idxs[k]+=den_start_idx;
		}
		else
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -