⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovosolutions.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
			{
				vector<SeqPath> alt_solutions;

				generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
							  alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
				int j;
				for (j=0; j<alt_solutions.size(); j++)
					solutions.push_back(alt_solutions[j]);


			}
			
			
			if (solutions.size() == 0)
			{
				ssf->print_ssf_stats(config,out_stream);
				out_stream << "   - no tags found" << endl;
				no_tags++;
				continue;
			}
			

			output_tag_solutions(ssf,config,out_stream,solutions);

			if (ssf->peptide.get_num_aas()>2)
			{
				num_benchmark++;
				int idx;
				for (idx=0; idx<solutions.size() && num_solutions; idx++)
					if (solutions[idx].check_if_correct(ssf->peptide.as_string(config),config))
					{
						correct_benchmark++;
						break;
					}
			}


			scan_count++;
			
		}
	}
	

	if (report_progress)
	{
		time_t curr_time = time(NULL);
		double elapsed_time = (curr_time - last_time);
		cout << "Total running time: " << elapsed_time << endl;
	}

	if (num_benchmark>0)
	{
		cout << endl << "Correct " << correct_benchmark << "/" << num_benchmark << " (" <<
			setprecision(3) << fixed << (float)correct_benchmark/num_benchmark << ")" << endl;
	}
}




// makes a FASTA file with the sequences of full denovo sequences (completed
// from the SEQ in the annotated mgf file)
void make_denovo_training_fa(AdvancedScoreModel& model,
								 char *spectrum_file)
{
	Config *config = model.get_config();
	FileManager fm;
	FileSet fs;


	// Quick read, get all pointers to begining of spectra
	if (get_file_extension_type(spectrum_file) != MZXML)
	{
		fm.init_from_file(config,spectrum_file);
	}
	else // reads peaks 
		fm.init_and_read_single_mzXML(config,spectrum_file,0);

	fs.select_all_files(fm);

	cout << "Generating fa for: " << spectrum_file << endl;
	cout << "Scans: " << fs.get_total_spectra() << endl;

	string file_name,fa_file;
	get_file_name_without_extension(spectrum_file,file_name);
	fa_file = file_name + ".dnv.fa";
	ofstream fa_stream(fa_file.c_str());
	
	time_t start_time;
	start_time = time(NULL);

	vector<PrmGraph *> prm_ptrs;
	BasicSpecReader bsr;
	int too_few_peaks=0;
	int bad_charge=0;
	int no_sols=0;
	int scan_count=0;

	prm_ptrs.resize(50,NULL);

	///////////////////////////////////////////////
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	int sc;
	for (sc=0; sc<all_ssf.size(); sc++)
	{
		static vector<QCPeak> peaks;
		SingleSpectrumFile *ssf = all_ssf[sc];
		if (peaks.size()<ssf->num_peaks)
		{
			int new_size = ssf->num_peaks*2;
			if (new_size<2500)
				new_size=2500;
			peaks.resize(new_size); 
		}

	//	if (ssf->get_scan()<11254)
	//		continue;

	
		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
		ssf->file_idx = 0;

		if (num_peaks<10)
		{
			too_few_peaks++;
			continue;
		}

		Spectrum s;
		s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

		if ( ssf->charge > model.get_max_score_model_charge())
		{
			bad_charge++;
			continue;
		}

		vector<SeqPath> seqpath_solutions;
		vector<int> charges;
		vector<mass_t> pms_with_19;

		pms_with_19.clear();
		charges.clear();		
		BasicSpectrum bs;
		bs.ssf = ssf;
		bs.peaks = &peaks[0];
		bs.num_peaks = num_peaks;

		Peptide correct_peptide = ssf->peptide;

		// output m/z and prob values for the different charge states
		vector<PmcSqsChargeRes> pmcsqs_res;
		model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);
	
		cout << "Scan " << bs.ssf->get_scan() << ",\tch: " << charges[0] << setprecision(1) << "\tpm19: " <<
			pms_with_19[0] << setprecision(3) << " \tSQS: " <<  pmcsqs_res[charges[0]].sqs_prob;
		if (pmcsqs_res[charges[0]].sqs_prob<0.01 && ! config->get_use_spectrum_charge())
		{
		//	cout << "  - skipping..." << endl;
		//	continue;
		}

		pms_with_19.clear();
		charges.clear();

		pms_with_19.push_back(s.get_true_mass_with_19());
		charges.push_back(s.get_charge());
	
		generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(prm_ptrs,
				&model,&s,true,150,6,14,pms_with_19,charges,seqpath_solutions);
		
		int j;
		for (j=0; j<seqpath_solutions.size(); j++)
		{
			const int num_correct_aas = seqpath_solutions[j].get_num_correct_aas(correct_peptide,config);
			if (num_correct_aas==seqpath_solutions[j].get_num_aa())
			{
				seqpath_solutions[j]=seqpath_solutions[seqpath_solutions.size()-1];
				seqpath_solutions.pop_back();
			}
		}
		
		vector<PeptideSolution> pep_solutions;
		if (seqpath_solutions.size()>0)
		{
			// create peptide solutions
			pep_solutions.resize(seqpath_solutions.size());
			int s_idx;
			for (s_idx=0; s_idx<seqpath_solutions.size(); s_idx++)
			{
				convert_seq_path_to_peptide_soluition_and_fill_in_aas(config,
					correct_peptide, seqpath_solutions[s_idx],pep_solutions[s_idx]);
			}
		}
		else
			continue;
		

		if (seqpath_solutions.size() == 0)
		{
			cout << "   - no sols" << endl;
			no_sols++;
			continue;
		}
		

		fa_stream << ">" << file_name << ":" << ssf->get_scan() << endl;

		// sample de novo sequences
		int i;
		for (i=9; i<pep_solutions.size(); i+=10)
		{
			int idx = int(my_random()*i);
			pep_solutions[idx].pep.convert_to_org(config);
			fa_stream << pep_solutions[idx].pep.as_string(config);
			pep_solutions[idx].num_correct_aas=pep_solutions[idx].pep.calc_number_of_correct_aas(config,correct_peptide);
			cout << " " << idx << "(" << pep_solutions[idx].num_correct_aas << ")";
		}
		cout << endl;
		fa_stream << endl;


		scan_count++;

	//	cout << "   " << seqpath_solutions.size() << " sols." << endl;

		if (sc % 200 == 0)
		{
			time_t curr_time = time(NULL);
			cout << setprecision(1) << fixed;
			cout << "done " << sc << "/" << all_ssf.size() << "  " << curr_time - start_time << " secs.,  "
				<< scan_count << " scans completed." << endl;
		}
		
	}
	fa_stream.close();

	cout << "Done..." << endl;
	cout << "fa file: " << fa_file << endl;
	cout << "Wrote fa for " << scan_count << " spectra." << endl;
	if (bad_charge>0)
		cout << "Scans with bad charges  : " << bad_charge << endl;
	if (too_few_peaks>0)
		cout << "Scans with too few peaks: " << too_few_peaks << endl;
	if (no_sols>0)
		cout << "Scans for which no tags were found: " << no_sols << endl;
}


void benchmark_tags(AdvancedScoreModel& model,
					char *list,
					char *tag_string,
					int num_test_cases)
{
	Config *config = model.get_config();
	FileManager fm;
	FileSet fs;

	vector<int> num_tags; // how many tags from each length
	int main_length;

	parse_tag_string(tag_string,main_length,num_tags);
	fm.init_from_list_file(config,list);
	fs.select_all_files(fm);

	cout << "benchmarking tags for : " << list << endl;
	cout << "Scans                 : " << fs.get_total_spectra() << endl;
	cout << "Tag types             : " << tag_string << endl;
	if (num_test_cases<0)
	{
		num_test_cases=fs.get_total_spectra();
	}
	else
	{
		fs.randomly_reduce_ssfs(num_test_cases);
	}
	cout << "Will try to get " << num_test_cases << " test cases" << endl;
	
	time_t start_time;
	start_time = time(NULL);

	vector<PrmGraph *> prm_ptrs;
	BasicSpecReader bsr;
	int too_few_peaks=0;
	int bad_charge=0;
	int no_tags=0;
	int scan_count=0;
	vector<int> first_correct_len;
	vector<int> correct_ranks;

	int num_spectra_tested=0;
	int had_correct =0;
	first_correct_len.resize(10,0);

	prm_ptrs.resize(50,NULL);

	///////////////////////////////////////////////
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	int sc;
	for (sc=0; sc<all_ssf.size(); sc++)
	{
		static vector<QCPeak> peaks;
		SingleSpectrumFile *ssf = all_ssf[sc];
		string peptide_str = ssf->peptide.as_string(config);

		if (peaks.size()<ssf->num_peaks)
		{
			int new_size = ssf->num_peaks*2;
			if (new_size<2500)
				new_size=2500;
			peaks.resize(new_size); 
		}

		vector<int>    tweak_charges;
		vector<mass_t> tweak_pms_with_19;

		tweak_charges.resize(6,0);
		tweak_pms_with_19.resize(6,0);

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
		ssf->file_idx = 0;

		if (num_peaks<10)
		{
			too_few_peaks++;
			continue;
		}

		Spectrum s;
		s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

	//	s.print_spectrum();

		if ( ssf->charge > model.get_max_score_model_charge())
		{
			bad_charge++;
			continue;
		}

		vector<SeqPath> solutions;
		vector<mass_t> pms_with_19, alt_pms_with_19;
		vector<int>    charges,     alt_charges;

		pms_with_19.clear();
		charges.clear();		
		BasicSpectrum bs;
		bs.ssf = ssf;
		bs.peaks = &peaks[0];
		bs.num_peaks = num_peaks;

	
		
		// output m/z and prob values for the different charge states
		vector<PmcSqsChargeRes> pmcsqs_res;
		model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);

		if (pmcsqs_res[charges[0]].sqs_prob<0.01)
		{
		//	continue;
		}

		cout << sc << "\t" << num_peaks << "\t" << setprecision(2) << pmcsqs_res[charges[0]].sqs_prob << "\t" <<
			ssf->peptide.as_string(config) << "  (" << ssf->peptide.get_mass_with_19() - s.get_org_pm_with_19() << ")" << endl;

		
		alt_charges.clear();
		alt_pms_with_19.clear();
		while (charges.size()>0 && charges[0] != charges[charges.size()-1])
		{
			alt_charges.push_back(charges[charges.size()-1]);
			charges.pop_back();
			alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
			pms_with_19.pop_back();
		}

		if (pms_with_19.size()>0)
		{
			generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
						  pms_with_19,charges,solutions);

	
		}

		if (alt_pms_with_19.size()>0)
		{
			vector<SeqPath> alt_solutions;

			generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
						  alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
			int j;
			for (j=0; j<alt_solutions.size(); j++)
				solutions.push_back(alt_solutions[j]);
		}
		
		num_spectra_tested++;

		if (num_test_cases==num_spectra_tested)
			break;
		
		if (solutions.size() == 0)
		{
			no_tags++;
			continue;
		}
		

		int j;
		for (j=0; j<solutions.size(); j++)
			if (solutions[j].check_if_correct(peptide_str,config))
			{
				had_correct++;
				first_correct_len[solutions[j].get_num_aa()]++;
				correct_ranks.push_back(j);
				break;
			}

		if (sc % 200 == 0)
		{
			time_t curr_time = time(NULL);
			cout << setprecision(1) << fixed;
			cout << "done " << sc << "/" << all_ssf.size() << "  " << curr_time - start_time << " secs.,  "
				<< scan_count << " scans completed." << endl;
		}	
	}

	cout << "Tested " << num_spectra_tested << " spectra" << endl;
	if (bad_charge>0)
		cout << "Scans with bad charges  : " << bad_charge << endl;
	if (too_few_peaks>0)
		cout << "Scans with too few peaks: " << too_few_peaks << endl;
	if (no_tags>0)
		cout << "Scans for which no tags were found: " << no_tags << endl;

	cout << "% with correct " << setprecision(3) << (float)had_correct/(float)num_spectra_tested << endl;
	cout << "First length breakdown: " << endl;
	int j;
	for (j=1; j<first_correct_len.size(); j++)
		if (first_correct_len[j]>0)
			cout << j << "\t" << (float)first_correct_len[j]/(float)num_spectra_tested << endl;
	
	cout << endl;

//	1 3 5 10 25 50 100
	vector<int> counts,seps;

	seps.push_back(1);
	seps.push_back(3);
	seps.push_back(5);
	seps.push_back(10);
	seps.push_back(25);
	seps.push_back(50);
	seps.push_back(100);
	seps.push_back(250);
	seps.push_back(500);

	counts.resize(seps.size(),0);
	for (j=0; j<correct_ranks.size(); j++)
	{
		int k;
		for (k=0; k<seps.size(); k++)
			if (correct_ranks[j]<seps[k])
				counts[k]++;
	}
	
	for (j=0; j<seps.size(); j++)
	{
		cout << seps[j] << "\t" << counts[j]/(float)all_ssf.size() << endl;
	}
	cout << endl;
}


void perform_denovo_on_list_of_files(AdvancedScoreModel& model, 
									 const vector<string>& list_vector, 
									 int file_start_idx,
									 int num_solutions, 
									 int min_length, 
									 int max_length,
									 bool report_progress,
									 ostream& out_stream)
{

	int num_rerank_sols = 200;
	
	Config *config = model.get_config();
	int correct_benchmark=0;
	int total_benchmark=0;
	int spec_counter=0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -