⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovosolutions.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:

	DeNovoRankScorer *rank_model = (DeNovoRankScorer *)model.get_rank_model_ptr(1);
	static PrmGraph *prm_ptr = NULL;
	static vector<PrmGraph *> prm_ptrs;

	time_t start_time,last_time;
	start_time = time(NULL);
	last_time = start_time;

	int f;
	for (f=0; f<list_vector.size(); f++) 
	{
		const char *spectra_file = list_vector[f].c_str();
		FileManager fm;
		FileSet fs;
		BasicSpecReader bsr;

		///////////////////////////////////////////////
		// Quick read, get all pointers to begining of spectra
		if (get_file_extension_type(list_vector[f]) != MZXML)
		{
			fm.init_from_file(config,spectra_file);
		}
		else // reads peaks 
			fm.init_and_read_single_mzXML(config,spectra_file,f);

		fs.select_all_files(fm);
	//	fs.select_files_in_mz_range(fm,555,645);

		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
		int sc;
		for (sc=0; sc<all_ssf.size(); sc++)
		{
			static vector<QCPeak> peaks;
			SingleSpectrumFile *ssf = all_ssf[sc];
			if (peaks.size()<ssf->num_peaks)
			{
				int new_size = ssf->num_peaks*2;
				if (new_size<2500)
					new_size=2500;
				peaks.resize(new_size); 
			}

		//	if (sc>500)
		//		break;

		//	if (ssf->get_scan() != 164)
		//		continue;

			time_t curr_time = time(NULL);
			double elapsed_time = (curr_time - last_time);
			if (report_progress && elapsed_time>5)
			{
				last_time = curr_time;
				cout << "Processing file " << f+1 << "/" << list_vector.size();
				if (all_ssf.size() == 1)
				{
					cout << "  spectrum 1/1 of current file." << endl;
				}
				else
					cout << "  spectrum " << sc+1 << "/" << all_ssf.size() << 
					" of current file." << endl;
			}

			const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
			ssf->file_idx = f+file_start_idx;

			// convert peak list ot a spectrum with charge (if original charge ==0)
			// the spectrum gets charge 2, but the true charge is computed from the data
		
			if (num_peaks<5)
			{
				ssf->print_ssf_stats(config,out_stream);
				out_stream << "# too few peaks..." << endl;
				continue;
			}

			spec_counter++;
			Spectrum s;
			s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

			vector<SeqPath> solutions;
			solutions.clear();

			if ( ssf->charge > model.get_max_score_model_charge())
			{
				ssf->print_ssf_stats(config,out_stream);
				out_stream << "# Charge " << s.get_charge() << " not supported yet..." << endl << endl;
				continue;
			}


			bool perform_rerank=false;
			int rerank_size_idx = NEG_INF;
			int num_sols_to_generate_before_ranking=num_solutions;
		
			if (1)
			{
				vector<mass_t> pms_with_19;
				vector<int>    charges;
				pms_with_19.clear();
				charges.clear();		
				BasicSpectrum bs;
				bs.ssf = ssf;
				bs.peaks = &peaks[0];
				bs.num_peaks = num_peaks;

					// output m/z and prob values for the different charge states
				model.select_pms_and_charges(config,bs,pms_with_19,charges,true);
	
				if (prm_ptrs.size()<pms_with_19.size())
					prm_ptrs.resize(pms_with_19.size(),NULL);
				
		
				if (rank_model && num_sols_to_generate_before_ranking<num_rerank_sols)
						num_sols_to_generate_before_ranking=num_rerank_sols;
				
				generate_denovo_solutions_from_several_pms(
					prm_ptrs,
					&model,
					&s,
					true, 
					num_sols_to_generate_before_ranking,
					min_length,
					max_length,
					pms_with_19,
					charges,
					solutions,
					false);

				// use charge of top scoring solution
				if (solutions.size()>1)
				{
					const int sol_charge = solutions[0].charge;
					int j;
					for (j=1; j<solutions.size(); j++)
					{
						if (solutions[j].charge != sol_charge)
						{
							if (j==solutions.size()-1)
							{
								solutions.pop_back();
							}
							else
							{
								solutions[j]=solutions[solutions.size()-1];
								solutions.pop_back();
								j--;
							}
						}
					}
				}
			}

			if (rank_model && solutions.size()>0)
			{
				rerank_size_idx = config->calc_size_idx(solutions[0].charge,solutions[0].pm_with_19);
				if (rank_model->get_ind_part_model_was_initialized(solutions[0].charge,rerank_size_idx))
				{
					perform_rerank=true;
				}
			}

			
			vector<score_pair> scores;
			if (perform_rerank)
			{
				rank_model->score_denovo_sequences(solutions,ssf,&peaks[0],num_peaks,scores,rerank_size_idx);
				sort(scores.begin(),scores.end());
			}
			else
			{
				scores.resize(solutions.size());
				int i;
				for (i=0; i<solutions.size(); i++)
					scores[i].idx=i;
			}

			////////////////////////////////////////////////////////////
			// if we are here it is only for denovo/tags
			// print results
			////////////////////////////////////////////////////////////

			
			bool had_pep = false;
			bool had_correct = false;

			ssf->print_ssf_stats(config,out_stream);

			if (solutions.size() == 0)
			{
				out_stream << "No solutions found." << endl;
			}
			else 
			{
				out_stream << "#Index\t";
				if (perform_rerank)
					out_stream << "RnkScr\t";
				out_stream << "PnvScr\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;

				int i; 	
				for (i=0; i<solutions.size() && i<num_solutions; i++) 
				{
					const int idx = (perform_rerank ? scores[i].idx : i);

					mass_t c_gap=solutions[idx].pm_with_19 - solutions[idx].c_term_mass;
					if (c_gap<24.0)
						c_gap = 0;

					out_stream << setprecision(3) << fixed << i << "\t";
				
					if (perform_rerank)
						out_stream << scores[i].score << "\t";

					out_stream << solutions[idx].path_score << "\t";
					out_stream << solutions[idx].n_term_mass << "\t";
					out_stream << c_gap << "\t";
					out_stream << solutions[idx].pm_with_19 << "\t";
					out_stream << solutions[idx].charge << "\t";
					out_stream << solutions[idx].seq_str;	
				//	out_stream << solutions[i].num_forbidden_nodes ;

					if (ssf->peptide.get_num_aas()>2)
					{
						if (solutions[idx].check_if_correct(ssf->peptide.as_string(config),config))
						{

							out_stream << " *";

							if (! had_correct)
							{
								correct_benchmark++;
								had_correct=true;
							}
						}
						had_pep=true;
					}
					out_stream << endl;

				//	out_stream << endl;
				//	solutions[i].print_full(config);
				//	cout << endl;
				}
			}

			if (had_pep) // for annotated spectra (benchmark)
				total_benchmark++;

			if (0)
			{
				s.print_expected_by();
			//	s.print_spectrum();
			//	prm_ptrs[0]->print_with_combo_tables();
				prm_ptrs[0]->print();
				exit(0);
			}

			out_stream << endl;
		}
	}

	/////////////////////////////////////////////////////////////////
	// this part works only if the spectra are annotated (benchmark)
	/////////////////////////////////////////////////////////////////
	if (total_benchmark>0)
	{
		cout << "Correct spectra " << correct_benchmark << "/" << total_benchmark << " (" <<
			fixed << setprecision(3) << (double)correct_benchmark/(double)total_benchmark << ")" << endl;
	}

	
	if (report_progress)
	{
		time_t curr_time = time(NULL);
		double elapsed_time = (last_time - start_time);
		cout << "Total running time: " << elapsed_time << endl;
		cout << "Processed " << list_vector.size() << " files and " << spec_counter << " spectra." << endl;
	}

}





void benchmark_shew(Model& model, char *mgf_file)
{
	Config *config = model.get_config();


	time_t start_time,last_time;
	start_time = time(NULL);
	last_time = start_time;


	FileManager fm;
	FileSet fs;
	BasicSpecReader bsr;

	///////////////////////////////////////////////
	// Quick read, get all pointers to begining of spectra
	if (get_file_extension_type(mgf_file) != MZXML)
	{
		fm.init_from_file(config,mgf_file);
	}
	else // reads peaks 
		fm.init_and_read_single_mzXML(config,mgf_file,0);

	fs.select_all_files(fm);

	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	vector<int> max_ml_ranks;

	int good_seqs=0;
	int total_good_aa =0;
	int total_aa_pred =0;

	int sc;
	for (sc=0; sc<all_ssf.size(); sc++)
	{
		static vector<QCPeak> peaks;
		SingleSpectrumFile *ssf = all_ssf[sc];
		if (peaks.size()<ssf->num_peaks)
		{
			int new_size = ssf->num_peaks*2;
			if (new_size<2500)
				new_size=2500;
			peaks.resize(new_size); 
		}

		time_t curr_time = time(NULL);
		double elapsed_time = (curr_time - last_time);
	
		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
		ssf->file_idx = 0;

		// convert peak list ot a spectrum with charge (if original charge ==0)
		// the spectrum gets charge 2, but the true charge is computed from the data
		
		if (num_peaks<5)
		{
			ssf->print_ssf_stats(config);
			cout << "# too few peaks..." << endl;
			continue;
		}

		
		Spectrum s;
		s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

		vector<SeqPath> solutions;
		solutions.clear();

		if ( ssf->charge > model.get_max_score_model_charge())
		{
			ssf->print_ssf_stats(config);
			cout << "# Charge " << s.get_charge() << " not supported yet..." << endl << endl;
			continue;
		}

		vector<mass_t> pms_with_19;
		vector<int>    charges;
		pms_with_19.clear();
		charges.clear();

		BasicSpectrum bs;
		bs.ssf = ssf;
		bs.peaks = &peaks[0];
		bs.num_peaks = num_peaks;

		// output m/z and prob values for the different charge states
	
		PrmGraph tprm;
		tprm.create_graph_from_spectrum(&model,&s,s.get_true_mass_with_19(),s.get_charge());
	
		PrmGraph *prm_ptr = &tprm;

		generate_denovo_solutions(prm_ptr, &model,&s,true,s.get_true_mass_with_19(),s.get_charge(),
			1, 6, 12, NEG_INF, solutions, true);

		vector<int> true_aas = ssf->peptide.get_amino_acids();
		vector<int> sol_aas;

		solutions[0].get_amino_acids(sol_aas);

		int j;
		for (j=0; j<true_aas.size(); j++)
		{
			if (true_aas[j]==Gln)
				true_aas[j]=Lys;
			if (true_aas[j]==Ile)
				true_aas[j]=Leu;
		}

		for (j=0; j<sol_aas.size(); j++)
		{
			if (sol_aas[j]==Gln)
				sol_aas[j]=Lys;
			if (sol_aas[j]==Ile)
				sol_aas[j]=Leu;
			solutions[0].positions[j].aa = sol_aas[j];
		}
		Peptide corr_pep;
		corr_pep.set_peptide_aas(true_aas);
		corr_pep.calc_mass(config);

		int num_corr_aa = solutions[0].get_num_correct_aas(corr_pep,config);
		bool is_good=false;
		if (num_corr_aa == true_aas.size())
		{
			is_good=true;
			good_seqs++;
		}
		
		total_good_aa += num_corr_aa;
		total_aa_pred += sol_aas.size();

		cout << sc << "\t" << corr_pep.as_string(config) << "\t" << solutions[0].seq_str;
		if (is_good)
			cout << " *";
		cout << endl;
	}
	
	cout << "Good de novo : " << good_seqs << " " << fixed << setprecision(3) << (float)good_seqs/sc << endl;
	cout << "AA prediction: " << total_good_aa/(float) total_aa_pred << endl;

	
	time_t curr_time = time(NULL);
	double elapsed_time = (curr_time - last_time);
	cout << "Total running time: " << elapsed_time << endl;
}


void perform_prm_on_list_of_files(Model& model, 
								  const vector<string>& list_vector,
								  float sqs_filter_prob,
								  int file_start_idx)
{
	Config *config = model.get_config();

	int f;
	for (f=0; f<list_vector.size(); f++) 
	{
		const char *spectra_file = list_vector[f].c_str();
		FileManager fm;
		FileSet fs;
		BasicSpecReader bsr;

		///////////////////////////////////////////////
		// Quick read, get all pointers to begining of spectra
		if (get_file_extension_type(list_vector[f]) != MZXML)
		{
			fm.init_from_file(config,spectra_file);
		}
		else // reads peaks 
			fm.init_and_read_single_mzXML(config,spectra_file,f);

		fs.select_all_files(fm);

		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
		int sc;
		for (sc=0; sc<all_ssf.size(); sc++)
		{
			static vector<QCPeak> peaks;
			SingleSpectrumFile *ssf = all_ssf[sc];
			if (peaks.size()<ssf->num_peaks)
			{
				int new_size = ssf->num_peaks*2;
				if (new_size<2500)
					new_size=2500;
				peaks.resize(new_size); 
			}

			const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
			ssf->file_idx = f+file_start_idx;

			if (num_peaks<0)
				continue;

			// convert peak list ot a spectrum with charge (if original charge ==0)
			// the spectrum gets charge 2, but the true charge is computed from the data
		
			Spectrum s;
			s.init

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -