⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovosolutions.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	if (solutions.size() == 0)
	{
		out_stream << "No solutions found." << endl;
	}
	else 
	{
		out_stream << "#Index\tRnkScr\tPnvScr\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
		int i; 	
		for (i=0; i<solutions.size(); i++) 
		{
			mass_t c_gap=solutions[i].pm_with_19 - solutions[i].c_term_mass;
			if (c_gap<24.0)
				c_gap = 0;

			out_stream << setprecision(3) << fixed << i << "\t";
			out_stream << solutions[i].rerank_score << "\t";
			out_stream << solutions[i].path_score << "\t";
			out_stream << solutions[i].n_term_mass << "\t";
			out_stream << c_gap << "\t";
			out_stream << solutions[i].pm_with_19 << "\t";
			out_stream << solutions[i].charge << "\t";
			out_stream << solutions[i].seq_str;
			out_stream << endl;
		}
	}
	out_stream << endl;
}







void parse_tag_string(char *tag_string, int& main_length, vector<int>& num_tags)
{
	num_tags.clear();
	num_tags.resize(10,0);

	string ts(tag_string);

	int i;
	for (i=0; i<ts.length(); i++)
		if (ts[i]==':')
			ts[i]=' ';

	istringstream iss(ts);

	int len=0;
	while (iss>>len)
	{
		if (len<2 || len>10)
		{
			cout << "Error: bad string for tag lengths, should be something like \"4:20:5:30\""<< endl;
			exit(1);
		}

		int n=0;
		iss >> n;
		if (n<0 || n>1000)
		{
			cout << "Error: number of tags for length " << len << " should be 1-1000" << endl;
			exit(1);
		}
		num_tags[len]=n;
	}

	main_length=0;
	for (i=0; i<10; i++)
		if (num_tags[i]>=5)
		{
			main_length=i;
			break;
		}

	if (main_length<=0)
	{
		for (i=0; i<10; i++)
			if (num_tags[i]>0)
			{
				main_length=i;
				break;
			}
	}

	if (main_length<=0)
	{
		cout << "Error parsing tag_string: " << tag_string << endl;
		exit(1);
	}

}

/****************************************************************
The input uses a tag string which has the format
4:20:5:30 - meaning 20 tags of length 4 and 30 tags of length 5
Generate a text file containing tags for a whole spectrum.
The format is:
<scan number> <number tags>
tweak 0   (each tweak is charge pm_with_19)
..
tweak 5
tag 0 (tweak_idx score num_aas n_term mass AASeq)
..
tag n-1
*****************************************************************/
void create_tag_file_for_inspect(AdvancedScoreModel& model,
								 char *spectrum_file,
								 char *tag_string,
								 char *tag_suffix)
{
	const int max_tweak_charge =3;
	Config *config = model.get_config();
	FileManager fm;
	FileSet fs;

	vector<int> num_tags; // how many tags from each length
	int main_length;

	parse_tag_string(tag_string,main_length,num_tags);

	// Quick read, get all pointers to begining of spectra
	if (get_file_extension_type(spectrum_file) != MZXML)
	{
		fm.init_from_file(config,spectrum_file);
	}
	else // reads peaks 
		fm.init_and_read_single_mzXML(config,spectrum_file,0);

	fs.select_all_files(fm);

	cout << "Generating tags for: " << spectrum_file << endl;
	cout << "Scans: " << fs.get_total_spectra() << endl;

	string file_name,tag_file;
	get_file_name_without_extension(spectrum_file,file_name);
	tag_file = file_name + "_" + string(tag_suffix) + ".txt";
	ofstream tag_stream(tag_file.c_str());
	
	time_t start_time;
	start_time = time(NULL);

	vector<PrmGraph *> prm_ptrs;
	BasicSpecReader bsr;
	int too_few_peaks=0;
	int bad_charge=0;
	int no_tags=0;
	int scan_count=0;

	prm_ptrs.resize(50,NULL);

	///////////////////////////////////////////////
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	int sc;
	for (sc=0; sc<all_ssf.size(); sc++)
	{
		static vector<QCPeak> peaks;
		SingleSpectrumFile *ssf = all_ssf[sc];
		if (peaks.size()<ssf->num_peaks)
		{
			int new_size = ssf->num_peaks*2;
			if (new_size<2500)
				new_size=2500;
			peaks.resize(new_size); 
		}

	//	if (ssf->get_scan()<11254)
	//		continue;

		vector<int>    tweak_charges;
		vector<mass_t> tweak_pms_with_19;

		tweak_charges.resize(6,0);
		tweak_pms_with_19.resize(6,0);

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
		ssf->file_idx = 0;

		if (num_peaks<10)
		{
			too_few_peaks++;
			continue;
		}

		Spectrum s;
		s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

		if ( ssf->charge > model.get_max_score_model_charge())
		{
			bad_charge++;
			continue;
		}

		vector<SeqPath> solutions;
		vector<mass_t> pms_with_19, alt_pms_with_19;
		vector<int>    charges,     alt_charges;

		pms_with_19.clear();
		charges.clear();		
		BasicSpectrum bs;
		bs.ssf = ssf;
		bs.peaks = &peaks[0];
		bs.num_peaks = num_peaks;

		// output m/z and prob values for the different charge states
		vector<PmcSqsChargeRes> pmcsqs_res;
		model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);

		cout << "Scan " << bs.ssf->get_scan() << ",\tch: " << charges[0] << setprecision(1) << "\tpm19: " <<
			pms_with_19[0] << setprecision(3) << " \tSQS: " <<  pmcsqs_res[charges[0]].sqs_prob;
		if (config->get_filter_flag() && 
			! config->get_use_spectrum_charge() &&
			pmcsqs_res[charges[0]].sqs_prob<0.01)
		{
			cout << "  - skipping..." << endl;
			continue;
		}
		
		alt_charges.clear();
		alt_pms_with_19.clear();
		while (charges.size()>0 && charges[0] != charges[charges.size()-1])
		{
			alt_charges.push_back(charges[charges.size()-1]);
			charges.pop_back();
			alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
			pms_with_19.pop_back();
		}

		if (pms_with_19.size()>0)
		{
			generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
						  pms_with_19,charges,solutions);

			// set tweaks
			int charge = charges[0];
			if (charge>max_tweak_charge)
			{
				cout << "Error: max allowed tweak charge is " << max_tweak_charge << ", selected charge " << charge << endl;
				exit(1);
			}
			tweak_charges[2*(charge-1)]=charge;
			tweak_pms_with_19[2*(charge-1)]=pms_with_19[0];

			if (pms_with_19.size()>1)
			{
				tweak_charges[2*(charge-1)+1]=charge;
				tweak_pms_with_19[2*(charge-1)+1]=pms_with_19[1];
			}
		}

		if (alt_pms_with_19.size()>0)
		{
			vector<SeqPath> alt_solutions;

			generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
						  alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
			int j;
			for (j=0; j<alt_solutions.size(); j++)
				solutions.push_back(alt_solutions[j]);

			// set tweaks
			int charge = alt_charges[0];
			if (charge>max_tweak_charge)
			{
				cout << "Error: max allowed tweak charge is " << max_tweak_charge << ", selected charge " << charge << endl;
				exit(1);
			}
			tweak_charges[2*(charge-1)]=charge;
			tweak_pms_with_19[2*(charge-1)]=alt_pms_with_19[0];

			if (alt_pms_with_19.size()>1)
			{
				tweak_charges[2*(charge-1)+1]=charge;
				tweak_pms_with_19[2*(charge-1)+1]=alt_pms_with_19[1];
			}
		}
		
		
		if (solutions.size() == 0)
		{
			cout << "   - no tags" << endl;
			no_tags++;
			continue;
		}
		
		tag_stream << fixed << setprecision(3);
		tag_stream << ssf->get_scan() << "\t" << solutions.size() << endl;
		int i; 
		for (i=0; i<tweak_charges.size(); i++)
			tag_stream << tweak_charges[i] << "\t" << tweak_pms_with_19[i] << endl;

		for (i=0; i<solutions.size(); i++) 
		{
			const float score = ( solutions[i].rerank_score>-200 ? solutions[i].rerank_score : solutions[i].path_score);
			const int charge = solutions[i].charge;
			int best_tweak_idx=2*(charge-1);
			if (tweak_charges[2*(charge-1)+1]>0 && 
				fabs(tweak_pms_with_19[2*(charge-1)+1]-solutions[i].pm_with_19)<
				fabs(tweak_pms_with_19[2*(charge-1)]-solutions[i].pm_with_19))
				best_tweak_idx = 2*(charge-1)+1;
			
			tag_stream << best_tweak_idx << "\t" << setprecision(2) << score << "\t" << setprecision(3) << solutions[i].n_term_mass << "\t" << solutions[i].seq_str << endl;
		}

		scan_count++;

		cout << "   " << solutions.size() << " tags." << endl;

		if (sc % 200 == 0)
		{
			time_t curr_time = time(NULL);
			cout << setprecision(1) << fixed;
			cout << "done " << sc << "/" << all_ssf.size() << "  " << curr_time - start_time << " secs.,  "
				<< scan_count << " scans completed." << endl;
		}
		
	}
	tag_stream.close();

	cout << "Done..." << endl;
	cout << "Tag file: " << tag_file << endl;
	cout << "Wrote tags for " << scan_count << " spectra." << endl;
	if (bad_charge>0)
		cout << "Scans with bad charges  : " << bad_charge << endl;
	if (too_few_peaks>0)
		cout << "Scans with too few peaks: " << too_few_peaks << endl;
	if (no_tags>0)
		cout << "Scans for which no tags were found: " << no_tags << endl;

}







void perform_tags_on_list_of_files(AdvancedScoreModel& model, 
									 const vector<string>& list_vector, 
									 int file_start_idx,
									 int num_solutions, 
									 int	tag_length,
									 bool report_progress,
									 ostream& out_stream)
{
	Config *config = model.get_config();
	

	vector<int> num_tags; // how many tags from each length
	int main_length;

	
	num_tags.resize(10,0);
	num_tags[tag_length]=num_solutions;
	main_length = tag_length;

	// Quick read, get all pointers to begining of spectra

	time_t start_time, last_time;
	start_time = time(NULL);
	last_time = start_time;

	vector<PrmGraph *> prm_ptrs;
	BasicSpecReader bsr;
	int too_few_peaks=0;
	int bad_charge=0;
	int no_tags=0;
	int scan_count=0;
	int	correct_benchmark=0;
	int num_benchmark=0;


	prm_ptrs.resize(50,NULL);

	int f_idx;
	for (f_idx=0; f_idx<list_vector.size(); f_idx++)
	{
		FileManager fm;
		FileSet fs;

		const string spectrum_file = list_vector[f_idx];

		if (get_file_extension_type(spectrum_file) != MZXML)
		{
			fm.init_from_file(config,spectrum_file.c_str());
		}
		else // reads peaks 
			fm.init_and_read_single_mzXML(config,spectrum_file.c_str(),0);

		fs.select_all_files(fm);
		
		///////////////////////////////////////////////
		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
		int sc;
		for (sc=0; sc<all_ssf.size(); sc++)
		{
			static vector<QCPeak> peaks;
			SingleSpectrumFile *ssf = all_ssf[sc];
			if (peaks.size()<ssf->num_peaks)
			{
				int new_size = ssf->num_peaks*2;
				if (new_size<2500)
					new_size=2500;
				peaks.resize(new_size); 
			}

		
			time_t curr_time = time(NULL);
			double elapsed_time = (curr_time - last_time);
			if (report_progress && elapsed_time>5)
			{
				last_time = curr_time;
				cout << "Processing file " << f_idx+1 << "/" << list_vector.size();
				if (all_ssf.size() == 1)
				{
					cout << "  spectrum 1/1 of current file." << endl;
				}
				else
					cout << "  spectrum " << sc+1 << "/" << all_ssf.size() << 
					" of current file." << endl;
				last_time=curr_time;
			}

			const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
			ssf->file_idx = f_idx+file_start_idx;

			// convert peak list ot a spectrum with charge (if original charge ==0)
			// the spectrum gets charge 2, but the true charge is computed from the data
		
			if (num_peaks<5)
			{
				ssf->print_ssf_stats(config,out_stream);
				out_stream << "# too few peaks..." << endl;
				continue;
			}

	
			Spectrum s;
			s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

			if ( ssf->charge > model.get_max_score_model_charge())
			{
				bad_charge++;
				continue;
			}

			vector<SeqPath> solutions;
			vector<mass_t> pms_with_19, alt_pms_with_19;
			vector<int>    charges,     alt_charges;

			pms_with_19.clear();
			charges.clear();		
			BasicSpectrum bs;
			bs.ssf = ssf;
			bs.peaks = &peaks[0];
			bs.num_peaks = num_peaks;

			// output m/z and prob values for the different charge states
			vector<PmcSqsChargeRes> pmcsqs_res;
			model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);

			if (! config->get_use_spectrum_charge() && 
				config->get_filter_flag() &&
				pmcsqs_res[charges[0]].sqs_prob<0.01)
			{
				ssf->print_ssf_stats(config,out_stream);
				out_stream << "  - low quality, skipping..." << endl;
				continue;
			}
			
			alt_charges.clear();
			alt_pms_with_19.clear();
			while (charges.size()>0 && charges[0] != charges[charges.size()-1])
			{
				alt_charges.push_back(charges[charges.size()-1]);
				charges.pop_back();
				alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
				pms_with_19.pop_back();
			}

			if (pms_with_19.size()>0)
			{
				generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
							  pms_with_19,charges,solutions);

			}

			if (alt_pms_with_19.size()>0)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -