⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovoranktrain.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
#include "DeNovoRankScore.h"
#include "DeNovoSolutions.h"
#include "auxfun.h"


bool add_peptide_set_to_psm(PeptideSetMap& psm, PeptideSet& pep_set)
{
	PeptideSetMap::iterator it;

	scan_pair key(pep_set.file_idx,pep_set.scan);

	it=psm.find(key);
	if (it == psm.end())
	{
		psm.insert(pair<scan_pair,PeptideSet>(key,pep_set));
		return false;
	}
	
	PeptideSet& set_in_map = (*it).second;
	const int corr_aa_before = set_in_map.correct_sol.pep.get_aa_before();
	const int corr_aa_after  = set_in_map.correct_sol.pep.get_aa_after();
	int i;
	for (i=0; i<pep_set.incorrect_sols.size(); i++)
	{
		PeptideSolution& sol = pep_set.incorrect_sols[i];
	
		// check if sol is already there
		int j;
		for (j=0; j<set_in_map.incorrect_sols.size(); j++)
			if (sol.pep == set_in_map.incorrect_sols[j].pep)
				break;

	//	if (j<set_in_map.incorrect_sols.size())
	//		continue;
		
		if (sol.pep.get_aa_before()<0 && sol.pep.get_aa_after()<0)
		{
			sol.pep.set_aa_before(corr_aa_before);
			sol.pep.set_aa_after(corr_aa_after);
		}
		set_in_map.incorrect_sols.push_back(sol);
	}
	
	return true;
}

void read_correct_seqs(const string& path, 
					   vector<string>& seqs, 
					   vector<char>& n_aas,
					   vector<char>& c_aas,
					   vector<float>& mqscores)
{
	ifstream ifs(path.c_str());
	if (! ifs.is_open())
	{
		cout << "Error: couldn't find file with correct sequences: " << path << endl;
		exit(1);
	}

	seqs.clear();
	n_aas.clear();
	c_aas.clear();

	while (! ifs.eof())
	{
		char buff[128];
		string pep_seq;
		char n_aa='$',c_aa='$';
		float mqscore=NEG_INF;
		ifs.getline(buff,128);
		if (ifs.gcount()<5)
			break;

		istringstream iss(buff);
		iss >> pep_seq >> n_aa >> c_aa >> mqscore;

		if (n_aa == '$' || c_aa == '$' || pep_seq.length()<4 || pep_seq.length()>100)
		{
			cout << "Error: bad line:|" << buff << "|" << endl;
			exit(1);
		}
		
		seqs.push_back(pep_seq);
		n_aas.push_back(n_aa);
		c_aas.push_back(c_aa);
		mqscores.push_back(mqscore);
	}
	ifs.close();
}

/************************************************************************
*************************************************************************/
void add_db_hits_to_psm(Config *config,
						const vector<string>& names,
						const string& db_dir,
						const string& correct_seq_dir,
						const string& file_suffix,
						int   file_charge,
						PeptideSetMap& psm)
{
	const vector<int>& char2aa = config->get_char2aa();
	int num_read=0;
	int total_num_sets=0;
	int total_num_peptides=0;
	
	int file_idx;
	for (file_idx=0; file_idx<names.size(); file_idx++)
	{
		string db_file = db_dir + "/" + names[file_idx] + file_suffix;
		ifstream ifs(db_file.c_str());

		if (! ifs.is_open())
			continue;

		cout << file_idx << "\t" << db_file << endl;

		num_read++;
		int num_sets=0;
		int num_peptides=0;
		int num_with_correct=0;
		int num_first_correct=0;

		vector<string> correct_seqs;
		vector<char>   correct_n_aas, correct_c_aas;
		vector<float>  mqscores;

		string seq_path = correct_seq_dir + "/" + names[file_idx] + ".txt";
		read_correct_seqs(seq_path,correct_seqs, correct_n_aas, correct_c_aas, mqscores);
		
		while (! ifs.eof())
		{
			char buff[2048];
			ifs.getline(buff,2048);
			if (ifs.gcount()>2046)
				cout << "Warning: buffer size?" << endl;

			if (ifs.gcount()<5)
				continue;
			
			istringstream iss(buff);
			int scan=-1;
			mass_t mass_with_19 = -1;
			int num_seqs = -1;

			iss >> scan >> mass_with_19 >> num_seqs;

			if (scan<0 || 
				mass_with_19<0 || 
				mass_with_19>10000 || 
				num_seqs<0 || 
				num_seqs>100)
			{
				cout << scan << "\t" <<mass_with_19 << "\t" << num_seqs << endl;
				cout << "Error parsing line in file " << db_file << endl << "LINE:" <<buff << endl;
				exit(1);
			}

			if (scan>correct_seqs.size())
			{
				cout << "Error: mismatch in correct seqs file!" << endl;
				exit(1);
			}

			const string& correct_pep_str = correct_seqs[scan];
			bool has_correct = false;
			bool correct_first = false;

			PeptideSet set;
			set.file_idx=file_idx;
			set.scan = scan;
			set.correct_sol.charge = file_charge;
			set.correct_sol.type = SOL_CORRECT;
			set.correct_sol.pep.parse_from_string(config,correct_pep_str);
			set.correct_sol.pep.set_aa_before(char2aa[correct_n_aas[scan]]);
			set.correct_sol.pep.set_aa_after(char2aa[correct_c_aas[scan]]);
			set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
			set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
			set.correct_sol.reaches_n_terminal = true;
			set.correct_sol.reaches_c_terminal = true;
			set.correct_sol.MQScore= mqscores[scan];
		
			vector<string> inserted_strings;
			inserted_strings.clear();
			int i;
			for (i=0; i<num_seqs; i++)
			{
				char char_before, char_after;
				int charge=0;
				float mqscore;
				string peptide_str;
				iss >> char_before >> char_after >> charge >> mqscore >> peptide_str;

				const int aa_before = char2aa[char_before];
				const int aa_after  = char2aa[char_after];

				if (charge<=0 || charge>5)
				{
					cout << "Error: bad charge: " << charge << " in line:" << endl << buff << endl;
					exit(1);
				}


				int j;
				for (j=0; j<peptide_str.length(); j++)
					if (peptide_str[j]=='I')
						peptide_str[j]='L' ;

				// check if peptide is same as correct don't add it
				if (peptide_str == correct_pep_str)
				{
					has_correct   = true;
					if (i==0)
						correct_first = true;
					continue;
				}

				for (j=0; j<inserted_strings.size(); j++)
					if (inserted_strings[j] == peptide_str)
						break;
				if (j<inserted_strings.size())
					continue;

				inserted_strings.push_back(peptide_str);

				PeptideSolution sol;
				sol.charge = charge;
				sol.pep.parse_from_string(config,peptide_str);
				sol.pep.set_aa_before(aa_before);
				sol.pep.set_aa_after(aa_after);
				sol.MQScore = mqscore;
				sol.num_correct_aas = set.correct_sol.pep.calc_number_of_correct_aas(config,sol.pep);
				sol.type= SOL_INCORRECT_DB;
				sol.pm_with_19 = sol.pep.get_mass_with_19();
				sol.reaches_c_terminal=true;
				sol.reaches_n_terminal=true;
				set.incorrect_sols.push_back(sol);
				num_peptides++;
			}

			if (add_peptide_set_to_psm(psm,set))
				total_num_sets++;

			num_sets++;
			if (has_correct)
				num_with_correct++;
			if (correct_first)
				num_first_correct++;
		}
		ifs.close();

		cout << "\tsets added:   " << num_sets << endl;
		cout << "\thad correct:  " << float(num_with_correct)/num_sets << endl;
		cout << "\tcorrect first:" << float(num_first_correct)/num_sets << endl;
		cout << endl;
	}

	cout << "Processed " << num_read << "/" << names.size() << " dbh files." << endl;
}


void add_denovo_dbh_to_psm(Config *config,
						const vector<string>& names,
						const string& denovo_dir,
						const string& correct_seq_dir,
						const string& file_suffix,
						int file_charge,
						PeptideSetMap& psm)
{
	const vector<int>& char2aa = config->get_char2aa();
	int num_read=0;
	int total_num_peptides=0;
	
	int file_idx;
	for (file_idx=0; file_idx<names.size(); file_idx++)
	{
		string dnv_file = denovo_dir + "/" + names[file_idx] + file_suffix;
		ifstream ifs(dnv_file.c_str());

		if (! ifs.is_open())
			continue;

		cout << file_idx << "\t" << dnv_file << endl;

		num_read++;
		int num_new_sets=0;
		int num_sets=0;
		int num_with_correct=0;
		int num_first_correct=0;

		vector<string> correct_seqs;
		vector<char>   correct_n_aas, correct_c_aas;
		vector<float>  mqscores;

		string seq_path = correct_seq_dir + "/" + names[file_idx] + ".txt";
		read_correct_seqs(seq_path,correct_seqs, correct_n_aas, correct_c_aas, mqscores);
		
		while (! ifs.eof())
		{
			char buff[2048];
			ifs.getline(buff,2048);
			if (ifs.gcount()>2046)
				cout << "Warning: buffer size?" << endl;

			if (ifs.gcount()<5)
				continue;
			
			istringstream iss(buff);
			int scan=-1;
			mass_t mass_with_19 = -1;
			int num_seqs = -1;

			iss >> scan >> mass_with_19 >> num_seqs;

			if (scan<0 || 
				mass_with_19<0 || 
				mass_with_19>10000 || 
				num_seqs<0 || 
				num_seqs>100)
			{
				cout << scan << "\t" <<mass_with_19 << "\t" << num_seqs << endl;
				cout << "Error parsing line in file " << dnv_file << endl << "LINE:" <<buff << endl;
				exit(1);
			}

			if (scan>correct_seqs.size())
			{
				cout << "Error: mismatch in correct seqs file!" << endl;
				exit(1);
			}

			const string& correct_pep_str = correct_seqs[scan];
			bool has_correct = false;
			bool correct_first = false;

			PeptideSet set;
			set.file_idx=file_idx;
			set.scan = scan;
			set.correct_sol.charge = file_charge;
			set.correct_sol.type = SOL_CORRECT;
			set.correct_sol.pep.parse_from_string(config,correct_pep_str);
			set.correct_sol.pep.set_aa_before(char2aa[correct_n_aas[scan]]);
			set.correct_sol.pep.set_aa_after(char2aa[correct_c_aas[scan]]);
			set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
			set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
			set.correct_sol.reaches_n_terminal = true;
			set.correct_sol.reaches_c_terminal = true;
			set.correct_sol.MQScore= mqscores[scan];
		
			vector<string> inserted_strings;
			inserted_strings.clear();
			int i;
			for (i=0; i<num_seqs; i++)
			{
				char char_before, char_after;
				int charge=0;
				float mqscore;
				string peptide_str;
				iss >> char_before >> char_after >> charge >> mqscore >> peptide_str;

				const int aa_before = char2aa[char_before];
				const int aa_after  = char2aa[char_after];

				if (charge<=0 || charge>5)
				{
					cout << "Error: bad charge: " << charge << " in line:" << endl << buff << endl;
					exit(1);
				}


				int j;
				for (j=0; j<peptide_str.length(); j++)
					if (peptide_str[j]=='I')
						peptide_str[j]='L' ;

				// check if peptide is same as correct don't add it
				if (peptide_str == correct_pep_str)
				{
					has_correct   = true;
					if (i==0)
						correct_first = true;
					continue;
				}

				for (j=0; j<inserted_strings.size(); j++)
					if (inserted_strings[j] == peptide_str)
						break;
				if (j<inserted_strings.size())
					continue;

				inserted_strings.push_back(peptide_str);

				PeptideSolution sol;
				sol.charge = charge;
				sol.pep.parse_from_string(config,peptide_str);
				sol.pep.set_aa_before(aa_before);
				sol.pep.set_aa_after(aa_after);
				sol.MQScore = mqscore;
				sol.num_correct_aas = set.correct_sol.pep.calc_number_of_correct_aas(config,sol.pep);
				sol.type= SOL_INCORRECT_DENOVO;
				sol.pm_with_19 = sol.pep.get_mass_with_19();
				sol.reaches_c_terminal=true;
				sol.reaches_n_terminal=true;
				set.incorrect_sols.push_back(sol);
			}

			if (! add_peptide_set_to_psm(psm,set))
				num_new_sets++;

			num_sets++;
			if (has_correct)
				num_with_correct++;
			if (correct_first)
				num_first_correct++;
		}
		ifs.close();

		cout << "\tnew sets added : " << num_new_sets << endl;
		cout << "\tsets added :     " << num_sets << endl;
		cout << "\thad correct :    " << float(num_with_correct)/num_sets << endl;
		cout << "\tcorrect first :  " << float(num_first_correct)/num_sets << endl;
		cout << endl;
	}

	cout << "Processed " << num_read << "/" << names.size() << " dbh files." << endl;
}



void add_denovo_paths_to_psm(Config *config,
							const vector<string>& names,
							const string& denovo_dir,
							const string& file_suffix,
							int   file_charge,
							PeptideSetMap& psm)
{
	const vector<int>& char2aa = config->get_char2aa();
	int num_read=0;
	int total_num_sets=0;
	int total_num_peptides=0;
	
	int file_idx;
	for (file_idx=0; file_idx<names.size(); file_idx++)
	{
		string denovo_file = denovo_dir + "/" + names[file_idx] + file_suffix;
		ifstream ifs(denovo_file.c_str());

		if (! ifs.is_open())
			continue;

		cout << file_idx << "\t" << denovo_file << endl;

		num_read++;
		int num_sets=0;
		int num_with_correct=0;
		int num_first_correct=0;

		while (! ifs.eof())
		{

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -