⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 basicdatastructs.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
字号:
#include "BasicDataStructs.h"
#include "auxfun.h"

int Peptide::calc_charge(mass_t& m_over_z) const
{
	if (this->mass<25.0)
	{
		cout << "Error: must first calc peptide mass!" << endl;
		exit(1);
	}

	int c;
	for (c=1; c<20; c++)
	{
		mass_t c_mass = c *m_over_z - 18 - c;
		if (fabs(c_mass-mass)<15)
			return c;

		if (c_mass - mass>50)
			break;
	}

	cout << "Error: couldn't figure out charge for peptide with mass " << m_over_z << endl;
	exit(1);

}

/**********************************************************
Parses a string into amino acids.
Terminals are stored in n_term and c_term.
Final mass = sum mass aas + mass terminals
***********************************************************/
void Peptide::parse_from_string(const Config* config, const string &str)
{
	int i;
	const vector<int>& char2aa = config->get_char2aa();

/*	// hack - modify M* to M+16
	string str="";
	for (i=0; i<org_str.length(); i++)
		if (org_str[i] != 'M')
		{
			str += org_str[i];
		}
		else
		{
			if (i<org_str.length()-1 && org_str[i+1] == '*')
			{
				str += "M+16";
				i++;
			}
		}*/


	
	amino_acids.clear();
	n_gap=0;

	int str_length = str.length();
	while (str_length>0 && (str[str_length-1] == '\n' || str[str_length-1] == '\t' ||
				str[str_length-1] == '\r' ||  str[str_length-1] == ' ') )
		str_length--;
	
	for (i=0; i<str_length; i++)
	{
		if (char2aa[str[i]]<0 && str[i] != '[')
		{
			cout << "Error parsing peptide string!" << endl;
			cout << str << endl << setw(i+1) << right << "^" << endl;
			cout << "Bad token: "<< str[i] << endl;
			exit(1);
		}

		// this can only be a single char unmodifed AA
		if (i == str_length-1)
		{
			int aa_idx=char2aa[str[i]];
			if (aa_idx>Val)
			{
				cout << "Error parsing peptide string!" << endl;
				cout << str << endl << setw(i+1) << right << "^" << endl;
				cout << "Bad token: "<< str[i] << endl;
				exit(1);
			}

			// do not push unmodified terminals
			if (aa_idx>=Ala)
				amino_acids.push_back(aa_idx);

			continue;
		}

		// parse gap
		if (str[i] == '[')
		{
			if (i>0)
			{
				cout<< "Error: gap only allowed at N-term side of peptide : e.g. \"[450.3]GHPLERTK\" is ok" << endl;
				exit(1);
			}
			int close_idx = i+1;
			while (close_idx<str_length && str[close_idx] != ']')
				close_idx++;

			if (close_idx == str_length)
			{
				cout << "Error parsing peptide string!" << endl;
				cout << str << endl << setw(i+1) << right << "^" << endl;
				cout << "(No terminating ']' found)." << endl;
				exit(1);
			}

			double gap_size=NEG_INF;
			istringstream is(str.substr(i+1,close_idx-i-1));
			is >> gap_size;
			
			if (gap_size<=0)
			{
				cout << "Error parsing peptide string!" << endl;
				cout << str << endl << setw(i+1) << right << "^" << endl;
				cout << "Bad gap string:" << is.str() << endl;
				exit(1);
			}

			n_gap=gap_size;
			i=close_idx;
			continue;
		}

		// parse an aa label
		// find label end
		{
			int end_idx = i+1;
			while (end_idx < str_length && char2aa[str[end_idx]]<0 && str[end_idx] != '[')
				end_idx++;

			// look for label
			string label = str.substr(i,end_idx-i);
			int aa_idx = config->get_aa_from_label(label);
			if (aa_idx <0)
			{
				cout << "Error parsing peptide string!" << endl;
				cout << str << endl << setw(i+1) << right << "^" << endl;
				cout << "Bad AA label: " << label << endl;
				exit(1);
			}

		/*	if (label[0] == '^')
			{
				n_term = aa_idx;
				if (amino_acids.size()>0)
				{
					cout << "Error: N-terminal should start peptide string: " << label << endl;
					exit(1);
				}
				i=end_idx-1;
				continue;
			}
			else if (label[0] == '$')
			{
				c_term = aa_idx;
				break;
			}*/

			else
			{
				amino_acids.push_back(aa_idx);
				i=end_idx-1;
			}
			continue;
		}
	}

	//calculate total mass
	
	int g_count=0;
	const ConversionTables& session_tables = config->get_session_tables();

	// add N-terminal
	mass = 0;

	if (amino_acids.size()>0)
	{
		// add first aa
		if (amino_acids[0] == Gap)
		{
			//mass += gaps[g_count++];
			cout << "Error: Should only have amino acids in peptide!" << endl;
			exit(1);
		}
		else
			mass += session_tables.get_aa2mass(amino_acids[0]);

		// add middle aas
		int i;
		for (i=1; i<amino_acids.size()-1; i++)
		{
			if (amino_acids[i] == Gap)
			{
				//mass += gaps[g_count++];
				cout << "Error: Should only have amino acids in peptide!" << endl;
				exit(1);
			}
			else
				mass += session_tables.get_aa2mass(amino_acids[i]);
		}

		// add last aa
		if (amino_acids.size()>1)
		{
			int last_idx = amino_acids.size()-1;
			if (amino_acids[last_idx] == Gap)
			{
				//mass += gaps[g_count++];
				cout << "Error: Should only have amino acids in peptide!" << endl;
				exit(1);
			}
			else
				mass += session_tables.get_aa2mass(amino_acids[last_idx]);
		}
	}

	// add C-terminal
//	mass += session_tables.get_aa2mass(c_term);

}


void Peptide::generate_random_peptide(const Config *config, int peptide_length)
{
	amino_acids.clear();
	n_gap=0;

	int i;

//	n_term = N_TERM;
//	c_term = C_TERM;

	mass=0;
	for (i=0; i<peptide_length; i++)
	{
		int aa = Ala + (int)(my_random() * (Val-Ala));
		mass += config->get_aa2mass()[aa];
		amino_acids.push_back(aa);
	}
}


void Peptide::set_peptide(vector<int>& aas, 
						  mass_t peptide_mass,
						  mass_t gap, 
						  int n_term_aa, 
						  int c_term_aa)
{
	amino_acids = aas;
	mass = peptide_mass;
	n_gap = gap, 
	aa_before = n_term_aa;
	aa_after = c_term_aa;
}

// the mass without 19
void Peptide::calc_mass(const Config *config)
{
	const vector<mass_t>& aa2mass = config->get_aa2mass();
	int i;

//	mass=aa2mass[n_term]+aa2mass[c_term];
	mass = 0;

	for (i=0; i<amino_acids.size(); i++)
		mass+=aa2mass[amino_acids[i]];
}



// changes the amino acids I->L
// an Q->K if not at terminal and tolerance > 0.1
void Peptide::convert_ILQK(const Config *config)
{
	int i;
	for (i=0; i<amino_acids.size(); i++)
		if (amino_acids[i] == Ile)
			amino_acids[i] = Leu;

	if (config->get_tolerance()>0.1)
		for (i=0; i<amino_acids.size()-1; i++)
			if (amino_acids[i]==Lys)
				amino_acids[i]=Gln;
}

// changes the amino acids I->L
void Peptide::convert_IL()
{
	int i;
	for (i=0; i<amino_acids.size(); i++)
		if (amino_acids[i] == Ile)
			amino_acids[i] = Leu;
}

void Peptide::reverse()
{
	vector<int> tmp_aas;
	int i;

	tmp_aas.clear();
	for (i=amino_acids.size()-1; i>=0; i--)
		tmp_aas.push_back(amino_acids[i]);
	amino_acids = tmp_aas;
}



string Peptide::as_string(const Config* config) const
{
	const vector<string>& aa2label = config->get_aa2label();
	int g_count =0 ;
	int i;

	string peptide_str = "";

//	if (n_term>N_TERM)
//		peptide_str += aa2label[n_term] ;

	if (n_gap>0)
	{
		ostringstream os;
		os << setprecision(3) << fixed << n_gap;
		
		peptide_str += '[' + os.str() + ']';
	}

	
	for (i=0; i<amino_acids.size(); i++)
	{
		peptide_str += aa2label[amino_acids[i]];
	}

//	if (c_term>C_TERM)
//		peptide_str +=  aa2label[c_term];

	return peptide_str;
}



void Peptide::calc_expected_breakage_masses(Config *config, vector<mass_t>& break_masses) const
{
	int i;
	int aa_pos=0;
	int g_count=0;
	mass_t m=0;
	break_masses.clear();
	if (amino_acids.size() == 0)
		return;

	const ConversionTables& session_tables = config->get_session_tables();

	if (n_gap>0)
	{
		m=n_gap;
		break_masses.push_back(n_gap);
	}
	else
		break_masses.push_back(0);

	// use pre tables for first amino acid
	if (amino_acids[0] > Gap)
	{
		m+= session_tables.get_aa2mass(amino_acids[0]);
		break_masses.push_back(m);
		aa_pos++;
	}

	// use mid tables for center amino acids
	for (i=aa_pos; i<amino_acids.size()-1; i++)
	{
	
		m+= session_tables.get_aa2mass(amino_acids[i]);
		break_masses.push_back(m);
	}

	// use suf tables for last amino acids
	if (amino_acids.size()>1)
	{
		const int aa_pos = amino_acids.size()-1;
		m += session_tables.get_aa2mass(amino_acids[aa_pos]);

		// add C-terminal
//		m+= session_tables.get_aa2mass(c_term);
	}

	break_masses.push_back(m);
}


int Peptide::calc_number_of_correct_aas(Config *config,const Peptide& other) const
{
	const vector<int>& other_amino_acids = other.get_amino_acids();
	const int num_aas = amino_acids.size();
	const int num_other_aas = other_amino_acids.size();

	vector<mass_t> this_breakages,other_breakages;

	calc_expected_breakage_masses(config,this_breakages);
	other.calc_expected_breakage_masses(config,other_breakages);

	int this_idx=0;
	int other_idx=0;
	int num_correct_aas=0;

	while (this_idx<num_aas && other_idx<num_other_aas)
	{
		if (fabs(this_breakages[this_idx]-other_breakages[other_idx])<1.0)
		{
			if (amino_acids[this_idx]==other_amino_acids[other_idx])
				num_correct_aas++;

			this_idx++;
			other_idx++;
		}
		else
		{
			if (this_breakages[this_idx]<other_breakages[other_idx])
			{
				this_idx++;
			}
			else
				other_idx++;
		}
	}
	return num_correct_aas;
}



/**************************************************************************
Returns the global edit distance between two peptides.
Gap cost = 1.
d(I,L)=0
d(K,Q)=0
d(F/M*)=0
d(N,D)=0.5;
d(X,X)=0
d(X,Y)=1
***************************************************************************/
float Peptide::peptide_edit_distance(Config *config, Peptide& other_pep) const
{
	int i;
	vector<int> other_aa = other_pep.get_amino_acids();
	const int *pep1 = &amino_acids[0];
	const int pep_len1 = amino_acids.size();
	const int *pep2 = &other_aa[0];
	const int pep_len2 = other_aa.size(); 
	const int max_width = 5;
	const int ox_met_aa = config->get_aa_from_label(string("M+16"));

	float row1[max_width*2+1], row2[max_width*2+1];
	float *old_row, *new_row;

	if (abs(pep_len1-pep_len2)>=max_width)
		return pep_len1;

	// check that no little switch of two aa can make them the same
	if (pep_len1 == pep_len2)
	{
		int i;
		int err_pos=-1;
		int errs=0;
		for (i=0; i<pep_len1; i++)
		{
			if ( (pep1[i] != pep2[i]) &&
			    ! (((pep1[i]==Ile ||pep1[i]==Leu) && (pep2[i]==Ile || pep2[i]==Leu)) ||
		          ((pep1[i]==Gln || pep1[i]==Lys) && (pep2[i]==Gln || pep2[i]==Lys)) ) )
			{
				err_pos=i;
				errs++;
			}
			if (errs>2)
				break;
		}

		if (errs == 0)
			return 0;

		if (errs == 2)
		{
			if ( (pep1[err_pos] == pep2[err_pos-1]) && (pep1[err_pos-1] == pep2[err_pos]))
				return 1;
		}
	}

	
	old_row=row1;
	new_row=row2;
	for (i=0; i<max_width; i++)
	{
		old_row[i]=9999;
		old_row[i+max_width]=i;
		new_row[i]=9999;
		new_row[i+max_width]=9999;
	}
	old_row[2*max_width]=9999;
	new_row[2*max_width]=9999;

	int start_new_row = max_width-1;
	for (i=1; i<=pep_len1; i++)
	{	
		int j;

		if (start_new_row>0)
		{
			new_row[start_new_row]=i;
			new_row[--start_new_row]=9999;
		}
		else
			new_row[0] = 9999;
	
		for (j=1; j<= 2*max_width; j++)
		{
			int p2_pos = i + j - max_width;
			if (p2_pos<1)
				continue;
			if (p2_pos>pep_len2)
				break;

			float v1,v2,v3,dxy=0;
			int p1=pep1[i-1],p2=pep2[p2_pos-1];

			if (p1 != p2)
			{
				dxy=1;
				if (  ((p1==Ile || p1==Leu) && (p2==Ile || p2==Leu)) ||
					  ((p1==Gln || p1==Lys) && (p2==Gln || p2==Lys)) )
				{
					dxy=0;
				}
				else if ( ((p1==Asn && p2==Asp) || (p1==Asp && p2==Asn)) ||
						  ((pep1[i]==Gln && p2==Glu) || (p1==Glu && p2==Gln))  ||
					  ((pep1[i]==Lys && p2==Glu) || (p1==Glu && p2==Lys))  ||
					  ((pep1[i]==Ile && p2==Asn) || (p1==Asn && p2==Ile))  ||
					  ((pep1[i]==Leu && p2==Asn) || (p1==Asn && p2==Leu))  )
				{
					dxy=0.5;
				}
				else if (ox_met_aa>0 && (( p1 == ox_met_aa && p2 == Phe) ||
										 ( p1 == Phe  && p2 == ox_met_aa)) )
				{
					dxy = 0;
				}
			}

			v1= old_row[j]+dxy;
			v2= new_row[j-1]+1;
			v3= (p2_pos<pep_len2 && j<2*max_width) ? old_row[j+1]+1 : 9999;

			new_row[j]=v1;
			if (new_row[j]>v2)
				new_row[j]=v2;
			if (new_row[j]>v3)
				new_row[j]=v3;
		}

		float *tmp;
		tmp = old_row;
		old_row = new_row;
		new_row = tmp;
	}

	return old_row[pep_len2-pep_len1+max_width];
}





// changes all the amino acids to their original form (without PTMs)
void Peptide::convert_to_org(const Config *config)
{
	const vector<int>& org_aa = config->get_org_aa();
	int i;
	for (i=0; i<amino_acids.size(); i++)
		amino_acids[i]=org_aa[amino_acids[i]];
}






void Breakage::print() const
{
	int i;
	cout << this->mass << " (" << region_idx << "), " << fragments.size()
		 << "  fragments" << endl;

	for (i=0; i<fragments.size(); i++)
	{
		cout << fragments[i].frag_type_idx << "  " << fragments[i].mass << "  "
			 << fragments[i].intensity << " (" << fragments[i].peak_level << ")"  ;
		if (fragments[i].is_strong_fragment)
			cout << " *";
		cout << endl;
	}
}


void Breakage::print(Config* config, ostream& os) const
{
	int i;
	os << this->mass << " (" << region_idx << "), " << fragments.size()
		 << "  fragments:";

	for (i=0; i<fragments.size(); i++)
		os << "  " << config->get_fragment(fragments[i].frag_type_idx).label << 
			    "  " << fragments[i].mass;	
}


void Breakage::print_fragments(Config* config, ostream& os) const
{
	int i;
	for (i=0; i<fragments.size(); i++)
		os << "   " << config->get_fragment(fragments[i].frag_type_idx).label << 
			    " " << fragments[i].mass;	
}





⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -