⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovoranktrain.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
			char buff[2048];
			ifs.getline(buff,2048);
			if (ifs.gcount()>2046)
				cout << "Warning: buffer size?" << endl;
			
			if (ifs.gcount()<5)
				continue;

			istringstream iss(buff);

		//	>> 92 0 6_6260 EGSSLLGSDAGELAGAGK 1618.79
			string dummy;
			int dummy_file;
			int scan=-1;
			string title;
			string correct_pep_str;
			mass_t mass_with_19 = -1;
		
			iss >> dummy >> dummy_file >> scan >> title >> correct_pep_str >> mass_with_19;
			if (scan<0 || mass_with_19<0 || mass_with_19>10000)
			{
				cout << scan << "\t" << mass_with_19 << endl;
				cout << "Error parsing line in file " << denovo_file << endl << "LINE:" <<buff << endl;
				exit(1);
			}

			int num_seqs=0;
			ifs.getline(buff,2048);
			sscanf(buff,"%d",&num_seqs);

			if (num_seqs<=0 || num_seqs>500)
			{
				cout << "Error parsing line in file " << denovo_file << endl << "LINE:" <<buff << endl;
				exit(1);
			}

		
			bool has_correct = false;
			bool correct_first = false;

			PeptideSet set;
			set.file_idx=file_idx;
			set.scan = scan;
			set.correct_sol.charge = file_charge;
			set.correct_sol.type = SOL_CORRECT;
			set.correct_sol.pep.parse_from_string(config,correct_pep_str);
			set.correct_sol.pep.set_aa_before(-1);
			set.correct_sol.pep.set_aa_after(-1);
			set.correct_sol.num_correct_aas = set.correct_sol.pep.get_num_aas();
			set.correct_sol.pm_with_19 = set.correct_sol.pep.get_mass_with_19();
			set.correct_sol.reaches_n_terminal = true;
			set.correct_sol.reaches_c_terminal = true;

			vector<string> inserted_strings;
			inserted_strings.clear();

			int i;
			for (i=0; i<num_seqs; i++)
			{
				ifs.getline(buff,2048);
				mass_t start_mass=-1;
				int num_correct_aas =-1;
				int num_aas = -1;
				string denovo_seq="";

				istringstream iss(buff);
				iss >> start_mass >> num_correct_aas >> num_aas >> denovo_seq;

				if (num_correct_aas==0 && num_aas==0)
					continue;

				if (start_mass<0 || start_mass > 10000 || num_correct_aas<0 ||
					num_aas<0 || num_aas>100 || num_correct_aas> num_aas || 
					denovo_seq.length()<3)
				{
					cout << "Error parsing " << denovo_file << endl;
					cout << "LINE: " << buff << endl;
					exit(1);
				}

				// check if peptide is same as correct don't add it
				if (denovo_seq == correct_pep_str || num_correct_aas == num_aas)
				{
					has_correct   = true;
					if (i==0)
						correct_first = true;
					continue;
				}

				int j;
				for (j=0; j<inserted_strings.size(); j++)
					if (inserted_strings[j] == denovo_seq)
						break;
				if (j<inserted_strings.size())
					continue;

				// check how similar the cuts are

				inserted_strings.push_back(denovo_seq);

				PeptideSolution sol;
				sol.charge = file_charge;
				sol.pep.parse_from_string(config,denovo_seq);
				sol.pep.set_aa_before(-1);
				sol.pep.set_aa_after(-1);
				sol.num_correct_aas = num_correct_aas;
				sol.type=SOL_INCORRECT_DENOVO;
				sol.pm_with_19 = sol.pep.get_mass_with_19();
				sol.reaches_n_terminal = true;
				sol.reaches_c_terminal = true;

				set.incorrect_sols.push_back(sol);
			}

			if (add_peptide_set_to_psm(psm,set))
				total_num_sets++;

			num_sets++;
			if (has_correct)
				num_with_correct++;
			if (correct_first)
				num_first_correct++;
				
		}

		ifs.close();

		cout << "\tsets added:   " << num_sets << endl;
		cout << "\thad correct:  " << float(num_with_correct)/num_sets << endl;
		cout << "\tcorrect first:" << float(num_first_correct)/num_sets << endl;
		cout << endl;
	}

	cout << "Processed " << num_read << "/" << names.size() << " denovo files" << endl;
}




/*******************************************************************************
Reads the sets of predicted peptides:
list of db hits amd of complete de novo sequences for the spectrum.
********************************************************************************/
void create_complete_denovo_set_map(
						Config *config,
						const string& mgf_list,
						const string& db_dir,
						const string& correct_dir,
						const string& denovo_dir,
						int charge,
						int size_idx,
						PeptideSetMap& psm,
						vector<bool>& file_indicators)
{
	vector<string> base_names,list;
	read_paths_into_list(mgf_list.c_str(), list);
	base_names.resize(list.size());

	int i;
	for (i=0; i<list.size(); i++)
		get_file_name_without_extension(list[i],base_names[i]);

	file_indicators.clear();
	file_indicators.resize(list.size(),false);

	// create map
	psm.clear();

	char suf_buf[64];
	sprintf(suf_buf,"_dbh_%d_%d.txt",charge,size_idx);
	string db_suffix = string(suf_buf);
	add_db_hits_to_psm(config,base_names,db_dir,correct_dir,db_suffix,charge,psm);

	
	sprintf(suf_buf,"_full_dnv_%d_%d.txt",charge,size_idx);
	string dnv_full_suffix = string(suf_buf);
//	add_denovo_paths_to_psm(config,base_names,denovo_dir,db_suffix,charge,psm);
	
	add_denovo_dbh_to_psm(config,base_names,denovo_dir,correct_dir,db_suffix,charge,psm);
	

	
	//
	cout << "Read info for charge " << charge << " size " << size_idx << endl;
	vector<int> type_sum;
	type_sum.resize(10,0);

	int num_sets=0;
	int num_incorrect=0;

	for (PeptideSetMap::const_iterator it = psm.begin(); it != psm.end(); it++)
	{
		num_sets++;
		file_indicators[(*it).first.file_idx]=true;
		num_incorrect += (*it).second.incorrect_sols.size();
		int j;
		for (j=0; j<(*it).second.incorrect_sols.size(); j++)
			type_sum[(*it).second.incorrect_sols[j].type]++;
	}

	cout << "\t" << num_sets << "\tsets" << endl;
	cout << "\t" << num_incorrect <<"\tnegtaive samples" << endl;

	for (i=0; i<type_sum.size(); i++)
		if (type_sum[i]>0)
			cout << "\ttype " << i << " " << type_sum[i] << endl;

}


/*******************************************************************************
Reads the sets of predicted peptides:
list of db hits amd of complete de novo sequences for the spectrum.
********************************************************************************/
void create_complete_dbh_set_map(
						Config *config,
						const string& mgf_list,
						const string& db_dir,
						const string& correct_dir,
						int charge,
						int size_idx,
						PeptideSetMap& psm,
						vector<bool>& file_indicators)
{
	vector<string> base_names,list;
	read_paths_into_list(mgf_list.c_str(), list);
	base_names.resize(list.size());

	int i;
	for (i=0; i<list.size(); i++)
		get_file_name_without_extension(list[i],base_names[i]);

	file_indicators.clear();
	file_indicators.resize(list.size(),false);

	// create map
	psm.clear();

	char suf_buf[64];
	sprintf(suf_buf,"_dbh_%d_%d.txt",charge,size_idx);
	string db_suffix = string(suf_buf);
	add_db_hits_to_psm(config,base_names,db_dir,correct_dir,db_suffix,charge,psm);
	
	//
	cout << "Read info for charge " << charge << " size " << size_idx << endl;
	vector<int> type_sum;
	type_sum.resize(10,0);

	int num_sets=0;
	int num_incorrect=0;

	for (PeptideSetMap::const_iterator it = psm.begin(); it != psm.end(); it++)
	{
		num_sets++;
		file_indicators[(*it).first.file_idx]=true;
		num_incorrect += (*it).second.incorrect_sols.size();
		int j;
		for (j=0; j<(*it).second.incorrect_sols.size(); j++)
			type_sum[(*it).second.incorrect_sols[j].type]++;
	}

	cout << "\t" << num_sets << "\tsets" << endl;
	cout << "\t" << num_incorrect <<"\tnegtaive samples" << endl;

	for (i=0; i<type_sum.size(); i++)
		if (type_sum[i]>0)
			cout << "\ttype " << i << " " << type_sum[i] << endl;

}


// returns the total number of pairs in the psm
// that had weights assigned
// reassigns weights to sets according to weights of db / full de novo
int assign_denovo_weights_to_sets(PeptideSetMap& psm, 
								  float ratio_db, 
								  float ratio_full_denovo)
{
	const float sum = ratio_db + ratio_full_denovo;
	const float req_type1_weight = ratio_db/sum;
	const float req_type2_weight = 1 - req_type1_weight;
	int total_num_pairs =0;

	for (PeptideSetMap::iterator it = psm.begin(); it != psm.end(); it++)
	{
		PeptideSet& set = (*it).second;

		int num_type1=0;
		int num_type2=0;
		float total_type1_weight=0;
		float total_type2_weight=0;

		// set weight according to proportion of correct amino acids

		int i;
		for (i=0; i<set.incorrect_sols.size(); i++)
		{
			PeptideSolution& sol = set.incorrect_sols[i];
			const float correct_aa_ratio = ((float)sol.num_correct_aas/(float)sol.pep.get_num_aas());
			sol.weight = 1.0 - correct_aa_ratio;
			if (sol.type==1)
			{
				num_type1++;
				total_type1_weight+=sol.weight;
			}
			else if (sol.type==2)
			{
				num_type2++;
				total_type2_weight+=sol.weight;
			}
			else
			{
				cout << "Error: unknown de novo sample type: " << sol.type << endl;
				exit(1);
			}
			
		}
		total_num_pairs+=set.incorrect_sols.size();

		const float mul1= (num_type1>0 ? req_type1_weight/total_type1_weight : 0);
		const float mul2= (num_type2>0 ? req_type2_weight/total_type2_weight : 0);

		set.total_set_weight=0;
		for (i=0; i<set.incorrect_sols.size(); i++)
		{
			PeptideSolution& sol = set.incorrect_sols[i];

			sol.weight *= (sol.type == 1 ? mul1 : mul2);
			set.total_set_weight += sol.weight;
		}
	}
	return total_num_pairs;
}


// also removes pairs with weight 0
void re_weight_complete_denovo_phi_support(vector<SamplePairWeight>& phi_support,	
							float ratio_db, float ratio_denovo, float ratio_db_cross)
{
	const int num_types = 4;
	const double ratio_sum = (double)(ratio_db + ratio_denovo + ratio_db_cross);
	const double per_weights[num_types] = {0, (double)ratio_db / ratio_sum, (double)ratio_denovo/ratio_sum,
		(double)ratio_db_cross / ratio_sum};
	double total_weights[num_types]={0};
	int	   num_samples[num_types]={0};

	int i;
	for (i=0; i<phi_support.size(); i++)
	{
		const int type = phi_support[i].tag;
		total_weights[type]+=phi_support[i].weight;
		num_samples[type]++;
	}

	double total_weight=0;
	for (i=0; i<num_types; i++)
		total_weight+=total_weights[i];

	double mult_vals[num_types]={0};
	for (i=0; i<num_types; i++)
		if (per_weights[i]>0)
			mult_vals[i]= (per_weights[i]*total_weight) / total_weights[i];

	for (i=0; i<phi_support.size(); i++)
		phi_support[i].weight *= mult_vals[phi_support[i].tag];

	for (i=0; i<phi_support.size(); i++)
	{
		if (phi_support[i].weight<=0)
		{
			phi_support[i]=phi_support[phi_support.size()-1];
			phi_support.pop_back();
		}
	}


	for (i=0; i<num_types; i++)
	{
		total_weights[i]=0;
		num_samples[i]=0;
	}

	for (i=0; i<phi_support.size(); i++)
	{
		const int type = phi_support[i].tag;
		total_weights[type]+=phi_support[i].weight;
		num_samples[type]++;
	}

	total_weight=0;
	for (i=0; i<num_types; i++)
		total_weight+=total_weights[i];

	cout << endl << "Reweighted samples in set: " << endl;
	for (i=0; i<num_types; i++)
		if (num_samples[i]>0)
			cout << "Type " << i << " " << num_samples[i] << "\t" << per_weights[i] << "\t" << " : " <<
				total_weights[i] / total_weight << endl;
	cout << endl;
}


bool are_same_upto_NDILQK(const Peptide& pep1, const Peptide& pep2)
{
	if (pep1.get_num_aas() != pep2.get_num_aas())
		return false;

	const vector<int>& aas1 = pep1.get_amino_acids();
	const vector<int>& aas2 = pep2.get_amino_acids();
	int i;

	for (i=0; i<aas1.size()-1; i++)
	{
		if (aas1[i] == aas2[i] )
			continue;

		if ((aas1[i] == Ile && aas2[i] == Leu) ||
			(aas1[i] == Leu && aas2[i] == Ile) ||
			(aas1[i] == Asn && aas2[i] == Asp) ||
			(aas1[i] == Asp && aas2[i] == Asn) ||
			(aas1[i] == Gln && aas2[i] == Lys) ||
			(aas1[i] == Lys && aas2[i] == Gln) )
			continue;
		return false;
	}
	
	return (aas1[i]==aas2[i]);
}



void DeNovoRankScorer::create_training_data_for_complete_denovo_ranking(	
				const string& db_dir,
				const string& correct_dir,
				const string& mgf_list,
				const double train_ratio,
				const int max_num_pairs,
				const int charge,
				const int size_idx,
				RankBoostDataset& train_ds, 
				RankBoostDataset& test_ds,
				vector<string>* peptide_strings,
				char *test_scan_file,
				float ratio_denovo,
				char *rerank_path,
				int   rerank_depth)
{
	PeakRankModel *&peak_model = peak_prediction_models[model_type];
	Config *config = model->get_config();
	const mass_t tolerance_diff = config->get_tolerance()*0.5;
	const int max_num_ppp_frags = 4;

	bool print_sams=false;

	if (model_type != 2)
	{
		cout << "Error: this training function is only intended for full de novo sequence samples!" << endl;
		cout << "Need to set model type to 2, not " << model_type << endl;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -