📄 quickclustering.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
			total_mismatched += n_mis;
			if (n_mis>0)
				clusters[i].print_cluster_peptides();
		}
		spec_idx = end_stage_one_idx + 1; // go back to the end of stage one

		// update the file which holds the last mass clustered
		ofstream last_mass_stream(last_good_mass_name.c_str(),ios::out);
		last_mass_stream << fixed << setprecision(3) << max_m_over_z << endl;
		last_mass_stream.close();
	}

	
	if (sim_matrix)
		delete [] sim_matrix;

	cout << endl << endl << "Total spectra read and clustered: " << total_spectra_read << " (" <<
		total_spectra << ")" << endl;

	cout << "# spectra in clusters: " << total_spectra_in_clusters << endl;
	cout << "% spectra in clusters: " << setprecision(3) << 
		(double)total_spectra_in_clusters / (double)total_spectra << endl;

	cout << "# clusters: " << num_clusters << "  , " << "Avg cluster size: " << 
		avg_cluster_size/(double)num_clusters << endl;

	if (total_mismatched>0)
		cout << "Total mismatched spectra: " << total_mismatched << "  (" <<
			(double)total_mismatched/total_spectra_read << ")" << endl;

	// cluster histogram
	cout << "Histogram of clusters: " << endl;
	cout << "max size     count" << endl;
	int i;
	for (i=0; i<num_clust_vals; i++)
	{
		cout << setw(8) << left << clust_vals[i] << clust_counts[i] << endl;
	}
	cout << ">" << setw(7) << left << clust_vals[num_clust_vals-1] <<
			clust_counts[num_clust_vals] << endl;

	double ratio = (double)total_spectra_in_clusters / (double)total_spectra;
	if ((filter_prob_for_min_size_clusters<0.2 && ratio<0.45) ||
		 (filter_prob_for_min_size_clusters>0.2 &&  ratio < 0.4))
	{
		cout << endl << "WARNING: only " << fixed << setprecision(2) << ratio*100 << "%" <<
			" of the spectra found there way into clusters. This might lead to loss of idnetifications." << endl;
		cout << "If you are filtering, you might consider reducing the quality threshold to a lower value,";
		cout << "e.g., by using the flag \"-min_filter_prob " << filter_prob*0.5 << endl;
	}

}





/**********************************************************************
	Creates cluster spec for the set of basic spectra.
	First reads the spectra and copies the peaks into the bulk peak allocation
	returns number of spectra actually read (does not read spectra that
	were already assigned to a cluster).

	The clustering is done in two phases. First a tight distance threshold
	is implemented, and in the second phase it is relaxed (this way the clusters
	should be more homegneous).
***********************************************************************/
int cluster_spec_in_file_set(Config *config, 
							 const FileManager& fm, 
							 FileSet& cluster_fs,
							 mass_t tolerance,
							 QCPeak *basic_peaks, 
							 vector<ClusterSpectrum>& clusters, 
							 float	min_similarity,
							 int	max_small_cluster_size,
							 int	max_cluster_size,
							 int	num_top_peaks_per_1000_da,
							 bool	verbose,
							 void	*pmcsqs_ptr,
							 float	filter_prob,
							 map<mzXML_annotation,int> *ann_map_ptr)
{
	// set clustering similarity thresholds
	vector<float> similarity_vals;
	int   num_rounds;
	if (min_similarity >= 0.9)
	{
		similarity_vals.push_back(min_similarity);
		num_rounds=1;
	}
	else
	{
		similarity_vals.push_back(0.9);
		if (min_similarity>=0.8)
		{
			similarity_vals.push_back(min_similarity);
			num_rounds=2;
		}
		else
		{
			similarity_vals.push_back((min_similarity+0.9)/2.0);
			similarity_vals.push_back(min_similarity);
			num_rounds=3;
		}
	}
	const float min_similarity_thresh = (min_similarity <0.2 ? min_similarity : 0.2); // don't test similarity if a previously recored																				 // similarity between clusters is less than this value

	PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;

	BasicSpecReader bsr;
	const int num_spectra = cluster_fs.get_total_spectra();
	vector<SingleSpectrumFile *>& all_ssf = cluster_fs.get_non_const_ssf_pointers();
	static vector<BasicSpectrum> basic_spectra;
	int i;

	// set max_small_cluster_size
	if (max_small_cluster_size<0)
		max_small_cluster_size = 10 + (int)(0.8*log(all_ssf.size()));

	if (max_small_cluster_size > max_cluster_size)
		max_small_cluster_size = max_cluster_size;

	// read all the basic spectra into a central spectra repository
	int total_peaks_read=0;
	mass_t min_m_over_z = 1E7;
	mass_t max_m_over_z = 0;

	int num_filtered=0;

	basic_spectra.clear();
	basic_spectra.reserve(num_spectra);
	for (i=0; i<num_spectra; i++)
	{
		if (! all_ssf[i]) // invalidated
			continue;

		const int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i], basic_peaks + total_peaks_read);
		if (num_spec_peaks<5)
		{
			all_ssf[i]=NULL;
			continue;
		}

		BasicSpectrum bs;
		bs.num_peaks = num_spec_peaks;
		bs.peaks = basic_peaks + total_peaks_read;
		bs.ssf = all_ssf[i];

		if (pmcsqs && bs.ssf->sqs<0)
		{
			int max_charge=0;
			const float prob = pmcsqs->get_sqs_for_spectrum(config,bs,&max_charge);
			if (prob<filter_prob)
			{
				if (ann_map_ptr)
				{
					DAT_single *dat_ssf = (DAT_single *)bs.ssf;

					map<mzXML_annotation,int>::const_iterator it;
					mzXML_annotation ann_pos;
					ann_pos.mzXML_file_idx = dat_ssf->mzxml_file_idx;
					ann_pos.scan = dat_ssf->scan_number;

					it = ann_map_ptr->find(ann_pos);
					if (it != ann_map_ptr->end())
					{
						
						cout << ">> " << ++wrongly_filtered_spectra << "\t" <<
						dat_ssf->mzxml_file_idx << " " << dat_ssf->scan_number << 
						" --> " << prob << endl;
					}
				}
				all_ssf[i]=NULL; // this specturm was filtered!
				num_filtered++;
				continue;
			}

			

			// update m/z and charge state (yes it is supposed to be const...)	
			bs.ssf->charge = max_charge;
			bs.ssf->sqs = prob;
		}
		

		basic_spectra.push_back(bs);

		total_peaks_read += num_spec_peaks;

		mass_t& m_over_z = bs.ssf->m_over_z;
		if (m_over_z<min_m_over_z)
			min_m_over_z = m_over_z;
		if (m_over_z>max_m_over_z)
			max_m_over_z = m_over_z;
	}


	// First stage, compare the basic spectra with clusters
	// Use high similarity threshold
	// If no cluster is found, create a new clusters for the spectrum
	// the calculated simlarities are stored is the sim matrix and can be used
	// in later stages to detect the need to re test the similarity

	const float first_stage_sim = similarity_vals[0];
	
	unsigned char * start_pos = sim_matrix;

	static vector<int> idx_permutations;
	idx_permutations.resize(basic_spectra.size());
	for (i=0; i<basic_spectra.size(); i++)
		idx_permutations[i]=i;

	permute_vector(idx_permutations);

	static vector<int> spec_top_idxs;
	static float top_x_masses[NUM_TOP_CLUSTER_PEAKS];
	for (i=0; i<basic_spectra.size(); i++)
	{
		const int spec_idx = idx_permutations[i];
		BasicSpectrum& spec = basic_spectra[spec_idx];
		const int spec_charge = spec.ssf->charge;
	
		set_adjusted_inten(spec.peaks,spec.num_peaks);
		select_top_peak_idxs(spec.peaks,spec.num_peaks,spec.ssf->m_over_z,
			tolerance, spec_top_idxs, top_x_masses, 
			ClusterSpectrum::get_num_top_peaks_per_1000_da(), config);

		// compare to previous clusters
		
		int j;
		for (j=0; j<clusters.size(); j++)
		{ 
			ClusterSpectrum& curr_clust = clusters[j];
	
			if (! curr_clust.find_match_in_top_masses(top_x_masses))
			{
				mark_bit_zero(start_pos,j);
				continue;
			}

			if (curr_clust.get_num_basic_spectra()>= max_cluster_size)
				continue;

			if (curr_clust.get_charge()>0 && spec_charge>0 && curr_clust.get_charge() !=spec_charge)
			{
				mark_bit_zero(start_pos,j);
				continue;
			}

			const float sim = calc_selected_dot_prod(tolerance,
				spec.peaks,spec.num_peaks, spec_top_idxs,
				curr_clust.get_peaks_pointer(),curr_clust.get_num_peaks(), 
				curr_clust.get_top_ranked_idxs(),verbose);

			if (sim >= min_similarity_thresh)
			{
				mark_bit_one(start_pos,j);
			}
			else
				mark_bit_zero(start_pos,j);
			

			// add this spectrum to an existing cluster
			if (sim > first_stage_sim)
			{
				curr_clust.add_spectrum_to_cluster(spec, spec_top_idxs,top_x_masses);
				break;
			}
		}


		if (j<clusters.size())  // we added the spectrum to an existing cluster
			continue;

	
		// create new cluster from spectrum
		
		clusters.resize(clusters.size()+1);
		ClusterSpectrum& cs = clusters[clusters.size()-1];

		cs.create_new_cluster(config, spec, clusters.size()-1);
		cs.set_charge(spec.ssf->charge);
		cs.set_top_ranked_idxs(spec_top_idxs);
		cs.set_top_masses(top_x_masses);
		cs.set_sim_matrix_row_start(start_pos);
		
		// round off the start position to the next byte
		unsigned char *old = start_pos;
		start_pos += ((j+7) >> 3);
	}



	// second stage try joining clusters
	// first start with the joining larger clusters
	// use lower threshold

	int round;
	for (round=1; round<num_rounds; round++)
	{
		const float round_similarity = similarity_vals[round];
		const float tighter_similarity = 1.0 - (1.0 - similarity_vals[round])/2.0;

		// join larger clusters, use tighter similaarity
		for (i=clusters.size()-1; i>0; i--)
		{
			if (clusters[i].get_tmp_cluster_idx()<0)
				continue;

			if (clusters[i].get_num_basic_spectra()>=max_cluster_size)
				continue;

			unsigned char *sim_row_start = clusters[i].get_sim_matrix_row_start();
			const int num_spec_i = clusters[i].get_num_basic_spectra();

			int j;
			for (j=i-1; j>=0; j--)
			{
				const int size_sum = clusters[j].get_num_basic_spectra() + num_spec_i;
				if (clusters[j].get_tmp_cluster_idx()<0 ||
					size_sum <= max_small_cluster_size ||
					size_sum >= max_cluster_size) 
					continue;

				// skip 32 places if the matirx is all zeros in that area
				if ((j % 32 == 31) &&  ! get_matrix_32_bits(sim_row_start,j))
				{
					j-=31;
					continue;
				}

				if (! get_matrix_val(sim_row_start,j))
					continue;

				const float sim = calc_selected_dot_prod(tolerance,
					clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
					clusters[j].get_top_ranked_idxs(),
					clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(), 
					clusters[i].get_top_ranked_idxs());


				if (sim >= min_similarity_thresh)
				{
					mark_bit_one(sim_row_start,j);
				}
				else
					mark_bit_zero(sim_row_start,j);
				

				if (sim > tighter_similarity)
				{
					if (clusters[j].add_cluster(clusters[i], tighter_similarity))
					{
						clusters[i].set_tmp_cluster_idx(-1);
						break;
					}
				}
			}
		} 


		// join smaller clusters, use the round similarity
		for (i=clusters.size()-1; i>0; i--)
		{
			if (clusters[i].get_tmp_cluster_idx()<0 || 
				clusters[i].get_num_basic_spectra() >= max_small_cluster_size)
				continue;

			unsigned char * sim_row_start = clusters[i].get_sim_matrix_row_start();
			
			const int num_spec_i = clusters[i].get_num_basic_spectra();
			int j;
			for (j=i-1; j>=0; j--)
			{
				if (clusters[j].get_tmp_cluster_idx()<0 ||
					num_spec_i + clusters[j].get_num_basic_spectra() >= max_small_cluster_size)
					continue;

				// skip 32 places if the matirx is all zeros in that area
				if ((j % 32 == 31) &&  ! get_matrix_32_bits(sim_row_start,j))
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -