⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quickclustering.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
				{
					j-=31;
					continue;
				}

				if ( ! get_matrix_val(sim_row_start,j))
					continue;

				if (clusters[j].get_num_basic_spectra()>max_cluster_size)
					continue;


				const float sim = calc_selected_dot_prod(tolerance,
					clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
					clusters[j].get_top_ranked_idxs(),
					clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(), 
					clusters[i].get_top_ranked_idxs());


				if (sim >= min_similarity_thresh)
				{
					mark_bit_one(sim_row_start,j);
				}
				else
					mark_bit_zero(sim_row_start,j);
				
				if (sim > round_similarity)
				{
					if (clusters[j].add_cluster(clusters[i],round_similarity))
					{
						clusters[i].set_tmp_cluster_idx(-1);
						break;
					}
				}
			}
		}
	} 



//	cout << "Filtered " << num_filtered << "/" << num_spectra << endl;

	if (verbose)
	{
		for (i=0; i<clusters.size(); i++)
			if (clusters[i].get_tmp_cluster_idx() >=0 &&
				clusters[i].get_basic_spectra().size() == 1 && 
				clusters[i].get_basic_spectra()[0].ssf->peptide.get_num_aas()>0)
				cout << clusters[i].get_basic_spectra()[0].ssf->peptide.as_string(config) << endl;
	}


	
	return num_spectra;
}



/**********************************************************************
	Adds spectra from the additional set to the existing clusters.
	If they are added they are invalidated from further clusetering.
***********************************************************************/
int add_additional_spectra_to_existing_clusters(
							Config *config, 
							const FileManager& fm, 
							FileSet& additional_fs, 
							mass_t tolerance, 
							QCPeak *basic_peaks, 
							vector<ClusterSpectrum>& clusters, 
							float min_similarity, 
							int max_cluster_size,
							void *pmcsqs_ptr,
							float filter_prob,
							map<mzXML_annotation,int> *ann_map_ptr,
							bool verbose)
{
	float spectrum_join_similarity = 0.8;
	if (min_similarity>spectrum_join_similarity)
		spectrum_join_similarity = min_similarity;

	// read spectra
	BasicSpecReader bsr;
	const int num_spectra = additional_fs.get_total_spectra();
	vector<SingleSpectrumFile *>& all_ssf = additional_fs.get_non_const_ssf_pointers();
	
	static vector<BasicSpectrum> basic_spectra;

	basic_spectra.clear();
	basic_spectra.reserve(num_spectra);

	PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;

	int i,total_peaks_read=0;

	mass_t min_m_over_z = 1E7;
	mass_t max_m_over_z = 0;

	for (i=0; i<num_spectra; i++)
	{
		if (! all_ssf[i] || all_ssf[i]->assigned_cluster>=0)
			continue;

		const int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],
												 basic_peaks + total_peaks_read);
		
		if (num_spec_peaks<5)
		{
			all_ssf[i]=NULL;
			continue;
		}

		BasicSpectrum bs;
		bs.num_peaks = num_spec_peaks;
		bs.peaks = basic_peaks + total_peaks_read;
		bs.ssf = all_ssf[i];

		if (pmcsqs && bs.ssf->sqs<0)
		{
			int max_charge=0;
			const float prob = pmcsqs->get_sqs_for_spectrum(config,bs,&max_charge);
			if (prob<filter_prob)
			{
				if (ann_map_ptr)
				{
					DAT_single *dat_ssf = (DAT_single *)bs.ssf;

					map<mzXML_annotation,int>::const_iterator it;
					mzXML_annotation ann_pos;
					ann_pos.mzXML_file_idx = dat_ssf->mzxml_file_idx;
					ann_pos.scan = dat_ssf->scan_number;

					it = ann_map_ptr->find(ann_pos);
					if (it != ann_map_ptr->end())
					{
						
						cout << ">> " << ++wrongly_filtered_spectra << "\t" <<
						dat_ssf->mzxml_file_idx << " " << dat_ssf->scan_number << 
						" --> " << prob << endl;
					}
				}
				all_ssf[i]=NULL; // this specturm was filtered!
				continue; 
			}

			// update m/z and charge state (yes it is supposed to be const...)	
			//ssf->charge=max_charge;
			//ssf->m_over_z = res[max_charge].mz1;
			bs.ssf->sqs = prob;
		}

		basic_spectra.push_back(bs);

		total_peaks_read += num_spec_peaks;

		mass_t& m_over_z = bs.ssf->m_over_z;
		if (m_over_z<min_m_over_z)
			min_m_over_z = m_over_z;
		if (m_over_z>max_m_over_z)
			max_m_over_z = m_over_z;
	}

//	cout << "add:  " << setw(8) << min_m_over_z << " - " << setw(8) << max_m_over_z << " : " <<
//						setw(6) << basic_spectra.size() << " " << total_peaks_read << endl;

	static vector<int> idx_permutations;
	idx_permutations.resize(basic_spectra.size());
	for (i=0; i<basic_spectra.size(); i++)
		idx_permutations[i]=i;

	permute_vector(idx_permutations);

	// cluster the spectra
	static float top_x_masses[NUM_TOP_CLUSTER_PEAKS];
	static vector<int> spec_top_idxs;
	int num_added=0;
	for (i=0; i<basic_spectra.size(); i++)
	{
		const int spec_idx = idx_permutations[i];
		BasicSpectrum& spec = basic_spectra[spec_idx];
		const float spec_retention_time = spec.ssf->retention_time;
	
		set_adjusted_inten(spec.peaks,spec.num_peaks);
		select_top_peak_idxs(spec.peaks,spec.num_peaks,spec.ssf->m_over_z,
			tolerance,spec_top_idxs, top_x_masses, 
			ClusterSpectrum::get_num_top_peaks_per_1000_da(),
			config);

		// compare to previous clusters
		int j;
		for (j=0; j<clusters.size(); j++)
		{
			if (clusters[j].get_tmp_cluster_idx() < 0)
				continue;

			if (! clusters[j].find_match_in_top_masses(top_x_masses))
				continue;

			if (clusters[j].get_num_basic_spectra()>=max_cluster_size)
				continue;

			const float sim = calc_selected_dot_prod(tolerance,
				spec.peaks,spec.num_peaks, spec_top_idxs,
				clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
				clusters[j].get_top_ranked_idxs());

			if (sim > spectrum_join_similarity)
			{
				clusters[j].add_spectrum_to_cluster(spec, spec_top_idxs, top_x_masses);
				num_added++;
				break;
			}
		}
	}

	return num_added;
}



void check_fs_for_missing_anns_and_test_sqs(Config *config,
											const vector<SingleSpectrumFile *>& ssfs,
											const FileManager& fm,
											void *pmcsqs_ptr,
											char *anns_file)
{
	const mass_t tolerance = config->get_tolerance();

	map<mzXML_annotation,int> ann_map, ssf_map;
	read_mzXML_annotations_to_map(anns_file,ann_map);
	
	int i;
	for (i=0; i<ssfs.size(); i++)
	{
		DAT_single *dat_ssf = (DAT_single *)ssfs[i];
		mzXML_annotation ann;
		ann.charge = 0;
		ann.scan = dat_ssf->scan_number;
		ann.mzXML_file_idx = dat_ssf->mzxml_file_idx;


		ssf_map.insert(make_pair(ann,i));
	}

	// check if anns are in map
	int num_read=0;
	int num_not_read=0;
	vector<SingleSpectrumFile *> miss_ssfs;

	map<mzXML_annotation,int>::const_iterator it;
	for (it = ann_map.begin(); it != ann_map.end(); it++)
	{
		map<mzXML_annotation,int>::const_iterator ssf_it;
		mzXML_annotation ann_pos;
		ann_pos.mzXML_file_idx = it->first.mzXML_file_idx;
		ann_pos.scan = it->first.scan;

		ssf_it = ssf_map.find(ann_pos);
		if (ssf_it != ssf_map.end())
		{
			num_read++;
			miss_ssfs.push_back(ssfs[ssf_it->second]);
			ssfs[ssf_it->second]->peptide.parse_from_string(config,it->first.pep);
		}
		else
			num_not_read++;
	}
				

	cout << "NUM anns read in SSFs:    " << num_read << endl;
	cout << "NUM in the twilight zone: " << num_not_read << endl;

	PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;

	BasicSpecReader bsr;
	vector<QCPeak> peaks;
	peaks.resize(6000);

	QCOutputter qco;
	qco.init("missed_xxx","out",0);

	int num_with_little_peaks=0;
	for (i=0; i<miss_ssfs.size(); i++)
	{
		const int num_spec_peaks = bsr.read_basic_spec(config,fm,miss_ssfs[i],
											 &peaks[0]);
		if (num_spec_peaks<3)
		{
			num_with_little_peaks++;
			continue;
		}

		BasicSpectrum bs;
		
		bs.num_peaks = num_spec_peaks;
		bs.peaks = &peaks[0];
		bs.ssf = miss_ssfs[i];


		if (pmcsqs && bs.ssf->sqs<0)
		{
			static QCPeak *tmp_peaks = NULL;
			static int num_tmp_peaks = 0;
			if (! tmp_peaks || bs.num_peaks>num_tmp_peaks)
			{
				if (tmp_peaks)
					delete [] tmp_peaks;
				num_tmp_peaks = (int)(bs.num_peaks * 1.5 + 1);
				tmp_peaks = new QCPeak[num_tmp_peaks];
			}

			// need to create a special peak list that passes filtering
			QCPeak *org_peaks = bs.peaks;
			int    num_org_peaks = bs.num_peaks;
			BasicSpectrum sqs_bs = bs; 
			sqs_bs.peaks = tmp_peaks;
			sqs_bs.ssf   = bs.ssf;

			const mass_t join_tolerance = (tolerance < 0.05 ? tolerance : 0.6 * tolerance);
			int p_idx=0;
			int j=1;
			while (j<num_org_peaks)
			{
				if (org_peaks[j].mass - org_peaks[p_idx].mass<=join_tolerance)
				{
					intensity_t inten_sum = org_peaks[i].intensity + org_peaks[p_idx].intensity;
					mass_t new_mass = (org_peaks[j].intensity * org_peaks[j].mass + 
									   org_peaks[p_idx].intensity * org_peaks[p_idx].mass ) / inten_sum;

					org_peaks[p_idx].mass = new_mass;
					org_peaks[p_idx].intensity = inten_sum;	
				}
				else
				{
					org_peaks[++p_idx]=org_peaks[j];
				}
				j++;
			}
			num_org_peaks = p_idx+1;

			vector<bool> indicators;
			mark_top_peaks_with_sliding_window(bs.peaks, 
											   bs.num_peaks,
											   config->get_local_window_size(),
											   config->get_max_number_peaks_per_local_window(),
											   indicators);
		
			if (bs.ssf->precursor_intensity <=0)
			{
				float inten=0;
				int j;
				for (j=0; j<num_org_peaks; i++)
					inten+=org_peaks[j].intensity;
				bs.ssf->precursor_intensity=inten;
			}

			p_idx=0;
			for (j=0; j<num_org_peaks; j++)
				if (indicators[j])
					tmp_peaks[p_idx++]=org_peaks[j];
			sqs_bs.num_peaks = p_idx;
		//	cout << num_org_peaks << "->" << sqs_bs.num_peaks << endl;

			int max_charge=0;
			const float prob = pmcsqs->get_sqs_for_spectrum(config,sqs_bs,&max_charge);
			cout << i << "\t" << setprecision(3) << prob << "\t";
			bs.ssf->type = DAT;
			bs.ssf->print_ssf_stats(config);
		}

		ClusterSpectrum cs;
		cs.create_new_cluster(config,bs,i);
		qco.output_cluster_spectrum(cs);
	}
	cout << "NUM WITH LITTLE PEAKS: " << num_with_little_peaks << endl;
}




⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -