⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filemanagement.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
			{
				const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
				const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];

				if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
				 ( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
				{
					ssf_pointers.pop_back();
					continue;
				}
			}

//			cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
		}
	}

	for (i=0; i<fm.pkl_dirs.size(); i++)
	{
		int j;
		for (j=0; j<fm.pkl_dirs[i].single_spectra.size(); j++)
		{
			const SingleSpectrumFile *ssf = &fm.pkl_dirs[i].single_spectra[j];

			if (ssf->num_peaks<5)
				continue;

			this->ssf_pointers.push_back((struct PKL_single *)ssf);

			// check that this doesn't correpond to the immediate previous spec
			if (remove_duplicates && ssf_pointers.size()>1)
			{
				const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
				const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];

				if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
				 ( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.001) )
				{
					ssf_pointers.pop_back();
					continue;
				}
			}

//			cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
		}
	}


	this->next_ssf_pointer=0;
}


/**********************************************************************
Selects dat ssfs that have an mzxml file idx of at most max_mzxml idx
***********************************************************************/
void FileSet::filter_dat_spectra_by_mzxml_idx(int max_mzxml_idx)
{
	int i;
	vector<SingleSpectrumFile *> new_ssfs;
	
	for (i=0; i<ssf_pointers.size(); i++)
	{
		DAT_single *dat_ssf = (DAT_single *)ssf_pointers[i];
		if (dat_ssf->mzxml_file_idx > max_mzxml_idx)
			continue;
		new_ssfs.push_back(ssf_pointers[i]);
	}
	ssf_pointers = new_ssfs;
}



/*************************************************************************
Select the file pointers from the FileManager
**************************************************************************/
void FileSet::select_files(const FileManager& fm, 
						   mass_t min_pm_with_19, 
						   mass_t max_pm_with_19, 
						   score_t min_sqs,       
						   score_t max_sqs, 
						   int charge,            
						   bool only_unassigned)
{
	int i;

	this->ssf_pointers.clear();

		
	if (charge !=0)
	{
		min_charge = charge;
		max_charge = charge;
	}

	for (i=0; i<fm.dta_files.size(); i++)
	{
		if (fm.dta_files[i].org_pm_with_19 < min_pm_with_19 ||
			fm.dta_files[i].org_pm_with_19 > max_pm_with_19)
			continue;

		if (fm.dta_files[i].sqs>-1 && 
			(fm.dta_files[i].sqs<min_sqs || fm.dta_files[i].sqs>max_sqs) )
			continue;

		if (charge>0 && fm.dta_files[i].charge != charge)
			continue;

		if (only_unassigned && fm.dta_files[i].assigned_cluster>=0)
			continue;

		if (fm.dta_files[i].charge<min_charge)
			min_charge =  fm.dta_files[i].charge;

		if (fm.dta_files[i].charge>max_charge)
			max_charge =  fm.dta_files[i].charge;

		ssf_pointers.push_back((struct DTA_file *)&fm.dta_files[i]);
	}


	for (i=0; i<fm.mgf_files.size(); i++)
	{
		int j;
		for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
		{
			if (fm.mgf_files[i].single_spectra[j].org_pm_with_19 < min_pm_with_19 ||
				fm.mgf_files[i].single_spectra[j].org_pm_with_19 > max_pm_with_19)
				continue;

			if ( fm.mgf_files[i].single_spectra[j].sqs > -1 &&
				(fm.mgf_files[i].single_spectra[j].sqs<min_sqs || fm.mgf_files[i].single_spectra[j].sqs>max_sqs))
				continue;

			if (charge>0 && fm.mgf_files[i].single_spectra[j].charge != charge)
				continue;

			if (only_unassigned && fm.mgf_files[i].single_spectra[j].assigned_cluster>=0)
				continue;

			if (fm.mgf_files[i].single_spectra[j].charge>max_charge)
				max_charge = fm.mgf_files[i].single_spectra[j].charge;
			
			if (fm.mgf_files[i].single_spectra[j].charge<min_charge)
				min_charge = fm.mgf_files[i].single_spectra[j].charge;

			this->ssf_pointers.push_back((struct MGF_single *)&fm.mgf_files[i].single_spectra[j]);
		}
	}
	this->next_ssf_pointer=0;
}



/*************************************************************************
Select the file pointers from the FileManager
**************************************************************************/
void FileSet::select_files_in_mz_range(
						   const FileManager& fm, 
						   mass_t min_mz, 
						   mass_t max_mz, 
						   int charge)
{
	int i;

	this->ssf_pointers.clear();

		
	if (charge !=0)
	{
		min_charge = charge;
		max_charge = charge;
	}


	for (i=0; i<fm.mgf_files.size(); i++)
	{
		int j;
		for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
		{
			if (fm.mgf_files[i].single_spectra[j].m_over_z < min_mz ||
				fm.mgf_files[i].single_spectra[j].m_over_z > max_mz)
				continue;

			if (charge>0 && fm.mgf_files[i].single_spectra[j].charge != charge)
				continue;

			if (fm.mgf_files[i].single_spectra[j].charge>max_charge)
				max_charge = fm.mgf_files[i].single_spectra[j].charge;
			
			if (fm.mgf_files[i].single_spectra[j].charge<min_charge)
				min_charge = fm.mgf_files[i].single_spectra[j].charge;

			this->ssf_pointers.push_back((struct MGF_single *)&fm.mgf_files[i].single_spectra[j]);
		}
	}
	this->next_ssf_pointer=0;
}


// Randomly select n ssf pointers
void FileSet::randomly_reduce_ssfs(int n)
{
	if (n>= ssf_pointers.size())
		return;

	vector<int> idxs;
	choose_k_from_n(n,ssf_pointers.size(),idxs);

	vector<SingleSpectrumFile *> red_ssfs;
	red_ssfs.resize(n,NULL);

	int i;
	for (i=0; i<n; i++)
		red_ssfs[i] = ssf_pointers[idxs[i]];

	ssf_pointers = red_ssfs;
}

struct mz_pair {
	bool operator< (const mz_pair& other) const
	{
		return (m_over_z < other.m_over_z);
	}

	mass_t m_over_z;
	SingleSpectrumFile *ssf;
};

void FileSet::sort_according_to_m_over_z()
{
	vector<mz_pair> pairs;
		
	pairs.resize(ssf_pointers.size());
	int i;
	for (i=0; i<ssf_pointers.size(); i++)
	{
		pairs[i].m_over_z=ssf_pointers[i]->m_over_z;
		pairs[i].ssf = ssf_pointers[i];
	}

	sort(pairs.begin(),pairs.end());
	
	for (i=0; i<pairs.size(); i++)
		ssf_pointers[i]=pairs[i].ssf;
}


// removes all ssf without a peptides
void FileSet::keep_only_spectra_with_peptides()
{
	int i;
	vector<SingleSpectrumFile *> keep_ssfs;

	keep_ssfs.clear();

	for (i=0; i<ssf_pointers.size(); i++)
		if (ssf_pointers[i]->peptide.get_num_aas()>3)
			keep_ssfs.push_back(ssf_pointers[i]);

	ssf_pointers = keep_ssfs;
}



bool comp_ssf_pointers(const SingleSpectrumFile *ss1,
			           const SingleSpectrumFile *ss2)
{
	return ( (ss1->file_idx < ss2->file_idx) ||
		     (ss1->file_idx == ss2->file_idx && ss1->file_pos < ss2->file_pos));
}

// copies the ssf pointers from another FileSet
// sorts them according to the file order (not m_over_z);
void FileSet::init_from_another_fs(const FileSet& other_fs, 
						int start_ssf_idx, int end_ssf_idx)
{
	int i;

	reset_pointers();

	ssf_pointers.clear();
	ssf_pointers.reserve(end_ssf_idx-start_ssf_idx+1);

	int not_added=0;
	for (i=start_ssf_idx; i<=end_ssf_idx; i++)
		if (other_fs.ssf_pointers[i]->assigned_cluster<0)
		{
			ssf_pointers.push_back(other_fs.ssf_pointers[i]);
		}
	//	else
	//		not_added++;

//	if (not_added>0)
//		cout << "Not added: " << not_added << endl;

	// the comp_ssf_pointers is used so the ssfs are addressed according to 
	// the order in which they appear in the list file
	// if this causes problems can use regular sort according to pointer value
	// this will cause the files to be accessed in an unpredictable order (however
	// the spectra in the files will be accessed in the correct order)
	sort(ssf_pointers.begin(),ssf_pointers.end(),comp_ssf_pointers);

}


/************************************************************
// reads the next spectrum into spec
// returns false if no more spectra are available
// returns the mgf_pointer through mp, if file = -1 this means
// this was a dta
*************************************************************/
bool FileSet::get_next_spectrum(const FileManager& fm, 
								Config *config, 
								Spectrum *spec, 
								SingleSpectrumFile **ssfp,
								bool perform_init_spectrum, 
								bool set_charge_to_zero)
{
	SingleSpectrumFile *ssf;
	if (next_ssf_pointer < ssf_pointers.size())
	{
		ssf = ssf_pointers[next_ssf_pointer++];
		if (ssfp)
			*ssfp=ssf;

		if (ssf->type == DTA)
		{
			if (set_charge_to_zero)
			{
				spec->read_from_dta(config,ssf->single_name.c_str(),0);
			}
			else
				spec->read_from_dta(config,ssf->single_name.c_str());

			if (perform_init_spectrum)
				spec->init_spectrum();

			return true;
		}
		else if (ssf->type == MGF)
		{
			if (this->current_mgf_file_idx != ssf->file_idx)
			{
				if (mgf_stream)
					fclose(mgf_stream);
				
				mgf_stream=fopen(fm.mgf_files[ssf->file_idx].mgf_name.c_str(),"r");
				if (! mgf_stream)
				{
					cout << "Error: Couldn't open file for reading: " << 
						fm.mgf_files[ssf->file_idx].mgf_name << endl;
					exit(1);
				}
			
				this->current_mgf_file_idx = ssf->file_idx;
			}

			if ( fseek(mgf_stream,ssf->file_pos,0) )
			{
				cout << "Error: could not skip in file!" << endl;
				exit(1);
			}

			spec->read_from_MGF_stream(config,mgf_stream);
			if (perform_init_spectrum)
				spec->init_spectrum();

			return true;
		}
		else
		{
			cout << "Error: invalid file type:" << ssf->type <<  endl;
			exit(1);
		}
		
	}
	else
		return false;
}



inline bool comp_ssf(const SingleSpectrumFile* ssf1, const SingleSpectrumFile* ssf2)
{
	return (   ssf1->file_idx < ssf2->file_idx || 
		    ( (ssf1->file_idx == ssf2->file_idx) && (ssf1->file_pos<ssf2->file_pos) ) );
}








// creates an mgf file with the desred number of spectra per charge as designated
// in the spectra_per_charges vector(strating from charge 0). so the maximum
// number of from each charge in the outputted in the mgf file will be at most
// the numbers desginated in the vector
void FileSet::create_mgf_file(const FileManager& fm, Config *config, const char *file_name,
						 vector<int> spectra_per_charges)
{
	int max_charge = spectra_per_charges.size()-1;
	vector<int> spectra_written;

	spectra_written.resize(max_charge+1,0);

	ofstream ofs(file_name,ios::out);
	if (! ofs.good())
	{
		cout << "Error: couldn't open file for writing: " << file_name << endl;
		exit(1);
	}

	this->reset_pointers();

	while (1 )
	{
		Spectrum s;
		if (! this->get_next_spectrum(fm,config,&s))
			break;

	//	s.set_charge(0);

		int spec_charge = s.get_charge();
		if (spec_charge>max_charge)
			continue;

		if (spectra_written[spec_charge]>=spectra_per_charges[spec_charge])
			continue;

		spectra_written[spec_charge]++;

		s.output_as_MGF(ofs);
	}

	ofs.close();

}





/*************************************************************
// iterates over the files and makes a fasta out of the seq
//  puts 10 in a row, calls it TRUE_X 
  uses the org_aa
**************************************************************/
void FileSet::make_fasta_from_file_seqs(const FileManager& fm, Config *config,
										int inc, ostream& os)
{
	int seq_counter=0;
	int line_counter=0;

	Spectrum s;

	while ( 1 )
	{
		if ( ! this->get_next_spectrum(fm,config,&s))
			break;

		if (s.get_peptide().get_num_aas()>0)
		{
			if (seq_counter == 0)
				os << ">TRUE_" << line_counter*inc << endl;
	
			Peptide p = s.get_peptide();
			p.convert_to_org(config);

			os << p.as_string(config);	
			seq_counter++;
			if (seq_counter == inc)
			{
				os << endl;
				seq_counter=0;
				line_counter++;
			}
		}
	}
	os << endl;
}






// concatonates several mgf files into one large file
// slow and stupid...
void concat_mgf_files(Config *config, const char *mgf_file_list, 
					  const char *big_mgf_file)
{
	int i;
	ofstream ofs(big_mgf_file,ios::out);
	if (! ofs.good() )
	{
		cout << "Error: couldn't open file for writing: " << big_mgf_file << endl;
		exit(1);
	}

	vector<string> list;
	read_paths_into_list(mgf_file_list,list);
	
	for (i=0; i<list.size(); i++)
	{
		Spectrum s;
		
		FILE *mgf_stream = fopen(list[i].c_str(),"r");
		if (mgf_stream)
		{
			cout << "Error: couldn't open file for reading: " << list[i] << endl;
			exit(1);
		}

		while ( s.read_from_MGF_stream(config,mgf_stream) )
		{
		//	s.init_spectrum(); // so we filter peaks
			s.output_as_MGF(ofs);
		}
		fclose(mgf_stream);
	}
	ofs.close();
}


/****************************

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -