⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filemanagement.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	int i;
	vector<string> list;
	read_paths_into_list(list_file,list);

	mass_t min_m_over_z=0;
	mass_t max_m_over_z=100000;

	num_spectra.resize(20,0);

	for (i=0; i<list.size(); i++)
    {
		if (list[i][0] == '#')
			continue;

		bool read_file = file_indicators[i];
		
		int last_pos = list[i].length()-1;
		if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' || 
			list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
			last_pos--;

		if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' && 
			list[i][last_pos  ]=='a')
		{
			DTA_file dta_file;

			dta_file.scan_dta(list[i],config);

			if (dta_file.m_over_z>=min_m_over_z && dta_file.m_over_z<max_m_over_z)
				dta_files.push_back(dta_file);
			
		}	
		else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' && 
			list[i][last_pos  ]=='f')
		{
			MGF_file mgf_file;
			mgf_file.mgf_name =list[i];

			if (read_file)
			{
				mgf_file.initial_read(config,mgf_files.size(),true);

				// change the single spectrum pointers in the mgf file record
				// to include only those that have a mass that is in the permitted range

				vector<MGF_single> good_singles;
				int j;
				for (j=0; j<mgf_file.single_spectra.size(); j++)
				{
					if (mgf_file.single_spectra[j].m_over_z >= min_m_over_z && 
						mgf_file.single_spectra[j].m_over_z <  max_m_over_z)
					{
						good_singles.push_back(mgf_file.single_spectra[j]);
					}
				}

				mgf_file.single_spectra = good_singles;
			}
			else
				mgf_file.single_spectra.clear();

			mgf_files.push_back(mgf_file);
		}
		else if (list[i][last_pos-4] == 'm' &&
				 list[i][last_pos-3] == 'z' &&
				list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' && 
				list[i][last_pos  ]=='L')
		{
			MZXML_file mzxml;
			mzxml.mzxml_name =list[i];

			if (read_file)
			{

				mzxml.initial_read(config,mzxml_files.size());

				// change the single spectrum pointers in the mgf file record
				// to include only those that have a mass that is in the permitted range

				vector<MZXML_single> good_singles;
				int j;
				for (j=0; j<mzxml.single_spectra.size(); j++)
				{
					if (mzxml.single_spectra[j].m_over_z >= min_m_over_z && 
						mzxml.single_spectra[j].m_over_z <  max_m_over_z)
					{
						good_singles.push_back(mzxml.single_spectra[j]);
					}
				}

				mzxml.single_spectra = good_singles;
			}
			else
				mzxml.single_spectra.clear();

			mzxml_files.push_back(mzxml);
		}
		else if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' && 
				list[i][last_pos  ]=='t')
		{
			DAT_file dat;
			dat.dat_name =list[i];

			if (read_file)
			{

				dat.initial_read(config,dat_files.size());

				// change the single spectrum pointers in the mgf file record
				// to include only those that have a mass that is in the permitted range

				vector<DAT_single> good_singles;
				int j;
				for (j=0; j<dat.single_spectra.size(); j++)
				{
					if (dat.single_spectra[j].m_over_z >= min_m_over_z && 
						dat.single_spectra[j].m_over_z <  max_m_over_z)
					{
						good_singles.push_back(dat.single_spectra[j]);
					}
				}

				dat.single_spectra = good_singles;
			}
			else
				dat.single_spectra.clear();

			dat_files.push_back(dat);
		}
		else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' && 
			list[i][last_pos  ]=='2')
		{
			MS2_file ms2_file;
			ms2_file.ms2_name =list[i];

			ms2_file.initial_read(config,ms2_files.size(),true);

			// change the single spectrum pointers in the mgf file record
			// to include only those that have a mass that is in the permitted range

			vector<MS2_single> good_singles;
			int j;
			for (j=0; j<ms2_file.single_spectra.size(); j++)
			{
				if (ms2_file.single_spectra[j].m_over_z >= min_m_over_z && 
					ms2_file.single_spectra[j].m_over_z <  max_m_over_z)
				{
					good_singles.push_back(ms2_file.single_spectra[j]);
				}
			}

			ms2_file.single_spectra = good_singles;
			ms2_files.push_back(ms2_file);
		}
		else
		{
			cout << "Error: couldn't recognize file type for:: " << list[i] << endl;
			exit(1);
		}
	}

	count_num_spectra();
}


// Fills the manager's MGF / mzXML / DAT file lists from the paths listed in
// list_file, keeping only the single spectra (ssfs) that have an annotation.
//
// config          - passed through to each file's initial_read().
// list_file       - file holding one path per entry; entries that start with
//                   '#' are skipped.
// annotation_idxs - annotation_idxs[file_idx][scan] >= 0 marks an annotated
//                   spectrum; that value is stored in the spectrum's ann_idx.
//
// The file type is decided by the last characters of the path ("mgf", "mzXML"
// or "dat"). Any other path prints an error and terminates with exit(1).
void FileManager::init_from_list_file(Config *config, const char* list_file,
		const vector< vector<int> >& annotation_idxs)
{
	vector<string> list;
	read_paths_into_list(list_file,list);

	// NOTE(review): only mgf_files is reset here; mzxml_files and dat_files keep
	// whatever they already hold - confirm that this is intended.
	mgf_files.clear();
	int total_dat_read=0;

	const int num_paths     = static_cast<int>(list.size());
	const int num_ann_files = static_cast<int>(annotation_idxs.size());

	int i;
	for (i=0; i<num_paths; i++)
	{
		const string& path = list[i];
		const string::size_type len = path.length();

		if (len>0 && path[0] == '#')
			continue;

		// Length-guarded suffix tests: a path shorter than the suffix can never
		// match, and must not be indexed before its first character.
		const bool is_mgf   = (len>=3 && path.compare(len-3,3,"mgf")==0);
		const bool is_mzxml = (len>=5 && path.compare(len-5,5,"mzXML")==0);
		const bool is_dat   = (len>=3 && path.compare(len-3,3,"dat")==0);

		// A list entry has annotations if its row of annotation_idxs holds at
		// least one non-negative value. An entry with no row has none.
		bool has_anns = false;
		if (i < num_ann_files)
		{
			const vector<int>& anns = annotation_idxs[i];
			const int ann_size = static_cast<int>(anns.size());
			int k;
			for (k=0; k<ann_size; k++)
				if (anns[k]>=0)
				{
					has_anns = true;
					break;
				}
		}

		if (is_mgf)
		{
			MGF_file mgf;
			mgf.mgf_name = path;

			if (has_anns)
			{
				mgf.initial_read(config,mgf_files.size(),true);

				cout << i <<  " " << mgf.mgf_name << " ";

				// keep only the single spectra that have an annotation

				// NOTE(review): the annotation row is looked up with the spectrum's
				// file_idx and scan_number, while has_anns above was decided from
				// list position i - confirm that both index the same row.
				vector<MGF_single> good_singles;
				const int num_singles = static_cast<int>(mgf.single_spectra.size());
				int j;
				for (j=0; j<num_singles; j++)
				{
					MGF_single& single = mgf.single_spectra[j];
					const int mgf_file_idx = single.file_idx;
					const int scan_number  = single.scan_number;

					if (mgf_file_idx >= 0 && mgf_file_idx < num_ann_files &&
						scan_number >= 0 &&
						scan_number < static_cast<int>(annotation_idxs[mgf_file_idx].size()) &&
						annotation_idxs[mgf_file_idx][scan_number]>=0)
					{
						single.ann_idx = annotation_idxs[mgf_file_idx][scan_number];
						good_singles.push_back(single);
					}
				}

				cout << good_singles.size() << " ..." << endl;
				mgf.single_spectra = good_singles;
			}

			// files without annotations are still recorded, but are never read
			mgf_files.push_back(mgf);
		}
		else if (is_mzxml)
		{
			MZXML_file mzxml;
			mzxml.mzxml_name = path;

			if (has_anns)
			{
				// has_anns implies i < num_ann_files, so this row exists
				const vector<int>& anns = annotation_idxs[i];
				const int ann_size = static_cast<int>(anns.size());

				mzxml.initial_read(config,i);

				vector<MZXML_single> good_singles;
				const int num_singles = static_cast<int>(mzxml.single_spectra.size());
				int j;
				for (j=0; j<num_singles; j++)
				{
					MZXML_single& single = mzxml.single_spectra[j];
					const int scan_num = single.scan_number;

					// reject negative scan numbers before they are used as an index
					if (scan_num >= 0 && scan_num < ann_size && anns[scan_num]>=0)
					{
						single.ann_idx = anns[scan_num];
						good_singles.push_back(single);
					}
				}

				mzxml.single_spectra = good_singles;
			}

			mzxml_files.push_back(mzxml);
		}
		else if (is_dat)
		{
			DAT_file dat;
			dat.dat_name = path;

			dat.initial_read(config,dat_files.size());

			cout << dat.dat_name << " ";

			// keep only the single spectra whose (mzXML file, scan) pair has
			// an annotation

			vector<DAT_single> good_singles;
			const int num_singles = static_cast<int>(dat.single_spectra.size());
			int j;
			for (j=0; j<num_singles; j++)
			{
				DAT_single& single = dat.single_spectra[j];
				const int mzxml_file_idx = single.mzxml_file_idx;
				const int scan_number    = single.scan_number;

				if (mzxml_file_idx >= 0 && mzxml_file_idx < num_ann_files &&
					scan_number >= 0 &&
					scan_number < static_cast<int>(annotation_idxs[mzxml_file_idx].size()) &&
					annotation_idxs[mzxml_file_idx][scan_number]>=0)
				{
					single.ann_idx = annotation_idxs[mzxml_file_idx][scan_number];
					good_singles.push_back(single);
				}
			}

			cout << good_singles.size() << " ..." << endl;
			total_dat_read += good_singles.size();

			dat.single_spectra = good_singles;
			dat_files.push_back(dat);
		}
		else
		{
			cout << "Error: couldn't recognize file type for: " << path << endl;
			exit(1);
		}
	}

	count_num_spectra();

	if (total_dat_read>0)
	{
		cout << "Read " << total_dat_read << " DAT spectra" << endl;
	}
}


// Fills the manager's MGF / mzXML / DAT file lists from the paths listed in
// list_file and copies annotation data onto the matching single spectra (ssfs).
//
// config              - passed through to initial_read() and to
//                       peptide.parse_from_string().
// list_file           - file holding one path per entry; entries that start
//                       with '#' are skipped.
// annotation_idxs     - annotation_idxs[file_idx][scan] >= 0 marks an annotated
//                       spectrum; the value is used as an index into annotations.
// annotations         - source of the peptide string (pep) and charge.
// read_only_annotated - when true, spectra without an annotation are dropped;
//                       when false they are kept unchanged.
//
// The file type is decided by the last characters of the path ("mgf", "mzXML"
// or "dat"). Any other path prints an error and terminates with exit(1).
//
// NOTE(review): every non-negative value in annotation_idxs is assumed to be a
// valid index into annotations; that is not checked here.
void FileManager::init_from_list_file_and_add_annotations(Config *config, const char* list_file,
		const vector< vector<int> >& annotation_idxs, vector<mzXML_annotation>& annotations,
		bool read_only_annotated )
{
	vector<string> list;
	read_paths_into_list(list_file,list);

	// NOTE(review): only mgf_files is reset here; mzxml_files and dat_files keep
	// whatever they already hold - confirm that this is intended.
	mgf_files.clear();
	int total_dat_read=0;

	const int num_paths     = static_cast<int>(list.size());
	const int num_ann_files = static_cast<int>(annotation_idxs.size());

	int i;
	for (i=0; i<num_paths; i++)
	{
		const string& path = list[i];
		const string::size_type len = path.length();

		if (len>0 && path[0] == '#')
			continue;

		// Length-guarded suffix tests: a path shorter than the suffix can never
		// match, and must not be indexed before its first character.
		const bool is_mgf   = (len>=3 && path.compare(len-3,3,"mgf")==0);
		const bool is_mzxml = (len>=5 && path.compare(len-5,5,"mzXML")==0);
		const bool is_dat   = (len>=3 && path.compare(len-3,3,"dat")==0);

		if (is_mgf)
		{
			MGF_file mgf;
			mgf.mgf_name = path;

			mgf.initial_read(config,mgf_files.size(),true);

			cout << mgf.mgf_name << " ";

			// annotate the spectra that have an annotation entry; the others are
			// kept only when read_only_annotated is false

			vector<MGF_single> good_singles;
			const int num_singles = static_cast<int>(mgf.single_spectra.size());
			int j;
			for (j=0; j<num_singles; j++)
			{
				MGF_single& single = mgf.single_spectra[j];
				const int mgf_file_idx = single.file_idx;
				const int scan_number  = single.idx_in_file;

				if (mgf_file_idx >= 0 && mgf_file_idx < num_ann_files &&
					scan_number >= 0 &&
					scan_number < static_cast<int>(annotation_idxs[mgf_file_idx].size()) &&
					annotation_idxs[mgf_file_idx][scan_number]>=0)
				{
					const int ann_val = annotation_idxs[mgf_file_idx][scan_number];
					single.peptide.parse_from_string(config,annotations[ann_val].pep);
					single.charge = annotations[ann_val].charge;
					good_singles.push_back(single);
				}
				else if (! read_only_annotated)
				{
					good_singles.push_back(single);
				}
			}

			cout << good_singles.size() << " ..." << endl;
			mgf.single_spectra = good_singles;
			mgf_files.push_back(mgf);
		}
		else if (is_mzxml)
		{
			MZXML_file mzxml;
			mzxml.mzxml_name = path;

			cout << i << " " << path << endl;

			// The file has annotations if its row of annotation_idxs holds at least
			// one non-negative value. A file with no row has none, whatever the
			// value of read_only_annotated.
			bool has_anns = false;
			if (i < num_ann_files)
			{
				const vector<int>& row = annotation_idxs[i];
				const int row_size = static_cast<int>(row.size());
				int k;
				for (k=0; k<row_size; k++)
					if (row[k]>=0)
					{
						has_anns = true;
						break;
					}
			}

			// NOTE(review): a file without annotations is recorded but never read,
			// even when read_only_annotated is false - the MGF and DAT branches
			// always read. Confirm that this difference is intended.
			if (has_anns)
			{
				// has_anns implies i < num_ann_files, so this row exists
				const vector<int>& anns = annotation_idxs[i];
				const int ann_size = static_cast<int>(anns.size());

				mzxml.initial_read(config,i);

				vector<MZXML_single> good_singles;
				const int num_singles = static_cast<int>(mzxml.single_spectra.size());
				int j;
				for (j=0; j<num_singles; j++)
				{
					MZXML_single& single = mzxml.single_spectra[j];
					const int scan_num = single.scan_number;

					// reject negative scan numbers before they are used as an index
					if (scan_num >= 0 && scan_num < ann_size && anns[scan_num]>=0)
					{
						const int ann_val = anns[scan_num];
						single.peptide.parse_from_string(config,annotations[ann_val].pep);
						single.charge = annotations[ann_val].charge;
						good_singles.push_back(single);
					}
					else if (! read_only_annotated)
					{
						good_singles.push_back(single);
					}
				}

				mzxml.single_spectra = good_singles;
			}

			mzxml_files.push_back(mzxml);
		}
		else if (is_dat)
		{
			DAT_file dat;
			dat.dat_name = path;

			dat.initial_read(config,dat_files.size());

			cout << dat.dat_name << " ";

			// annotate the spectra whose (mzXML file, scan) pair has an annotation;
			// the others are kept only when read_only_annotated is false

			vector<DAT_single> good_singles;
			const int num_singles = static_cast<int>(dat.single_spectra.size());
			int j;
			for (j=0; j<num_singles; j++)
			{
				DAT_single& single = dat.single_spectra[j];
				const int mzxml_file_idx = single.mzxml_file_idx;
				const int scan_number    = single.scan_number;

				if (mzxml_file_idx >= 0 && mzxml_file_idx < num_ann_files &&
					scan_number >= 0 &&
					scan_number < static_cast<int>(annotation_idxs[mzxml_file_idx].size()) &&
					annotation_idxs[mzxml_file_idx][scan_number]>=0)
				{
					const int ann_val = annotation_idxs[mzxml_file_idx][scan_number];
					// NOTE(review): unlike the MGF and mzXML branches, the charge is
					// not copied from the annotation here - confirm this is intended.
					single.peptide.parse_from_string(config,annotations[ann_val].pep);
					good_singles.push_back(single);
				}
				else if (! read_only_annotated)
				{
					good_singles.push_back(single);
				}
			}

			cout << good_singles.size() << " ..." << endl;
			total_dat_read += good_singles.size();

			dat.single_spectra = good_singles;
			dat_files.push_back(dat);
		}
		else
		{
			cout << "Error: couldn't recognize file type for: " << path << endl;
			exit(1);
		}
	}

	count_num_spectra();

	if (total_dat_read>0)
	{
		cout << "Read " << total_dat_read << " DAT spectra" << endl;
	}
}







// Reads the DTA file named by single_name and copies its summary values
// (peptide, charge, parent masses, peak count, m/z) into this record.
//
// single_name must already be set; if it is empty an error is printed and the
// program terminates with exit(1).
void DTA_file::initial_read(Config *config)
{
	if (single_name.empty())
	{
		cout << "Error: must first copy name to DTA_file!" << endl;
		exit(1);
	}

	// load the spectrum once, then keep only the values copied below
	Spectrum s;
	s.read_from_dta(config,single_name.c_str());
	s.init_spectrum();

	// single_name is left as it is: it is the member that was just read from
	this->type           = DTA;
	this->peptide        = s.get_peptide();
	this->charge         = s.get_charge();
	this->org_pm_with_19 = s.get_org_pm_with_19();
	this->pm_with_19     = s.get_corrected_pm_with_19();
	this->num_peaks      = s.get_num_peaks();
	this->m_over_z       = s.get_m_over_z();
}



void MGF_file::initial_read(Config *config, int file_idx, bool quick_flag)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -