⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 qcdat.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include "QuickClustering.h"
#include "base64.h"

DAT_FileBuff::~DAT_FileBuff()
{
	if (buff && pos>buff)
		flush_buff();

	if (buff)
		delete [] buff; 
}

void DAT_FileBuff::init(string& _path,  int dat_buff_size)
{
	if (! ind_was_initialized)
	{
		buff = new char[dat_buff_size];
		if (! buff)
		{
			cout << "Error: couldn't allocate memory for DAT file buff!" << endl;
			exit(1);
		}
	}

	ind_was_initialized = 1;
	path = _path;
	max_pos = buff + dat_buff_size - 100;
	pos = buff;
	ind_first_write = 1; 
}


// copies the files to the DAT file
void DAT_FileBuff::add_spec_to_DAT_file(
							mass_t m_over_z, 
							int charge, 
							int mzxml_file_idx,
							int scan_number, 
							float retention_time, 
							float precursor_intensity, 
							int num_peaks, 
							char *peak_buff)
{
	const int spec_bytes = sizeof(mass_t) + 4 * sizeof(int) + 2 * sizeof(float) * num_peaks;

	if (num_peaks>5000 || num_peaks<0)
		cout << mzxml_file_idx << " " << scan_number << " " << " p: " << num_peaks << endl;

	if (mzxml_file_idx<0 || scan_number<0)
	{
		cout <<"Error: bad file idx or scan number! " << mzxml_file_idx << ", " << scan_number << endl;
		exit(1);
	}

	if (pos + spec_bytes >= max_pos)
		flush_buff();

	mass_t *m_ptr = (mass_t *)pos;
	*m_ptr++ = m_over_z;

	int *i_ptr = (int *)m_ptr;

	*i_ptr++ = charge;
	*i_ptr++ = mzxml_file_idx;
	*i_ptr++ = scan_number;
	*i_ptr++ = num_peaks;

	float *f_ptr = (float *)i_ptr;

	*f_ptr++ = retention_time;
	*f_ptr++ = precursor_intensity;
	
	pos = (char *)f_ptr;
	memcpy(pos,peak_buff,2 * sizeof(float) * num_peaks);
	pos += 2 * sizeof(float) * num_peaks;

	this->counter++;

}



void DAT_FileBuff::flush_buff()
{
	FILE* dat_stream;

	if (pos == buff)
		return;

	if (! ind_was_initialized)
	{
		cout << "Error: must first initialize the DAT_FileBuff!" << endl;
		exit(1);
	}

	if (ind_first_write)
	{
		dat_stream=fopen(path.c_str(),"wb");
		ind_first_write = 0;
	}
	else
		dat_stream=fopen(path.c_str(),"ab");

	if (! dat_stream)
	{
		cout << "Error: couldn't open DAT file for writing: " << path.c_str() << endl;
		exit(1);
	}


	fwrite(buff,1,pos-buff,dat_stream);
	fclose(dat_stream);

	pos=buff;
}



void DAT_Converter::init_DAT_Converter(mass_t _max_m_over_z, mass_t _mass_increment, int _dat_buff_size)
{
//	name = _name;
//	out_dir = _out_dir;
	max_m_over_z = _max_m_over_z;
	mass_increment = _mass_increment;
	dat_buff_size = _dat_buff_size;

	max_dat_file_idx = (int)(max_m_over_z / mass_increment) + 1;

	dat_buffs.resize(max_dat_file_idx+1);

	ind_was_initialized =1;
}




// creates a file with the list of DAT files
void DAT_Converter::create_list_file() const
{
	FILE *list_stream;

	ostringstream oss;
	oss << batch;
	string list_path = out_dir + "/" + name + "_" + oss.str() + "_list.txt";

	list_stream = fopen(list_path.c_str(),"w");
	if (! list_stream)
	{
		cout << "Error: couldn't open list file for writing: " << list_path.c_str() << endl;
		exit(1);
	}

	int i;
	for (i=0; i<dat_buffs.size(); i++)
	{
		if (dat_buffs[i].buff)
		{
			fprintf(list_stream,"%s\n",dat_buffs[i].path.c_str());
		}
	}

	fclose(list_stream);
}








int DAT_Converter::convert_single_non_MZXML_file_to_DAT(Config* config, string file, 
							mass_t min_m_over_z, mass_t max_m_over_z, int file_idx)
{
	static QCPeak *peak_buff=NULL;    
	static float  *dat_peak_buff=NULL;

	if (! peak_buff)
	{
		peak_buff     = new QCPeak[20000];
		dat_peak_buff = new  float[40000];
	}

	if (! peak_buff || ! dat_peak_buff)
	{
		cout << "Error: couldn't allocate memory for DAT conversion!" << endl;
		exit(1);
	}

	if (! ind_was_initialized)
	{
		cout << "Error: must initialize DAT_Converter!" << endl;
		exit(0);
	}

	vector<string> list;
	list.push_back(file);

	BasicSpecReader bsr;
	FileManager fm;
	FileSet all_spec_fs;


	
	fm.init_from_list(config,list,true,file_idx);

	all_spec_fs.select_all_files(fm);

	const int total_spectra = all_spec_fs.get_total_spectra();
	const vector<SingleSpectrumFile *>& all_ssf = all_spec_fs.get_ssf_pointers();

	cout << file_idx << " " << file << " ... " << endl;

	int num_spectra_extracted=0;
	int i;
	for (i=0; i<all_ssf.size(); i++)
	{

		SingleSpectrumFile *ssf = all_ssf[i];
		// no small spectra
		if (ssf->num_peaks<5 || ssf->m_over_z<=min_m_over_z || ssf->m_over_z > max_m_over_z)
			continue;

		int j,pos=0;
		int num_spec_peaks = bsr.read_basic_spec(config,fm,ssf,peak_buff,true);

		if (num_spec_peaks<5)
			continue;


		mass_t m_over_z = ssf->m_over_z;
		int scan_number=-1, file_idx=-1;
		float precursor_intensity=0;
		
		if (ssf->type == DTA)
		{
			file_idx = 0;
			scan_number=-1;
		}
		else if (ssf->type == MGF)
		{
			MGF_single *mgf_single = (MGF_single *)ssf;
			file_idx    = mgf_single->file_idx;
			scan_number = mgf_single->idx_in_file;	
		}
		else if (ssf->type == MZXML)
		{
			MZXML_single *mzxml_single = (MZXML_single *)ssf;
			file_idx = mzxml_single->file_idx;
			scan_number = mzxml_single->scan_number;
			precursor_intensity = mzxml_single->precursor_intensity;
		}
		else if (ssf->type == DAT)
		{
			DAT_single *dat_single = (DAT_single *)ssf;
			file_idx = dat_single->mzxml_file_idx;
			scan_number = dat_single->scan_number;
		}

		// copy peaks
		
		for (j=0; j<num_spec_peaks; j++)
		{
			dat_peak_buff[pos++]=(float)peak_buff[j].mass;
			dat_peak_buff[pos++]=(float)peak_buff[j].intensity;
		}

		
		// add spectrum
		int DAT_file_idx =  (int)(m_over_z/mass_increment);
		if (DAT_file_idx > max_dat_file_idx)
			DAT_file_idx = max_dat_file_idx;

		if (! dat_buffs[DAT_file_idx].ind_was_initialized)
		{
			ostringstream os,os_batch;
			os << DAT_file_idx;
			os_batch << batch;
			string path = out_dir + "/" + name + "_" + os_batch.str() + "_" + os.str() + ".dat";
			dat_buffs[DAT_file_idx].init(path,dat_buff_size);
		}

		int charge = ssf->charge;
		if (charge<0 || charge>1000)
			charge=0;

	
		dat_buffs[DAT_file_idx].add_spec_to_DAT_file(
			m_over_z,
			charge,
			file_idx,
			scan_number,ssf->retention_time,
			precursor_intensity,
			num_spec_peaks,(char *)dat_peak_buff);

		num_spectra_extracted++;
	}
	cout << num_spectra_extracted << " spectra..." << endl;

	return num_spectra_extracted;
}

int DAT_Converter::convert_PKL_dir_to_DAT(Config* config, char *file_list, int file_start_idx,
							mass_t min_m_over_z, mass_t max_m_over_z)
{
	FILE *list_stream=fopen(file_list,"r");
	if (! list_stream)
	{
		cout << "Error: couldn't open pkl dir list for reading: " << file_list << endl;
		exit(1);
	}

	char line_buff[1024];

	cout << "Read pkl files (idx path #spectra):" << endl;

	int total_extracted=0;
	int pkl_dir_idx=0;
	while (fgets(line_buff,1024,list_stream))
	{
		istringstream is(line_buff);

		string pkl_dir_path = "";
		string tsv_file = "";

		is >> pkl_dir_path >> tsv_file;

		if (pkl_dir_path.length()>2 && tsv_file.length() >2)
		{
			FileManager fm;

			fm.init_from_single_pkl_dir(config, pkl_dir_path, tsv_file, pkl_dir_idx,
										min_m_over_z, max_m_over_z);

			FileSet fs;
			fs.select_all_files(fm);
			const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
			
			cout << pkl_dir_idx << "\t" << pkl_dir_path << "\t" << all_ssf.size() << endl;


			static QCPeak *peak_buff=NULL;    
			static float  *dat_peak_buff=NULL;

			if (! peak_buff)
			{
				peak_buff     = new QCPeak[20000];
				dat_peak_buff = new  float[40000];
			}

			if (! peak_buff || ! dat_peak_buff)
			{
				cout << "Error: couldn't allocate memory for DAT conversion!" << endl;
				exit(1);
			}

			if (! ind_was_initialized)
			{
				cout << "Error: must initialize DAT_Converter!" << endl;
				exit(0);
			}


			BasicSpecReader bsr;

			int num_spectra_extracted=0;
			int i;
			for (i=0; i<all_ssf.size(); i++)
			{

				PKL_single *ssf = (PKL_single *)all_ssf[i];
				// no small spectra
				if (ssf->num_peaks<5 || ssf->m_over_z<=min_m_over_z || ssf->m_over_z > max_m_over_z)
					continue;

				int j,pos=0;
				int num_spec_peaks = bsr.read_basic_spec(config,fm,ssf,peak_buff,true);

				if (num_spec_peaks<5)
					continue;


				mass_t m_over_z = ssf->m_over_z;
				int scan_number=ssf->scan_number;
				float precursor_intensity= (ssf->precursor_intensity >0 ? ssf->precursor_intensity : 0);
		
				// copy peaks
		
				for (j=0; j<num_spec_peaks; j++)
				{
					dat_peak_buff[pos++]=(float)peak_buff[j].mass;
					dat_peak_buff[pos++]=(float)peak_buff[j].intensity;
				}

		
				// add spectrum
				int DAT_file_idx =  (int)(m_over_z/mass_increment);
				if (DAT_file_idx > max_dat_file_idx)
					DAT_file_idx = max_dat_file_idx;

				if (! dat_buffs[DAT_file_idx].ind_was_initialized)
				{
					ostringstream os,os_batch;
					os << DAT_file_idx;
					os_batch << batch;
					string path = out_dir + "/" + name + "_" + os_batch.str() + "_" + os.str() + ".dat";
					dat_buffs[DAT_file_idx].init(path,dat_buff_size);
				}

				int charge = ssf->charge;
				if (charge<0 || charge>1000)
					charge=0;

	
				dat_buffs[DAT_file_idx].add_spec_to_DAT_file(
					m_over_z,
					charge,
					pkl_dir_idx + file_start_idx,
					scan_number,ssf->retention_time,
					precursor_intensity,
					num_spec_peaks,(char *)dat_peak_buff);

				num_spectra_extracted++;
			}
			cout << "(" << num_spectra_extracted << ")" << endl;

			total_extracted += num_spectra_extracted;

			pkl_dir_idx++;
		}
	}

	return total_extracted;
}

void DAT_Converter::convert_files_to_DAT_on_the_fly(Config* config, char *file_list, 
							char * _out_dir, char * _name, int _batch, 
							mass_t min_m_over_z, 
							mass_t max_m_over_z, 
							int file_start_idx,
							bool ind_is_pkl_dir)
{
	name = _name;
	out_dir = _out_dir;
	batch = _batch;

	int num_spectra_extracted=0;

	name = _name;
	out_dir = _out_dir;

	if (! ind_was_initialized)
	{
		cout << "Error: must initialize DAT_Converter!" << endl;
		exit(0);
	}


	if (ind_is_pkl_dir)
	{
		num_spectra_extracted = convert_PKL_dir_to_DAT(config,file_list, file_start_idx, min_m_over_z,max_m_over_z);
	}
	else
	{
		vector<string> list;
		read_paths_into_list(file_list,list);

		cout << endl << endl <<"Extracting spectra and writing dat files for " << list.size() << " Files. " << endl << endl;

		int i;
		for (i=0; i<list.size(); i++)
		{
			int file_type = get_file_extension_type(list[i]);

			if (file_type == MZXML)
			{
				num_spectra_extracted += parse_single_MZXML_file(config,list[i],file_start_idx+i,
					min_m_over_z,max_m_over_z);
			}
			else
			{
				num_spectra_extracted += convert_single_non_MZXML_file_to_DAT(config,list[i],
											min_m_over_z,max_m_over_z,file_start_idx+i);
			}
		}
	}
	

	int d;
	for (d=0; d<dat_buffs.size(); d++)
	{
		if (dat_buffs[d].ind_was_initialized && dat_buffs[d].pos > dat_buffs[d].buff)
			dat_buffs[d].flush_buff();
	}
	
	
	cout << "Total spectra extracted and converted to DAT: " << num_spectra_extracted << endl;

	create_list_file();
}





int DAT_Converter::parse_single_MZXML_file(Config *config, 
										   string& mzxml_name, 
										   int file_idx,
										   mass_t min_m_over_z, 
										   mass_t max_m_over_z)
{
	static char* Buffer = NULL;
    static char* DecodedPeakBuffer = NULL;
    static float* Peaks = NULL;
	static float* FilteredPeaks = NULL;
	static int PeakBufferSize = 0;
    int Trail;
    static char* PrecursorStr;
    int FloatIndex;
    char* ByteOrderStr;
    int ByteOrderLittle = 1;

    int BytesToRead;
    int BufferStartPos = 0;
    int BytesRead;
    int BufferEnd = 0;
    FILE* MZXMLFile;
    int ParseState = 0;
    int FilePos = 0;
    

    // allocate
	if (! Buffer)
		Buffer = (char*)calloc(XML_BUFFER_SIZE + 1, sizeof(char));

    MZXMLFile = fopen(mzxml_name.c_str(), "rb");
    if (!MZXMLFile)
    {
        cout << "Error: Can't open MZXML file " <<  mzxml_name << endl;
        exit(1);
    }

    printf("File idx %d , '%s'...\n",file_idx, mzxml_name.c_str());

	int idx_in_file = 0;
	int spec_counter =0;
	char *scan_start_ptr = NULL;
    while (1)
    {
		char* ScanStr;
		char* ScanNumberStr;
		int ScanNumber;
		char* MSLevelStr;
		int MSLevel;
		char *PrecursorStr;
		mass_t PrecursorMZ;
		char *retentionTimeStr;
		float retentionTime;
		char *precursorIntensityStr;
		float precursorIntensity;
		char* PeakStr;
		char* PeakBuffer;
		int  BufferPos;

        // Read more data, to fill up the buffer:
	
		if ( ! scan_start_ptr || 
			( (Buffer + BufferEnd - scan_start_ptr) < XML_BUFFER_HALF_SIZE) )
		{
			// try shunt half of the buffer
			if (scan_start_ptr)
			{
				if (BufferEnd - XML_BUFFER_HALF_SIZE>0)
				{
					memmove(Buffer, Buffer + XML_BUFFER_HALF_SIZE, BufferEnd - XML_BUFFER_HALF_SIZE);
					BufferEnd -= XML_BUFFER_HALF_SIZE;
					scan_start_ptr -= XML_BUFFER_HALF_SIZE;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -