📄 wavex01.cu

📁 一个基于GPU运算的FIR滤波器程序
💻 CU
📖 第 1 页 / 共 2 页
字号:
上一页 12
		s[15] = 0x71;
		fwrite(s, 1, 16, fo);

		//CUE Chunk
		s[0] = 'c';
		s[1] = 'u';
		s[2] = 'e';
		s[3] = ' ';
		fwrite(s, 1, 4, fo);
		//CUE Chunk size
		temp_l = 0x34;
		fwrite(&(temp_l), sizeof(long), 1, fo);
		//blank CUE Chunk
		temp_l = 0;
		for (int i = 0; i < 13; i++)	fwrite(&(temp_l), sizeof(long), 1, fo);

	}

	// data chunk
	s[0] = 'd';
	s[1] = 'a';
	s[2] = 't';
	s[3] = 'a';
	fwrite(s, 1, 4, fo);
	// data chunk size
	fwrite(&(len), sizeof(long), 1, fo);
}

// Soft limiter applied to a normalized sample before integer conversion.
// Samples with |v| <= 0.95 pass through unchanged. Beyond that, the excess is
// pushed through a shifted sigmoid so the result is continuous at +/-0.95 and
// approaches +/-1.0 asymptotically.
static double SoftLimitSample(double v)
{
	if (v > 0.95)
		return 0.95 + ((1.0 / (1.0 + exp(-20.0 * (v - 0.95)))) - 0.5) * 0.1;
	if (v < -0.95)
		return -0.95 - (0.5 - (1.0 / (1.0 + exp(-20.0 * (v + 0.95))))) * 0.1;
	return v;
}

// Converts a normalized sample to a signed integer code with saturation.
// fullScale is 2^(bits-1) (32768.0 for 16 bit, 8388608.0 for 24 bit); the
// result is limited to [-fullScale, fullScale - 1] and truncated toward zero.
static long QuantizeSample(double v, double fullScale)
{
	double scaled = v * fullScale;
	if (scaled != scaled)			// NaN: emit silence instead of an undefined cast
		return 0;
	if (scaled >= fullScale - 1.0)
		return (long)(fullScale - 1.0);
	if (scaled <= -fullScale)
		return (long)(-fullScale);
	return (long)scaled;
}

// Releases everything ProcessWav may have acquired. Every argument may be
// NULL; NULL entries are skipped so this can be called from any failure point.
static void ReleaseWavResources(FILE *f, FILE *fo, unsigned char *p,
								float *h_idata, float *d_idata, float *d_odata,
								float **h_odata, int h_odata_count,
								short *sWave_Out_Data, unsigned char *wWave_Out_Data)
{
	if (f != NULL)  fclose(f);
	if (fo != NULL) fclose(fo);
	free(p);

	if (h_idata != NULL) cudaFreeHost(h_idata);
	if (d_idata != NULL) cudaFree(d_idata);
	if (d_odata != NULL) cudaFree(d_odata);
	for (int i = 0; i < h_odata_count; i++)
	{
		if (h_odata[i] != NULL) cudaFreeHost(h_odata[i]);
	}
	if (sWave_Out_Data != NULL) cudaFreeHost(sWave_Out_Data);
	if (wWave_Out_Data != NULL) cudaFreeHost(wWave_Out_Data);
}

// Reads the sample data of a WAV file, runs every enabled FIR way over it on
// the GPU one packet at a time, and (when FileOut is set) writes the
// interleaved multi-way result to outfilename.
//
// The input is read as 4 bytes per sample frame: two channels (L, R) of
// little-endian 16 bit samples.
//
// Parameters:
//   infilename  - path of the source WAV file.
//   outfilename - path of the file to create; only used when FileOut is set.
//
// Returns 0 on success, -1 on any failure (input check, missing data chunk,
// file creation, memory allocation or file write).
int ProcessWav(char *infilename, char *outfilename)
{

	//Variables
	//p					: pointer for original wav data memory. (BYTE pointer)
	//WaveBytesLen		: Total Data BYTEs in original wav file.
	//PacketLength		: Processing in GPU, Data Count. handled data is float.
	//TotalDataSize		: Resulting WAVx file, data chunk size.
	//MaxDelaySize		: largest per-way delay (in samples); it affects the result size.
	//h_idata			: memory, Host, to put into GPU.
	//InputDataLen		: Posted Data length as float, = PacketLength + DEFTAPS.
	//d_idata			: memory, GPU, to store input wave data. length is PacketLength + DEFTAPS.
	//d_odata			: memory, GPU, result data stored. size is equal to input packet size. (not including coeffs tail size)
	//h_odata			: memory, Host, to store result. there are total 8 (4Way * L,R) buffers. size is same as packet.
	//sWave_Out_Data	: memory, Host, to store 16 bit WAVx format. type = short. PacketLength*sizeof(short)*2*DivideCount
	//wWave_Out_Data	: memory, Host, to store 24 bit WAVx format. type = 3 bytes. PacketLength*sizeof(char)*3*2*DivideCount
	//FIR[j].DelayLength: Channel Delay, as Sample Length.
	//FIR[j].chPointer[k]	: Current sampling pointer count for the channel. k=0(L),1(R)
	//FIR[j].FIRCoeff[m]: Channel FIR Coeff.
	//LoopCount			: number of packets to process.

	// Number of per-channel result buffers: 4 ways * (L, R).
	const int MaxOutBuffers = 8;

	FILE *f = NULL, *fo = NULL;
	unsigned long WaveBytesLen = 0;
	unsigned char s[10];
	int foundData = 0;

	// All owned resources start out NULL so that ReleaseWavResources can be
	// called safely from any failure point.
	unsigned char *p = NULL;
	float *h_idata = NULL;		//Host Data to Go GPU
	float *d_idata = NULL;
	float *d_odata = NULL;
	float *h_odata[MaxOutBuffers] = { 0 };
	short *sWave_Out_Data = NULL;
	unsigned char *wWave_Out_Data = NULL;
	cudaError_t cret;

	// h_odata is indexed with j*2 + k for j < DivideCount, so more ways than
	// buffers would write past the end of the array.
	if (DivideCount * 2 > MaxOutBuffers)
	{
		printf("DivideCount %d is larger than the %d supported ways.\n", (int)DivideCount, MaxOutBuffers / 2);
		return -1;
	}

	CUT_DEVICE_INIT();

	printf("finename = '%s'\n", infilename);
	f = fopen(infilename, "rb");

	// NOTE(review): f is not closed on this path because CheckInFile is not
	// visible here; confirm whether it closes the file (or handles NULL) on failure.
	if (CheckInFile(f) == -1)
		return -1;

	// Walk the chunks until the "data" chunk; other chunks are hex-dumped and skipped.
	while (fread(s, 4, 1, f) == 1)
	{
		WaveBytesLen = read4bytes(f);
		s[4] = 0;
		printf("[%s] (%lu bytes)\n", s, WaveBytesLen);
		if (memcmp(s, "data", 4) == 0)
		{
			foundData = 1;
			break;
		}
		for (int i = 0; i < (int)WaveBytesLen; i++)
		{
			int c = fgetc(f);
			if (c == EOF) break;		// truncated chunk: stop instead of dumping EOF markers
			printf("%02x ", c);
		}
		printf("\n");
	}

	if (!foundData)
	{
		// Without a data chunk WaveBytesLen does not describe sample data.
		printf("No data chunk found in %s\n", infilename);
		fclose(f);
		return -1;
	}

	const unsigned int PacketLength = 65536;

	//File Data Size
	int TotalDataSize = 0;
	//Delay size
	int MaxDelaySize = 0;
	//Padding Size
	int PaddingLen = 0;
	//Sample Size (4 bytes per L/R frame)
	int SampleCount = WaveBytesLen / 4;
	//Size of actually to be calculated
	int CalculationSize = 0;

	// The sizes below also determine the staging buffer and the packet count,
	// so they are computed whether or not a file is written. (Computing them
	// only for FileOut left the buffer too small for the fread further down.)
	for (int i = 0; i < DivideCount; i++)
	{
		if (FIR[i].DelayLength > MaxDelaySize)
			MaxDelaySize = FIR[i].DelayLength;
	}
	// Calculation have to be N x PacketLength.
	// Each calculation requires PacketLength + DEFTAPS samples.
	CalculationSize = (DEFTAPS + MaxDelaySize + SampleCount);
	PaddingLen = PacketLength - (CalculationSize % PacketLength);

	//File Output Mode, Prepare output file
	if (FileOut)
	{
		fo = fopen(outfilename, "wb");
		if (fo == NULL) {
			printf("Can not create %s\n", outfilename);
			fclose(f);
			return -1;
		}
		//TotalSize = (WaveLength + MaxDelay)*ways*Bytes
		TotalDataSize = (CalculationSize + PaddingLen) * DivideCount * 2 * (OutputFormat_Bit/8) ;

		PrepareOutFile(fo, TotalDataSize, DivideCount * 2, SampleRate, OutputFormat_Bit);
	}

	//allocate memory for CalculationSize + Padding + Tailing TAPS area bytes.
	// calloc zero-fills the whole buffer, which supplies the leading zeros in
	// front of the file data, the zero tail, and defined contents for any part
	// a short read leaves unfilled.
	size_t hostWavBytes = (CalculationSize + PaddingLen + DEFTAPS)*sizeof(short)*2;
	p = (unsigned char *)calloc(hostWavBytes, 1);
	if (p == NULL)
	{
		printf("Can not allocate %lu bytes host memory.\n", (unsigned long)hostWavBytes);
		ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
		return -1;
	}

	//Copy File to Memory, DEFTAPS*2 bytes into the buffer
	size_t readsize = fread((p + DEFTAPS*2), 1, WaveBytesLen ,f);
	if (readsize != WaveBytesLen)
	{
		printf("Warning: read %lu of %lu data bytes, remainder treated as silence.\n",
			(unsigned long)readsize, WaveBytesLen);
	}
	// Now WAV file is ready in Host Memory.

	//Prepare Host and Device Memory
	int InputDataLen = PacketLength + DEFTAPS;
	cret = cudaMallocHost((void**)&h_idata, sizeof(float)*InputDataLen);
	if (cret != CUDA_SUCCESS)
	{
		printf("Can not allocate %d bytes host memory.\n", (int)(sizeof(float)*InputDataLen));
		ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
		return -1;
	}

	//allocate GPU device input memory
	CUDA_SAFE_CALL(cudaMalloc((void**) &d_idata, sizeof(float)*InputDataLen) );
	// allocate GPU device memory for result
	CUDA_SAFE_CALL(cudaMalloc((void**) &d_odata, sizeof(float)*PacketLength) );

	//allocate host output (from GPU) memory, for 4 way * (L&R)
	for (int i = 0 ; i < MaxOutBuffers; i++)
	{
		cret = cudaMallocHost((void**)&(h_odata[i]), sizeof(float)*PacketLength);
		if (cret != CUDA_SUCCESS)
		{
			printf("Can not allocate %d bytes host memory.\n", (int)(sizeof(float)*PacketLength));
			ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
			return -1;
		}
	}

	//allocate host WAVE format buffer
	//16bit buffer. 1L1L 1R1R 2L2L 2R2R 3L3L 3R3R 4L4L 4R4R
	cret = cudaMallocHost((void**)&sWave_Out_Data, PacketLength*sizeof(short)*2*DivideCount);
	if (cret != CUDA_SUCCESS)
	{
		printf("Can not allocate %d bytes host memory.\n", (int)(PacketLength*sizeof(short)*2*DivideCount));
		ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
		return -1;
	}

	//24bit buffer. 1L1L1L 1R1R1R 2L2L2L ...
	cret = cudaMallocHost((void**)&wWave_Out_Data, PacketLength*sizeof(char)*3*2*DivideCount);
	if (cret != CUDA_SUCCESS)
	{
		printf("Can not allocate %d bytes host memory.\n", (int)(PacketLength*sizeof(char)*3*2*DivideCount));
		ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
		return -1;
	}

	// setup execution parameters
	dim3  grid( CUDABLOCKS, 1, 1);
	dim3  threads( THREAD_NUM, 1, 1);

	const int PacketSamples = (int)PacketLength;

	//Loop for each Packet
	int LoopCount = (CalculationSize + PaddingLen) / PacketLength;
	for (int i = 0; i < LoopCount; i++)
	{
		//Loop for each FIR way: Needs to change Coeffs.
		for (int j = 0; j < DivideCount; j++)
		{
			//copy coeffs to constant; they depend on the way only, so one copy serves both L and R
			CUDA_SAFE_CALL( cudaMemcpyToSymbol(coeff_Kernel, FIR[j].FIRCoeff, sizeof(float)*DEFTAPS) );

			//Loop for L, R: prepare PacketLength + DEFTAPS samples to process.
			for (int k = 0; k < 2; k++)
			{
				unsigned char LData1, LData2;
				short LData;

				for (int m = 0; m < InputDataLen; m++)
				{
					if (FIR[j].chPointer[k] >= 0)
					{
						//regular case. assemble the little-endian 16 bit sample of channel k
						LData1 =  (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2    ) );
						LData2 =  (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 + 1) );
						LData = (short)(LData1 + (LData2<<8));
						h_idata[m] = ((float)(LData))/32768.0;
					}
					else
					{
						//beginning, copy 0 for delay padding.
						h_idata[m] = 0.0;
					}
					FIR[j].chPointer[k] += 1;
				}
				//The DEFTAPS tail was consumed too; step back so the next packet starts right after this one.
				FIR[j].chPointer[k] -= DEFTAPS;

				// copy host memory to device
				CUDA_SAFE_CALL( cudaMemcpy(d_idata, h_idata,sizeof(float)*InputDataLen,cudaMemcpyHostToDevice));

				//Call Kernel
				CUDA_SAFE_CALL( cudaThreadSynchronize() );
				calcFIR<<<grid,threads>>>(d_idata, d_odata, PacketLength);
				CUT_CHECK_ERROR("calcFIR failed");
				CUDA_SAFE_CALL( cudaThreadSynchronize() );
				//Get processed data to Packet Output Buffer
				CUDA_SAFE_CALL( cudaMemcpy( (void *)(h_odata[j*2 + k]), d_odata, sizeof(float)*PacketLength, cudaMemcpyDeviceToHost) );
			}//L, R Loop End
		}//way Loop End
		printf("Calculating %4d / %4d \r", i+1, LoopCount);

		//Per-channel packet buffers ready
		//Copy to Wave Format Buffer, interleaved as frame -> way -> L,R
		switch (OutputFormat_Bit)
		{
		case 16:
			for (int j = 0; j < DivideCount; j++)
			{
				for (int n = 0; n < PacketSamples; n++)
				{
					double LValue = (h_odata[j*2  ][n]);
					double RValue = (h_odata[j*2+1][n]);

					LValue *= FIR[j].dOffset;
					RValue *= FIR[j].dOffset;

					LValue = SoftLimitSample(LValue);
					RValue = SoftLimitSample(RValue);

					sWave_Out_Data[n*DivideCount*2 + j*2    ] = (short)QuantizeSample(LValue, 32768.0);
					sWave_Out_Data[n*DivideCount*2 + j*2 + 1] = (short)QuantizeSample(RValue, 32768.0);
				}
			}
			break;
		case 24:
			for (int n = 0; n < PacketSamples; n++)
			{
				for (int j = 0; j < DivideCount; j++)
				{
					double LValue = (h_odata[j*2  ][n]);
					double RValue = (h_odata[j*2+1][n]);

					LValue *= FIR[j].dOffset;
					RValue *= FIR[j].dOffset;

					// Same limiter as the 16 bit path. The previous 24 bit negative
					// branch jumped to about -1.05 just below -0.95, outside 24 bit range.
					LValue = SoftLimitSample(LValue);
					RValue = SoftLimitSample(RValue);

					long LConv = QuantizeSample(LValue, 256.0 * 32768.0);
					long RConv = QuantizeSample(RValue, 256.0 * 32768.0);
					unsigned char *frame = wWave_Out_Data + n*DivideCount*3*2 + j*3*2;
					frame[0] = (unsigned char)( LConv      & 0xFF);	//lower byte
					frame[1] = (unsigned char)((LConv>>8 ) & 0xFF);	//mid byte
					frame[2] = (unsigned char)((LConv>>16) & 0xFF);	//upper byte
					frame[3] = (unsigned char)( RConv      & 0xFF);	//lower byte
					frame[4] = (unsigned char)((RConv>>8 ) & 0xFF);	//mid byte
					frame[5] = (unsigned char)((RConv>>16) & 0xFF);	//upper byte
				}
			}
			break;
		}

		//if streamout = true
			//Push to FX2 stream
			//blocked until PC buffer < Half Full

		if (FileOut)
		{
			//write packet data to file
			size_t wanted = 0;
			size_t written = 0;
			switch (OutputFormat_Bit)
			{
			case 16:
				wanted = PacketLength*DivideCount*2;
				written = fwrite(sWave_Out_Data, sizeof(short), wanted, fo);
				break;
			case 24:
				wanted = PacketLength*DivideCount*2*3;
				written = fwrite(wWave_Out_Data, sizeof(char), wanted, fo);
				break;
			}
			if (written != wanted)
			{
				// e.g. disk full: stop rather than silently produce a truncated file
				printf("\nCan not write to %s\n", outfilename);
				ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);
				return -1;
			}
		}
		//End Loop for Packet

	}
	printf("\ndone FIR processing.\n");

	//Free Memory
	ReleaseWavResources(f, fo, p, h_idata, d_idata, d_odata, h_odata, MaxOutBuffers, sWave_Out_Data, wWave_Out_Data);

	return 0;
}
上一页 12
💿 文件大小 300 K
👤 上传用户 susanxuwenjun
📂 所属分类并行计算
🏷️ 相关标签

#GPU #FIR #运算 #滤波器
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -