📄 wavex01.cu
字号:
s[15] = 0x71;
fwrite(s, 1, 16, fo);
//CUE Chunk
s[0] = 'c';
s[1] = 'u';
s[2] = 'e';
s[3] = ' ';
fwrite(s, 1, 4, fo);
//CUE Chunk size
temp_l = 0x34;
fwrite(&(temp_l), sizeof(long), 1, fo);
//blank CUE Chunk
temp_l = 0;
for (int i = 0; i < 13; i++) fwrite(&(temp_l), sizeof(long), 1, fo);
}
// data chunk
s[0] = 'd';
s[1] = 'a';
s[2] = 't';
s[3] = 'a';
fwrite(s, 1, 4, fo);
// data chunk size
fwrite(&(len), sizeof(long), 1, fo);
}
// Soft limiter applied to every output sample before PCM conversion.
// Samples with |v| <= 0.95 pass through unchanged. Beyond that the excess is
// pushed through a logistic curve, so the result is continuous at +/-0.95,
// symmetric about zero and bounded by +/-1.0.
// The same curve is used for the 16 bit and the 24 bit path; the 24 bit path
// previously used "-0.95 - (sigmoid + 0.5) * 0.1" for the negative side, which
// jumps to -1.05 right below -0.95 and wraps when packed into three bytes.
static double WavexSoftClip(double v)
{
    if (v > 0.95)
    {
        return 0.95 + ((1.0 / (1.0 + exp(-20.0 * (v - 0.95))) - 0.5) * 0.1);
    }
    if (v < -0.95)
    {
        return -0.95 - (0.5 - (1.0 / (1.0 + exp(-20.0 * (v + 0.95))))) * 0.1;
    }
    return v;
}

// Scales a normalized sample by fullScale (32768 for 16 bit, 8388608 for
// 24 bit) and saturates to [-fullScale, fullScale - 1] before truncating
// toward zero. Without the saturation a limiter output of exactly 1.0 would
// convert to +fullScale, which does not fit and wraps to full negative.
static long WavexToPcm(double v, double fullScale)
{
    double scaled = v * fullScale;
    if (scaled > fullScale - 1.0) scaled = fullScale - 1.0;
    if (scaled < -fullScale) scaled = -fullScale;
    return (long)scaled;
}

// Releases everything ProcessWav may have acquired. Every argument may be
// NULL (hostOut entries too), so it can be called from any error path.
static void WavexRelease(FILE *inFile, FILE *outFile, unsigned char *wavBytes,
                         float *hostIn, float *devIn, float *devOut,
                         float **hostOut, int hostOutCount,
                         short *pcm16, unsigned char *pcm24)
{
    if (inFile != NULL) fclose(inFile);
    if (outFile != NULL) fclose(outFile);
    free(wavBytes);
    if (hostIn != NULL) cudaFreeHost(hostIn);
    if (devIn != NULL) cudaFree(devIn);
    if (devOut != NULL) cudaFree(devOut);
    for (int b = 0; b < hostOutCount; b++)
    {
        if (hostOut[b] != NULL) cudaFreeHost(hostOut[b]);
    }
    if (pcm16 != NULL) cudaFreeHost(pcm16);
    if (pcm24 != NULL) cudaFreeHost(pcm24);
}

// Reads the "data" chunk of a 16 bit stereo WAV file, runs every enabled FIR
// way over the left and right channel on the GPU in packets of PacketLength
// samples, and (when FileOut is set) writes the interleaved multi-channel
// result to outfilename in 16 or 24 bit PCM.
//
// Parameters:
//   infilename  : path of the input WAV file.
//   outfilename : path of the output file; only opened when FileOut is set.
// Returns 0 on success, -1 on any failure (unreadable input, missing data
// chunk, allocation failure, output file error).
//
// Local buffers:
//   p              : host copy of the WAV data bytes (4 bytes per L/R frame),
//                    zero filled, with the file data placed DEFTAPS*2 bytes in.
//   h_idata        : pinned host staging buffer, PacketLength + DEFTAPS floats.
//   d_idata        : device input buffer, same length as h_idata.
//   d_odata        : device result buffer, PacketLength floats.
//   h_odata[]      : pinned host result buffers, one per way and channel
//                    (index = way*2 + channel), PacketLength floats each.
//   sWave_Out_Data : pinned host buffer holding one packet as 16 bit PCM.
//   wWave_Out_Data : pinned host buffer holding one packet as 24 bit PCM.
// File-level state used (declared outside this function):
//   FIR[j].DelayLength, FIR[j].chPointer[k] (k = 0 left, 1 right; advanced by
//   this function), FIR[j].FIRCoeff, FIR[j].dOffset, DivideCount, FileOut,
//   OutputFormat_Bit, SampleRate, DEFTAPS, coeff_Kernel, calcFIR.
int ProcessWav(char *infilename, char *outfilename)
{
    // h_odata has a fixed number of slots: 4 ways * (L, R).
    const int kHostOutBuffers = 8;

    FILE *f = NULL;
    FILE *fo = NULL;
    unsigned long WaveBytesLen = 0;
    unsigned char s[10];
    bool foundData = false;
    int result = 0;

    // All resources start out NULL so WavexRelease can run from any point.
    unsigned char *p = NULL;
    float *h_idata = NULL;
    float *d_idata = NULL;
    float *d_odata = NULL;
    float *h_odata[8] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
    short *sWave_Out_Data = NULL;
    unsigned char *wWave_Out_Data = NULL;
    cudaError_t cret;

    CUT_DEVICE_INIT();
    printf("finename = '%s'\n", infilename);
    f = fopen(infilename, "rb");
    // NOTE(review): CheckInFile is defined elsewhere; whether it accepts a NULL
    // handle or closes f on failure is not visible here, so f is deliberately
    // left alone on this path.
    if (CheckInFile(f) == -1)
        return -1;

    // Walk the chunks until "data"; every other chunk is hex-dumped and skipped.
    while (fread(s, 4, 1, f) == 1)
    {
        WaveBytesLen = read4bytes(f);
        s[4] = 0;
        printf("[%s] (%lu bytes)\n", s, WaveBytesLen);
        if (memcmp(s, "data", 4) == 0)
        {
            foundData = true;
            break;
        }
        for (int i = 0; i < (int)WaveBytesLen; i++)
            printf("%02x ", fgetc(f));
        printf("\n");
    }
    if (!foundData)
    {
        // Without this check WaveBytesLen would be the size of the last
        // non-data chunk (or never assigned) and be used as the sample length.
        printf("No data chunk found in %s\n", infilename);
        fclose(f);
        return -1;
    }

    const unsigned int PacketLength = 65536;
    const int packetLen = (int)PacketLength;
    //File Data Size
    int TotalDataSize = 0;
    //Delay size: the largest DelayLength over all ways
    int MaxDelaySize = 0;
    //Padding Size
    int PaddingLen = 0;
    //Sample Size: 4 bytes per frame (16 bit * L,R)
    int SampleCount = WaveBytesLen / 4;
    //Size of actually to be calculated
    int CalculationSize = 0;

    if (DivideCount * 2 > kHostOutBuffers)
    {
        // h_odata is indexed by way*2 + channel below.
        printf("Too many output ways (%d).\n", (int)DivideCount);
        fclose(f);
        return -1;
    }

    // The sizes below drive the buffer allocation and the packet loop, so they
    // must be computed whether or not a file is written. They used to live
    // inside the FileOut branch, which left the wave buffer far too small for
    // the fread below when FileOut was off.
    for (int d = 0; d < DivideCount; d++)
    {
        if (FIR[d].DelayLength > MaxDelaySize)
            MaxDelaySize = FIR[d].DelayLength;
    }
    // Calculation have to be N x PacketLength.
    // Each calculation requires PacketLength + DEFTAPS samples.
    CalculationSize = (DEFTAPS + MaxDelaySize + SampleCount);
    // 1..PacketLength frames, so CalculationSize + PaddingLen is a multiple
    // of PacketLength.
    PaddingLen = PacketLength - (CalculationSize % PacketLength);

    //File Output Mode, Prepare output file
    if (FileOut)
    {
        fo = fopen(outfilename, "wb");
        if (fo == NULL) {
            printf("Can not create %s\n", outfilename);
            fclose(f);
            return -1;
        }
        //TotalSize = (WaveLength + MaxDelay + padding) * ways * 2 channels * bytes per sample
        TotalDataSize = (CalculationSize + PaddingLen) * DivideCount * 2 * (OutputFormat_Bit/8) ;
        PrepareOutFile(fo, TotalDataSize, DivideCount * 2, SampleRate, OutputFormat_Bit);
    }

    // Host buffer for CalculationSize + Padding + trailing TAPS frames.
    // calloc zero-fills it, which covers the head tap area, the tail and any
    // bytes a short read leaves behind.
    const size_t bufFrames = (size_t)CalculationSize + (size_t)PaddingLen + (size_t)DEFTAPS;
    const size_t bufBytes = bufFrames * sizeof(short) * 2;
    p = (unsigned char *)calloc(bufFrames, sizeof(short) * 2);
    if (p == NULL)
    {
        printf("Can not allocate %lu bytes host memory.\n", (unsigned long)bufBytes);
        WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                     sWave_Out_Data, wWave_Out_Data);
        return -1;
    }
    //Copy File to Memory, offset by the head tap area
    size_t readsize = fread((p + DEFTAPS*2), 1, WaveBytesLen, f);
    if (readsize != (size_t)WaveBytesLen)
    {
        // Processing continues; the missing part of the buffer stays zero.
        printf("Warning: data chunk is %lu bytes but only %lu bytes were read.\n",
               WaveBytesLen, (unsigned long)readsize);
    }
    // Now WAV file is ready in Host Memory.

    //Prepare Host and Device Memory
    int InputDataLen = PacketLength + DEFTAPS;
    cret = cudaMallocHost((void**)&h_idata, sizeof(float)*InputDataLen);
    if (cret != cudaSuccess)
    {
        printf("Can not allocate %lu bytes host memory.\n", (unsigned long)(sizeof(float)*InputDataLen));
        WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                     sWave_Out_Data, wWave_Out_Data);
        return -1;
    }
    //allocate GPU device input memory
    // NOTE(review): CUDA_SAFE_CALL / CUT_CHECK_ERROR are defined elsewhere;
    // what they do on failure is not visible from here.
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_idata, sizeof(float)*InputDataLen) );
    // allocate GPU device memory for result
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_odata, sizeof(float)*PacketLength) );
    //allocate host output (from GPU) memory, one buffer per way and channel
    for (int b = 0; b < kHostOutBuffers; b++)
    {
        cret = cudaMallocHost((void**)&(h_odata[b]), sizeof(float)*PacketLength);
        if (cret != cudaSuccess)
        {
            printf("Can not allocate %lu bytes host memory.\n", (unsigned long)(sizeof(float)*PacketLength));
            WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                         sWave_Out_Data, wWave_Out_Data);
            return -1;
        }
    }
    //allocate host WAVE format buffer
    //16bit buffer. 1L1L 1R1R 2L2L 2R2R 3L3L 3R3R 4L4L 4R4R
    cret = cudaMallocHost((void**)&sWave_Out_Data, PacketLength*sizeof(short)*2*DivideCount);
    if (cret != cudaSuccess)
    {
        printf("Can not allocate %lu bytes host memory.\n", (unsigned long)(PacketLength*sizeof(short)*2*DivideCount));
        WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                     sWave_Out_Data, wWave_Out_Data);
        return -1;
    }
    //24bit buffer. 1L1L1L 1R1R1R 2L2L2L ...
    cret = cudaMallocHost((void**)&wWave_Out_Data, PacketLength*sizeof(char)*3*2*DivideCount);
    if (cret != cudaSuccess)
    {
        printf("Can not allocate %lu bytes host memory.\n", (unsigned long)(PacketLength*sizeof(char)*3*2*DivideCount));
        WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                     sWave_Out_Data, wWave_Out_Data);
        return -1;
    }

    // setup execution parameters
    dim3 grid( CUDABLOCKS, 1, 1);
    dim3 threads( THREAD_NUM, 1, 1);

    //Loop for each Packet (total length / PacketLength)
    int LoopCount = 0;
    LoopCount = (CalculationSize + PaddingLen) / PacketLength;
    for (int i = 0; i < LoopCount; i++)
    {
        //Loop over the ways: each one needs its own coefficients.
        for (int j = 0; j < DivideCount; j++)
        {
            //Loop for L, R: stage PacketLength + DEFTAPS samples to process.
            for (int k = 0; k < 2; k++)
            {
                unsigned char LData1, LData2;
                short LData;
                for (int m = 0; m < InputDataLen; m++)
                {
                    if (FIR[j].chPointer[k] >= 0)
                    {
                        //regular case: little-endian 16 bit sample of channel k
                        //in frame chPointer, normalized to [-1, 1).
                        LData1 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 ) );
                        LData2 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 + 1) );
                        LData = (short)(LData1 + (LData2<<8));
                        h_idata[m] = ((float)(LData))/32768.0f;
                    }
                    else
                    {
                        //negative pointer: still inside the channel delay, feed silence.
                        h_idata[m] = 0.0f;
                    }
                    FIR[j].chPointer[k] += 1;
                }
                //The loop above consumed PacketLength + DEFTAPS frames; step back
                //by DEFTAPS so the next packet starts PacketLength frames later.
                FIR[j].chPointer[k] -= DEFTAPS;
                // copy host memory to device
                CUDA_SAFE_CALL( cudaMemcpy(d_idata, h_idata,sizeof(float)*InputDataLen,cudaMemcpyHostToDevice));
                //copy coeffs to constant
                CUDA_SAFE_CALL( cudaMemcpyToSymbol(coeff_Kernel, FIR[j].FIRCoeff, sizeof(float)*DEFTAPS) );
                //Call Kernel
                CUDA_SAFE_CALL( cudaThreadSynchronize() );
                calcFIR<<<grid,threads>>>(d_idata, d_odata, PacketLength);
                CUT_CHECK_ERROR("calcFIR failed");
                CUDA_SAFE_CALL( cudaThreadSynchronize() );
                //Get processed data to Packet Output Buffer
                CUDA_SAFE_CALL( cudaMemcpy( (void *)(h_odata[j*2 + k]), d_odata, sizeof(float)*PacketLength, cudaMemcpyDeviceToHost) );
            }//L, R Loop End
        }//ways Loop End
        printf("Calculating %4d / %4d \r", i+1, LoopCount);

        //All per-way packet buffers are ready: interleave them into the
        //PCM buffer for the selected bit depth.
        switch (OutputFormat_Bit)
        {
        case 16:
            for (int j = 0; j < DivideCount; j++)
            {
                for (int n = 0; n < packetLen; n++)
                {
                    // Promote to double before applying the per-way gain,
                    // as the original did.
                    double LValue = (h_odata[j*2 ][n]);
                    double RValue = (h_odata[j*2+1][n]);
                    LValue *= FIR[j].dOffset;
                    RValue *= FIR[j].dOffset;
                    LValue = WavexSoftClip(LValue);
                    RValue = WavexSoftClip(RValue);
                    sWave_Out_Data[n*DivideCount*2 + j*2 ] = (short)WavexToPcm(LValue, 32768.0);
                    sWave_Out_Data[n*DivideCount*2 + j*2 + 1] = (short)WavexToPcm(RValue, 32768.0);
                }
            }
            break;
        case 24:
            for (int n = 0; n < packetLen; n++)
            {
                for (int j = 0; j < DivideCount; j++)
                {
                    double LValue = (h_odata[j*2 ][n]);
                    double RValue = (h_odata[j*2+1][n]);
                    LValue *= FIR[j].dOffset;
                    RValue *= FIR[j].dOffset;
                    LValue = WavexSoftClip(LValue);
                    RValue = WavexSoftClip(RValue);
                    long LConv = WavexToPcm(LValue, 256.0 * 32768.0);
                    long RConv = WavexToPcm(RValue, 256.0 * 32768.0);
                    unsigned char *dst = wWave_Out_Data + n*DivideCount*3*2 + j*3*2;
                    dst[0] = (unsigned char)( LConv & 0xFF); //lower byte
                    dst[1] = (unsigned char)((LConv>>8 ) & 0xFF); //mid byte
                    dst[2] = (unsigned char)((LConv>>16) & 0xFF); //upper byte
                    dst[3] = (unsigned char)( RConv & 0xFF); //lower byte
                    dst[4] = (unsigned char)((RConv>>8 ) & 0xFF); //mid byte
                    dst[5] = (unsigned char)((RConv>>16) & 0xFF); //upper byte
                }
            }
            break;
        }

        //if streamout = true
        //Push to FX2 stream
        //blocked until PC buffer < Half Full
        if (FileOut)
        {
            //write packet data to file
            size_t wanted = 0;
            size_t written = 0;
            switch (OutputFormat_Bit)
            {
            case 16:
                wanted = (size_t)PacketLength*DivideCount*2;
                written = fwrite(sWave_Out_Data, sizeof(short), wanted, fo);
                break;
            case 24:
                wanted = (size_t)PacketLength*DivideCount*2*3;
                written = fwrite(wWave_Out_Data, sizeof(char), wanted, fo);
                break;
            }
            if (written != wanted)
            {
                // Stop instead of silently producing a truncated file.
                printf("\nCan not write to %s\n", outfilename);
                result = -1;
                break;
            }
        }
        //End Loop for Packet
    }
    if (result == 0)
        printf("\ndone FIR processing.\n");

    //Free Memory
    WavexRelease(f, fo, p, h_idata, d_idata, d_odata, h_odata, kHostOutBuffers,
                 sWave_Out_Data, wWave_Out_Data);
    return result;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -