📄 matrixmultexture_kernel.cu
字号:
// Edge length of the square thread block used for the kernel launch
// (the host wrapper launches BLOCK_SIZE x BLOCK_SIZE threads per block).
#define BLOCK_SIZE 16
// File-scope 2D float texture references. The host wrapper binds each one to
// a cudaArray holding an input matrix, and the kernel reads them with tex2D().
// cudaReadModeElementType: fetches return the stored float values as-is.
texture<float,2,cudaReadModeElementType> texMatrixA;
texture<float,2,cudaReadModeElementType> texMatrixB;
/*
 * Matrix-multiply kernel that reads both operands through the file-scope
 * textures texMatrixA and texMatrixB. One thread produces one element of C.
 *
 * With i = global y index and j = global x index, the thread computes
 *     acc = sum over k in [0, wA) of
 *           tex2D(texMatrixA, i, k) * tex2D(texMatrixB, k, j)
 * and stores it at C[j*hA + i].
 *
 * Launch layout: 2D grid whose x extent covers at least wB threads and whose
 * y extent covers at least hA threads. Threads outside that range exit
 * immediately, so the grid may be rounded up to whole blocks.
 *
 * C  - device buffer with room for hA*wB floats.
 * hA - number of outputs along the y index (texture A x-extent).
 * wA - length of the reduction (texture A y-extent / texture B x-extent).
 * wB - number of outputs along the x index (texture B y-extent).
 */
__global__ void MatMulTexture_kernel(float *C,int hA,int wA,int wB)
{
    const int j = blockIdx.x * blockDim.x + threadIdx.x;   // valid range [0, wB)
    const int i = blockIdx.y * blockDim.y + threadIdx.y;   // valid range [0, hA)

    // The launch grid is rounded up to whole blocks; edge threads have no
    // output element and must not write.
    if (j >= wB || i >= hA)
        return;

    float acc = 0.0f;
    for (int k = 0; k < wA; ++k)
        acc += tex2D(texMatrixA, i, k) * tex2D(texMatrixB, k, j);

    // Leading dimension is hA: since j < wB and i < hA, j*hA + i addresses
    // each of the hA*wB outputs exactly once. Widened before multiplying so
    // the offset cannot overflow int.
    C[(size_t)j * hA + i] = acc;
}

/*
 * Host wrapper: uploads A and B into CUDA arrays, binds them to texMatrixA /
 * texMatrixB, runs MatMulTexture_kernel and copies the result back into C.
 *
 * Data layout as established by the calls below:
 *   - texture A is hA texels wide and wA texels high and is filled linearly
 *     from A, so texel (x=i, y=k) is A[k*hA + i];
 *   - texture B is wA texels wide and wB texels high and is filled linearly
 *     from B, so texel (x=k, y=j) is B[j*wA + k];
 *   - the kernel writes C[j*hA + i] = sum_k A[k*hA + i] * B[j*wA + k].
 * That is C = A * B for column-major A (hA x wA), B (wA x wB), C (hA x wB).
 * NOTE(review): callers passing square row-major matrices therefore get
 * B * A in row-major terms - confirm which convention the callers expect.
 *
 * A, B - host input buffers of hA*wA and wA*wB floats.
 * C    - host output buffer of hA*wB floats.
 *
 * The function returns void, so failures cannot be reported directly: on a
 * null pointer, a non-positive dimension, or any CUDA error the remaining
 * steps are skipped, every resource acquired so far is released, and C is
 * not written by the result copy.
 */
void MatMulTexture(float *A,float *B,float *C,int hA,int wA,int wB)
{
    if (!A || !B || !C || hA <= 0 || wA <= 0 || wB <= 0)
        return;

    // sizeof first, so the products are evaluated in size_t rather than int.
    const size_t bytesA = sizeof(float) * hA * wA;
    const size_t bytesB = sizeof(float) * wA * wB;
    const size_t bytesC = sizeof(float) * hA * wB;

    float *d_C = 0;
    cudaArray *cuArray_A = 0;
    cudaArray *cuArray_B = 0;
    bool boundA = false;
    bool boundB = false;

    // Both arrays hold single-channel floats, so one descriptor serves both.
    cudaChannelFormatDesc floatDesc = cudaCreateChannelDesc<float>();

    // Each step runs only if everything before it succeeded.
    cudaError_t status = cudaMalloc((void**)&d_C, bytesC);
    if (status == cudaSuccess)
        status = cudaMallocArray(&cuArray_A, &floatDesc, hA, wA);   // width hA, height wA
    if (status == cudaSuccess)
        status = cudaMallocArray(&cuArray_B, &floatDesc, wA, wB);   // width wA, height wB
    if (status == cudaSuccess)
        status = cudaMemcpyToArray(cuArray_A, 0, 0, A, bytesA, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpyToArray(cuArray_B, 0, 0, B, bytesB, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        status = cudaBindTextureToArray(texMatrixA, cuArray_A);
        boundA = (status == cudaSuccess);
    }
    if (status == cudaSuccess) {
        status = cudaBindTextureToArray(texMatrixB, cuArray_B);
        boundB = (status == cudaSuccess);
    }

    if (status == cudaSuccess) {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        // Round up so dimensions that are not multiples of BLOCK_SIZE are
        // fully covered; the kernel discards the surplus threads.
        dim3 grid((wB + BLOCK_SIZE - 1) / BLOCK_SIZE,
                  (hA + BLOCK_SIZE - 1) / BLOCK_SIZE);
        MatMulTexture_kernel<<<grid,block>>>(d_C,hA,wA,wB);
        // A launch does not return a status; invalid configurations show up here.
        status = cudaGetLastError();
    }

    // Blocking copy: also waits for the kernel and returns its execution errors.
    if (status == cudaSuccess)
        status = cudaMemcpy(C, d_C, bytesC, cudaMemcpyDeviceToHost);

    // Cleanup runs on every path. Only textures that were actually bound are
    // unbound; the free calls are given either a valid handle or 0.
    if (boundA)
        cudaUnbindTexture(texMatrixA);
    if (boundB)
        cudaUnbindTexture(texMatrixB);
    cudaFree(d_C);
    cudaFreeArray(cuArray_A);
    cudaFreeArray(cuArray_B);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -