📄 fourway_matmult_4x4.cpp

📁 用于GPU通用计算的编程语言BrookGPU 0.4
💻 CPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
				"#var samplerRECT __structsampler1_a : TEXUNIT1 : texunit 1 : 1 : 1\n"
				"#var samplerRECT __structsampler2_a : TEXUNIT2 : texunit 2 : 2 : 1\n"
				"#var samplerRECT __structsampler3_a : TEXUNIT3 : texunit 3 : 3 : 1\n"
				"#var float2 _tex_a_pos : $vin.TEXCOORD0 : TEXCOORD0 : 4 : 1\n"
				"#var samplerRECT __structsampler0_b : TEXUNIT4 : texunit 4 : 5 : 1\n"
				"#var samplerRECT __structsampler1_b : TEXUNIT5 : texunit 5 : 6 : 1\n"
				"#var samplerRECT __structsampler2_b : TEXUNIT6 : texunit 6 : 7 : 1\n"
				"#var samplerRECT __structsampler3_b : TEXUNIT7 : texunit 7 : 8 : 1\n"
				"#var float2 _tex_b_pos : $vin.TEXCOORD1 : TEXCOORD1 : 9 : 1\n"
				"#var float4 __output_2 : $vout.COLOR0 : COLOR0 : 10 : 1\n"
				"#var float4 __workspace : C0 :  : 11 : 1\n"
				"DECLARE __workspace;\n"
				"TEX R0, f[TEX0].xyxx, TEX2, RECT;\n"
				"TEX R1, f[TEX1].xyxx, TEX7, RECT;\n"
				"DP4R R1.x, R0, R1;\n"
				"TEX R2, f[TEX1].xyxx, TEX6, RECT;\n"
				"TEX R3, f[TEX1].xyxx, TEX5, RECT;\n"
				"DP4R R1.y, R0, R2;\n"
				"DP4R R1.z, R0, R3;\n"
				"MOVR o[COLR].w, R1.x;\n"
				"MOVR o[COLR].z, R1.y;\n"
				"MOVR o[COLR].y, R1.z;\n"
				"TEX R1, f[TEX1].xyxx, TEX4, RECT;\n"
				"DP4R R0.x, R0, R1;\n"
				"MOVR o[COLR].x, R0.x;\n"
				"END \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:2:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 2)
			)
			.pass( gpu_pass_desc(
				"!!FP1.0\n"
				"# NV_fragment_program generated by NVIDIA Cg compiler\n"
				"# cgc version 1.1.0003, build date Jul  7 2003  11:55:19\n"
				"# command line args: -quiet -profile fp30 -DUSERECT=1 -DCGC=1\n"
				"#vendor NVIDIA Corporation\n"
				"#version 1.0.02\n"
				"#profile fp30\n"
				"#program main\n"
				"#semantic main.__structsampler0_a : TEXUNIT0\n"
				"#semantic main.__structsampler1_a : TEXUNIT1\n"
				"#semantic main.__structsampler2_a : TEXUNIT2\n"
				"#semantic main.__structsampler3_a : TEXUNIT3\n"
				"#semantic main.__structsampler0_b : TEXUNIT4\n"
				"#semantic main.__structsampler1_b : TEXUNIT5\n"
				"#semantic main.__structsampler2_b : TEXUNIT6\n"
				"#semantic main.__structsampler3_b : TEXUNIT7\n"
				"#semantic main.__workspace : C0\n"
				"#var samplerRECT __structsampler0_a : TEXUNIT0 : texunit 0 : 0 : 1\n"
				"#var samplerRECT __structsampler1_a : TEXUNIT1 : texunit 1 : 1 : 1\n"
				"#var samplerRECT __structsampler2_a : TEXUNIT2 : texunit 2 : 2 : 1\n"
				"#var samplerRECT __structsampler3_a : TEXUNIT3 : texunit 3 : 3 : 1\n"
				"#var float2 _tex_a_pos : $vin.TEXCOORD0 : TEXCOORD0 : 4 : 1\n"
				"#var samplerRECT __structsampler0_b : TEXUNIT4 : texunit 4 : 5 : 1\n"
				"#var samplerRECT __structsampler1_b : TEXUNIT5 : texunit 5 : 6 : 1\n"
				"#var samplerRECT __structsampler2_b : TEXUNIT6 : texunit 6 : 7 : 1\n"
				"#var samplerRECT __structsampler3_b : TEXUNIT7 : texunit 7 : 8 : 1\n"
				"#var float2 _tex_b_pos : $vin.TEXCOORD1 : TEXCOORD1 : 9 : 1\n"
				"#var float4 __output_3 : $vout.COLOR0 : COLOR0 : 10 : 1\n"
				"#var float4 __workspace : C0 :  : 11 : 1\n"
				"DECLARE __workspace;\n"
				"TEX R0, f[TEX0].xyxx, TEX3, RECT;\n"
				"TEX R1, f[TEX1].xyxx, TEX7, RECT;\n"
				"DP4R R1.x, R0, R1;\n"
				"TEX R2, f[TEX1].xyxx, TEX6, RECT;\n"
				"TEX R3, f[TEX1].xyxx, TEX5, RECT;\n"
				"DP4R R1.y, R0, R2;\n"
				"DP4R R1.z, R0, R3;\n"
				"MOVR o[COLR].w, R1.x;\n"
				"MOVR o[COLR].z, R1.y;\n"
				"MOVR o[COLR].y, R1.z;\n"
				"TEX R1, f[TEX1].xyxx, TEX4, RECT;\n"
				"DP4R R0.x, R0, R1;\n"
				"MOVR o[COLR].x, R0.x;\n"
				"END \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:3:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 3)
			)
		);
	static const void* __fourway_matmult_4x4_pretransposed_fp30 = &__fourway_matmult_4x4_pretransposed_fp30_desc;
}


// ARB fragment-program ("!!ARBfp1.0") descriptor for the
// fourway_matmult_4x4_pretransposed kernel: one technique made of four passes.
//
// All four passes share the same program shape:
//   * four texels are fetched at texcoord 1 from texture[4], [5], [6], [7];
//   * one texel is fetched at texcoord 0 from texture[N];
//   * the texcoord-0 texel is DP4'd against each of the four texcoord-1 texels
//     and the results land in r0.x / r0.y / r0.z / r0.w respectively;
//   * r0 is moved to result.color.
// The passes differ only in N (texture[0]..texture[3]), in the
// "##!!multipleOutputInfo:N:1:" annotation and in the second argument of
// .output(3, N).
//
// NOTE(review): .sampler(1, k) / .sampler(2, k) presumably bind component k of
// kernel arguments 1 (a) and 2 (b), and .output(3, N) component N of argument 3
// (result) -- confirm against the brook::desc declarations.
namespace {
	using namespace ::brook::desc;
	static const gpu_kernel_desc __fourway_matmult_4x4_pretransposed_arb_desc = gpu_kernel_desc()
		.technique( gpu_technique_desc()
			// Pass 0: texcoord-0 fetch from texture[0]; output index 0.
			.pass( gpu_pass_desc(
				"!!ARBfp1.0\n"
				"OUTPUT oC0 = result.color;\n"
				"TEMP r0;\n"
				"TEMP r1;\n"
				"TEMP r2;\n"
				"TEMP r3;\n"
				"TEMP r4;\n"
				"ATTRIB t0 = fragment.texcoord[0];\n"
				"ATTRIB t1 = fragment.texcoord[1];\n"
				"TEX r0, t1, texture[4], RECT;\n"
				"TEX r1, t0, texture[0], RECT;\n"
				"TEX r4, t1, texture[5], RECT;\n"
				"TEX r3, t1, texture[6], RECT;\n"
				"TEX r2, t1, texture[7], RECT;\n"
				"DP4 r0.x, r1, r0;\n"
				"DP4 r0.y, r1, r4;\n"
				"DP4 r0.z, r1, r3;\n"
				"DP4 r0.w, r1, r2;\n"
				"MOV oC0, r0;\n"
				"END\n"
				" \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:0:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 0)
			)
			// Pass 1: texcoord-0 fetch from texture[1]; output index 1.
			.pass( gpu_pass_desc(
				"!!ARBfp1.0\n"
				"OUTPUT oC0 = result.color;\n"
				"TEMP r0;\n"
				"TEMP r1;\n"
				"TEMP r2;\n"
				"TEMP r3;\n"
				"TEMP r4;\n"
				"ATTRIB t0 = fragment.texcoord[0];\n"
				"ATTRIB t1 = fragment.texcoord[1];\n"
				"TEX r0, t1, texture[4], RECT;\n"
				"TEX r1, t0, texture[1], RECT;\n"
				"TEX r4, t1, texture[5], RECT;\n"
				"TEX r3, t1, texture[6], RECT;\n"
				"TEX r2, t1, texture[7], RECT;\n"
				"DP4 r0.x, r1, r0;\n"
				"DP4 r0.y, r1, r4;\n"
				"DP4 r0.z, r1, r3;\n"
				"DP4 r0.w, r1, r2;\n"
				"MOV oC0, r0;\n"
				"END\n"
				" \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:1:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 1)
			)
			// Pass 2: texcoord-0 fetch from texture[2]; output index 2.
			.pass( gpu_pass_desc(
				"!!ARBfp1.0\n"
				"OUTPUT oC0 = result.color;\n"
				"TEMP r0;\n"
				"TEMP r1;\n"
				"TEMP r2;\n"
				"TEMP r3;\n"
				"TEMP r4;\n"
				"ATTRIB t0 = fragment.texcoord[0];\n"
				"ATTRIB t1 = fragment.texcoord[1];\n"
				"TEX r0, t1, texture[4], RECT;\n"
				"TEX r1, t0, texture[2], RECT;\n"
				"TEX r4, t1, texture[5], RECT;\n"
				"TEX r3, t1, texture[6], RECT;\n"
				"TEX r2, t1, texture[7], RECT;\n"
				"DP4 r0.x, r1, r0;\n"
				"DP4 r0.y, r1, r4;\n"
				"DP4 r0.z, r1, r3;\n"
				"DP4 r0.w, r1, r2;\n"
				"MOV oC0, r0;\n"
				"END\n"
				" \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:2:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 2)
			)
			// Pass 3: texcoord-0 fetch from texture[3]; output index 3.
			.pass( gpu_pass_desc(
				"!!ARBfp1.0\n"
				"OUTPUT oC0 = result.color;\n"
				"TEMP r0;\n"
				"TEMP r1;\n"
				"TEMP r2;\n"
				"TEMP r3;\n"
				"TEMP r4;\n"
				"ATTRIB t0 = fragment.texcoord[0];\n"
				"ATTRIB t1 = fragment.texcoord[1];\n"
				"TEX r0, t1, texture[4], RECT;\n"
				"TEX r1, t0, texture[3], RECT;\n"
				"TEX r4, t1, texture[5], RECT;\n"
				"TEX r3, t1, texture[6], RECT;\n"
				"TEX r2, t1, texture[7], RECT;\n"
				"DP4 r0.x, r1, r0;\n"
				"DP4 r0.y, r1, r4;\n"
				"DP4 r0.z, r1, r3;\n"
				"DP4 r0.w, r1, r2;\n"
				"MOV oC0, r0;\n"
				"END\n"
				" \n"
				"##!!BRCC\n"
				"##narg:3\n"
				"##s:0:a\n"
				"##s:0:b\n"
				"##o:0:result\n"
				"##workspace:1024\n"
				"##!!multipleOutputInfo:3:1:\n"
				"##!!fullAddressTrans:0:\n"
				"##!!reductionFactor:0:\n"
				"")
				.sampler(1, 0)
				.sampler(1, 1)
				.sampler(1, 2)
				.sampler(1, 3)
				.sampler(2, 0)
				.sampler(2, 1)
				.sampler(2, 2)
				.sampler(2, 3)
				.interpolant(1, kStreamInterpolant_Position)
				.interpolant(2, kStreamInterpolant_Position)
				.output(3, 3)
			)
		);
	// Type-erased handle to the descriptor above; it is what the "arb" entry of
	// the backend table in fourway_matmult_4x4_pretransposed() points at.
	static const void* __fourway_matmult_4x4_pretransposed_arb = &__fourway_matmult_4x4_pretransposed_arb_desc;
}

// Host-side launcher for the pretransposed 4-way 4x4 matrix-multiply kernel.
//
// Builds (once, on first call) a brook::kernel from the table of per-backend
// descriptors, then pushes the two input streams and the output stream in
// argument order and maps the kernel over them.
//
//   a, b    input streams
//   result  output stream
void  fourway_matmult_4x4_pretransposed (::brook::stream a,
		::brook::stream b,
		::brook::stream result) {
  // Flat list of (backend name, descriptor) pairs, closed by a NULL/NULL pair.
  static const void *backendDescriptors[] = {
     "fp30", __fourway_matmult_4x4_pretransposed_fp30,
     "arb", __fourway_matmult_4x4_pretransposed_arb,
     "ps20", __fourway_matmult_4x4_pretransposed_ps20,
     NULL, NULL };

  // Function-local static: the kernel object is constructed a single time and
  // reused by every subsequent call.
  static brook::kernel kernelHandle(backendDescriptors);

  // Arguments are supplied in declaration order: a, b, then the output.
  kernelHandle->PushStream(a);
  kernelHandle->PushStream(b);
  kernelHandle->PushOutput(result);
  kernelHandle->Map();
}

// Writes one 4x4 matrix (16 consecutive floats, row after row) into outEntry:
// inValue on the main diagonal, 0.0f everywhere else.
//
//   outEntry  destination; must have room for 16 floats
//   inValue   value stored at positions 0, 5, 10 and 15
static void fillEntry( float* outEntry, float inValue )
{
  for( int row = 0; row < 4; ++row )
  {
    for( int col = 0; col < 4; ++col )
    {
      outEntry[row * 4 + col] = ( row == col ) ? inValue : 0.0f;
    }
  }
}

// Fills an inSize x inSize grid of 4x4 matrices (16 floats each, stored back to
// back) via fillEntry. Every matrix in grid row i gets the diagonal value
// (i % 16); the column index does not influence the value.
//
//   outBuffer  destination; must hold inSize * inSize * 16 floats
//   inSize     grid extent in both dimensions
static void matrixFill( float* outBuffer, int inSize )
{
  float* cursor = outBuffer;
  for( int row = 0; row < inSize; ++row )
  {
    // Same value for the whole row, so compute it once.
    const float diagonal = (float)(row % 16);
    for( int col = 0; col < inSize; ++col )
    {
      fillEntry( cursor, diagonal );
      cursor += 16;  // advance to the next 4x4 matrix
    }
  }
}

// Verifies that the 16 floats at inEntry form a 4x4 matrix with inValue on the
// main diagonal and 0.0f elsewhere. Elements are examined in storage order and
// any mismatch triggers assert(false), so with assertions compiled out the
// function has no observable effect.
//
//   inEntry  matrix to inspect (16 floats)
//   inValue  expected diagonal value (compared exactly)
static void checkEntry( float* inEntry, float inValue )
{
  for( int row = 0; row < 4; ++row )
  {
    for( int col = 0; col < 4; ++col )
    {
      const float expected = ( row == col ) ? inValue : 0.0f;
      if( inEntry[row * 4 + col] != expected ) assert(false);
    }
  }
}

// Walks an inSize x inSize grid of 4x4 matrices (16 floats each, stored back to
// back) and runs checkEntry on every one. The diagonal value expected for grid
// row i is (i % 16) squared, i.e. the square of the value matrixFill stores for
// that row.
//
//   inBuffer  matrices to validate; inSize * inSize * 16 floats
//   inSize    grid extent in both dimensions
static void matrixCheck( float* inBuffer, int inSize )
{
  float* cursor = inBuffer;
  for( int row = 0; row < inSize; ++row )
  {
    for( int col = 0; col < inSize; ++col )
    {
      const int base = row % 16;
      const int expected = base * base;
      checkEntry( cursor, (float)(expected) );
      cursor += 16;  // advance to the next 4x4 matrix
    }
  }
}

// Times inIterations launches of fourway_matmult_4x4 on inSize x inSize streams
// of matrix4 and validates the result.
//
// The same host buffer (filled by matrixFill) is loaded into both inputs a and
// b; after the launches the output stream c is copied back into that buffer and
// checked with matrixCheck. The timed region covers the two streamRead calls,
// every kernel launch and the final streamWrite.
//
//   inSize        extent of the square streams
//   inIterations  number of kernel launches inside the timed region
//   outTime       receives stopTime - startTime as returned by GetTimeMillis()
//   outFlops      receives 0.001 * 23 * inIterations * inSize^2 / elapsed
//                 NOTE(review): 23 is presumably the operation count per output
//                 element -- confirm. If elapsed is 0 this divides by zero.
static void  runTest(int  inSize, int  inIterations, int  *outTime, float  *outFlops)
{
  ::brook::stream a(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  ::brook::stream b(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  ::brook::stream c(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  matrix4  *data;
  int  i;
  int  startTime;
  int  stopTime;
  int  elapsed;

  data = (matrix4 *) (malloc(inSize * inSize * sizeof(matrix4 ) ));
  matrixFill((float *) (data),inSize);
  startTime = GetTimeMillis();
  streamRead(a,data);
  streamRead(b,data);
  for (i = 0; i < inIterations; i++)
  {
    fourway_matmult_4x4(a,b,c);
  }

  streamWrite(c,data);
  stopTime = GetTimeMillis();
  elapsed = (int ) (stopTime - startTime);
  *outTime = elapsed;
  *outFlops = 0.001f * 23.000000f * inIterations * inSize * inSize / (float ) (elapsed);
  matrixCheck((float *) (data),inSize);

  // Release the host staging buffer; it was previously leaked on every call.
  free(data);
}

// Times inIterations launches of fourway_matmult_4x4_pretransposed on
// inSize x inSize streams of matrix4.
//
// The same host buffer (filled by matrixFill) is loaded into both inputs a and
// b; after the launches the output stream c is copied back into that buffer.
// The timed region covers the two streamRead calls, every kernel launch and the
// final streamWrite. Unlike runTest, the result is not validated.
//
//   inSize        extent of the square streams
//   inIterations  number of kernel launches inside the timed region
//   outTime       receives stopTime - startTime as returned by GetTimeMillis()
//   outFlops      receives 0.001 * 23 * inIterations * inSize^2 / elapsed
//                 NOTE(review): 23 is presumably the operation count per output
//                 element -- confirm. If elapsed is 0 this divides by zero.
static void  runPretransposedTest(int  inSize, int  inIterations, int  *outTime, float  *outFlops)
{
  ::brook::stream a(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  ::brook::stream b(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  ::brook::stream c(::brook::getStreamType(( matrix4  *)0), inSize , inSize,-1);
  matrix4  *data;
  int  i;
  int  startTime;
  int  stopTime;
  int  elapsed;

  data = (matrix4 *) (malloc(inSize * inSize * sizeof(matrix4 ) ));
  matrixFill((float *) (data),inSize);
  startTime = GetTimeMillis();
  streamRead(a,data);
  streamRead(b,data);
  for (i = 0; i < inIterations; i++)
  {
    fourway_matmult_4x4_pretransposed(a,b,c);
  }

  streamWrite(c,data);
  stopTime = GetTimeMillis();
  elapsed = (int ) (stopTime - startTime);
  *outTime = elapsed;
  *outFlops = 0.001f * 23.000000f * inIterations * inSize * inSize / (float ) (elapsed);
  // Result validation is intentionally left disabled, as in the original:
//  matrixCheck((float *) (data),inSize);

  // Release the host staging buffer; it was previously leaked on every call.
  free(data);
}

// Iteration count handed to runTest by Matmult4x4_4way_Time and echoed as the
// first column of its printed report.
#define MAX_ITERS 1000


// Benchmark driver: runs the default 4-way 4x4 matmult test MAX_ITERS times on
// an inStreamSize x inStreamSize stream and prints a short report (title,
// stream dimensions, then the iteration count and the time reported by
// runTest). The last line is printed without a trailing newline.
//
// The throughput figure produced by runTest is collected but not printed, and
// the companion pretransposed benchmark (runPretransposedTest) is not run here.
void  Matmult4x4_4way_Time(int  inStreamSize)
{
  int  elapsedTime;
  float  throughput;

  runTest(inStreamSize, MAX_ITERS, &elapsedTime, &throughput);

  printf("matmult4x4 4-way\n");
  printf("stream size = %d * %d * 4*float4\n", inStreamSize, inStreamSize);
  printf("%4d  %9d", MAX_ITERS, elapsedTime);
}
上一页 1 23
💿 文件大小 951 K
👤 上传用户 zjf646
📂 所属分类数学计算
🏷️ 相关标签

#BrookGPU #GPU #0.4 #计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -