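// File: fourway_matmult_4x4.cpp
// (Code-viewer residue: the original second line was a "Font size:" UI label, not source code.)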
"#var samplerRECT __structsampler1_a : TEXUNIT1 : texunit 1 : 1 : 1\n"
"#var samplerRECT __structsampler2_a : TEXUNIT2 : texunit 2 : 2 : 1\n"
"#var samplerRECT __structsampler3_a : TEXUNIT3 : texunit 3 : 3 : 1\n"
"#var float2 _tex_a_pos : $vin.TEXCOORD0 : TEXCOORD0 : 4 : 1\n"
"#var samplerRECT __structsampler0_b : TEXUNIT4 : texunit 4 : 5 : 1\n"
"#var samplerRECT __structsampler1_b : TEXUNIT5 : texunit 5 : 6 : 1\n"
"#var samplerRECT __structsampler2_b : TEXUNIT6 : texunit 6 : 7 : 1\n"
"#var samplerRECT __structsampler3_b : TEXUNIT7 : texunit 7 : 8 : 1\n"
"#var float2 _tex_b_pos : $vin.TEXCOORD1 : TEXCOORD1 : 9 : 1\n"
"#var float4 __output_2 : $vout.COLOR0 : COLOR0 : 10 : 1\n"
"#var float4 __workspace : C0 : : 11 : 1\n"
"DECLARE __workspace;\n"
"TEX R0, f[TEX0].xyxx, TEX2, RECT;\n"
"TEX R1, f[TEX1].xyxx, TEX7, RECT;\n"
"DP4R R1.x, R0, R1;\n"
"TEX R2, f[TEX1].xyxx, TEX6, RECT;\n"
"TEX R3, f[TEX1].xyxx, TEX5, RECT;\n"
"DP4R R1.y, R0, R2;\n"
"DP4R R1.z, R0, R3;\n"
"MOVR o[COLR].w, R1.x;\n"
"MOVR o[COLR].z, R1.y;\n"
"MOVR o[COLR].y, R1.z;\n"
"TEX R1, f[TEX1].xyxx, TEX4, RECT;\n"
"DP4R R0.x, R0, R1;\n"
"MOVR o[COLR].x, R0.x;\n"
"END \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:2:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 2)
)
.pass( gpu_pass_desc(
"!!FP1.0\n"
"# NV_fragment_program generated by NVIDIA Cg compiler\n"
"# cgc version 1.1.0003, build date Jul 7 2003 11:55:19\n"
"# command line args: -quiet -profile fp30 -DUSERECT=1 -DCGC=1\n"
"#vendor NVIDIA Corporation\n"
"#version 1.0.02\n"
"#profile fp30\n"
"#program main\n"
"#semantic main.__structsampler0_a : TEXUNIT0\n"
"#semantic main.__structsampler1_a : TEXUNIT1\n"
"#semantic main.__structsampler2_a : TEXUNIT2\n"
"#semantic main.__structsampler3_a : TEXUNIT3\n"
"#semantic main.__structsampler0_b : TEXUNIT4\n"
"#semantic main.__structsampler1_b : TEXUNIT5\n"
"#semantic main.__structsampler2_b : TEXUNIT6\n"
"#semantic main.__structsampler3_b : TEXUNIT7\n"
"#semantic main.__workspace : C0\n"
"#var samplerRECT __structsampler0_a : TEXUNIT0 : texunit 0 : 0 : 1\n"
"#var samplerRECT __structsampler1_a : TEXUNIT1 : texunit 1 : 1 : 1\n"
"#var samplerRECT __structsampler2_a : TEXUNIT2 : texunit 2 : 2 : 1\n"
"#var samplerRECT __structsampler3_a : TEXUNIT3 : texunit 3 : 3 : 1\n"
"#var float2 _tex_a_pos : $vin.TEXCOORD0 : TEXCOORD0 : 4 : 1\n"
"#var samplerRECT __structsampler0_b : TEXUNIT4 : texunit 4 : 5 : 1\n"
"#var samplerRECT __structsampler1_b : TEXUNIT5 : texunit 5 : 6 : 1\n"
"#var samplerRECT __structsampler2_b : TEXUNIT6 : texunit 6 : 7 : 1\n"
"#var samplerRECT __structsampler3_b : TEXUNIT7 : texunit 7 : 8 : 1\n"
"#var float2 _tex_b_pos : $vin.TEXCOORD1 : TEXCOORD1 : 9 : 1\n"
"#var float4 __output_3 : $vout.COLOR0 : COLOR0 : 10 : 1\n"
"#var float4 __workspace : C0 : : 11 : 1\n"
"DECLARE __workspace;\n"
"TEX R0, f[TEX0].xyxx, TEX3, RECT;\n"
"TEX R1, f[TEX1].xyxx, TEX7, RECT;\n"
"DP4R R1.x, R0, R1;\n"
"TEX R2, f[TEX1].xyxx, TEX6, RECT;\n"
"TEX R3, f[TEX1].xyxx, TEX5, RECT;\n"
"DP4R R1.y, R0, R2;\n"
"DP4R R1.z, R0, R3;\n"
"MOVR o[COLR].w, R1.x;\n"
"MOVR o[COLR].z, R1.y;\n"
"MOVR o[COLR].y, R1.z;\n"
"TEX R1, f[TEX1].xyxx, TEX4, RECT;\n"
"DP4R R0.x, R0, R1;\n"
"MOVR o[COLR].x, R0.x;\n"
"END \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:3:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 3)
)
);
static const void* __fourway_matmult_4x4_pretransposed_fp30 = &__fourway_matmult_4x4_pretransposed_fp30_desc;
}
// Kernel description for the "arb" (ARB_fragment_program, "!!ARBfp1.0") back
// end of the fourway_matmult_4x4_pretransposed kernel.
//
// The description consists of a single technique with four passes. The four
// fragment programs are identical except for which texture unit (0..3) is
// sampled with texcoord[0]; the matching index also appears in the
// "##!!multipleOutputInfo:<n>:1:" metadata line and in the final
// .output(3, <n>) call of each pass.
//
// Each program:
//   * samples texture units 4..7 at texcoord[1] (into r0, r4, r3, r2),
//   * samples texture unit <n> at texcoord[0] (into r1),
//   * writes the four DP4 dot products of r1 with the unit 4, 5, 6 and 7
//     samples to result.color.x, .y, .z and .w respectively.
//
// The trailing "##" lines inside each string are BRCC metadata that travels
// with the program text (3 arguments: streams "a" and "b", output "result").
// NOTE(review): .sampler(arg, i) / .interpolant(arg, ...) / .output(arg, i)
// presumably bind component i of kernel argument `arg` (1 = a, 2 = b,
// 3 = result) -- confirm against the brook::desc declarations.
namespace {
using namespace ::brook::desc;
static const gpu_kernel_desc __fourway_matmult_4x4_pretransposed_arb_desc = gpu_kernel_desc()
.technique( gpu_technique_desc()
// Pass 0: texcoord[0] samples texture unit 0; registered as output 0.
.pass( gpu_pass_desc(
"!!ARBfp1.0\n"
"OUTPUT oC0 = result.color;\n"
"TEMP r0;\n"
"TEMP r1;\n"
"TEMP r2;\n"
"TEMP r3;\n"
"TEMP r4;\n"
"ATTRIB t0 = fragment.texcoord[0];\n"
"ATTRIB t1 = fragment.texcoord[1];\n"
"TEX r0, t1, texture[4], RECT;\n"
"TEX r1, t0, texture[0], RECT;\n"
"TEX r4, t1, texture[5], RECT;\n"
"TEX r3, t1, texture[6], RECT;\n"
"TEX r2, t1, texture[7], RECT;\n"
"DP4 r0.x, r1, r0;\n"
"DP4 r0.y, r1, r4;\n"
"DP4 r0.z, r1, r3;\n"
"DP4 r0.w, r1, r2;\n"
"MOV oC0, r0;\n"
"END\n"
" \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:0:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 0)
)
// Pass 1: texcoord[0] samples texture unit 1; registered as output 1.
.pass( gpu_pass_desc(
"!!ARBfp1.0\n"
"OUTPUT oC0 = result.color;\n"
"TEMP r0;\n"
"TEMP r1;\n"
"TEMP r2;\n"
"TEMP r3;\n"
"TEMP r4;\n"
"ATTRIB t0 = fragment.texcoord[0];\n"
"ATTRIB t1 = fragment.texcoord[1];\n"
"TEX r0, t1, texture[4], RECT;\n"
"TEX r1, t0, texture[1], RECT;\n"
"TEX r4, t1, texture[5], RECT;\n"
"TEX r3, t1, texture[6], RECT;\n"
"TEX r2, t1, texture[7], RECT;\n"
"DP4 r0.x, r1, r0;\n"
"DP4 r0.y, r1, r4;\n"
"DP4 r0.z, r1, r3;\n"
"DP4 r0.w, r1, r2;\n"
"MOV oC0, r0;\n"
"END\n"
" \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:1:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 1)
)
// Pass 2: texcoord[0] samples texture unit 2; registered as output 2.
.pass( gpu_pass_desc(
"!!ARBfp1.0\n"
"OUTPUT oC0 = result.color;\n"
"TEMP r0;\n"
"TEMP r1;\n"
"TEMP r2;\n"
"TEMP r3;\n"
"TEMP r4;\n"
"ATTRIB t0 = fragment.texcoord[0];\n"
"ATTRIB t1 = fragment.texcoord[1];\n"
"TEX r0, t1, texture[4], RECT;\n"
"TEX r1, t0, texture[2], RECT;\n"
"TEX r4, t1, texture[5], RECT;\n"
"TEX r3, t1, texture[6], RECT;\n"
"TEX r2, t1, texture[7], RECT;\n"
"DP4 r0.x, r1, r0;\n"
"DP4 r0.y, r1, r4;\n"
"DP4 r0.z, r1, r3;\n"
"DP4 r0.w, r1, r2;\n"
"MOV oC0, r0;\n"
"END\n"
" \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:2:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 2)
)
// Pass 3: texcoord[0] samples texture unit 3; registered as output 3.
.pass( gpu_pass_desc(
"!!ARBfp1.0\n"
"OUTPUT oC0 = result.color;\n"
"TEMP r0;\n"
"TEMP r1;\n"
"TEMP r2;\n"
"TEMP r3;\n"
"TEMP r4;\n"
"ATTRIB t0 = fragment.texcoord[0];\n"
"ATTRIB t1 = fragment.texcoord[1];\n"
"TEX r0, t1, texture[4], RECT;\n"
"TEX r1, t0, texture[3], RECT;\n"
"TEX r4, t1, texture[5], RECT;\n"
"TEX r3, t1, texture[6], RECT;\n"
"TEX r2, t1, texture[7], RECT;\n"
"DP4 r0.x, r1, r0;\n"
"DP4 r0.y, r1, r4;\n"
"DP4 r0.z, r1, r3;\n"
"DP4 r0.w, r1, r2;\n"
"MOV oC0, r0;\n"
"END\n"
" \n"
"##!!BRCC\n"
"##narg:3\n"
"##s:0:a\n"
"##s:0:b\n"
"##o:0:result\n"
"##workspace:1024\n"
"##!!multipleOutputInfo:3:1:\n"
"##!!fullAddressTrans:0:\n"
"##!!reductionFactor:0:\n"
"")
.sampler(1, 0)
.sampler(1, 1)
.sampler(1, 2)
.sampler(1, 3)
.sampler(2, 0)
.sampler(2, 1)
.sampler(2, 2)
.sampler(2, 3)
.interpolant(1, kStreamInterpolant_Position)
.interpolant(2, kStreamInterpolant_Position)
.output(3, 3)
)
);
// Type-erased handle to the description above; this is the pointer placed in
// the kernel wrapper's back-end table under the "arb" key.
static const void* __fourway_matmult_4x4_pretransposed_arb = &__fourway_matmult_4x4_pretransposed_arb_desc;
}
// Host-side entry point for the fourway_matmult_4x4_pretransposed kernel.
//
// Builds (once, on first call) a brook::kernel from a NULL-terminated table
// of { back-end name, kernel description } pairs, then pushes the two input
// streams and the output stream in declaration order and maps the kernel.
//
//   a, b    input streams
//   result  output stream
void fourway_matmult_4x4_pretransposed(::brook::stream a,
                                       ::brook::stream b,
                                       ::brook::stream result)
{
  // Flat list of (name, description) pairs; the { NULL, NULL } pair ends it.
  static const void *backendPrograms[] = {
    "fp30", __fourway_matmult_4x4_pretransposed_fp30,
    "arb",  __fourway_matmult_4x4_pretransposed_arb,
    "ps20", __fourway_matmult_4x4_pretransposed_ps20,
    NULL,   NULL
  };

  // Function-local static: the kernel object is constructed only once.
  static brook::kernel kernel(backendPrograms);

  kernel->PushStream(a);
  kernel->PushStream(b);
  kernel->PushOutput(result);
  kernel->Map();
}
// Writes one 4x4 matrix (16 consecutive floats) to outEntry: inValue on the
// main diagonal, 0.0f everywhere else.
//
//   outEntry  destination; must have room for 16 floats
//   inValue   value stored at the four diagonal positions
static void fillEntry( float* outEntry, float inValue )
{
  for( int row = 0; row < 4; ++row )
  {
    for( int col = 0; col < 4; ++col )
    {
      outEntry[row * 4 + col] = ( row == col ) ? inValue : 0.0f;
    }
  }
}
// Fills an inSize x inSize grid of 4x4 matrices (16 floats each, stored
// back to back). Every matrix in grid row `row` is written by fillEntry()
// with the value (row % 16).
//
//   outBuffer  destination; must hold inSize * inSize * 16 floats
//   inSize     number of matrices along each side of the grid
static void matrixFill( float* outBuffer, int inSize )
{
  float* cursor = outBuffer;
  for( int row = 0; row < inSize; ++row )
  {
    const float diagonal = static_cast<float>( row % 16 );
    for( int col = 0; col < inSize; ++col, cursor += 16 )
    {
      fillEntry( cursor, diagonal );
    }
  }
}
// Verifies that the 16 floats at inEntry form a 4x4 matrix with inValue on
// the main diagonal and 0.0f elsewhere. Elements are compared exactly, in
// storage order; the first mismatch triggers assert(false).
//
//   inEntry  matrix to inspect (16 floats, only read)
//   inValue  value expected at the four diagonal positions
static void checkEntry( float* inEntry, float inValue )
{
  for( int idx = 0; idx < 16; ++idx )
  {
    // Diagonal elements of a row-by-row 4x4 layout sit at 0, 5, 10 and 15.
    const float expected = ( idx % 5 == 0 ) ? inValue : 0.0f;
    if( inEntry[idx] != expected ) assert(false);
  }
}
// Verifies an inSize x inSize grid of 4x4 matrices (16 floats each, stored
// back to back). Every matrix in grid row `row` is passed to checkEntry()
// with the expected value (row % 16) squared.
//
//   inBuffer  data to inspect; inSize * inSize * 16 floats
//   inSize    number of matrices along each side of the grid
static void matrixCheck( float* inBuffer, int inSize )
{
  float* cursor = inBuffer;
  for( int row = 0; row < inSize; ++row )
  {
    const int base = row % 16;
    const int squared = base * base;
    for( int col = 0; col < inSize; ++col, cursor += 16 )
    {
      checkEntry( cursor, static_cast<float>( squared ) );
    }
  }
}
// Runs the fourway_matmult_4x4 kernel inIterations times on an
// inSize x inSize stream of matrix4 elements and reports timing.
//
// The same host buffer (filled by matrixFill) is uploaded to both input
// streams; after the loop the output stream is written back into that buffer
// and verified with matrixCheck. The timed interval covers the two
// streamRead calls, all kernel invocations and the final streamWrite.
//
//   inSize        number of stream elements along each side
//   inIterations  number of kernel invocations
//   outTime       receives stopTime - startTime as returned by GetTimeMillis()
//   outFlops      receives 0.001 * 23 * inIterations * inSize^2 / elapsed
//
// If the host buffer cannot be allocated, an error is printed to stderr,
// *outTime and *outFlops are set to 0 and the kernel is not run.
static void runTest(int inSize, int inIterations, int *outTime, float *outFlops)
{
  ::brook::stream a(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  ::brook::stream b(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  ::brook::stream c(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  matrix4 *data;
  int i;
  int startTime;
  int stopTime;
  int elapsed;

  // Compute the byte count in size_t so inSize * inSize cannot overflow int.
  data = (matrix4 *) (malloc((size_t) inSize * (size_t) inSize * sizeof(matrix4 ) ));
  if (data == NULL)
  {
    fprintf(stderr, "runTest: failed to allocate %d x %d matrix4 buffer\n", inSize, inSize);
    *outTime = 0;
    *outFlops = 0.0f;
    return;
  }
  matrixFill((float *) (data),inSize);

  startTime = GetTimeMillis();
  streamRead(a,data);
  streamRead(b,data);
  for (i = 0; i < inIterations; i++)
  {
    fourway_matmult_4x4(a,b,c);
  }
  streamWrite(c,data);
  stopTime = GetTimeMillis();

  elapsed = (int ) (stopTime - startTime);
  *outTime = elapsed;
  // NOTE: when elapsed is 0 this is a floating-point division by zero.
  *outFlops = 0.001f * 23.000000f * inIterations * inSize * inSize / (float ) (elapsed);

  matrixCheck((float *) (data),inSize);

  // Release the host buffer; the original leaked it on every call.
  free(data);
}
// Runs the fourway_matmult_4x4_pretransposed kernel inIterations times on an
// inSize x inSize stream of matrix4 elements and reports timing.
//
// The same host buffer (filled by matrixFill) is uploaded to both input
// streams; after the loop the output stream is written back into that
// buffer. The timed interval covers the two streamRead calls, all kernel
// invocations and the final streamWrite. Result verification is disabled
// here (the matrixCheck call is commented out).
//
//   inSize        number of stream elements along each side
//   inIterations  number of kernel invocations
//   outTime       receives stopTime - startTime as returned by GetTimeMillis()
//   outFlops      receives 0.001 * 23 * inIterations * inSize^2 / elapsed
//
// If the host buffer cannot be allocated, an error is printed to stderr,
// *outTime and *outFlops are set to 0 and the kernel is not run.
static void runPretransposedTest(int inSize, int inIterations, int *outTime, float *outFlops)
{
  ::brook::stream a(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  ::brook::stream b(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  ::brook::stream c(::brook::getStreamType(( matrix4 *)0), inSize , inSize,-1);
  matrix4 *data;
  int i;
  int startTime;
  int stopTime;
  int elapsed;

  // Compute the byte count in size_t so inSize * inSize cannot overflow int.
  data = (matrix4 *) (malloc((size_t) inSize * (size_t) inSize * sizeof(matrix4 ) ));
  if (data == NULL)
  {
    fprintf(stderr, "runPretransposedTest: failed to allocate %d x %d matrix4 buffer\n", inSize, inSize);
    *outTime = 0;
    *outFlops = 0.0f;
    return;
  }
  matrixFill((float *) (data),inSize);

  startTime = GetTimeMillis();
  streamRead(a,data);
  streamRead(b,data);
  for (i = 0; i < inIterations; i++)
  {
    fourway_matmult_4x4_pretransposed(a,b,c);
  }
  streamWrite(c,data);
  stopTime = GetTimeMillis();

  elapsed = (int ) (stopTime - startTime);
  *outTime = elapsed;
  // NOTE: when elapsed is 0 this is a floating-point division by zero.
  *outFlops = 0.001f * 23.000000f * inIterations * inSize * inSize / (float ) (elapsed);

  // matrixCheck((float *) (data),inSize);

  // Release the host buffer; the original leaked it on every call.
  free(data);
}
#define MAX_ITERS 1000
// Benchmarks the 4-way 4x4 matrix multiply on an inStreamSize x inStreamSize
// stream: runs runTest() for MAX_ITERS iterations and prints a header, the
// stream size, and then the iteration count with the elapsed time.
//
// The flops figure returned by runTest() is collected but not printed, and
// the last line printed has no trailing newline.
void Matmult4x4_4way_Time(int inStreamSize)
{
  int elapsed;
  float flopRate;  // filled in by runTest(), currently unreported

  runTest( inStreamSize, MAX_ITERS, &elapsed, &flopRate );

  printf("matmult4x4 4-way\n");
  printf("stream size = %d * %d * 4*float4\n",inStreamSize,inStreamSize);
  printf("%4d %9d",MAX_ITERS,elapsed);

  // Disabled pretransposed comparison, kept for reference. In the original
  // layout the first two printf calls sat just before the result line above
  // and the remaining ones followed it:
  //
  //   int pretransposedTime;
  //   float pretransposedFlops;
  //   runPretransposedTest(inStreamSize,MAX_ITERS,&pretransposedTime,&pretransposedFlops);
  //
  //   printf("\n\n");
  //   printf("default\n");
  //   ...result line for the default kernel...
  //   printf("\n\n");
  //   printf("pretransposed\n");
  //   printf("%4d %9d %5.6f",MAX_ITERS,pretransposedTime,pretransposedFlops);
}
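// ---------------------------------------------------------------------------
// Code-viewer residue (keyboard shortcut help from the web page this file was
// copied from; not part of the program):
//   Copy code              Ctrl + C
//   Search code            Ctrl + F
//   Full-screen mode       F11
//   Toggle theme           Ctrl + Shift + D
//   Show shortcuts         ?
//   Increase font size     Ctrl + =
//   Decrease font size     Ctrl + -
// ---------------------------------------------------------------------------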