📄 t_vb_arbprogram_sse.c
字号:
/**
 * MSK: merge the source operand into destination register op.msk.dst
 * under the component write mask op.msk.mask.
 *
 * cp  - compilation state; instructions are appended to cp->func.
 * op  - instruction; op.msk.file/op.msk.idx select the source,
 *       op.msk.dst the destination register, op.msk.mask a WRITEMASK_*
 *       value.
 *
 * Returns GL_TRUE once code for the mask has been emitted, GL_FALSE if
 * the mask value is not recognised (only reachable when assert() is
 * compiled out).
 */
static GLboolean emit_MSK( struct compilation *cp, union instruction op )
{
   struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
   struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* Note that dst and dst0 refer to the same program variable, but
    * will definitely be different XMM registers.  We're effectively
    * treating this as a 2 argument SEL now, just one of which happens
    * always to be the same register as the destination.
    */

   switch (op.msk.mask) {
   case 0:
      /* Nothing is written: carry the old value over unchanged. */
      sse_movups(&cp->func, dst, dst0);
      return GL_TRUE;

   case WRITEMASK_X:
      if (arg.file == file_XMM) {
         sse_movups(&cp->func, dst, dst0);
         sse_movss(&cp->func, dst, arg);
      }
      else {
         /* Stage the scalar through an XMM temporary -- presumably
          * because a movss load from memory would clear the upper
          * lanes of dst.
          */
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movups(&cp->func, dst, dst0);
         sse_movss(&cp->func, tmp, arg);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   /* NOTE(review): if sse_shufps follows x86 SHUFPS (low two lanes
    * from the first operand, high two from the second), the XY case
    * below yields dst0.xy/arg.zw and the ZW case arg.xy/dst0.zw, which
    * looks inverted relative to the X and YZW cases.  Left as is;
    * confirm against the interpreter before changing.
    */
   case WRITEMASK_XY:
      sse_movups(&cp->func, dst, dst0);
      sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_ZW:
      sse_movups(&cp->func, dst, arg);
      sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_YZW:
      if (dst0.file == file_XMM) {
         sse_movups(&cp->func, dst, arg);
         sse_movss(&cp->func, dst, dst0);
      }
      else {
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movups(&cp->func, dst, arg);
         sse_movss(&cp->func, tmp, dst0);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   case WRITEMASK_Y:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));
      return GL_TRUE;

   case WRITEMASK_Z:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_W:
      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XZ:
      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W));
      return GL_TRUE;

   case WRITEMASK_XW:
      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y));
      /* Must return here: falling through would also emit the YZ
       * sequence into the same destination.
       */
      return GL_TRUE;

   case WRITEMASK_YZ:
      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_YW:
      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XZW:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));
      return GL_TRUE;

   case WRITEMASK_XYW:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
      return GL_TRUE;

   case WRITEMASK_XYZ:
      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
      return GL_TRUE;

   case WRITEMASK_XYZW:
      sse_movups(&cp->func, dst, arg);
      return GL_TRUE;

   default:
      assert(0);
      break;
   }

   /* Unknown mask with assertions disabled: report failure instead of
    * running off the end of a non-void function.
    */
   return GL_FALSE;
}
/* PRT has no implementation in this code generator: the body is just
 * the FAIL macro, and neither cp nor op is referenced by name here.
 * FAIL is defined outside this view -- presumably it reports the
 * failure and returns GL_FALSE; confirm against its definition.
 */
static GLboolean emit_PRT( struct compilation *cp, union instruction op )
{
FAIL;
}
/**
* The traditional instructions. All operate on internal registers
* and ignore write masks and swizzling issues.
*/
static GLboolean emit_ABS( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
sse_movups(&cp->func, dst, arg0);
sse_mulps(&cp->func, dst, neg);
sse_maxps(&cp->func, dst, arg0);
return GL_TRUE;
}
static GLboolean emit_ADD( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
sse_movups(&cp->func, dst, arg0);
sse_addps(&cp->func, dst, arg1);
return GL_TRUE;
}
/* The dot-product instructions don't map particularly well onto SSE:
 */
static GLboolean emit_DP3( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
struct x86_reg tmp = get_xmm_reg(cp);
sse_movups(&cp->func, dst, arg0);
sse_mulps(&cp->func, dst, arg1);
/* Now the hard bit: sum the first 3 values:
*/
sse_movhlps(&cp->func, tmp, dst);
sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(&cp->func, dst, tmp);
sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
return GL_TRUE;
}
static GLboolean emit_DP4( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
struct x86_reg tmp = get_xmm_reg(cp);
sse_movups(&cp->func, dst, arg0);
sse_mulps(&cp->func, dst, arg1);
/* Now the hard bit: sum the values:
*/
sse_movhlps(&cp->func, tmp, dst);
sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(&cp->func, dst, tmp);
sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
return GL_TRUE;
}
/**
 * DPH: homogeneous dot product, replicated into all four lanes:
 *
 *    dst = arg0.x*arg1.x + arg0.y*arg1.y + arg0.z*arg1.z + 1*arg1.w
 *
 * Implemented as a DP4 after substituting the REG_ONES constant for
 * arg0.w.  Always returns GL_TRUE.
 */
static GLboolean emit_DPH( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
   struct x86_reg tmp = get_xmm_reg(cp);

   /* Build (a.x, a.y, a.z, 1) in dst.
    *
    * The X<->W swap is used because it is its own inverse, so the same
    * shuffle restores lane order afterwards (a W,X,Y,Z rotation applied
    * twice would not).  The constant is staged through tmp so that the
    * movss into dst is register-to-register and only touches lane 0,
    * matching how emit_MSK handles non-XMM movss sources.
    */
   emit_pshufd(cp, dst, arg0, SHUF(W,Y,Z,X));   /* a.w, a.y, a.z, a.x */
   sse_movss(&cp->func, tmp, ones);             /* tmp.x = 1 */
   sse_movss(&cp->func, dst, tmp);              /* 1,   a.y, a.z, a.x */
   emit_pshufd(cp, dst, dst, SHUF(W,Y,Z,X));    /* a.x, a.y, a.z, 1 */

   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the values (from DP4):
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addps(&cp->func, dst, tmp);   /* a*x+c*z, b*y+d*w, ... */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));

   return GL_TRUE;
}
#if 0
static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
/* dst[0] = 1.0 * 1.0F; */
/* dst[1] = arg0[1] * arg1[1]; */
/* dst[2] = arg0[2] * 1.0; */
/* dst[3] = 1.0 * arg1[3]; */
/* Would rather do some of this with integer regs, but:
* 1) No proper support for immediate values yet
* 2) I'd need to push/pop somewhere to get a free reg.
*/
x87_fld1(&cp->func);
x87_fstp(&cp->func, dst); /* would rather do an immediate store... */
x87_fld(&cp->func, x86_make_disp(arg0, 4));
x87_fmul(&cp->func, x86_make_disp(arg1, 4));
x87_fstp(&cp->func, x86_make_disp(dst, 4));
if (!eq(arg0, dst)) {
x86_fld(&cp->func, x86_make_disp(arg0, 8));
x86_stp(&cp->func, x86_make_disp(dst, 8));
}
if (!eq(arg1, dst)) {
x86_fld(&cp->func, x86_make_disp(arg0, 12));
x86_stp(&cp->func, x86_make_disp(dst, 12));
}
return GL_TRUE;
}
#else
static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
struct x86_reg tmp = get_xmm_reg(cp);
struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
sse_mulps(&cp->func, dst, tmp);
/* dst[0] = 1.0 * 1.0F; */
/* dst[1] = arg0[1] * arg1[1]; */
/* dst[2] = arg0[2] * 1.0; */
/* dst[3] = 1.0 * arg1[3]; */
return GL_TRUE;
}
#endif
static GLboolean emit_LG2( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
x87_fld1(&cp->func); /* 1 */
x87_fld(&cp->func, arg0); /* a0 1 */
x87_fyl2x(&cp->func); /* log2(a0) */
x87_fst(&cp->func, x86_make_disp(dst, 0));
x87_fst(&cp->func, x86_make_disp(dst, 4));
x87_fst(&cp->func, x86_make_disp(dst, 8));
x87_fstp(&cp->func, x86_make_disp(dst, 12));
return GL_TRUE;
}
/**
 * EX2: 2^x of the first float of arg0, stored to all four floats of
 * the destination.  The exponentiation itself is done by
 * emit_x87_ex2() on the value in st0.
 *
 * The last store pops so that the value pushed by the initial load does
 * not stay on the x87 stack (assumes emit_x87_ex2 replaces st0 in
 * place -- confirm against its definition).  emit_LG2 ends the same
 * way.  Always returns GL_TRUE.
 */
static GLboolean emit_EX2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);

   /* CAUTION: dst may alias arg0!  The source is loaded before any
    * store to dst is emitted.
    */
   x87_fld(&cp->func, arg0);
   emit_x87_ex2(cp);

   x87_fst(&cp->func, x86_make_disp(dst, 0));
   x87_fst(&cp->func, x86_make_disp(dst, 4));
   x87_fst(&cp->func, x86_make_disp(dst, 8));
   x87_fstp(&cp->func, x86_make_disp(dst, 12));   /* pop the result */

   return GL_TRUE;
}
static GLboolean emit_EXP( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
struct x86_reg st3 = x86_make_reg(file_x87, 3);
/* CAUTION: dst may alias arg0!
*/
x87_fld(&cp->func, arg0); /* arg0.x */
x87_fld(&cp->func, st0); /* arg arg */
/* by default, fpu is setup to round-to-nearest. We want to
* change this now, and track the state through to the end of the
* generated function so that it isn't repeated unnecessarily.
* Alternately, could subtract .5 to get round to -inf behaviour.
*/
set_fpu_round_neg_inf( cp );
x87_fprndint( &cp->func ); /* flr(a) a */
x87_fld(&cp->func, st0); /* flr(a) flr(a) a */
x87_fld1(&cp->func); /* 1 floor(a) floor(a) a */
x87_fst(&cp->func, x86_make_disp(dst, 12)); /* stack unchanged */
x87_fscale(&cp->func); /* 2^floor(a) floor(a) a */
x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/
x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */
x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */
x87_fst(&cp->func, x86_make_disp(dst, 4)); /* frac(a) 2^flr(a) */
x87_f2xm1(&cp->func); /* (2^frac(a))-1 2^flr(a)*/
x87_fld1(&cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/
x87_faddp(&cp->func, st1); /* 2^frac(a) 2^flr(a) */
x87_fmulp(&cp->func, st1); /* 2^a */
x87_fst(&cp->func, x86_make_disp(dst, 8));
/* dst[0] = 2^floor(tmp); */
/* dst[1] = frac(tmp); */
/* dst[2] = 2^floor(tmp) * 2^frac(tmp); */
/* dst[3] = 1.0F; */
return GL_TRUE;
}
/* LOG: partial-precision logarithm of the first float of arg0, computed
 * on the x87 stack.  Going by the stack comments below, the stores are:
 *
 *    dst + 0   exponent - 1
 *    dst + 4   2 * mantissa
 *    dst + 8   exponent + log2(mantissa)
 *    dst + 12  1.0
 *
 * Every value pushed is popped again by one of the fstp stores.
 * Always returns GL_TRUE.
 */
static GLboolean emit_LOG( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
struct x86_reg st2 = x86_make_reg(file_x87, 2);
/* CAUTION: dst may alias arg0!
*/
x87_fld(&cp->func, arg0); /* arg0.x */
x87_fabs(&cp->func); /* |arg0.x| */
x87_fxtract(&cp->func); /* mantissa(arg0.x), exponent(arg0.x) */
/* Keep a spare copy of the mantissa below the working values. */
x87_fst(&cp->func, st2); /* mantissa, exponent, mantissa */
/* NOTE(review): here 1 is pushed after the operand, the reverse of
 * emit_LG2 (which pushes 1 first).  If x87_fyl2x computes
 * st1 * log2(st0), this yields mantissa * log2(1) rather than
 * log2(mantissa) -- confirm.
 */
x87_fld1(&cp->func); /* 1, mantissa, exponent, mantissa */
x87_fyl2x(&cp->func); /* log2(mantissa), exponent, mantissa */
x87_fadd(&cp->func, st0, st1); /* e+l2(m), e, m */
x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */
x87_fld1(&cp->func); /* 1, e, m */
x87_fsub(&cp->func, st1, st0); /* 1, e-1, m */
/* The 1.0 still in st0 is stored to the fourth float. */
x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */
x87_fstp(&cp->func, dst); /* m */
x87_fadd(&cp->func, st0, st0); /* 2m */
x87_fstp(&cp->func, x86_make_disp(dst, 4));
return GL_TRUE;
}
static GLboolean emit_FLR( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
int i;
set_fpu_round_neg_inf( cp );
for (i = 0; i < 4; i++) {
x87_fld(&cp->func, x86_make_disp(arg0, i*4));
x87_fprndint( &cp->func );
x87_fstp(&cp->func, x86_make_disp(dst, i*4));
}
return GL_TRUE;
}
static GLboolean emit_FRC( struct compilation *cp, union instruction op )
{
struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
int i;
set_fpu_round_neg_inf( cp );
/* Knowing liveness info or even just writemask would be useful
* here:
*/
for (i = 0; i < 4; i++) {
x87_fld(&cp->func, x86_make_disp(arg0, i*4));
x87_fld(&cp->func, st0); /* a a */
x87_fprndint( &cp->func ); /* flr(a) a */
x87_fsubrp(&cp->func, st1); /* frc(a) */
x87_fstp(&cp->func, x86_make_disp(dst, i*4));
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -