📄 t_vb_arbprogram_sse.c
字号:
sse_movups(&cp->func, dst, dst0); return GL_TRUE; case WRITEMASK_X: if (arg.file == file_XMM) { sse_movups(&cp->func, dst, dst0); sse_movss(&cp->func, dst, arg); } else { struct x86_reg tmp = get_xmm_reg(cp); sse_movups(&cp->func, dst, dst0); sse_movss(&cp->func, tmp, arg); sse_movss(&cp->func, dst, tmp); } return GL_TRUE; case WRITEMASK_XY: sse_movups(&cp->func, dst, dst0); sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W)); return GL_TRUE; case WRITEMASK_ZW: sse_movups(&cp->func, dst, arg); sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W)); return GL_TRUE; case WRITEMASK_YZW: if (dst0.file == file_XMM) { sse_movups(&cp->func, dst, arg); sse_movss(&cp->func, dst, dst0); } else { struct x86_reg tmp = get_xmm_reg(cp); sse_movups(&cp->func, dst, arg); sse_movss(&cp->func, tmp, dst0); sse_movss(&cp->func, dst, tmp); } return GL_TRUE; case WRITEMASK_Y: emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W)); return GL_TRUE; case WRITEMASK_Z: emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W)); return GL_TRUE; case WRITEMASK_W: emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X)); return GL_TRUE; case WRITEMASK_XZ: emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W)); return GL_TRUE; case WRITEMASK_XW: emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y)); case WRITEMASK_YZ: emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W)); return GL_TRUE; case WRITEMASK_YW: emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X)); return GL_TRUE; case WRITEMASK_XZW: emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W)); return GL_TRUE; case WRITEMASK_XYW: emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W)); return GL_TRUE; case WRITEMASK_XYZ: emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X)); return GL_TRUE; case WRITEMASK_XYZW: sse_movups(&cp->func, dst, arg); return GL_TRUE; default: assert(0); break; }}static GLboolean emit_PRT( struct compilation *cp, union instruction op ){ FAIL;}/** * The traditional instructions. All operate on internal registers * and ignore write masks and swizzling issues. */static GLboolean emit_ABS( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG); sse_movups(&cp->func, dst, arg0); sse_mulps(&cp->func, dst, neg); sse_maxps(&cp->func, dst, arg0); return GL_TRUE;}static GLboolean emit_ADD( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); sse_movups(&cp->func, dst, arg0); sse_addps(&cp->func, dst, arg1); return GL_TRUE;}/* The dotproduct instructions don't really do that well in sse: */static GLboolean emit_DP3( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); struct x86_reg tmp = get_xmm_reg(cp); sse_movups(&cp->func, dst, arg0); sse_mulps(&cp->func, dst, arg1); /* Now the hard bit: sum the first 3 values: */ sse_movhlps(&cp->func, tmp, dst); sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(&cp->func, dst, tmp); sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); return GL_TRUE;}static GLboolean emit_DP4( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); struct x86_reg tmp = get_xmm_reg(cp); sse_movups(&cp->func, dst, arg0); sse_mulps(&cp->func, dst, arg1); /* Now the hard bit: sum the values: */ sse_movhlps(&cp->func, tmp, dst); sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(&cp->func, dst, tmp); sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); return GL_TRUE;}static GLboolean emit_DPH( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES); struct x86_reg tmp = get_xmm_reg(cp); emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z)); sse_movss(&cp->func, dst, ones); emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z)); sse_mulps(&cp->func, dst, arg1); /* Now the hard bit: sum the values (from DP4): */ sse_movhlps(&cp->func, tmp, dst); sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(&cp->func, dst, tmp); sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); return GL_TRUE;}#if 0static GLboolean emit_DST( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); /* dst[0] = 1.0 * 1.0F; *//* dst[1] = arg0[1] * arg1[1]; *//* dst[2] = arg0[2] * 1.0; *//* dst[3] = 1.0 * arg1[3]; */ /* Would rather do some of this with integer regs, but: * 1) No proper support for immediate values yet * 2) I'd need to push/pop somewhere to get a free reg. */ x87_fld1(&cp->func); x87_fstp(&cp->func, dst); /* would rather do an immediate store... */ x87_fld(&cp->func, x86_make_disp(arg0, 4)); x87_fmul(&cp->func, x86_make_disp(arg1, 4)); x87_fstp(&cp->func, x86_make_disp(dst, 4)); if (!eq(arg0, dst)) { x86_fld(&cp->func, x86_make_disp(arg0, 8)); x86_stp(&cp->func, x86_make_disp(dst, 8)); } if (!eq(arg1, dst)) { x86_fld(&cp->func, x86_make_disp(arg0, 12)); x86_stp(&cp->func, x86_make_disp(dst, 12)); } return GL_TRUE;}#elsestatic GLboolean emit_DST( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); struct x86_reg tmp = get_xmm_reg(cp); struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES); emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); sse_mulps(&cp->func, dst, tmp);/* dst[0] = 1.0 * 1.0F; *//* dst[1] = arg0[1] * arg1[1]; *//* dst[2] = arg0[2] * 1.0; *//* dst[3] = 1.0 * arg1[3]; */ return GL_TRUE;}#endifstatic GLboolean emit_LG2( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); x87_fld1(&cp->func); /* 1 */ x87_fld(&cp->func, arg0); /* a0 1 */ x87_fyl2x(&cp->func); /* log2(a0) */ x87_fst(&cp->func, x86_make_disp(dst, 0)); x87_fst(&cp->func, x86_make_disp(dst, 4)); x87_fst(&cp->func, x86_make_disp(dst, 8)); x87_fstp(&cp->func, x86_make_disp(dst, 12)); return GL_TRUE;}static GLboolean emit_EX2( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); /* CAUTION: dst may alias arg0! */ x87_fld(&cp->func, arg0); emit_x87_ex2(cp); x87_fst(&cp->func, x86_make_disp(dst, 0)); x87_fst(&cp->func, x86_make_disp(dst, 4)); x87_fst(&cp->func, x86_make_disp(dst, 8)); x87_fst(&cp->func, x86_make_disp(dst, 12)); return GL_TRUE;}static GLboolean emit_EXP( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); struct x86_reg st0 = x86_make_reg(file_x87, 0); struct x86_reg st1 = x86_make_reg(file_x87, 1); struct x86_reg st3 = x86_make_reg(file_x87, 3); /* CAUTION: dst may alias arg0! */ x87_fld(&cp->func, arg0); /* arg0.x */ x87_fld(&cp->func, st0); /* arg arg */ /* by default, fpu is setup to round-to-nearest. We want to * change this now, and track the state through to the end of the * generated function so that it isn't repeated unnecessarily. * Alternately, could subtract .5 to get round to -inf behaviour. */ set_fpu_round_neg_inf( cp ); x87_fprndint( &cp->func ); /* flr(a) a */ x87_fld(&cp->func, st0); /* flr(a) flr(a) a */ x87_fld1(&cp->func); /* 1 floor(a) floor(a) a */ x87_fst(&cp->func, x86_make_disp(dst, 12)); /* stack unchanged */ x87_fscale(&cp->func); /* 2^floor(a) floor(a) a */ x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/ x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */ x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */ x87_fst(&cp->func, x86_make_disp(dst, 4)); /* frac(a) 2^flr(a) */ x87_f2xm1(&cp->func); /* (2^frac(a))-1 2^flr(a)*/ x87_fld1(&cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/ x87_faddp(&cp->func, st1); /* 2^frac(a) 2^flr(a) */ x87_fmulp(&cp->func, st1); /* 2^a */ x87_fst(&cp->func, x86_make_disp(dst, 8)); /* dst[0] = 2^floor(tmp); *//* dst[1] = frac(tmp); *//* dst[2] = 2^floor(tmp) * 2^frac(tmp); *//* dst[3] = 1.0F; */ return GL_TRUE;}static GLboolean emit_LOG( struct compilation *cp, union instruction op ){ struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); struct x86_reg st0 = x86_make_reg(file_x87, 0); struct x86_reg st1 = x86_make_reg(file_x87, 1); struct x86_reg st2 = x86_make_reg(file_x87, 2); /* CAUTION: dst may alias arg0! */ x87_fld(&cp->func, arg0); /* arg0.x */ x87_fabs(&cp->func); /* |arg0.x| */ x87_fxtract(&cp->func); /* mantissa(arg0.x), exponent(arg0.x) */ x87_fst(&cp->func, st2); /* mantissa, exponent, mantissa */ x87_fld1(&cp->func); /* 1, mantissa, exponent, mantissa */ x87_fyl2x(&cp->func); /* log2(mantissa), exponent, mantissa */ x87_fadd(&cp->func, st0, st1); /* e+l2(m), e, m */ x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */ x87_fld1(&cp->func); /* 1, e, m */ x87_fsub(&cp->func, st1, st0); /* 1, e-1, m */ x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */ x87_fstp(&cp->func, dst); /* m */ x87_fadd(&cp->func, st0, st0); /* 2m */ x87_fstp(&cp->func, x86_make_disp(dst, 4)); return GL_TRUE;}static GLboolean emit_FLR( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); int i; set_fpu_round_neg_inf( cp ); for (i = 0; i < 4; i++) { x87_fld(&cp->func, x86_make_disp(arg0, i*4)); x87_fprndint( &cp->func ); x87_fstp(&cp->func, x86_make_disp(dst, i*4)); } return GL_TRUE;}static GLboolean emit_FRC( struct compilation *cp, union instruction op ) { struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); struct x86_reg st0 = x86_make_reg(file_x87, 0); struct x86_reg st1 = x86_make_reg(file_x87, 1); int i; set_fpu_round_neg_inf( cp ); /* Knowing liveness info or even just writemask would be useful * here: */ for (i = 0; i < 4; i++) { x87_fld(&cp->func, x86_make_disp(arg0, i*4)); x87_fld(&cp->func, st0); /* a a */ x87_fprndint( &cp->func ); /* flr(a) a */ x87_fsubrp(&cp->func, st1); /* frc(a) */ x87_fstp(&cp->func, x86_make_disp(dst, i*4)); } return GL_TRUE;}static GLboolean emit_LIT( struct compilation *cp, union instruction op ){#if 1 struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0); struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT); struct x86_reg tmp = get_xmm_reg(cp); struct x86_reg st1 = x86_make_reg(file_x87, 1); struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX); GLubyte *fixup1, *fixup2; /* Load the interesting parts of arg0: */ x87_fld(&cp->func, x86_make_disp(arg0, 12)); /* a3 */ x87_fld(&cp->func, x86_make_disp(arg0, 4)); /* a1 a3 */ x87_fld(&cp->func, x86_make_disp(arg0, 0)); /* a0 a1 a3 */ /* Intialize dst: */ sse_movaps(&cp->func, tmp, lit); sse_movaps(&cp->func, dst, tmp);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -