📄 t_vb_arbprogram_sse.c

📁 mesa-6.5-minigui源码
💻 C
📖 第 1 页 / 共 3 页
字号:
      sse_movups(&cp->func, dst, dst0);      return GL_TRUE;   case WRITEMASK_X:      if (arg.file == file_XMM) {	 sse_movups(&cp->func, dst, dst0);	 sse_movss(&cp->func, dst, arg);      }      else {	 struct x86_reg tmp = get_xmm_reg(cp);	 sse_movups(&cp->func, dst, dst0);	 sse_movss(&cp->func, tmp, arg);	 sse_movss(&cp->func, dst, tmp);      }      return GL_TRUE;   case WRITEMASK_XY:      sse_movups(&cp->func, dst, dst0);      sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));      return GL_TRUE;   case WRITEMASK_ZW:       sse_movups(&cp->func, dst, arg);      sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));      return GL_TRUE;   case WRITEMASK_YZW:       if (dst0.file == file_XMM) {	 sse_movups(&cp->func, dst, arg);	 sse_movss(&cp->func, dst, dst0);      }      else {	 struct x86_reg tmp = get_xmm_reg(cp);      	 sse_movups(&cp->func, dst, arg);	 sse_movss(&cp->func, tmp, dst0);	 sse_movss(&cp->func, dst, tmp);      }      return GL_TRUE;   case WRITEMASK_Y:      emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));      return GL_TRUE;   case WRITEMASK_Z:       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));      return GL_TRUE;   case WRITEMASK_W:       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));      return GL_TRUE;   case WRITEMASK_XZ:      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W));      return GL_TRUE;   case WRITEMASK_XW:       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y));   case WRITEMASK_YZ:            emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W));      return GL_TRUE;   case WRITEMASK_YW:      emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X));      return GL_TRUE;   case WRITEMASK_XZW:      emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));      return GL_TRUE;   case WRITEMASK_XYW:       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));      return GL_TRUE;   case WRITEMASK_XYZ:       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));      return GL_TRUE;   case WRITEMASK_XYZW:      sse_movups(&cp->func, dst, arg);      return GL_TRUE;         default:      assert(0);      break;   }}static GLboolean emit_PRT( struct compilation *cp, union instruction op ){   FAIL;}/** * The traditional instructions.  All operate on internal registers * and ignore write masks and swizzling issues. */static GLboolean emit_ABS( struct compilation *cp, union instruction op ) {   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);   sse_movups(&cp->func, dst, arg0);   sse_mulps(&cp->func, dst, neg);   sse_maxps(&cp->func, dst, arg0);   return GL_TRUE;}static GLboolean emit_ADD( struct compilation *cp, union instruction op ){   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);   sse_movups(&cp->func, dst, arg0);   sse_addps(&cp->func, dst, arg1);   return GL_TRUE;}/* The dotproduct instructions don't really do that well in sse: */static GLboolean emit_DP3( struct compilation *cp, union instruction op ){   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);   struct x86_reg tmp = get_xmm_reg(cp);    sse_movups(&cp->func, dst, arg0);   sse_mulps(&cp->func, dst, arg1);      /* Now the hard bit: sum the first 3 values:    */    sse_movhlps(&cp->func, tmp, dst);   sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));   sse_addss(&cp->func, dst, tmp);   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));   return GL_TRUE;}static GLboolean emit_DP4( struct compilation *cp, union instruction op ){   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);   struct x86_reg tmp = get_xmm_reg(cp);         sse_movups(&cp->func, dst, arg0);   sse_mulps(&cp->func, dst, arg1);      /* Now the hard bit: sum the values:    */    sse_movhlps(&cp->func, tmp, dst);   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));   sse_addss(&cp->func, dst, tmp);   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));   return GL_TRUE;}static GLboolean emit_DPH( struct compilation *cp, union instruction op ){   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);   struct x86_reg tmp = get_xmm_reg(cp);         emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));   sse_movss(&cp->func, dst, ones);   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));   sse_mulps(&cp->func, dst, arg1);      /* Now the hard bit: sum the values (from DP4):    */    sse_movhlps(&cp->func, tmp, dst);   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));   sse_addss(&cp->func, dst, tmp);   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));   return GL_TRUE;}#if 0static GLboolean emit_DST( struct compilation *cp, union instruction op ){    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);     struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst); /*    dst[0] = 1.0     * 1.0F; *//*    dst[1] = arg0[1] * arg1[1]; *//*    dst[2] = arg0[2] * 1.0; *//*    dst[3] = 1.0     * arg1[3]; */    /* Would rather do some of this with integer regs, but:     *  1) No proper support for immediate values yet     *  2) I'd need to push/pop somewhere to get a free reg.     */     x87_fld1(&cp->func);    x87_fstp(&cp->func, dst); /* would rather do an immediate store... */    x87_fld(&cp->func, x86_make_disp(arg0, 4));    x87_fmul(&cp->func, x86_make_disp(arg1, 4));    x87_fstp(&cp->func, x86_make_disp(dst, 4));        if (!eq(arg0, dst)) {       x86_fld(&cp->func, x86_make_disp(arg0, 8));       x86_stp(&cp->func, x86_make_disp(dst, 8));    }    if (!eq(arg1, dst)) {       x86_fld(&cp->func, x86_make_disp(arg0, 12));       x86_stp(&cp->func, x86_make_disp(dst, 12));    }     return GL_TRUE;}#elsestatic GLboolean emit_DST( struct compilation *cp, union instruction op ){    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);     struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);     struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);     struct x86_reg tmp = get_xmm_reg(cp);    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));    sse_mulps(&cp->func, dst, tmp);/*    dst[0] = 1.0     * 1.0F; *//*    dst[1] = arg0[1] * arg1[1]; *//*    dst[2] = arg0[2] * 1.0; *//*    dst[3] = 1.0     * arg1[3]; */    return GL_TRUE;}#endifstatic GLboolean emit_LG2( struct compilation *cp, union instruction op ) {   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);    x87_fld1(&cp->func);		/* 1 */   x87_fld(&cp->func, arg0);	/* a0 1 */   x87_fyl2x(&cp->func);	/* log2(a0) */   x87_fst(&cp->func, x86_make_disp(dst, 0));   x87_fst(&cp->func, x86_make_disp(dst, 4));   x87_fst(&cp->func, x86_make_disp(dst, 8));   x87_fstp(&cp->func, x86_make_disp(dst, 12));      return GL_TRUE;}static GLboolean emit_EX2( struct compilation *cp, union instruction op ) {   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);   /* CAUTION: dst may alias arg0!    */   x87_fld(&cp->func, arg0);	   emit_x87_ex2(cp);   x87_fst(&cp->func, x86_make_disp(dst, 0));       x87_fst(&cp->func, x86_make_disp(dst, 4));       x87_fst(&cp->func, x86_make_disp(dst, 8));       x87_fst(&cp->func, x86_make_disp(dst, 12));       return GL_TRUE;}static GLboolean emit_EXP( struct compilation *cp, union instruction op ){    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);     struct x86_reg st0 = x86_make_reg(file_x87, 0);    struct x86_reg st1 = x86_make_reg(file_x87, 1);    struct x86_reg st3 = x86_make_reg(file_x87, 3);    /* CAUTION: dst may alias arg0!     */    x87_fld(&cp->func, arg0);	/* arg0.x */    x87_fld(&cp->func, st0); /* arg arg */    /* by default, fpu is setup to round-to-nearest.  We want to     * change this now, and track the state through to the end of the     * generated function so that it isn't repeated unnecessarily.     * Alternately, could subtract .5 to get round to -inf behaviour.     */    set_fpu_round_neg_inf( cp );    x87_fprndint( &cp->func );	/* flr(a) a */    x87_fld(&cp->func, st0); /* flr(a) flr(a) a */    x87_fld1(&cp->func);    /* 1 floor(a) floor(a) a */    x87_fst(&cp->func, x86_make_disp(dst, 12));  /* stack unchanged */    x87_fscale(&cp->func);  /* 2^floor(a) floor(a) a */    x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/    x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */    x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */    x87_fst(&cp->func, x86_make_disp(dst, 4));    /* frac(a) 2^flr(a) */    x87_f2xm1(&cp->func);    /* (2^frac(a))-1 2^flr(a)*/    x87_fld1(&cp->func);    /* 1 (2^frac(a))-1 2^flr(a)*/    x87_faddp(&cp->func, st1);	/* 2^frac(a) 2^flr(a) */    x87_fmulp(&cp->func, st1);	/* 2^a */    x87_fst(&cp->func, x86_make_disp(dst, 8));        /*    dst[0] = 2^floor(tmp); *//*    dst[1] = frac(tmp); *//*    dst[2] = 2^floor(tmp) * 2^frac(tmp); *//*    dst[3] = 1.0F; */    return GL_TRUE;}static GLboolean emit_LOG( struct compilation *cp, union instruction op ){    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);     struct x86_reg st0 = x86_make_reg(file_x87, 0);    struct x86_reg st1 = x86_make_reg(file_x87, 1);    struct x86_reg st2 = x86_make_reg(file_x87, 2);     /* CAUTION: dst may alias arg0!     */    x87_fld(&cp->func, arg0);	/* arg0.x */    x87_fabs(&cp->func);	/* |arg0.x| */    x87_fxtract(&cp->func);	/* mantissa(arg0.x), exponent(arg0.x) */    x87_fst(&cp->func, st2);	/* mantissa, exponent, mantissa */    x87_fld1(&cp->func);	/* 1, mantissa, exponent, mantissa */    x87_fyl2x(&cp->func); 	/* log2(mantissa), exponent, mantissa */    x87_fadd(&cp->func, st0, st1);	/* e+l2(m), e, m  */    x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */    x87_fld1(&cp->func);	/* 1, e, m */    x87_fsub(&cp->func, st1, st0);	/* 1, e-1, m */    x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */    x87_fstp(&cp->func, dst);	/* m */    x87_fadd(&cp->func, st0, st0);	/* 2m */    x87_fstp(&cp->func, x86_make_disp(dst, 4));	    return GL_TRUE;}static GLboolean emit_FLR( struct compilation *cp, union instruction op ) {   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);    int i;   set_fpu_round_neg_inf( cp );   for (i = 0; i < 4; i++) {      x87_fld(&cp->func, x86_make_disp(arg0, i*4));         x87_fprndint( &cp->func );         x87_fstp(&cp->func, x86_make_disp(dst, i*4));   }   return GL_TRUE;}static GLboolean emit_FRC( struct compilation *cp, union instruction op ) {   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);    struct x86_reg st0 = x86_make_reg(file_x87, 0);   struct x86_reg st1 = x86_make_reg(file_x87, 1);   int i;   set_fpu_round_neg_inf( cp );   /* Knowing liveness info or even just writemask would be useful    * here:    */   for (i = 0; i < 4; i++) {      x87_fld(&cp->func, x86_make_disp(arg0, i*4));         x87_fld(&cp->func, st0);	/* a a */      x87_fprndint( &cp->func );   /* flr(a) a */      x87_fsubrp(&cp->func, st1); /* frc(a) */      x87_fstp(&cp->func, x86_make_disp(dst, i*4));   }   return GL_TRUE;}static GLboolean emit_LIT( struct compilation *cp, union instruction op ){#if 1   struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);    struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT);   struct x86_reg tmp = get_xmm_reg(cp);   struct x86_reg st1 = x86_make_reg(file_x87, 1);   struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);   GLubyte *fixup1, *fixup2;   /* Load the interesting parts of arg0:    */   x87_fld(&cp->func, x86_make_disp(arg0, 12));	/* a3 */   x87_fld(&cp->func, x86_make_disp(arg0, 4)); /* a1 a3 */   x87_fld(&cp->func, x86_make_disp(arg0, 0)); /* a0 a1 a3 */      /* Intialize dst:    */   sse_movaps(&cp->func, tmp, lit);   sse_movaps(&cp->func, dst, tmp);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -