📄 t_vb_arbprogram_sse.c
字号:
/*
 * Mesa 3-D graphics library
 * Version: 6.3
 *
 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file t_vb_arb_program_sse.c
 *
 * Translate simplified vertex_program representation to
 * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
 *
 * This is very much a first attempt - build something that works.
 * There are probably better approaches for applying SSE to vertex
 * programs, and the whole thing is crying out for static analysis of
 * the programs to avoid redundant operations.
 *
 * \author Keith Whitwell
 */

#include "glheader.h"
#include "context.h"
#include "imports.h"
#include "macros.h"
#include "mtypes.h"
#include "arbprogparse.h"
#include "program.h"
#include "program_instruction.h"
#include "math/m_matrix.h"
#include "math/m_translate.h"
#include "t_context.h"
#include "t_vb_arbprogram.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"

/* Channel indices, used below as arguments to SHUF(). */
#define X 0
#define Y 1
#define Z 2
#define W 3

/* Reg usage:
 *
 * EAX - temp
 * EBX - point to 'm->File[0]'
 * ECX - point to 'm->File[3]'
 * EDX - holds 'm'
 * EBP,
 * ESI,
 * EDI
 */

#define DISASSEM 0

/* Report a translation failure and return GL_FALSE from the calling
 * function.  Only usable inside functions returning GLboolean.
 */
#define FAIL                                                            \
do {                                                                    \
   _mesa_printf("x86 translation failed in %s\n", __FUNCTION__);       \
   return GL_FALSE;                                                     \
} while (0)

/**
 * State carried while translating one program.
 */
struct compilation {
   struct x86_function func;          /* rtasm function being emitted into */
   struct tnl_compiled_program *p;
   GLuint insn_counter;               /* stamped into xmm[].last_used */

   /* One entry per XMM register (XMM0-7), recording which program
    * register (file, idx) it currently caches.
    */
   struct {
      GLuint file:2;
      GLuint idx:7;
      GLuint dirty:1;                 /* set when the XMM copy has not yet
                                       * been stored back, see spill() */
      GLuint last_used:10;            /* insn_counter value at last use */
   } xmm[8];

   struct {
      struct x86_reg base;
   } file[4];

   GLboolean have_sse2;               /* selects real pshufd vs. emulation */
   GLshort fpucntl;                   /* FPU control mode last loaded by the
                                       * emitted code */
};

/**
 * Return true when two x86_reg descriptors match in all of file, idx,
 * mod and disp.
 */
static INLINE GLboolean eq( struct x86_reg a,
                            struct x86_reg b )
{
   return (a.file == b.file &&
           a.idx == b.idx &&
           a.mod == b.mod &&
           a.disp == b.disp);
}

/**
 * Return the distance in bytes from \p a to \p b.
 */
static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/**
 * Build a base+displacement operand addressing program register
 * (\p file, \p idx).  FILE_REG is addressed through EBX and
 * FILE_STATE_PARAM through ECX; each register occupies 16 bytes.
 *
 * NOTE(review): for any other \p file the only guard is assert(0); with
 * asserts compiled out 'reg' is used uninitialized.
 */
static struct x86_reg get_reg_ptr(GLuint file,
                                  GLuint idx )
{
   struct x86_reg reg;

   switch (file) {
   case FILE_REG:
      reg = x86_make_reg(file_REG32, reg_BX);
      assert(idx != REG_UNDEF);
      break;
   case FILE_STATE_PARAM:
      reg = x86_make_reg(file_REG32, reg_CX);
      break;
   default:
      assert(0);
   }

   return x86_make_disp(reg, 16 * idx);
}

/**
 * Emit a store of XMM register \p idx to the memory location of the
 * program register it caches, and clear its dirty flag.  The slot must
 * be dirty.
 */
static void spill( struct compilation *cp, GLuint idx )
{
   struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
                                       cp->xmm[idx].idx);

   assert(cp->xmm[idx].dirty);
   sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
   cp->xmm[idx].dirty = 0;
}

/**
 * Claim an XMM register for scratch use.  The slot with the smallest
 * last_used stamp is chosen (lowest index wins ties); its old contents
 * are spilled first if dirty.  The slot is then tagged
 * FILE_REG/REG_UNDEF and stamped with the current insn_counter.
 */
static struct x86_reg get_xmm_reg( struct compilation *cp )
{
   GLuint i;
   GLuint oldest = 0;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
         oldest = i;

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   /* The victim must not already be in use by the current instruction. */
   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = FILE_REG;
   cp->xmm[oldest].idx = REG_UNDEF;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}

/**
 * Forget any XMM-resident copy of program register (\p file, \p idx).
 * The copy is dropped without being written back: the dirty flag is
 * simply cleared.  Only the first matching slot is touched.
 */
static void invalidate_xmm( struct compilation *cp,
                            GLuint file, GLuint idx )
{
   GLuint i;

   /* Invalidate any old copy of this register in XMM0-7.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
         cp->xmm[i].file = FILE_REG;
         cp->xmm[i].idx = REG_UNDEF;
         cp->xmm[i].dirty = 0;
         break;
      }
   }
}

/* Return an XMM reg to receive the results of an operation.
 * The returned register is tagged as holding (file, idx) and marked
 * dirty.
 */
static struct x86_reg get_dst_xmm_reg( struct compilation *cp,
                                       GLuint file, GLuint idx )
{
   struct x86_reg reg;

   /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
    * as this may be one of the arguments.
    */
   invalidate_xmm( cp, file, idx );

   reg = get_xmm_reg( cp );
   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = 1;
   return reg;
}

/* As above, but return a pointer.  Note - this pointer may alias
 * those returned by get_arg_ptr().
 */
static struct x86_reg get_dst_ptr( struct compilation *cp,
                                   GLuint file, GLuint idx )
{
   /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
    * as this may be one of the arguments.
    */
   invalidate_xmm( cp, file, idx );

   return get_reg_ptr(file, idx);
}

/* Return an XMM reg if the argument is resident, otherwise return a
 * base+offset pointer to the saved value.  A resident register has its
 * last_used stamp refreshed.
 */
static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx) {
         cp->xmm[i].last_used = cp->insn_counter;
         return x86_make_reg(file_XMM, i);
      }
   }

   return get_reg_ptr(file, idx);
}

/* As above, but always return a pointer:
 */
static struct x86_reg get_arg_ptr( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   /* If there is a modified version of this register in one of the
    * XMM regs, write it out to memory.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx &&
          cp->xmm[i].dirty)
         spill(cp, i);
   }

   return get_reg_ptr(file, idx);
}

/* Emulate pshufd insn in regular SSE, if necessary:
 *
 * Without SSE2 the source is first copied into dst (unless they are
 * already the same operand) and dst is then shuffled against itself.
 *
 * NOTE(review): the SSE2 path also clears cp->func.fn; the reason is not
 * visible in this file - confirm against the rtasm sources.
 */
static void emit_pshufd( struct compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         GLubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(&cp->func, dst, arg0, shuf);
      cp->func.fn = 0;
   }
   else {
      if (!eq(dst, arg0))
         sse_movups(&cp->func, dst, arg0);

      sse_shufps(&cp->func, dst, dst, shuf);
   }
}

/**
 * Emit code loading the x87 control word stored in the machine's
 * fpucntl_rnd_neg field (addressed relative to EDX), unless the tracked
 * mode is already RND_NEG_FPU.  An fnclex is emitted before the fldcw.
 */
static void set_fpu_round_neg_inf( struct compilation *cp )
{
   if (cp->fpucntl != RND_NEG_FPU) {
      struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
      /* Null base pointer used only to compute the field's offset. */
      struct arb_vp_machine *m = NULL;

      cp->fpucntl = RND_NEG_FPU;
      x87_fnclex(&cp->func);
      x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg)));
   }
}

/* Perform a reduced swizzle.
 *
 * The swizzle itself is a single pshufd.  When any negate bit is set a
 * per-channel multiplier vector is shuffled out of REG_SWZ and the
 * result is multiplied by it.
 */
static GLboolean emit_RSW( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
   GLuint swz = op.rsw.swz;
   GLuint neg = op.rsw.neg;

   emit_pshufd(cp, dst, arg0, swz);

   if (neg) {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
      struct x86_reg tmp = get_xmm_reg(cp);
      /* Load 1,-1,0,0
       * Use neg as arg to pshufd
       * Multiply
       */
      emit_pshufd(cp, tmp, negs,
                  SHUF((neg & 1) ? 1 : 0,
                       (neg & 2) ? 1 : 0,
                       (neg & 4) ? 1 : 0,
                       (neg & 8) ? 1 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
}

/* Helper for writemask:
 *
 * Loads arg1 into dst, applies 'shuf' to both dst and a shuffled copy of
 * arg0, moves the low element of the arg0 copy into dst with movss, then
 * applies 'shuf' to dst again.
 */
static GLboolean emit_shuf_copy1( struct compilation *cp,
                                  struct x86_reg dst,
                                  struct x86_reg arg0,
                                  struct x86_reg arg1,
                                  GLubyte shuf )
{
   struct x86_reg tmp = get_xmm_reg(cp);
   sse_movups(&cp->func, dst, arg1);
   emit_pshufd(cp, dst, dst, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);

   sse_movss(&cp->func, dst, tmp);

   emit_pshufd(cp, dst, dst, shuf);
   return GL_TRUE;
}

/* Helper for writemask:
 *
 * Shuffles arg1 into dst and arg0 into a temporary with 'shuf', combines
 * them with shufps (X, Y, Z, W), then applies 'shuf' to dst again.
 */
static GLboolean emit_shuf_copy2( struct compilation *cp,
                                  struct x86_reg dst,
                                  struct x86_reg arg0,
                                  struct x86_reg arg1,
                                  GLubyte shuf )
{
   struct x86_reg tmp = get_xmm_reg(cp);
   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);

   sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));

   emit_pshufd(cp, dst, dst, shuf);
   return GL_TRUE;
}

/**
 * Emit an x87 sequence which, per the stack annotations below, replaces
 * the value 'a' on top of the x87 stack with 2^a.  The round-toward
 * negative-infinity control word is requested first so that frndint
 * yields the integer part.
 */
static void emit_x87_ex2( struct compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   struct x86_reg st3 = x86_make_reg(file_x87, 3);

   set_fpu_round_neg_inf( cp );

   x87_fld(&cp->func, st0);      /* a a */
   x87_fprndint( &cp->func );    /* int(a) a */
   x87_fld(&cp->func, st0);      /* int(a) int(a) a */
   x87_fstp(&cp->func, st3);     /* int(a) a int(a)*/
   x87_fsubp(&cp->func, st1);    /* frac(a) int(a) */
   x87_f2xm1(&cp->func);         /* (2^frac(a))-1 int(a)*/
   x87_fld1(&cp->func);          /* 1 (2^frac(a))-1 int(a)*/
   x87_faddp(&cp->func, st1);    /* 2^frac(a) int(a) */
   x87_fscale(&cp->func);        /* 2^a */
}

/* Disabled alternative write-mask implementation based on SSE2 integer
 * logic ops.  Kept compiled out.
 */
#if 0
static GLboolean emit_MSK2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.msk.file, op.msk.arg);
   struct x86_reg arg1 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* make full width bitmask in tmp
    * dst = ~tmp
    * tmp &= arg0
    * dst &= arg1
    * dst |= tmp
    */
   emit_pshufd(cp, tmp, get_arg(cp, FILE_REG, REG_NEGS),
               SHUF((op.msk.mask & 1) ? 2 : 0,
                    (op.msk.mask & 2) ? 2 : 0,
                    (op.msk.mask & 4) ? 2 : 0,
                    (op.msk.mask & 8) ? 2 : 0));
   sse2_pnot(&cp->func, dst, tmp);
   sse2_pand(&cp->func, arg0, tmp);
   sse2_pand(&cp->func, arg1, dst);
   sse2_por(&cp->func, tmp, dst);
   return GL_TRUE;
}
#endif

/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 */
static GLboolean emit_MSK( struct compilation *cp, union instruction op )
{
   struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
   struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* Note that dst and dst0 refer to the same program variable, but
    * will definitely be different XMM registers.  We're effectively
    * treating this as a 2 argument SEL now, just one of which happens
    * always to be the same register as the destination.
    */

   switch (op.msk.mask) {
   case 0:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -