📄 t_vertex_sse.c
字号:
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

/*
 * Run-time generation of SSE code for emitting vertices.  The helpers in
 * this part of the file each append a short, fixed instruction sequence to
 * the function being built in an x86_program; nothing here executes the
 * generated code.
 */

#include "glheader.h"
#include "context.h"
#include "colormac.h"
#include "t_context.h"
#include "t_vertex.h"
#include "simple_list.h"
#include "enums.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"

/**
 * Number of bytes to allocate for generated SSE functions
 */
#define MAX_SSE_CODE_SIZE 1024

/* Component indices, used as lane selectors in the SHUF() calls below. */
#define X 0
#define Y 1
#define Z 2
#define W 3

/**
 * State carried through code generation.
 */
struct x86_program {
   /* Function being assembled; every emitter below appends to it. */
   struct x86_function func;

   /* GL context; used to locate the vertex state (GET_VERTEX_STATE). */
   GLcontext *ctx;

   /* When true, emit_load3f_3 may read a full 4 floats from a 3-float
    * source (over-reads by one dword).
    */
   GLboolean inputs_safe;

   /* When true, emit_store3f may write a full 4 floats to a 3-float
    * destination (over-writes by one dword).
    */
   GLboolean outputs_safe;

   /* Selects the SSE2 path over the MMX path in emit_pack_store_4ub. */
   GLboolean have_sse2;

   /* Operand supplying default components for short loads.  The lane
    * diagram in emit_load4f_3 implies its Z,W lanes are 0,1 -- presumably
    * the whole value is (0,0,0,1); it is initialised outside this view,
    * TODO confirm.
    */
   struct x86_reg identity;

   /* Operand multiplied into a colour before packing; per the comment in
    * emit_pack_store_4ub this is the 255.0 scale.  Initialised outside
    * this view.
    */
   struct x86_reg chan0;
};


/**
 * Return the operand holding the identity/default vector.
 *
 * \param p  program being built
 * \return   p->identity, unchanged
 */
static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}

/*
 * Load emitters.
 *
 * Naming: emit_load<N>f_<M>.  These are reached through the load[][] table
 * below, where <N> corresponds to the 'sz' argument of emit_load() and <M>
 * to its 'src_sz' argument.  All take:
 *
 *   p     program being built
 *   dest  destination operand of the emitted instructions
 *   arg0  source operand of the emitted instructions (the callers visible
 *         in this file pass an address operand; emit_load4f_3 and
 *         emit_load3f_3 displace it by 8 bytes to reach the third float)
 */

/**
 * Emit a single unaligned 4-float move from arg0 into dest.
 */
static void emit_load4f_4( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

/**
 * Emit a 3-float load that fills the fourth lane from the identity
 * vector, without reading a fourth float from arg0.
 */
static void emit_load4f_3( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Have to jump through some hoops.  Contents of dest after each of
    * the four emitted instructions, in order:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
   sse_movlps(&p->func, dest, arg0);
}

/**
 * Emit a 2-float load with the remaining lanes taken from the identity
 * vector.
 */
static void emit_load4f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(&p->func, dest, get_identity(p));
   sse_movlps(&p->func, dest, arg0);
}

/**
 * Emit a 1-float load with the upper lanes taken from the identity
 * vector.
 */
static void emit_load4f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(&p->func, dest, arg0);
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}

/**
 * Emit a 3-float load where the fourth lane's value is not significant.
 *
 * With p->inputs_safe set this is one 4-float move; otherwise a three
 * instruction sequence that touches only the three source floats and
 * leaves a copy of the third float in the fourth lane.
 */
static void emit_load3f_3( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Contents of dest after each emitted instruction:
       *
       * c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
      sse_movlps(&p->func, dest, arg0);
   }
}

/**
 * 3-wide destination, 2-float source: same sequence as emit_load4f_2.
 */
static void emit_load3f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

/**
 * 3-wide destination, 1-float source: same sequence as emit_load4f_1.
 */
static void emit_load3f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

/**
 * Emit a single movlps from arg0 into dest (2-float load).  No identity
 * fill is emitted for the other lanes.
 */
static void emit_load2f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

/**
 * 2-wide destination, 1-float source: same sequence as emit_load4f_1.
 */
static void emit_load2f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

/**
 * Emit a single movss from arg0 into dest (1-float load).
 */
static void emit_load1f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

/* Load dispatch table, indexed [sz-1][src_sz-1] by emit_load().
 *
 * Each row repeats its widest loader for the columns where src_sz exceeds
 * sz, so at most 'sz' floats are fetched from the source.
 */
static void (*load[4][4])( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

/**
 * Emit code to load an attribute into dest, selecting the sequence from
 * the load[][] table.
 *
 * \param p       program being built
 * \param dest    destination operand
 * \param sz      first table index plus one; must be 1..4
 * \param src     source operand
 * \param src_sz  second table index plus one; must be 1..4
 *
 * NOTE(review): neither size is range-checked here.  Both are unsigned,
 * so a value of 0 (or anything above 4) indexes outside the table --
 * callers must guarantee 1..4.
 */
static void emit_load( struct x86_program *p,
                       struct x86_reg dest,
                       GLuint sz,
                       struct x86_reg src,
                       GLuint src_sz)
{
   load[sz-1][src_sz-1](p, dest, src);
}

/*
 * Store emitters.  All take:
 *
 *   p     program being built
 *   dest  destination operand of the emitted instructions
 *   arg0  source operand holding the value to store
 */

/**
 * Emit a single unaligned 4-float move from arg0 to dest.
 */
static void emit_store4f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

/**
 * Emit a 3-float store.
 *
 * With p->outputs_safe set, a full 4-float move is emitted (one extra
 * dword is written past the 3 floats).  Otherwise exactly three floats
 * are written, and the emitted shuffle overwrites arg0 -- the caller
 * must not rely on arg0's contents afterwards on that path.
 */
static void emit_store3f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(&p->func, dest, arg0);
      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
   }
}

/**
 * Emit a single movlps from arg0 to dest (2-float store).
 */
static void emit_store2f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

/**
 * Emit a single movss from arg0 to dest (1-float store).
 */
static void emit_store1f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

/* Store dispatch table, indexed [sz-1] by emit_store(). */
static void (*store[4])( struct x86_program *p,
                         struct x86_reg dest,
                         struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

/**
 * Emit code to store 'sz' floats from temp to dest, selecting the
 * sequence from the store[] table.
 *
 * \param p     program being built
 * \param dest  destination operand
 * \param sz    number of floats; must be 1..4
 * \param temp  operand holding the value; overwritten by the generated
 *              code when sz == 3 and p->outputs_safe is false (see
 *              emit_store3f)
 *
 * NOTE(review): sz is not range-checked; 0 or >4 indexes outside the
 * table.
 */
static void emit_store( struct x86_program *p,
                        struct x86_reg dest,
                        GLuint sz,
                        struct x86_reg temp )
{
   store[sz-1](p, dest, temp);
}

/**
 * Emit code that scales the four floats in temp by p->chan0, converts
 * them to integers, packs them down to four bytes and stores those four
 * bytes at dest.
 *
 * \param p     program being built
 * \param dest  destination operand (receives one dword)
 * \param temp  operand holding the 4 floats; overwritten by the
 *              generated code on both paths
 *
 * The SSE2 path works entirely in temp.  The fallback path converts two
 * floats at a time through MMX registers 0 and 1, which the generated
 * code therefore clobbers.
 *
 * NOTE(review): no emms is emitted here after the MMX sequence --
 * presumably the enclosing generated function handles that; confirm
 * against build_vertex_emit.
 */
static void emit_pack_store_4ub( struct x86_program *p,
                                 struct x86_reg dest,
                                 struct x86_reg temp )
{
   /* Scale by 255.0
    */
   sse_mulps(&p->func, temp, p->chan0);

   if (p->have_sse2) {
      /* float -> dword, dword -> word, word -> byte, then write the
       * low dword.
       */
      sse2_cvtps2dq(&p->func, temp, temp);
      sse2_packssdw(&p->func, temp, temp);
      sse2_packuswb(&p->func, temp, temp);
      sse_movss(&p->func, dest, temp);
   }
   else {
      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);

      /* Low two floats -> mmx0 */
      sse_cvtps2pi(&p->func, mmx0, temp);
      /* Bring the high two floats down, then convert them -> mmx1 */
      sse_movhlps(&p->func, temp, temp);
      sse_cvtps2pi(&p->func, mmx1, temp);
      /* dword -> word, word -> byte, then write the low dword */
      mmx_packssdw(&p->func, mmx0, mmx1);
      mmx_packuswb(&p->func, mmx0, mmx0);
      mmx_movd(&p->func, dest, mmx0);
   }
}

/**
 * Byte distance from a to b.
 *
 * \param a  base address
 * \param b  address to measure to
 * \return   (char *)b - (char *)a, converted to GLint
 *
 * NOTE(review): the pointer difference is implicitly narrowed to GLint;
 * the uses visible here pass a structure and the address of an attribute
 * field, presumably one stored inside that structure.
 */
static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
/**
 * Emit a load of a->inputptr into srcREG.
 *
 * \param p       program being built
 * \param srcREG  register that receives the attribute's input pointer
 * \param vtxREG  base register for the displacement; assumed to hold, at
 *                run time, the address of the structure returned by
 *                GET_VERTEX_STATE(p->ctx) -- TODO confirm in the caller
 * \param a       attribute whose inputptr field is addressed
 *
 * The displacement is computed once, at code-generation time, as the
 * offset of &a->inputptr from that vertex-state structure.
 */
static void get_src_ptr( struct x86_program *p,
                         struct x86_reg srcREG,
                         struct x86_reg vtxREG,
                         struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(&p->func, srcREG, ptr_to_src);
}

/**
 * Emit code that advances srcREG by the attribute's stride and writes
 * the new pointer back to a->inputptr.
 *
 * \param p       program being built
 * \param srcREG  register currently holding the attribute's input pointer
 * \param vtxREG  base register for the displacement (same assumption as
 *                in get_src_ptr)
 * \param a       attribute being advanced
 *
 * Nothing is emitted when a->inputstride is zero at code-generation
 * time.  The stride is baked into the generated lea as a constant, so
 * the generated code reflects the value it had when this was called.
 */
static void update_src_ptr( struct x86_program *p,
                            struct x86_reg srcREG,
                            struct x86_reg vtxREG,
                            struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(&p->func, ptr_to_src, srcREG);
   }
}

/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static GLboolean build_vertex_emit( struct x86_program *p )
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -