📄 hxfassemblearm.s
字号:
;/* ************************************************************************ *\
;** INTEL Corporation Proprietary Information
;**
;** This listing is supplied under the terms of a license
;** agreement with INTEL Corporation and may not be copied
;** nor disclosed except in accordance with the terms of
;** that agreement.
;**
;** Copyright (c) 2003 Intel Corporation.
;** All Rights Reserved.
;**
;** ************************************************************************ **
; FILE: HXFAssembleARM.s
; DESCRIPTION:
;
; AUTHOR: Cian Montgomery
; CREATED: July 31, 2003
;
; $Date: 3/19/04 6:07p $ $Revision: 20 $
; $Log: /Intel_Development/Drivers/Marathon/WinCE42/opengles/HXFAssembleARM.s $
; *
; * 20 3/19/04 6:07p Clmontgo
; * Fix conformance issues
; *
; * 19 3/19/04 2:11p Clmontgo
; * Moved VP xform to after clip and 1/w
; *
; * 18 3/18/04 10:37p Clmontgo
; * Fixes for Clipping and SP clean up
; *
; * 17 2/25/04 11:39a Clmontgo
; * Fix a number of comparisons that were doing sign-dependent gt instead
; * of equality-based ne tests.
; *
; * 16 2/04/04 5:42p Clmontgo
; * Rewrite of Slaveport code to do incremental writes, should eliminate
; * Slaveport write buffer stalls. Added some error handling to Draw*
; * functions.
; *
; * 15 2/02/04 2:23p Clmontgo
; * Fixed Clipping Trivially in Test
; *
; * 14 1/21/04 2:09p Cmdoan
; * fixed hw level strip cull direction
;
; 13 12/17/03 9:20a Clmontgo
; Added Version ID and log to file headers
;\* ************************************************************************ */
INCLUDE HXFSTATE.INC
;** ************************************************************************ **
;** CONSTANTS
;** ************************************************************************ **
; Flag bits carried in the upper bits of the flags|count word; the low bits
; of that word are used as the loop counter.
HXF_PA_IN_PRIMITIVE               EQU 0x40000000 ; bit 30
HXF_PA_CULLDIR_CW                 EQU 0x10000000 ; bit 28: 1 = CW, 0 = CCW
HXF_PA_CULL_ENABLE                EQU 0x08000000 ; bit 27: 1 = culling enabled
; Prefetch distances, counted in loop iterations.
; Assumes one iteration costs roughly 100 cycles.
HXF_PA_IDX_PREFETCH_DISTANCE      EQU 4 ; iterations of index data to fetch ahead
HXF_PA_VTX_PREFETCH_DISTANCE      EQU 2 ; iterations of vertex data to fetch ahead
; Byte offsets into the index array derived from the distances above:
; distance * indices-per-iteration * bytes-per-index. The *_LIST values use
; 3 indices per iteration, the *_STRIP values use 1.
; NOTE(review): HXF_INDEX_SIZE is not defined here - presumably it comes from
; HXFSTATE.INC; confirm.
HXF_PA_IDX_PREFETCH_LIST          EQU (HXF_INDEX_SIZE * 3 * HXF_PA_IDX_PREFETCH_DISTANCE)
HXF_PA_IDX_PREFETCH_STRIP         EQU (HXF_INDEX_SIZE * 1 * HXF_PA_IDX_PREFETCH_DISTANCE)
HXF_PA_VTX_PREFETCH_BASEIDX_LIST  EQU (HXF_INDEX_SIZE * 3 * HXF_PA_VTX_PREFETCH_DISTANCE)
HXF_PA_VTX_PREFETCH_BASEIDX_STRIP EQU (HXF_INDEX_SIZE * 1 * HXF_PA_VTX_PREFETCH_DISTANCE)
;** ************************************************************************ **
;** EXPORTS
;** ************************************************************************ **
EXPORT |HXFAssembleIndexedTriList|
EXPORT |HXFAssembleIndexedTriStrip|
EXPORT |HXFAssembleIndexedTriFan|
EXPORT |HXFAssembleTriList|
EXPORT |HXFAssembleTriStrip|
EXPORT |HXFAssembleTriFan|
;** ************************************************************************ **
;** IMPORTS
;** ************************************************************************ **
IMPORT |HXFClipTriangle|
IMPORT |HXFDrawClippedTriangles|
IMPORT |ctlBeginPrimitiveList|
IMPORT |ctlEndPrimitiveList|
IMPORT |ctlVertex|
;** ************************************************************************ **
;** VARIABLES
;** ************************************************************************ **
;** ************************************************************************ **
;** FUNCTIONS
;** ************************************************************************ **
AREA HXFASSEMBLE, CODE, READONLY
;** ************************************************************************ **
; Name: HXFAssembleIndexedTriList
; Description: This draws an indexed TriList.
; Here's a layout of what we need to do:
; Init - parameters, variables, and conditional stuff
; Based on the cullFace direction and clip enabled, we'll branch to the appropriate
; parts of the iterative loop (CCW/CW, or no cull check, clip) to perform.
; Also, set up bDraw and HXF_PA_IN_PRIMITIVE bits in the count register
; Loop (CCW, CW, and no cull, and clipping)
; Submit a triangle for the cull & clip tests (if enabled)
; Note: Complex clipped triangles will be handled directly by the Clip test
; DRAW: If it passes both tests, flag bDraw = 1 and HXF_PA_IN_PRIMITIVE = 1
; But, if HXF_PA_IN_PRIMITIVE was 0, then call StartList
; NO_DRAW: If bDraw is 0 (failed cull or clip), bDraw = HXF_PA_IN_PRIMITIVE = 0
; if HXF_PA_IN_PRIMITIVE was 1, call EndList
; if HXF_PA_IN_PRIMITIVE was already 0, check to see if ClippedTriangleBuffer needs
; to be flushed.
; reset for next loop; flip cullFace direction, move pointers back
; End
; If HXF_PA_IN_PRIMITIVE, call Endlist.
; flush ClippedTriangleBuffer if needed
; Input Arguments: pState in r0
; Output Argument: none
; Prototype in C: void HXFAssembleIndexedTriList(HXFState* pState);
;** ************************************************************************ **
|HXFAssembleIndexedTriList| PROC
; ---------------------------------------------------------------------- --
; Register Map
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = r13 = sp
; r2 = r6 = pVtxC r10 = r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Basically, r1-r3, r7, r9, r10, and r12 are free for use.
; r11 is a special case, where we must do stuff with it
stmfd sp!, {r4-r11, lr} ; stack push!
ldr r12, [r0, #HXFSTATE_OFFSET_FLAGS]
mov r1, #0 ; use as loop counter up to NumPrimitives
; also, implicitly set bits 31-27 to 0 (bDraw, HXF_PA_IN_PRIMITIVE, bClipEnable,
; HXF_PA_CULLDIR_CW, HXF_PA_CULL_ENABLE)
ands r12, r12, #HXF_CULL_MASK ; mask out all but the Culling bits
orrne r1, r1, #HXF_PA_CULL_ENABLE ; set to 1 if enabled
cmp r12, #HXF_CULL_CW ; if HXF_CULL_CW, then start with CW iteration
orreq r1, r1, #HXF_PA_CULLDIR_CW ; set to 1 for CW, 0 for CCW
mov r8, r1 ; send in the count & flags..
ldr r9, [r0, #HXFSTATE_OFFSET_PINDICES] ; point to index list
; which is always incremented sequentially
str r8, [r0, #HXFSTATE_OFFSET_STORAGE_LR] ; Store Cull info for clipper
; The only difference between CULL_CW and CULL_CCW is the order of
; subtraction during the vector calculation, so we use some local flag
; bits to select it, rather than maintain 3 different loops.
; Furthermore, I will use a conditional to skip over culling entirely to keep
; just a single loop. If we need more performance due to branching penalties,
; then we can unroll things.
; In the C code, we store both the indices and the vertex pointer.
; In ARM, we'll only store the vertex pointer; we always know that
; we can find the index at count, count - 1, and count - 2 in the
; index list when we need them.
ARM_ASSEMBLE_IDX_TRILIST_LOOP
; ---------------------------------------------------------------------- --
; Register Map - Loop Setup
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = pIndices r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Now, go snag 3 vertices in the list.
ldr r7, [r0, #HXFSTATE_OFFSET_OUTVERTEXSIZE] ; stride
ldr r10, [r0, #HXFSTATE_OFFSET_POUTVERTICES] ; Point to output list
ldr r12, [r0, #HXFSTATE_OFFSET_BASEINDEX]
; Prefetch
pld [r9, #HXF_PA_IDX_PREFETCH_LIST] ; prefetch the indices HXF_PA_IDX_PREFETCH_DISTANCE (4) iterations ahead
ldrh r1, [r9, #HXF_PA_VTX_PREFETCH_BASEIDX_LIST] ; get point a's index - 2 ahead
ldrh r2, [r9, #HXF_PA_VTX_PREFETCH_BASEIDX_LIST+HXF_INDEX_SIZE] ; get point b's index - 2 ahead
ldrh r3, [r9, #HXF_PA_VTX_PREFETCH_BASEIDX_LIST+(HXF_INDEX_SIZE<<1)] ; get point c's index - 2 ahead
sub r1, r1, r12 ; Factor in the Base Index
mul r14, r1, r7 ; multiply by stride
add r1, r10, r14 ; first vertex pointer
sub r2, r2, r12 ; Factor in the Base Index
mul r14, r2, r7 ; multiply by stride
add r2, r10, r14 ; second vertex pointer
sub r3, r3, r12 ; Factor in the Base Index
mul r14, r3, r7 ; multiply by stride
add r3, r10, r14 ; third vertex pointer
pld [r1] ; prefetch vertex a +2
pld [r2] ; prefetch vertex b +2
pld [r3] ; prefetch vertex c +2
; Load the Indices
ldrh r4, [r9, #0] ; first index
ldrh r5, [r9, #2] ; second index
ldrh r6, [r9, #4] ; third index
sub r4, r4, r12 ; Factor in the Base Index
mul r14, r4, r7 ; multiply by stride
add r4, r10, r14 ; first vertex pointer
sub r5, r5, r12 ; Factor in the Base Index
mul r14, r5, r7 ; multiply by stride
add r5, r10, r14 ; second vertex pointer
sub r6, r6, r12 ; Factor in the Base Index
mul r14, r6, r7 ; multiply by stride
add r6, r10, r14 ; third vertex pointer
ARM_ASSEMBLE_IDX_TRILIST_CLIP
; ---------------------------------------------------------------------- --
; Register Map - Clipping
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = k1* r5 = pVtxB r9 = pIndices r13 = sp
; r2 = k2* r6 = pVtxC r10 = pOutVtx r14 =
; r3 = k3* r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Load Clip flags
; otherwise enabled, so send clip flag information to clip tester
ldr r12, [r0, #HXFSTATE_OFFSET_BASEINDEX]
ldr r7, [r0, #HXFSTATE_OFFSET_POUTCLIPFLAGS] ; pointer to clip flag array
ldrh r1, [r9] ; get point a's index
ldrh r2, [r9, #HXF_INDEX_SIZE] ; get point b's index
ldrh r3, [r9, #(HXF_INDEX_SIZE<<1)] ; get point c's index
; Account for base index
sub r1, r1, r12
sub r2, r2, r12
sub r3, r3, r12
add r1, r7, r1 ; Byte pointer to get the correct clip flag info
add r2, r7, r2 ; for k1, k2, k3
add r3, r7, r3
ldrb r7, [r1] ; k1
ldrb r11, [r2] ; k2
ldrb r12, [r3] ; k3
; k1, k2, k3 are clip flags per vertex
; if ((k1&k2) != 0 && (k2&k3) != 0 && (k3&k1) != 0) means totally outside (all segments totally outside)
; if ((k1|k2|k3) == 0) means no clipping necessary (totally in)
; else clip it.
; If all three AND results are nonzero, then the trivial-out test passes...
; that is, if all of the points are outside of the clipping planes
; and none of the segments cross a clip plane (all within a clip region), then
; the triangle is trivially out.
; Otherwise, more testing is needed.
tst r7, r11 ; first check k1 & k2
tstne r11, r12 ; if (k1&k2) was 1, check (k2&k3)
tstne r7, r12 ; if (k2&k3) was 1 , check (k1&k3)
bne ARM_ASSEMBLE_IDX_TRILIST_NODRAW ; trivial rejection if 1
ARM_ASSEMBLE_IDX_TRILIST_TRIVIAL_CLIP
; if any clip flag bit is set on any vertex, then the trivially-in test fails!
orrs r10, r7, r11 ; if ORing is ever nonzero, then we aren't trivially in
orrs r10, r10, r12 ; if first OR was zero, must check second OR
; if no bits are set, skip clipping and go on to the cull test (trivially in)...
; but if any bit is set, then clip first.
beq ARM_ASSEMBLE_IDX_TRILIST_CULL
ARM_ASSEMBLE_IDX_TRILIST_CALLCLIP
; void HXFClipTriangle(HXFState* pState, HUINT8* pV1, HUINT8* pV2, HUINT8* pV3)
blne |HXFClipTriangle|
b ARM_ASSEMBLE_IDX_TRILIST_NODRAW
ARM_ASSEMBLE_IDX_TRILIST_CULL
; ---------------------------------------------------------------------- --
; Register Map - Cull
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = pIndices r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Test if we need to cull
tst r8, #HXF_PA_CULL_ENABLE ; cull check
beq ARM_ASSEMBLE_IDX_TRILIST_DRAW
; CLIP SPACE vertex pointers A, B, C in r4, r5, r6
; result: signed value of dot product; if negative, visible, otherwise
; not visible
; The culling algorithm is basically a dot product of the camera to the
; surface normal. But, we have to generate the normal here from the
; triangle orientation. Since we are in clip space, the camera is
; [0,0,+/-1] depending on your LHR CCW or CW flag. This means we only
; really need the Z component of the face normal.
; 1) Create vectors
ldr r7, [r4] ; ax
ldr r2, [r5] ; bx
ldr r1, [r4,#4] ; ay
ldr r3, [r5,#4] ; by
tst r8, #HXF_PA_CULLDIR_CW ; if 0, CCW; 1, CW
; The only difference in computation is really which term (Ax or Ay) gets negated.
; CW: formula Nz.EyeZ = (AxBy - AyBx ) . [0,0,-1]
; CW: = (AyBx - AxBy)
; CCW: formula Nz.EyeZ = AxBy - AyBx . [0,0,1]
; CCW: = (AxBy - AyBx)
subne r11, r7, r2 ; CW: -Ax = -(bx - ax ) = ax - bx
subeq r11, r2, r7 ; CCW: Ax = bx - ax ;
ldr r2, [r6] ; cx
subne r12, r3, r1 ; CW: Ay = by - ay
subeq r12, r1, r3 ; CCW: -Ay = -(by - ay) = ay -by ;
ldr r3, [r6,#4] ; cy
; r7 = ax
; r1 = ay
; r2 = cx
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -