📄 hxfassemblearm.s
字号:
blne |HXFClipTriangle|
b ARM_ASSEMBLE_IDX_TRIFAN_NODRAW
ARM_ASSEMBLE_IDX_TRIFAN_CULL
; ---------------------------------------------------------------------- --
; Register Map - Cull
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = pIndices r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Test if we need to cull
tst r8, #HXF_PA_CULL_ENABLE ; cull check
beq ARM_ASSEMBLE_IDX_TRIFAN_DRAW ; ... and submit it to the culling function (inlined) if needed
; CLIP SPACE vertex pointers A, B, C in r4, r5, r6
; result: signed value of dot product; if negative, visible, otherwise not visible
; The culling algorithm is basically a dot product of the camera to the surface normal.
; But, we have to generate the normal here from the triangle orientation.
; Since we are in clip space, the camera is [0,0,+/-1] depending on your LHR CCW or CW flag.
; This means we only really need the Z component of the face normal.
; 1) Create vectors
ldr r7, [r4] ; ax
ldr r2, [r5] ; bx
ldr r1, [r4,#4] ; ay
ldr r3, [r5,#4] ; by
tst r8, #HXF_PA_CULLDIR_CW ; if 0, CCW; 1, CW
; The only difference in computation is really which term (Ax or Ay) gets negated.
; CW: formula Nz.EyeZ = (AxBy - AyBx ) . [0,0,-1]
; CW: = (AyBx - AxBy)
; CCW: formula Nz.EyeZ = AxBy - AyBx . [0,0,1]
; CCW: = (AxBy - AyBx)
subne r11, r7, r2 ; CW: -Ax = -(bx - ax ) = ax - bx
subeq r11, r2, r7 ; CCW: Ax = bx - ax ;
ldr r2, [r6] ; cx
subne r12, r3, r1 ; CW: Ay = by - ay
subeq r12, r1, r3 ; CCW: -Ay = -(by - ay) = ay -by ;
ldr r3, [r6,#4] ; cy
; r7 = ax
; r1 = ay
; r2 = cx
; r3 = cy
; r11 = -/+Ax (CW/CCW)
; r12 = +/-Ay
sub r7, r2, r7 ; Bx = cx - ax
sub r1, r3, r1 ; By = cy - ay
; r7 = Bx
; r1 = By
; r11 = -/+Ax (CW/CCW)
; r12 = +/-Ay
; 2) Cross A into B to get Nz ONLY, then compare sign (don't need anything else,
; as camera in projected space is at the origin)
smull r3, r2, r11, r1 ; AxBy in 64bits of r3r12
smlals r3, r2, r12, r7 ; add -AyBx in 64 bits to r3r12, need middle 32
; r2 = 32.0 of Nz.EyeZ (high word of the 64-bit result)
; r3 = 0.32 of Nz.EyeZ (low word of the 64-bit result)
; Since we only care about the sign, just look at the hi word in r2
; We expect the program to check for a negative!
blt ARM_ASSEMBLE_IDX_TRIFAN_NODRAW
; Draw; gets here from cull disabled, cull pass and clip disabled, or cull/clip pass.
ARM_ASSEMBLE_IDX_TRIFAN_DRAW
; ---------------------------------------------------------------------- --
; Register Map - Draw
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = pIndices r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; bDraw implicitly 1
; check HXF_PA_IN_PRIMITIVE; if 0, call Startlist
; ALWAYS call DrawVertex
tst r8, #HXF_PA_IN_PRIMITIVE ; if 0, call startlist and set to true
bne ARM_ASSEMBLE_IDX_TRIFAN_STARTDRAW
orr r8, r8, #HXF_PA_IN_PRIMITIVE ; set to 1 in local flags
;Call HXFState* HXFBeginPrimitiveList(HXFState*, void* pVtx0, void* pVtx1);
mov r1, r4 ; vtx0
mov r2, r5 ; vtx1
mov r3, #0
bl |ctlBeginPrimitiveList|
; Now, do compulsory DrawVertex
ARM_ASSEMBLE_IDX_TRIFAN_STARTDRAW
;Call HXFState* pState->pSPProc(HXFState*, void* vtx0, void* vtx1, void* vtx2);
;ldr r12, [r0, #HXFSTATE_OFFSET_PSPPROC]
mov r1, r6 ; load vtx2
mov r2, #0
mov r3, #0
;mov lr, pc
;mov pc, r12
bl |ctlVertex|
b ARM_ASSEMBLE_IDX_TRIFAN_LOOPBACK ; Go and see if we're done!
ARM_ASSEMBLE_IDX_TRIFAN_NODRAW
; exit point for any failed bDraw checks
; since bDraw is false, check to see if HXF_PA_IN_PRIMITIVE was true.
; if HXF_PA_IN_PRIMITIVE is true, call Endlist and clear the HXF_PA_IN_PRIMITIVE flag
tst r8, #HXF_PA_IN_PRIMITIVE
bicne r8, r8, #HXF_PA_IN_PRIMITIVE
blne |ctlEndPrimitiveList|
; Now, check to see if the clipped triangle buffer is full.
ldr r12, [r0, #HXFSTATE_OFFSET_NUMCLIPPEDVERTICES]
ldr r11, [r0, #HXFSTATE_OFFSET_NUMCLIPPEDPRIMITIVES]
rsbs r12, r12, #(HXF_CLIP_VTX_SPACE - HXF_CLIP_VERTEX_BUFFER_PAD)
rsbgts r11, r11, #(HXF_CLIP_PRIMITIVE_SPACE- HXF_CLIP_PRIMITIVE_BUFFER_PAD)
blle |HXFDrawClippedTriangles| ; Flush the clip buffers
; check to see if loop is complete!
ARM_ASSEMBLE_IDX_TRIFAN_LOOPBACK
; ---------------------------------------------------------------------- --
; Register Map -
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = pIndices r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = Stride r11 = r15 = pc
; ---------------------------------------------------------------------- --
ldr r12, [r0, #HXFSTATE_OFFSET_NUMPRIMITIVES] ; count target
ldr r7, [r0, #HXFSTATE_OFFSET_OUTVERTEXSIZE] ; stride
ldr r10, [r0, #HXFSTATE_OFFSET_POUTVERTICES] ; Point to output list
mov r5, r6 ; shift vertex down
add r9, r9, #2 ; increment to second index of next triangle
add r8, r8, #1 ; increment count of primitives
bic r1, r8, #0xFF<<24 ; clear out high bits!
cmp r1, r12 ; check to see if at end of loop
blt ARM_ASSEMBLE_IDX_TRIFAN_LOOP ; no?
ARM_ASSEMBLE_IDX_TRIFAN_DONE
; Exit the whole shebang....
; Preload as much as possible
ldr r5, [r0, #HXFSTATE_OFFSET_NUMCLIPPEDVERTICES]
; first, if HXF_PA_IN_PRIMITIVE, then call EndList
tst r8, #HXF_PA_IN_PRIMITIVE
blne |ctlEndPrimitiveList|
; Next, if any remaining clipped vertices, draw them
cmp r5, #0
blgt |HXFDrawClippedTriangles| ; Flush the clip buffers
ldmfd sp!, {r4-r11, pc}
ENDP
;** ************************************************************************ **
;** ************************************************************************ **
; Name: HXFAssembleTriList
; Description: This draws a Triangle List.
;
; Plan:
;
; Here's a layout of what we need to do:
; Init - parameters, variables, and conditional stuff
; Based on the cullFace direction and clip enabled, we'll branch to the appropriate
; parts of the iterative loop (CCW/CW, or no cull check, clip) to perform.
; Also, set up bDraw and HXF_PA_IN_PRIMITIVE bits in the count register
;
; Loop (CCW, CW, and no cull, and clipping)
; Submit a triangle for the cull & clip tests (if enabled)
; Note: Complex clipped triangles will be handled directly by the Clip test
; DRAW: If passes both test, flag bDraw = 1 and HXF_PA_IN_PRIMITIVE = 1
; But, if HXF_PA_IN_PRIMITIVE was 0, then call StartList
; NO_DRAW: If bDraw is 0 (failed cull or clip), bDraw = HXF_PA_IN_PRIMITIVE = 0
; if HXF_PA_IN_PRIMITIVE was 1, call EndList
; if HXF_PA_IN_PRIMITIVE was already 0, check to see if ClippedTriangleBuffer needs
; to be flushed.
; reset for next loop; flip cullFace direction, move pointers back
; End
; If HXF_PA_IN_PRIMITIVE, call Endlist.
; flush ClippedTriangleBuffer if needed
; Input Arguments: pState in r0
; Output Argument: none
; Prototype in C: void HXFAssembleTriList(HXFState* pState);
;** ************************************************************************ **
|HXFAssembleTriList| PROC
; ---------------------------------------------------------------------- --
; Register Map -
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = r13 = sp
; r2 = r6 = pVtxC r10 = pOutVtx r14 =
; r3 = r7 = r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Note: "temp" storage has no guarantees of data persistency in the function call
; convention. "persistent" storage refers to callee saved registers, which can
; be used by us freely.
; Basically, r1-r3, r7, r9, r10, and r12 are free for use.
; r11 is a special case, where we must do stuff with it
stmfd sp!, {r4-r11,lr} ; stack push!
ldr r12, [r0, #HXFSTATE_OFFSET_FLAGS]
mov r1, #0 ; use as loop counter up to NumPrimitives
; also, implicitly set bits 31-27 to 0 (bDraw, HXF_PA_IN_PRIMITIVE, bClipEnable,
; HXF_PA_CULLDIR_CW, HXF_PA_CULL_ENABLE
ands r12, r12, #HXF_CULL_MASK ; mask out all but the Culling bits
orrne r1, r1, #HXF_PA_CULL_ENABLE ; set to 1 if enabled
cmp r12, #HXF_CULL_CW ; if HXF_CULL_CW, then start with CW iteration
orreq r1, r1, #HXF_PA_CULLDIR_CW ; set to 1 for CW, 0 for CCW
ldr r4, [r0, #HXFSTATE_OFFSET_POUTVERTICES] ; Point to output list
ldr r7, [r0, #HXFSTATE_OFFSET_OUTVERTEXSIZE] ; stride
ldr r9, [r0, #HXFSTATE_OFFSET_POUTCLIPFLAGS] ; pointer to clip flag array
mov r8, r1 ; send in the count & flags..
str r8, [r0, #HXFSTATE_OFFSET_STORAGE_LR] ; Store Cull info for clipper
; Set up r4, r5, r6...
add r5, r4, r7
add r6, r5, r7
; The only difference between CULL_CW and CULL_CCW is the order of a couple
; of subtractions during vector calculation, so we went in and used
; some local bits to do it, rather than maintain 3 different loops.
; Furthermore, I will use a conditional to skip over culling entirely to keep
; just a single loop. If we need more performance due to branching penalties,
; then we can unroll things.
; In the C code, we store both the indices and the vertex pointer.
; In ARM, we'll only store the vertex pointer; we always know that
; we can find the index at count, count - 1, and count - 2 in the
; index list when we need them.
ARM_ASSEMBLE_TRILIST_LOOP
; ---------------------------------------------------------------------- --
; Register Map -
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = r5 = pVtxB r9 = ClipFlags r13 = sp
; r2 = r6 = pVtxC r10 = r14 =
; r3 = r7 = stride r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Prefetch
mov r1, #HXF_PA_VTX_PREFETCH_DISTANCE
mul r12, r7, r1
add r1, r6, r12
add r2, r1, r7
add r3, r2, r7
pld [r1] ; prefetch vertex a +2
pld [r2] ; prefetch vertex b +2
pld [r3] ; prefetch vertex c +2
ARM_ASSEMBLE_TRILIST_CLIP
; ---------------------------------------------------------------------- --
; Register Map - Clipping
; ---------------------------------------------------------------------- --
; r0 = pState r4 = pVtxA r8 = flags|cnt r12 =
; r1 = k1* r5 = pVtxB r9 = clipFlags r13 = sp
; r2 = k2* r6 = pVtxC r10 = r14 =
; r3 = k3* r7 = stride r11 = r15 = pc
; ---------------------------------------------------------------------- --
; Load Clip flags
add r1, r9, #0 ; Byte pointer to get the correct clip flag info
add r2, r1, #1 ; for k1, k2, k3
add r3, r2, #1
ldrb r7, [r1] ; k1
ldrb r11, [r2] ; k2
ldrb r12, [r3] ; k3
; logic:
; k1, k2, k3 are clip flags per vertex
; if ((k1&k2) && (k2&k3) && (k3&k1)) is nonzero: totally outside (all segments totally outside)
; if ((k1|k2|k3) == 0): no clipping necessary (totally in)
; else clip it.
; if all results are nonzero, then the trivial-out test passes...
; that is, if all of the points are outside of the clipping planes
; and none of the segments cross a clip plane (all within a clip region), then
; the triangle is trivially out.
; Otherwise, more testing is needed.
tst r7, r11 ; first check if(k1&k2)
tstne r11, r12 ; if (k1&k2) was 1, check (k2&k3)
tstne r7, r12 ; if (k2&k3) was 1 , check (k1&k3)
bne ARM_ASSEMBLE_TRILIST_NODRAW ; trivial rejection if 1
; so it wasn't entirely out, let's see if it's trivially in!
ARM_ASSEMBLE_TRILIST_TRIVIAL_CLIP
; if any result is 1, then the trivial boundary span test fails!
; we only care about z violations
orrs r10, r7, r11 ; if ORing is ever nonzero, then we aren't trivially in
orrs r10, r10, r12 ; if first OR was zero, must check second OR
; if no 1's, then just draw (trivially in)...
; but if 1's, then clip first.
beq ARM_ASSEMBLE_TRILIST_CULL
ARM_ASSEMBLE_TRILIST_CALLCLIP
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -