📄 bni8086.asm
字号:
;; performance out of a Pentium. If you want to follow the code,
;; pretend that the sections actually come in the following order:
;; 1) prologue (push registers)
;; 2) load (fetch arguments)
;; 3) first multiply
;; 4) loop unrolling
;;
;; The loop unrolling setup consists of taking the count, adjusting
;; it to account for the first multiply, and splitting it into
;; two parts: the high bits are a loop count, while the low bits are
;; used to find the right entry in the Duff's device jump table and
;; to adjust the initial data pointers.
;;
;; Known slack: There is one instruction in the prologue and one in
;; the epilogue that could go in the V pipe if I could find a U-pipe
;; instruction to pair them with, but all the U-pipe instructions
;; are already paired, so it looks difficult.
;;
;; There is a cycle of Address Generation Interlock in the bniMulN1_32
;; code on the Pentium (not on a '486). I can't figure out how to
;; get rid of it without wasting time elsewhere. The problem is that
;; the load of bx needs to be done as soon as possible to let it
;; be set up in time for the switch(). The other problem is the
;; epilogue code which can waste time if the order of the pushed
;; registers is diddled with so that ds doesn't come between si and di.
;;
;; The increment of si after the last load is redundant, and the
;; copy of the high word of the product to the carry after the last
;; multiply is likewise unnecessary.
;;
;; In these cases, the operations were done that way in order to remove
;; cycles from the loop on the '486 and/or Pentium, even though it costs
;; a few overhead cycles on a '386.
;; The increment fo si has to be done early because a load based on si
;; is the first thing in any given multiply step, and the address
;; generation interlock on the '486 and Pentium requires that a full
;; cycle (i.e. possibly two instructions on a Pentium) pass between
;; incrementing a register and using it in an address.
;; This saves one cycle per multiply on a '486 and Pentium, and costs
;; 2 cycles per call to the function on a '386 and 1 cycle on a '486.
;;
;; The carry word is copied where it is so that the decrement of the loop
;; counter happens in the V pipe. The instruction between the decrement
;; of the loop counter and the branch should be a U-pipe instruction that
;; doesn't affect the flags. Thus, the "mov" was rotated down from
;; the top of the loop to fill the slot.
;; This is a bit more marginal: it saves one cycle per loop iteration on
;; a Pentium, and costs 2 cycles per call on a '386, '486 or Pentium.
;;
;; The same logic applies to the copy of the carry and increment of si
;; before the test, in case 0, for skipping the loop entirely.
;; It makes no difference in speed if the loop is executed, but
;; incrementing si before saves an address generation interlock cycle
;; On a '486 and Pentium in the case that the loop is executed.
;; And the loop is executed more often than not.
;;
;; Given that just one multiply on a '386 takes 12 to 41 cycles (with the
;; average being very much at the high end of that) 4 cycles of additional
;; overhead per call is not a big deal.
;;
;; On a Pentium, it would actually be easier to *not* unroll the loop
;; at all, since the decrement and compare are completely hidden
;; in the V-pipe and it wouldn't cost anything to do them more often.
;; That would save the setup for the unrolling and Duff's device at the
;; beginning. But the overhead for that is pretty minor: ignoring what's
;; hidden in the V pipe, it's two cycles plus the indirect jump.
;; Not too much, and special-casing the pentium is quite a hassle.
;; (For starters, you have to detect it, and since you're probably in
;; V86 mode, without access to the EFLAGS register to test the CPUID bit.)
align 16
_bniMulN1_32 proc far
push bp ; U prologue ** Could be V
mov bp,sp ; V prologue
push si ; U prologue ** Could be V
mov bx,[bp+14] ; U load len ** Could be V (AGI!)r
push ds ; NP prologue
les si,[bp+10] ; NP load in
mov ecx,[bp+16] ; U load k
dec bx ; V loop unrolling
shl bx,2 ; U loop unrolling
push di ; V prologue
lds di,[bp+6] ; NP load out
mov bp,bx ; U loop unrolling ** Could be V
and bx,12 ; V loop unrolling
;; First multiply step has no carry in.
mov eax,es:[si] ; U first multiply
add si,bx ; V loop unrolling
mul ecx ; NP first multiply
mov [di],eax ; U first multiply
add di,bx ; V loop unrolling
;; The switch() for Duff's device. This jump table is (slightly!) faster
;; than a bunch of branches on a '386 and '486, and is probably better yet
;; on higher processors.
jmp WORD PTR cs:m32_jumptable[bx] ; NP loop unrolling
align 2
m32_jumptable:
dw OFFSET m32_case0, 0
dw OFFSET m32_case1, 0
dw OFFSET m32_case2, 0
dw OFFSET m32_case3, 0, 0, 0, 0 ; Get loop aligned properly
m32_case0:
add si,16 ; U Fix up si ** Could be V
test bp,bp ; V
mov ebx,edx ; U Remember carry for later
jbe SHORT m32_done ; V Avoid entire loop if loop count is 0
m32_loop:
mov eax,es:[si-12] ; U
add di, 16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [di-12],eax ; U
m32_case3:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-8] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [di-8],eax ; U
m32_case2:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-4] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [di-4],eax ; U
m32_case1:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add si,16 ; V
mov [di],eax ; U
sub bp,16 ; V
mov ebx,edx ; U Remember carry for later
ja m32_loop ; V
m32_done:
mov [di+4],edx ; U
pop di ; V
pop ds ; NP
pop si ; U ** Could be V
pop bp ; V
ret ; NP
_bniMulN1_32 endp
align 16
_bniMulAdd1_32 proc far
push bp ; U prologue ** Could be V
mov bp,sp ; V prologue
push ds ; NP prologue
mov ecx,[bp+16] ; U load k
mov bx,[bp+14] ; V load len
push di ; U prologue ** Could be V
dec bx ; V loop unrolling
lds di,[bp+6] ; NP load out
shl bx,2 ; U loop unrolling
push si ; V prologue
les si,[bp+10] ; NP load in
mov bp,bx ; U loop unrolling ** Could be V
and bx,12 ; V loop unrolling
;; First multiply step has no carry in.
mov eax,es:[si] ; U first multiply
add si,bx ; V loop unrolling
mul ecx ; NP first multiply
add [di],eax ; U first multiply
adc edx,0 ; U first multiply
add di,bx ; V loop unrolling
;; The switch() for Duff's device. This jump table is (slightly!) faster
;; than a bunch of branches on a '386 and '486, and is probably better yet
;; on higher processors.
jmp WORD PTR cs:ma32_jumptable[bx] ; NP loop unrolling
align 2
ma32_jumptable:
dw OFFSET ma32_case0, 0
dw OFFSET ma32_case1, 0
dw OFFSET ma32_case2, 0
dw OFFSET ma32_case3, 0, 0 ; To get loop aligned properly
ma32_case0:
add si,16 ; U Fix up si ** Could be V
test bp,bp ; V
mov ebx,edx ; U Remember carry for later
jbe SHORT ma32_done ; V Avoid entire loop if loop count is 0
ma32_loop:
mov eax,es:[si-12] ; U
add di, 16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add [di-12],eax ; U
adc edx,0 ; U
ma32_case3:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-8] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add [di-8],eax ; U
adc edx,0 ; U
ma32_case2:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-4] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add [di-4],eax ; U
adc edx,0 ; U
ma32_case1:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add si,16 ; V
add [di],eax ; U
adc edx,0 ; U
sub bp,16 ; V
mov ebx,edx ; U Remember carry for later
ja ma32_loop ; V
ma32_done:
pop si ; U ** Could be V
pop di ; V
mov ax,dx ; U return value low ** Could be V
pop ds ; NP
shr edx,16 ; U return value high
pop bp ; V
ret ; NP
_bniMulAdd1_32 endp
align 16
_bniMulSub1_32 proc far
push bp ; U prologue ** Could be V
mov bp,sp ; V prologue
push ds ; NP prologue
mov ecx,[bp+16] ; U load k
mov bx,[bp+14] ; V load len
push di ; U prologue ** Could be V
dec bx ; V loop unrolling
lds di,[bp+6] ; NP load out
shl bx,2 ; U loop unrolling
push si ; V prologue
les si,[bp+10] ; NP load in
mov bp,bx ; U loop unrolling ** Could be V
and bx,12 ; V loop unrolling
;; First multiply step has no carry in.
mov eax,es:[si] ; U first multiply
add si,bx ; V loop unrolling
mul ecx ; NP first multiply
sub [di],eax ; U first multiply
adc edx,0 ; U first multiply
add di,bx ; V loop unrolling
;; The switch() for Duff's device. This jump table is (slightly!) faster
;; than a bunch of branches on a '386 and '486, and is probably better yet
;; on higher processors.
jmp WORD PTR cs:ms32_jumptable[bx] ; NP loop unrolling
align 2
ms32_jumptable:
dw OFFSET ms32_case0, 0
dw OFFSET ms32_case1, 0
dw OFFSET ms32_case2, 0
dw OFFSET ms32_case3, 0, 0 ; To get loop aligned properly
ms32_case0:
add si,16 ; U Fix up si ** Could be V
test bp,bp ; V
mov ebx,edx ; U Remember carry for later
jbe SHORT ms32_done ; V Avoid entire loop if loop count is 0
ms32_loop:
mov eax,es:[si-12] ; U
add di, 16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
sub [di-12],eax ; U
adc edx,0 ; U
ms32_case3:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-8] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
sub [di-8],eax ; U
adc edx,0 ; U
ms32_case2:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si-4] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
sub [di-4],eax ; U
adc edx,0 ; U
ms32_case1:
mov ebx,edx ; U Remember carry for later
mov eax,es:[si] ; U
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
add si,16 ; V
sub [di],eax ; U
adc edx,0 ; U
sub bp,16 ; V
mov ebx,edx ; U Remember carry for later
ja ms32_loop ; V
ms32_done:
pop si ; U ** Could be V
pop di ; V
mov ax,dx ; U return value low ** Could be V
pop ds ; NP
shr edx,16 ; U return value high
pop bp ; V
ret ; NP
_bniMulSub1_32 endp
;; Just for interest's sake, here's a completely Pentium-optimized version.
;; In addition to being smaller, it takes 8 + (8+mul_time)*n cycles, as
;; compared to the 10 + jmp_time + (8+mul_time)*n cycles for the loop above.
;; (I don't know how long a 32x32->64 bit multiply or an indirect jump
;; take on a Pentium, so plug those numbers in.)
; align 2
; nop ; To align loop nicely
;P_bniMulAdd1_32 proc far
;
; push bp ; U prologue ** Could be V
; mov bp,sp ; V prologue
; push ds ; NP prologue
; mov ecx,[bp+16] ; U load k
; push si ; V prologue
; lds si,[bp+10] ; NP load in
; mov eax,[si] ; U first multiply
; push di ; V prologue
; mul ecx ; NP first multiply
; les di,[bp+6] ; NP load out
; add es:[di],eax ; U first multiply
; mov bp,[bp+14] ; V load len
; adc edx,0 ; U first multiply
; dec bp ; V
; mov ebx,edx ; U Remember carry for later
; je Pma32_done ; V
;Pma32_loop:
; mov eax,[si+4] ; U
; add di,4 ; V
; mul ecx ; NP
; add eax,ebx ; U Add carry in from previous word
; adc edx,0 ; U
; add si,4 ; V
; add es:[di],eax ; U
; adc edx,0 ; U
; dec bp ; V
; mov ebx,edx ; U Remember carry for later
; jne Pma32_loop ; V
;Pma32_done:
; pop di ; U ** Could be V
; pop si ; V
; pop ds ; NP
; mov ax,dx ; U return value low ** Could be V
; pop bp ; V
; shr edx,16 ; U return value high
; ret ; NP
;
;P_bniMulAdd1_32 endp
;; Two-word by one-word divide. Stores quotient, returns remainder.
;; BNWORD32 bniDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;; 4 8 12 16
align 16
_bniDiv21_32 proc far
mov cx,bp ; U bp NOT pushed; offsets differ
mov bp,sp ; V
; AGI
mov edx,[bp+8] ; U
mov eax,[bp+12] ; U
div DWORD PTR [bp+16] ; NP
les bx,[bp+4] ; NP
mov es:[bx],eax ; U
mov ax,dx ; V
shr edx,16 ; U
mov bp,cx ; V
ret ; NP
nop
nop
nop
nop ; Get bniModQ_32 aligned properly
_bniDiv21_32 endp
;; Multi-word by one-word remainder.
;; This speeds up key generation. It's not worth unrolling and so on;
;; using 32-bit divides is enough of a speedup.
;;
;; bp is used as a counter so that all the 32-bit values can be in
;; caller-save registers (eax, ecx, edx). bx is needed as a pointer.
;;
;; The modulus (in ebp) is 16 bits. Given that the dividend is 32 bits,
;; the chances of saving the first divide because the high word of the
;; dividend is less than the modulus are low enough it's not worth taking
;; the cycles to test for it.
;;
;; unsigned bniModQ_32(BNWORD16 *q, unsigned len, unsigned d)
;; 6 10 12
_bniModQ_32 proc far
xor ecx,ecx ; U Clear ecx (really, the high half)
push bp ; V
mov edx,ecx ; U Clear high word for first divide
mov bp,sp ; V
push ds ; NP
lds ax,[bp+6] ; NP Load dividend pointer
mov bx,[bp+10] ; U Load count ** Could be V
sub ax,4 ; V Offset dividend pointer
mov cx,[bp+12] ; U Load modulus ** Could be V
mov bp,bx ; V Copy count
shl bx,2 ; U Shift index
add bx,ax ; U Add base ** Could be V
; lea bx,[eax+ebp*4-4]; U Move pointer to high word
modq32_loop:
mov eax,[bx] ; U
sub bx,4 ; V
div ecx ; NP
dec bp ; U ** Could be V
jnz modq32_loop ; V
modq32_done:
pop ds ; NP
mov ax,dx ; U ** Could be V
pop bp ; V
ret ; NP
_bniModQ_32 endp
;; int not386(void) returns 0 on a 32-bit (386 or better) processor;
;; non-zero if an 80286 or lower. The Z flag is set to reflect
;; ax on return. This is only called once, so it doesn't matter how
;; it's aligned.
_not386 proc far
;;
;; This first test detects 80x86 for x < 2. On the 8086 and '186,
;; "push sp" does "--sp; sp[0] = sp". On all later processors, it does
;; "sp[-1] = sp; --sp".
;;
push sp
pop ax
sub ax,sp
jne SHORT return
;; This test is the key one. It will probably detect 8086, V30 and 80186
;; as well as 80286, but I haven't had access to test it on any of those,
;; so it's protected by the well-known test above. It has been tested
;; on the 80286, 80386, 80486, Pentium and AMD tested it on their K5.
;; I have not been able to confirm effectiveness on the P6 yet, although
;; someone I spoke to at Intel said it should work.
;;
;; This test uses the fact that the '386 and above have a barrel shifter
;; to do shifts, while the '286 does left shifts by releated adds.
;; That means that on the '286, the auxilliary carry gets a copy of
;; bit 4 of the shift output, while on the '386 and up, it's trashed
;; (as it happens, set to 1) independent of the result. (It's documented
;; as undefined.)
;;
;; We do two shifts, which should produce different auxilliary carries
;; on a '286 and XOR them to see if they are different. Even on a
;; future processor that does something different with the aux carry
;; flag, it probably does something data-independent, so this will still
;; work. Note that all flags except aux carry are defined for shl
;; output and will be the same for both cases.
mov al,4
shl al,1 ; Expected to produce ac = 0 on a '286
lahf
shl al,1 ; Expected to produce ac = 1 on a '286
mov al,ah
lahf
xor al,ah ; Xor the flags together to detect the difference
mov ah,al ; Clear ah if al is clear, leave Z flag alone
return:
ret
_not386 endp
_TEXT ends
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -