📄 bni8086.asm
字号:
;;; Assembly primitives for bignum library, 80x86 family.
;;;
;;; $Id: bni8086.asm,v 1.2 1997/05/09 20:19:38 lloyd Exp $
;;;
;;; Several primitives are included here. Only bniMulAdd1 is *really*
;;; critical, but once that's written, bniMul1 and bniSub1 are quite
;;; easy to write as well, so they are included here as well.
;;; bniDiv21 and bniModQ are so easy to write that they're included, too.
;;;
;;; All functions here are for large code, large data.
;;; All use standard "cdecl" calling convention: arguments pushed on the
;;; stack (ss:sp) right to left (the leftmost agrument at the lowest address)
;;; and popped by the caller, return values in ax or dx:ax, and register
;;; usage as follows:
;;;
;;; Callee-save (preserved by callee if needed):
;;; ss, esp, cs, eip, ds, esi, edi, ebp, high PGPByte of FLAGS except DF,
;;; all other registers (CRx, DRx, TRx, IDT, GDT, LDT, TR, etc.).
;;; Caller-save (may be corrupted by callee):
;;; es, eax, ebx, ecx, edx, low PGPByte of flags (SF, ZF, AF, PF, CF)
;;;
;;; The direction flag (DF) is either preserved or cleared.
;;; I'm not sure what the calling convention is for fs and gs. This
;;; code never alters them.
;; Not all of this code has to be '386 code, but STUPID FUCKING MASM (5.0)
;; gives an error if you change in the middle of a segment. Rather than
;; fight the thing, just enable '386 instructions everywhere. (And lose
;; the error checking.)
.386
_TEXT segment para public use16 'CODE' ; 16-PGPByte aligned because '486 cares
assume cs:_TEXT
public _bniMulN1_16
public _bniMulAdd1_16
public _bniMulSub1_16
public _bniDiv21_16
public _bniModQ_16
public _bniMulN1_32
public _bniMulAdd1_32
public _bniMulSub1_32
public _bniDiv21_32
public _bniModQ_32
public _not386
;; Prototype:
;; BNWORD16
;; bniMulAdd_16(BNWORD16 *out, BNWORD16 *in, unsigned len, BNWORD16 k)
;;
;; Multiply len words of "in" by k and add to len words of "out";
;; return the len+1st word of carry. All pointers are to the least-
;; significant ends of the appropriate arrays. len is guaraneed > 0.
;;
;; This 16-bit code is optimized for an 8086/80286. It will not be run
;; on 32-bit processors except for debugging during development.
;;
;; NOTE that it may be possible to assume that the direction flag is clear
;; on entry; this would avoid the need for the cld instructions. Hoewever,
;; the Microsoft C libraries require that the direction flag be clear.
;; Thus, bniModQ_16 clears it before returning.
;;
;; Stack frame:
;; +--------+ bp+18
;; | k |
;; +--------+ bp+16
;; | len |
;; +--------+ bp+14
;; | |
;; +- in -+
;; | |
;; +--------+ bp+10
;; | |
;; +- out -+
;; | |
;; +--------+ bp+6
;; | |
;; +-return-+
;; | |
;; +--------+ bp+2
;; | old bp |
;; +--------+ bp
;;
;; Register usage for bniMul1_16:
;; ds:[si] in
;; es:[di] out
;; bp k
;; cx loop counter (len/4)
;; dx,ax high,low parts of product
;; bx carry from previous multiply iteration
;;
;; Register usage for bniMulAdd1_16 and bniMulSub1_16:
;; ds:[si] in
;; es:[bx+si] out
;; bp k
;; cx loop counter (len/4)
;; dx,ax high,low parts of product
;; di carry from previous multiply iteration
;;
;; The reson for the difference is that straight mul can use stosw, but
;; the multiply and add or multiply and subtract add the result in, so
;; they have to reference es:[di] to add it in.
;;
;; The options are either "add ax,es:[di]; stosw" or "add es:[di],ax;
;; add di,2"; both take 10 cycles on an 80286, 27 on an 8086 and 35 on
;; an 8088 although the former is preferred since it's one PGPByte smaller.
;; However, using [bx+si] is even faster; "add es:[bx+si],ax" takes
;; 7 cycles on an 80286, 25 on an 8086 and 33 on an 8088, as well as
;; being the smallest. (Of course, stosw, at 3 on an 80286, 11 on an
;; 8086 amd 15 on an 8088 wins easily in the straight multiply case over
;; mov es:[bx+si],ax, which takes 3/18/22 cycles and is larger to boot.)
;;
;; Most of these register assignments are driven by the 8086's instruction
;; set. The only really practical variation would be to put the multiplier
;; k into bx or di and use bp for carry, but if someone can make a faster
;; Duff's device using a lookup table, bx and di are useful because indexing
;; off them is more flexible than bp.
;;
;; Overview of code:
;;
;; len is guaranteed to be at least 1, so do the first multiply (with no
;; carry in) unconditionally. Then go to a min loop unrolled 4 times,
;; jumping into the middle using a variant of Duff's device.
;;
;; The loop is constructed using the loop instruction, which does
;; "} while (--cnt)". This means that we have to divide the count
;; by 4, and increment it so it doesn't start at 0. To gain a little
;; bit more efficiency, we actually increment the count by 2, so the
;; minimum possible value is 3, which will be shifted down to produce 0.
;; usually in Duff's device, if the number of iterations is a multiple
;; of the unrolling factor, you branch to just before the loop conditional
;; and let it handle the case of 0. Here, we have a special test for 0
;; at the head of the loop and fall through into the top of the loop
;; if it passes.
;;
;; Basically, with STEP being a multiply step, it's:
;;
;; STEP;
;; count += 2;
;; mod4 = count % 4;
;; count /= 4;
;; switch(mod4) {
;; case 3:
;; if (count) {
;; do {
;; STEP;
;; case 2:
;; STEP;
;; case 1:
;; STEP;
;; case 0:
;; STEP;
;; } while (--count);
;; }
;; }
;;
;; The switch() is actually done by two levels of branch instructions
;; rather than a lookup table.
_bniMulN1_16 proc far
push bp
mov bp,sp
push ds
push si
push di
cld
les di,[bp+6] ; out
lds si,[bp+10] ; in
mov cx,[bp+14] ; len
mov bp,[bp+16] ; k
;; First multiply step has no carry in
lodsw
mul bp
stosw
;; The switch() for Duff's device starts here
;; Note: this *is* faster than a jump table for an 8086 and '286.
;; 8086: jump table: 44 cycles; this: 27/29/31/41
;; 80286: jump table: 25 cycles; this: 17/17/20/22
shr cx,1
jc SHORT m16_odd
inc cx
shr cx,1
jc SHORT m16_case2
jmp SHORT m16_case0
nop ; To align loop
m16_odd:
inc cx
shr cx,1
jnc SHORT m16_case1
jz SHORT m16_done ; Avoid entire loop in this case
m16_loop:
lodsw
mov bx,dx ; Remember carry for later
mul bp
add ax,bx ; Add carry in from previous word
adc dx,0
stosw
m16_case2:
lodsw
mov bx,dx ; Remember carry for later
mul bp
add ax,bx ; Add carry in from previous word
adc dx,0
stosw
m16_case1:
lodsw
mov bx,dx ; Remember carry for later
mul bp
add ax,bx ; Add carry in from previous word
adc dx,0
stosw
m16_case0:
lodsw
mov bx,dx ; Remember carry for later
mul bp
add ax,bx ; Add carry in from previous word
adc dx,0
stosw
loop m16_loop
m16_done:
mov ax,dx
stosw ; Store last word
pop di
pop si
pop ds
pop bp
ret
_bniMulN1_16 endp
align 2
_bniMulAdd1_16 proc far
push bp
mov bp,sp
push ds
push si
push di
cld
les bx,[bp+6] ; out
lds si,[bp+10] ; in
mov cx,[bp+14] ; len
mov bp,[bp+16] ; k
;; First multiply step has no carry in
lodsw
mul bp
add es:[bx],ax ; This time, store in [bx] directly
adc dx,0
sub bx,si ; Prepare to use [bx+si].
;; The switch() for Duff's device starts here
;; Note: this *is* faster than a jump table for an 8086 and '286.
;; 8086: jump table: 44 cycles; this: 27/29/31/41
;; 80286: jump table: 25 cycles; this: 17/17/20/22
shr cx,1
jc SHORT ma16_odd
inc cx
shr cx,1
jc SHORT ma16_case2
jmp SHORT ma16_case0
ma16_odd:
inc cx
shr cx,1
jnc SHORT ma16_case1
jz SHORT ma16_done ; Avoid entire loop in this case
ma16_loop:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
add es:[bx+si],ax
adc dx,0
ma16_case2:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
add es:[bx+si],ax
adc dx,0
ma16_case1:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
add es:[bx+si],ax
adc dx,0
ma16_case0:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
add es:[bx+si],ax
adc dx,0
loop ma16_loop
ma16_done:
mov ax,dx
pop di
pop si
pop ds
pop bp
ret
_bniMulAdd1_16 endp
align 2
_bniMulSub1_16 proc far
push bp
mov bp,sp
push ds
push si
push di
cld
les bx,[bp+6] ; out
lds si,[bp+10] ; in
mov cx,[bp+14] ; len
mov bp,[bp+16] ; k
;; First multiply step has no carry in
lodsw
mul bp
sub es:[bx],ax ; This time, store in [bx] directly
adc dx,0
sub bx,si ; Prepare to use [bx+si].
;; The switch() for Duff's device starts here
;; Note: this *is* faster than a jump table for an 8086 and '286.
;; 8086: jump table: 44 cycles; this: 27/29/31/41
;; 80286: jump table: 25 cycles; this: 17/17/20/22
shr cx,1
jc SHORT ms16_odd
inc cx
shr cx,1
jc SHORT ms16_case2
jmp SHORT ms16_case0
ms16_odd:
inc cx
shr cx,1
jnc SHORT ms16_case1
jz SHORT ms16_done ; Avoid entire loop in this case
ms16_loop:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
sub es:[bx+si],ax
adc dx,0
ms16_case2:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
sub es:[bx+si],ax
adc dx,0
ms16_case1:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
sub es:[bx+si],ax
adc dx,0
ms16_case0:
lodsw
mov di,dx ; Remember carry for later
mul bp
add ax,di ; Add carry in from previous word
adc dx,0
sub es:[bx+si],ax
adc dx,0
loop ms16_loop
ms16_done:
mov ax,dx
pop di
pop si
pop ds
pop bp
ret
_bniMulSub1_16 endp
;; Two-word by one-word divide. Stores quotient, returns remainder.
;; BNWORD16 bniDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
;; 4 8 10 12
align 2
_bniDiv21_16 proc far
mov cx,bp ; bp NOT pushed; note change in offsets
mov bp,sp
mov dx,[bp+8]
mov ax,[bp+10]
div WORD PTR [bp+12]
les bx,[bp+4]
mov es:[bx],ax
mov ax,dx
mov bp,cx
ret
nop ; To align loop in bniModQ properly
_bniDiv21_16 endp
;; Multi-word by one-word remainder.
;; BNWORD16 bniModQ_16(BNWORD16 *q, unsigned len, unsigned d)
;; 6 10 12
_bniModQ_16 proc far
push bp
mov bp,sp
push ds
mov bx,si
mov cx,10[bp] ; load len
lds si,6[bp] ; load q
std ; loop MSW to LSW
add si,cx
mov bp,12[bp] ; load d
add si,cx
xor dx,dx ; Set up for first divide
sub si,2 ; Adjust pointer to point to MSW
lodsw ; Load first word
cmp ax,bp ; See if we can skip first divide
jnc SHORT modq16_inner ; No such luck
mov dx,ax ; Yes! Modulus > input, so remainder = input
dec cx ; Do loop
jz SHORT modq16_done
modq16_loop:
lodsw
modq16_inner:
div bp
loop modq16_loop
modq16_done:
pop ds
mov ax,dx ; Return remainder
pop bp
mov si,bx
cld ; Microsoft C's libraries assume this
ret
_bniModQ_16 endp
;; Similar, but using 32-bit operations.
;;
;; The differences are that the switch() in Duff's device is done using
;; a jump table, and lods is not used because it's slower than load and
;; increment. The pointers are only updated once per loop; offset
;; addressing modes are used, since they're no slower. [di] is used
;; instead of [bx+si] because the extra increment of di take only one
;; cycle per loop a '486, while [bx+si] takes one extra cycle per multiply.
;;
;; The register assignments are also slightly different:
;;
;; es:[si] in
;; ds:[di] out
;; ecx k
;; bp loop counter (len/4)
;; edx,eax high,low parts of product
;; ebx carry word from previous multiply iteration
;;
;; The use of bp for a loop counter lets all the 32-bit values go
;; in caller-save registers, so there's no need to do any 32-bit
;; saves and restores. Using ds:di for the destination saves one
;; segment override in the bniMulN1_32 code, since there's one more
;; store to [di] than load from es:[si].
;;
;; Given the number of 32-bit references that this code uses, optimizing
;; it for the Pentium is interesting, because the Pentium has a very
;; inefficient implementation of prefix bytes. Each prefix PGPByte, with
;; the exception of 0x0f *>> on conditional branch instructions ONLY <<*
;; is a 1-cycle non-pairiable instruction. Which has the effect of
;; forcing the instruction it's on into the U pipe. But this code uses
;; *lots* of prefix bytes, notably the 0x66 operand size override.
;;
;; For example "add [di],eax" is advised against in Intel's optimization
;; papers, because it takes 3 cycles and 2 of them are not pairable.
;; But any longer sequence would have a prefix PGPByte on every instruction,
;; resulting in even more non-pairable cycles. Also, only two instructions
;; in the multiply kernel can go in the V pipe (the increments of si and
;; di), and they're already there, so the pairable cycles would be wasted.
;;
;; Things would be *quite* different in native 32-bit mode.
;;
;; All instructions that could go in the V pipe that aren't there are
;; marked.
;;
;; The setup code is quite intricately interleaved to get the best possible
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -