📄 lbn80386.asm
字号:
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
;;;
;;; Copyright (c) 1995, Colin Plumb.
;;; For licensing and other legal details, see the file legal.c.
;;;
;;; Several primitives are included here.  Only lbnMulAdd1 is *really*
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
;;; easy to write as well, so they are included here as well.
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
;;;
;;; All functions here are for 32-bit flat mode.  I.e. near code and
;;; near data, although the near offsets are 32 bits.
;;;
;;; Syntax: MASM/TASM.  Calling convention: cdecl — arguments on the
;;; stack, caller cleans up, EBX/ESI/EDI/EBP are callee-saved.

.386

;_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
;_TEXT ends

;; Define the FLAT group ourselves only where the assembler does not
;; predefine it (MASM <= 5.10, or no @Version symbol at all).
ifdef @Version
if @Version le 510
FLAT group _TEXT
endif
else
FLAT group _TEXT
endif
	assume cs:FLAT, ds:FLAT, ss:FLAT

_TEXT segment para public use32 'CODE'	; 16-byte aligned because 486 cares

	public _lbnMulN1_32
	public _lbnMulAdd1_32
	public _lbnMulSub1_32
	public _lbnDiv21_32
	public _lbnModQ_32

;; Register usage in the multiply primitives:
;; eax - low half of product
;; ebx - carry to next iteration
;; ecx - multiplier (k)
;; edx - high half of product
;; esi - source pointer
;; edi - dest pointer
;; ebp - loop counter
;;
;; The "U"/"V"/"NP" trailing comments record the intended Pentium pipe
;; pairing of each instruction (U pipe, V pipe, or non-pairable).
;;
;; NOTE(review): the original code used EBX as scratch without saving it,
;; although EBX is callee-saved under cdecl.  Every procedure below now
;; preserves EBX, and the [esp+N] argument offsets account for the extra
;; push.
;;
;; Stack frame for _lbnMulN1_32 after all registers are saved
;; (push order: ebx, esi, ebp, edi):
;;
;; +--------+
;; |   k    | [esp+32]
;; +--------+
;; |  len   | [esp+28]
;; +--------+
;; |   in   | [esp+24]
;; +--------+
;; |  out   | [esp+20]
;; +--------+
;; | return | [esp+16]
;; +--------+
;; |  ebx   | [esp+12]
;; +--------+
;; |  esi   | [esp+8]
;; +--------+
;; |  ebp   | [esp+4]
;; +--------+
;; |  edi   | [esp]
;; +--------+

;; void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
;;
;; out[0..len] = in[0..len-1] * k.  (out receives len+1 words; the final
;; word is the last carry.)  The loop is unrolled 4x; the computed jump
;; enters the unrolled loop at the right phase for (len-1) mod 4.
	align 16
_lbnMulN1_32 proc near

	push ebx		; preserve callee-saved scratch register
	push esi		; U
	mov esi,[esp+16]	; V load in
	push ebp		; U
	mov ebp,[esp+24]	; V load len
	mov ecx,[esp+28]	; U load k
	push edi		; V
	mov edi,[esp+20]	; U load out

;; First multiply step has no carry in.
	mov eax,[esi]		; V
	lea ebx,[ebp*4-4]	; U loop unrolling
	mul ecx			; NP first multiply
	mov [edi],eax		; U
	and ebx,12		; V loop unrolling: ebx = ((len-1) mod 4) * 4
	add esi,ebx		; U loop unrolling
	add edi,ebx		; V loop unrolling
	jmp DWORD PTR m32_jumptable[ebx] ; NP loop unrolling

	align 4
m32_jumptable:
	dd m32_case0
	dd m32_case1
	dd m32_case2
	dd m32_case3

	nop
	align 8
	nop
	nop
	nop			; Get loop nicely aligned

m32_case0:
	sub ebp,4		; U
	jbe SHORT m32_done	; V
m32_loop:
	mov eax,[esi+4]		; U
	mov ebx,edx		; V Remember carry for later
	add esi,16		; U
	add edi,16		; V
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	adc edx,0		; U
	mov [edi-12],eax	; V
m32_case3:
	mov eax,[esi-8]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	adc edx,0		; U
	mov [edi-8],eax		; V
m32_case2:
	mov eax,[esi-4]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	adc edx,0		; U
	mov [edi-4],eax		; V
m32_case1:
	mov eax,[esi]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	adc edx,0		; U
	mov [edi],eax		; V

	sub ebp,4		; U
	ja SHORT m32_loop	; V
m32_done:
	mov [edi+4],edx		; U store final carry as the len+1st word
	pop edi			; V
	pop ebp			; U
	pop esi			; V
	pop ebx
	ret			; NP
_lbnMulN1_32 endp

;; BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
;;
;; out[0..len-1] += in[0..len-1] * k; returns the final carry word.
;; Same 4x-unrolled structure as lbnMulN1_32, with an extra add of the
;; existing destination word each step.
	align 16
_lbnMulAdd1_32 proc near

	push ebx		; preserve callee-saved scratch register
	push esi		; U
	mov esi,[esp+16]	; V load in
	push edi		; U
	mov edi,[esp+16]	; V load out
	push ebp		; U
	mov ebp,[esp+28]	; V load len
	mov ecx,[esp+32]	; U load k

;; First multiply step has no carry in.
	mov eax,[esi]		; V
	mov ebx,[edi]		; U
	mul ecx			; NP first multiply
	add ebx,eax		; U
	lea eax,[ebp*4-4]	; V loop unrolling
	adc edx,0		; U
	and eax,12		; V loop unrolling: eax = ((len-1) mod 4) * 4
	mov [edi],ebx		; U
	add esi,eax		; V loop unrolling
	add edi,eax		; U loop unrolling
	jmp DWORD PTR ma32_jumptable[eax] ; NP loop unrolling

	align 4
ma32_jumptable:
	dd ma32_case0
	dd ma32_case1
	dd ma32_case2
	dd ma32_case3

	nop
	align 8
	nop
	nop
	nop			; To align loop properly

ma32_case0:
	sub ebp,4		; U
	jbe SHORT ma32_done	; V
ma32_loop:
	mov eax,[esi+4]		; U
	mov ebx,edx		; V Remember carry for later
	add esi,16		; U
	add edi,16		; V
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-12]	; V
	adc edx,0		; U
	add ebx,eax		; V
	adc edx,0		; U
	mov [edi-12],ebx	; V
ma32_case3:
	mov eax,[esi-8]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-8]		; V
	adc edx,0		; U
	add ebx,eax		; V
	adc edx,0		; U
	mov [edi-8],ebx		; V
ma32_case2:
	mov eax,[esi-4]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-4]		; V
	adc edx,0		; U
	add ebx,eax		; V
	adc edx,0		; U
	mov [edi-4],ebx		; V
ma32_case1:
	mov eax,[esi]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi]		; V
	adc edx,0		; U
	add ebx,eax		; V
	adc edx,0		; U
	mov [edi],ebx		; V

	sub ebp,4		; U
	ja SHORT ma32_loop	; V
ma32_done:
	pop ebp			; U
	mov eax,edx		; V return the final carry word
	pop edi			; U
	pop esi			; V
	pop ebx
	ret			; NP
_lbnMulAdd1_32 endp

;; BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
;;
;; out[0..len-1] -= in[0..len-1] * k; returns the final borrow word.
;; Identical structure to lbnMulAdd1_32 with the destination add turned
;; into a subtract (the adc after each sub folds the borrow into edx).
	align 16
_lbnMulSub1_32 proc near

	push ebx		; preserve callee-saved scratch register
	push esi		; U
	mov esi,[esp+16]	; V load in
	push edi		; U
	mov edi,[esp+16]	; V load out
	push ebp		; U
	mov ebp,[esp+28]	; V load len
	mov ecx,[esp+32]	; U load k

;; First multiply step has no carry in.
	mov eax,[esi]		; V
	mov ebx,[edi]		; U
	mul ecx			; NP first multiply
	sub ebx,eax		; U
	lea eax,[ebp*4-4]	; V loop unrolling
	adc edx,0		; U
	and eax,12		; V loop unrolling: eax = ((len-1) mod 4) * 4
	mov [edi],ebx		; U
	add esi,eax		; V loop unrolling
	add edi,eax		; U loop unrolling
	jmp DWORD PTR ms32_jumptable[eax] ; NP loop unrolling

	align 4
ms32_jumptable:
	dd ms32_case0
	dd ms32_case1
	dd ms32_case2
	dd ms32_case3

	nop
	align 8
	nop
	nop
	nop			; Get loop nicely aligned

ms32_case0:
	sub ebp,4		; U
	jbe SHORT ms32_done	; V
ms32_loop:
	mov eax,[esi+4]		; U
	mov ebx,edx		; V Remember carry for later
	add esi,16		; U
	add edi,16		; V
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-12]	; V
	adc edx,0		; U
	sub ebx,eax		; V
	adc edx,0		; U
	mov [edi-12],ebx	; V
ms32_case3:
	mov eax,[esi-8]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-8]		; V
	adc edx,0		; U
	sub ebx,eax		; V
	adc edx,0		; U
	mov [edi-8],ebx		; V
ms32_case2:
	mov eax,[esi-4]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi-4]		; V
	adc edx,0		; U
	sub ebx,eax		; V
	adc edx,0		; U
	mov [edi-4],ebx		; V
ms32_case1:
	mov eax,[esi]		; U
	mov ebx,edx		; V Remember carry for later
	mul ecx			; NP
	add eax,ebx		; U Add carry in from previous word
	mov ebx,[edi]		; V
	adc edx,0		; U
	sub ebx,eax		; V
	adc edx,0		; U
	mov [edi],ebx		; V

	sub ebp,4		; U
	ja SHORT ms32_loop	; V
ms32_done:
	pop ebp			; U
	mov eax,edx		; V return the final borrow word
	pop edi			; U
	pop esi			; V
	pop ebx
	ret			; NP
_lbnMulSub1_32 endp

;; Two-word by one-word divide.  Stores quotient, returns remainder.
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;;                               4            8           12          16
;;
;; div computes edx:eax / d -> quotient in eax, remainder in edx.
;; Caller must guarantee nh < d or the divide faults (#DE on overflow).
	align 4
_lbnDiv21_32 proc near
	mov edx,[esp+8]		; U Load nh
	mov eax,[esp+12]	; V Load nl
	mov ecx,[esp+4]		; U Load q (ecx is caller-saved; original clobbered ebx)
	div DWORD PTR [esp+16]	; NP edx:eax / d
	mov [ecx],eax		; U Store quotient
	mov eax,edx		; V Return remainder
	ret
_lbnDiv21_32 endp

;; Multi-word by one-word remainder.
;; This speeds up key generation.  It's not worth unrolling and so on;
;; using 32-bit divides is enough of a speedup.
;;
;; The modulus (in ecx) is often 16 bits.  Given that the dividend is 32
;; bits, the chances of saving the first divide because the high word of
;; the dividend is less than the modulus are low enough it's not worth
;; taking the cycles to test for it.
;;
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;;                                    4             8           12
;;
;; NOTE(review): the loop walks from n[0] upward while carrying the
;; remainder in edx, which is only correct if n[0] is the MOST
;; significant word — confirm against the library's word-order
;; (BIGLITTLE) configuration.
	align 4
_lbnModQ_32 proc near
	push ebx		; preserve callee-saved ebx (cdecl)
	mov ebx,[esp+8]		; U Load n
	mov ecx,[esp+16]	; V Load d
	push ebp		; U
	mov ebp,[esp+16]	; V Load len
	xor edx,edx		; U running remainder starts at 0
modq32_loop:
	mov eax,[ebx]		; U
	add ebx,4		; V
	div ecx			; NP edx:eax / d; remainder stays in edx for next word
	dec ebp			; U
	jnz SHORT modq32_loop	; V
	pop ebp			; U
	pop ebx
	mov eax,edx		; V BUGFIX: return the remainder (was "mov edx,eax",
				;   which returned the last partial quotient)
	ret			; NP
_lbnModQ_32 endp

_TEXT ends
	end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -