lbn8086.asm

来自「包含哈希,对称以及非对称的经典算法包含经典事例」· 汇编代码 · 共 1,039 行 · 第 1/2 页
ASM
1,039 行
;; performance out of a Pentium.  If you want to follow the code,;; pretend that the sections actually come in the following order:;; 1) prologue (push registers);; 2) load (fetch arguments);; 3) first multiply;; 4) loop unrolling;;;; The loop unrolling setup consists of taking the count, adjusting;; it to account for the first multiply, and splitting it into;; two parts: the high bits are a loop count, while the low bits are;; used to find the right entry in the Duff's device jump table and;; to adjust the initial data pointers.;;;; Known slack: There is one instruction in the prologue and one in;; the epilogue that could go in the V pipe if I could find a U-pipe;; instruction to pair them with, but all the U-pipe instructions;; are already paired, so it looks difficult.;;;; There is a cycle of Address Generation Interlock in the lbnMulN1_32;; code on the Pentium (not on a '486).  I can't figure out how to;; get rid of it without wasting time elsewhere.  The problem is that;; the load of bx needs to be done as soon as possible to let it;; be set up in time for the switch().  The other problem is the;; epilogue code which can waste time if the order of the pushed;; registers is diddled with so that ds doesn't come between si and di.;;;; The increment of si after the last load is redundant, and the;; copy of the high word of the product to the carry after the last;; multiply is likewise unnecessary.;;;; In these cases, the operations were done that way in order to remove;; cycles from the loop on the '486 and/or Pentium, even though it costs;; a few overhead cycles on a '386.;; The increment fo si has to be done early because a load based on si;; is the first thing in any given multiply step, and the address;; generation interlock on the '486 and Pentium requires that a full;; cycle (i.e. possibly two instructions on a Pentium) pass between;; incrementing a register and using it in an address.;; This saves one cycle per multiply on a '486 and Pentium, and costs;; 2 cycles per call to the function on a '386 and 1 cycle on a '486.;;;; The carry word is copied where it is so that the decrement of the loop;; counter happens in the V pipe.  The instruction between the decrement;; of the loop counter and the branch should be a U-pipe instruction that;; doesn't affect the flags.  Thus, the "mov" was rotated down from;; the top of the loop to fill the slot.;; This is a bit more marginal: it saves one cycle per loop iteration on;; a Pentium, and costs 2 cycles per call on a '386, '486 or Pentium.;;;; The same logic applies to the copy of the carry and increment of si;; before the test, in case 0, for skipping the loop entirely.;; It makes no difference in speed if the loop is executed, but;; incrementing si before saves an address generation interlock cycle;; On a '486 and Pentium in the case that the loop is executed.;; And the loop is executed more often than not.;;;; Given that just one multiply on a '386 takes 12 to 41 cycles (with the;; average being very much at the high end of that) 4 cycles of additional;; overhead per call is not a big deal.;;;; On a Pentium, it would actually be easier to *not* unroll the loop;; at all, since the decrement and compare are completely hidden;; in the V-pipe and it wouldn't cost anything to do them more often.;; That would save the setup for the unrolling and Duff's device at the;; beginning.  But the overhead for that is pretty minor: ignoring what's;; hidden in the V pipe, it's two cycles plus the indirect jump.;; Not too much, and special-casing the pentium is quite a hassle.;; (For starters, you have to detect it, and since you're probably in;; V86 mode, without access to the EFLAGS register to test the CPUID bit.)	align	16_lbnMulN1_32	proc	far	push	bp		; U	prologue	** Could be V	mov	bp,sp		; V	prologue	push	si		; U	prologue	** Could be V	mov	bx,[bp+14]	; U	load len	** Could be V (AGI!)r	push	ds		; NP	prologue	les	si,[bp+10]	; NP	load in	mov     ecx,[bp+16]	; U	load k	dec	bx		; V	loop unrolling	shl	bx,2		; U	loop unrolling	push	di		; V	prologue	lds	di,[bp+6]	; NP	load out	mov	bp,bx		; U	loop unrolling	** Could be V	and	bx,12		; V	loop unrolling;; First multiply step has no carry in.	mov	eax,es:[si]	; U	first multiply	add	si,bx		; V	loop unrolling	mul	ecx		; NP	first multiply	mov	[di],eax	; U	first multiply	add	di,bx		; V	loop unrolling;; The switch() for Duff's device.  This jump table is (slightly!) faster;; than a bunch of branches on a '386 and '486, and is probably better yet;; on higher processors.	jmp	WORD PTR cs:m32_jumptable[bx]	; NP	loop unrolling	align 2m32_jumptable:	dw	OFFSET m32_case0, 0	dw	OFFSET m32_case1, 0	dw	OFFSET m32_case2, 0	dw	OFFSET m32_case3, 0, 0, 0, 0	; Get loop aligned properlym32_case0:	add	si,16		; U	Fix up si	** Could be V	test	bp,bp		; V	mov	ebx,edx		; U	Remember carry for later	jbe	SHORT m32_done	; V	Avoid entire loop if loop count is 0m32_loop:	mov	eax,es:[si-12]	; U	add	di, 16		; V	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	mov	[di-12],eax	; Um32_case3:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-8]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	mov	[di-8],eax	; Um32_case2:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-4]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	mov	[di-4],eax	; Um32_case1:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	si,16  		; V	mov	[di],eax	; U	sub	bp,16		; V	mov	ebx,edx		; U	Remember carry for later	ja	m32_loop	; Vm32_done:	mov	[di+4],edx	; U	pop	di		; V	pop	ds		; NP	pop	si		; U	** Could be V	pop	bp		; V	ret			; NP_lbnMulN1_32	endp	align	16_lbnMulAdd1_32	proc	far	push	bp		; U	prologue	** Could be V	mov	bp,sp		; V	prologue	push	ds		; NP	prologue	mov     ecx,[bp+16]	; U	load k	mov	bx,[bp+14]	; V	load len	push	di		; U	prologue	** Could be V	dec	bx		; V	loop unrolling	lds	di,[bp+6]	; NP	load out	shl	bx,2		; U	loop unrolling	push	si		; V	prologue	les	si,[bp+10]	; NP	load in	mov	bp,bx		; U	loop unrolling	** Could be V	and	bx,12		; V	loop unrolling;; First multiply step has no carry in.	mov	eax,es:[si]	; U	first multiply	add	si,bx		; V	loop unrolling	mul	ecx		; NP	first multiply	add	[di],eax	; U	first multiply	adc	edx,0		; U	first multiply	add	di,bx		; V	loop unrolling;; The switch() for Duff's device.  This jump table is (slightly!) faster;; than a bunch of branches on a '386 and '486, and is probably better yet;; on higher processors.	jmp	WORD PTR cs:ma32_jumptable[bx]	; NP	loop unrolling	align 2ma32_jumptable:	dw	OFFSET ma32_case0, 0	dw	OFFSET ma32_case1, 0	dw	OFFSET ma32_case2, 0	dw	OFFSET ma32_case3, 0, 0	; To get loop aligned properlyma32_case0:	add	si,16		; U	Fix up si	** Could be V	test	bp,bp		; V	mov	ebx,edx		; U	Remember carry for later	jbe	SHORT ma32_done	; V	Avoid entire loop if loop count is 0ma32_loop:	mov	eax,es:[si-12]	; U	add	di, 16		; V	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	[di-12],eax	; U	adc	edx,0		; Uma32_case3:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-8]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	[di-8],eax	; U	adc	edx,0		; Uma32_case2:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-4]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	[di-4],eax	; U	adc	edx,0		; Uma32_case1:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	si,16  		; V	add	[di],eax	; U	adc	edx,0		; U	sub	bp,16		; V	mov	ebx,edx		; U	Remember carry for later	ja	ma32_loop	; Vma32_done:	pop	si	; U	** Could be V	pop	di	; V	mov	ax,dx	; U	return value low	** Could be V	pop	ds	; NP	shr	edx,16	; U	return value high	pop	bp	; V	ret		; NP_lbnMulAdd1_32	endp	align	16_lbnMulSub1_32	proc	far	push	bp		; U	prologue	** Could be V	mov	bp,sp		; V	prologue	push	ds		; NP	prologue	mov     ecx,[bp+16]	; U	load k	mov	bx,[bp+14]	; V	load len	push	di		; U	prologue	** Could be V	dec	bx		; V	loop unrolling	lds	di,[bp+6]	; NP	load out	shl	bx,2		; U	loop unrolling	push	si		; V	prologue	les	si,[bp+10]	; NP	load in	mov	bp,bx		; U	loop unrolling	** Could be V	and	bx,12		; V	loop unrolling;; First multiply step has no carry in.	mov	eax,es:[si]	; U	first multiply	add	si,bx		; V	loop unrolling	mul	ecx		; NP	first multiply	sub	[di],eax	; U	first multiply	adc	edx,0		; U	first multiply	add	di,bx		; V	loop unrolling;; The switch() for Duff's device.  This jump table is (slightly!) faster;; than a bunch of branches on a '386 and '486, and is probably better yet;; on higher processors.	jmp	WORD PTR cs:ms32_jumptable[bx]	; NP	loop unrolling	align 2ms32_jumptable:	dw	OFFSET ms32_case0, 0	dw	OFFSET ms32_case1, 0	dw	OFFSET ms32_case2, 0	dw	OFFSET ms32_case3, 0, 0	; To get loop aligned properlyms32_case0:	add	si,16		; U	Fix up si	** Could be V	test	bp,bp		; V	mov	ebx,edx		; U	Remember carry for later	jbe	SHORT ms32_done	; V	Avoid entire loop if loop count is 0ms32_loop:	mov	eax,es:[si-12]	; U	add	di, 16		; V	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	sub	[di-12],eax	; U	adc	edx,0		; Ums32_case3:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-8]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	sub	[di-8],eax	; U	adc	edx,0		; Ums32_case2:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si-4]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	sub	[di-4],eax	; U	adc	edx,0		; Ums32_case1:	mov	ebx,edx		; U	Remember carry for later	mov	eax,es:[si]	; U	mul	ecx		; NP	add	eax,ebx		; U	Add carry in from previous word	adc	edx,0		; U	add	si,16  		; V	sub	[di],eax	; U	adc	edx,0		; U	sub	bp,16		; V	mov	ebx,edx		; U	Remember carry for later	ja	ms32_loop	; Vms32_done:	pop	si	; U	** Could be V	pop	di	; V	mov	ax,dx	; U	return value low	** Could be V	pop	ds	; NP	shr	edx,16	; U	return value high	pop	bp	; V	ret		; NP_lbnMulSub1_32	endp;; Just for interest's sake, here's a completely Pentium-optimized version.;; In addition to being smaller, it takes 8 + (8+mul_time)*n cycles, as;; compared to the 10 + jmp_time + (8+mul_time)*n cycles for the loop above.;; (I don't know how long a 32x32->64 bit multiply or an indirect jump;; take on a Pentium, so plug those numbers in.);	align	2;	nop	; To align loop nicely;P_lbnMulAdd1_32	proc	far;;	push	bp		; U	prologue	** Could be V;	mov	bp,sp		; V	prologue;	push	ds		; NP	prologue;	mov     ecx,[bp+16]	; U	load k;	push	si		; V	prologue;	lds	si,[bp+10]	; NP	load in;	mov	eax,[si]	; U	first multiply;	push	di		; V	prologue;	mul	ecx		; NP	first multiply;	les	di,[bp+6]	; NP	load out;	add	es:[di],eax	; U	first multiply;	mov	bp,[bp+14]	; V	load len;	adc	edx,0		; U	first multiply;	dec     bp		; V;	mov	ebx,edx		; U	Remember carry for later;	je	Pma32_done	; V;Pma32_loop:;	mov	eax,[si+4]	; U;	add	di,4		; V;	mul	ecx		; NP;	add	eax,ebx		; U	Add carry in from previous word;	adc	edx,0		; U;	add	si,4		; V;	add	es:[di],eax	; U;	adc	edx,0		; U;	dec	bp		; V;	mov	ebx,edx		; U	Remember carry for later;	jne	Pma32_loop	; V;Pma32_done:;	pop	di	; U	** Could be V;	pop	si	; V;	pop	ds	; NP;	mov	ax,dx	; U	return value low	** Could be V;	pop	bp	; V;	shr	edx,16	; U	return value high;	ret		; NP;;P_lbnMulAdd1_32	endp;; Two-word by one-word divide.  Stores quotient, returns remainder.;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d);;                      4            8            12           16	align	16_lbnDiv21_32	proc	far	mov	cx,bp			; U	bp NOT pushed; offsets differ	mov	bp,sp			; V					; AGI	mov	edx,[bp+8]		; U	mov	eax,[bp+12]		; U	div	DWORD PTR [bp+16]	; NP	les	bx,[bp+4]		; NP	mov	es:[bx],eax		; U	mov	ax,dx			; V	shr	edx,16			; U	mov	bp,cx			; V	ret				; NP	nop	nop	nop	nop				; Get lbnModQ_32 aligned properly_lbnDiv21_32	endp;; Multi-word by one-word remainder.;; This speeds up key generation.  It's not worth unrolling and so on;;; using 32-bit divides is enough of a speedup.;;;; bp is used as a counter so that all the 32-bit values can be in;; caller-save registers (eax, ecx, edx).  bx is needed as a pointer.;;;; The modulus (in ebp) is 16 bits.  Given that the dividend is 32 bits,;; the chances of saving the first divide because the high word of the;; dividend is less than the modulus are low enough it's not worth taking;; the cycles to test for it.;;;; unsigned lbnModQ_32(BNWORD16 *q, unsigned len, unsigned d);;                     6            10            12_lbnModQ_32	proc	far	xor	ecx,ecx		; U	Clear ecx (really, the high half)	push	bp		; V	mov	edx,ecx		; U	Clear high word for first divide	mov	bp,sp		; V	push	ds		; NP	lds	ax,[bp+6]	; NP	Load dividend pointer	mov	bx,[bp+10]	; U	Load count	** Could be V	sub	ax,4		; V	Offset dividend pointer	mov	cx,[bp+12]	; U	Load modulus	** Could be V	mov	bp,bx		; V	Copy count	shl	bx,2		; U	Shift index	add	bx,ax		; U	Add base	** Could be V;	lea	bx,[eax+ebp*4-4]; U	Move pointer to high wordmodq32_loop:	mov	eax,[bx]	; U	sub	bx,4		; V	div	ecx		; NP	dec	bp		; U	** Could be V	jnz	modq32_loop	; Vmodq32_done:	pop	ds		; NP	mov	ax,dx		; U	** Could be V	pop	bp		; V	ret			; NP_lbnModQ_32	endp;; int not386(void) returns 0 on a 32-bit (386 or better) processor;;; non-zero if an 80286 or lower.  The Z flag is set to reflect;; ax on return.  This is only called once, so it doesn't matter how;; it's aligned._not386 proc	far;;;; This first test detects 80x86 for x < 2.  On the 8086 and '186,;; "push sp" does "--sp; sp[0] = sp".  On all later processors, it does;; "sp[-1] = sp; --sp".;;	push	sp	pop	ax	sub	ax,sp	jne	SHORT return;; This test is the key one.  It will probably detect 8086, V30 and 80186;; as well as 80286, but I haven't had access to test it on any of those,;; so it's protected by the well-known test above.  It has been tested;; on the 80286, 80386, 80486, Pentium and AMD tested it on their K5.;; I have not been able to confirm effectiveness on the P6 yet, although;; someone I spoke to at Intel said it should work.;;;; This test uses the fact that the '386 and above have a barrel shifter;; to do shifts, while the '286 does left shifts by releated adds.;; That means that on the '286, the auxilliary carry gets a copy of;; bit 4 of the shift output, while on the '386 and up, it's trashed;; (as it happens, set to 1) independent of the result.  (It's documented;; as undefined.);;;; We do two shifts, which should produce different auxilliary carries;; on a '286 and XOR them to see if they are different.  Even on a;; future processor that does something different with the aux carry;; flag, it probably does something data-independent, so this will still;; work.  Note that all flags except aux carry are defined for shl;; output and will be the same for both cases.	mov	al,4	shl	al,1	; Expected to produce ac = 0 on a '286	lahf	shl	al,1	; Expected to produce ac = 1 on a '286	mov	al,ah	lahf	xor	al,ah	; Xor the flags together to detect the difference	mov	ah,al	; Clear ah if al is clear, leave Z flag alonereturn:	ret_not386	endp_TEXT	ends	end
lbn8086.asm - 源码说明

本页面展示了「包含哈希,对称以及非对称的经典算法包含经典事例」中的 lbn8086.asm 源码文件，采用汇编编程语言编写，共 1,039 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与对称相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?