⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sad_ia64.s

📁 这是一个压缩解压包,用C语言进行编程的,里面有详细的源代码.
💻 S
📖 第 1 页 / 共 2 页
字号:
//   ------------------------------------------------------------------------------
//   *
//   * Optimized Assembler Versions of sad8 and sad16
//   *
//   ------------------------------------------------------------------------------
//   *
//   * Hannes Jütting and Christopher Özbek
//   * {s_juetti,s_oezbek}@ira.uka.de
//   *
//   * Programmed for the IA64 laboratory held at University Karlsruhe 2002
//   * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
//   *
//   ------------------------------------------------------------------------------
//   *
//   * These are the optimized assembler versions of sad8 and sad16, which calculate
//   * the sum of absolute differences between two 8x8/16x16 block matrices.
//   *
//   * Our approach uses:
//   *   - The Itanium command psad1, which solves the problem in hardware.
//   *   - Modulo-Scheduled Loops as the best way to loop unrolling on the IA64
//   *     EPIC architecture
//   *   - Alignment resolving to avoid memory faults
//   *
//   ------------------------------------------------------------------------------

	.common	sad16bi#,8,8

//   ------------------------------------------------------------------------------
//   sad16bi_ia64
//
//   For a block of 16 rows x 16 bytes, accumulates the absolute difference
//   between each byte read through r32 and the rounded average
//   ((a + b + 1) >> 1) of the matching bytes read through r33 and r34.
//
//   In:    r32      = block compared against the average
//          r33, r34 = the two blocks that are averaged
//          r35      = row stride in bytes (low 32 bits, zero-extended here)
//   Out:   r8       = accumulated sum
//   Uses:  r2, r14-r23, p6, p7; r32-r34 are advanced by the stride per row;
//          ar.lc is saved in r2 and restored before returning.
//   NOTE(review): presumably r32/r33/r34/r35 are cur/ref1/ref2/stride of the
//   C prototype - confirm against the caller.
//   ------------------------------------------------------------------------------
	.align 16
	.global sad16bi_ia64#
	.proc sad16bi_ia64#
sad16bi_ia64:
	.prologue
	.save ar.lc, r2
	mov r2 = ar.lc			// keep the caller's loop counter
	.body
	zxt4 r35 = r35			// stride: use only the low 32 bits
	mov r8 = r0			// r8  = running sum
	mov r23 = r0			// r23 = row counter
	addl r22 = 255, r0		// r22 = upper clamp bound
.L21:					// ---- per-row setup ----
	addl r14 = 7, r0		// 8 inner iterations, 2 pixels each
	mov r19 = r32			// r19 walks the compared row
	mov r21 = r34			// r21 / r20 walk the two averaged rows
	mov r20 = r33
	;;
	mov ar.lc = r14
	;;
.L105:					// ---- first pixel of the pair ----
	mov r17 = r20
	mov r18 = r21
	;;
	ld1 r14 = [r17], 1		// post-increment: r17/r18 now point at the second pixel
	ld1 r15 = [r18], 1
	;;
	add r14 = r14, r15
	;;
	adds r14 = 1, r14
	;;
	shr.u r16 = r14, 1		// r16 = (a + b + 1) >> 1
	;;
	// Clamp r16 to [0, 255]. With two zero-extended byte inputs the average
	// is at most (255 + 255 + 1) >> 1 = 255, so neither clamp changes it.
	cmp4.le p6, p7 = r0, r16
	;;
	(p7) mov r16 = r0
	(p7) br.cond.dpnt .L96
	;;
	cmp4.ge p6, p7 = r22, r16
	;;
	(p7) addl r16 = 255, r0
.L96:
	ld1 r14 = [r19]
	adds r20 = 2, r20		// advance both averaged-row pointers past this pair
	adds r21 = 2, r21
	;;
	sub r15 = r14, r16		// r15 = pixel - average
	;;
	cmp4.ge p6, p7 = 0, r15		// p6: difference <= 0, p7: difference > 0
	;;
	(p6) sub r14 = r16, r14		// negate so a non-negative value is added
	(p7) add r8 = r8, r15
	;;
	(p6) add r8 = r8, r14
	// ---- second pixel of the pair ----
	ld1 r15 = [r18]
	ld1 r14 = [r17]
	;;
	add r14 = r14, r15
	adds r17 = 1, r19		// r17 = address of the second compared pixel
	;;
	adds r14 = 1, r14
	;;
	shr.u r16 = r14, 1
	;;
	cmp4.le p6, p7 = r0, r16	// same clamp as above
	;;
	(p7) mov r16 = r0
	(p7) br.cond.dpnt .L102
	;;
	cmp4.ge p6, p7 = r22, r16
	;;
	(p7) addl r16 = 255, r0
.L102:
	ld1 r14 = [r17]
	adds r19 = 2, r19
	;;
	sub r15 = r14, r16
	;;
	cmp4.ge p6, p7 = 0, r15
	;;
	(p7) add r8 = r8, r15
	(p6) sub r14 = r16, r14
	;;
	(p6) add r8 = r8, r14
	br.cloop.sptk.few .L105
	// ---- next row ----
	adds r23 = 1, r23
	add r32 = r32, r35
	add r33 = r33, r35
	add r34 = r34, r35
	;;
	cmp4.geu p6, p7 = 15, r23	// loop while the row counter is <= 15 (16 rows)
	(p6) br.cond.dptk .L21
	mov ar.lc = r2			// restore the caller's loop counter
	br.ret.sptk.many b0
	.endp sad16bi_ia64#


//   ------------------------------------------------------------------------------
//   dev16_ia64
//
//   Loads 16 rows of 16 bytes (two ld8 per row) starting at in0 with row
//   stride in1, computes mean = (sum of all 256 bytes) >> 8, broadcasts the
//   mean to all eight byte lanes and returns the sum of |byte - mean|.
//
//   In:    in0 = address of the block, in1 = row stride in bytes
//   Out:   ret0 = sum of absolute deviations from the mean
//   Uses:  r18-r29 (named below) and the register arrays c[32] / psad[8]
//   NOTE(review): c[] is declared with .rotr and presumably starts at r32, i.e.
//   it overlays in0/in1 - which would be why both arguments are copied out
//   before the first load. Confirm against the assembler reference.
//   ------------------------------------------------------------------------------
	.text
	.align 16
	.global dev16_ia64#
	.proc dev16_ia64#
.auto
dev16_ia64:
	// renamings for better readability
	stride = r18
	pfs = r19			//for saving previous function state
	cura0 = r20			//address of first 8-byte block of cur
	cura1 = r21			//address of second 8-byte block of cur
	mean0 = r22			//registers for calculating the sum in parallel
	mean1 = r23
	mean2 = r24
	mean3 = r25
	dev0 = r26			//same for the deviation
	dev1 = r27
	dev2 = r28
	dev3 = r29

	.body
	alloc pfs = ar.pfs, 2, 38, 0, 40
	mov cura0  = in0
	mov stride = in1
	add cura1 = 8, cura0

	.rotr c[32], psad[8] 		// just using rotating registers to get an array ;-)
.explicit
{.mmi
	ld8 c[0] = [cura0], stride	// load them ...
	ld8 c[1] = [cura1], stride
	;;
}
{.mmi
	ld8 c[2] = [cura0], stride
	ld8 c[3] = [cura1], stride
	;;
}
{.mmi
	ld8 c[4] = [cura0], stride
	ld8 c[5] = [cura1], stride
	;;
}
{.mmi
	ld8 c[6] = [cura0], stride
	ld8 c[7] = [cura1], stride
	;;
}
{.mmi
	ld8 c[8] = [cura0], stride
	ld8 c[9] = [cura1], stride
	;;
}
{.mmi
	ld8 c[10] = [cura0], stride
	ld8 c[11] = [cura1], stride
	;;
}
{.mii
	ld8 c[12] = [cura0], stride
	psad1 mean0 = c[0], r0		// get the sum of them ... (psad1 against r0 adds up the 8 bytes)
	psad1 mean1 = c[1], r0
}
{.mmi
	ld8 c[13] = [cura1], stride
	;;
	ld8 c[14] = [cura0], stride
	psad1 mean2 = c[2], r0
}
{.mii
	ld8 c[15] = [cura1], stride
	psad1 mean3 = c[3], r0
	;;
	psad1 psad[0] = c[4], r0
}
{.mmi
	ld8 c[16] = [cura0], stride
	ld8 c[17] = [cura1], stride
	psad1 psad[1] = c[5], r0
	;;
}
{.mii
	ld8 c[18] = [cura0], stride
	psad1 psad[2] = c[6], r0
	psad1 psad[3] = c[7], r0
}
{.mmi
	ld8 c[19] = [cura1], stride
	;;
	ld8 c[20] = [cura0], stride
	psad1 psad[4] = c[8], r0
}
{.mii
	ld8 c[21] = [cura1], stride
	psad1 psad[5] = c[9], r0
	;;
	add mean0 = mean0, psad[0]
}
{.mmi
	ld8 c[22] = [cura0], stride
	ld8 c[23] = [cura1], stride
	add mean1 = mean1, psad[1]
	;;
}
{.mii
	ld8 c[24] = [cura0], stride
	psad1 psad[0] = c[10], r0
	psad1 psad[1] = c[11], r0
}
{.mmi
	ld8 c[25] = [cura1], stride
	;;
	ld8 c[26] = [cura0], stride
	add mean2 = mean2, psad[2]
}
{.mii
	ld8 c[27] = [cura1], stride
	add mean3 = mean3, psad[3]
	;;
	psad1 psad[2] = c[12], r0
}
{.mmi
	ld8 c[28] = [cura0], stride
	ld8 c[29] = [cura1], stride
	psad1 psad[3] = c[13], r0
	;;
}
{.mii
	ld8 c[30] = [cura0]		// last row: no post-increment
	psad1 psad[6] = c[14], r0
	psad1 psad[7] = c[15], r0
}
{.mmi
	ld8 c[31] = [cura1]
	;;
	add mean0 = mean0, psad[0]
	add mean1 = mean1, psad[1]
}
{.mii
	add mean2 = mean2, psad[4]
	add mean3 = mean3, psad[5]
	;;
	psad1 psad[0] = c[16], r0
}
{.mmi
	add mean0 = mean0, psad[2]
	add mean1 = mean1, psad[3]
	psad1 psad[1] = c[17], r0
	;;
}
{.mii
	add mean2 = mean2, psad[6]
	psad1 psad[2] = c[18], r0
	psad1 psad[3] = c[19], r0
}
{.mmi
	add mean3 = mean3, psad[7]
	;;
	add mean0 = mean0, psad[0]
	psad1 psad[4] = c[20], r0
}
{.mii
	add mean1 = mean1, psad[1]
	psad1 psad[5] = c[21], r0
	;;
	psad1 psad[6] = c[22], r0
}
{.mmi
	add mean2 = mean2, psad[2]
	add mean3 = mean3, psad[3]
	psad1 psad[7] = c[23], r0
	;;
}
{.mii
	add mean0 = mean0, psad[4]
	psad1 psad[0] = c[24], r0
	psad1 psad[1] = c[25], r0
}
{.mmi
	add mean1 = mean1, psad[5]
	;;
	add mean2 = mean2, psad[6]
	psad1 psad[2] = c[26], r0
}
{.mii
	add mean3 = mean3, psad[7]
	psad1 psad[3] = c[27], r0
	;;
	psad1 psad[4] = c[28], r0
}
{.mmi
	add mean0 = mean0, psad[0]
	add mean1 = mean1, psad[1]
	psad1 psad[5] = c[29], r0
	;;
}
{.mii
	add mean2 = mean2, psad[2]
	psad1 psad[6] = c[30], r0
	psad1 psad[7] = c[31], r0
}
{.mmi
	add mean3 = mean3, psad[3]
	;;
	add mean0 = mean0, psad[4]
	add mean1 = mean1, psad[5]
}
{.mbb
	add mean2 = mean2, mean3
	nop.b 1
	nop.b 1
	;;
}
{.mib
	add mean0 = mean0, psad[6]
	add mean1 = mean1, psad[7]
	nop.b 1
	;;
}
{.mib
	add mean0 = mean0, mean1
	// add mean2 = 127, mean2	// this could make our division more exactly, but does not help much
	;;
}
{.mib
	add mean0 = mean0, mean2	// mean0 = sum of all 256 bytes
	;;
}
{.mib
	shr.u mean0 = mean0, 8		// divide them ... (by 256, truncating)
	;;
}
{.mib
	mux1 mean0 = mean0, @brcst	// replicate the mean byte into all 8 lanes
	;;
}
{.mii
	nop.m 0
	psad1 dev0 = c[0], mean0	// and do a sad again ...
	psad1 dev1 = c[1], mean0
}
{.mii
	nop.m 0
	psad1 dev2 = c[2], mean0
	psad1 dev3 = c[3], mean0
}
{.mii
	nop.m 0
	psad1 psad[0] = c[4], mean0
	psad1 psad[1] = c[5], mean0
}
{.mii
	nop.m 0
	psad1 psad[2] = c[6], mean0
	psad1 psad[3] = c[7], mean0
}
{.mii
	nop.m 0
	psad1 psad[4] = c[8], mean0
	psad1 psad[5] = c[9], mean0
	;;
}
{.mii
	add dev0 = dev0, psad[0]
	psad1 psad[6] = c[10], mean0
	psad1 psad[7] = c[11], mean0
}
{.mmi
	add dev1 = dev1, psad[1]
	add dev2 = dev2, psad[2]
	psad1 psad[0] = c[12], mean0
}
{.mii
	add dev3 = dev3, psad[3]
	psad1 psad[1] = c[13], mean0
	;;
	psad1 psad[2] = c[14], mean0
}
{.mmi
	add dev0 = dev0, psad[4]
	add dev1 = dev1, psad[5]
	psad1 psad[3] = c[15], mean0
}
{.mii
	add dev2 = dev2, psad[6]
	psad1 psad[4] = c[16], mean0
	psad1 psad[5] = c[17], mean0
}
{.mmi
	add dev3 = dev3, psad[7]
	;;
	add dev0 = dev0, psad[0]
	psad1 psad[6] = c[18], mean0
}
{.mii
	add dev1 = dev1, psad[1]
	psad1 psad[7] = c[19], mean0
	psad1 psad[0] = c[20], mean0
}
{.mmi
	add dev2 = dev2, psad[2]
	add dev3 = dev3, psad[3]
	psad1 psad[1] = c[21], mean0
	;;
}
{.mii
	add dev0 = dev0, psad[4]
	psad1 psad[2] = c[22], mean0
	psad1 psad[3] = c[23], mean0
}
{.mmi
	add dev1 = dev1, psad[5]
	add dev2 = dev2, psad[6]
	psad1 psad[4] = c[24], mean0
}
{.mii
	add dev3 = dev3, psad[7]
	psad1 psad[5] = c[25], mean0
	;;
	psad1 psad[6] = c[26], mean0
}
{.mmi
	add dev0 = dev0, psad[0]
	add dev1 = dev1, psad[1]
	psad1 psad[7] = c[27], mean0
}
{.mii
	add dev2 = dev2, psad[2]
	psad1 psad[0] = c[28], mean0
	psad1 psad[1] = c[29], mean0
}
{.mmi
	add dev3 = dev3, psad[3]
	;;
	add dev0 = dev0, psad[4]
	psad1 psad[2] = c[30], mean0
}
{.mii
	add dev1 = dev1, psad[5]
	psad1 psad[3] = c[31], mean0
	;;
	add dev2 = dev2, psad[6]
}
{.mmi
	add dev3 = dev3, psad[7]
	add dev0 = dev0, psad[0]
	add dev1 = dev1, psad[1]
	;;
}
{.mii
	add dev2 = dev2, psad[2]
	add dev3 = dev3, psad[3]
	add ret0 = dev0, dev1
	;;
}
{.mib
	add dev2 = dev2, dev3
	nop.i 1
	nop.b 1
	;;
}
{.mib
	add ret0 = ret0, dev2		// ret0 = dev0 + dev1 + dev2 + dev3
	nop.i 1
	br.ret.sptk.many b0
}
	.endp dev16_ia64#

// ###########################################################
// ###########################################################
// New version by group 01 ###################################
// ###########################################################
// ###########################################################

//   sad16_ia64 - setup part: splits r33 into an 8-byte-aligned address (r14)
//   and its misalignment (r31), precomputes 1x..15x of r34 in r64-r78, sets
//   one predicate pair per misalignment value 0..6, and derives 16 row
//   addresses from r14 (r96-r111) and from r32 (odd registers r33-r63).
//   NOTE(review): alloc stores ar.pfs in r1, which is the gp register in the
//   IA-64 software conventions - confirm it is handled by the rest of the
//   procedure.
	.text
	.align 16
	.global sad16_ia64#
	.proc sad16_ia64#
sad16_ia64:
	alloc r1 = ar.pfs, 4, 76, 0, 0
	mov r2 = pr
	dep r14 = r0, r33, 0, 3		// r14 = (r33 div 8)*8 (aligned version of ref)
	dep.z r31 = r33, 0, 3		// r31 = r33 mod 8 (misalignment of ref)
	;;
	mov r64 = r34			//(1) calculate multiples of stride
	shl r65 = r34, 1		//(2) for being able to load all the
	shladd r66 = r34, 1, r34	//(3) data at once
	shl r67 = r34, 2		//(4)
	shladd r68 = r34, 2, r34	//(5)
	shl r71 = r34, 3		//(8)
	shladd r72 = r34, 3, r34	//(9)
	;;
	shl r69 = r66, 1		//(6)
	shladd r70 = r66, 1, r34	//(7)
	shl r73 = r68, 1		//(10)
	shladd r74 = r68, 1, r34	//(11)
	shl r75 = r66, 2		//(12)
	shladd r76 = r66, 2, r34	//(13)
	shladd r77 = r66, 2, r65	//(14)
	shladd r78 = r66, 2, r66	//(15)
	;;
	cmp.eq p16, p17 = 0, r31	// prepare predicates according to the misalignment
	cmp.eq p18, p19 = 2, r31	// of ref
	cmp.eq p20, p21 = 4, r31
	cmp.eq p22, p23 = 6, r31
	cmp.eq p24, p25 = 1, r31
	cmp.eq p26, p27 = 3, r31
	cmp.eq p28, p29 = 5, r31
	mov r96 = r14			// and calculate all the addresses where we have
	mov r33 = r32			// to load from
	add r97 = r14, r64
	add r35 = r32, r64
	add r98 = r14, r65
	add r37 = r32, r65
	add r99 = r14, r66
	add r39 = r32, r66
	add r100 = r14, r67
	add r41 = r32, r67
	add r101 = r14, r68
	add r43 = r32, r68
	add r102 = r14, r69
	add r45 = r32, r69
	add r103 = r14, r70
	add r47 = r32, r70
	add r104 = r14, r71
	add r49 = r32, r71
	add r105 = r14, r72
	add r51 = r32, r72
	add r106 = r14, r73
	add r53 = r32, r73
	add r107 = r14, r74
	add r55 = r32, r74
	add r108 = r14, r75
	add r57 = r32, r75
	add r109 = r14, r76
	add r59 = r32, r76
	add r110 = r14, r77
	add r61 = r32, r77
	add r111 = r14, r78
	add r63 = r32, r78
	;;
	ld8 r32 = [r33], 8		// Load all the data which is needed for the sad
	ld8 r34 = [r35], 8		// in the registers. the goal is to have the array

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -