// sad_ia64.s
// ------------------------------------------------------------------------------
// *
// * Optimized Assembler Versions of sad8 and sad16
// *
// ------------------------------------------------------------------------------
// *
// * Hannes Jütting and Christopher Özbek
// * {s_juetti,s_oezbek}@ira.uka.de
// *
// * Programmed for the IA64 laboratory held at University Karlsruhe 2002
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
// *
// ------------------------------------------------------------------------------
// *
// * These are the optimized assembler versions of sad8 and sad16, which calculate
// * the sum of absolute differences between two 8x8/16x16 block matrices.
// *
// * Our approach uses:
// *   - The Itanium instruction psad1, which solves the problem in hardware.
// *   - Modulo-scheduled loops as the best way to unroll loops on the IA64
// *     EPIC architecture
// *   - Alignment resolving to avoid memory faults
// *
// * A plain-C sketch of what these routines compute is appended at the end of
// * this listing.
// *
// ------------------------------------------------------------------------------

    .common sad16bi#,8,8
    .align 16
    .global sad16bi_ia64#
    .proc sad16bi_ia64#
sad16bi_ia64:
    .prologue
    .save ar.lc, r2
    mov r2 = ar.lc
    .body
    zxt4 r35 = r35
    mov r8 = r0
    mov r23 = r0
    addl r22 = 255, r0
.L21:
    addl r14 = 7, r0
    mov r19 = r32
    mov r21 = r34
    mov r20 = r33
    ;;
    mov ar.lc = r14
    ;;
.L105:
    mov r17 = r20
    mov r18 = r21
    ;;
    ld1 r14 = [r17], 1
    ld1 r15 = [r18], 1
    ;;
    add r14 = r14, r15
    ;;
    adds r14 = 1, r14
    ;;
    shr.u r16 = r14, 1
    ;;
    cmp4.le p6, p7 = r0, r16
    ;;
    (p7) mov r16 = r0
    (p7) br.cond.dpnt .L96
    ;;
    cmp4.ge p6, p7 = r22, r16
    ;;
    (p7) addl r16 = 255, r0
.L96:
    ld1 r14 = [r19]
    adds r20 = 2, r20
    adds r21 = 2, r21
    ;;
    sub r15 = r14, r16
    ;;
    cmp4.ge p6, p7 = 0, r15
    ;;
    (p6) sub r14 = r16, r14
    (p7) add r8 = r8, r15
    ;;
    (p6) add r8 = r8, r14
    ld1 r15 = [r18]
    ld1 r14 = [r17]
    ;;
    add r14 = r14, r15
    adds r17 = 1, r19
    ;;
    adds r14 = 1, r14
    ;;
    shr.u r16 = r14, 1
    ;;
    cmp4.le p6, p7 = r0, r16
    ;;
    (p7) mov r16 = r0
    (p7) br.cond.dpnt .L102
    ;;
    cmp4.ge p6, p7 = r22, r16
    ;;
    (p7) addl r16 = 255, r0
.L102:
    ld1 r14 = [r17]
    adds r19 = 2, r19
    ;;
    sub r15 = r14, r16
    ;;
    cmp4.ge p6, p7 = 0, r15
    ;;
    (p7) add r8 = r8, r15
    (p6) sub r14 = r16, r14
    ;;
    (p6) add r8 = r8, r14
    br.cloop.sptk.few .L105
    adds r23 = 1, r23
    add r32 = r32, r35
    add r33 = r33, r35
    add r34 = r34, r35
    ;;
    cmp4.geu p6, p7 = 15, r23
    (p6) br.cond.dptk .L21
    mov ar.lc = r2
    br.ret.sptk.many b0
    .endp sad16bi_ia64#

    .text
    .align 16
    .global dev16_ia64#
    .proc dev16_ia64#
    .auto
dev16_ia64:
    // renamings for better readability
    stride = r18
    pfs    = r19    // for saving previous function state
    cura0  = r20    // address of first 8-byte block of cur
    cura1  = r21    // address of second 8-byte block of cur
    mean0  = r22    // registers for calculating the sum in parallel
    mean1  = r23
    mean2  = r24
    mean3  = r25
    dev0   = r26    // same for the deviation
    dev1   = r27
    dev2   = r28
    dev3   = r29
    .body
    alloc pfs = ar.pfs, 2, 38, 0, 40
    mov cura0 = in0
    mov stride = in1
    add cura1 = 8, cura0
    .rotr c[32], psad[8]    // just using rotating registers to get an array ;-)
    .explicit
{.mmi
    ld8 c[0] = [cura0], stride    // load them ...
    ld8 c[1] = [cura1], stride
    ;;
}
{.mmi
    ld8 c[2] = [cura0], stride
    ld8 c[3] = [cura1], stride
    ;;
}
{.mmi
    ld8 c[4] = [cura0], stride
    ld8 c[5] = [cura1], stride
    ;;
}
{.mmi
    ld8 c[6] = [cura0], stride
    ld8 c[7] = [cura1], stride
    ;;
}
{.mmi
    ld8 c[8] = [cura0], stride
    ld8 c[9] = [cura1], stride
    ;;
}
{.mmi
    ld8 c[10] = [cura0], stride
    ld8 c[11] = [cura1], stride
    ;;
}
{.mii
    ld8 c[12] = [cura0], stride
    psad1 mean0 = c[0], r0    // get the sum of them ...
    psad1 mean1 = c[1], r0
}
{.mmi
    ld8 c[13] = [cura1], stride
    ;;
    ld8 c[14] = [cura0], stride
    psad1 mean2 = c[2], r0
}
{.mii
    ld8 c[15] = [cura1], stride
    psad1 mean3 = c[3], r0
    ;;
    psad1 psad[0] = c[4], r0
}
{.mmi
    ld8 c[16] = [cura0], stride
    ld8 c[17] = [cura1], stride
    psad1 psad[1] = c[5], r0
    ;;
}
{.mii
    ld8 c[18] = [cura0], stride
    psad1 psad[2] = c[6], r0
    psad1 psad[3] = c[7], r0
}
{.mmi
    ld8 c[19] = [cura1], stride
    ;;
    ld8 c[20] = [cura0], stride
    psad1 psad[4] = c[8], r0
}
{.mii
    ld8 c[21] = [cura1], stride
    psad1 psad[5] = c[9], r0
    ;;
    add mean0 = mean0, psad[0]
}
{.mmi
    ld8 c[22] = [cura0], stride
    ld8 c[23] = [cura1], stride
    add mean1 = mean1, psad[1]
    ;;
}
{.mii
    ld8 c[24] = [cura0], stride
    psad1 psad[0] = c[10], r0
    psad1 psad[1] = c[11], r0
}
{.mmi
    ld8 c[25] = [cura1], stride
    ;;
    ld8 c[26] = [cura0], stride
    add mean2 = mean2, psad[2]
}
{.mii
    ld8 c[27] = [cura1], stride
    add mean3 = mean3, psad[3]
    ;;
    psad1 psad[2] = c[12], r0
}
{.mmi
    ld8 c[28] = [cura0], stride
    ld8 c[29] = [cura1], stride
    psad1 psad[3] = c[13], r0
    ;;
}
{.mii
    ld8 c[30] = [cura0]
    psad1 psad[6] = c[14], r0
    psad1 psad[7] = c[15], r0
}
{.mmi
    ld8 c[31] = [cura1]
    ;;
    add mean0 = mean0, psad[0]
    add mean1 = mean1, psad[1]
}
{.mii
    add mean2 = mean2, psad[4]
    add mean3 = mean3, psad[5]
    ;;
    psad1 psad[0] = c[16], r0
}
{.mmi
    add mean0 = mean0, psad[2]
    add mean1 = mean1, psad[3]
    psad1 psad[1] = c[17], r0
    ;;
}
{.mii
    add mean2 = mean2, psad[6]
    psad1 psad[2] = c[18], r0
    psad1 psad[3] = c[19], r0
}
{.mmi
    add mean3 = mean3, psad[7]
    ;;
    add mean0 = mean0, psad[0]
    psad1 psad[4] = c[20], r0
}
{.mii
    add mean1 = mean1, psad[1]
    psad1 psad[5] = c[21], r0
    ;;
    psad1 psad[6] = c[22], r0
}
{.mmi
    add mean2 = mean2, psad[2]
    add mean3 = mean3, psad[3]
    psad1 psad[7] = c[23], r0
    ;;
}
{.mii
    add mean0 = mean0, psad[4]
    psad1 psad[0] = c[24], r0
    psad1 psad[1] = c[25], r0
}
{.mmi
    add mean1 = mean1, psad[5]
    ;;
    add mean2 = mean2, psad[6]
    psad1 psad[2] = c[26], r0
}
{.mii
    add mean3 = mean3, psad[7]
    psad1 psad[3] = c[27], r0
    ;;
    psad1 psad[4] = c[28], r0
}
{.mmi
    add mean0 = mean0, psad[0]
    add mean1 = mean1, psad[1]
    psad1 psad[5] = c[29], r0
    ;;
}
{.mii
    add mean2 = mean2, psad[2]
    psad1 psad[6] = c[30], r0
    psad1 psad[7] = c[31], r0
}
{.mmi
    add mean3 = mean3, psad[3]
    ;;
    add mean0 = mean0, psad[4]
    add mean1 = mean1, psad[5]
}
{.mbb
    add mean2 = mean2, mean3
    nop.b 1
    nop.b 1
    ;;
}
{.mib
    add mean0 = mean0, psad[6]
    add mean1 = mean1, psad[7]
    nop.b 1
    ;;
}
{.mib
    add mean0 = mean0, mean1
    // add mean2 = 127, mean2    // this could make our division more exact, but does not help much
    ;;
}
{.mib
    add mean0 = mean0, mean2
    ;;
}
{.mib
    shr.u mean0 = mean0, 8    // divide them ...
    ;;
}
{.mib
    mux1 mean0 = mean0, @brcst
    ;;
}
{.mii
    nop.m 0
    psad1 dev0 = c[0], mean0    // and do a sad again ...
    psad1 dev1 = c[1], mean0
}
{.mii
    nop.m 0
    psad1 dev2 = c[2], mean0
    psad1 dev3 = c[3], mean0
}
{.mii
    nop.m 0
    psad1 psad[0] = c[4], mean0
    psad1 psad[1] = c[5], mean0
}
{.mii
    nop.m 0
    psad1 psad[2] = c[6], mean0
    psad1 psad[3] = c[7], mean0
}
{.mii
    nop.m 0
    psad1 psad[4] = c[8], mean0
    psad1 psad[5] = c[9], mean0
    ;;
}
{.mii
    add dev0 = dev0, psad[0]
    psad1 psad[6] = c[10], mean0
    psad1 psad[7] = c[11], mean0
}
{.mmi
    add dev1 = dev1, psad[1]
    add dev2 = dev2, psad[2]
    psad1 psad[0] = c[12], mean0
}
{.mii
    add dev3 = dev3, psad[3]
    psad1 psad[1] = c[13], mean0
    ;;
    psad1 psad[2] = c[14], mean0
}
{.mmi
    add dev0 = dev0, psad[4]
    add dev1 = dev1, psad[5]
    psad1 psad[3] = c[15], mean0
}
{.mii
    add dev2 = dev2, psad[6]
    psad1 psad[4] = c[16], mean0
    psad1 psad[5] = c[17], mean0
}
{.mmi
    add dev3 = dev3, psad[7]
    ;;
    add dev0 = dev0, psad[0]
    psad1 psad[6] = c[18], mean0
}
{.mii
    add dev1 = dev1, psad[1]
    psad1 psad[7] = c[19], mean0
    psad1 psad[0] = c[20], mean0
}
{.mmi
    add dev2 = dev2, psad[2]
    add dev3 = dev3, psad[3]
    psad1 psad[1] = c[21], mean0
    ;;
}
{.mii
    add dev0 = dev0, psad[4]
    psad1 psad[2] = c[22], mean0
    psad1 psad[3] = c[23], mean0
}
{.mmi
    add dev1 = dev1, psad[5]
    add dev2 = dev2, psad[6]
    psad1 psad[4] = c[24], mean0
}
{.mii
    add dev3 = dev3, psad[7]
    psad1 psad[5] = c[25], mean0
    ;;
    psad1 psad[6] = c[26], mean0
}
{.mmi
    add dev0 = dev0, psad[0]
    add dev1 = dev1, psad[1]
    psad1 psad[7] = c[27], mean0
}
{.mii
    add dev2 = dev2, psad[2]
    psad1 psad[0] = c[28], mean0
    psad1 psad[1] = c[29], mean0
}
{.mmi
    add dev3 = dev3, psad[3]
    ;;
    add dev0 = dev0, psad[4]
    psad1 psad[2] = c[30], mean0
}
{.mii
    add dev1 = dev1, psad[5]
    psad1 psad[3] = c[31], mean0
    ;;
    add dev2 = dev2, psad[6]
}
{.mmi
    add dev3 = dev3, psad[7]
    add dev0 = dev0, psad[0]
    add dev1 = dev1, psad[1]
    ;;
}
{.mii
    add dev2 = dev2, psad[2]
    add dev3 = dev3, psad[3]
    add ret0 = dev0, dev1
    ;;
}
{.mib
    add dev2 = dev2, dev3
    nop.i 1
    nop.b 1
    ;;
}
{.mib
    add ret0 = ret0, dev2
    nop.i 1
    br.ret.sptk.many b0
}
    .endp dev16_ia64#

// ###########################################################
// ###########################################################
// New version from group 01 #################################
// ###########################################################
// ###########################################################

    .text
    .align 16
    .global sad16_ia64#
    .proc sad16_ia64#
sad16_ia64:
    alloc r1 = ar.pfs, 4, 76, 0, 0
    mov r2 = pr
    dep r14 = r0, r33, 0, 3      // r14 = (r33 div 8)*8 (aligned version of ref)
    dep.z r31 = r33, 0, 3        // r31 = r33 mod 8 (misalignment of ref)
    ;;
    mov r64 = r34                //(1)  calculate multiples of stride
    shl r65 = r34, 1             //(2)  for being able to load all the
    shladd r66 = r34, 1, r34     //(3)  data at once
    shl r67 = r34, 2             //(4)
    shladd r68 = r34, 2, r34     //(5)
    shl r71 = r34, 3             //(8)
    shladd r72 = r34, 3, r34     //(9)
    ;;
    shl r69 = r66, 1             //(6)
    shladd r70 = r66, 1, r34     //(7)
    shl r73 = r68, 1             //(10)
    shladd r74 = r68, 1, r34     //(11)
    shl r75 = r66, 2             //(12)
    shladd r76 = r66, 2, r34     //(13)
    shladd r77 = r66, 2, r65     //(14)
    shladd r78 = r66, 2, r66     //(15)
    ;;
    cmp.eq p16, p17 = 0, r31     // prepare predicates according to the misalignment
    cmp.eq p18, p19 = 2, r31     // of ref
    cmp.eq p20, p21 = 4, r31
    cmp.eq p22, p23 = 6, r31
    cmp.eq p24, p25 = 1, r31
    cmp.eq p26, p27 = 3, r31
    cmp.eq p28, p29 = 5, r31
    mov r96 = r14                // and calculate all the addresses where we have
    mov r33 = r32                // to load from
    add r97 = r14, r64
    add r35 = r32, r64
    add r98 = r14, r65
    add r37 = r32, r65
    add r99 = r14, r66
    add r39 = r32, r66
    add r100 = r14, r67
    add r41 = r32, r67
    add r101 = r14, r68
    add r43 = r32, r68
    add r102 = r14, r69
    add r45 = r32, r69
    add r103 = r14, r70
    add r47 = r32, r70
    add r104 = r14, r71
    add r49 = r32, r71
    add r105 = r14, r72
    add r51 = r32, r72
    add r106 = r14, r73
    add r53 = r32, r73
    add r107 = r14, r74
    add r55 = r32, r74
    add r108 = r14, r75
    add r57 = r32, r75
    add r109 = r14, r76
    add r59 = r32, r76
    add r110 = r14, r77
    add r61 = r32, r77
    add r111 = r14, r78
    add r63 = r32, r78
    ;;
    ld8 r32 = [r33], 8           // Load all the data which is needed for the sad
    ld8 r34 = [r35], 8           // in the registers. the goal is to have the array
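
// ------------------------------------------------------------------------------
// *
// * A minimal plain-C sketch of what sad16bi_ia64 and dev16_ia64 compute,
// * reconstructed from the assembler above. The names sad16bi_ref/dev16_ref
// * and the argument order (cur, ref1, ref2, stride) are illustrative
// * assumptions only; in the assembler, psad1 replaces the innermost
// * abs-difference loop eight bytes at a time.
// *
// *   #include <stdint.h>
// *   #include <stdlib.h>
// *
// *   /* SAD of cur against the rounded average of two reference blocks */
// *   static uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
// *                               const uint8_t *ref2, uint32_t stride)
// *   {
// *       uint32_t sad = 0;
// *       for (int j = 0; j < 16; j++) {
// *           for (int i = 0; i < 16; i++) {
// *               int avg = (ref1[i] + ref2[i] + 1) >> 1;   /* rounded mean */
// *               if (avg > 255) avg = 255;                 /* clamp to 0..255 */
// *               else if (avg < 0) avg = 0;
// *               sad += abs((int)cur[i] - avg);
// *           }
// *           cur  += stride;
// *           ref1 += stride;
// *           ref2 += stride;
// *       }
// *       return sad;
// *   }
// *
// *   /* sum of absolute deviations of a 16x16 block from its mean */
// *   static uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
// *   {
// *       uint32_t sum = 0, dev = 0;
// *       for (int j = 0; j < 16; j++)
// *           for (int i = 0; i < 16; i++)
// *               sum += cur[j * stride + i];
// *       uint32_t mean = sum >> 8;                         /* divide by 256 pixels */
// *       for (int j = 0; j < 16; j++)
// *           for (int i = 0; i < 16; i++)
// *               dev += abs((int)cur[j * stride + i] - (int)mean);
// *       return dev;
// *   }
// *
// ------------------------------------------------------------------------------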
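// ------------------------------------------------------------------------------
// *
// * The (truncated) sad16_ia64 above resolves alignment by splitting the ref
// * pointer into an aligned base, (r33 div 8)*8, and a misalignment, r33 mod 8,
// * and by predicating on that misalignment. A minimal C sketch of the general
// * technique (assuming a little-endian layout and that the second aligned load
// * stays within readable memory; load8_unaligned is a hypothetical helper, not
// * part of this file):
// *
// *   #include <stdint.h>
// *
// *   static uint64_t load8_unaligned(const uint8_t *p)
// *   {
// *       const uint64_t *base = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
// *       unsigned shift = (unsigned)((uintptr_t)p & 7) * 8;   /* misalignment in bits */
// *       if (shift == 0)
// *           return base[0];                    /* already aligned: one ld8 suffices */
// *       return (base[0] >> shift) | (base[1] << (64 - shift)); /* merge two aligned ld8 */
// *   }
// *
// ------------------------------------------------------------------------------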