⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sad_ia64.s

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 S
📖 第 1 页 / 共 3 页
字号:
//   ------------------------------------------------------------------------------
//   *
//   * Optimized Assembler Versions of sad8 and sad16
//   *
//   ------------------------------------------------------------------------------
//   *
//   * Hannes Jütting and Christopher Özbek
//   * {s_juetti,s_oezbek}@ira.uka.de
//   *
//   * Programmed for the IA64 laboratory held at University Karlsruhe 2002
//   * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
//   *
//   ------------------------------------------------------------------------------
//   *
//   * These are the optimized assembler versions of sad8 and sad16, which calculate 
//   * the sum of absolute differences between two 8x8/16x16 block matrices. 
//   *
//   * Our approach uses:
//   *   - The Itanium instruction psad1, which solves the problem in hardware. 
//   *   - Modulo-scheduled loops, as the best way to unroll loops on the IA64 
//   *     EPIC architecture
//   *   - Alignment resolving to avoid memory faults
//   *
//   ------------------------------------------------------------------------------

                
        
                
        // ------------------------------------------------------------------------------
        // sad16bi_ia64
        //
        // For 16 rows of 16 bytes each, accumulates the absolute difference between a
        // byte read through r32 and the rounded-up average, (a + b + 1) >> 1, of the
        // bytes read at the same offset through r33 and r34.
        //
        //   r32, r33, r34 : base addresses of the three byte buffers; each is advanced
        //                   by r35 after every row
        //   r35           : row increment in bytes; only its low 32 bits are used (zxt4)
        //   r8            : running sum, left in r8 when the procedure returns
        //
        // NOTE(review): presumably r32 = cur, r33 = ref1, r34 = ref2, r35 = stride, as in
        // a C prototype sad16bi(cur, ref1, ref2, stride) -- confirm against the caller.
        //
        // Registers written: r2, r8, r14-r23, r32-r35, p6, p7.
        // ar.lc is saved in r2 on entry and restored before the return.
        // ------------------------------------------------------------------------------
        // Declares an 8-byte, 8-byte-aligned common symbol named sad16bi; it is not
        // referenced anywhere in the code visible here.
        .common sad16bi#,8,8
        .align 16
        .global sad16bi_ia64#
        .proc sad16bi_ia64#
sad16bi_ia64:
        .prologue
        // Keep the caller's loop counter in r2; the inner loop below overwrites ar.lc.
        .save ar.lc, r2
        mov r2 = ar.lc
        .body
        zxt4 r35 = r35                  // row increment: zero-extend the low 32 bits
        mov r8 = r0                     // r8  = 0, running sum
        mov r23 = r0                    // r23 = 0, number of rows completed
        addl r22 = 255, r0              // r22 = 255, upper bound used by the clamp
        // ---- outer loop: one pass per row -------------------------------------------
.L21:
        addl r14 = 7, r0                // ar.lc = 7 -> br.cloop runs the body 8 times,
                                        // two bytes per pass = 16 bytes per row
        mov r19 = r32                   // r19 = cursor into the r32 buffer
        mov r21 = r34                   // r21 = cursor into the r34 buffer
        mov r20 = r33                   // r20 = cursor into the r33 buffer
        ;;
        mov ar.lc = r14
        ;;
        // ---- inner loop: two adjacent bytes per pass --------------------------------
.L105:
        mov r17 = r20
        mov r18 = r21
        ;;
        // First byte of the pair: load one byte from each averaged buffer and
        // post-increment r17/r18 so they already point at the second byte.
        ld1 r14 = [r17], 1
        ld1 r15 = [r18], 1
        ;;
        add r14 = r14, r15
        ;;
        adds r14 = 1, r14               // + 1 so the shift below rounds halves up
        ;;
        shr.u r16 = r14, 1              // r16 = (a + b + 1) >> 1
        ;;
        // Clamp r16 to 0..255. Both inputs are zero-extended bytes (ld1), so r16 is
        // already inside that range and neither clamp changes its value.
        cmp4.le p6, p7 = r0, r16        // p7 <- r16 < 0 (signed 32-bit compare)
        ;;
        (p7) mov r16 = r0               // negative -> 0 ...
        (p7) br.cond.dpnt .L96          // ... and skip the upper-bound check
        ;;
        cmp4.ge p6, p7 = r22, r16       // p7 <- r16 > 255
        ;;
        (p7) addl r16 = 255, r0         // too large -> 255
.L96:
        ld1 r14 = [r19]                 // first byte of the pair from the r32 buffer
        adds r20 = 2, r20               // step both averaged-buffer cursors past the
        adds r21 = 2, r21               // pair for the next pass
        ;;
        sub r15 = r14, r16              // r15 = byte - average
        ;;
        cmp4.ge p6, p7 = 0, r15         // p6 <- r15 <= 0, p7 <- r15 > 0 (signed 32-bit)
        ;;
        // r8 += |byte - average|: add r15 when it is positive, otherwise add the
        // reversed difference r16 - r14.
        (p6) sub r14 = r16, r14
        (p7) add r8 = r8, r15
        ;;
        (p6) add r8 = r8, r14
        // Second byte of the pair: r17/r18 were advanced by the post-increment loads.
        ld1 r15 = [r18]
        ld1 r14 = [r17]
        ;;
        add r14 = r14, r15
        adds r17 = 1, r19               // r17 is reused: address of the second byte
                                        // in the r32 buffer
        ;;
        adds r14 = 1, r14               // + 1 for round-half-up
        ;;
        shr.u r16 = r14, 1              // r16 = (a + b + 1) >> 1
        ;;
        // Same 0..255 clamp as for the first byte.
        cmp4.le p6, p7 = r0, r16        // p7 <- r16 < 0 (signed 32-bit compare)
        ;;
        (p7) mov r16 = r0
        (p7) br.cond.dpnt .L102
        ;;
        cmp4.ge p6, p7 = r22, r16       // p7 <- r16 > 255
        ;;
        (p7) addl r16 = 255, r0
.L102:
        ld1 r14 = [r17]                 // second byte of the pair from the r32 buffer
        adds r19 = 2, r19               // step the r32-buffer cursor past the pair
        ;;
        sub r15 = r14, r16              // r15 = byte - average
        ;;
        cmp4.ge p6, p7 = 0, r15         // p6 <- r15 <= 0, p7 <- r15 > 0 (signed 32-bit)
        ;;
        // r8 += |byte - average|, as for the first byte.
        (p7) add r8 = r8, r15
        (p6) sub r14 = r16, r14
        ;;
        (p6) add r8 = r8, r14
        // Counted-loop branch: taken while ar.lc != 0, decrementing ar.lc each time.
        br.cloop.sptk.few .L105
        // ---- row finished: count it and move all three bases to the next row --------
        adds r23 = 1, r23
        add r32 = r32, r35
        add r33 = r33, r35
        add r34 = r34, r35
        ;;
        cmp4.geu p6, p7 = 15, r23       // p6 <- r23 <= 15 (unsigned): rows remain
        (p6) br.cond.dptk .L21          // r23 counts 1..16 here -> 16 rows in total
        mov ar.lc = r2                  // restore the caller's loop counter
        br.ret.sptk.many b0             // return; the sum is in r8
        .endp sad16bi_ia64#



        
        
        
        
        
.text
        .align 16
        .global dev16_ia64#
        .proc dev16_ia64#
.auto
dev16_ia64:
        // renamings for better readability
        stride = r18
        pfs = r19                       //for saving previous function state
        cura0 = r20                     //address of first 8-byte block of cur
        cura1 = r21                     //address of second 8-byte block of cur
        mean0 = r22                     //registers for calculating the sum in parallel
        mean1 = r23
        mean2 = r24
        mean3 = r25
        dev0 = r26                      //same for the deviation
        dev1 = r27                      
        dev2 = r28
        dev3 = r29
        
        .body
        alloc pfs = ar.pfs, 2, 38, 0, 40

        mov cura0  = in0
        mov stride = in1
        add cura1 = 8, cura0
        
        .rotr c[32], psad[8]            // just using rotating registers to get an array ;-)

.explicit
{.mmi
        ld8 c[0] = [cura0], stride      // load them ...
        ld8 c[1] = [cura1], stride
        ;; 
}       
{.mmi   
        ld8 c[2] = [cura0], stride
        ld8 c[3] = [cura1], stride
        ;; 
}
{.mmi   
        ld8 c[4] = [cura0], stride
        ld8 c[5] = [cura1], stride
        ;;
}
{.mmi
        ld8 c[6] = [cura0], stride
        ld8 c[7] = [cura1], stride
        ;;
}
{.mmi   
        ld8 c[8] = [cura0], stride
        ld8 c[9] = [cura1], stride
        ;;
}
{.mmi
        ld8 c[10] = [cura0], stride
        ld8 c[11] = [cura1], stride
        ;;
}
{.mii
        ld8 c[12] = [cura0], stride
        psad1 mean0 = c[0], r0          // get the sum of them ...
        psad1 mean1 = c[1], r0
}
{.mmi
        ld8 c[13] = [cura1], stride
        ;; 
        ld8 c[14] = [cura0], stride
        psad1 mean2 = c[2], r0
}
{.mii
        ld8 c[15] = [cura1], stride
        psad1 mean3 = c[3], r0
        ;; 
        psad1 psad[0] = c[4], r0
}
{.mmi
        ld8 c[16] = [cura0], stride
        ld8 c[17] = [cura1], stride
        psad1 psad[1] = c[5], r0
        ;;
}
{.mii   
        ld8 c[18] = [cura0], stride
        psad1 psad[2] = c[6], r0
        psad1 psad[3] = c[7], r0
}
{.mmi   
        ld8 c[19] = [cura1], stride
        ;; 
        ld8 c[20] = [cura0], stride
        psad1 psad[4] = c[8], r0
}
{.mii
        ld8 c[21] = [cura1], stride
        psad1 psad[5] = c[9], r0
        ;;
        add mean0 = mean0, psad[0]
}
{.mmi
        ld8 c[22] = [cura0], stride
        ld8 c[23] = [cura1], stride
        add mean1 = mean1, psad[1]
        ;; 
}
{.mii
        ld8 c[24] = [cura0], stride
        psad1 psad[0] = c[10], r0
        psad1 psad[1] = c[11], r0
}
{.mmi
        ld8 c[25] = [cura1], stride
        ;; 
        ld8 c[26] = [cura0], stride
        add mean2 = mean2, psad[2]
}
{.mii
        ld8 c[27] = [cura1], stride
        add mean3 = mean3, psad[3]
        ;; 
        psad1 psad[2] = c[12], r0
}
{.mmi
        ld8 c[28] = [cura0], stride
        ld8 c[29] = [cura1], stride
        psad1 psad[3] = c[13], r0
        ;; 
}
{.mii
        ld8 c[30] = [cura0]
        psad1 psad[6] = c[14], r0
        psad1 psad[7] = c[15], r0
}
{.mmi
        ld8 c[31] = [cura1]
        ;; 
        add mean0 = mean0, psad[0]
        add mean1 = mean1, psad[1]
}
{.mii
        add mean2 = mean2, psad[4]
        add mean3 = mean3, psad[5]
        ;;
        psad1 psad[0] = c[16], r0
}
{.mmi
        add mean0 = mean0, psad[2]
        add mean1 = mean1, psad[3]
        psad1 psad[1] = c[17], r0
        ;;
}
{.mii
        add mean2 = mean2, psad[6]
        psad1 psad[2] = c[18], r0
        psad1 psad[3] = c[19], r0
}
{.mmi
        add mean3 = mean3, psad[7]
        ;; 
        add mean0 = mean0, psad[0]
        psad1 psad[4] = c[20], r0
}
{.mii
        add mean1 = mean1, psad[1]
        psad1 psad[5] = c[21], r0
        ;;
        psad1 psad[6] = c[22], r0
}
{.mmi
        add mean2 = mean2, psad[2]
        add mean3 = mean3, psad[3]
        psad1 psad[7] = c[23], r0
        ;;
}
{.mii
        add mean0 = mean0, psad[4]
        psad1 psad[0] = c[24], r0
        psad1 psad[1] = c[25], r0
}
{.mmi
        add mean1 = mean1, psad[5]
        ;;
        add mean2 = mean2, psad[6]
        psad1 psad[2] = c[26], r0
}
{.mii
        add mean3 = mean3, psad[7]
        psad1 psad[3] = c[27], r0
        ;; 
        psad1 psad[4] = c[28], r0
}
{.mmi
        add mean0 = mean0, psad[0]
        add mean1 = mean1, psad[1]
        psad1 psad[5] = c[29], r0
        ;;
}
{.mii
        add mean2 = mean2, psad[2]
        psad1 psad[6] = c[30], r0
        psad1 psad[7] = c[31], r0
}
{.mmi
        add mean3 = mean3, psad[3]
        ;;
        add mean0 = mean0, psad[4]
        add mean1 = mean1, psad[5]
}
{.mbb
        add mean2 = mean2, mean3
        nop.b 1
        nop.b 1
        ;;
}
{.mib
        add mean0 = mean0, psad[6]
        add mean1 = mean1, psad[7]
        nop.b 1
        ;;
}
{.mib
        add mean0 = mean0, mean1
        // add mean2 = 127, mean2       // this could make our division more exactly, but does not help much
        ;;
}
{.mib
        add mean0 = mean0, mean2
        ;;
}

{.mib
        shr.u mean0 = mean0, 8          // divide them ...
        ;;
}
{.mib
        mux1 mean0 = mean0, @brcst
        ;; 
}       
{.mii
        nop.m 0
        psad1 dev0 = c[0], mean0        // and do a sad again ...
        psad1 dev1 = c[1], mean0
}
{.mii
        nop.m 0
        psad1 dev2 = c[2], mean0
        psad1 dev3 = c[3], mean0
}
{.mii
        nop.m 0
        psad1 psad[0] = c[4], mean0
        psad1 psad[1] = c[5], mean0
}
{.mii
        nop.m 0
        psad1 psad[2] = c[6], mean0
        psad1 psad[3] = c[7], mean0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -