📄 sad_ia64.s
字号:
}
{.mii
nop.m 0
psad1 psad[4] = c[8], mean0
psad1 psad[5] = c[9], mean0
;;
}
{.mii
add dev0 = dev0, psad[0]
psad1 psad[6] = c[10], mean0
psad1 psad[7] = c[11], mean0
}
{.mmi
add dev1 = dev1, psad[1]
add dev2 = dev2, psad[2]
psad1 psad[0] = c[12], mean0
}
{.mii
add dev3 = dev3, psad[3]
psad1 psad[1] = c[13], mean0
;;
psad1 psad[2] = c[14], mean0
}
{.mmi
add dev0 = dev0, psad[4]
add dev1 = dev1, psad[5]
psad1 psad[3] = c[15], mean0
}
{.mii
add dev2 = dev2, psad[6]
psad1 psad[4] = c[16], mean0
psad1 psad[5] = c[17], mean0
}
{.mmi
add dev3 = dev3, psad[7]
;;
add dev0 = dev0, psad[0]
psad1 psad[6] = c[18], mean0
}
{.mii
add dev1 = dev1, psad[1]
psad1 psad[7] = c[19], mean0
psad1 psad[0] = c[20], mean0
}
{.mmi
add dev2 = dev2, psad[2]
add dev3 = dev3, psad[3]
psad1 psad[1] = c[21], mean0
;;
}
{.mii
add dev0 = dev0, psad[4]
psad1 psad[2] = c[22], mean0
psad1 psad[3] = c[23], mean0
}
{.mmi
add dev1 = dev1, psad[5]
add dev2 = dev2, psad[6]
psad1 psad[4] = c[24], mean0
}
{.mii
add dev3 = dev3, psad[7]
psad1 psad[5] = c[25], mean0
;;
psad1 psad[6] = c[26], mean0
}
{.mmi
add dev0 = dev0, psad[0]
add dev1 = dev1, psad[1]
psad1 psad[7] = c[27], mean0
}
{.mii
add dev2 = dev2, psad[2]
psad1 psad[0] = c[28], mean0
psad1 psad[1] = c[29], mean0
}
{.mmi
add dev3 = dev3, psad[3]
;;
add dev0 = dev0, psad[4]
psad1 psad[2] = c[30], mean0
}
{.mii
add dev1 = dev1, psad[5]
psad1 psad[3] = c[31], mean0
;;
add dev2 = dev2, psad[6]
}
{.mmi
add dev3 = dev3, psad[7]
add dev0 = dev0, psad[0]
add dev1 = dev1, psad[1]
;;
}
{.mii
add dev2 = dev2, psad[2]
add dev3 = dev3, psad[3]
add ret0 = dev0, dev1
;;
}
{.mib
add dev2 = dev2, dev3
nop.i 1
nop.b 1
;;
}
{.mib
add ret0 = ret0, dev2
nop.i 1
br.ret.sptk.many b0
}
.endp dev16_ia64#
// ###########################################################
// ###########################################################
// New version by group 01 ###################################
// ###########################################################
// ###########################################################
.text
.align 16
.global sad16_ia64#
.proc sad16_ia64#
sad16_ia64:
alloc r1 = ar.pfs, 4, 76, 0, 0
mov r2 = pr
dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref)
dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref)
;;
mov r64 = r34 //(1) calculate multiples of stride
shl r65 = r34, 1 //(2) for being able to load all the
shladd r66 = r34, 1, r34 //(3) data at once
shl r67 = r34, 2 //(4)
shladd r68 = r34, 2, r34 //(5)
shl r71 = r34, 3 //(8)
shladd r72 = r34, 3, r34 //(9)
;;
shl r69 = r66, 1 //(6)
shladd r70 = r66, 1, r34 //(7)
shl r73 = r68, 1 //(10)
shladd r74 = r68, 1, r34 //(11)
shl r75 = r66, 2 //(12)
shladd r76 = r66, 2, r34 //(13)
shladd r77 = r66, 2, r65 //(14)
shladd r78 = r66, 2, r66 //(15)
;;
cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment
cmp.eq p18, p19 = 2, r31 // ref
cmp.eq p20, p21 = 4, r31
cmp.eq p22, p23 = 6, r31
cmp.eq p24, p25 = 1, r31
cmp.eq p26, p27 = 3, r31
cmp.eq p28, p29 = 5, r31
mov r96 = r14 // and calculate all the adresses where we have
mov r33 = r32 // to load from
add r97 = r14, r64
add r35 = r32, r64
add r98 = r14, r65
add r37 = r32, r65
add r99 = r14, r66
add r39 = r32, r66
add r100 = r14, r67
add r41 = r32, r67
add r101 = r14, r68
add r43 = r32, r68
add r102 = r14, r69
add r45 = r32, r69
add r103 = r14, r70
add r47 = r32, r70
add r104 = r14, r71
add r49 = r32, r71
add r105 = r14, r72
add r51 = r32, r72
add r106 = r14, r73
add r53 = r32, r73
add r107 = r14, r74
add r55 = r32, r74
add r108 = r14, r75
add r57 = r32, r75
add r109 = r14, r76
add r59 = r32, r76
add r110 = r14, r77
add r61 = r32, r77
add r111 = r14, r78
add r63 = r32, r78
;;
ld8 r32 = [r33], 8 // Load all the data which is needed for the sad
ld8 r34 = [r35], 8 // in the registers. the goal is to have the array
ld8 r36 = [r37], 8 // adressed by cur in the registers r32 - r63 and
ld8 r38 = [r39], 8 // the aray adressed by ref in the registers
ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed
ld8 r42 = [r43], 8 // to load the aligned 24 bits in which the
ld8 r44 = [r45], 8 // needed misaligned 16 bits must be.
ld8 r46 = [r47], 8 // After loading we start a preprocessing which
ld8 r48 = [r49], 8 // guarantees that the data adressed by ref is in
ld8 r50 = [r51], 8 // the registers r64 - r95.
ld8 r52 = [r53], 8
ld8 r54 = [r55], 8
ld8 r56 = [r57], 8
ld8 r58 = [r59], 8
ld8 r60 = [r61], 8
ld8 r62 = [r63], 8
ld8 r64 = [r96], 8
ld8 r66 = [r97], 8
ld8 r68 = [r98], 8
ld8 r70 = [r99], 8
ld8 r72 = [r100], 8
ld8 r74 = [r101], 8
ld8 r76 = [r102], 8
ld8 r78 = [r103], 8
ld8 r80 = [r104], 8
ld8 r82 = [r105], 8
ld8 r84 = [r106], 8
ld8 r86 = [r107], 8
ld8 r88 = [r108], 8
ld8 r90 = [r109], 8
ld8 r92 = [r110], 8
ld8 r94 = [r111], 8
;;
ld8 r33 = [r33]
ld8 r35 = [r35]
ld8 r37 = [r37]
ld8 r39 = [r39]
ld8 r41 = [r41]
ld8 r43 = [r43]
ld8 r45 = [r45]
ld8 r47 = [r47]
ld8 r49 = [r49]
ld8 r51 = [r51]
ld8 r53 = [r53]
ld8 r55 = [r55]
ld8 r57 = [r57]
ld8 r59 = [r59]
ld8 r61 = [r61]
ld8 r63 = [r63]
ld8 r65 = [r96], 8
ld8 r67 = [r97], 8
ld8 r69 = [r98], 8
ld8 r71 = [r99], 8
ld8 r73 = [r100], 8
ld8 r75 = [r101], 8
ld8 r77 = [r102], 8
ld8 r79 = [r103], 8
ld8 r81 = [r104], 8
ld8 r83 = [r105], 8
ld8 r85 = [r106], 8
ld8 r87 = [r107], 8
ld8 r89 = [r108], 8
ld8 r91 = [r109], 8
ld8 r93 = [r110], 8
ld8 r95 = [r111], 8
(p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation
;;
ld8 r96 = [r96] // If not, we have to load a bit more
ld8 r97 = [r97]
ld8 r98 = [r98]
ld8 r99 = [r99]
ld8 r100 = [r100]
ld8 r101 = [r101]
ld8 r102 = [r102]
ld8 r103 = [r103]
ld8 r104 = [r104]
ld8 r105 = [r105]
ld8 r106 = [r106]
ld8 r107 = [r107]
ld8 r108 = [r108]
ld8 r109 = [r109]
ld8 r110 = [r110]
ld8 r111 = [r111]
(p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have
(p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines
(p26) br.cond.dpnt.many .Lmod3
(p20) br.cond.dpnt.many .Lmod4
(p28) br.cond.dpnt.many .Lmod5
(p22) br.cond.dpnt.many .Lmod6
;;
.Lmod7: // this jump point is not needed
shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing
shrp r65 = r96, r65, 56
shrp r66 = r67, r66, 56
shrp r67 = r97, r67, 56
shrp r68 = r69, r68, 56
shrp r69 = r98, r69, 56
shrp r70 = r71, r70, 56
shrp r71 = r99, r71, 56
shrp r72 = r73, r72, 56
shrp r73 = r100, r73, 56
shrp r74 = r75, r74, 56
shrp r75 = r101, r75, 56
shrp r76 = r77, r76, 56
shrp r77 = r102, r77, 56
shrp r78 = r79, r78, 56
shrp r79 = r103, r79, 56
shrp r80 = r81, r80, 56
shrp r81 = r104, r81, 56
shrp r82 = r83, r82, 56
shrp r83 = r105, r83, 56
shrp r84 = r85, r84, 56
shrp r85 = r106, r85, 56
shrp r86 = r87, r86, 56
shrp r87 = r107, r87, 56
shrp r88 = r89, r88, 56
shrp r89 = r108, r89, 56
shrp r90 = r91, r90, 56
shrp r91 = r109, r91, 56
shrp r92 = r93, r92, 56
shrp r93 = r110, r93, 56
shrp r94 = r95, r94, 56
shrp r95 = r111, r95, 56
br.cond.sptk.many .Lber // and then we jump to the calculation
;;
.Lmod6:
shrp r64 = r65, r64, 48
shrp r65 = r96, r65, 48
shrp r66 = r67, r66, 48
shrp r67 = r97, r67, 48
shrp r68 = r69, r68, 48
shrp r69 = r98, r69, 48
shrp r70 = r71, r70, 48
shrp r71 = r99, r71, 48
shrp r72 = r73, r72, 48
shrp r73 = r100, r73, 48
shrp r74 = r75, r74, 48
shrp r75 = r101, r75, 48
shrp r76 = r77, r76, 48
shrp r77 = r102, r77, 48
shrp r78 = r79, r78, 48
shrp r79 = r103, r79, 48
shrp r80 = r81, r80, 48
shrp r81 = r104, r81, 48
shrp r82 = r83, r82, 48
shrp r83 = r105, r83, 48
shrp r84 = r85, r84, 48
shrp r85 = r106, r85, 48
shrp r86 = r87, r86, 48
shrp r87 = r107, r87, 48
shrp r88 = r89, r88, 48
shrp r89 = r108, r89, 48
shrp r90 = r91, r90, 48
shrp r91 = r109, r91, 48
shrp r92 = r93, r92, 48
shrp r93 = r110, r93, 48
shrp r94 = r95, r94, 48
shrp r95 = r111, r95, 48
br.cond.sptk.many .Lber
;;
.Lmod5:
shrp r64 = r65, r64, 40
shrp r65 = r96, r65, 40
shrp r66 = r67, r66, 40
shrp r67 = r97, r67, 40
shrp r68 = r69, r68, 40
shrp r69 = r98, r69, 40
shrp r70 = r71, r70, 40
shrp r71 = r99, r71, 40
shrp r72 = r73, r72, 40
shrp r73 = r100, r73, 40
shrp r74 = r75, r74, 40
shrp r75 = r101, r75, 40
shrp r76 = r77, r76, 40
shrp r77 = r102, r77, 40
shrp r78 = r79, r78, 40
shrp r79 = r103, r79, 40
shrp r80 = r81, r80, 40
shrp r81 = r104, r81, 40
shrp r82 = r83, r82, 40
shrp r83 = r105, r83, 40
shrp r84 = r85, r84, 40
shrp r85 = r106, r85, 40
shrp r86 = r87, r86, 40
shrp r87 = r107, r87, 40
shrp r88 = r89, r88, 40
shrp r89 = r108, r89, 40
shrp r90 = r91, r90, 40
shrp r91 = r109, r91, 40
shrp r92 = r93, r92, 40
shrp r93 = r110, r93, 40
shrp r94 = r95, r94, 40
shrp r95 = r111, r95, 40
br.cond.sptk.many .Lber
;;
.Lmod4:
shrp r64 = r65, r64, 32
shrp r65 = r96, r65, 32
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -