📄 sad_ia64.s
字号:
shrp r66 = r67, r66, 32
shrp r67 = r97, r67, 32
shrp r68 = r69, r68, 32
shrp r69 = r98, r69, 32
shrp r70 = r71, r70, 32
shrp r71 = r99, r71, 32
shrp r72 = r73, r72, 32
shrp r73 = r100, r73, 32
shrp r74 = r75, r74, 32
shrp r75 = r101, r75, 32
shrp r76 = r77, r76, 32
shrp r77 = r102, r77, 32
shrp r78 = r79, r78, 32
shrp r79 = r103, r79, 32
shrp r80 = r81, r80, 32
shrp r81 = r104, r81, 32
shrp r82 = r83, r82, 32
shrp r83 = r105, r83, 32
shrp r84 = r85, r84, 32
shrp r85 = r106, r85, 32
shrp r86 = r87, r86, 32
shrp r87 = r107, r87, 32
shrp r88 = r89, r88, 32
shrp r89 = r108, r89, 32
shrp r90 = r91, r90, 32
shrp r91 = r109, r91, 32
shrp r92 = r93, r92, 32
shrp r93 = r110, r93, 32
shrp r94 = r95, r94, 32
shrp r95 = r111, r95, 32
br.cond.sptk.many .Lber
;;
.Lmod3:
shrp r64 = r65, r64, 24
shrp r65 = r96, r65, 24
shrp r66 = r67, r66, 24
shrp r67 = r97, r67, 24
shrp r68 = r69, r68, 24
shrp r69 = r98, r69, 24
shrp r70 = r71, r70, 24
shrp r71 = r99, r71, 24
shrp r72 = r73, r72, 24
shrp r73 = r100, r73, 24
shrp r74 = r75, r74, 24
shrp r75 = r101, r75, 24
shrp r76 = r77, r76, 24
shrp r77 = r102, r77, 24
shrp r78 = r79, r78, 24
shrp r79 = r103, r79, 24
shrp r80 = r81, r80, 24
shrp r81 = r104, r81, 24
shrp r82 = r83, r82, 24
shrp r83 = r105, r83, 24
shrp r84 = r85, r84, 24
shrp r85 = r106, r85, 24
shrp r86 = r87, r86, 24
shrp r87 = r107, r87, 24
shrp r88 = r89, r88, 24
shrp r89 = r108, r89, 24
shrp r90 = r91, r90, 24
shrp r91 = r109, r91, 24
shrp r92 = r93, r92, 24
shrp r93 = r110, r93, 24
shrp r94 = r95, r94, 24
shrp r95 = r111, r95, 24
br.cond.sptk.many .Lber
;;
.Lmod2:
shrp r64 = r65, r64, 16
shrp r65 = r96, r65, 16
shrp r66 = r67, r66, 16
shrp r67 = r97, r67, 16
shrp r68 = r69, r68, 16
shrp r69 = r98, r69, 16
shrp r70 = r71, r70, 16
shrp r71 = r99, r71, 16
shrp r72 = r73, r72, 16
shrp r73 = r100, r73, 16
shrp r74 = r75, r74, 16
shrp r75 = r101, r75, 16
shrp r76 = r77, r76, 16
shrp r77 = r102, r77, 16
shrp r78 = r79, r78, 16
shrp r79 = r103, r79, 16
shrp r80 = r81, r80, 16
shrp r81 = r104, r81, 16
shrp r82 = r83, r82, 16
shrp r83 = r105, r83, 16
shrp r84 = r85, r84, 16
shrp r85 = r106, r85, 16
shrp r86 = r87, r86, 16
shrp r87 = r107, r87, 16
shrp r88 = r89, r88, 16
shrp r89 = r108, r89, 16
shrp r90 = r91, r90, 16
shrp r91 = r109, r91, 16
shrp r92 = r93, r92, 16
shrp r93 = r110, r93, 16
shrp r94 = r95, r94, 16
shrp r95 = r111, r95, 16
br.cond.sptk.many .Lber
;;
.Lmod1:
shrp r64 = r65, r64, 8
shrp r65 = r96, r65, 8
shrp r66 = r67, r66, 8
shrp r67 = r97, r67, 8
shrp r68 = r69, r68, 8
shrp r69 = r98, r69, 8
shrp r70 = r71, r70, 8
shrp r71 = r99, r71, 8
shrp r72 = r73, r72, 8
shrp r73 = r100, r73, 8
shrp r74 = r75, r74, 8
shrp r75 = r101, r75, 8
shrp r76 = r77, r76, 8
shrp r77 = r102, r77, 8
shrp r78 = r79, r78, 8
shrp r79 = r103, r79, 8
shrp r80 = r81, r80, 8
shrp r81 = r104, r81, 8
shrp r82 = r83, r82, 8
shrp r83 = r105, r83, 8
shrp r84 = r85, r84, 8
shrp r85 = r106, r85, 8
shrp r86 = r87, r86, 8
shrp r87 = r107, r87, 8
shrp r88 = r89, r88, 8
shrp r89 = r108, r89, 8
shrp r90 = r91, r90, 8
shrp r91 = r109, r91, 8
shrp r92 = r93, r92, 8
shrp r93 = r110, r93, 8
shrp r94 = r95, r94, 8
shrp r95 = r111, r95, 8
.Lber:
;;
psad1 r32 = r32, r64 // Here we do the calculation.
psad1 r33 = r33, r65 // The machine is providing a fast method
psad1 r34 = r34, r66 // for calculating sad, so we use it
psad1 r35 = r35, r67
psad1 r36 = r36, r68
psad1 r37 = r37, r69
psad1 r38 = r38, r70
psad1 r39 = r39, r71
psad1 r40 = r40, r72
psad1 r41 = r41, r73
psad1 r42 = r42, r74
psad1 r43 = r43, r75
psad1 r44 = r44, r76
psad1 r45 = r45, r77
psad1 r46 = r46, r78
psad1 r47 = r47, r79
psad1 r48 = r48, r80
psad1 r49 = r49, r81
psad1 r50 = r50, r82
psad1 r51 = r51, r83
psad1 r52 = r52, r84
psad1 r53 = r53, r85
psad1 r54 = r54, r86
psad1 r55 = r55, r87
psad1 r56 = r56, r88
psad1 r57 = r57, r89
psad1 r58 = r58, r90
psad1 r59 = r59, r91
psad1 r60 = r60, r92
psad1 r61 = r61, r93
psad1 r62 = r62, r94
psad1 r63 = r63, r95
;;
add r32 = r32, r63 // at last, we have to sum up
add r33 = r33, r62 // in 5 stages
add r34 = r34, r61
add r35 = r35, r60
add r36 = r36, r59
add r37 = r37, r58
add r38 = r38, r57
add r39 = r39, r56
add r40 = r40, r55
add r41 = r41, r54
add r42 = r42, r53
add r43 = r43, r52
add r44 = r44, r51
add r45 = r45, r50
add r46 = r46, r49
add r47 = r47, r48
;;
add r32 = r32, r47
add r33 = r33, r46
add r34 = r34, r45
add r35 = r35, r44
add r36 = r36, r43
add r37 = r37, r42
add r38 = r38, r41
add r39 = r39, r40
;;
add r32 = r32, r39
add r33 = r33, r38
add r34 = r34, r37
add r35 = r35, r36
;;
add r32 = r32, r35
add r33 = r33, r34
;;
add r8 = r32, r33 // and store the result in r8
mov pr = r2, -1
mov ar.pfs = r1
br.ret.sptk.many b0
.endp sad16_ia64#
// ---------------------------------------------------------------------------
// sad8_ia64 -- sum of absolute differences (SAD) over an 8x8 block of bytes.
//
// In:   r32 = cur     address of the current block. Every row is fetched with
//                     a plain ld8 and no fix-up, so cur is expected to be
//                     8-byte aligned.
//       r33 = ref     address of the reference block; any byte alignment.
//       r34 = stride  byte distance between consecutive rows, applied to both
//                     cur and ref. The misalignment of ref is evaluated once
//                     and reused for all rows, so stride is expected to be a
//                     multiple of 8.
//                     NOTE(review): stride is consumed as a full 64-bit value;
//                     confirm this matches the width in the C prototype.
// Out:  r8  = sum of |cur[y][x] - ref[y][x]| for x, y in 0..7
//
// Register use:
//       r2        saved predicate registers (p16-p29 are used below)
//       r3        saved ar.pfs. A scratch register is used on purpose: r1 is
//                 the global pointer (gp) and must still be valid on return.
//       r14       ref rounded down to an 8-byte boundary
//       r31       ref & 7 (misalignment in bytes)
//       r32-r39   cur rows 0..7 (first row addresses, then data, then SADs)
//       r40-r47   stride multiples 1..7, later the ref rows 0..7
//       r48-r55   ref row addresses, later the second 8 bytes of each ref row
// ---------------------------------------------------------------------------
        .align 16
        .global sad8_ia64#
        .proc sad8_ia64#
sad8_ia64:
        alloc r3 = ar.pfs, 3, 21, 0, 0          // 3 inputs + 21 locals = r32-r55
        mov r2 = pr                             // predicates are restored before returning
        dep r14 = r0, r33, 0, 3                 // r14 = ref with bits 0-2 cleared (aligned ref)
        dep.z r31 = r33, 0, 3                   // r31 = low 3 bits of ref (misalignment)
        ;;
        mov r40 = r34                           // r40 = 1 * stride
        shl r41 = r34, 1                        // r41 = 2 * stride
        shladd r42 = r34, 1, r34                // r42 = 3 * stride
        shl r43 = r34, 2                        // r43 = 4 * stride
        shladd r44 = r34, 2, r34                // r44 = 5 * stride
        ;;
        cmp.eq p16, p17 = 0, r31                // p16: ref is aligned
        cmp.eq p18, p19 = 2, r31                // p18: misaligned by 2
        shl r45 = r42, 1                        // r45 = 6 * stride
        cmp.eq p20, p21 = 4, r31                // p20: misaligned by 4
        cmp.eq p22, p23 = 6, r31                // p22: misaligned by 6
        shladd r46 = r42, 1, r34                // r46 = 7 * stride
        cmp.eq p24, p25 = 1, r31                // p24: misaligned by 1
        cmp.eq p26, p27 = 3, r31                // p26: misaligned by 3
        cmp.eq p28, p29 = 5, r31                // p28: misaligned by 5
        ;;
        // Row addresses: r32-r39 for cur, r48-r55 for the aligned ref.
        mov r48 = r14
        add r33 = r32, r40
        add r49 = r14, r40
        add r34 = r32, r41
        add r50 = r14, r41
        add r35 = r32, r42
        add r51 = r14, r42
        add r36 = r32, r43
        add r52 = r14, r43
        add r37 = r32, r44
        add r53 = r14, r44
        add r38 = r32, r45
        add r54 = r14, r45
        add r39 = r32, r46
        add r55 = r14, r46
        ;;
        // Load all eight rows of both blocks.
        ld8 r32 = [r32]                         // cur rows end up in r32-r39
        ld8 r33 = [r33]
        ld8 r34 = [r34]
        ld8 r35 = [r35]
        ld8 r36 = [r36]
        ld8 r37 = [r37]
        ld8 r38 = [r38]
        ld8 r39 = [r39]
        ld8 r40 = [r48], 8                      // ref rows end up in r40-r47; the
        ld8 r41 = [r49], 8                      // post-increment leaves r48-r55
        ld8 r42 = [r50], 8                      // pointing at the following 8 bytes
        ld8 r43 = [r51], 8
        ld8 r44 = [r52], 8
        ld8 r45 = [r53], 8
        ld8 r46 = [r54], 8
        ld8 r47 = [r55], 8
(p16)   br.cond.dptk.many .Lber2                // aligned ref: data is already complete
        ;;
        // Misaligned ref: each row straddles two aligned 8-byte words, so the
        // following word of every row is needed as well.
        ld8 r48 = [r48]
        ld8 r49 = [r49]
        ld8 r50 = [r50]
        ld8 r51 = [r51]
        ld8 r52 = [r52]
        ld8 r53 = [r53]
        ld8 r54 = [r54]
        ld8 r55 = [r55]
(p24)   br.cond.dptk.many .Lmode1               // dispatch on the misalignment;
(p18)   br.cond.dpnt.many .Lmode2               // a misalignment of 7 falls through
(p26)   br.cond.dpnt.many .Lmode3
(p20)   br.cond.dpnt.many .Lmode4
(p28)   br.cond.dpnt.many .Lmode5
(p22)   br.cond.dpnt.many .Lmode6
        ;;
        // Each .LmodeN block shifts the 128-bit pair (second word : first word)
        // right by 8*N bits, which leaves the 8 bytes of the unaligned ref row
        // in r40-r47.
.Lmode7:                                        // label is never branched to; kept for readability
        shrp r40 = r48, r40, 56
        shrp r41 = r49, r41, 56
        shrp r42 = r50, r42, 56
        shrp r43 = r51, r43, 56
        shrp r44 = r52, r44, 56
        shrp r45 = r53, r45, 56
        shrp r46 = r54, r46, 56
        shrp r47 = r55, r47, 56
        br.cond.sptk.many .Lber2
        ;;
.Lmode6:
        shrp r40 = r48, r40, 48
        shrp r41 = r49, r41, 48
        shrp r42 = r50, r42, 48
        shrp r43 = r51, r43, 48
        shrp r44 = r52, r44, 48
        shrp r45 = r53, r45, 48
        shrp r46 = r54, r46, 48
        shrp r47 = r55, r47, 48
        br.cond.sptk.many .Lber2
        ;;
.Lmode5:
        shrp r40 = r48, r40, 40
        shrp r41 = r49, r41, 40
        shrp r42 = r50, r42, 40
        shrp r43 = r51, r43, 40
        shrp r44 = r52, r44, 40
        shrp r45 = r53, r45, 40
        shrp r46 = r54, r46, 40
        shrp r47 = r55, r47, 40
        br.cond.sptk.many .Lber2
        ;;
.Lmode4:
        shrp r40 = r48, r40, 32
        shrp r41 = r49, r41, 32
        shrp r42 = r50, r42, 32
        shrp r43 = r51, r43, 32
        shrp r44 = r52, r44, 32
        shrp r45 = r53, r45, 32
        shrp r46 = r54, r46, 32
        shrp r47 = r55, r47, 32
        br.cond.sptk.many .Lber2
        ;;
.Lmode3:
        shrp r40 = r48, r40, 24
        shrp r41 = r49, r41, 24
        shrp r42 = r50, r42, 24
        shrp r43 = r51, r43, 24
        shrp r44 = r52, r44, 24
        shrp r45 = r53, r45, 24
        shrp r46 = r54, r46, 24
        shrp r47 = r55, r47, 24
        br.cond.sptk.many .Lber2
        ;;
.Lmode2:
        shrp r40 = r48, r40, 16
        shrp r41 = r49, r41, 16
        shrp r42 = r50, r42, 16
        shrp r43 = r51, r43, 16
        shrp r44 = r52, r44, 16
        shrp r45 = r53, r45, 16
        shrp r46 = r54, r46, 16
        shrp r47 = r55, r47, 16
        br.cond.sptk.many .Lber2
        ;;
.Lmode1:
        shrp r40 = r48, r40, 8
        shrp r41 = r49, r41, 8
        shrp r42 = r50, r42, 8
        shrp r43 = r51, r43, 8
        shrp r44 = r52, r44, 8
        shrp r45 = r53, r45, 8
        shrp r46 = r54, r46, 8
        shrp r47 = r55, r47, 8
.Lber2:
        ;;
        // One psad1 per row: SAD of the eight bytes of cur and ref.
        psad1 r32 = r32, r40
        psad1 r33 = r33, r41
        psad1 r34 = r34, r42
        psad1 r35 = r35, r43
        psad1 r36 = r36, r44
        psad1 r37 = r37, r45
        psad1 r38 = r38, r46
        psad1 r39 = r39, r47
        ;;
        // Reduce the eight row sums pairwise in three stages.
        add r32 = r32, r33
        add r33 = r34, r35
        add r34 = r36, r37
        add r35 = r38, r39
        ;;
        add r32 = r32, r33
        add r33 = r34, r35
        ;;
        add r8 = r32, r33                       // result is returned in r8
        mov pr = r2, -1                         // restore all predicate registers
        mov ar.pfs = r3                         // restore ar.pfs for br.ret
        br.ret.sptk.many b0
        .endp sad8_ia64#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -