📄 halfpel8_refine_ia64.s
字号:
// ------------------------------------------------------------------------------
// * Programmed by
// * Johannes Singler (email@jsingler.de), Daniel Winkler (infostudent@uni.de)
// *
// * Programmed for the IA64 laboratory held at University Karlsruhe 2002
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
// *
// ------------------------------------------------------------------------------
// *
// * This is the optimized assembler version of Halfpel8_Refine. This function
// * is worth optimizing for the IA-64 architecture because of the huge
// * register set: we can hold all necessary data in general-purpose registers
// * and reuse it.
// *
// * Our approach uses:
// * - The Itanium instruction psad1, which solves the problem in hardware.
// * - Alignment resolving to avoid memory faults
// * - Massive loop unrolling
// *
// ------------------------------------------------------------------------------
// *
// * ------- Half-pixel steps around the center (*) and corresponding
// * |0|1|0| register set parts.
// * -------
// * |2|*|2|
// * -------
// * |0|1|0|
// * -------
// *
// ------------------------------------------------------------------------------
// * calc_delta is split up into three parts which are included from
// *
// * calc_delta_1.s
// * calc_delta_2.s
// * calc_delta_3.s
// *
// ------------------------------------------------------------------------------
// * We assume min_dx <= currX <= max_dx && min_dy <= currY <= max_dy
// ------------------------------------------------------------------------------
// lambda_vec8: table of 32 four-byte entries (128 bytes total).
// Halfpel8_Refine_ia64 forms the table address gp-relative, indexes it with
// iQuant (4 bytes per entry), sign-extends the loaded word, doubles it and
// moves the result into fQuant.
// The table must stay in .sdata: it is addressed via @gprel(lambda_vec8#).
// ------------------------------------------------------------------------------
	.sdata
	.align 4
	.type lambda_vec8#, @object
	.size lambda_vec8#, 128
lambda_vec8:
	data4  0,  1,  1,  1,  1,  2,  2,  2	// entries  0.. 7
	data4  2,  3,  3,  3,  4,  4,  4,  5	// entries  8..15
	data4  5,  6,  7,  7,  8,  9, 10, 11	// entries 16..23
	data4 13, 14, 16, 18, 21, 25, 30, 36	// entries 24..31
// ------------------------------------------------------------------------------
// mvtab: table of 33 four-byte entries (132 bytes total).
// NOTE(review): not referenced in the visible part of this file; presumably
// consumed by the included calc_delta_*.s fragments -- confirm.
// ------------------------------------------------------------------------------
	.type mvtab#, @object
	.size mvtab#, 132
mvtab:
	data4  1,  2,  3,  4,  6,  7,  7,  7	// entries  0.. 7
	data4  9,  9,  9, 10, 10, 10, 10, 10	// entries  8..15
	data4 10, 10, 10, 10, 10, 10, 10, 10	// entries 16..23
	data4 10, 11, 11, 11, 11, 11, 11, 12	// entries 24..31
	data4 12				// entry   32
.text
.align 16
.global Halfpel8_Refine_ia64#
.proc Halfpel8_Refine_ia64#
Halfpel8_Refine_ia64:
pfs = r14
prsave = r15
// Save important registers
alloc pfs = ar.pfs, 18, 74, 4, 96
mov prsave = pr
// Naming registers for better readability
pRef = in0
pRefH = in1
pRefV = in2
pRefHV = in3
cura = in4
x = in5
y = in6
currMV = in7
iMinSAD = in8
dx = in9
dy = in10
min_dx = in11
max_dx = in12
min_dy = in13
max_dy = in14
iFcode = in15
iQuant = in16
iEdgedWidth = in17
iSAD = r17
backupX = r18
backupY = r19
currX = r20
currY = r21
currYAddress = r22
bitX0 = r23
bitY0 = r24
dxd2 = r25
dyd2 = r26
offset = r27
block = r28
nob02 = r29
nob1 = r30
nob64m02 = r31
nob64m1 = r127
const7 = r126
nob56m02 = r125
oldX = r124
oldY = r123
.rotr inregisters[18], refaa[3], refab[3], cur[8], ref0a[9], ref0b[9], ref1a[9], mpr[9], ref2a[8], ref2b[8], component[2], sc[2], tabaddress[2]
fx = f8
fy = f9
fblock = f10
fiEdgedWidth = f11
fdxd2 = f12
fdyd2 = f13
foffset = f14
fydiEdgedWidth = f15
fQuant = f32
fmv = f33
n = p16
h = p17
v = p18
hv = p19
l = p20
r = p21
t = p22
b = p23
lt = p24
lb = p25
rt = p26
rb = p27
fb = p28
non0_0 = p30
non0_1 = p31
non0_2 = p32
non0_3 = p33
neg_0 = p34
neg_1 = p35
neg_2 = p36
neg_3 = p37
cg32_0 = p29
cg32_1 = p38
// Initialize input variables
add sp = 16, sp
;;
ld4 iMinSAD = [sp], 8
;;
sxt4 iMinSAD = iMinSAD
ld4 dx = [sp], 8
;;
sxt4 dx = dx
ld4 dy = [sp], 8
;;
sxt4 dy = dy
ld4 min_dx = [sp], 8
;;
sxt4 min_dx = min_dx
ld4 max_dx = [sp], 8
;;
sxt4 max_dx = max_dx
ld4 min_dy = [sp], 8
;;
sxt4 min_dy = min_dy
ld4 max_dy = [sp], 8
;;
sxt4 max_dy = max_dy
ld4 iFcode = [sp], 8
;;
sxt4 iFcode = iFcode
ld4 iQuant = [sp], 8
add tabaddress[0] = @gprel(lambda_vec8#), gp
;;
shladd tabaddress[0] = iQuant, 2, tabaddress[0]
;;
ld4 iQuant = [tabaddress[0]]
;;
sxt4 iQuant = iQuant
;;
add iFcode = -1, iFcode //only used in decreased version
shl iQuant = iQuant, 1
;;
setf.sig fQuant = iQuant
ld4 iEdgedWidth = [sp]
add sp = -88, sp
// Initialize local variables
ld4 currX = [currMV]
add currYAddress = 4, currMV
;;
sxt4 currX = currX
ld4 currY = [currYAddress]
;;
sxt4 currY = currY
;;
// Calculate references
cmp.gt l, p0 = currX, min_dx
cmp.lt r, p0 = currX, max_dx
cmp.gt t, p0 = currY, min_dy
cmp.lt b, p0 = currY, max_dy
add backupX = -1, currX //move to left upper corner of quadrate
add backupY = -1, currY
;;
(b) cmp.gt.unc lb, p0 = currX, min_dx
(t) cmp.lt.unc rt, p0 = currX, max_dx
(l) cmp.gt.unc lt, p0 = currY, min_dy
(r) cmp.lt.unc rb, p0 = currY, max_dy
and bitX0 = 1, backupX
and bitY0 = 1, backupY
;;
cmp.eq n, p0 = 0, bitX0
cmp.eq h, p0 = 1, bitX0
cmp.eq v, p0 = 0, bitX0
cmp.eq hv, p0 = 1, bitX0
;;
cmp.eq.and n, p0 = 0, bitY0
cmp.eq.and h, p0 = 0, bitY0
cmp.eq.and v, p0 = 1, bitY0
cmp.eq.and hv, p0 = 1, bitY0
;;
.pred.rel "mutex", p16, p17, p18, p19 //n, h, v, hv
(n) mov refaa[0] = pRef
(h) mov refaa[0] = pRefH
(v) mov refaa[0] = pRefV
(hv) mov refaa[0] = pRefHV
(n) mov refaa[1] = pRefH
(h) mov refaa[1] = pRef
(v) mov refaa[1] = pRefHV
(hv) mov refaa[1] = pRefV
(n) mov refaa[2] = pRefV
(h) mov refaa[2] = pRefHV
(v) mov refaa[2] = pRef
(hv) mov refaa[2] = pRefH
// Calculate offset (integer multiplication on IA-64 sucks!)
mov block = 8
shr dxd2 = backupX, 1
shr dyd2 = backupY, 1
setf.sig fx = x
setf.sig fy = y
;;
setf.sig fblock = block
setf.sig fiEdgedWidth = iEdgedWidth
;;
setf.sig fdxd2 = dxd2
setf.sig fdyd2 = dyd2
;;
xma.l foffset = fx, fblock, fdxd2
xma.l fydiEdgedWidth = fy, fblock, fdyd2
;;
xma.l foffset = fydiEdgedWidth, fiEdgedWidth, foffset
;;
getf.sig offset = foffset
;;
add refaa[0] = refaa[0], offset
add refaa[1] = refaa[1], offset
add refaa[2] = refaa[2], offset
;;
(h) add refaa[1] = 1, refaa[1]
(hv) add refaa[1] = 1, refaa[1]
(v) add refaa[2] = iEdgedWidth, refaa[2]
(hv) add refaa[2] = iEdgedWidth, refaa[2]
// Load respecting misalignment of refx...
mov const7 = 7
;;
dep.z nob02 = refaa[0], 3, 3
dep.z nob1 = refaa[1], 3, 3
;;
andcm refaa[0] = refaa[0], const7 // set last 3 bits = 0
andcm refaa[1] = refaa[1], const7
andcm refaa[2] = refaa[2], const7
;;
add refab[0] = 8, refaa[0]
add refab[1] = 8, refaa[1]
add refab[2] = 8, refaa[2]
;;
ld8 cur[0] = [cura], iEdgedWidth
ld8 ref0a[0] = [refaa[0]], iEdgedWidth
sub nob64m02 = 64, nob02 // 64 - nob
ld8 ref0b[0] = [refab[0]], iEdgedWidth
ld8 ref1a[0] = [refaa[1]], iEdgedWidth
sub nob56m02 = 56, nob02 // 56 - nob
ld8 mpr[0] = [refab[1]], iEdgedWidth
ld8 ref2a[0] = [refaa[2]], iEdgedWidth
sub nob64m1 = 64, nob1
ld8 ref2b[0] = [refab[2]], iEdgedWidth
;;
ld8 cur[1] = [cura], iEdgedWidth
ld8 ref0a[1] = [refaa[0]], iEdgedWidth
ld8 ref0b[1] = [refab[0]], iEdgedWidth
ld8 ref1a[1] = [refaa[1]], iEdgedWidth
ld8 mpr[1] = [refab[1]], iEdgedWidth
ld8 ref2a[1] = [refaa[2]], iEdgedWidth
ld8 ref2b[1] = [refab[2]], iEdgedWidth
;;
ld8 cur[2] = [cura], iEdgedWidth
ld8 ref0a[2] = [refaa[0]], iEdgedWidth
ld8 ref0b[2] = [refab[0]], iEdgedWidth
ld8 ref1a[2] = [refaa[1]], iEdgedWidth
ld8 mpr[2] = [refab[1]], iEdgedWidth
ld8 ref2a[2] = [refaa[2]], iEdgedWidth
ld8 ref2b[2] = [refab[2]], iEdgedWidth
;;
ld8 cur[3] = [cura], iEdgedWidth
ld8 ref0a[3] = [refaa[0]], iEdgedWidth
ld8 ref0b[3] = [refab[0]], iEdgedWidth
ld8 ref1a[3] = [refaa[1]], iEdgedWidth
ld8 mpr[3] = [refab[1]], iEdgedWidth
ld8 ref2a[3] = [refaa[2]], iEdgedWidth
ld8 ref2b[3] = [refab[2]], iEdgedWidth
;;
ld8 cur[4] = [cura], iEdgedWidth
ld8 ref0a[4] = [refaa[0]], iEdgedWidth
ld8 ref0b[4] = [refab[0]], iEdgedWidth
ld8 ref1a[4] = [refaa[1]], iEdgedWidth
ld8 mpr[4] = [refab[1]], iEdgedWidth
ld8 ref2a[4] = [refaa[2]], iEdgedWidth
ld8 ref2b[4] = [refab[2]], iEdgedWidth
;;
ld8 cur[5] = [cura], iEdgedWidth
ld8 ref0a[5] = [refaa[0]], iEdgedWidth
ld8 ref0b[5] = [refab[0]], iEdgedWidth
ld8 ref1a[5] = [refaa[1]], iEdgedWidth
ld8 mpr[5] = [refab[1]], iEdgedWidth
ld8 ref2a[5] = [refaa[2]], iEdgedWidth
ld8 ref2b[5] = [refab[2]], iEdgedWidth
;;
ld8 cur[6] = [cura], iEdgedWidth
ld8 ref0a[6] = [refaa[0]], iEdgedWidth
ld8 ref0b[6] = [refab[0]], iEdgedWidth
ld8 ref1a[6] = [refaa[1]], iEdgedWidth
ld8 mpr[6] = [refab[1]], iEdgedWidth
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -