// File: getdiff.asm
// (web code-viewer residue: "Font size:" control label; not part of the program)
// Entry points exported from this file.
.global _xhGetDiff8x8_8u16s_C1;
.global _GetDiff8x8_8u16s_C1;
// Values compared against the mcType argument of _xhGetDiff8x8_8u16s_C1.
// FF, FH and HF are tested explicitly there; any other value takes the HH
// path, so IPPVC_MC_APX_HH itself is not referenced by the visible code.
#define IPPVC_MC_APX_FF 0
#define IPPVC_MC_APX_FH 4
#define IPPVC_MC_APX_HF 8
#define IPPVC_MC_APX_HH 12
.section L1_data_b;
// 8x8 predictor scratch block (64 bytes). _xhGetDiff8x8_8u16s_C1 fills and
// re-reads it exclusively with 32-bit accesses through I3, so the buffer is
// explicitly aligned to a 4-byte boundary instead of relying on where the
// linker happens to place a .byte array.
.align 4;
.byte _interpolate_8x8[64];
.section L1_code;
/***************************************************************************
_xhGetDiff8x8_8u16s_C1 -- builds an 8x8 predictor from a reference block and
writes the 16-bit differences against the current block. The argument list
follows the prototype:
ippiGetDiff8x8_8u16s_C1 (const Ipp8u* pSrcCur, Ipp32s srcCurStep,
const Ipp8u* pSrcRef, Ipp32s srcRefStep, Ipp16s* pDstDiff, Ipp32s
dstDiffStep, Ipp16s* pDstPredictor, Ipp32s dstPredictorStep, Ipp32s mcType,
Ipp32s roundControl);
Arguments as read by the code:
R0 = pSrcCur, R1 = srcCurStep, R2 = pSrcRef,
[sp+12+64] = srcRefStep, [sp+16+64] = destination of the differences
(fifth argument, presumably pDstDiff), [sp+32+64] = mcType.
dstDiffStep, pDstPredictor, dstPredictorStep and roundControl are never read:
the differences are written contiguously (16 bytes per row) and the predictor
is always written to the file-local buffer _interpolate_8x8.
Returns: R0 = address of _interpolate_8x8.
NOTE(review): the current block is loaded without DISALGNEXCPT, so pSrcCur
presumably has to be 4-byte aligned; the row steps presumably have to be
multiples of 4 so that the low address bits stay the same from row to row --
confirm with callers.
Registers changed and not restored: R0-R3, P0, P2, M0, M1, M3, B3, LC0 (and
the other loop registers loaded by LSETUP).
performance:
ASM C
cycle count : 169 4752 :FF
203 6977 :FH
233 6982 :HF
286 8969 :HH
*****************************************************************************/
_xhGetDiff8x8_8u16s_C1:
// The roundControl argument is not taken into account.
// Prologue: 7 registers (R7:4, P5:3) plus 9 single pushes = 64 bytes, which
// is the "+64" in the stack-argument offsets used below.
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
// Clear L0-L3 (linear, non-circular addressing on I0-I3) and point I3 at the
// predictor buffer.
l3=0;
i3.l=_interpolate_8x8;
i3.h=_interpolate_8x8;
l2=0;
l0=0;
i0=r2;// I0 = pSrcRef
l1=0;
i1=r2;// I1 = pSrcRef as well (every mcType path below reloads I1)
m3=r0;// M3 = pSrcCur (not read again; the HH path overwrites M3)
b3=r0;// B3 = pSrcCur, read back after the interpolation
p5=r1;// P5 = srcCurStep
r7=[sp+12+64];// R7 = srcRefStep
r6=[sp+32+64];// R6 = mcType
m1=r7;// M1 = srcRefStep
p0=8;// loop count: 8 rows
/***********************IPPVC_MC_APX_FF**********************/
// FF: I0 and I1 walk the same reference addresses, so both BYTEOP1P operands
// hold the same bytes; the predictor is the reference block itself, read from
// a possibly unaligned address.
cc=r6==IPPVC_MC_APX_FF;
if !cc jump FH_8x8;
r7+=-8;
m0=r7;// M0 = srcRefStep - 8: from the end of one 8-byte row to the next row
i1=i0;
lsetup (ff_8x8_start,ff_8x8_end)lc0=p0;
// Executed once before the loop: first word of row 0.
disalgnexcpt||r0=[i0++]||r2=[i1++];
ff_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];// second word of the row
// Predictor bytes 0..3 of the row; also fetch the third word and step the
// pointers to the next row.
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
// Predictor bytes 4..7 (reversed register pairs); store bytes 0..3 and start
// loading the next row.
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
ff_8x8_end:
disalgnexcpt||[i3++]=r7||r2=[i1++];// store bytes 4..7
jump interpolate_8x8_end;
/**********************IPPVC_MC_APX_FH***********************/
// FH: I1 starts one row (srcRefStep bytes) below I0, so BYTEOP1P combines
// each reference row with the row beneath it.
FH_8x8:
r5=IPPVC_MC_APX_FH;
cc=r6==r5;
if !cc jump HF_8x8;
m1=r7;// M1 = srcRefStep
r7+=-8;
m0=r7;// M0 = srcRefStep - 8
lsetup (fh_8x8_start,fh_8x8_end)lc0=p0;
i1=i0;
i1+=m1;// I1 = pSrcRef + srcRefStep
disalgnexcpt||r0=[i0++]||r2=[i1++];
fh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
fh_8x8_end:
disalgnexcpt||[i3++]=r7||r2=[i1++];
jump interpolate_8x8_end;
/************************IPPVC_MC_APX_HF************************/
// HF: combines each reference byte with its right-hand neighbour. I1 is set
// to pSrcRef + 1 and is never used as a load pointer in this path; the second
// operand (R3:2) is loaded through I2.
HF_8x8:
r5=IPPVC_MC_APX_HF;
cc=r6==r5;
if !cc jump HH_8x8;
m1=1;// one-byte offset for the second operand
r7+=-8;
m0=r7;// M0 = srcRefStep - 8
i1=i0;
i1+=m1;// I1 = pSrcRef + 1
// Special case: the low two bits of I0 are both 1. Then the low two bits of
// I0 + 1 are 0, so the register selected for the second operand is R2
// rather than R3; that case is handled separately at byte_align_HF.
r6=3;
r7=i0;
r7=r7&r6;
cc=r7==r6;
if cc jump byte_align_HF;
lsetup(hf_8x8_start,hf_8x8_end)lc0=p0;
i2=i0;// general case: second operand is loaded from the same words as the first
disalgnexcpt||r0=[i0++]||r2=[i2++];
hf_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
hf_8x8_end:
disalgnexcpt||[i3++]=r7||r2=[i2++];
jump interpolate_8x8_end;
byte_align_HF: // special case: (I0 & 3) == 3
lsetup(bytealign_start,bytealign_end) lc0=p0;
i2=i0;
i2+=4;// second operand starts one word further on
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
bytealign_end:
disalgnexcpt||[i3++]=r7||r2=[i2++];
jump interpolate_8x8_end;
/************************IPPVC_MC_APX_HH************************/
// HH: reached for every mcType value that is not FF, FH or HF.
HH_8x8:
/*
Disabled alternative HH implementation (commented out, kept for reference).
m1=1;
m2=r7;//src step
r7+=-8;
m0=r7;
i1=i0;
r6=3; // This part handles the case where the low two bits of I0
r7=i0; // are both 1: after I0+1 the low two bits are 0, so the
r7=r7&r6; // register selected is no longer R3 but R2, hence it is
cc=r7==r6; // handled separately.
if cc jump byte_align_HH;
// p1=2;
lsetup(hh_8x8_start,hh_8x8_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
hh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i1+=m1;
i0+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i1-=m1;
i0-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
hh_8x8_end:
disalgnexcpt||[i3++]=r7||r2=[i2++];
jump interpolate_8x8_end;
byte_align_HH:// special case
lsetup(bytealign_hh_start,bytealign_hh_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_hh_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
bytealign_hh_end:
disalgnexcpt||[i3++]=r7||r2=[i2++];
//jump interpolate_8x8_end;
*/
// Active HH implementation. On entry R2 = pSrcRef and R7 = srcRefStep.
// R0/R1 are loaded through I1 (the row below), R2/R3 through I2 (the row
// above). I0 is not used as a load pointer here: it is moved by M3 before
// each RNDH operation and moved back afterwards. -3 changes the low two bits
// of I0 in the same way as +1 would.
// NOTE(review): presumably I0 only supplies the byte alignment for BYTEOP2P --
// confirm against the instruction reference.
M0 = 7;
M3 = -3 (X);
I2 = R2; // I2 = address of the top-left pel of the reference block
R2 = R2 + R7 (S); // R2 = pSrcRef + srcRefStep (next row)
I1 = R2; // I1 = next row: load pointer for R0/R1
I0 = R2; // I0 = next row
R7 += -5; // (alternative value left by the author: -9)
M1 = R7; // M1 = srcRefStep - 5: brings I1/I2 to the start of the next row
P2 = 8; // 8 rows (alternative value left by the author: 9)
LSETUP(AVGHV_RND_ST, AVGHV_RND_END) LC0 = P2;
// The next three instructions run once, before the first loop iteration.
DISALGNEXCPT || R0 = [I1++] || R2 = [I2++];
DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || R0 = [I1++M0];
AVGHV_RND_ST:
DISALGNEXCPT || I0 += M3 || R2 = [I2++M0];
R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++M3] || R2 = [I2++M3];
// Merge the RNDL and RNDH results into one word; move I0 back.
R7 = R6 + R7 (NS) || I0 -= M3;
// Second half of the row (reversed register pairs); store the first four
// predictor bytes; I1 moves on to the next row.
R7 = BYTEOP2P(R1:0,R3:2) (RNDL, R) || [I3++] = R7 || R1 = [I1++M1];
DISALGNEXCPT || I0 += M3 || R3 = [I2++M1];
R6 = BYTEOP2P(R1:0,R3:2) (RNDH, R) || R0 = [I1++] || R2 = [I2++];
R7 = R6 + R7 (NS) || I0 -= M3;
DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
AVGHV_RND_END:
// Store predictor bytes 4..7 and start the RNDL step of the next row.
R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M0];
// Common tail: differences between the current block and the predictor
// buffer, written as 16-bit values through P4.
interpolate_8x8_end:
i3.l=_interpolate_8x8;
i3.h=_interpolate_8x8;
r6=i3;// R6 = predictor buffer address, returned in R0 below
i0=b3;// I0 = pSrcCur
r7=p5;// R7 = srcCurStep
i1=i0;// I1 = pSrcCur too, although the R3:2 operand is loaded through I3
p0=8;
p4=[sp+16+64];// P4 = fifth argument: where the differences are written
r7+=-4;
m0=r7;// M0 = srcCurStep - 4: from the second word of a row to the next row
lsetup(getdiff_start,getdiff_end)lc0=p0;
// Executed once before the loop: first word of current row 0 and of the predictor.
r0=[i0++]||r2=[i3++];
getdiff_start:
// Differences for bytes 0..3; load the second word of both rows.
(r4,r5)=byteop16m(r1:0,r3:2)||r1=[i0++m0]||r3=[i3++];
[p4++]=r5||r0=[i0++];// R5 is stored first; R0 = first word of the next row
// Differences for bytes 4..7 (reversed pairs); the parallel store writes the
// R4 produced by the previous BYTEOP16M. On the last iteration the R2 load
// reads one word past the 64-byte predictor buffer; that value is never used.
(r4,r5)=byteop16m(r1:0,r3:2)(r)||r2=[i3++]||[p4++]=r4;
[p4++]=r5;
getdiff_end:
[p4++]=r4;
r0=r6;// return value: address of _interpolate_8x8
// Epilogue: restore in the reverse order of the prologue pushes.
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
_xhGetDiff8x8_8u16s_C1.end:
rts;
/**********************************************************************************
GetDiff8x8_8u16s_C1(Ipp8u *pCur,int step,Ipp8u *_match_16x16,Ipp16s* pDstDiff)
Writes the 16-bit differences between an 8x8 block at pCur and an 8x8 block
at _match_16x16 to pDstDiff.
Arguments as read by the code:
R0 = pCur, R1 = step, R2 = _match_16x16, [sp+12+64] = pDstDiff.
pCur and the match block must have the same step: both pointers are advanced
with the same modifier (M0 = step - 4).
The output is written contiguously, 16 bytes (eight 16-bit values) per row.
NOTE(review): the loads are not paired with DISALGNEXCPT, so both source
pointers presumably have to be 4-byte aligned -- confirm with callers.
Registers changed and not restored: R0-R3, P0, M0, LC0 (and the other loop
registers loaded by LSETUP).
**********************************************************************************/
_GetDiff8x8_8u16s_C1:
// Prologue: 7 registers (R7:4, P5:3) plus 9 single pushes = 64 bytes, which
// is the "+64" in the stack-argument offset used below.
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
// Force linear (non-circular) addressing, as _xhGetDiff8x8_8u16s_C1 does.
// L0-L3 were saved above and are restored in the epilogue; without clearing
// them a caller's non-zero length register would make the post-modified
// I0/I2/I3 accesses below wrap around.
l0=0;
l1=0;
l2=0;
l3=0;
i0=r0;// I0 = pCur (src1)
I1=I0;// I1 = pCur as well; the second operand is loaded through I2
i2=r2;// I2 = match block (src2)
r6=r1;// step
r6+=-4;
m0=r6;// M0 = step - 4: from the second word of a row to the next row
r7=[sp+12+64];
i3=r7;// I3 = pDstDiff (dst)
p0=8;// 8 rows
// First word of row 0 from both blocks.
r0=[i0++]||r2=[i2++];
lsetup(getdiff_st,getdiff_ed)LC0=p0;
getdiff_st:
// Differences for bytes 0..3; load the second word of both rows and step
// both pointers to the next row.
(r4,r5)=byteop16m(r1:0,r3:2)||r1=[i0++m0]||r3=[i2++m0];
[i3++]=r5||r0=[i0++];// R5 is stored first; R0 = first word of the next row
// Differences for bytes 4..7 (reversed pairs); the parallel store writes the
// R4 produced by the previous BYTEOP16M.
(r4,r5)=byteop16m(r1:0,r3:2)(r)||r2=[i2++]||[i3++]=r4;
[i3++]=r5;
getdiff_ed:
[i3++]=r4;
// Epilogue: restore in the reverse order of the prologue pushes.
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
_GetDiff8x8_8u16s_C1.end:
rts;
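/* Web code-viewer residue (not part of the program) -- keyboard shortcuts:
   Copy code:          Ctrl + C
   Search code:        Ctrl + F
   Full-screen mode:   F11
   Toggle theme:       Ctrl + Shift + D
   Show shortcuts:     ?
   Increase font size: Ctrl + =
   Decrease font size: Ctrl + -
*/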