📄 reconblock.asm
字号:
.section L1_code;
//.global _xhReconBlockHalfpel_MPEG4_8u;
.global _xhReconBlockHalfpel_MPEG4_8u_MOD;
/*****************************************************************************************
ippiReconBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep,
Ipp16s pResidue[64], Ipp8u* pDst, int dstStep, const IppMotionVector* pMV,
int roundControl)
performance (cycles):
ASM: 728
C:   10750
*******************************************************************************************/
/*
_xhReconBlockHalfpel_MPEG4_8u:
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
b0=r1;//srcstep
//liu move here
r7=[sp+20+64];//PMV
i3=r7;
l2=0;
l1=0;
l0=0;
l3=0;
//liu r7=[sp+20+64];//PMV
// i3=r7;
r7.l=w[i3++];//pMV->dx
r6.l=w[i3++];//pMV->dy
r7=r7.l(x);
r6=r6.l(x);
r5=r7>>>1;//mv_off_x
r4=r6>>>1;//mv_off_y
r4*=r1;//mv_off_y*srcstep
r4=r4+r5;//mv_off_y*srcstep+mv_off_x
r0=r0+r4;//point to pstart
r1=[sp+64+12]; //pDst
i3=r1;
i0=r0;
i1=r0;//psrc
r4=b0;//srcstep
r5=[sp+16+64];//dststep
b1=r5;
p5=r2;// point to the Presidual
p0=8;
cc=bittst(r7,0);//mv_off_x
if cc jump HF_HH8;
cc=bittst(r6,0);
if cc jump FH8;
r5+=-4;
r4+=-8;
m0=r4;//src step modify
m1=r5;//dst step modify
i1=i0;
lsetup (ff_8x8_start,ff_8x8_end)lc0=p0;
disalgnexcpt||r0=[i0++]||r2=[i1++];
ff_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
ff_8x8_end:
disalgnexcpt||[i3++m1]=r7||r2=[i1++];
jump halfcopyblock_end;
FH8:
m1=r4;//src step
r4+=-8;
m0=r4;//src modify
r5+=-4;
m2=r5;//dst modify
lsetup (fh_8x8_start,fh_8x8_end)lc0=p0;
i1=i0;
i1+=m1;
disalgnexcpt||r0=[i0++]||r2=[i1++];
fh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
fh_8x8_end:
disalgnexcpt||[i3++m2]=r7||r2=[i1++];
jump halfcopyblock_end;
HF_HH8:
cc=bittst(r6,0);
if cc jump HH8;
m1=1;//to make the align8
r4+=-8;
m0=r4;
i1=i0;
i1+=m1;
r5+=-4;
m3=r5;
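r6=3; //Special-case I0 whose two low bits are both 1:
r7=i0; //after I0+1 the two low bits are all 0,
r7=r7&r6; //so the register selected is no longer
cc=r7==r6; //R3 but R2; hence it is handled separately.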
if cc jump byte_align_HF;
lsetup(hf_8x8_start,hf_8x8_end)lc0=p0;
i2=i0;
disalgnexcpt||r0=[i0++]||r2=[i2++];
hf_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
hf_8x8_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
jump halfcopyblock_end;
byte_align_HF: //special case
lsetup(bytealign_start,bytealign_end) lc0=p0;
i2=i0;
i2+=4;
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
bytealign_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
jump halfcopyblock_end;
HH8:
r5+=-4;
m3=r5;
m1=1;
m2=r4;//src step
r4+=-8;
m0=r4;
i1=i0;
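r6=3; //Special-case I0 whose two low bits are both 1:
r7=i0; //after I0+1 the two low bits are all 0,
r7=r7&r6; //so the register selected is no longer
cc=r7==r6; //R3 but R2; hence it is handled separately.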
if cc jump byte_align_HH;
// p1=2;
lsetup(hh_8x8_start,hh_8x8_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
hh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i1+=m1;
i0+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i1-=m1;
i0-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
hh_8x8_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
jump halfcopyblock_end;
byte_align_HH://special case
lsetup(bytealign_hh_start,bytealign_hh_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_hh_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
bytealign_hh_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
//jump halfcopyblock_end;
halfcopyblock_end:
r1=[sp+64+12]; //pDst
i0=r1;
//liu p3=i0;
// p4=i0;
i1=r1;
// r2=p5;//residual
i2=p5;
r7=b1;//the step of pDst
r7+=-9;
//liu move here
p3=i0;
p4=i0;
p2=r7;
p0=8;
p1=8;
lsetup(reconstruct_start,reconstruct_end)lc0=p0;
r0=b[p3++](z)||r2.l=w[i2++];
reconstruct_start:
lsetup(reconstruct1_start,reconstruct1_end)lc1=p1;
reconstruct1_start:
r2=r2.l(x);
r0=r0<<8;
r5=byteop3p(r3:2,r1:0)(lo);
r0=b[p3++](z)||r2.l=w[i2++];
reconstruct1_end:
b[p4++]=r5;
p3=p3+p2;
p4=p4+p2;
r6=p4;
r6+=1;
p4=r6;
reconstruct_end:
r0=b[p3++](z);
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
rts;
_xhReconBlockHalfpel_MPEG4_8u.end:
*/
#if 0
/*****************************************************************************
ippiReconBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep,
Ipp16s pResidue[64], Ipp8u* pDst, int dstStep, int roundControl)

First disabled variant of _xhReconBlockHalfpel_MPEG4_8u_MOD. It sits in the
"#if 0" branch and is therefore not assembled.

Per row it loads 8 source bytes and 8 16-bit residues, widens the bytes with
BYTEUNPACK, adds them with the saturating vector add "+|+ (S)", repacks the
sums with BYTEPACK and stores two 32-bit words to pDst. The loop runs 8 times.
roundControl is not read anywhere in this routine.

Register use:
  I1 = pSrc read pointer,    M1 = srcStep - 4
  I2 = pResidue read pointer
  I0 = pDst write pointer,   M0 = dstStep - 4
  R4/R5 = residue pairs, R6/R7 = unpacked source bytes, R2/R3 = sums

NOTE(review): "+|+ (S)" saturates at 16 bits and BYTEPACK keeps only the low
byte of each half-word, so no clamp to the 0..255 range is visible here -
confirm whether that is why this variant was disabled.
NOTE(review): only L0 and L1 are cleared although I2 is also post-incremented;
presumably L2 is expected to be 0 on entry - confirm.
NOTE(review): the loads in the second half of the loop body fetch data for
the following row, so the eighth pass reads data for a ninth row that is
never stored.
*****************************************************************************/
_xhReconBlockHalfpel_MPEG4_8u_MOD:
// Prologue: 7 registers + 8 DAG registers + RETS = 64 bytes pushed.
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
L0=0;
L1=0;
I1=R0;//I1 -> pSrc
P5=R1;//P5 = srcStep (P5 is not read again in this routine)
R1+=-4;
M1=R1;//M1 = srcStep - 4: from the second word of a row to the next row
I2=R2;//I2 -> pResidue
R2=[SP+76];//pDst: 64 pushed bytes + 12
I0=R2;//I0 -> pDst
R3=[SP+80];//dstStep: 64 pushed bytes + 16
R3+=-4;
M0=R3;//M0 = dstStep - 4
P0=8;//loop count: 8 rows
LSETUP(reconstruct_mod_start,reconstruct_mod_end)LC0=P0;
// The three instructions below run once, before the loop body: they load the
// whole first source row (R1:R0) and the first two residue words (R4, R5).
R0=[I1++]||R4=[I2++];
R1=[I1++M1];
(R7,R6) = BYTEUNPACK R1:0||R5=[I2++];
reconstruct_mod_start:
// First half of the row: add residues to the unpacked bytes, pack, store.
R2=R4+|+R6(S)||R4=[I2++];
R3=R5+|+R7(S)||R5=[I2++];
R2 = BYTEPACK (R2,R3);
// (R) selects the reversed register order of the pair for the second half.
(R7,R6) = BYTEUNPACK R1:0(R)||[I0++]=R2;
// Second half of the row; the parallel loads fetch the next row's data.
R2=R4+|+R6(S)||R4=[I2++]||R0=[I1++];
R3=R5+|+R7(S)||R5=[I2++]||R1=[I1++M1];
R2 = BYTEPACK (R2,R3);
reconstruct_mod_end:
// Store the second word and step I0 to the next destination row.
(R7,R6) = BYTEUNPACK R1:0||[I0++M0]=R2;
// Epilogue: restore in reverse order of the pushes above.
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
_xhReconBlockHalfpel_MPEG4_8u_MOD.END:
RTS;
#else
#if 0
/*****************************************************************************
ippiReconBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep,
Ipp16s pResidue[64], Ipp8u* pDst, int dstStep, int roundControl)

Second disabled variant of _xhReconBlockHalfpel_MPEG4_8u_MOD. It sits in a
nested "#if 0" and is therefore not assembled.

It reads the residues through two pointers (I2, and I3 = I2 + 4, both stepping
by M2 = 8), regroups their half-words with PACK, and combines them with the
source words using BYTEOP3P in its LO and HI forms. The two partial results
are merged with an 8-bit shift each way and an OR before every 32-bit store
to pDst. roundControl is not read anywhere in this routine.

Register use:
  I1 = pSrc,                    M1 = srcStep - 4
  I2 / I3 = pResidue / pResidue + 4, M2 = 8
  I0 = pDst,                    M0 = dstStep - 4
  32 bytes are pushed, so pDst / dstStep are read from [SP+44] / [SP+48].

NOTE(review): this variant is compiled out and its operand ordering has not
been verified - check it against the active routine before re-enabling.
*****************************************************************************/
_xhReconBlockHalfpel_MPEG4_8u_MOD:
// Prologue: 7 registers + RETS = 32 bytes pushed.
[--sp]=(r7:4,p5:3);
[--sp]=rets;
// Clear all four length registers via R7.
R7=0;
L0=R7;
L1=R7;
L2=R7;
L3=R7;
I1=R0;//I1 -> pSrc
R1+=-4; //R1 = srcStep - 4
M1=R1;
I2=R2;//I2 -> pResidue
R2+=4;
I3=R2;//I3 -> pResidue + 4 bytes
M2=8;//both residue pointers advance 8 bytes per load
R2=[SP+44];//pDst: 32 pushed bytes + 12
I0=R2;//I0 -> pDst
R3=[SP+48];//dstStep: 32 pushed bytes + 16
R3+=-4;
M0=R3;//M0 = dstStep - 4
P0=8;//loop count: 8 rows
// Preload the first two residue words and the first source word.
R4=[I2++M2] || R5=[I3++M2];
R2=PACK(R5.L, R4.L) || R0=[I1++];
LSETUP(reconstruct_mod_start,reconstruct_mod_end)LC0=P0;
reconstruct_mod_start:
// First destination word: LO result shifted up 8, HI result shifted down 8,
// then ORed together.
R6=BYTEOP3P(R3:2, R1:0)(LO) || R1=[I1++M1];
R2=PACK(R5.H, R4.H) || R4=[I2++M2] || R5=[I3++M2];
R6<<=8;
R7=BYTEOP3P(R3:2, R1:0)(HI) || R0=[I1++];
R3=PACK(R5.L, R4.L);
R7>>=8;
R6=R6|R7;
// Second destination word, using the reversed (R) pair order; the first word
// is stored in parallel.
R6=BYTEOP3P(R3:2, R1:0)(LO,R) || [I0++]=R6;
R3=PACK(R5.H, R4.H) || R4=[I2++M2] || R5=[I3++M2];
R6<<=8;
R7=BYTEOP3P(R3:2, R1:0)(HI,R) || R0=[I1++];
R7>>=8;
R6=R6|R7;
reconstruct_mod_end:
// Store the second word and step I0 to the next destination row.
R2=PACK(R5.L, R4.L) || [I0++M0]=R6;
_xhReconBlockHalfpel_MPEG4_8u_MOD.END:
rets=[sp++];
(r7:4,p5:3)=[sp++];
RTS;
#endif
/*****************************************************************************
ippiReconBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep,
Ipp16s pResidue[64], Ipp8u* pDst, int dstStep, int roundControl)

Active implementation of _xhReconBlockHalfpel_MPEG4_8u_MOD.

For an 8x8 block, combines every 16-bit residue with the matching source byte
using BYTEOP3P (add, result limited to an unsigned byte) and stores the bytes:
    pDst[y*dstStep + x] = BYTEOP3P(pResidue[y*8 + x], pSrc[y*srcStep + x])
No half-pel interpolation is performed here and roundControl is never read.

Arguments as used by the code:
    R0      = pSrc
    R1      = srcStep
    R2      = pResidue
    [SP+12] = pDst     (offset at entry)
    [SP+16] = dstStep  (offset at entry)

Registers:
    saved/restored : R5, R6, R7
    overwritten    : R0-R3, P0, I0, I1, I2, M0, M1, LC0/LT0/LB0
    L0, L1, L2 are written with 0 and left at 0.

All memory accesses are 32-bit loads/stores through I0/I1/I2, and the byte
selection of BYTEUNPACK/BYTEOP3P follows the low bits of I0/I1.
NOTE(review): this assumes pSrc, pDst, pResidue, srcStep and dstStep are all
multiples of 4 - confirm against the callers.

Rows 0..6 run in a software-pipelined hardware loop in which each pass also
prefetches the first half of the next row. Row 7 is peeled after the loop so
that nothing is read past pResidue[63] or past the eighth source row.
*****************************************************************************/
_xhReconBlockHalfpel_MPEG4_8u_MOD:
    [--SP] = (R7:5);                    // leaf routine: only R5-R7 need saving
    R7 = 0;
    L0 = R7;                            // linear addressing for I0, I1, I2
    L1 = R7;
    L2 = R7;
    I1 = R0;                            // I1 -> pSrc
    R1 += -4;
    M1 = R1;                            // M1 = srcStep - 4 (2nd word of a row -> next row)
    I2 = R2;                            // I2 -> pResidue
    R2 = [SP+24];                       // pDst: entry offset 12 + 12 bytes pushed
    I0 = R2;                            // I0 -> pDst
    R3 = [SP+28];                       // dstStep: entry offset 16 + 12 bytes pushed
    R3 += -4;
    M0 = R3;                            // M0 = dstStep - 4
    P0 = 7;                             // rows 0..6 in the loop, row 7 peeled below

    // Prime the pipeline with the first half of row 0:
    // R3:R2 = residues 0..3, R1:R0 = source bytes 0..3 widened to 16 bits.
    R2 = [I2++] || R0 = [I1++];
    (R1, R0) = BYTEUNPACK R1:0 || R3 = [I2++];

    LSETUP(reconstruct_mod_start, reconstruct_mod_end) LC0 = P0;
reconstruct_mod_start:
    // First half of the row. The bytes are moved up by 8 bits because the
    // code relies on BYTEOP3P (LO) taking them from the upper byte of each half.
    R0 <<= 8;
    R1 <<= 8;
    R6 = BYTEOP3P(R3:2, R1:0)(LO) || R2 = [I2++];
    R7 = BYTEOP3P(R3:2, R1:0)(LO,R) || R3 = [I2++] || R0 = [I1++M1];
    // Second half of the row (source bytes 4..7, residues 4..7).
    (R1, R0) = BYTEUNPACK R1:0;
    R0 <<= 8;
    R1 <<= 8;
    R5 = BYTEPACK(R6, R7);              // four result bytes of the first half
    // Store the first word; the parallel loads prefetch the next row.
    R6 = BYTEOP3P(R3:2, R1:0)(LO) || [I0++] = R5 || R2 = [I2++];
    R7 = BYTEOP3P(R3:2, R1:0)(LO,R) || R3 = [I2++] || R0 = [I1++];
    R5 = BYTEPACK(R6, R7);              // four result bytes of the second half
reconstruct_mod_end:
    // Store the second word and step I0 to the next destination row.
    (R1, R0) = BYTEUNPACK R1:0 || [I0++M0] = R5;

    // Row 7, peeled: same arithmetic as the loop body, but without the
    // prefetch of a following row.
    R0 <<= 8;
    R1 <<= 8;
    R6 = BYTEOP3P(R3:2, R1:0)(LO) || R2 = [I2++];
    R7 = BYTEOP3P(R3:2, R1:0)(LO,R) || R3 = [I2++] || R0 = [I1++M1];
    (R1, R0) = BYTEUNPACK R1:0;
    R0 <<= 8;
    R1 <<= 8;
    R5 = BYTEPACK(R6, R7);
    R6 = BYTEOP3P(R3:2, R1:0)(LO) || [I0++] = R5;
    R7 = BYTEOP3P(R3:2, R1:0)(LO,R);
    R5 = BYTEPACK(R6, R7);
    [I0] = R5;                          // last word of the block

    (R7:5) = [SP++];
    RTS;
// End label placed after RTS so the symbol range covers the whole routine.
_xhReconBlockHalfpel_MPEG4_8u_MOD.END:
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -