// File: halfcopy.asm
.section L1_code;
.global _xhCopyBlockHalfpel_MPEG4_8u;
.global _xhCopyMBHalfpel_MPEG4_8u;
/**********************************************************************
ippiCopyBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep, Ipp8u*
pDst, int dstStep, const IppMotionVector* pMV, int roundControl);
// Note: the roundControl argument is not used.
// Note: mind the byte alignment of the addresses held in I0 and I1,
//       especially the value of pstart (pSrc plus the motion-vector offset).
performance (cycles):
        ASM     C
cycle:  264     5820
************************************************************************/
_xhCopyBlockHalfpel_MPEG4_8u:
/*
 * Copies an 8x8 block of bytes from a motion-compensated source position
 * to pDst, with optional half-pel averaging selected by the motion vector.
 *
 * Arguments as read by this code:
 *   R0          pSrc
 *   R1          srcStep (bytes per source row)
 *   R2          pDst
 *   [SP+12]     dstStep (bytes per destination row)   -- offset at entry
 *   [SP+16]     pMV, pointer to two 16-bit fields dx, dy (half-pel units)
 *   roundControl is never read.
 *
 * Path selection (bit 0 of dx / dy):
 *   dx even, dy even -> ff loop  (both operands loaded from the same address)
 *   dx even, dy odd  -> FH8      (row averaged with the row below)
 *   dx odd,  dy even -> HF_HH8   (byte averaged with its right neighbour)
 *   dx odd,  dy odd  -> HH8      (BYTEOP2P over current row and row below)
 *
 * Every path runs 8 loop iterations and performs two 32-bit stores per
 * iteration, i.e. 8 bytes per destination row.
 *
 * Saved/restored here: R7:4, P5:3, I0-I3, L0-L3, RETS.
 * Written but not saved here: R0-R3, P0, P2, B0, M0-M3 and the loop-0
 * registers (NOTE(review): presumably scratch under the platform ABI --
 * confirm).
 * NOTE(review): destination stores are plain 32-bit stores through I3, so
 * pDst and dstStep presumably need to be 4-byte aligned -- confirm.
 */
[--sp]=(r7:4,p5:3);     // 7 registers pushed
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;            // 9 more words: 16 words = 64 bytes pushed in total
b0=r1;//keep srcstep in B0 (R1 is reused as a data register in the loops)
l2=0;                   // zero all length registers before using I0-I3
l1=0;                   // (NOTE(review): L=0 should select linear, non-circular
l0=0;                   //  addressing -- confirm against the ISA reference)
l3=0;
r7=[sp+16+64];//pMV: entry offset 16 plus the 64 bytes pushed above
i3=r7;
r7.l=w[i3++];//pMV->dx (16-bit load)
r6.l=w[i3++];//pMV->dy (16-bit load)
r7=r7.l(x);             // sign-extend dx to 32 bits
r6=r6.l(x);             // sign-extend dy to 32 bits
r5=r7>>>1;//mv_off_x = dx >> 1 (arithmetic shift: whole-pixel part)
r4=r6>>>1;//mv_off_y = dy >> 1 (arithmetic shift: whole-pixel part)
r4=r4.l*r1.l(is);//mv_off_y*srcstep (signed multiply of the low 16-bit halves only)
r4=r4+r5;//mv_off_y*srcstep+mv_off_x
r0=r0+r4;//R0 = pstart = pSrc + offset
i3=r2; //I3 = pDst (store pointer for every path)
i0=r0;
i1=r0;//I0 = I1 = pstart
r4=b0;//R4 = srcstep
r5=[sp+12+64];//R5 = dststep: entry offset 12 plus the 64 bytes pushed above
p0=8;                   // loop count: 8 rows
/******************** select interpolation path ********************/
cc=bittst(r7,0);//bit 0 of dx set -> horizontal half-pel
if cc jump HF_HH8;
cc=bittst(r6,0);//bit 0 of dy set -> vertical half-pel only
if cc jump FH8;
// ---- dx even, dy even ----
// I0 and I1 both walk the same source row, so both BYTEOP1P operand pairs
// hold the same data (presumably the byte average then reproduces the
// source bytes, i.e. a plain copy -- confirm BYTEOP1P semantics).
r5+=-4;
r4+=-8;
m0=r4;//M0 = srcstep-8: with the two ++ (4-byte) steps, I0/I1 advance srcstep per row
m1=r5;//M1 = dststep-4: with the one ++ (4-byte) step, I3 advances dststep per row
i1=i0;
lsetup (ff_8x8_start,ff_8x8_end)lc0=p0;
disalgnexcpt||r0=[i0++]||r2=[i1++];     // prime the pipeline: first word of row 0
ff_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];     // second word of the current row
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];       // result for the first store; load third word, step to next row
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];        // result for the second store (reversed pairs); store first 4 bytes
ff_8x8_end:
disalgnexcpt||[i3++m1]=r7||r2=[i1++];   // store second 4 bytes, step I3 to next dst row
jump halfcopyblock_end;
FH8:
// ---- dx even, dy odd ----
// I0 walks the current row, I1 walks the row below (I1 = I0 + srcstep).
m1=r4;//M1 = srcstep
r4+=-8;
m0=r4;//M0 = srcstep-8 (row-advance modifier, as in the ff loop)
r5+=-4;
m2=r5; //M2 = dststep-4 (dst row-advance modifier)
lsetup (fh_8x8_start,fh_8x8_end)lc0=p0;
i1=i0;
i1+=m1;                 // I1 = pstart + srcstep
disalgnexcpt||r0=[i0++]||r2=[i1++];     // prime: first words of both rows
fh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i1++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
fh_8x8_end:
disalgnexcpt||[i3++m2]=r7||r2=[i1++];
jump halfcopyblock_end;
HF_HH8:
// dx is odd here; test dy to choose between HH8 and the horizontal-only path.
cc=bittst(r6,0);
if cc jump HH8;
// ---- dx odd, dy even ----
// Both operand pairs are loaded from the same row (via I0 and I2). I1 is set
// to pstart+1 and is never used for a load in this path; presumably
// BYTEOP1P takes the byte alignment of its second operand pair from the low
// bits of I1, giving the one-byte shift (TODO confirm against the ISA
// reference).
m1=1;//one-byte offset applied to I1
r4+=-8;
m0=r4;                  // M0 = srcstep-8 (row-advance modifier)
i1=i0;
i1+=m1;                 // I1 = pstart + 1
r5+=-4;
m3=r5;                  // M3 = dststep-4 (dst row-advance modifier)
// Special-case check (translated from the original note): when the low two
// bits of I0 are both 1, I0+1 has its low two bits equal to 0, so the
// register selected for the shifted operand is R2 rather than R3; that case
// is handled separately at byte_align_HF.
r6=3;
r7=i0;
r7=r7&r6;               // R7 = I0 & 3
cc=r7==r6;              // low two bits of I0 both set?
if cc jump byte_align_HF;
lsetup(hf_8x8_start,hf_8x8_end)lc0=p0;
i2=i0;                  // I2 loads the same words as I0
disalgnexcpt||r0=[i0++]||r2=[i2++];
hf_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
hf_8x8_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
jump halfcopyblock_end;
byte_align_HF: //special case: (I0 & 3) == 3
// Same loop as above, but I2 starts 4 bytes ahead of I0, so R3:R2 hold the
// words that follow the ones in R1:R0.
lsetup(bytealign_start,bytealign_end) lc0=p0;
i2=i0;
i2+=4;
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
bytealign_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
//jump halfcopyblock_end;
/*
Disabled alternative implementation of the horizontal path, kept as found.
//R0 = P5; // Address of the best match
//I0 = R0; // Address of best match
//R0 += -1;
//I1 = R0; // Address of best match - 1
//I3 = SP; // Output buffer
// Address of the best match
// Address of best match
R0 += 1; // Address of best match - 1
I0 = R0;
P2 = 8 (Z);
R5 +=-4;
M3 = R5;
R4 += -12;
M1 = R4;
MNOP;
LSETUP(AVG2_LR_ST, AVG2_LR_END) LC0 = P2;
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
AVG2_LR_ST:
DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R1 = [I0++M1];
DISALGNEXCPT || [I3++M3] = R6 || R3 =[I1++M1];
//R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++];
AVG2_LR_END:
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
*/
jump halfcopyblock_end;
HH8:
/*
Disabled earlier implementation of the dx-odd/dy-odd path, kept as found.
r5+=-4;
m3=r5;
m1=1;
m2=r4;//src step
r4+=-8;
m0=r4;
i1=i0;
r6=3; //This section handles the case where the low two bits of I0
r7=i0; //are both 1. After I0+1 the low two bits are then all 0,
r7=r7&r6; //so the register selected is no longer
cc=r7==r6; //R3 but R2; hence it is handled separately.
if cc jump byte_align_HH;
// p1=2;
lsetup(hh_8x8_start,hh_8x8_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
hh_8x8_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i1+=m1;
i0+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i1-=m1;
i0-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
hh_8x8_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
jump halfcopyblock_end;
byte_align_HH://special case
lsetup(bytealign_hh_start,bytealign_hh_end)lc0=p0;
i2=i0;
i2+=m2;
disalgnexcpt||r0=[i0++]||r2=[i2++];
bytealign_hh_start:
disalgnexcpt||r1=[i0++]||r3=[i2++];
r6=byteop2p(r1:0,r3:2)(rndl);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh,r);
i0-=m1;
i1-=m1;
r7=r7+r6;
disalgnexcpt||r0=[i0++m0];
disalgnexcpt||[i3++]=r7||r2=[i2++m0];
r6=byteop2p(r1:0,r3:2)(rndl,r);
i0+=m1;
i1+=m1;
r7=byteop2p(r1:0,r3:2)(rndh);
i0-=m1;
i1-=m1;
disalgnexcpt||r0=[i0++];
r7=r7+r6;
bytealign_hh_end:
disalgnexcpt||[i3++m3]=r7||r2=[i2++];
//jump halfcopyblock_end;
*/
// ---- dx odd, dy odd (active implementation) ----
// On entry R0 = pstart, R4 = srcstep, R5 = dststep, I3 = pDst.
// I2 walks the current row and I1 the row below; each BYTEOP2P consumes one
// register pair from each row. I0 is never used for a load in this path; it
// is only stepped by M3 and back (presumably to move the byte-alignment
// selector that BYTEOP2P reads from I0 by one byte between the RNDL and RNDH
// halves -- TODO confirm against the ISA reference).
// Net pointer movement per loop iteration: I1 and I2 advance by srcstep
// (-3, srcstep-5, +4, -3, +7), and I3 advances by dststep (+4, dststep-4).
M0 = 7;
M3 = -3 (X);
I2 = R0;                // I2 = pstart (current row)
// R1 = pstart + srcstep (saturating add): address of the row below
R1 = R0 + R4 (S);
I0 = R1; // I0 = row below; used only via I0 += M3 / I0 -= M3 in this path
I1 = R1;                // I1 = row below (load pointer)
R4+=-5; // R4 = srcstep-5
M1 = R4;                // M1 = srcstep-5 (row-advance modifier for I1/I2)
R5 +=-4;
M2=R5;                  // M2 = dststep-4 (dst row-advance modifier)
P2 = 8; // loop count: 8 rows
LSETUP(AVG4_ST, AVG4_END) LC0 = P2;
// Pipeline prologue: load the first two words of both rows and start the
// first RNDL computation before entering the loop.
DISALGNEXCPT || R0 = [I1++] || R2 = [I2++];
DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || R0 = [I1++M0];
AVG4_ST:
DISALGNEXCPT || I0 += M3 || R2 = [I2++M0];
R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++M3] || R2 = [I2++M3];
R7 = R6 + R7 (NS) || I0 -= M3;          // combine RNDH and RNDL results into one word
R7 = BYTEOP2P(R1:0,R3:2) (RNDL, R) || [I3++] = R7 || R1 = [I1++M1];     // store first 4 bytes; I1 steps toward next row
DISALGNEXCPT || I0 += M3 || R3 = [I2++M1];                              // I2 steps toward next row
R6 = BYTEOP2P(R1:0,R3:2) (RNDH, R) || R0 = [I1++] || R2 = [I2++];
R7 = R6 + R7 (NS) || I0 -= M3;          // combine results for the second 4 bytes
DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
// Disabled variant of the loop tail, kept as found:
//R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M1];
//DISALGNEXCPT || I0 += M3 || R2 = [I2++M1];
//R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++] || R2 = [I2++];
//R7 = R6 + R7 (NS) || I0 -= M3;
//DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
AVG4_END:
R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++M2] = R7 || R0 = [I1++M0];     // store second 4 bytes, step I3 to next dst row; start next row's RNDL
halfcopyblock_end:
// Common epilogue: restore registers in the reverse order of the pushes.
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
rts;
_xhCopyBlockHalfpel_MPEG4_8u.end:
/*************************************************************************
ippiCopyMBHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep, Ipp8u*
pDst, int dstStep, const IppMotionVector* pMV, int roundControl);
performance:
ASM C
cycle: 653 22389
***************************************************************************/
_xhCopyMBHalfpel_MPEG4_8u:
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
b0=r1;//srcstep
l3=0;
r7=[sp+16+64];//PMV
i3=r7;
r7.l=w[i3++];//pMV->dx
r6.l=w[i3++];//pMV->dy
r7=r7.l(x);
r6=r6.l(x);
r5=r7>>>1;//mv_off_x
r4=r6>>>1;//mv_off_y
r4*=r1;//mv_off_y*srcstep
r4=r4+r5;//mv_off_y*srcstep+mv_off_x
r0=r0+r4;//point to pstart
i3=r2; //pDst
i0=r0;
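// NOTE(review): the listing appears to be truncated here -- the rest of
// _xhCopyMBHalfpel_MPEG4_8u (interpolation loops, epilogue and .end label)
// is not present in this copy. The text that followed was keyboard-shortcut
// help from the web code viewer, not part of the source.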