📄 meanabsdev.asm
字号:
.section L1_code;
.global _xhMeanAbsDev16x16_8u32s_C1R_MOD;
//.global _xhMeanAbsDev16x16_8u32s_C1R;
/*******************************************************************
ippiMeanAbsDev16x16_8u32s_C1R (const Ipp8u *pSrc, int srcStep,
Ipp32s *pDst);
I0 and I1 must be multiples of 4 (the source rows must be word-aligned).
performance:
        ASM    C
cycle:  274    16813
*********************************************************************/
/*_xhMeanAbsDev16x16_8u32s_C1R:
[--sp]=(r7:4,p5:3);
[--sp]=i0;
[--sp]=l0;
[--sp]=i1;
[--sp]=l1;
[--sp]=i2;
[--sp]=l2;
[--sp]=i3;
[--sp]=l3;
[--sp]=rets;
l0=0;
l1=0;
a0=0;
i0=r0;//psrc
i1=r0;//psrc
b0=r0;//psrc
p5=r1;//srcstep
m1=r1;
i2=r2;//pdst
i1+=m1;//next row
r7=0;//sum
r1+=-16;
r2=p5;
r1=r2+r1;
m0=r1;
p0=8;
lsetup(sum_start,sum_end)lc0=p0;
r0=[i0++]||r2=[i1++];
sum_start:
(r5,r4)=byteop16p(r1:0,r3:2)||r1=[i0++]||r3=[i1++];
r6=r5+|+r4;
(r5,r4)=byteop16p(r1:0,r3:2)(r)||r0=[i0++]||r2=[i1++];
r1=r5+|+r4(s);
r6=r6+|+r1(s);
r7=r7+|+r6(s);
(r5,r4)=byteop16p(r1:0,r3:2)||r1=[i0++]||r3=[i1++];
r6=r5+|+r4(s);
(r5,r4)=byteop16p(r1:0,r3:2)(r)||r0=[i0++m0]||r2=[i1++m0];
r1=r5+|+r4(s);
r6=r1+|+r6(s);
r7=r7+|+r6(s);
sum_end:
r0=[i0++]||r2=[i1++];
r6.l=r7.h+r7.l(s);
r7=r6.l(x);
r6=128;
r7=r6+r7;
r7=r7>>8;//the value is in [0,255]
r6=r7<<8;
r2=r7|r6;
r6=r6<<8;
r2=r2|r6;
r6=r6<<8;
r2=r2|r6;//in order to use saa,the all bytes must be the mean
r3=r2;
i0=b0;//psrc
i1=b0;
r0=p5;//srcstep
r0+=-12;
m0=r0;
p0=16;
a1=a0=0;
lsetup(dev_start,dev_end)lc0=p0;
r0=[i0++];
dev_start:
saa(r1:0,r3:2)||r1=[i0++];
saa(r1:0,r3:2)(r)||r0=[i0++];
saa(r1:0,r3:2)||r1=[i0++m0];
dev_end:
saa(r1:0,r3:2)(r)||r0=[i0++];
r6=a1.l+a1.h,r7=a0.l+a0.h;
r3=r6+r7(s);
[i2]=r3;//put the dev in pdst
rets=[sp++];
l3=[sp++];
i3=[sp++];
l2=[sp++];
i2=[sp++];
l1=[sp++];
i1=[sp++];
l0=[sp++];
i0=[sp++];
(r7:4,p5:3)=[sp++];
_xhMeanAbsDev16x16_8u32s_C1R.end:
rts;
*/
/*
 * _xhMeanAbsDev16x16_8u32s_C1R_MOD
 *
 * Computes the rounded mean of a 16x16 block of unsigned bytes and then the
 * sum of absolute differences between every pixel and that mean; the 32-bit
 * result is stored through the destination pointer.
 *
 * In:   r0 = pSrc, 16x16 bytes with a fixed row stride of 16 bytes
 *       r1 = pDst, receives one 32-bit word
 *       (C-equivalent inferred from register use:
 *        (const Ipp8u *pSrc, Ipp32s *pDst) -- TODO confirm against callers)
 * Out:  [pDst] = sum over the block of |pixel - mean|,
 *       mean = (sum of pixels + 128) >> 8
 *
 * Requirements:
 *   - pSrc must be a multiple of 4: BYTEOP16P and SAA select bytes using the
 *     low two bits of I0/I1, and all loads are 32-bit.
 *   - Both loops prefetch one word ahead, so up to 4 bytes past each row
 *     pointer's final position are read (offsets 256..259 and 272..275 from
 *     pSrc in the sum loop, 256..259 in the deviation loop). The values are
 *     discarded but the addresses must be readable.
 *
 * Saved/restored here: r7:4, p5:3, i0-i3, l0-l3, rets.
 * Modified and not restored: r0-r3, p0, m0, m1, b0, a0, a1, loop-0 registers.
 */
_xhMeanAbsDev16x16_8u32s_C1R_MOD:
    // ---- prologue ------------------------------------------------------
    [--sp] = (r7:4, p5:3);
    [--sp] = i0;
    [--sp] = l0;
    [--sp] = i1;
    [--sp] = l1;
    [--sp] = i2;
    [--sp] = l2;
    [--sp] = i3;
    [--sp] = l3;
    [--sp] = rets;

    // ---- pass 1: sum of all 256 pixels ----------------------------------
    l0 = 0;                             // linear (non-circular) addressing on i0
    l1 = 0;                             // ... and on i1
    a0 = 0;
    i0 = r0;                            // i0 -> even rows (row 0, 2, 4, ...)
    i1 = r0;
    b0 = r0;                            // keep pSrc for the second pass
    m1 = 16;                            // row stride in bytes
    i2 = r1;                            // i2 = pDst
    i1 += m1;                           // i1 -> odd rows (row 1, 3, 5, ...)
    r7 = 0;                             // r7.h / r7.l = two 16-bit partial sums
    m0 = 20;                            // after word 3 of a row: +4 +16 -> same column, two rows down
    p0 = 8;                             // 8 iterations x 2 rows each
    lsetup(sum_start, sum_end) lc0 = p0;
    r0 = [i0++] || r2 = [i1++];         // prefetch word 0 of the first row pair

    // Each BYTEOP16P adds four bytes of an even row to four bytes of the odd
    // row below it, giving four 16-bit sums in (r5,r4). The loads issued in
    // parallel fetch the next word; the (r) form consumes the other register
    // of each pair, so r0/r1 and r2/r3 alternate as "current" and "next".
sum_start:
    (r5, r4) = byteop16p(r1:0, r3:2) || r1 = [i0++] || r3 = [i1++];
    r6 = r5 +|+ r4;
    (r5, r4) = byteop16p(r1:0, r3:2) (r) || r0 = [i0++] || r2 = [i1++];
    r1 = r5 +|+ r4 (s);
    r6 = r6 +|+ r1 (s);
    r7 = r7 +|+ r6 (s);
    (r5, r4) = byteop16p(r1:0, r3:2) || r1 = [i0++m0] || r3 = [i1++m0];
    r6 = r5 +|+ r4 (s);
    (r5, r4) = byteop16p(r1:0, r3:2) (r) || r0 = [i0++] || r2 = [i1++];
    r1 = r5 +|+ r4 (s);
    r6 = r1 +|+ r6 (s);
sum_end:
    r7 = r7 +|+ r6 (s);

    // ---- mean = (sum + 128) >> 8 ----------------------------------------
    // Each half of r7 holds at most 128 * 255 = 32640, but their total can
    // reach 65280. Combine them in 32 bits: a signed-saturating 16-bit add
    // would clamp at 0x7FFF and cap the computed mean at 128.
    r6 = r7 >> 16;                      // high-half partial sum
    r7 = r7.l (z);                      // low-half partial sum, zero-extended
    r7 = r7 + r6;                       // full pixel sum, 0..65280
    r6 = 128;                           // rounding term for the divide by 256
    r7 = r6 + r7;
    r7 = r7 >> 8;                       // mean, in [0,255]

    // Replicate the mean into all four bytes of r2 and r3 so SAA can
    // subtract it from four pixels at a time.
    r6 = r7 << 8;
    r2 = r7 | r6;
    r6 = r6 << 8;
    r2 = r2 | r6;
    r6 = r6 << 8;
    r2 = r2 | r6;
    r3 = r2;

    // ---- pass 2: sum of |pixel - mean| ----------------------------------
    i0 = b0;                            // rewind to pSrc
    i1 = b0;                            // same (4-byte aligned) address as i0
    p0 = 16;                            // one iteration per row
    a1 = a0 = 0;                        // clear the four 16-bit SAA accumulators
    lsetup(dev_start, dev_end) lc0 = p0;
    r0 = [i0++];                        // prefetch word 0 of row 0

    // Rows are contiguous (stride 16), so i0 simply walks word by word.
dev_start:
    saa(r1:0, r3:2) || r1 = [i0++];
    saa(r1:0, r3:2) (r) || r0 = [i0++];
    saa(r1:0, r3:2) || r1 = [i0++];
dev_end:
    saa(r1:0, r3:2) (r) || r0 = [i0++];

    // Fold the four accumulators into one 32-bit result.
    r6 = a1.l + a1.h, r7 = a0.l + a0.h;
    r3 = r6 + r7 (s);
    [i2] = r3;                          // *pDst = sum of absolute deviations

    // ---- epilogue -------------------------------------------------------
    rets = [sp++];
    l3 = [sp++];
    i3 = [sp++];
    l2 = [sp++];
    i2 = [sp++];
    l1 = [sp++];
    i1 = [sp++];
    l0 = [sp++];
    i0 = [sp++];
    (r7:4, p5:3) = [sp++];
_xhMeanAbsDev16x16_8u32s_C1R_MOD.end:
    rts;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -