📄 vc1dsp.c
字号:
static void put_vc1_mspel_mc11_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ int j,r; int tmp[12*4]; int16_t *tptr = tmp; r = 15 + rnd; src -= 1; uint32_t src_aln0,src_rs0; int mul_ct0,mul_ct1,mul_ct2,mul_ct3; int rnd1; mul_ct1=0x12121212; //18 18 18 18 mul_ct0=0x35353535; //53 53 53 53 mul_ct3=0x03030303; //3 3 3 3 mul_ct2=0x04040404; //4 4 4 4 rnd1 = r * 0x00010001; src_aln0 = (((uint32_t)src - stride) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - stride) & 3); S32I2M(xr15,mul_ct0); //xr15:53 53 53 53 S32I2M(xr14,mul_ct1); //xr14:18 18 18 18 S32I2M(xr11,mul_ct2); //xr11:4 4 4 4 S32I2M(xr10,mul_ct3); //xr10:3 3 3 3 S32I2M(xr12,rnd1); //xr12: r r for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr6,xr5,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr6,xr5,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr5,xr4,xr14,xr6); Q8MAC_SS(xr5,xr8,xr10,xr6); //xr5:18*src[stride+3]-3*src[stride*2+3] 18*src[stride+2]-3*src[stride*2+2] xr6: Q16ADD_AA_WW(xr0,xr5,xr1,xr5); Q16ADD_AA_WW(xr0,xr6,xr2,xr6); Q16ACC_AA(xr5,xr12,xr0,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,5); //xr5:tptr[3] tptr[2] xr6:tptr[1] tptr[0] src_aln0 = src_aln0 - 3*stride; S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr13,xr9,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr13,xr9,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr9,xr4,xr14,xr13); Q8MAC_SS(xr9,xr8,xr10,xr13); Q16ADD_AA_WW(xr0,xr9,xr1,xr9); Q16ADD_AA_WW(xr0,xr13,xr2,xr13); Q16ACC_SS(xr9,xr12,xr0,xr13); // Q16SAR(xr9,xr9,xr13,xr13,5); //xr9:tptr[7] tptr[6] xr13:tptr[5] tptr[4] S32STD(xr6,tptr,0x0); S32STD(xr5,tptr,0x4); S32STD(xr13,tptr,0x8); S32STD(xr9,tptr,0xc); src_aln0 = src_aln0 - 3*stride; S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr13,xr9,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr13,xr9,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr9,xr4,xr14,xr13); Q8MAC_SS(xr9,xr8,xr10,xr13); Q16ADD_AA_WW(xr0,xr9,xr1,xr9); Q16ADD_AA_WW(xr0,xr13,xr2,xr13); Q16ACC_SS(xr9,xr12,xr0,xr13); //two instructions with one Q16SAR(xr9,xr9,xr13,xr13,5); //xr9:tptr[11] tptr[10] xr13:tptr[9] tptr[8] S32STD(xr13,tptr,0x10); S32STD(xr9,tptr,0x14); tptr += 12; src_aln0+=(-2*stride-8); } r = 64-rnd; tptr = tmp; S32I2M(xr15,(w1<<16)|w2); //xr15:53|4 S32I2M(xr14,(w3<<16)|w4); //xr14:18|3 S32I2M(xr13,r); for(j=0;j<8;j++) { S32LDD(xr1,tptr,0x0); //xr1:tptr[1] tptr[0] S32LDD(xr2,tptr,0x4); //xr2:tptr[3] tptr[2] S32LDD(xr3,tptr,0x8); //xr3:tptr[5] tptr[4] S32LDD(xr4,tptr,0xc); //xr4:tptr[7] tptr[6] S32ALN(xr5,xr2,xr1,2); //xr5:tptr[2] tptr[1] S32ALN(xr6,xr3,xr2,2); //xr6:tptr[4] tptr[3] S32ALN(xr7,xr4,xr3,2); //xr7:tptr[6] tptr[5] D16MUL_HW(xr9,xr15,xr5,xr10); //xr9:53*tptr[2] xr10:53*tptr[1] D16MAC_SS_LW(xr9,xr15,xr1,xr10); //xr9:53*tptr[2]-4*tptr[1] xr10:53*tptr[1]-4*tptr[0] D16MAC_AA_HW(xr9,xr14,xr2,xr10);//xr9:53*tptr[2]-4*tptr[1]+18*tptr[3] xr10:53*tptr[1]-4*tptr[0]+18*tptr[2] D16MAC_SS_LW(xr9,xr14,xr6,xr10); //xr9:dst[1] xr10:dst[0] D32ACC_AA(xr9,xr0,xr13,xr10); D32SARL(xr9,xr9,xr10,7); D16MUL_HW(xr11,xr15,xr6,xr10); // D16MAC_SS_LW(xr11,xr15,xr2,xr10); // D16MAC_AA_HW(xr11,xr14,xr3,xr10); // D16MAC_SS_LW(xr11,xr14,xr7,xr10); // D32ACC_AA(xr11,xr0,xr13,xr10); D32SARL(xr10,xr11,xr10,7); Q16SAT(xr10,xr10,xr9); S32LDD(xr1,tptr,0x10); //xr1:tptr[9] tptr[8] S32LDD(xr2,tptr,0x14); //xr2:tptr[11] tptr[10] S32ALN(xr5,xr2,xr1,2); //xr5:tptr[10] tptr[9] S32ALN(xr6,xr1,xr4,2); //xr6:tptr[8] tptr[7] D16MUL_HW(xr9,xr15,xr7,xr8); // D16MAC_SS_LW(xr9,xr15,xr3,xr8); // D16MAC_AA_HW(xr9,xr14,xr4,xr8); // D16MAC_SS_LW(xr9,xr14,xr6,xr8); // D32ACC_AA(xr9,xr0,xr13,xr8); D32SARL(xr9,xr9,xr8,7); D16MUL_HW(xr11,xr15,xr6,xr8); // D16MAC_SS_LW(xr11,xr15,xr4,xr8); // D16MAC_AA_HW(xr11,xr14,xr1,xr8); // D16MAC_SS_LW(xr11,xr14,xr5,xr8); // D32ACC_AA(xr11,xr0,xr13,xr8); D32SARL(xr11,xr11,xr8,7); Q16SAT(xr11,xr11,xr9); S32STD(xr10,dst,0); S32STD(xr11,dst,4); dst+=stride; tptr+=12; } }#else static void put_vc1_mspel_mc11_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ static const int shift_value[] = { 0, 5, 1, 5 }; int shift = (shift_value[1]+shift_value[1])>>1; int tmp[12*4]; int16_t *tptr = tmp; int j,r; r = (1<<(shift-1)) + rnd-1; src -= 1; for(j = 0; j < 8; j++) { tptr[0]=(-4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2]+r)>>shift; tptr[1]=(-4*src[-stride+1] + 53*src[1] + 18*src[stride+1] - 3*src[stride*2+1]+r)>>shift; tptr[2]=(-4*src[-stride+2] + 53*src[2] + 18*src[stride+2] - 3*src[stride*2+2]+r)>>shift; tptr[3]=(-4*src[-stride+3] + 53*src[3] + 18*src[stride+3] - 3*src[stride*2+3]+r)>>shift; tptr[4]=(-4*src[-stride+4] + 53*src[4] + 18*src[stride+4] - 3*src[stride*2+4]+r)>>shift; tptr[5]=(-4*src[-stride+5] + 53*src[5] + 18*src[stride+5] - 3*src[stride*2+5]+r)>>shift; tptr[6]=(-4*src[-stride+6] + 53*src[6] + 18*src[stride+6] - 3*src[stride*2+6]+r)>>shift; tptr[7]=(-4*src[-stride+7] + 53*src[7] + 18*src[stride+7] - 3*src[stride*2+7]+r)>>shift; tptr[8]=(-4*src[-stride+8] + 53*src[8] + 18*src[stride+8] - 3*src[stride*2+8]+r)>>shift; tptr[9]=(-4*src[-stride+9] + 53*src[9] + 18*src[stride+9] - 3*src[stride*2+9]+r)>>shift; tptr[10]=(-4*src[-stride+10] + 53*src[10] + 18*src[stride+10] - 3*src[stride*2+10]+r)>>shift; src += stride; tptr += 12; } r = 64-rnd; tptr = tmp; for(j = 0; j < 8; j++) { dst[0]=av_clip_uint8((-4*tptr[0] + 53*tptr[1] + 18*tptr[2] - 3*tptr[3]+r)>>7); dst[1]=av_clip_uint8((-4*tptr[1] + 53*tptr[2] + 18*tptr[3] - 3*tptr[4]+r)>>7); dst[2]=av_clip_uint8((-4*tptr[2] + 53*tptr[3] + 18*tptr[4] - 3*tptr[5]+r)>>7); dst[3]=av_clip_uint8((-4*tptr[3] + 53*tptr[4] + 18*tptr[5] - 3*tptr[6]+r)>>7); dst[4]=av_clip_uint8((-4*tptr[4] + 53*tptr[5] + 18*tptr[6] - 3*tptr[7]+r)>>7); dst[5]=av_clip_uint8((-4*tptr[5] + 53*tptr[6] + 18*tptr[7] - 3*tptr[8]+r)>>7); dst[6]=av_clip_uint8((-4*tptr[6] + 53*tptr[7] + 18*tptr[8] - 3*tptr[9]+r)>>7); dst[7]=av_clip_uint8((-4*tptr[7] + 53*tptr[8] + 18*tptr[9] - 3*tptr[10]+r)>>7); dst += stride; tptr += 12; } }#endif//PUT_VC1_MSPEL(2, 1)/*static void put_vc1_mspel_mc21_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ vc1_mspel_mc(dst,src,stride,2,1,rnd);} */#ifdef JZ4740_MXU_OPT#define w5 9#define w6 1static void put_vc1_mspel_mc21_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ int j,r; int tmp[12*4]; int16_t *tptr = tmp; r = 3 + rnd; src -= 1; uint32_t src_aln0,src_rs0; int mul_ct0,mul_ct1,mul_ct2,mul_ct3; int rnd1; mul_ct1=0x12121212; //18 18 18 18 mul_ct0=0x35353535; //53 53 53 53 mul_ct3=0x03030303; //3 3 3 3 mul_ct2=0x04040404; //4 4 4 4 rnd1 = r * 0x00010001; src_aln0 = (((uint32_t)src - stride) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - stride) & 3); S32I2M(xr15,mul_ct0); //xr15:53 53 53 53 S32I2M(xr14,mul_ct1); //xr14:18 18 18 18 S32I2M(xr11,mul_ct2); //xr11:4 4 4 4 S32I2M(xr10,mul_ct3); //xr10:3 3 3 3 S32I2M(xr12,rnd1); //xr12: r r for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr6,xr5,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr6,xr5,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr5,xr4,xr14,xr6); Q8MAC_SS(xr5,xr8,xr10,xr6); //xr5:18*src[stride+3]-3*src[stride*2+3] 18*src[stride+2]-3*src[stride*2+2] xr6: Q16ADD_AA_WW(xr0,xr5,xr1,xr5); Q16ADD_AA_WW(xr0,xr6,xr2,xr6); Q16ACC_AA(xr5,xr12,xr0,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,3); //xr5:tptr[3] tptr[2] xr6:tptr[1] tptr[0] src_aln0 = src_aln0 - 3*stride; S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr13,xr9,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr13,xr9,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr9,xr4,xr14,xr13); Q8MAC_SS(xr9,xr8,xr10,xr13); //xr9:18*src[stride+3]-3*src[stride*2+3] 18*src[stride+2]-3*src[stride*2+2] xr13: Q16ADD_AA_WW(xr0,xr9,xr1,xr9); Q16ADD_AA_WW(xr0,xr13,xr2,xr13); Q16ACC_SS(xr9,xr12,xr0,xr13); //two instructions with one Q16SAR(xr9,xr9,xr13,xr13,3); //xr9:tptr[7] tptr[6] xr13:tptr[5] tptr[4] S32STD(xr6,tptr,0x0); S32STD(xr5,tptr,0x4); S32STD(xr13,tptr,0x8); S32STD(xr9,tptr,0xc); src_aln0 = src_aln0 - 3*stride; S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr13,xr9,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr13,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr13,xr9,src_rs0); Q8MUL(xr1,xr7,xr1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -