📄 vc1dsp.c
字号:
{ vc1_mspel_mc(dst,src,stride,1,0,rnd);} *///PUT_VC1_MSPEL(2, 0)/*static void put_vc1_mspel_mc20_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ vc1_mspel_mc(dst,src,stride,2,0,rnd);} */#ifdef JZ4740_MXU_OPTstatic void put_vc1_mspel_mc20_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ uint32_t i,j; uint32_t src_aln0,src_aln1,src_rs0,src_rs1,src_rs2,src_rs3; int mul_ct0,mul_ct1; int ct0,rnd1; mul_ct0=0x09090909; //9 9 9 9 mul_ct1=0x01010101; //1 1 1 1 ct0 = 0x00080008; rnd1 = rnd * 0x00010001; src_aln0 = (((uint32_t)src-1) & 0xFFFFFFFC); src_aln1 = (((uint32_t)src+1) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - 1) & 3); src_rs1 = src_rs0-1; src_rs2 = 4-(((int)src + 1) & 3); src_rs3 = src_rs2-1; S32I2M(xr15,mul_ct0); //xr15:9 9 9 9 S32I2M(xr14,mul_ct1); //xr14:1 1 1 1 S32I2M(xr13,ct0); //xr13: 8 8 S32I2M(xr12,rnd1); //xr12:rnd rnd for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDD(xr5,src_aln1,0); S32LDD(xr6,src_aln1,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[2] src[1] src[0] src[-1] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[3] src[2] src[1] src[0] S32ALN(xr7,xr6,xr5,src_rs2); //xr7:src[4] src[3] src[2] src[1] S32ALN(xr8,xr6,xr5,src_rs3); //xr8:src[5] src[4] src[3] src[2] Q8MUL(xr5,xr4,xr15,xr6); //xr5:9*src[3] 9*src[2] xr6:9*src[1] 9*src[0] Q8MAC_SS(xr5,xr3,xr14,xr6);//xr5:9*src[3]-src[2] 9*src[2]-src[1] xr6:9*src[1]-src[0] 9*src[0]-src[-1] Q8MUL(xr9,xr7,xr15,xr10); //xr9:9*src[4] 9*src[3] xr10:9*src[2] 9*src[1] Q8MAC_SS(xr9,xr8,xr14,xr10); //xr9:9*src[4]-src[5] 9*src[3]-src[4] xr10:9*src[2]-src[3] 9*src[1]-src[2] Q16ADD_AA_WW(xr0,xr5,xr9,xr5); //xr5:dst[3] 9*src[3]-src[2]+9*src[4]-src[5] dst[2] 9*src[2]-src[1]+9*src[3]-src[4] Q16ADD_AA_WW(xr0,xr6,xr10,xr6); //xr6:dst[1] 9*src[1]-src[0]+9*src[2]-src[3] dst[0] 9*src[0]-src[-1]+9*src[1]-src[2] Q16ACC_SS(xr5,xr13,xr12,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,4); Q16SAT(xr5,xr5,xr6); //xr5:dst[3]dst[2]dst[1]dst[0] S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[6] src[5] src[4] src[3] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[7] src[6] src[5] src[4] S32LDI(xr1,src_aln1,4); S32LDD(xr2,src_aln1,4); S32ALN(xr7,xr2,xr1,src_rs2); //xr7:src[8] src[7] src[6] src[5] S32ALN(xr8,xr2,xr1,src_rs3); //xr8:src[9] src[8] src[7] src[6] Q8MUL(xr1,xr4,xr15,xr2); //xr5:9*src[3] 9*src[2] xr6:9*src[1] 9*src[0] Q8MAC_SS(xr1,xr3,xr14,xr2);//xr5:9*src[3]-src[2] 9*src[2]-src[1] xr6:9*src[1]-src[0] 9*src[0]-src[-1] Q8MUL(xr9,xr7,xr15,xr10); //xr9:9*src[4] 9*src[3] xr10:9*src[2] 9*src[1] Q8MAC_SS(xr9,xr8,xr14,xr10); //xr9:9*src[4]-src[5] 9*src[3]-src[4] xr10:9*src[2]-src[3] 9*src[1]-src[2] Q16ADD_AA_WW(xr0,xr1,xr9,xr1); //xr5:dst[3] 9*src[3]-src[2]+9*src[4]-src[5] dst[2] 9*src[2]-src[1]+9*src[3]-src[4] Q16ADD_AA_WW(xr0,xr2,xr10,xr2); //xr6:dst[1] 9*src[1]-src[0]+9*src[2]-src[3] dst[0] 9*src[0]-src[-1]+9*src[1]-src[2] Q16ACC_SS(xr1,xr13,xr12,xr2); Q16SAR(xr1,xr1,xr2,xr2,4); Q16SAT(xr1,xr1,xr2); //xr5:dst[3]dst[2]dst[1]dst[0] S32STD(xr5,dst,0); S32STD(xr1,dst,4); dst+=stride; src_aln0+=(stride-4); src_aln1+=(stride-4);}}#elsestatic void put_vc1_mspel_mc20_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){int i,j;for(j = 0; j < 8; j++) { dst[0]=av_clip_uint8((-src[-1] + 9*src[0] + 9*src[1] - src[2] + 8 - rnd) >> 4); dst[1]=av_clip_uint8((-src[0] + 9*src[1] + 9*src[2] - src[3] + 8 - rnd) >> 4); dst[2]=av_clip_uint8((-src[1] + 9*src[2] + 9*src[3] - src[4] + 8 - rnd) >> 4); dst[3]=av_clip_uint8((-src[2] + 9*src[3] + 9*src[4] - src[5] + 8 - rnd) >> 4); dst[4]=av_clip_uint8((-src[3] + 9*src[4] + 9*src[5] - src[6] + 8 - rnd) >> 4); dst[5]=av_clip_uint8((-src[4] + 9*src[5] + 9*src[6] - src[7] + 8 - rnd) >> 4); dst[6]=av_clip_uint8((-src[5] + 9*src[6] + 9*src[7] - src[8] + 8 - rnd) >> 4); dst[7]=av_clip_uint8((-src[6] + 9*src[7] + 9*src[8] - src[9] + 8 - rnd) >> 4); dst += stride; src += stride; }}#endif //PUT_VC1_MSPEL(3, 0)/*static void put_vc1_mspel_mc30_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ vc1_mspel_mc(dst,src,stride,3,0,rnd);} */#ifdef JZ4740_MXU_OPTstatic void put_vc1_mspel_mc30_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ uint32_t i,j; uint32_t src_aln0,src_aln1,src_rs0,src_rs1,src_rs2,src_rs3; int mul_ct0,mul_ct1,mul_ct2,mul_ct3; int ct0,rnd1; mul_ct0=0x12121212; //18 18 18 18 mul_ct1=0x35353535; //53 53 53 53 mul_ct2=0x03030303; //3 3 3 3 mul_ct3=0x04040404; //4 4 4 4 ct0 = 0x00200020; //32 32 rnd1 = rnd * 0x00010001; src_aln0 = (((uint32_t)src-1) & 0xFFFFFFFC); src_aln1 = (((uint32_t)src+1) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - 1) & 3); src_rs1 = src_rs0-1; src_rs2 = 4-(((int)src + 1) & 3); src_rs3 = src_rs2-1; S32I2M(xr15,mul_ct0); //xr15:18 18 18 18 S32I2M(xr14,mul_ct1); //xr14:53 53 53 53 S32I2M(xr11,mul_ct2); //xr11:3 3 3 3 S32I2M(xr10,mul_ct3); //xr10:4 4 4 4 S32I2M(xr13,ct0); //xr13: 32 32 S32I2M(xr12,rnd1); //xr12:rnd rnd for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDD(xr5,src_aln1,0); S32LDD(xr6,src_aln1,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[2] src[1] src[0] src[-1] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[3] src[2] src[1] src[0] S32ALN(xr7,xr6,xr5,src_rs2); //xr7:src[4] src[3] src[2] src[1] S32ALN(xr8,xr6,xr5,src_rs3); //xr8:src[5] src[4] src[3] src[2] Q8MUL(xr5,xr4,xr15,xr6); //xr5:18*src[3] 18*src[2] xr6:18*src[1] 18*src[0] Q8MAC_SS(xr5,xr3,xr11,xr6);//xr5:18*src[3]-3*src[2] 18*src[2]-3*src[1] xr6:18*src[1]-3*src[0] 18*src[0]-3*src[-1] Q8MUL(xr9,xr7,xr14,xr1); //xr9:53*src[4] 53*src[3] xr1:53*src[2] 53*src[1] Q8MAC_SS(xr9,xr8,xr10,xr1); //xr9:53*src[4]-4*src[5] 53*src[3]-4*src[4] xr1:53*src[2]-4*src[3] 53*src[1]-4*src[2] Q16ADD_AA_WW(xr0,xr5,xr9,xr5); //xr5:dst[3] 18*src[3]-3*src[2]+53*src[4]-4*src[5] dst[2] 18*src[2]-3*src[1]+53*src[3]-4*src[4] Q16ADD_AA_WW(xr0,xr6,xr1,xr6); //xr6:dst[1] 18*src[1]-3*src[0]+53*src[2]-4*src[3] dst[0] 18*src[0]-3*src[-1]+53*src[1]-4src[2] Q16ACC_SS(xr5,xr13,xr12,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,6); Q16SAT(xr5,xr5,xr6); //xr5:dst[3]dst[2]dst[1]dst[0] S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[6] src[5] src[4] src[3] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[7] src[6] src[5] src[4] S32LDI(xr1,src_aln1,4); S32LDD(xr2,src_aln1,4); S32ALN(xr7,xr2,xr1,src_rs2); //xr7:src[8] src[7] src[6] src[5] S32ALN(xr8,xr2,xr1,src_rs3); //xr8:src[9] src[8] src[7] src[6] Q8MUL(xr1,xr4,xr15,xr2); //xr1:18*src[7] 18*src[6] xr2:18*src[5] 18*src[4] Q8MAC_SS(xr1,xr3,xr11,xr2);//xr1:18*src[7]-3*src[6] 18*src[6]-3*src[5] xr2:18*src[5]-3*src[4] 18*src[4]-3*src[3] Q8MUL(xr9,xr7,xr14,xr3); //xr9:53*src[8] 53*src[7] xr3:53*src[6] 53*src[5] Q8MAC_SS(xr9,xr8,xr10,xr3); //xr9:53*src[8]-4*src[9] 53*src[7]-4*src[8] xr3:53*src[6]-4*src[7] 53*src[5]-4*src[6] Q16ADD_AA_WW(xr0,xr1,xr9,xr1); //xr1:dst[3] 18*src[7]-3*src[6]+53*src[8]-4*src[9] dst[2] 18*src[6]-3*src[5]+53*src[7]-4*src[8] Q16ADD_AA_WW(xr0,xr2,xr3,xr2); //xr2:dst[1] 18*src[5]-3*src[4]+53*src[6]-4*src[7] dst[0] 18*src[4]-3*src[3]53+*src[5]-4*src[6] Q16ACC_SS(xr1,xr13,xr12,xr2); Q16SAR(xr1,xr1,xr2,xr2,6); Q16SAT(xr1,xr1,xr2); //xr1:dst[7]dst[6]dst[5]dst[4] S32STD(xr5,dst,0); S32STD(xr1,dst,4); dst+=stride; src_aln0+=(stride-4); src_aln1+=(stride-4);}}#elsestatic void put_vc1_mspel_mc30_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){int j;for(j = 0; j < 8; j++) { dst[0]=av_clip_uint8((-3*src[-1]+ 18*src[0] + 53*src[1] - 4*src[2] + 32 - rnd) >> 6); dst[1]=av_clip_uint8((-3*src[0] + 18*src[1] + 53*src[2] - 4*src[3] + 32 - rnd) >> 6); dst[2]=av_clip_uint8((-3*src[1] + 18*src[2] + 53*src[3] - 4*src[4] + 32 - rnd) >> 6); dst[3]=av_clip_uint8((-3*src[2] + 18*src[3] + 53*src[4] - 4*src[5] + 32 - rnd) >> 6); dst[4]=av_clip_uint8((-3*src[3] + 18*src[4] + 53*src[5] - 4*src[6] + 32 - rnd) >> 6); dst[5]=av_clip_uint8((-3*src[4] + 18*src[5] + 53*src[6] - 4*src[7] + 32 - rnd) >> 6); dst[6]=av_clip_uint8((-3*src[5] + 18*src[6] + 53*src[7] - 4*src[8] + 32 - rnd) >> 6); dst[7]=av_clip_uint8((-3*src[6] + 18*src[7] + 53*src[8] - 4*src[9] + 32 - rnd) >> 6); dst += stride; src += stride; }}#endif //PUT_VC1_MSPEL(0, 1)/*static void put_vc1_mspel_mc01_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ vc1_mspel_mc(dst,src,stride,0,1,rnd);} */#ifdef JZ4740_MXU_OPTstatic void put_vc1_mspel_mc01_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ uint32_t j,r; r = 1-rnd; uint32_t src_aln0,src_rs0; int mul_ct0,mul_ct1,mul_ct2,mul_ct3; int ct0,rnd1; mul_ct1=0x12121212; //18 18 18 18 mul_ct0=0x35353535; //53 53 53 53 mul_ct3=0x03030303; //3 3 3 3 mul_ct2=0x04040404; //4 4 4 4 ct0 = 0x00200020; //32 32 rnd1 = r * 0x00010001; src_aln0 = (((uint32_t)src - stride) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - stride) & 3); S32I2M(xr15,mul_ct0); //xr15:53 53 53 53 S32I2M(xr14,mul_ct1); //xr14:18 18 18 18 S32I2M(xr11,mul_ct2); //xr11:4 4 4 4 S32I2M(xr10,mul_ct3); //xr10:3 3 3 3 S32I2M(xr13,ct0); //xr13: 32 32 S32I2M(xr12,rnd1); //xr12: r r for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr6,xr5,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr5,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr6,xr5,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr5,xr4,xr14,xr6); Q8MAC_SS(xr5,xr8,xr10,xr6); //xr5:18*src[stride+3]-3*src[stride*2+3] 18*src[stride+2]-3*src[stride*2+2] xr6: Q16ADD_AA_WW(xr0,xr5,xr1,xr5); Q16ADD_AA_WW(xr0,xr6,xr2,xr6); Q16ACC_SS(xr5,xr13,xr12,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,6); Q16SAT(xr5,xr5,xr6); //xr5:dst[3]dst[2]dst[1]dst[0] src_aln0 = src_aln0 - 3*stride; S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); S32ALN(xr7,xr6,xr9,src_rs0); S32LDIV(xr1,src_aln0,stride,0); S32LDD(xr2,src_aln0,4); S32LDIV(xr9,src_aln0,stride,0); S32LDD(xr6,src_aln0,4); S32ALN(xr4,xr2,xr1,src_rs0); S32ALN(xr8,xr6,xr9,src_rs0); Q8MUL(xr1,xr7,xr15,xr2); Q8MAC_SS(xr1,xr3,xr11,xr2); //xr1:-4*src[-stride+3] + 53*src[3] -4*src[-stride+2] + 53*src[2] xr2: Q8MUL(xr9,xr4,xr14,xr6); Q8MAC_SS(xr9,xr8,xr10,xr6); //xr5:18*src[stride+3]-3*src[stride*2+3] 18*src[stride+2]-3*src[stride*2+2] xr6: Q16ADD_AA_WW(xr0,xr9,xr1,xr9); Q16ADD_AA_WW(xr0,xr6,xr2,xr6); Q16ACC_SS(xr9,xr13,xr12,xr6); //two instructions with one Q16SAR(xr9,xr9,xr6,xr6,6); Q16SAT(xr9,xr9,xr6); //xr5:dst[3]dst[2]dst[1]dst[0] S32STD(xr5,dst,0); S32STD(xr9,dst,4); dst+=stride; src_aln0+=(-2*stride-4); }}#else static void put_vc1_mspel_mc01_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ int j,r; r = 1-rnd; for(j = 0; j < 8; j++) { dst[0] = av_clip_uint8((-4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2] + 32 - r) >> 6); dst[1] = av_clip_uint8((-4*src[-stride+1] + 53*src[1] + 18*src[stride+1] - 3*src[stride*2+1] + 32 - r) >> 6); dst[2] = av_clip_uint8((-4*src[-stride+2] + 53*src[2] + 18*src[stride+2] - 3*src[stride*2+2] + 32 - r) >> 6); dst[3] = av_clip_uint8((-4*src[-stride+3] + 53*src[3] + 18*src[stride+3] - 3*src[stride*2+3] + 32 - r) >> 6); dst[4] = av_clip_uint8((-4*src[-stride+4] + 53*src[4] + 18*src[stride+4] - 3*src[stride*2+4] + 32 - r) >> 6); dst[5] = av_clip_uint8((-4*src[-stride+5] + 53*src[5] + 18*src[stride+5] - 3*src[stride*2+5] + 32 - r) >> 6); dst[6] = av_clip_uint8((-4*src[-stride+6] + 53*src[6] + 18*src[stride+6] - 3*src[stride*2+6] + 32 - r) >> 6); dst[7] = av_clip_uint8((-4*src[-stride+7] + 53*src[7] + 18*src[stride+7] - 3*src[stride*2+7] + 32 - r) >> 6); src += stride; dst += stride; }}#endif//PUT_VC1_MSPEL(1, 1)/*static void put_vc1_mspel_mc11_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ vc1_mspel_mc(dst,src,stride,1,1,rnd);} */ #ifdef JZ4740_MXU_OPT#define w1 53#define w2 4#define w3 18#define w4 3
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -