📄 vc1dsp.c
字号:
static void vc1_inv_trans_4x4_c(DCTELEM block[64], int n){ int i; DCTELEM *src, *dst; int off; off = (n&1) * 4 + (n&2) * 16; src = block + off; dst = block + off; S32I2M(xr15,W7<<16|W8); //xr15:17|17 S32I2M(xr14,W8<<16|W9); //xr14:22|10 for(i = 0; i < 4; i++){ S32LDD(xr1,src,0x0); //xr1:src[1] src[0] S32LDD(xr2,src,0x4); //xr2:src[3] src[2] D16MUL_LW(xr3,xr1,xr15,xr4); //xr3:17*src[0] xr4:17*src[0] D16MAC_AS_LW(xr3,xr2,xr15,xr4); //xr3:t1 17*src[0]+src[2]*17 //xr4:t2 17*src[0]-17*src[2] S32I2M(xr13,4); D32ACC_AS(xr3,xr13,xr0,xr4); //xr3:t1+4 xr4:t2+4 D16MUL_HW(xr5,xr1,xr14,xr6); //xr5:t3 22*src[1] xr6:t5 10*src[1] D16MUL_HW(xr7,xr2,xr14,xr8); //xr7:t4 22*src[3] xr8:t6 10*src[3] D32ADD_AS(xr5,xr5,xr8,xr0); //xr5:t3+t6 D32ADD_AS(xr0,xr7,xr6,xr6); //xr6:t4-t5 D32ADD_AS(xr3,xr3,xr5,xr5); //xr3:t1+4+t3+t6 xr5:t1+4-t3-t6 D32ADD_AS(xr4,xr4,xr6,xr6); //xr4:t2+4+t4-t5 xr6:t2+4-t4+t5 D32SARL(xr3,xr6,xr3,3); D32SARL(xr5,xr5,xr4,3); S32STD(xr3,dst,0x0); S32STD(xr5,dst,0x4); src += 8; dst += 8; } src = block + off; dst = block + off; for(i = 0; i < 2; i++){ S32LDD(xr1,src,0x00); //xr1:src[1] src[0] S32LDD(xr2,src,0x10); //xr2:src[9] src[8] S32LDD(xr3,src,0x20); //xr3:src[17] src[16] S32LDD(xr4,src,0x30); //xr4:src[25] src[24] Q16ADD_AS_WW(xr5,xr1,xr3,xr6); //xr5:src[1]+src[17] src[0]+src[16] xr6:src[1]-src[17] src[0]-src[16] Q16SLL(xr7,xr5,xr6,xr8,4); //xr7:(src[1]+src[17])<<4 (src[0]+src[16])<<4 xr8:(src[1]-src[17])<<4 (src[0]-src[16])<<4 Q16ADD_AS_WW(xr5,xr7,xr5,xr0); //xr5:pt1 t1 Q16ADD_AS_WW(xr6,xr8,xr6,xr0); //xr6:pt2 t2 S32I2M(xr11,64<<16|64); Q16ACC_AS(xr5,xr11,xr0,xr6); Q16SLL(xr1,xr2,xr4,xr3,3); //xr1:src[9]<<3 src[8]<<3 xr3:src[25]<<3 src[24]<<3 Q16SLL(xr7,xr2,xr4,xr8,1); //xr7:src[9]<<1 src[8]<<1 xr8:src[25]<<1 src[24]<<1 Q16ADD_AS_WW(xr9,xr1,xr7,xr0); //xr9:pt5 t5 Q16ADD_AS_WW(xr10,xr3,xr8,xr0);//xr10:pt6 t6 Q16SLL(xr1,xr1,xr3,xr3,1); //xr1:src[9]<<16 src[8]<<16 xr3:src[25]<<16 src[24]<<16 Q16SLL(xr2,xr2,xr4,xr4,2); //xr2:src[9]<<2 src[8]<<2 xr4:src[25]<<2 src[24]<<2 Q16ADD_AS_WW(xr1,xr1,xr2,xr0); Q16ADD_AS_WW(xr3,xr3,xr4,xr0); Q16ADD_AS_WW(xr1,xr1,xr7,xr0); //xr1:pt3 t3 Q16ADD_AS_WW(xr3,xr3,xr8,xr0); //xr3:pt4 t4 Q16ADD_AS_WW(xr5,xr5,xr1,xr1); //xr5:pt1+pt3 t1+t3 xr1:pt1-pt3 t1-t3 Q16ADD_AS_WW(xr6,xr6,xr3,xr3); //xr6:pt2+pt4 t2+t4 xr3:pt2-pt4 t2-t4 Q16ADD_AS_WW(xr5,xr5,xr10,xr0); //xr5:pt1+pt3+pt6 t1+t3+t6 Q16ADD_AS_WW(xr0,xr1,xr10,xr1); //xr1:pt1-pt3-pt6 t1-t3-t6 Q16ADD_AS_WW(xr3,xr3,xr9,xr0); //xr3:pt2-pt4+pt5 t2-t4+t5 Q16ADD_AS_WW(xr0,xr6,xr9,xr6); //xr6:pt2+pt4-pt5 t2+t4-t5 Q16SAR(xr5,xr5,xr1,xr1,7); Q16SAR(xr3,xr3,xr6,xr6,7); S32STD(xr5,dst,0x00); S32STD(xr3,dst,0x10); S32STD(xr6,dst,0x20); S32STD(xr1,dst,0x30); src +=2; dst +=2; }}#else/** Do inverse transform on 4x4 part of block*/static void vc1_inv_trans_4x4_c(DCTELEM block[64], int n){ int i; register int t1,t2,t3,t4,t5,t6; DCTELEM *src, *dst; int off; off = (n&1) * 4 + (n&2) * 16; src = block + off; dst = block + off; for(i = 0; i < 4; i++){ t1 = 17 * (src[0] + src[2]); t2 = 17 * (src[0] - src[2]); t3 = 22 * src[1]; t4 = 22 * src[3]; t5 = 10 * src[1]; t6 = 10 * src[3]; dst[0] = (t1 + t3 + t6 + 4) >> 3; dst[1] = (t2 - t4 + t5 + 4) >> 3; dst[2] = (t2 + t4 - t5 + 4) >> 3; dst[3] = (t1 - t3 - t6 + 4) >> 3; src += 8; dst += 8; } src = block + off; dst = block + off; for(i = 0; i < 4; i++){ t1 = 17 * (src[ 0] + src[16]); t2 = 17 * (src[ 0] - src[16]); t3 = 22 * src[ 8]; t4 = 22 * src[24]; t5 = 10 * src[ 8]; t6 = 10 * src[24]; dst[ 0] = (t1 + t3 + t6 + 64) >> 7; dst[ 8] = (t2 - t4 + t5 + 64) >> 7; dst[16] = (t2 + t4 - t5 + 64) >> 7; dst[24] = (t1 - t3 - t6 + 64) >> 7; src ++; dst ++; }}#endif/* motion compensation functions *//** Filter in case of 2 filters */#define VC1_MSPEL_FILTER_16B(DIR, TYPE) \static av_always_inline int vc1_mspel_ ## DIR ## _filter_16bits(const TYPE *src, int stride, int mode) \{\ switch(mode){ \ case 0: /* no shift - should not occur */ \ return 0; \ case 1: /* 1/4 shift */ \ return -4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2]; \ case 2: /* 1/2 shift */ \ return -src[-stride] + 9*src[0] + 9*src[stride] - src[stride*2]; \ case 3: /* 3/4 shift */ \ return -3*src[-stride] + 18*src[0] + 53*src[stride] - 4*src[stride*2]; \ } \ return 0; /* should not occur */ \}\VC1_MSPEL_FILTER_16B(ver, uint8_t);VC1_MSPEL_FILTER_16B(hor, int16_t);/** Filter used to interpolate fractional pel values */static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride, int mode, int r){ switch(mode){ case 0: //no shift return src[0]; case 1: // 1/4 shift return (-4*src[-stride] + 53*src[0] + 18*src[stride] - 3*src[stride*2] + 32 - r) >> 6; case 2: // 1/2 shift return (-src[-stride] + 9*src[0] + 9*src[stride] - src[stride*2] + 8 - r) >> 4; case 3: // 3/4 shift return (-3*src[-stride] + 18*src[0] + 53*src[stride] - 4*src[stride*2] + 32 - r) >> 6; } return 0; //should not occur}/** Function used to do motion compensation with bicubic interpolation */static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, int hmode, int vmode, int rnd){ int i, j; if (vmode) { /* Horizontal filter to apply */ int r; if (hmode) { /* Vertical filter to apply, output to tmp */ static const int shift_value[] = { 0, 5, 1, 5 }; int shift = (shift_value[hmode]+shift_value[vmode])>>1; int16_t tmp[11*8], *tptr = tmp; r = (1<<(shift-1)) + rnd-1; src -= 1; for(j = 0; j < 8; j++) { for(i = 0; i < 11; i++) tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode)+r)>>shift; src += stride; tptr += 11; } r = 64-rnd; tptr = tmp+1; for(j = 0; j < 8; j++) { for(i = 0; i < 8; i++) dst[i] = av_clip_uint8((vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode)+r)>>7); dst += stride; tptr += 11; } return; } else { /* No horizontal filter, output 8 lines to dst */ r = 1-rnd; for(j = 0; j < 8; j++) { for(i = 0; i < 8; i++) dst[i] = av_clip_uint8(vc1_mspel_filter(src + i, stride, vmode, r)); src += stride; dst += stride; } return; } } /* Horizontal mode with no vertical mode */ for(j = 0; j < 8; j++) { for(i = 0; i < 8; i++) dst[i] = av_clip_uint8(vc1_mspel_filter(src + i, 1, hmode, rnd)); dst += stride; src += stride; }}/* pixel functions - really are entry points to vc1_mspel_mc *//* this one is defined in dsputil.c */void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);/*#define PUT_VC1_MSPEL(a, b)\static void put_vc1_mspel_mc ## a ## b ##_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ vc1_mspel_mc(dst, src, stride, a, b, rnd); \}*///PUT_VC1_MSPEL(1, 0)#ifdef JZ4740_MXU_OPTstatic void put_vc1_mspel_mc10_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ uint32_t i,j; uint32_t src_aln0,src_aln1,src_rs0,src_rs1,src_rs2,src_rs3; int mul_ct0,mul_ct1,mul_ct2,mul_ct3; int ct0,rnd1; mul_ct1=0x12121212; //18 18 18 18 mul_ct0=0x35353535; //53 53 53 53 mul_ct3=0x03030303; //3 3 3 3 mul_ct2=0x04040404; //4 4 4 4 ct0 = 0x00200020; //32 32 rnd1 = rnd * 0x00010001; src_aln0 = (((uint32_t)src-1) & 0xFFFFFFFC); src_aln1 = (((uint32_t)src+1) & 0xFFFFFFFC); src_rs0 = 4-(((uint32_t)src - 1) & 3); src_rs1 = src_rs0-1; src_rs2 = 4-(((int)src + 1) & 3); src_rs3 = src_rs2-1; S32I2M(xr15,mul_ct0); //xr15:18 18 18 18 S32I2M(xr14,mul_ct1); //xr14:53 53 53 53 S32I2M(xr11,mul_ct2); //xr11:3 3 3 3 S32I2M(xr10,mul_ct3); //xr10:4 4 4 4 S32I2M(xr13,ct0); //xr13: 32 32 S32I2M(xr12,rnd1); //xr12:rnd rnd for(j=0;j<8;j++){ S32LDD(xr1,src_aln0,0); S32LDD(xr2,src_aln0,4); S32LDD(xr5,src_aln1,0); S32LDD(xr6,src_aln1,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[2] src[1] src[0] src[-1] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[3] src[2] src[1] src[0] S32ALN(xr7,xr6,xr5,src_rs2); //xr7:src[4] src[3] src[2] src[1] S32ALN(xr8,xr6,xr5,src_rs3); //xr8:src[5] src[4] src[3] src[2] Q8MUL(xr5,xr4,xr15,xr6); //xr5:18*src[3] 18*src[2] xr6:18*src[1] 18*src[0] Q8MAC_SS(xr5,xr3,xr11,xr6);//xr5:18*src[3]-3*src[2] 18*src[2]-3*src[1] xr6:18*src[1]-3*src[0] 18*src[0]-3*src[-1] Q8MUL(xr9,xr7,xr14,xr1); //xr9:53*src[4] 53*src[3] xr1:53*src[2] 53*src[1] Q8MAC_SS(xr9,xr8,xr10,xr1); //xr9:53*src[4]-4*src[5] 53*src[3]-4*src[4] xr1:53*src[2]-4*src[3] 53*src[1]-4*src[2] Q16ADD_AA_WW(xr0,xr5,xr9,xr5); //xr5:dst[3] 18*src[3]-3*src[2]+53*src[4]-4*src[5] dst[2] 18*src[2]-3*src[1]+53*src[3]-4*src[4] Q16ADD_AA_WW(xr0,xr6,xr1,xr6); //xr6:dst[1] 18*src[1]-3*src[0]+53*src[2]-4*src[3] dst[0] 18*src[0]-3*src[-1]+53*src[1]-4src[2] Q16ACC_SS(xr5,xr13,xr12,xr6); //two instructions with one Q16SAR(xr5,xr5,xr6,xr6,6); Q16SAT(xr5,xr5,xr6); //xr5:dst[3]dst[2]dst[1]dst[0] S32LDI(xr1,src_aln0,4); S32LDD(xr2,src_aln0,4); S32ALN(xr3,xr2,xr1,src_rs0); //xr3:src[6] src[5] src[4] src[3] S32ALN(xr4,xr2,xr1,src_rs1); //xr4:src[7] src[6] src[5] src[4] S32LDI(xr1,src_aln1,4); S32LDD(xr2,src_aln1,4); S32ALN(xr7,xr2,xr1,src_rs2); //xr7:src[8] src[7] src[6] src[5] S32ALN(xr8,xr2,xr1,src_rs3); //xr8:src[9] src[8] src[7] src[6] Q8MUL(xr1,xr4,xr15,xr2); //xr1:18*src[7] 18*src[6] xr2:18*src[5] 18*src[4] Q8MAC_SS(xr1,xr3,xr11,xr2);//xr1:18*src[7]-3*src[6] 18*src[6]-3*src[5] xr2:18*src[5]-3*src[4] 18*src[4]-3*src[3] Q8MUL(xr9,xr7,xr14,xr3); //xr9:53*src[8] 53*src[7] xr3:53*src[6] 53*src[5] Q8MAC_SS(xr9,xr8,xr10,xr3); //xr9:53*src[8]-4*src[9] 53*src[7]-4*src[8] xr3:53*src[6]-4*src[7] 53*src[5]-4*src[6] Q16ADD_AA_WW(xr0,xr1,xr9,xr1); //xr1:dst[3] 18*src[7]-3*src[6]+53*src[8]-4*src[9] dst[2] 18*src[6]-3*src[5]+53*src[7]-4*src[8] Q16ADD_AA_WW(xr0,xr2,xr3,xr2); //xr2:dst[1] 18*src[5]-3*src[4]+53*src[6]-4*src[7] dst[0] 18*src[4]-3*src[3]53+*src[5]-4*src[6] Q16ACC_SS(xr1,xr13,xr12,xr2); Q16SAR(xr1,xr1,xr2,xr2,6); Q16SAT(xr1,xr1,xr2); //xr1:dst[7]dst[6]dst[5]dst[4] S32STD(xr5,dst,0); S32STD(xr1,dst,4); dst+=stride; src_aln0+=(stride-4); src_aln1+=(stride-4);}}#elsestatic void put_vc1_mspel_mc10_c(uint8_t *dst,const uint8_t *src,int stride,int rnd){ int j; for(j = 0; j < 8; j++) { dst[j*stride+0]=av_clip_uint8((-4 * src[j*stride-1] + 53 * src[j*stride+0] + 18 * src[j*stride+1] - 3 * src[j*stride+2] + 32 - rnd)>>6); dst[j*stride+1]=av_clip_uint8((-4 * src[j*stride+0] + 53 * src[j*stride+1] + 18 * src[j*stride+2] - 3 * src[j*stride+3] + 32 - rnd)>>6); dst[j*stride+2]=av_clip_uint8((-4 * src[j*stride+1] + 53 * src[j*stride+2]+ 18 * src[j*stride+3] - 3 * src[j*stride+4] + 32 - rnd)>>6); dst[j*stride+3]=av_clip_uint8((-4 * src[j*stride+2] + 53 * src[j*stride+3] + 18 * src[j*stride+4] -3 * src[j*stride+5] + 32 - rnd)>>6); dst[j*stride+4]=av_clip_uint8((-4 * src[j*stride+3] + 53 * src[j*stride+4] + 18 * src[j*stride+5] -3 * src[j*stride+6] + 32 - rnd)>>6); dst[j*stride+5]=av_clip_uint8((-4 * src[j*stride+4] + 53 * src[j*stride+5]+ 18 * src[j*stride+6] - 3 * src[j*stride+7] + 32 - rnd)>>6); dst[j*stride+6]=av_clip_uint8((-4 * src[j*stride+5] + 53 * src[j*stride+6]+ 18 * src[j*stride+7] - 3 * src[j*stride+8] + 32 - rnd)>>6); dst[j*stride+7]=av_clip_uint8((-4 * src[j*stride+6] + 53 * src[j*stride+7]+ 18 * src[j*stride+8] - 3 * src[j*stride+9] + 32 - rnd)>>6);}}#endif/*static void put_vc1_mspel_mc10_c(uint8_t *dst,const uint8_t *src,int stride,int rnd)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -