📄 h264dsputil.c
字号:
}\\static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\ uint8_t full[SIZE*(SIZE+5)];\ uint8_t * const full_mid= full + SIZE*2;\ int16_t tmp[SIZE*(SIZE+5)];\ uint8_t halfV[SIZE*SIZE];\ uint8_t halfHV[SIZE*SIZE];\ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\}\#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)#define op_put(a, b) a = cm[((b) + 16)>>5]#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)#define op2_put(a, b) a = cm[((b) + 512)>>10]#ifdef JZ4740_MXU_OPTstatic void put_h264_qpel4_h_lowpass_mxu(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ uint32_t src_aln1,src_aln2; uint32_t src_rs1,src_rs2; int i; uint32_t mul_1 = 0x14140501; //(20,20,5,1) uint32_t mul_2 = 0x01050105; //(1,5,1,5) uint32_t round = 0x100010; //(16,16) src_aln1 = ((uint32_t)src - 2) & 0xfffffffc; src_aln2 = (uint32_t)src & 0xfffffffc; src_rs1 = 4 - (((uint32_t)src - 2) & 3); src_rs2 = 4 - ((uint32_t)src & 3); S32I2M(xr15,mul_2); S32I2M(xr14,mul_1); S32I2M(xr13,round); dst -= dstStride; for(i=0; i<4; i++) { //2 pixel S32LDD(xr1,src_aln1,0); S32LDD(xr2,src_aln1,4); S32LDD(xr3,src_aln1,8); src_aln1 += srcStride; i_pref(0, src_aln1, 0); //for pixel[0] S32ALN(xr4,xr2,xr1,src_rs1); //xr4 <- src[-2] ~ src[1] S32ALN(xr5,xr3,xr2,src_rs1); //xr5 <- src[2] ~ src[5] //for pixel[1] S32ALN(xr6,xr5,xr4,3); //xr6 <- src[-1] ~ src[2] D32SLR(xr1,xr5,xr0,xr0,8); S32SFL(xr0,xr1,xr5,xr7,ptn3); //xr7 <- src[4],src[3],src[3],src[2] Q8MUL(xr2,xr4,xr14,xr1); //xr2 <- 20*src[1],20*src[0] xr1 <- 5*src[-1],src[-2] Q8MUL(xr8,xr7,xr15,xr7); //xr8 <- src[4],5*src[3] xr7 <- src[3],5*src[2] Q8MUL(xr4,xr6,xr14,xr3); //xr4 <- 20*src[2],20*src[1] xr3 <- 5*src[0],src[-1] Q16ADD_AA_XW(xr2,xr2,xr2,xr0); //xr2 <- 20*src[1] + 20*src[0] Q16ADD_SS_XW(xr0,xr1,xr1,xr1); //xr1 <- src[-2] - 5*src[-1] Q16ADD_SS_XW(xr0,xr7,xr7,xr7); //xr7 <- src[3] - 5*src[2] S32SFL(xr1,xr0,xr1,xr0,ptn3); //xr1 <- src[-2] - 5*src[-1] Q16ACC_AA(xr0,xr2,xr7,xr1); //xr1 <- 20*src[1] + 20*src[0] + src[-2] - 5*src[-1] + src[3] - 5*src[2] Q16ADD_AA_XW(xr4,xr4,xr4,xr0); //xr4 <- 20*src[1] + 20*src[2] Q16ADD_SS_XW(xr3,xr3,xr3,xr0); //xr3 <- src[-1] - 5*src[0] Q16ADD_SS_XW(xr0,xr8,xr8,xr8); //xr8 <- src[4] - 5*src[3] S32SFL(xr3,xr0,xr3,xr0,ptn3); //xr3 <- src[-1] - 5*src[0] Q16ACC_AA(xr0,xr4,xr8,xr3); //xr3 <- 20*src[1] + 20*src[2] + src[-1] - 5*src[0] + src[4] - 5*src[3] S32SFL(xr0,xr3,xr1,xr1,ptn3); Q16ADD_AA_WW(xr0,xr1,xr13,xr11); //xr11 <- tow new pixel //2 pixel S32LDD(xr1,src_aln2,0); S32LDD(xr2,src_aln2,4); S32LDD(xr3,src_aln2,8); src_aln2 += srcStride; i_pref(0, src_aln2, 0); //for pixel[0] S32ALN(xr4,xr2,xr1,src_rs2); //xr4 <- src[-2] ~ src[1] S32ALN(xr5,xr3,xr2,src_rs2); //xr5 <- src[2] ~ src[5] //for pixel[1] S32ALN(xr6,xr5,xr4,3); //xr6 <- src[-1] ~ src[2] D32SLR(xr1,xr5,xr0,xr0,8); S32SFL(xr0,xr1,xr5,xr7,ptn3); //xr7 <- src[4],src[3],src[3],src[2] Q8MUL(xr2,xr4,xr14,xr1); //xr2 <- 20*src[1],20*src[0] xr1 <- 5*src[-1],src[-2] Q8MUL(xr8,xr7,xr15,xr7); //xr8 <- src[4],5*src[3] xr7 <- src[3],5*src[2] Q8MUL(xr4,xr6,xr14,xr3); //xr4 <- 20*src[2],20*src[1] xr3 <- 5*src[0],src[-1] Q16ADD_AA_XW(xr2,xr2,xr2,xr0); //xr2 <- 20*src[1] + 20*src[0] Q16ADD_SS_XW(xr0,xr1,xr1,xr1); //xr1 <- src[-2] - 5*src[-1] Q16ADD_SS_XW(xr0,xr7,xr7,xr7); //xr7 <- src[3] - 5*src[2] S32SFL(xr1,xr0,xr1,xr0,ptn3); //xr1 <- src[-2] - 5*src[-1] Q16ACC_AA(xr0,xr2,xr7,xr1); //xr1 <- 20*src[1] + 20*src[0] + src[-2] - 5*src[-1] + src[3] - 5*src[2] Q16ADD_AA_XW(xr4,xr4,xr4,xr0); //xr4 <- 20*src[1] + 20*src[2] Q16ADD_SS_XW(xr3,xr3,xr3,xr0); //xr3 <- src[-1] - 5*src[0] Q16ADD_SS_XW(xr0,xr8,xr8,xr8); //xr8 <- src[4] - 5*src[3] S32SFL(xr3,xr0,xr3,xr0,ptn3); //xr3 <- src[-1] - 5*src[0] Q16ACC_AA(xr0,xr4,xr8,xr3); //xr3 <- 20*src[1] + 20*src[2] + src[-1] - 5*src[0] + src[4] - 5*src[3] S32SFL(xr0,xr3,xr1,xr1,ptn3); Q16ADD_AA_WW(xr0,xr1,xr13,xr12); //xr12 <- tow new pixel // >>5 and clip to (0~255) Q16SAR(xr12,xr12,xr11,xr11,5); Q16SAT(xr1,xr12,xr11); //xr1 <- dst[3],dst[2],dst[1],dst[0] S32SDIV(xr1,dst,dstStride,0); }}static void put_h264_qpel4_v_lowpass_mxu(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ uint32_t src_aln,src_aln0, src_al4; uint32_t src_rs; int i; uint32_t mul_20 = 0x14141414; //(20,20,20,20) uint32_t mul_5 = 0x05050505; //(5,5,5,5) uint32_t round = 0x00100010; //(0,16,0,16) src_aln = ((uint32_t)src - (srcStride<<1)) & 0xfffffffc; src_aln0 = src_aln - (srcStride<<1); src_rs = 4 - (((uint32_t)src - (srcStride<<1)) & 3); S32I2M(xr15,mul_20); S32I2M(xr14,mul_5); S32I2M(xr13,round); dst -= dstStride; for(i=0; i<4; i++) { src_aln0 += srcStride; src_aln = src_aln0; src_al4 = src_aln0 + 4; //first line S32LDIV(xr1, src_aln, srcStride, 0); S32LDIV(xr2, src_al4, srcStride, 0); //second line S32LDIV(xr3, src_aln, srcStride, 0); S32LDIV(xr4, src_al4, srcStride, 0); //ALN 1st and 2nd line 4 pixel S32ALN(xr1,xr2,xr1,src_rs); S32ALN(xr2,xr4,xr3,src_rs); //third line S32LDIV(xr3, src_aln, srcStride, 0); S32LDIV(xr4, src_al4, srcStride, 0); Q8MUL(xr10,xr2,xr14,xr9); //2nd 4 pixel mul 5 <--here //ALN 3rd line 4 pixel S32ALN(xr3,xr4,xr3,src_rs); Q8MUL(xr8,xr3,xr15,xr7); //3rd 4 pixel mul 20 //fourth line S32LDIV(xr3, src_aln, srcStride, 0); S32LDIV(xr4, src_al4, srcStride, 0); S32LDIV(xr2, src_aln, srcStride, 0); // <--here S32LDIV(xr5, src_al4, srcStride, 0); //ALN 4th line 4 pixel S32ALN(xr4,xr4,xr3,src_rs); //fiveth line //ALN 5th line 4 pixel S32ALN(xr5,xr5,xr2,src_rs); S32LDIV(xr11, src_aln, srcStride, 0); // <--here S32LDIV(xr12, src_al4, srcStride, 0); Q8MUL(xr6,xr5,xr14,xr5); //5th 4 pixel mul 5 Q8MUL(xr4,xr4,xr15,xr3); //4th 4 pixel mul 20 //sixth line //ALN 6th line 4 pixel S32ALN(xr2,xr12,xr11,src_rs); //1st line + 6th line Q8ADDE_AA(xr2,xr1,xr2,xr1); //1st line + 6th line + 3rd line - 2nd line Q16ACC_SS(xr2,xr8,xr10,xr0); Q16ACC_SS(xr1,xr7,xr9,xr0); //1st line + 6th line + 3rd line - 2nd line + 4th line - 5th line Q16ACC_SS(xr2,xr4,xr6,xr0); Q16ACC_SS(xr1,xr3,xr5,xr0); //+ round Q16ADD_AA_WW(xr2,xr2,xr13,xr0); Q16ADD_AA_WW(xr1,xr1,xr13,xr0); // >>5 and clip to (0~255) Q16SAR(xr2,xr2,xr1,xr1,5); Q16SAT(xr1,xr2,xr1); //xr1 <- dst[3],dst[2],dst[1],dst[0] S32SDIV(xr1, dst, dstStride, 0); }}static void put_h264_qpel4_hv_lowpass_mxu(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ uint32_t src_aln[2],tmp_h,tmp_v; uint32_t src_rs[2]; int i,j; uint32_t mul_1 = 0x14140501; //(20,20,5,1) uint32_t mul_2 = 0x01050105; //(1,5,1,5) uint32_t mul_20 = 0x00140014; //(0,20,0,20) uint32_t mul_5 = 0x00050005; //(0,5,0,5) uint32_t round = 0x200; //512 tmp_h = (uint32_t)tmp; src_aln[0] = ((uint32_t)src - 2*srcStride - 2) & 0xfffffffc; src_aln[1] = ((uint32_t)src - 2*srcStride) & 0xfffffffc; src_rs[0] = 4 - (((uint32_t)src - 2) & 3); src_rs[1] = 4 - ((uint32_t)src & 3); S32I2M(xr15,mul_2); S32I2M(xr14,mul_1); tmp_h -= 4; for(i=0; i<9; i++) { for(j=0; j<2; j++) { //2 pixel S32LDD(xr1,src_aln[j],0); S32LDD(xr2,src_aln[j],4); S32LDD(xr3,src_aln[j],8); src_aln[j] += srcStride; i_pref(0, src_aln[j], 0); //for pixel[0] S32ALN(xr4,xr2,xr1,src_rs[j]); //xr4 <- src[-2] ~ src[1] S32ALN(xr5,xr3,xr2,src_rs[j]); //xr5 <- src[2] ~ src[5] //for pixel[1] S32ALN(xr6,xr5,xr4,3); //xr6 <- src[-1] ~ src[2] D32SLR(xr1,xr5,xr0,xr0,8); S32SFL(xr0,xr1,xr5,xr7,ptn3); //xr7 <- src[4],src[3],src[3],src[2] Q8MUL(xr2,xr4,xr14,xr1); //xr2 <- 20*src[1],20*src[0] xr1 <- 5*src[-1],src[-2] Q8MUL(xr8,xr7,xr15,xr7); //xr8 <- src[4],5*src[3] xr7 <- src[3],5*src[2] Q8MUL(xr4,xr6,xr14,xr3); //xr4 <- 20*src[2],20*src[1] xr3 <- 5*src[0],src[-1] Q16ADD_AA_XW(xr2,xr2,xr2,xr0); //xr2 <- 20*src[1] + 20*src[0] Q16ADD_SS_XW(xr0,xr1,xr1,xr1); //xr1 <- src[-2] - 5*src[-1] Q16ADD_SS_XW(xr0,xr7,xr7,xr7); //xr7 <- src[3] - 5*src[2] S32SFL(xr1,xr0,xr1,xr0,ptn3); //xr1 <- src[-2] - 5*src[-1] Q16ACC_AA(xr0,xr2,xr7,xr1); //xr1 <- 20*src[1] + 20*src[0] + src[-2] - 5*src[-1] + src[3] - 5*src[2] Q16ADD_AA_XW(xr4,xr4,xr4,xr0); //xr4 <- 20*src[1] + 20*src[2] Q16ADD_SS_XW(xr3,xr3,xr3,xr0); //xr3 <- src[-1] - 5*src[0] Q16ADD_SS_XW(xr0,xr8,xr8,xr8); //xr8 <- src[4] - 5*src[3] S32SFL(xr3,xr0,xr3,xr0,ptn3); //xr3 <- src[-1] - 5*src[0] Q16ACC_AA(xr0,xr4,xr8,xr3); //xr3 <- 20*src[1] + 20*src[2] + src[-1] - 5*src[0] + src[4] - 5*src[3] S32SFL(xr0,xr3,xr1,xr1,ptn3); S32SDI(xr1, tmp_h, 4); } tmp_h += tmpStride - 8; } S32I2M(xr15,mul_20); S32I2M(xr14,mul_5); S32I2M(xr13,round); S32I2M(xr11,0x10001); tmp_v = (uint32_t)tmp - (tmpStride << 1); for(i=0; i<4; i++) { tmp_v += tmpStride; for(j=0; j<2; j++) { tmp_h = tmp_v + (j<<2); //1st line 2 pixel S32LDIV(xr1, tmp_h, tmpStride, 0); //2nd line 2 pixel S32LDIV(xr2, tmp_h, tmpStride, 0); //3rd line 2 pixel S32LDIV(xr3, tmp_h, tmpStride, 0); //2line*5; 3line*20 D16MUL_WW(xr10,xr2,xr14,xr9); //2nd 2 pixel mul 5 D16MUL_WW(xr8,xr3,xr15,xr7); //3rd 2 pixel mul 20 //4th line 2 pixel S32LDIV(xr4, tmp_h, tmpStride, 0); //5th line 2 pixel S32LDIV(xr5, tmp_h, tmpStride, 0); S32LDDV(xr12, tmp_h, tmpStride, 0); //5line*5; 4line*20 D16MUL_WW(xr3,xr5,xr14,xr2); //5th 2 pixel mul 5 D16MUL_WW(xr5,xr4,xr15,xr4); //4th 2 pixel mul 20 //6st line 2 pixel //1st line + 6th line Q16ADD_AA_WW(xr12,xr1,xr12,xr0); D16MUL_WW(xr12,xr12,xr11,xr1); //1st line + 6th line + 3rd line - 2nd line D32ACC_SS(xr12,xr8,xr10,xr0); D32ACC_SS(xr1,xr7,xr9,xr0); //1st line + 6th line + 3rd line - 2nd line + 4th line - 5th line D32ACC_SS(xr12,xr5,xr3,xr0); D32ACC_SS(xr1,xr4,xr2,xr0); //+ round D32ADD_AA(xr12,xr12,xr13,xr0); D32ADD_AA(xr1,xr1,xr13,xr0); //right shift D32SAR(xr2,xr12,xr1,xr1,10); S32SFL(xr0, xr2, xr1, xr1, ptn3); Q16SAT(xr1, xr0, xr1); *((uint16_t*)dst) = S32M2I(xr1); dst += 2; } dst
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -