📄 vc1dsp.c
字号:
/* * VC-1 and WMV3 decoder - DSP functions * Copyright (c) 2006 Konstantin Shishkov * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *//*** @file vc1dsp.c * VC-1 and WMV3 decoder * */#define JZ4740_MXU_OPT#ifdef JZ4740_MXU_OPT#include "jzmedia.h"#endif#include "dsputil.h"#undef printf/** Apply overlap transform to horizontal edge*/#define W1 0x4 //4#define W2 0x6 //6#define W3 0x9 //9#define W4 0xC //12#define W5 0xF //15#define W6 0x10 //16#define W7 0x11 //17#define W8 0x16 //22#define W9 0xA //10static void vc1_v_overlap_c(uint8_t* src, int stride){ int i; int a, b, c, d; int d1, d2; int rnd = 1; for(i = 0; i < 8; i++) { a = src[-2*stride]; b = src[-stride]; c = src[0]; d = src[stride]; d1 = (a - d + 3 + rnd) >> 3; d2 = (a - d + b - c + 4 - rnd) >> 3; src[-2*stride] = a - d1; src[-stride] = b - d2; src[0] = c + d2; src[stride] = d + d1; src++; rnd = !rnd; }}/** Apply overlap transform to vertical edge*/static void vc1_h_overlap_c(uint8_t* src, int stride){ int i; int a, b, c, d; int d1, d2; int rnd = 1; for(i = 0; i < 8; i++) { a = src[-2]; b = src[-1]; c = src[0]; d = src[1]; d1 = (a - d + 3 + rnd) >> 3; d2 = (a - d + b - c + 4 - rnd) >> 3; src[-2] = a - d1; src[-1] = b - d2; src[0] = c + d2; src[1] = d + d1; src += stride; rnd = !rnd; }}/** Do inverse transform on 8x8 
block*/
#ifdef JZ4740_MXU_OPT
/**
 * In-place 8x8 inverse transform (VC-1), Ingenic JZ4740 MXU SIMD version.
 * Pass 1 transforms each of the 8 rows (round +4, >>3); pass 2 transforms
 * the columns two at a time (>>7, rounder 64 for the upper four output rows
 * and 65 for the lower four, matching the scalar version below).
 * Register-content comments are inherited from the original author.
 */
static void vc1_inv_trans_8x8_c(DCTELEM block[64])
{
    int i;
    DCTELEM *src, *dst;
    src = block;
    dst = block;
    /* Load the transform coefficients as packed 16-bit pairs. */
    S32I2M(xr15,W4<<16|W4);             //xr15: 12|12
    S32I2M(xr14,W6<<16|W2);             //xr14: 16|6
    S32I2M(xr13,W6<<16|W5);             //xr13: 16|15
    S32I2M(xr12,W3<<16|W1);             //xr12:  9|4
    /* Pass 1: row (horizontal) transform. */
    for(i=0;i<8;i++) {
        S32LDD(xr1,src,0x0);            //xr1: src[1] src[0]
        S32LDD(xr2,src,0x4);            //xr2: src[3] src[2]
        S32LDD(xr3,src,0x8);            //xr3: src[5] src[4]
        S32LDD(xr4,src,0xc);            //xr4: src[7] src[6]
        /* Even part from src[0,2,4,6]. */
        D16MUL_LW(xr5,xr1,xr15,xr6);    //xr5, xr6: 12*src[0]
        D16MAC_AS_LW(xr5,xr3,xr15,xr6); //xr5: t1 = 12*(src[0]+src[4])  xr6: t2 = 12*(src[0]-src[4])
        D16MUL_LW(xr7,xr2,xr14,xr8);    //xr7: 16*src[2]  xr8: 6*src[2]
        D16MAC_SA_LW(xr8,xr4,xr14,xr7); //xr8: t4 = 6*src[2]-16*src[6]  xr7: t3 = 16*src[2]+6*src[6]
        D32ADD_AS(xr5,xr5,xr7,xr7);     //xr5: t5 = t1+t3  xr7: t8 = t1-t3
        D32ADD_AS(xr6,xr6,xr8,xr8);     //xr6: t6 = t2+t4  xr8: t7 = t2-t4
        S32I2M(xr9,4);                  // +4 rounding term for the >>3 below
        D32ACC_AS(xr5,xr9,xr0,xr7);
        D32ACC_AS(xr6,xr9,xr0,xr8);
        /* Odd part from src[1,3,5,7]. */
        D16MUL_HW(xr9,xr1,xr13,xr10);   //xr9: 16*src[1]  xr10: 15*src[1]
        D16MUL_HW(xr11,xr1,xr12,xr1);   //xr11: 9*src[1]  xr1:  4*src[1]
        D16MAC_SA_HW(xr11,xr2,xr13,xr9);  //xr11: 9*src[1]-16*src[3]  xr9: 16*src[1]+15*src[3]
        D16MAC_SS_HW(xr1,xr2,xr12,xr10);  //xr1:  4*src[1]- 9*src[3]  xr10: 15*src[1]-4*src[3]
        D16MAC_SA_HW(xr10,xr3,xr13,xr1);  //xr10: ...-16*src[5]       xr1:  ...+15*src[5]
        D16MAC_AA_HW(xr9,xr3,xr12,xr11);  //xr9:  ...+ 9*src[5]       xr11: ...+ 4*src[5]
        D16MAC_SA_HW(xr1,xr4,xr13,xr11);  //xr1: t4 = 4*s1-9*s3+15*s5-16*s7  xr11: t3 = 9*s1-16*s3+4*s5+15*s7
        D16MAC_SA_HW(xr10,xr4,xr12,xr9);  //xr10: t2 = 15*s1-4*s3-16*s5-9*s7 xr9:  t1 = 16*s1+15*s3+9*s5+4*s7
        /* Butterfly and scale: dst[k] = (even ± odd + 4) >> 3. */
        D32ADD_AS(xr5,xr5,xr9,xr9);     //xr5: t5+t1  xr9:  t5-t1
        D32ADD_AS(xr6,xr6,xr10,xr10);   //xr6: t6+t2  xr10: t6-t2
        D32ADD_AS(xr8,xr8,xr11,xr11);   //xr8: t7+t3  xr11: t7-t3
        D32ADD_AS(xr7,xr7,xr1,xr1);     //xr7: t8+t4  xr1:  t8-t4
        D32SARL(xr5,xr6,xr5,3);
        D32SARL(xr6,xr7,xr8,3);
        D32SARL(xr7,xr11,xr1,3);
        D32SARL(xr8,xr9,xr10,3);
        S32STD(xr5,dst,0x0);
        S32STD(xr6,dst,0x4);
        S32STD(xr7,dst,0x8);
        S32STD(xr8,dst,0xc);
        src+=8;
        dst+=8;
    }
    src = block;
    dst = block;
    /* Pass 2: column (vertical) transform, two columns per iteration. */
    for(i = 0; i < 4; i++){
        S32LDD(xr1,src,0x00);           //xr1: src[1]  src[0]   (row 0)
        S32LDD(xr2,src,0x10);           //xr2: src[9]  src[8]   (row 1)
        S32LDD(xr3,src,0x20);           //xr3: src[17] src[16]  (row 2)
        S32LDD(xr4,src,0x30);           //xr4: src[25] src[24]  (row 3)
        S32LDD(xr5,src,0x40);           //xr5: src[33] src[32]  (row 4)
        S32LDD(xr6,src,0x50);           //xr6: src[41] src[40]  (row 5)
        S32LDD(xr7,src,0x60);           //xr7: src[49] src[48]  (row 6)
        S32LDD(xr8,src,0x70);           //xr8: src[57] src[56]  (row 7)
        /* Even part: 12*(s0±s32) built as (x<<3)+(x<<2). */
        Q16ADD_AS_WW(xr9,xr1,xr5,xr10);
        Q16SLL(xr11,xr9,xr10,xr12,3);
        Q16SLL(xr13,xr9,xr10,xr14,2);
        Q16ADD_AS_WW(xr11,xr11,xr13,xr0);   //xr11: t1 = 12*(s0+s32)
        Q16ADD_AS_WW(xr12,xr12,xr14,xr0);   //xr12: t2 = 12*(s0-s32)
        /* 16*s16+6*s48 and 6*s16-16*s48 from shifts: 6*x = (x<<3)-(x<<1). */
        Q16SLL(xr9,xr3,xr7,xr10,4);
        Q16SLL(xr13,xr3,xr7,xr14,3);
        Q16SLL(xr15,xr3,xr7,xr1,1);
        Q16ADD_AS_WW(xr9,xr9,xr14,xr0);
        Q16ADD_AS_WW(xr0,xr9,xr1,xr9);      //xr9:  t3 = 16*s16+6*s48
        Q16ADD_AS_WW(xr0,xr13,xr15,xr13);
        Q16ADD_AS_WW(xr0,xr13,xr10,xr13);   //xr13: t4 = 6*s16-16*s48
        Q16ADD_AS_WW(xr11,xr11,xr9,xr9);    //xr11: t5 = t1+t3  xr9:  t8 = t1-t3
        Q16ADD_AS_WW(xr12,xr12,xr13,xr13);  //xr12: t6 = t2+t4  xr13: t7 = t2-t4
        /* Odd part from rows 1,3,5,7 (src[8,24,40,56]). */
        Q16ADD_AS_WW(xr1,xr6,xr4,xr3);      //xr1: s40+s24  xr3: s40-s24
        Q16SLL(xr10,xr2,xr4,xr14,4);
        Q16SLL(xr15,xr6,xr8,xr5,4);
        /* NOTE(review): the next Q16SLL repeats the xr2/xr4 shift two lines
         * above, and the xr15/xr5 results there are immediately overwritten
         * below — these look redundant; confirm against the MXU reference. */
        Q16SLL(xr10,xr2,xr4,xr14,4);
        Q16SLL(xr15,xr6,xr0,xr0,3);
        Q16SLL(xr5,xr8,xr0,xr0,2);
        Q16ADD_AS_WW(xr7,xr10,xr14,xr0);
        Q16ADD_AS_WW(xr5,xr15,xr5,xr0);
        Q16ADD_AS_WW(xr5,xr7,xr5,xr0);
        Q16ADD_AS_WW(xr5,xr5,xr3,xr0);      //xr5: t1
        Q16SLL(xr10,xr6,xr8,xr14,4);
        Q16SLL(xr15,xr4,xr0,xr0,3);
        Q16SLL(xr3,xr2,xr0,xr0,2);
        Q16ADD_AS_WW(xr0,xr10,xr14,xr7);
        Q16ADD_AS_WW(xr0,xr7,xr15,xr7);
        Q16ADD_AS_WW(xr7,xr7,xr3,xr0);
        Q16ADD_AS_WW(xr0,xr7,xr1,xr7);      //xr7: t4
        Q16ADD_AS_WW(xr1,xr8,xr2,xr3);      //xr1: s56+s8  xr3: s56-s8
        Q16SLL(xr10,xr2,xr6,xr14,4);
        Q16SLL(xr15,xr8,xr0,xr0,3);
        Q16ADD_AS_WW(xr0,xr10,xr14,xr10);
        Q16ADD_AS_WW(xr0,xr10,xr15,xr10);
        Q16SLL(xr15,xr4,xr0,xr0,2);
        Q16ADD_AS_WW(xr0,xr10,xr15,xr10);
        Q16ADD_AS_WW(xr0,xr10,xr1,xr1);     //xr1: t2
        Q16SLL(xr10,xr8,xr4,xr14,4);
        Q16SLL(xr15,xr2,xr0,xr0,3);
        Q16ADD_AS_WW(xr0,xr10,xr14,xr10);
        Q16ADD_AS_WW(xr10,xr10,xr15,xr0);
        Q16SLL(xr15,xr6,xr0,xr0,2);
        Q16ADD_AS_WW(xr10,xr10,xr15,xr0);
        Q16ADD_AS_WW(xr3,xr10,xr3,xr0);     //xr3: t3
        /* Final butterflies, rounding (64 for rows 0-3, 65 for 4-7), >>7. */
        Q16ADD_AS_WW(xr11,xr11,xr5,xr5);    //xr11: t5+t1  xr5: t5-t1
        Q16ADD_AS_WW(xr12,xr12,xr1,xr1);    //xr12: t6+t2  xr1: t6-t2
        Q16ADD_AS_WW(xr13,xr13,xr3,xr3);    //xr13: t7+t3  xr3: t7-t3
        Q16ADD_AS_WW(xr9,xr9,xr7,xr7);      //xr9:  t8+t4  xr7: t8-t4
        S32I2M(xr2,64<<16|64);
        Q16ACC_AS(xr11,xr2,xr0,xr12);
        Q16ACC_AS(xr13,xr2,xr0,xr9);
        S32I2M(xr4,65<<16|65);
        Q16ACC_AS(xr5,xr4,xr0,xr1);
        Q16ACC_AS(xr3,xr4,xr0,xr7);
        Q16SAR(xr11,xr11,xr12,xr12,7);      //xr11: dst[0]   xr12: dst[8]
        Q16SAR(xr13,xr13,xr9,xr9,7);        //xr13: dst[16]  xr9:  dst[24]
        Q16SAR(xr7,xr7,xr3,xr3,7);          //xr7:  dst[32]  xr3:  dst[40]
        Q16SAR(xr1,xr1,xr5,xr5,7);          //xr1:  dst[48]  xr5:  dst[56]
        S32STD(xr11,dst,0x00);
        S32STD(xr12,dst,0x10);
        S32STD(xr13,dst,0x20);
        S32STD(xr9,dst,0x30);
        S32STD(xr7,dst,0x40);
        S32STD(xr3,dst,0x50);
        S32STD(xr1,dst,0x60);
        S32STD(xr5,dst,0x70);
        src+=2;
        dst+=2;
    }
}
#else
/**
 * In-place 8x8 inverse transform (VC-1), portable scalar version.
 * Row pass rounds with +4 and shifts >>3; column pass rounds with +64
 * (+65 for the lower four output rows) and shifts >>7.
 */
static void vc1_inv_trans_8x8_c(DCTELEM block[64])
{
    int i;
    register int t1,t2,t3,t4,t5,t6,t7,t8;
    DCTELEM *src, *dst;
    src = block;
    dst = block;
    /* Row (horizontal) pass. */
    for(i = 0; i < 8; i++){
        t1 = 12 * (src[0] + src[4]);
        t2 = 12 * (src[0] - src[4]);
        t3 = 16 * src[2] + 6 * src[6];
        t4 = 6 * src[2] - 16 * src[6];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7];
        t2 = 15 * src[1] - 4 * src[3] - 16 * src[5] - 9 * src[7];
        t3 = 9 * src[1] - 16 * src[3] + 4 * src[5] + 15 * src[7];
        t4 = 4 * src[1] - 9 * src[3] + 15 * src[5] - 16 * src[7];

        dst[0] = (t5 + t1 + 4) >> 3;
        dst[1] = (t6 + t2 + 4) >> 3;
        dst[2] = (t7 + t3 + 4) >> 3;
        dst[3] = (t8 + t4 + 4) >> 3;
        dst[4] = (t8 - t4 + 4) >> 3;
        dst[5] = (t7 - t3 + 4) >> 3;
        dst[6] = (t6 - t2 + 4) >> 3;
        dst[7] = (t5 - t1 + 4) >> 3;

        src += 8;
        dst += 8;
    }
    src=block;
    dst=block;
    /* Column (vertical) pass; note the asymmetric rounder (64 vs 64+1). */
    for(i = 0; i < 8; i++){
        t1 = 12 * (src[0] + src[32]);
        t2 = 12 * (src[0] - src[32]);
        t3 = 16 * src[16] + 6 * src[48];
        t4 = 6 * src[16] - 16 * src[48];

        t5 = t1 + t3;
        t6 = t2 + t4;
        t7 = t2 - t4;
        t8 = t1 - t3;

        t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56];
        t2 = 15 * src[8] - 4 * src[24] - 16 * src[40] - 9 * src[56];
        t3 = 9 * src[8] - 16 * src[24] + 4 * src[40] + 15 * src[56];
        t4 = 4 * src[8] - 9 * src[24] + 15 * src[40] - 16 * src[56];

        dst[0] = (t5 + t1 + 64) >> 7;
        dst[8] = (t6 + t2 + 64) >> 7;
        dst[16] = (t7 + t3 + 64) >> 7;
        dst[24] = (t8 + t4 + 64) >> 7;
        dst[32] = (t8 - t4 + 64+1) >> 7;
        dst[40] = (t7 - t3 + 64+1) >> 7;
        dst[48] = (t6 - t2 + 64+1) >> 7;
        dst[56] = (t5 - t1 + 64+1) >> 7;

        src++;
        dst++;
    }
}
#endif

/** Do inverse transform on 8x4 part of block */
#ifdef JZ4740_MXU_OPT
/**
 * In-place 8x4 inverse transform (VC-1), MXU version.
 * @param n selects the 8x4 half of the block: data starts at block + n*32.
 * NOTE(review): this definition continues beyond the visible excerpt.
 */
static void vc1_inv_trans_8x4_c(DCTELEM block[64], int n)
{
    int i;
    DCTELEM *src, *dst;
    int off;
    off = n * 32;
    src = block + off;
    dst = block + off;
    /* Load the transform coefficients as packed 16-bit pairs. */
    S32I2M(xr15,W4<<16|W4);             //xr15: 12|12
    S32I2M(xr14,W6<<16|W2);             //xr14: 16|6
    S32I2M(xr13,W6<<16|W5);             //xr13: 16|15
    S32I2M(xr12,W3<<16|W1);             //xr12:  9|4
    /* Row pass over the 4 rows of this half. */
    for(i=0;i<4;i++) {
        S32LDD(xr1,src,0x0);            //xr1: src[1] src[0]
        S32LDD(xr2,src,0x4);            //xr2: src[3] src[2]
        S32LDD(xr3,src,0x8);            //xr3: src[5] src[4]
        S32LDD(xr4,src,0xc);            //xr4: src[7] src[6]
        D16MUL_LW(xr5,xr1,xr15,xr6);    //xr5, xr6: 12*src[0]
        D16MAC_AS_LW(xr5,xr3,xr15,xr6); //xr5: t1 = 12*(src[0]+src[4])  xr6: t2 = 12*(src[0]-src[4])
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -