📄 idct_mmx.h
字号:
/* libfame - Fast Assembly MPEG Encoder Library
 * Copyright (C) 2000-2001 Vivien Chappelier
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*************************** MMX accelerated iDCT ****************************/

/* One 1-D pass of an AAN-style inverse DCT, vectorized with MMX over four
 * 16-bit coefficients at a time.
 *
 * Rows of the 8x8 block are 0x10 bytes (eight 16-bit words) apart; each movq
 * below loads 8 bytes at offsets 0x00..0x70, i.e. the same four columns of
 * all eight rows.  The pass therefore transforms four columns in parallel;
 * idct() below calls it on block and block+4 to cover all eight columns,
 * transposes, and repeats for the other dimension.
 *
 * Multiplies by the cosine constants are 16-bit fixed point: the operand is
 * pre-shifted left (psllw) so that pmulhw's implicit ">> 16" leaves the
 * product at the intended scale, with _mmx_1 added first for rounding.
 * _mmx_1 and the constant table _mmx_icos (ICOS2, ICOS4, ICOS6, ICOS8 at
 * byte offsets 0, 8, 16, 24) are defined elsewhere, as is the ASMSYM
 * symbol-decoration macro -- see the including file.
 *
 * The commented-out "pmulhw n(%1)" lines are a disabled variant that passed
 * the constant table through a register operand (see the commented mmx_icos
 * declaration and constraints) instead of referencing the symbol directly.
 *
 * NOTE(review): the asm clobbers mm0-mm7 but declares only "memory" (MMX
 * registers cannot be named in clobber lists on compilers of this era), so
 * callers must not hold live MMX/x87 state across this call; presumably an
 * emms is issued elsewhere before FPU use -- confirm at call sites.
 */
static void inline idct_aan_pass(dct_t * block)
{
  //  register unsigned short const *mmx_icos = _mmx_icos;
  asm volatile
    (
     /* STEP 1: even-part butterflies.
      *   v11 = v1 + v7      v71 = v1 - v7
      *   v04 = v0 + v4      v44 = v0 - v4
      *   v62 = v2 + v6      v22 = v2 - v6
      *   v05 = v04 + v62    v65 = v04 - v62
      *   row0 = row7 = v05  (partial -- v32 folded in at step 3)
      */
     "movq 0x70(%0), %%mm0\n"                /* line 7 -> mm0 */
     "movq 0x10(%0), %%mm1\n"                /* line 1 -> mm1 */
     "movq %%mm0, %%mm6\n"                   /* line 7 -> mm6 */
     "paddsw %%mm1, %%mm0\n"                 /* line 1 + line 7 -> mm0 (v11) */
     "psubsw %%mm6, %%mm1\n"                 /* line 1 - line 7 -> mm1 (v71) */
     "movq 0x40(%0), %%mm2\n"                /* line 4 -> mm2 */
     "movq 0x00(%0), %%mm3\n"                /* line 0 -> mm3 */
     "movq %%mm2, %%mm7\n"                   /* line 4 -> mm7 */
     "paddsw %%mm3, %%mm2\n"                 /* line 0 + line 4 -> mm2 (v04) */
     "psubsw %%mm7, %%mm3\n"                 /* line 0 - line 4 -> mm3 (v44) */
     "movq 0x60(%0), %%mm4\n"                /* line 6 -> mm4 */
     "movq 0x20(%0), %%mm5\n"                /* line 2 -> mm5 */
     "movq %%mm4, %%mm6\n"                   /* line 6 -> mm6 */
     "paddsw %%mm5, %%mm4\n"                 /* line 2 + line 6 -> mm4 (v62) */
     "psubsw %%mm6, %%mm5\n"                 /* line 2 - line 6 -> mm5 (v22) */
     "movq %%mm2, %%mm7\n"                   /* v04 -> mm7 */
     "paddsw %%mm4, %%mm2\n"                 /* v04 + v62 -> mm2 (v05) */
     "psubsw %%mm4, %%mm7\n"                 /* v04 - v62 -> mm7 (v65) */
     "movq %%mm2, 0x00(%0)\n"                /* v05 -> line 0 */
     "movq %%mm2, 0x70(%0)\n"                /* v05 -> line 7 */
     /* STEP 2: first odd-part multiply.
      *   v23 = v22 * ICOS4    v24 = v23 - v62    v45 = v44 + v24
      *   row1 = row6 = v45  (partial -- v75 folded in at step 4)
      */
     "psllw $0x02, %%mm5\n"                  /* adjust v22 for multiply */
     "paddw " ASMSYM "_mmx_1, %%mm5\n"       /* + 1 for rounding */
     //  "pmulhw 8(%1), %%mm5\n"             /* 4*v22*ICOS4/4 -> mm5 (v23) */
     "pmulhw " ASMSYM "_mmx_icos+8, %%mm5\n" /* 4*v22*ICOS4/4 -> mm5 (v23) */
     "psubsw %%mm4, %%mm5\n"                 /* v23 - v62 -> mm5 (v24) */
     "movq %%mm3, %%mm6\n"                   /* v44 -> mm6 */
     "paddsw %%mm5, %%mm6\n"                 /* v44 + v24 -> mm6 (v45) */
     "movq %%mm6, 0x10(%0)\n"                /* v45 -> line 1 */
     "movq %%mm6, 0x60(%0)\n"                /* v45 -> line 6 */
     /* STEP 3: odd-part sums, finish even part.
      *   v25 = v44 - v24    v51 = v5 - v3    v31 = v5 + v3
      *   v32 = v11 + v31    v12 = v11 - v31
      *   row2 = row5 = v25  (partial)    row3 = row4 = v65  (partial)
      *   row0 += v32        row7 -= v32  (rows 0 and 7 now final)
      */
     "psubsw %%mm5, %%mm3\n"                 /* v44 - v24 -> mm3 (v25) */
     "movq 0x50(%0), %%mm6\n"                /* line 5 -> mm6 */
     "movq 0x50(%0), %%mm2\n"                /* line 5 -> mm2 */
     "paddsw 0x30(%0), %%mm6\n"              /* line 5 + line 3 -> mm6 (v31) */
     "psubsw 0x30(%0), %%mm2\n"              /* line 5 - line 3 -> mm2 (v51) */
     "movq %%mm3, 0x20(%0)\n"                /* v25 -> line 2 */
     "movq %%mm3, 0x50(%0)\n"                /* v25 -> line 5 */
     "movq %%mm6, %%mm3\n"                   /* v31 -> mm3 */
     "paddsw %%mm0, %%mm3\n"                 /* v11 + v31 -> mm3 (v32) */
     "psubsw %%mm6, %%mm0\n"                 /* v11 - v31 -> mm0 (v12) */
     "movq 0x00(%0), %%mm6\n"                /* v05 -> mm6 */
     "movq 0x70(%0), %%mm5\n"                /* v05 -> mm5 */
     "paddsw %%mm3, %%mm6\n"                 /* v05 + v32 -> mm6 */
     "psubsw %%mm3, %%mm5\n"                 /* v05 - v32 -> mm5 */
     "movq %%mm6, 0x00(%0)\n"                /* mm6 -> line 0 (final) */
     "movq %%mm5, 0x70(%0)\n"                /* mm5 -> line 7 (final) */
     "movq %%mm7, 0x30(%0)\n"                /* v65 -> line 3 */
     "movq %%mm7, 0x40(%0)\n"                /* v65 -> line 4 */
     /* STEP 4: remaining odd-part multiplies and final outputs.
      *   v13 = v12 * ICOS4    va2 = v51 - v71
      *   v53 = v51 * ICOS6    v73 = v71 * ICOS2    va3 = va2 * ICOS8
      *   v54 = v53 - va3      v74 = v73 - va3      v75 = v74 - v32
      *   row1 += v75    row6 -= v75
      *   v15 = v13 - v75;     row2 += v15    row5 -= v15
      *   v55 = v15 - v54;     row3 -= v55    row4 += v55
      */
     "psllw $0x02, %%mm0\n"                  /* adjust v12 for multiply */
     "paddw " ASMSYM "_mmx_1, %%mm0\n"       /* + 1 for rounding */
     //  "pmulhw 8(%1), %%mm0\n"             /* 4*v12*ICOS4/4 -> mm0 (v13) */
     "pmulhw " ASMSYM "_mmx_icos+8, %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
     "movq %%mm2, %%mm6\n"                   /* v51 -> mm6 */
     "psubsw %%mm1, %%mm6\n"                 /* v51 - v71 -> mm6 (va2) */
     "psllw $0x03, %%mm2\n"                  /* adjust v51 for multiply */
     "paddw " ASMSYM "_mmx_1, %%mm2\n"       /* + 1 for rounding */
     /* should add another one here but it seems to look better without */
     //  "pmulhw 16(%1), %%mm2\n"            /* 8*v51*ICOS6/8 -> mm2 (v53) */
     "pmulhw " ASMSYM "_mmx_icos+16, %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
     "psllw $0x02, %%mm1\n"                  /* adjust v71 for multiply */
     "paddw " ASMSYM "_mmx_1, %%mm1\n"       /* + 1 for rounding */
     /* should add another one here but it seems to look better without */
     //  "pmulhw 0(%1), %%mm1\n"             /* 4*v71*ICOS2/4 -> mm1 (v73) */
     "pmulhw " ASMSYM "_mmx_icos, %%mm1\n"   /* 4*v71*ICOS2/4 -> mm1 (v73) */
     "psllw $0x01, %%mm6\n"                  /* adjust va2 for multiply */
     "paddw " ASMSYM "_mmx_1, %%mm6\n"       /* + 1 for rounding */
     //  "pmulhw 24(%1), %%mm6\n"            /* 2*va2*ICOS8/2 -> mm6 (va3) */
     "pmulhw " ASMSYM "_mmx_icos+24, %%mm6\n" /* 2*va2*ICOS8/2 -> mm6 (va3) */
     "psubsw %%mm6, %%mm2\n"                 /* v53 - va3 -> mm2 (v54) */
     "psubsw %%mm6, %%mm1\n"                 /* v73 - va3 -> mm1 (v74) */
     "psubsw %%mm3, %%mm1\n"                 /* v74 - v32 -> mm1 (v75) */
     "movq 0x10(%0), %%mm6\n"                /* v45 -> mm6 */
     "movq 0x60(%0), %%mm7\n"                /* v45 -> mm7 */
     "paddsw %%mm1, %%mm6\n"                 /* v45 + v75 -> mm6 */
     "psubsw %%mm1, %%mm7\n"                 /* v45 - v75 -> mm7 */
     "movq %%mm6, 0x10(%0)\n"                /* mm6 -> line 1 (final) */
     "movq %%mm7, 0x60(%0)\n"                /* mm7 -> line 6 (final) */
     "psubsw %%mm1, %%mm0\n"                 /* v13 - v75 -> mm0 (v15) */
     "movq 0x20(%0), %%mm6\n"                /* v25 -> mm6 */
     "movq 0x50(%0), %%mm7\n"                /* v25 -> mm7 */
     "paddsw %%mm0, %%mm6\n"                 /* v25 + v15 -> mm6 */
     "psubsw %%mm0, %%mm7\n"                 /* v25 - v15 -> mm7 */
     "movq %%mm6, 0x20(%0)\n"                /* mm6 -> line 2 (final) */
     "movq %%mm7, 0x50(%0)\n"                /* mm7 -> line 5 (final) */
     "psubsw %%mm2, %%mm0\n"                 /* v15 - v54 -> mm0 (v55) */
     "movq 0x30(%0), %%mm6\n"                /* v65 -> mm6 */
     "movq 0x40(%0), %%mm7\n"                /* v65 -> mm7 */
     "psubsw %%mm0, %%mm6\n"                 /* v65 - v55 -> mm6 */
     "paddsw %%mm0, %%mm7\n"                 /* v65 + v55 -> mm7 */
     "movq %%mm6, 0x30(%0)\n"                /* mm6 -> line 3 (final) */
     "movq %%mm7, 0x40(%0)\n"                /* mm7 -> line 4 (final) */
     /* %0 is block, read and written in place ("memory" keeps the compiler
      * from caching block[] across the asm). */
     : "=r"(block)/*, "=r"(mmx_icos)*/
     : "0"(block)/*, "1"(mmx_icos)*/
     : "memory");
}

/* Full in-place 2-D iDCT of an 8x8 coefficient block: one vertical AAN pass
 * over each four-column half, a transpose (defined elsewhere), then the same
 * two passes for the other dimension.  block+4 addresses the other four
 * columns -- the 0x10-byte row stride used above implies eight dct_t (16-bit
 * words) per row.  Any descaling/permutation implied by transpose() is
 * outside this view -- see its definition. */
static void inline idct(dct_t *block)
{
  idct_aan_pass(block);
  idct_aan_pass(block+4);
  transpose(block);
  idct_aan_pass(block);
  idct_aan_pass(block+4);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -