📄 dct_mmx.h
字号:
/* libfame - Fast Assembly MPEG Encoder Library Copyright (C) 2000-2001 Vivien Chappelier This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*//*************************** MMX accelerated DCT *****************************//* Warning: Didn't check the DCT was IEEE compliant. It is probably not. *//* TODO: Write an IEEE compliant DCT/iDCT */#define precisionstatic void inline dct_aan_pass(dct_t *cache){ // register unsigned short const *mmx_cos = _mmx_cos; asm volatile ( "" /* STEP 1 */ "movq 0x00(%0), %%mm0\n" /* load line 0 */ "movq 0x10(%0), %%mm1\n" /* load line 1 */ "movq 0x20(%0), %%mm2\n" /* load line 2 */ "movq 0x30(%0), %%mm3\n" /* load line 3 */ "movq 0x40(%0), %%mm4\n" /* load line 4 */ "movq 0x50(%0), %%mm5\n" /* load line 5 */ "movq 0x60(%0), %%mm6\n" /* load line 6 */ "movq 0x70(%0), %%mm7\n" /* load line 7 */ "paddsw %%mm0, %%mm7\n" /* line0 + line7 -> mm7 (v00) */ "paddsw %%mm1, %%mm6\n" /* line1 + line6 -> mm6 (v01) */ "paddsw %%mm2, %%mm5\n" /* line2 + line5 -> mm5 (v02) */ "paddsw %%mm4, %%mm3\n" /* line4 + line3 -> mm3 (v03) */ "psubsw 0x70(%0), %%mm0\n" /* line0 - line7 -> mm0 (v07) */ "psubsw 0x60(%0), %%mm1\n" /* line1 - line6 -> mm1 (v06) */ "psubsw 0x50(%0), %%mm2\n" /* line2 - line5 -> mm2 (v05) */ "psubsw 0x30(%0), %%mm4\n" /* line4 - line3 -> mm4 (-v04) */ "" /* STEP 2 */ "psubsw %%mm2, %%mm4\n" /* -v04 - v05 -> mm4 (v14) */ "paddsw %%mm1, %%mm2\n" /* v05 + v06 -> mm2 (v15) */ "paddsw %%mm0, %%mm1\n" /* v06 + v07 -> mm1 (v16) */ "movq %%mm0, 0x70(%0)\n" /* store v07 for later */ "movq %%mm3, %%mm0\n" /* v03 -> mm0 */ "paddsw %%mm7, %%mm3\n" /* v00 + v03 -> mm3 (v10) */ "psubsw %%mm0, %%mm7\n" /* v00 - v03 -> mm7 (v13) */ "movq %%mm5, %%mm0\n" /* v02 -> mm0 */ "paddsw %%mm6, %%mm5\n" /* v01 + v02 -> mm5 (v11) */ "psubsw %%mm0, %%mm6\n" /* v01 - v02 -> mm6 (v12) */ "" /* STEP 3 */ "movq %%mm5, %%mm0\n" /* v11 -> mm0 */ "paddsw %%mm3, %%mm5\n" /* v10 + v11 -> mm5 (v20) */ "psubsw %%mm0, %%mm3\n" /* v10 - v11 -> mm3 (v21) */ "paddsw %%mm7, %%mm6\n" /* v12 + v13 -> mm6 (v22) */ "movq %%mm5, 0x00(%0)\n" /* store line 0 */ "movq %%mm3, 0x40(%0)\n" /* store line 4 */ "movq %%mm4, %%mm5\n" /* v14 -> mm5 */ "paddsw %%mm1, %%mm5\n" /* v14 + v16 -> mm5 */#ifdef precision "psllw $0x01, %%mm5\n" /* precision(va0) += 1 bit */#endif "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */// "pmulhw 16(%1), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */ "pmulhw " ASMSYM "_mmx_cos+16, %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */ "" /* STEP 4 */#ifdef precision "psllw $0x02, %%mm6\n" /* precision(v22) += 1 bit */#else "psllw $0x01, %%mm6\n" /* */#endif "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ // "pmulhw 8(%1), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/ "pmulhw " ASMSYM "_mmx_cos+8, %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/#ifdef precision "psllw $0x02, %%mm2\n" /* precision(v15) += 1 bit */#else "psllw $0x01, %%mm2\n" /* */#endif "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ // "pmulhw 8(%1), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */ "pmulhw " ASMSYM "_mmx_cos+8, %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */#ifdef precision "psllw $0x02, %%mm4\n" /* precision(v14) += 1 bit */#else "psllw $0x01, %%mm4\n" /* */#endif "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ // "pmulhw 0(%1), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */ "pmulhw " ASMSYM "_mmx_cos, %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */ "psubsw %%mm5, %%mm4\n" /* v14*-COS2 - va0 -> mm4 (v34) */#ifdef precision "psllw $0x01, %%mm1\n" /* precision(v16) += 1 bit */#endif "psubsw %%mm1, %%mm5\n" /* va0 - v16 -> mm5 */ "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ // "pmulhw 24(%1), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */ "pmulhw " ASMSYM "_mmx_cos+24, %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */ "psubsw %%mm5, %%mm1\n" /* v16 * COS8 - va0 -> mm1 (v36)*/ "" /* STEP 5 */ "movq 0x70(%0), %%mm0\n" /* retrieve v07 -> mm0 */#ifdef precision "psllw $0x01, %%mm7\n" /* precision(v13) += 1 bit */ "psllw $0x01, %%mm0\n" /* precision(v07) += 1 bit */#endif "movq %%mm6, %%mm3\n" /* v32 -> mm3 */ "paddsw %%mm7, %%mm6\n" /* v13 + v32 -> mm6 (v42) */ "psubsw %%mm3, %%mm7\n" /* v13 - v32 -> mm7 (v43) */#ifdef precision "psraw $0x01, %%mm6\n" /* precision(v42) -= 1 bit */ "psraw $0x01, %%mm7\n" /* precision(v43) -= 1 bit */#endif "movq %%mm6, 0x20(%0)\n" /* store line 2 */ "movq %%mm7, 0x60(%0)\n" /* store line 6 */ "movq %%mm2, %%mm5\n" /* v35 -> mm5 */ "paddsw %%mm0, %%mm2\n" /* v07 + v35 -> mm2 (v45) */ "psubsw %%mm5, %%mm0\n" /* v07 - v35 -> mm0 (v47) */ "" /* STEP 6 */ "movq %%mm4, %%mm3\n" /* v34 -> mm3 */ "paddsw %%mm0, %%mm4\n" /* v47 + v34 -> mm4 (v54) */ "psubsw %%mm3, %%mm0\n" /* v47 - v34 -> mm0 (v57) */ "movq %%mm1, %%mm5\n" /* v36 -> mm5 */ "paddsw %%mm2, %%mm1\n" /* v45 + v36 -> mm1 (v55) */ "psubsw %%mm5, %%mm2\n" /* v45 - v36 -> mm2 (v56) */#ifdef precision "psraw $0x01, %%mm4\n" /* precision(v54) -= 1 bit */ "psraw $0x01, %%mm0\n" /* precision(v57) -= 1 bit */ "psraw $0x01, %%mm1\n" /* precision(v55) -= 1 bit */ "psraw $0x01, %%mm2\n" /* precision(v56) -= 1 bit */#endif "movq %%mm1, 0x10(%0)\n" /* store line 1 */ "movq %%mm0, 0x30(%0)\n" /* store line 3 */ "movq %%mm4, 0x50(%0)\n" /* store line 5 */ "movq %%mm2, 0x70(%0)\n" /* store line 7 */ : "=r"(cache)/*, "=r"(mmx_cos)*/ : "0"(cache)/*, "1"(mmx_cos)*/ : "memory");}static void inline dct(dct_t *block){ dct_aan_pass(block); dct_aan_pass(block+4); transpose(block); dct_aan_pass(block); dct_aan_pass(block+4);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -