/* altivec_mlib.c */
/* Ogle - A video player * Copyright (C) 2001, Charles M. Hannum <root@ihack.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *//* * AltiVec support written by Charles M. Hannum <root@ihack.net>, except * for the core IDCT routine published by Motorola. * * Notes: * 1) All AltiVec loads and stores are aligned. Conveniently, the output * area for the IDCT and motion comp functions is always aligned. * However, the reference area is not; therefore, we check its * alignment and use lvsl/vperm as necessary to align the reference * data as it's loaded. * 2) Unfortunately, AltiVec doesn't do 8-byte loads and stores. This * means that the fastest paths are only applicable to the Y channel. * For 8-byte operations on the U and V channels, there are two cases. * When the alignment of the input and output are the same, we can do * 16-byte loads and just do two 4-byte stores. For the unmatched * alignment case, we have to do a rotation of the loaded data first. * 3) The `i[0-7]' variables look silly, but they prevent GCC from * generating gratuitous multiplies, and allow the loaded constants * to be recycled several times in the IDCT routine. * 4) The use of "b" constraints is *very* important. Using r0 in any * of the AltiVec load/store instructions is equivalent to a constant * 0. 
*/#include <inttypes.h>#if 0#define ASSERT(x) if (!(x)) abort()#else#define ASSERT(x)#endifvoidmlib_Init(void){ asm("mtspr 0x100,%0" : : "b" (-1));}static inline voidmlib_VideoInterpAveXY_U8_U8(uint8_t *curr_block, const uint8_t *ref_block, const int width, const int height, int32_t frame_stride, int32_t field_stride) { int x, y; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) curr_block[x] = (curr_block[x] + ((ref_block[x] + ref_block_next[x] + ref_block[x+1] + ref_block_next[x+1] + 2) >> 2) + 1) >> 1; curr_block += frame_stride; ref_block += frame_stride; ref_block_next += frame_stride; }}static inline voidmlib_VideoInterpXY_U8_U8(uint8_t *curr_block, const uint8_t *ref_block, const int width, const int height, int32_t frame_stride, int32_t field_stride) { int x, y; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) curr_block[x] = (ref_block[x] + ref_block_next[x] + ref_block[x+1] + ref_block_next[x+1] + 2) >> 2; curr_block += frame_stride; ref_block += frame_stride; ref_block_next += frame_stride; }}voidmlib_VideoCopyRefAve_U8_U8_16x16(uint8_t *curr_block, const uint8_t *ref_block, int32_t stride){ ASSERT(((int)curr_block & 15) == 0); if (((int)ref_block & 15) != 0) { int i0 = 0, i1 = 16; asm("" "lvsl 3,%0,%1\n" "" : : "b" (ref_block), "b" (i0)); asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; 
asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2 \n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 
0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); } else { int i0 = 0; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" 
"lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); }}void mlib_VideoCopyRefAve_U8_U8_16x8(uint8_t *curr_block, const uint8_t *ref_block, int32_t stride){ ASSERT(((int)curr_block & 15) == 0); if (((int)ref_block & 15) != 0) { int i0 = 0, i1 = 16; asm("" "lvsl 3,%0,%1\n" "" : : "b" (ref_block), "b" (i0)); asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), 
"b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); i0 += stride, i1 += stride; asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1)); } else { int i0 = 0; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); i0 += stride; asm("" "lvx 0,%0,%2\n" "lvx 1,%1,%2\n" "vavgub 0,0,1\n" "stvx 0,%0,%2\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0)); }}void mlib_VideoCopyRefAve_U8_U8_8x8(uint8_t *curr_block, const uint8_t *ref_block, int32_t stride){ ASSERT(((int)curr_block & 7) == 0); if ((((int)ref_block ^ (int)curr_block) & 15) != 0) { const int i0 = 0, i1 = 16, i2 = 4; asm("" "lvsl 3,%1,%2\n" "lvsl 
4,%1,%3\n" "lvsr 5,%0,%2\n" "lvsr 6,%0,%3\n" "vperm 3,3,3,5\n" "vperm 4,4,4,6\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i0 + stride)); asm("" "lvx 0,%1,%2\n" "lvx 1,%1,%3\n" "lvx 2,%0,%2\n" "vperm 0,0,1,3\n" "vavgub 0,0,2\n" "stvewx 0,%0,%2\n" "stvewx 0,%0,%4\n" "" : : "b" (curr_block), "b" (ref_block), "b" (i0), "b" (i1), "b" (i2)); curr_block += stride, ref_block += stride; asm("" "lvx 0,%1,%2\n"
/*
 * NOTE: the original file continues beyond this point (the remaining
 * motion-compensation variants and the Motorola IDCT routine); the rest
 * was lost when this copy was scraped.
 */