/*
 * mmx_mlib.c
 *
 * Intel MMX implementation of motion comp routines.
 * MMX code written by David I. Lehn <dlehn@vt.edu>.
 * lib{mmx,xmmx,sse} can be found at http://shay.ecn.purdue.edu/~swar/
 *
 * Copyright 2000, David I. Lehn <dlehn@vt.edu>.  Released under GPL.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation.
 */

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>

#include "include/debug_print.h"
#include "mmx.h"
#include "mmx_mlib.h"

static uint64_t ones = 0x0001000100010001ULL;
static uint64_t twos = 0x0002000200020002ULL;

//#define MC_MMX_verify

void
mlib_Init(void)
{
    DNOTE("Using MMX accelerated media functions\n");
    return;
}

/*
static inline uint8_t
clip_to_u8 (int16_t value)
{
    //return value < 0 ? 0 : (value > 255 ? 255 : value);
    return ((uint16_t)value) > 255 ? (value < 0 ? 0 : 255) : value;
}
*/

static inline void
mmx_average_2_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2)
{
    //
    // *dst = (*src1 + *src2 + 1)/2;
    //

    //pxor_r2r(mm0,mm0);       // load 0 into mm0

    movq_m2r(*src1,mm1);       // load 8 src1 bytes
    movq_r2r(mm1,mm2);         // copy 8 src1 bytes

    movq_m2r(*src2,mm3);       // load 8 src2 bytes
    movq_r2r(mm3,mm4);         // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);    // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);    // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);    // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src2 bytes

    paddw_m2r(ones,mm1);
    paddw_r2r(mm3,mm1);        // add lows to mm1
    psraw_i2r(1,mm1);          // /2

    paddw_m2r(ones,mm2);
    paddw_r2r(mm4,mm2);        // add highs to mm2
    psraw_i2r(1,mm2);          // /2

    packuswb_r2r(mm2,mm1);     // pack (w/ saturation)
    movq_r2m(mm1,*dst);        // store result in dst
}

static inline void
mmx_interp_average_2_U8(uint8_t *dst,
                        const uint8_t *src1, const uint8_t *src2)
{
    //
    // *dst = (*dst + (*src1 + *src2 + 1)/2 + 1)/2;
    //

    //pxor_r2r(mm0,mm0);       // load 0 into mm0

    movq_m2r(*dst,mm1);        // load 8 dst bytes
    movq_r2r(mm1,mm2);         // copy 8 dst bytes

    movq_m2r(*src1,mm3);       // load 8 src1 bytes
    movq_r2r(mm3,mm4);         // copy 8 src1 bytes

    movq_m2r(*src2,mm5);       // load 8 src2 bytes
    movq_r2r(mm5,mm6);         // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);    // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);    // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);    // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);    // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);    // unpack high src2 bytes

    paddw_m2r(ones,mm3);
    paddw_r2r(mm5,mm3);        // add lows
    paddw_m2r(ones,mm4);
    paddw_r2r(mm6,mm4);        // add highs

    psraw_i2r(1,mm3);          // /2
    psraw_i2r(1,mm4);          // /2

    paddw_m2r(ones,mm1);
    paddw_r2r(mm3,mm1);        // add lows
    paddw_m2r(ones,mm2);
    paddw_r2r(mm4,mm2);        // add highs

    psraw_i2r(1,mm1);          // /2
    psraw_i2r(1,mm2);          // /2

    packuswb_r2r(mm2,mm1);     // pack (w/ saturation)
    movq_r2m(mm1,*dst);        // store result in dst
}
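/*
 * Scalar references for the two kernels above -- an illustrative sketch
 * for use with MC_MMX_verify.  The helper names ref_average_2_U8 and
 * ref_interp_average_2_U8 are hypothetical, not part of the original
 * library.
 */
#ifdef MC_MMX_verify
static inline void
ref_average_2_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2)
{
    int i;

    // same rounding as mmx_average_2_U8: (a + b + 1)/2 per byte
    for (i = 0; i < 8; i++)
        dst[i] = (uint8_t)((src1[i] + src2[i] + 1) / 2);
}

static inline void
ref_interp_average_2_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2)
{
    int i;

    // two-stage rounding, exactly as the MMX version: average the two
    // sources first, then average that into the existing dst bytes
    for (i = 0; i < 8; i++)
        dst[i] = (uint8_t)((dst[i] + (src1[i] + src2[i] + 1) / 2 + 1) / 2);
}
#endif /* MC_MMX_verify */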
static inline void
mmx_average_4_U8(uint8_t *dst,
                 const uint8_t *src1, const uint8_t *src2,
                 const uint8_t *src3, const uint8_t *src4)
{
    //
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2)/4;
    //

    //pxor_r2r(mm0,mm0);       // load 0 into mm0

    movq_m2r(*src1,mm1);       // load 8 src1 bytes
    movq_r2r(mm1,mm2);         // copy 8 src1 bytes
    punpcklbw_r2r(mm0,mm1);    // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);    // unpack high src1 bytes

    movq_m2r(*src2,mm3);       // load 8 src2 bytes
    movq_r2r(mm3,mm4);         // copy 8 src2 bytes
    punpcklbw_r2r(mm0,mm3);    // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src2 bytes

    paddw_r2r(mm3,mm1);        // add lows
    paddw_r2r(mm4,mm2);        // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);       // load 8 src3 bytes
    movq_r2r(mm3,mm4);         // copy 8 src3 bytes
    punpcklbw_r2r(mm0,mm3);    // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src3 bytes

    paddw_r2r(mm3,mm1);        // add lows
    paddw_r2r(mm4,mm2);        // add highs

    movq_m2r(*src4,mm5);       // load 8 src4 bytes
    movq_r2r(mm5,mm6);         // copy 8 src4 bytes
    punpcklbw_r2r(mm0,mm5);    // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);    // unpack high src4 bytes

    paddw_m2r(twos,mm1);
    paddw_r2r(mm5,mm1);        // add lows
    paddw_m2r(twos,mm2);
    paddw_r2r(mm6,mm2);        // add highs

    // now have subtotal in mm1 and mm2

    psraw_i2r(2,mm1);          // /4
    psraw_i2r(2,mm2);          // /4

    packuswb_r2r(mm2,mm1);     // pack (w/ saturation)
    movq_r2m(mm1,*dst);        // store result in dst
}

static inline void
mmx_interp_average_4_U8(uint8_t *dst,
                        const uint8_t *src1, const uint8_t *src2,
                        const uint8_t *src3, const uint8_t *src4)
{
    //
    // *dst = (*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2;
    //

    //pxor_r2r(mm0,mm0);       // load 0 into mm0

    movq_m2r(*src1,mm1);       // load 8 src1 bytes
    movq_r2r(mm1,mm2);         // copy 8 src1 bytes
    punpcklbw_r2r(mm0,mm1);    // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);    // unpack high src1 bytes

    movq_m2r(*src2,mm3);       // load 8 src2 bytes
    movq_r2r(mm3,mm4);         // copy 8 src2 bytes
    punpcklbw_r2r(mm0,mm3);    // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src2 bytes

    paddw_r2r(mm3,mm1);        // add lows
    paddw_r2r(mm4,mm2);        // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);       // load 8 src3 bytes
    movq_r2r(mm3,mm4);         // copy 8 src3 bytes
    punpcklbw_r2r(mm0,mm3);    // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high src3 bytes

    paddw_r2r(mm3,mm1);        // add lows
    paddw_r2r(mm4,mm2);        // add highs

    movq_m2r(*src4,mm5);       // load 8 src4 bytes
    movq_r2r(mm5,mm6);         // copy 8 src4 bytes
    punpcklbw_r2r(mm0,mm5);    // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);    // unpack high src4 bytes

    paddw_m2r(twos,mm1);
    paddw_r2r(mm5,mm1);        // add lows
    paddw_m2r(twos,mm2);
    paddw_r2r(mm6,mm2);        // add highs

    psraw_i2r(2,mm1);          // /4
    psraw_i2r(2,mm2);          // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);        // load 8 dst bytes
    movq_r2r(mm3,mm4);         // copy 8 dst bytes
    punpcklbw_r2r(mm0,mm3);    // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);    // unpack high dst bytes

    paddw_m2r(ones,mm1);
    paddw_r2r(mm3,mm1);        // add lows
    paddw_m2r(ones,mm2);
    paddw_r2r(mm4,mm2);        // add highs

    psraw_i2r(1,mm1);          // /2
    psraw_i2r(1,mm2);          // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);     // pack (w/ saturation)
    movq_r2m(mm1,*dst);        // store result in dst
}

// VideoCopyRef* - Copy block from reference block to current block
// ----------------------------------------------------------------

static inline void
mlib_VideoCopyRefAve_U8_U8_MxN(const uint8_t m, const uint8_t n,
                               uint8_t *curr_block,
                               const uint8_t *ref_block,
                               int32_t stride)
{
#define MMX_mmx_VideoCopyRefAve_U8_U8_MxN
#if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoCopyRefAve_U8_U8_MxN)
    int x, y;
    const int jump = stride - m;

    for (y = 0; y < n; y++) {
        for (x = 0; x < m; x++) {
            *curr_block = (*curr_block + *ref_block++ + 1) / 2;
            curr_block++;
        }
        ref_block += jump;
        curr_block += jump;
    }
#else
    int x, y;
    const int step = 8;
    const int jump = stride - m;

    pxor_r2r(mm0,mm0);         // load 0 into mm0

    for (y = 0; y < n; y++) {
        for (x = 0; x < m / 8; x++) {
            mmx_average_2_U8(curr_block, curr_block, ref_block);
            curr_block += step;
            ref_block += step;
        }
        curr_block += jump;
        ref_block += jump;
    }
#endif
}
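/*
 * A minimal self-check sketch for the averaging path above, usable when
 * MC_MMX_verify is defined.  verify_copy_ref_ave_8x8 is a hypothetical
 * name, and the emms() macro is assumed to come from mmx.h.
 */
#ifdef MC_MMX_verify
static void
verify_copy_ref_ave_8x8(void)
{
    uint8_t curr_mmx[8 * 8], curr_ref[8 * 8], ref[8 * 8];
    int i;

    // deterministic test pattern
    for (i = 0; i < 8 * 8; i++) {
        curr_mmx[i] = curr_ref[i] = (uint8_t)(i * 7);
        ref[i] = (uint8_t)(255 - i * 3);
    }

    mlib_VideoCopyRefAve_U8_U8_MxN(8, 8, curr_mmx, ref, 8);
    emms();                    // restore x87 state after MMX use

    // scalar model: *curr = (*curr + *ref + 1)/2
    for (i = 0; i < 8 * 8; i++) {
        curr_ref[i] = (uint8_t)((curr_ref[i] + ref[i] + 1) / 2);
        if (curr_mmx[i] != curr_ref[i])
            printf("CopyRefAve mismatch at %d: %d != %d\n",
                   i, curr_mmx[i], curr_ref[i]);
    }
}
#endif /* MC_MMX_verify */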
void
mlib_VideoCopyRefAve_U8_U8_16x16(uint8_t *curr_block,
                                 const uint8_t *ref_block,
                                 int32_t stride)
{
    mlib_VideoCopyRefAve_U8_U8_MxN(16, 16, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRefAve_U8_U8_16x8(uint8_t *curr_block,
                                const uint8_t *ref_block,
                                int32_t stride)
{
    mlib_VideoCopyRefAve_U8_U8_MxN(16, 8, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRefAve_U8_U8_8x8(uint8_t *curr_block,
                               const uint8_t *ref_block,
                               int32_t stride)
{
    mlib_VideoCopyRefAve_U8_U8_MxN(8, 8, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRefAve_U8_U8_8x4(uint8_t *curr_block,
                               const uint8_t *ref_block,
                               int32_t stride)
{
    mlib_VideoCopyRefAve_U8_U8_MxN(8, 4, curr_block, ref_block, stride);
}

#if 0
inline void
mlib_VideoCopyRef_U8_U8(uint8_t *curr_block,
                        const uint8_t *ref_block,
                        int32_t width, int32_t height,
                        int32_t stride)
{
    int x, y;
    const int jump = stride - width;

    printf("I don't get called!");

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            *curr_block++ = *ref_block++;
        ref_block += jump;
        curr_block += jump;
    }
}
#endif

void
print_U8_U8_MxN(const uint8_t m, const uint8_t n,
                uint8_t *block0, uint8_t *block1,
                int32_t stride)
{
    int x, y;
    int jump = stride - m;

    printf("block %dx%d @ %p<-%p stride=%d\n",
           m, n, (void *)block0, (void *)block1, stride);

    for (y = 0; y < n; y++) {
        printf("%2d: ", y);
        for (x = 0; x < m; x++)
            printf("%3d<%3d ", *block0++, *block1++);
        printf("\n");
        block0 += jump;
        block1 += jump;
    }
}

static inline void
mlib_VideoCopyRef_U8_U8_MxN(const uint8_t m, const uint8_t n,
                            uint8_t *curr_block,
                            const uint8_t *ref_block,
                            int32_t stride)
{
#define MMX_mmx_VideoCopyRef_U8_U8_MxN
#if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoCopyRef_U8_U8_MxN)
    int x, y;
    const int jump = stride - m;

    for (y = 0; y < n; y++) {
        for (x = 0; x < m; x++)
            *curr_block++ = *ref_block++;
        ref_block += jump;
        curr_block += jump;
    }
#else
    int x, y;
    const int step = 8;
    const int jump = stride - m;

    pxor_r2r(mm0,mm0);         // load 0 into mm0

    for (y = 0; y < n; y++) {
        for (x = 0; x < m / 8; x++) {
            movq_m2r(*ref_block,mm1);   // load 8 ref bytes
            movq_r2m(mm1,*curr_block);  // store 8 bytes at curr
            curr_block += step;
            ref_block += step;
        }
        curr_block += jump;
        ref_block += jump;
    }
#endif
}

void
mlib_VideoCopyRef_U8_U8_16x16(uint8_t *curr_block,
                              const uint8_t *ref_block,
                              int32_t stride)
{
    mlib_VideoCopyRef_U8_U8_MxN(16, 16, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRef_U8_U8_16x8(uint8_t *curr_block,
                             const uint8_t *ref_block,
                             int32_t stride)
{
    mlib_VideoCopyRef_U8_U8_MxN(16, 8, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRef_U8_U8_8x8(uint8_t *curr_block,
                            const uint8_t *ref_block,
                            int32_t stride)
{
    mlib_VideoCopyRef_U8_U8_MxN(8, 8, curr_block, ref_block, stride);
}

void
mlib_VideoCopyRef_U8_U8_8x4(uint8_t *curr_block,
                            const uint8_t *ref_block,
                            int32_t stride)
{
    mlib_VideoCopyRef_U8_U8_MxN(8, 4, curr_block, ref_block, stride);
}

// VideoInterp*X - Half pixel interpolation in the x direction
// ------------------------------------------------------------------

static inline void
mlib_VideoInterpAveX_U8_U8_MxN(const uint8_t m, const uint8_t n,
                               uint8_t *curr_block,
                               const uint8_t *ref_block,
                               int32_t frame_stride,
                               int32_t field_stride)
{
#define MMX_mmx_VideoInterpAveX_U8_U8_MxN
#if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpAveX_U8_U8_MxN)
    int x, y;
    const int jump = frame_stride - m;

    for (y = 0; y < n; y++) {
        for (x = 0; x < m; x++) {
            // average the two horizontal neighbours, then average the
            // result into the existing prediction in curr_block
            *curr_block = (*curr_block
                           + (ref_block[0] + ref_block[1] + 1) / 2 + 1) / 2;
            curr_block++;
            ref_block++;
        }
        ref_block += jump;
        curr_block += jump;
    }
#else
    int x, y;
    const int step = 8;
    const int jump = frame_stride - m;

    pxor_r2r(mm0,mm0);         // load 0 into mm0

    for (y = 0; y < n; y++) {
        for (x = 0; x < m / 8; x++) {
            mmx_interp_average_2_U8(curr_block, ref_block, ref_block + 1);
            curr_block += step;
            ref_block += step;
        }
        curr_block += jump;
        ref_block += jump;
    }
#endif
}
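/*
 * Usage sketch (assumed caller code, not from the original source).  The
 * "Ave" routines read curr_block as well as write it, so curr_block must
 * already hold the first prediction; the routine then folds in the
 * half-pel-X interpolated prediction from ref.  ref needs one readable
 * byte past each m-byte row, and emms() is assumed to come from mmx.h.
 */
#ifdef MC_MMX_verify
static void
example_interp_ave_x_16x16(uint8_t *curr, const uint8_t *ref, int32_t stride)
{
    mlib_VideoInterpAveX_U8_U8_MxN(16, 16, curr, ref, stride, stride);
    emms();                    // leave MMX state before any FP code
}
#endif /* MC_MMX_verify */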
void
mlib_VideoInterpAveX_U8_U8_16x16(uint8_t *curr_block,
                                 const uint8_t *ref_block,
                                 int32_t frame_stride,
                                 int32_t field_stride)
{
    mlib_VideoInterpAveX_U8_U8_MxN(16, 16, curr_block, ref_block,
                                   frame_stride, field_stride);
}

void
mlib_VideoInterpAveX_U8_U8_16x8(uint8_t *curr_block,
                                const uint8_t *ref_block,
                                int32_t frame_stride,
                                int32_t field_stride)
{
    mlib_VideoInterpAveX_U8_U8_MxN(16, 8, curr_block, ref_block,
                                   frame_stride, field_stride);
}

void