vf_fspp.c

来自「君正早期ucos系统(只有早期的才不没有打包成库),MPLAYER,文件系统,图」· C语言 代码 · 共 2,126 行 · 第 1/5 页

C
2,126
字号
/*
    Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
    Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

/*
 * This implementation is based on an algorithm described in
 * "Aria Nosratinia Embedded Post-Processing for
 * Enhancement of Compressed Images (1999)"
 * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
 * Further, with splitting (i)dct into hor/ver passes, one of them can be
 * performed once per block, not pixel. This allows for much better speed.
 */

/*
  Heavily optimized version of SPP filter by Nikolaj
 */

#include <uclib.h>
#include <uclib.h>
#include <uclib.h>
#include <inttypes.h>
#include <math.h>

#include "config.h"

#include "mp_msg.h"
#include "cpudetect.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"

#ifdef HAVE_MALLOC_H
#include <uclib.h>
#endif

#include "img_format.h"
#include "mp_image.h"
#include "vf.h"
#include "libvo/fastmemcpy.h"

/* Route every memcpy in this file through uc_memcpy. */
#undef memcpy
#define memcpy uc_memcpy

//===========================================================================//
#define BLOCKSZ 12

/* 64 threshold values; the code that consumes them is not in this part of the file. */
static const short custom_threshold[64]=
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
{ 71, 296, 295, 237,  71,  40,  38,  19,
  245, 193, 185, 121, 102,  73,  53,  27,
  158, 129, 141, 107,  97,  73,  50,  26,
  102, 116, 109,  98,  82,  66,  45,  23,
  71,  94,  95,  81,  70,  56,  38,  20,
  56,  77,  74,  66,  56,  44,  30,  15,
  38,  53,  50,  45,  38,  30,  21,  11,
  20,  27,  26,  23,  20,  15,  11,   5
};

/*
 * 8x8 table of dither offsets (values 0..63). The store_slice functions read
 * it as dither[row][column-within-group-of-8], scaled down by log2_scale.
 */
static const uint8_t  __attribute__((aligned(32))) dither[8][8]={
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};

/*
 * Per-instance filter state.
 *
 * Layout matters: mul_thrmat_mmx() reaches threshold_mtx by adding 8*8*2
 * bytes to the address of threshold_mtx_noq, so threshold_mtx must stay
 * directly after threshold_mtx_noq. Both arrays are declared as uint64_t
 * but are accessed as 64 16-bit values each.
 */
struct vf_priv_s { //align 16 !
    uint64_t threshold_mtx_noq[8*2];
    uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions

    int log2_count;
    int temp_stride;
    int qp;
    int mpeg2;
    int prev_q;
    uint8_t *src;
    int16_t *temp;
    int bframes;
    char *non_b_qp;
};


#ifndef HAVE_MMX

//This func reads from 1 slice, 1 and clears 0 & 1
/*
 * Converts `height` rows of 16-bit accumulators in src into 8-bit pixels in dst.
 *
 * For each sample: add dither[y][pos] >> log2_scale, shift the sum right by
 * (6 - log2_scale), clamp and store the low byte to dst. The source sample and
 * the sample 8*src_stride elements before it are then set to 0.
 *
 * Samples are handled in groups of 8, so when width is not a multiple of 8 the
 * last group reads and writes past `width`. y indexes dither[] directly, so
 * height must not exceed 8. Strides are in elements of the respective arrays.
 *
 * Clamp: the test only looks at bit 8 of temp. When it is set, negative values
 * become 0 and positive values become -1 (0xFF once stored in a uint8_t). This
 * relies on an arithmetic right shift of negative ints.
 */
static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
{
    int y, x;

#define STORE(pos)                                                      \
    temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
    src[x + pos]=src[x + pos - 8*src_stride]=0;                         \
    if(temp & 0x100) temp= ~(temp>>31);                                 \
    dst[x + pos]= temp;

    for(y=0; y<height; y++){
        const uint8_t *d= dither[y];
        for(x=0; x<width; x+=8){
            int temp;
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
            STORE(5);
            STORE(6);
            STORE(7);
        }
        src+=src_stride;
        dst+=dst_stride;
    }
}

//This func reads from 2 slices, 0 & 2  and clears 2-nd
/*
 * Same conversion as store_slice_c(), but each output sample is built from two
 * accumulators: src[i] and src[i + 16*src_stride]. Only the second one is
 * cleared afterwards; src[i] itself is left untouched.
 *
 * The same group-of-8 and height <= 8 constraints apply.
 */
static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
{
    int y, x;

#define STORE2(pos)                                                                             \
    temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
    src[x + pos + 16*src_stride]=0;                                                             \
    if(temp & 0x100) temp= ~(temp>>31);                                                         \
    dst[x + pos]= temp;

    for(y=0; y<height; y++){
        const uint8_t *d= dither[y];
        for(x=0; x<width; x+=8){
            int temp;
            STORE2(0);
            STORE2(1);
            STORE2(2);
            STORE2(3);
            STORE2(4);
            STORE2(5);
            STORE2(6);
            STORE2(7);
        }
        src+=src_stride;
        dst+=dst_stride;
    }
}

/*
 * Fills the 64 16-bit entries of p->threshold_mtx with q times the matching
 * entry of p->threshold_mtx_noq. The product is truncated to short on store.
 */
static void mul_thrmat_c(struct vf_priv_s *p,int q)
{
    int a;
    for(a=0;a<64;a++)
        ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
}

/* Defined later in the file (outside this section). */
static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
static void row_idct_c(DCTELEM* workspace,
                       int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);

//this is rather ugly, but there is no need for function pointers

#define store_slice_s store_slice_c
#define store_slice2_s store_slice2_c
#define mul_thrmat_s mul_thrmat_c
#define column_fidct_s column_fidct_c
#define row_idct_s row_idct_c
#define row_fdct_s row_fdct_c

#else /* HAVE_MMX */

//This func reads from 1 slice, 1 and clears 0 & 1
/*
 * MMX implementation selected as store_slice_s when HAVE_MMX is defined.
 *
 * width is rounded up to a multiple of 8 and dst_stride is reduced by that
 * rounded width, because the asm advances dst by 8 per group and adds the
 * remainder at the end of each row. Rows are iterated by walking a pointer
 * through dither[] from dither[0] up to dither[height].
 *
 * NOTE(review): the asm stores into operand %1 (src_stride), which is declared
 * as an input, and the clobber list names only general-purpose registers (no
 * MMX registers, no "memory") although the asm writes through src and dst.
 * Left unchanged here - confirm this is safe with the compilers in use.
 */
static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
    const uint8_t *od=&dither[0][0];
    const uint8_t *end=&dither[height][0];
    width = (width+7)&~7;
    dst_stride-=width;
    //src_stride=(src_stride-width)*2;
    asm volatile(
        "mov %5, %%"REG_d"                \n\t"
        "mov %6, %%"REG_S"                \n\t"
        "mov %7, %%"REG_D"                \n\t"
        "mov %1, %%"REG_a"                \n\t"
        "movd %%"REG_d", %%mm5             \n\t"
        "xor $-1, %%"REG_d"              \n\t"
        "mov %%"REG_a", %%"REG_c"             \n\t"
        "add $7, %%"REG_d"               \n\t"
        "neg %%"REG_a"                   \n\t"
        "sub %0, %%"REG_c"            \n\t"
        "add %%"REG_c", %%"REG_c"             \n\t"
        "movd %%"REG_d", %%mm2             \n\t"
        "mov %%"REG_c", %1       \n\t"
        "mov %2, %%"REG_d"               \n\t"
        "shl $4, %%"REG_a"               \n\t"

        "2:                        \n\t"
        "movq (%%"REG_d"), %%mm3           \n\t"
        "movq %%mm3, %%mm4             \n\t"
        "pxor %%mm7, %%mm7             \n\t"
        "punpcklbw %%mm7, %%mm3        \n\t"
        "punpckhbw %%mm7, %%mm4        \n\t"
        "mov %0, %%"REG_c"            \n\t"
        "psraw %%mm5, %%mm3            \n\t"
        "psraw %%mm5, %%mm4            \n\t"
        "1:                        \n\t"
        "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
        "movq (%%"REG_S"), %%mm0           \n\t"
        "movq 8(%%"REG_S"), %%mm1          \n\t"

        "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
        "paddw %%mm3, %%mm0            \n\t"
        "paddw %%mm4, %%mm1            \n\t"

        "movq %%mm7, (%%"REG_S")           \n\t"
        "psraw %%mm2, %%mm0            \n\t"
        "psraw %%mm2, %%mm1            \n\t"

        "movq %%mm7, 8(%%"REG_S")          \n\t"
        "packuswb %%mm1, %%mm0         \n\t"
        "add $16, %%"REG_S"              \n\t"

        "movq %%mm0, (%%"REG_D")           \n\t"
        "add $8, %%"REG_D"               \n\t"
        "sub $8, %%"REG_c"               \n\t"
        "jg 1b                      \n\t"
        "add %1, %%"REG_S"       \n\t"
        "add $8, %%"REG_d"               \n\t"
        "add %3, %%"REG_D"       \n\t"
        "cmp %4, %%"REG_d"           \n\t"
        "jl 2b                      \n\t"

        :
        : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end),
          "m" (log2_scale), "m" (src), "m" (dst) //input
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
        );
}

//This func reads from 2 slices, 0 & 2  and clears 2-nd
/*
 * MMX implementation selected as store_slice2_s when HAVE_MMX is defined.
 *
 * Uses the same setup as store_slice_mmx() (width rounded up to a multiple of
 * 8, dst_stride pre-reduced, rows walked via a pointer into dither[]), but the
 * second-slice offset is src_stride << 5 bytes and is not negated.
 *
 * NOTE(review): as in store_slice_mmx(), the asm writes to input operand %1
 * and lists no MMX/"memory" clobbers. Left unchanged - confirm.
 */
static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
    const uint8_t *od=&dither[0][0];
    const uint8_t *end=&dither[height][0];
    width = (width+7)&~7;
    dst_stride-=width;
    //src_stride=(src_stride-width)*2;
    asm volatile(
        "mov %5, %%"REG_d"                \n\t"
        "mov %6, %%"REG_S"                \n\t"
        "mov %7, %%"REG_D"                \n\t"
        "mov %1, %%"REG_a"            \n\t"
        "movd %%"REG_d", %%mm5             \n\t"
        "xor $-1, %%"REG_d"              \n\t"
        "mov %%"REG_a", %%"REG_c"             \n\t"
        "add $7, %%"REG_d"               \n\t"
        "sub %0, %%"REG_c"            \n\t"
        "add %%"REG_c", %%"REG_c"             \n\t"
        "movd %%"REG_d", %%mm2             \n\t"
        "mov %%"REG_c", %1       \n\t"
        "mov %2, %%"REG_d"               \n\t"
        "shl $5, %%"REG_a"               \n\t"

        "2:                        \n\t"
        "movq (%%"REG_d"), %%mm3           \n\t"
        "movq %%mm3, %%mm4             \n\t"
        "pxor %%mm7, %%mm7             \n\t"
        "punpcklbw %%mm7, %%mm3        \n\t"
        "punpckhbw %%mm7, %%mm4        \n\t"
        "mov %0, %%"REG_c"            \n\t"
        "psraw %%mm5, %%mm3            \n\t"
        "psraw %%mm5, %%mm4            \n\t"
        "1:                        \n\t"
        "movq (%%"REG_S"), %%mm0           \n\t"
        "movq 8(%%"REG_S"), %%mm1          \n\t"
        "paddw %%mm3, %%mm0            \n\t"

        "paddw (%%"REG_S",%%"REG_a",), %%mm0    \n\t"
        "paddw %%mm4, %%mm1            \n\t"
        "movq 8(%%"REG_S",%%"REG_a",), %%mm6    \n\t"

        "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
        "psraw %%mm2, %%mm0            \n\t"
        "paddw %%mm6, %%mm1            \n\t"

        "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
        "psraw %%mm2, %%mm1            \n\t"
        "packuswb %%mm1, %%mm0         \n\t"

        "movq %%mm0, (%%"REG_D")           \n\t"
        "add $16, %%"REG_S"              \n\t"
        "add $8, %%"REG_D"               \n\t"
        "sub $8, %%"REG_c"               \n\t"
        "jg 1b                      \n\t"
        "add %1, %%"REG_S"       \n\t"
        "add $8, %%"REG_d"               \n\t"
        "add %3, %%"REG_D"       \n\t"
        "cmp %4, %%"REG_d"           \n\t"
        "jl 2b                      \n\t"

        :
        : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end),
          "m" (log2_scale), "m" (src), "m" (dst) //input
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
        );
}

/*
 * MMX implementation selected as mul_thrmat_s when HAVE_MMX is defined.
 *
 * Replicates the low 16 bits of q into all four words of mm7, then multiplies
 * 16 quadwords (64 16-bit entries) read from p->threshold_mtx_noq and stores
 * the low 16 bits of each product 8*8*2 bytes further on, i.e. into
 * p->threshold_mtx given the field order in struct vf_priv_s.
 *
 * NOTE(review): the asm text names REG_S/REG_D directly while the operands are
 * bound with "+S"/"+D" on the same variable, and no MMX registers or "memory"
 * are listed as clobbered. Left unchanged - confirm.
 */
static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
{
    uint64_t *adr=&p->threshold_mtx_noq[0];
    asm volatile(
        "movd %0, %%mm7                \n\t"
        "add $8*8*2, %%"REG_D"            \n\t"
        "movq 0*8(%%"REG_S"), %%mm0        \n\t"
        "punpcklwd %%mm7, %%mm7        \n\t"
        "movq 1*8(%%"REG_S"), %%mm1        \n\t"
        "punpckldq %%mm7, %%mm7        \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq 2*8(%%"REG_S"), %%mm2        \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq 3*8(%%"REG_S"), %%mm3        \n\t"
        "pmullw %%mm7, %%mm2           \n\t"

        "movq %%mm0, 0*8(%%"REG_D")        \n\t"
        "movq 4*8(%%"REG_S"), %%mm4        \n\t"
        "pmullw %%mm7, %%mm3           \n\t"

        "movq %%mm1, 1*8(%%"REG_D")        \n\t"
        "movq 5*8(%%"REG_S"), %%mm5        \n\t"
        "pmullw %%mm7, %%mm4           \n\t"

        "movq %%mm2, 2*8(%%"REG_D")        \n\t"
        "movq 6*8(%%"REG_S"), %%mm6        \n\t"
        "pmullw %%mm7, %%mm5           \n\t"

        "movq %%mm3, 3*8(%%"REG_D")        \n\t"
        "movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t"
        "pmullw %%mm7, %%mm6           \n\t"

        "movq %%mm4, 4*8(%%"REG_D")        \n\t"
        "movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq %%mm5, 5*8(%%"REG_D")        \n\t"
        "movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq %%mm6, 6*8(%%"REG_D")        \n\t"
        "movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
        "pmullw %%mm7, %%mm2           \n\t"

        "movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
        "movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
        "pmullw %%mm7, %%mm3           \n\t"

        "movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
        "movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
        "pmullw %%mm7, %%mm4           \n\t"

        "movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
        "movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
        "pmullw %%mm7, %%mm5           \n\t"

        "movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
        "movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t"
        "pmullw %%mm7, %%mm6           \n\t"

        "movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
        "movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
        "movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
        "movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"
        : "+g" (q), "+S" (adr), "+D" (adr)
        :
        );
}

/* Defined later in the file (outside this section). */
static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt);
static void row_idct_mmx(DCTELEM* workspace,
                         int16_t* output_adr,  int output_stride,  int cnt);
static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt);

#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
#define mul_thrmat_s mul_thrmat_mmx
#define column_fidct_s column_fidct_mmx
#define row_idct_s row_idct_mmx
#define row_fdct_s row_fdct_mmx
#endif // HAVE_MMX

static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma){
    int x, x0, y, es, qy, t;
    const int stride= is_luma ? 
p->temp_stride : (width+16);//((width+16+15)&(~15))    const int step=6-p->log2_count;    const int qps= 3 + is_luma;     int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];    DCTELEM *block= (DCTELEM *)block_align;    DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);        memset(block3, 0, 4*8*BLOCKSZ);    //p->src=src-src_stride*8-8;//!        if (!src || !dst) return; // HACK avoid crash for Y8 colourspace    for(y=0; y<height; y++){        int index= 8 + 8*stride + y*stride;        fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers        for(x=0; x<8; x++){ 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?