📄 fdct_aan_ref.c

📁 motion Jpeg 在SPI DSP平台优化好的代码
💻 C
字号:
////////////////////////////////////////////////////////////////////////////////////////////////////////      Title:          fdct_aan_ref.sc  (C Reference code for FDCT8x8+Quantization)////      Notice:         COPYRIGHT (C) STREAM PROCESSORS, INC. 2005-2007//                      THIS PROGRAM IS PROVIDED UNDER THE TERMS OF THE SPI//                      END-USER LICENSE AGREEMENT (EULA). THE PROGRAM MAY ONLY//                      BE USED IN A MANNER EXPLICITLY SPECIFIED IN THE EULA,//                      WHICH INCLUDES LIMITATIONS ON COPYING, MODIFYING,//                      REDISTRIBUTION AND WARANTIES. UNAUTHORIZED USE OF THIS//                      PROGRAM IS STRICTLY PROHIBITED. YOU MAY OBTAIN A COPY OF//                      THE EULA FROM WWW.STREAMPROCESSORS.COM. //    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////      #includes ////////////////////////////////////////////////////////////////////////////////////////////////////#include "spi_common.h"#include "jpege_context.h"//////////////////////////////////////////////////////////////////////////////////////////////////////// Implementation details.  This code is modified from IJG code. File jfdctfst.c//// jfdctfst.c// Copyright (C) 1994-1996, Thomas G. Lane.// This file is part of the Independent JPEG Group's software.// For conditions of distribution and use, see the accompanying README file.// // This file contains a fast, not so accurate integer implementation of the// forward DCT (Discrete Cosine Transform).// // A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT// on each column.  Direct algorithms are also available, but they are// much more complex and seem not to be any faster when reduced to code.// // This implementation is based on Arai, Agui, and Nakajima's algorithm for// scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in// Japanese, but the algorithm is described in the Pennebaker & Mitchell// JPEG textbook (see REFERENCES section in file README).  The following code// is based directly on figure 4-8 in P&M.// While an 8-point DCT cannot be done in less than 11 multiplies, it is// possible to arrange the computation so that many of the multiplies are// simple scalings of the final outputs.  These multiplies can then be// folded into the multiplications or divisions by the JPEG quantization// table entries.  The AA&N method leaves only 5 multiplies and 29 adds// to be done in the DCT itself.// The primary disadvantage of this method is that with fixed-point math,// accuracy is lost due to imprecise representation of the scaled// quantization values.  The smaller the quantization table entry, the less// precise the scaled value, so this implementation does worse with high-// quality-setting files than with low-quality ones.// // This module is specialized to the case DCTSIZE = 8.// Scaling decisions are generally the same as in the LL&M algorithm;// see jfdctint.c for more details.  However, we choose to descale// (right shift) multiplication products as soon as they are formed,// rather than carrying additional fractional bits into subsequent additions.// This compromises accuracy slightly, but it lets us save a few shifts.// More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)// everywhere except in the multiplications proper; this saves a good deal// of work on 16-bit-int machines.// // Again to save a few shifts, the intermediate results between pass 1 and// pass 2 are not upscaled, but are represented only to integral precision.// // A final compromise is to represent the multiplicative constants to only// 8 fractional bits, rather than 13.  This saves some shifting work on some// machines, and may also reduce the cost of multiplication (since there// are fewer one-bits in the constants).// // Some C compilers fail to reduce "FIX(constant)" at compile time, thus// causing a lot of useless floating-point operations at run time.// To get around this we use the following pre-calculated constants.// If you change CONST_BITS you may want to add appropriate values.// (With a reasonable C compiler, you can just rely on the FIX() macro...)//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////      Constants////////////////////////////////////////////////////////////////////////////////#if AAN_TCONST_BITS == 15#define FIX_0_382683433  ((int)  12540)     // 0x30FC30FC   //((int)  12540)   // FIX(0.382683433) Q1.15#define FIX_0_541196100  ((int)  17734)     // 0x45464546   //((int)  17734)   // FIX(0.541196100) Q1.15#define FIX_0_707106781  ((int)  23170)     // 0x5A825A82   //((int)  23170)   // FIX(0.707106781) Q1.15#define FIX_1_306562965  ((int)  21407)     // FIX(1.306562965) Q2.14#elif AAN_TCONST_BITS == 14#define FIX_0_382683433  ((int)  6270)      // FIX(0.382683433)#define FIX_0_541196100  ((int)  8867)      // FIX(0.541196100)#define FIX_0_707106781  ((int)  11585)     // FIX(0.707106781)#define FIX_1_306562965  ((int)  21407)     // FIX(1.306562965)#elif AAN_TCONST_BITS == 12#define FIX_0_382683433  ((int)  1567)      //(0.382683433 * (1 << AAN_TCONST_BITS))      FIX(0.382683433) #define FIX_0_541196100  ((int)  2217)      // FIX(0.541196100) #define FIX_0_707106781  ((int)  2896)      // FIX(0.707106781) #define FIX_1_306562965  ((int)  5352)      // FIX(1.306562965) #elif AAN_TCONST_BITS == 8#define FIX_0_382683433  ((int)   98)	    // FIX(0.382683433) #define FIX_0_541196100  ((int)  139)	    // FIX(0.541196100) #define FIX_0_707106781  ((int)  181)	    // FIX(0.707106781) #define FIX_1_306562965  ((int)  334)	    // FIX(1.306562965) #else#define FIX_0_382683433  FIX(0.382683433)#define FIX_0_541196100  FIX(0.541196100)#define FIX_0_707106781  FIX(0.707106781)#define FIX_1_306562965  FIX(1.306562965)#endif///////////////////////////////////////////////////////////////////////short quantize_ref ( short data,  unsigned short scale)// Description: Input 'data' is quantized by using the 'scale' value //              (quantization divisor).  The result is rounded and shifted//              to retain the most precise part of the result.////	Returns:    short.///////////////////////////////////////////////////////////////////////{    if (data < 0)    {	    data = -data;        data = (data * (short)scale + DIV_CONST_HALF) >> DIV_CONST_BITS;	    data = -data;	}     else    {        data = (data * (short)scale + DIV_CONST_HALF) >> DIV_CONST_BITS;            	}    return data;}///////////////////////////////////////////////////////////////////////void fdct_and_quantize_aan_ref ( char *p_in,  short *p_out,  unsigned short *p_quant_divisor)// Description: From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui,//              Nakajima.  This algorithm is based on the 16-pt DFT.  Basically,//              the 8-pt DCT can be calculated by scaling the real parts of the//              output of the 16-pt DFT.////	Returns:    Nothing.///////////////////////////////////////////////////////////////////////{    short  a0, a1, a2, a3, a4, a5, a6, a7;    short  b0, b1, b2, b3;    short  z1, z2, z3, z4, z5, z11, z13;    char  *p_in_ptr;    short *p_out_ptr;    unsigned short *p_divisor;    int  i, j, k;    // Pass 1: process columns to match with the kernel code    // Pass 1: process rows.    p_in_ptr  = p_in;    p_out_ptr = p_out;    for (i = 0; i < BLOCK_WIDTH; i++)    {        a0 = p_in_ptr[BLOCK_WIDTH * 0] + p_in_ptr[BLOCK_WIDTH * 7];        a7 = p_in_ptr[BLOCK_WIDTH * 0] - p_in_ptr[BLOCK_WIDTH * 7];        a1 = p_in_ptr[BLOCK_WIDTH * 1] + p_in_ptr[BLOCK_WIDTH * 6];        a6 = p_in_ptr[BLOCK_WIDTH * 1] - p_in_ptr[BLOCK_WIDTH * 6];        a2 = p_in_ptr[BLOCK_WIDTH * 2] + p_in_ptr[BLOCK_WIDTH * 5];        a5 = p_in_ptr[BLOCK_WIDTH * 2] - p_in_ptr[BLOCK_WIDTH * 5];        a3 = p_in_ptr[BLOCK_WIDTH * 3] + p_in_ptr[BLOCK_WIDTH * 4];        a4 = p_in_ptr[BLOCK_WIDTH * 3] - p_in_ptr[BLOCK_WIDTH * 4];        // Even part         b0 = a0 + a3;	                            // phase 2         b3 = a0 - a3;        b1 = a1 + a2;        b2 = a1 - a2;                p_out_ptr[BLOCK_WIDTH * 0] = b0 + b1;       // phase 3         p_out_ptr[BLOCK_WIDTH * 4] = b0 - b1;        z1 = (((b2 + b3) * FIX_0_707106781) + 0x4000) >> AAN_TCONST_BITS;   // c4         p_out_ptr[BLOCK_WIDTH * 2] = b3 + z1;       // phase 5         p_out_ptr[BLOCK_WIDTH * 6] = b3 - z1;        // Odd part         b0 = a4 + a5;	                            // phase 2         b1 = a5 + a6;        b2 = a6 + a7;		// The following changes were necessary to maintain bit-exactness b/w streamC and Reference C.		// Constant 0x4000 added to the multiplication result is simulate spi_vmulra16i() rounding.		// FIX_1_306562965 is in Q2.14 with the rest of the co-efficients in Q1.15.		// An additional left shift is seen to convert the Q2.14 result back into Q1.15.        // The rotator is modified from fig 4-8 to avoid extra negations.         z5 = (((b0 - b2) * FIX_0_382683433) + 0x4000) >> AAN_TCONST_BITS;           // c6         z2 = (((b0 * FIX_0_541196100) + 0x4000) >> AAN_TCONST_BITS) + z5;           // c2-c6         z4 = ((((b2 * FIX_1_306562965) + 0x4000) >> AAN_TCONST_BITS) << 1) + z5;    // c2+c6         z3 = ((b1 * FIX_0_707106781) + 0x4000) >> AAN_TCONST_BITS;                  // c4         z11 = a7 + z3;		                    // phase 5         z13 = a7 - z3;        p_out_ptr[BLOCK_WIDTH*5] = z13 + z2;    // phase 6         p_out_ptr[BLOCK_WIDTH*3] = z13 - z2;        p_out_ptr[BLOCK_WIDTH*1] = z11 + z4;        p_out_ptr[BLOCK_WIDTH*7] = z11 - z4;        p_in_ptr++;        p_out_ptr++;			                // advance pointer to next column     }    p_out_ptr = p_out;    for (i = 0; i < BLOCK_HEIGHT; i++)    {        a0 = p_out_ptr[0] + p_out_ptr[7];        a7 = p_out_ptr[0] - p_out_ptr[7];        a1 = p_out_ptr[1] + p_out_ptr[6];        a6 = p_out_ptr[1] - p_out_ptr[6];        a2 = p_out_ptr[2] + p_out_ptr[5];        a5 = p_out_ptr[2] - p_out_ptr[5];        a3 = p_out_ptr[3] + p_out_ptr[4];        a4 = p_out_ptr[3] - p_out_ptr[4];        // Even part         b0 = a0 + a3;	        // phase 2         b3 = a0 - a3;        b1 = a1 + a2;        b2 = a1 - a2;        p_out_ptr[0] = b0 + b1; // phase 3         p_out_ptr[4] = b0 - b1;        z1 = (((b2 + b3) * FIX_0_707106781) + 0x4000) >> AAN_TCONST_BITS;   // c4         p_out_ptr[2] = b3 + z1;	// phase 5         p_out_ptr[6] = b3 - z1;        // Odd part         b0 = a4 + a5;	        // phase 2         b1 = a5 + a6;        b2 = a6 + a7;        // The rotator is modified from fig 4-8 to avoid extra negations.         z5 = (((b0 - b2) * FIX_0_382683433) + 0x4000) >> AAN_TCONST_BITS;           // c6         z2 = (((b0 * FIX_0_541196100) + 0x4000) >> AAN_TCONST_BITS) + z5;           // c2-c6         z4 = ((((b2 * FIX_1_306562965) + 0x4000) >> AAN_TCONST_BITS) << 1) + z5;    // c2+c6         z3 = ((b1 * FIX_0_707106781) + 0x4000) >> AAN_TCONST_BITS;                  // c4         z11 = a7 + z3;		        // phase 5         z13 = a7 - z3;        p_out_ptr[5] = z13 + z2;	// phase 6         p_out_ptr[3] = z13 - z2;        p_out_ptr[1] = z11 + z4;        p_out_ptr[7] = z11 - z4;        p_out_ptr += BLOCK_WIDTH;	// advance pointer to next row     }    // Pass 3: quantize.    p_out_ptr = p_out;    p_divisor = p_quant_divisor;    for (i = 0; i < BLOCK_WIDTH; i++)    {        for (j = 0; j < BLOCK_HEIGHT; j++)        {            k = i + j * BLOCK_WIDTH;            p_out_ptr[k] = quantize_ref (p_out_ptr[k], p_divisor[k]);        }    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -