📄 filter_block_kc.h
字号:
// -------------------------------------------------------------------
// (c) 2005 Stream Processors, Inc. All rights reserved.
// This Software is the property of Stream Processors, Inc. (SPI) and
// is Proprietary and Confidential. It has been provided under
// license for solely use in evaluating and/or developing code for a
// stream processor device. Any use of the Software to develop code
// for a semiconductor device not manufactured by or for SPI is
// prohibited. Unauthorized use of this Software is strictly
// prohibited.
//
// THIS SOFTWARE IS PROVIDED "AS IS". NO WARRANTIES ARE GIVEN,
// WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING WARRANTIES OR
// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE,
// NONINFRINGEMENT AND TITLE. RECIPIENT SHALL HAVE THE SOLE
// RESPONSIBILITY FOR THE ADEQUATE PROTECTION AND BACK-UP OF ITS DATA
// USED IN CONNECTION WITH THIS SOFTWARE. IN NO EVENT WILL SPI BE
// LIABLE FOR ANY CONSEQUENTIAL DAMAGES WHATSOEVER, INCLUDING LOSS OF
// DATA OR USE, LOST PROFITS OR ANY INCIDENTAL OR SPECIAL DAMAGES,
// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
// SOFTWARE, WHETHER IN ACTION OF CONTRACT OR TORT, INCLUDING
// NEGLIGENCE.
SPI FURTHER DISCLAIMS ANY LIABILITY WHATSOEVER FOR// INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD// PARTY.// -------------------------------------------------------------------//--------------------------------------------------------------------// File: $File: //depot/main/software/demo/deblock/src/filter_block_kc.h $// Revision: $Revision: #1 $// Last Modified: $DateTime: 2007/12/10 16:59:57 $//--------------------------------------------------------------------#include "spi_common.h"// Transpose a 2x2 matrix across 2 neighboring clustersinline kernel void transpose_2x2_select(vec uint8x4 buf_i0(in), vec uint8x4 buf_i1(in), vec uint32x1 rcv_ctrl(in), vec bool32x1 selbool(in), vec uint8x4 buf_o0(out), vec uint8x4 buf_o1(out)){ vec uint8x4 rcv_in,rcv_out,sel_in; //hi_lo(sel_in,rcv_in) = selectd(selbool, buf_i1, buf_i0); sel_in = spi_vselect8((vec uint8x4)selbool, buf_i1, buf_i0); rcv_in = spi_vselect8((vec uint8x4)selbool, buf_i0, buf_i1); rcv_out = (vec uint8x4)spi_vperm32(rcv_ctrl, (vec uint32x1)rcv_in, 0); //hi_lo(buf_o1,buf_o0) = selectd(selbool, sel_in, rcv_out); buf_o1 = spi_vselect8((vec uint8x4)selbool, sel_in, rcv_out); buf_o0 = spi_vselect8((vec uint8x4)selbool, rcv_out, sel_in);}// Transpose a 4x4 matrix across 4 neighboring clustersinline kernel void transpose_4x4_select(vec uint8x4 buf_i0(in), vec uint8x4 buf_i1(in), vec uint8x4 buf_i2(in), vec uint8x4 buf_i3(in), vec uint32x1 rcv_ctrl0(in), vec bool32x1 selbool0(in), vec uint32x1 rcv_ctrl1(in), vec bool32x1 selbool1(in), vec uint8x4 buf_o0(out), vec uint8x4 buf_o1(out), vec uint8x4 buf_o2(out), vec uint8x4 buf_o3(out)){ vec uint8x4 tmp0, tmp1, tmp2, tmp3; // First stage - do transpose within inner 2x2 matrices transpose_2x2_select(buf_i0, buf_i1, rcv_ctrl0, selbool0, tmp0, tmp1); transpose_2x2_select(buf_i2, buf_i3, rcv_ctrl0, selbool0, tmp2, tmp3); // Second stage - do transpose of 2x2 matrices transpose_2x2_select(tmp0, tmp2, rcv_ctrl1, selbool1, buf_o0, buf_o2); 
transpose_2x2_select(tmp1, tmp3, rcv_ctrl1, selbool1, buf_o1, buf_o3); }// Transpose a 8x8 matrix across 8 neighboring clustersinline kernel void transpose_8x8_select(vec uint8x4 buf_i0(in), vec uint8x4 buf_i1(in), vec uint8x4 buf_i2(in), vec uint8x4 buf_i3(in), vec uint8x4 buf_i4(in), vec uint8x4 buf_i5(in), vec uint8x4 buf_i6(in), vec uint8x4 buf_i7(in), vec uint32x1 rcv_ctrl0(in), vec bool32x1 selbool0(in), vec uint32x1 rcv_ctrl1(in), vec bool32x1 selbool1(in), vec uint32x1 rcv_ctrl2(in), vec bool32x1 selbool2(in), vec uint8x4 buf_o0(out), vec uint8x4 buf_o1(out), vec uint8x4 buf_o2(out), vec uint8x4 buf_o3(out), vec uint8x4 buf_o4(out), vec uint8x4 buf_o5(out), vec uint8x4 buf_o6(out), vec uint8x4 buf_o7(out)){ vec uint8x4 tmp_a0; vec uint8x4 tmp_a1; vec uint8x4 tmp_a2; vec uint8x4 tmp_a3; vec uint8x4 tmp_a4; vec uint8x4 tmp_a5; vec uint8x4 tmp_a6; vec uint8x4 tmp_a7; vec uint8x4 tmp_b0; vec uint8x4 tmp_b1; vec uint8x4 tmp_b2; vec uint8x4 tmp_b3; vec uint8x4 tmp_b4; vec uint8x4 tmp_b5; vec uint8x4 tmp_b6; vec uint8x4 tmp_b7; // First stage - do transpose within inner 2x2 matrices transpose_2x2_select(buf_i0, buf_i1, rcv_ctrl0, selbool0, tmp_a0, tmp_a1); transpose_2x2_select(buf_i2, buf_i3, rcv_ctrl0, selbool0, tmp_a2, tmp_a3); transpose_2x2_select(buf_i4, buf_i5, rcv_ctrl0, selbool0, tmp_a4, tmp_a5); transpose_2x2_select(buf_i6, buf_i7, rcv_ctrl0, selbool0, tmp_a6, tmp_a7); transpose_2x2_select(tmp_a0, tmp_a2, rcv_ctrl1, selbool1, tmp_b0, tmp_b2); transpose_2x2_select(tmp_a1, tmp_a3, rcv_ctrl1, selbool1, tmp_b1, tmp_b3); transpose_2x2_select(tmp_a4, tmp_a6, rcv_ctrl1, selbool1, tmp_b4, tmp_b6); transpose_2x2_select(tmp_a5, tmp_a7, rcv_ctrl1, selbool1, tmp_b5, tmp_b7); // Second stage - do transpose of 2x2 matrices transpose_2x2_select(tmp_b0, tmp_b4, rcv_ctrl2, selbool2, buf_o0, buf_o4); transpose_2x2_select(tmp_b1, tmp_b5, rcv_ctrl2, selbool2, buf_o1, buf_o5); transpose_2x2_select(tmp_b2, tmp_b6, rcv_ctrl2, selbool2, buf_o2, buf_o6); 
transpose_2x2_select(tmp_b3, tmp_b7, rcv_ctrl2, selbool2, buf_o3, buf_o7); }//------------------------------------------------------------------inline kernel void transpose_macroblock16x2//------------------------------------------------------------------// KernelC inline function to transpose a macroblock between 8 clusters// Each cluster has 2 rows of pixels, each of packed words consist of// 2 (lower) bytes from 2 rows and upper 2 bytes are 0s.// On the output also it is required to pack the 2 column pixels in lower// bytes with upper 2 bytes being 0. This allows deblocking be using// packed operands.//--------------------------------------------------------------------( // Input: Control values vec bool32x1 cid_b2(in), vec bool32x1 cid_b1(in), vec bool32x1 cid_b0(in), vec uint32x1 perm_b2(in), vec uint32x1 perm_b1(in), vec uint32x1 perm_b0(in), // Input: blocks to be transformed (4 half blocks per cluster) vec uint8x4 cur0_p3(in), vec uint8x4 cur0_p2(in), vec uint8x4 cur0_p1(in), vec uint8x4 cur0_p0(in), vec uint8x4 cur1_p3(in), vec uint8x4 cur1_p2(in), vec uint8x4 cur1_p1(in), vec uint8x4 cur1_p0(in), vec uint8x4 cur2_p3(in), vec uint8x4 cur2_p2(in), vec uint8x4 cur2_p1(in), vec uint8x4 cur2_p0(in), vec uint8x4 cur3_p3(in), vec uint8x4 cur3_p2(in), vec uint8x4 cur3_p1(in), vec uint8x4 cur3_p0(in), // Output: transposed blocks (4 per cluster) vec uint8x4 t_cur0_p3(out), vec uint8x4 t_cur0_p2(out), vec uint8x4 t_cur0_p1(out), vec uint8x4 t_cur0_p0(out), vec uint8x4 t_cur1_p3(out), vec uint8x4 t_cur1_p2(out), vec uint8x4 t_cur1_p1(out), vec uint8x4 t_cur1_p0(out), vec uint8x4 t_cur2_p3(out), vec uint8x4 t_cur2_p2(out), vec uint8x4 t_cur2_p1(out), vec uint8x4 t_cur2_p0(out), vec uint8x4 t_cur3_p3(out), vec uint8x4 t_cur3_p2(out), vec uint8x4 t_cur3_p1(out), vec uint8x4 t_cur3_p0(out) ){ vec uint8x4 tmp0_0, tmp0_1, tmp0_2, tmp0_3, tmp0_4, tmp0_5, tmp0_6, tmp0_7; vec uint8x4 tmp1_0, tmp1_1, tmp1_2, tmp1_3, tmp1_4, tmp1_5, tmp1_6, tmp1_7; // pack the bytes in each 
cluster such that each word has // 2 bytes from top row and 2 from bottom row. These bytes // are ordered such that no more transposes would be needed // within the word itself. This helps the transpose to boil // down to 8x8 8 cluster transform. tmp0_0 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur0_p3, (vec uint32x1)cur0_p2); tmp0_1 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur0_p1, (vec uint32x1)cur0_p0); tmp0_2 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur1_p3, (vec uint32x1)cur1_p2); tmp0_3 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur1_p1, (vec uint32x1)cur1_p0); tmp0_4 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur2_p3, (vec uint32x1)cur2_p2); tmp0_5 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur2_p1, (vec uint32x1)cur2_p0); tmp0_6 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur3_p3, (vec uint32x1)cur3_p2); tmp0_7 = (vec uint8x4)spi_vshuffleu(0x06020400, (vec uint32x1)cur3_p1, (vec uint32x1)cur3_p0); // Use standard 8x8 transpose from Brucek's code. 
transpose_8x8_select(tmp0_0, tmp0_1, tmp0_2, tmp0_3, tmp0_4, tmp0_5, tmp0_6, tmp0_7, perm_b0, cid_b0, perm_b1, cid_b1, perm_b2, cid_b2, tmp1_0, tmp1_1, tmp1_2, tmp1_3, tmp1_4, tmp1_5, tmp1_6, tmp1_7); // Now unpack the transposed to the liking of deblocking // kernel t_cur0_p2 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_0, 0); t_cur0_p3 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_0, 0); t_cur0_p0 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_1, 0); t_cur0_p1 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_1, 0); t_cur1_p2 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_2, 0); t_cur1_p3 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_2, 0); t_cur1_p0 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_3, 0); t_cur1_p1 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_3, 0); t_cur2_p2 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_4, 0); t_cur2_p3 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_4, 0); t_cur2_p0 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_5, 0); t_cur2_p1 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_5, 0); t_cur3_p2 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_6, 0); t_cur3_p3 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_6, 0); t_cur3_p0 = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)tmp1_7, 0); t_cur3_p1 = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)tmp1_7, 0);}//------------------------------------------------------------------// FunctionName: Clip3//// Description:// Return Val clipped to [MinVal, MaxVal]//------------------------------------------------------------------inline kernel void clip3_32i(vec int32x1 min_val(in), vec int32x1 max_val(in), vec int32x1 val(in), vec int32x1 ret_val(out)){ ret_val = spi_vmin32i(max_val, spi_vmax32i(min_val, val));}inline kernel void clip3_8i(vec 
int8x4 min_val(in), vec int8x4 max_val(in), vec int8x4 val(in), vec int8x4 ret_val(out)){ ret_val = spi_vmin8i(max_val, spi_vmax8i(min_val, val));}inline kernel void clip3_16i(vec int16x2 min_val(in), vec int16x2 max_val(in), vec int16x2 val(in), vec int16x2 ret_val(out)){ ret_val = spi_vmin16i(max_val, spi_vmax16i(min_val, val));}//------------------------------------------------------------------// FunctionName: MatrixMult4x4(U)Int4x4//// Description:// This function does a 4x4 by 4x4 matrix multiply and vector add://// T T T T// [(t_p3, t_q3), [(CoeffP , CoeffP, CoeffP , CoeffP ), [add_val,// (t_p2, t_q2), * + add_val,// (t_p1, t_q1), T T T T add_val,// (t_p0, t_q0)] (CoeffQ , CoeffQ, CoeffQ , CoeffQ )] add_val]//// Inputs: t_p*, t_q*: 1x4 vectors
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -