mpv_sparc.c

来自「支持SSL v2/v3, TLS, PKCS #5, PKCS #7, PKCS」· C语言 代码 · 共 350 行

C
350
字号
/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is a SPARC/VIS optimized multiply and add function
 *
 * The Initial Developer of the Original Code is Sun Microsystems Inc.
 * Portions created by Sun Microsystems Inc. are
 * Copyright (C) 1999-2000 Sun Microsystems Inc.  All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the
 * terms of the GNU General Public License Version 2 or later (the
 * "GPL"), in which case the provisions of the GPL are applicable
 * instead of those above.	If you wish to allow use of your
 * version of this file only under the terms of the GPL and not to
 * allow others to use your version of this file under the MPL,
 * indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by
 * the GPL.  If you do not delete the provisions above, a recipient
 * may use your version of this file under either the MPL or the
 * GPL.
 *  $Id: mpv_sparc.c,v 1.1 2000/09/29 23:38:01 nelsonb%netscape.com Exp $
 */

#include "vis_proto.h"

/***************************************************************/

typedef  int                t_s32;
typedef  unsigned int       t_u32;
#if defined(__sparcv9)
typedef  long               t_s64;
typedef  unsigned long      t_u64;
#else
typedef  long long          t_s64;
typedef  unsigned long long t_u64;
#endif
typedef  double             t_d64;

/***************************************************************/

/*
 * Overlay used to read the two 32-bit halves of a 64-bit VIS result
 * as signed integers.
 */
typedef union {
  t_d64 d64;
  struct {
    t_s32 i0;
    t_s32 i1;
  } i32s;
} d64_2_i32;

/***************************************************************/

/* Maximum number of 32-bit words handled per pass of the generic path. */
#define BUFF_SIZE  256

/*
 * Multipliers below 2^A_BITS are handled with a single floating point
 * product per word; larger multipliers are split at this bit position
 * into a low part (a & A_MASK) and a high part (a & ~A_MASK).
 */
#define A_BITS  21
#define A_MASK  ((1 << A_BITS) - 1)

/***************************************************************/

/* Sign bit of each 32-bit half; loaded as a double and fed to vis_fxnor. */
static t_u64 mask_cnst[] = {
  0x8000000080000000ull
};

/***************************************************************/

/*
 * Declares the locals shared by every code path of mul_add():
 *   py   - y viewed as 64-bit words (two multiplicand words per load)
 *   mask - bit pattern of mask_cnst reinterpreted as a double
 *   ca   - 2^31 - 1, the constant that undoes the bias applied by the
 *          XNOR with mask (see MUL_U32_S64_2)
 *   da   - the multiplier as a double (replaced by its low part in the
 *          split-multiplier paths)
 *   buff - N partial products; s - scratch sum; dy - XNOR result
 */
#define DEF_VARS(N)                     \
  t_d64 *py = (t_d64*)y;                \
  t_d64 mask = *((t_d64*)mask_cnst);    \
  t_d64 ca = (1u << 31) - 1;            \
  t_d64 da = (t_d64)a;                  \
  t_s64 buff[N], s;                     \
  d64_2_i32 dy

/***************************************************************/

/*
 * Single-word variants.
 * NOTE(review): none of the MUL_U32_S64_1* macros is expanded anywhere
 * in this file.  They reference `dy.f32s`, which is not a member of
 * d64_2_i32 as declared above, and the type `t_f32`, which is not
 * defined in this file (possibly provided by vis_proto.h - confirm),
 * so they would need fixing before being used.
 */
#define MUL_U32_S64_1(i)                                        \
  dy.f32s.i0 = vis_fxnors(vis_read_hi(mask), ((t_f32*)y)[0]);   \
  buff[0] = (ca - (t_d64)dy.i32s.i0) * da

/* The trailing line continuation after the last statement was dropped
 * so that this definition cannot absorb the following #define. */
#define MUL_U32_S64_1_D()                                       \
  dy.f32s.i0 = vis_fxnors(vis_read_hi(mask), ((t_f32*)y)[0]);   \
  d0 = ca - (t_d64)dy.i32s.i0;                                  \
  buff[0] = (t_s64)(d0 * da);                                   \
  buff[1] = (t_s64)(d0 * db);

#define MUL_U32_S64_1_E(i)                                      \
  dy.f32s.i0 = vis_fxnors(vis_read_hi(mask), ((t_f32*)y)[0]);   \
  d0 = ca - (t_d64)dy.i32s.i0;                                  \
  buff[0] = (t_s64)(d0 * da);                                   \
  buff[1] = (t_s64)(d0 * db - (1ULL << 63))

/***************************************************************/

/*
 * Two-word variants: multiply the word pair py[i] by the multiplier.
 *
 * Assuming vis_fxnor is a bitwise XNOR, each 32-bit half w of py[i]
 * becomes ~(w ^ 0x80000000), which read as a signed integer equals
 * (2^31 - 1) - w.  Subtracting that from ca = 2^31 - 1 therefore yields
 * the unsigned value of w as a double, using only a signed
 * integer-to-double conversion.
 *
 * MUL_U32_S64_2   : one product per word (multiplier in da).
 * MUL_U32_S64_2_D : two products per word (low part da, high part db),
 *                   stored in adjacent buff slots.
 * MUL_U32_S64_2_E : as _D, but the high product is biased by -2^63
 *                   before conversion to t_s64; ADD_S64_U32_E adds the
 *                   2^63 back.
 */
#define MUL_U32_S64_2(i)                                \
  dy.d64 = vis_fxnor(mask, py[i]);                      \
  buff[2*(i)  ] = (ca - (t_d64)dy.i32s.i0) * da;        \
  buff[2*(i)+1] = (ca - (t_d64)dy.i32s.i1) * da

#define MUL_U32_S64_2_D(i)              \
  dy.d64 = vis_fxnor(mask, py[i]);      \
  d0 = ca - (t_d64)dy.i32s.i0;          \
  d1 = ca - (t_d64)dy.i32s.i1;          \
  buff[4*(i)  ] = (t_s64)(d0 * da);     \
  buff[4*(i)+1] = (t_s64)(d0 * db);     \
  buff[4*(i)+2] = (t_s64)(d1 * da);     \
  buff[4*(i)+3] = (t_s64)(d1 * db)

#define MUL_U32_S64_2_E(i)                              \
  dy.d64 = vis_fxnor(mask, py[i]);                      \
  d0 = ca - (t_d64)dy.i32s.i0;                          \
  d1 = ca - (t_d64)dy.i32s.i1;                          \
  buff[4*(i)  ] = (t_s64)(d0 * da);                     \
  buff[4*(i)+1] = (t_s64)(d0 * db - (1ULL << 63));      \
  buff[4*(i)+2] = (t_s64)(d1 * da);                     \
  buff[4*(i)+3] = (t_s64)(d1 * db - (1ULL << 63))

/***************************************************************/

/*
 * Accumulate step for word i: add the buffered product(s), x[i] and the
 * running carry, store the low 32 bits in z[i] and keep the upper bits
 * as the next carry.
 *
 * ADD_S64_U32   : one product per word, signed carry c.
 * ADD_S64_U32_D : two products per word, signed carry c.
 * ADD_S64_U32_E : two products per word, unsigned carry uc; re-adds the
 *                 2^63 removed by MUL_U32_S64_2_E.
 */
#define ADD_S64_U32(i)          \
  s = buff[i] + x[i] + c;       \
  z[i] = s;                     \
  c = (s >> 32)

#define ADD_S64_U32_D(i)                        \
  s = buff[2*(i)] + buff[2*(i)+1] + x[i] + c;   \
  z[i] = s;                                     \
  c = (s >> 32)

#define ADD_S64_U32_E(i)                                        \
  s =  buff[2*(i)] + buff[2*(i)+1] + x[i] + uc + (1ULL << 63);  \
  z[i] = s;                                                     \
  uc = ((t_u64)s >> 32)

/***************************************************************/

/* Unrolled multiply of eight words (four 64-bit pairs) starting at pair i. */
#define MUL_U32_S64_8(i)        \
  MUL_U32_S64_2(i);             \
  MUL_U32_S64_2(i+1);           \
  MUL_U32_S64_2(i+2);           \
  MUL_U32_S64_2(i+3)

#define MUL_U32_S64_D_8(i)      \
  MUL_U32_S64_2_D(i);           \
  MUL_U32_S64_2_D(i+1);         \
  MUL_U32_S64_2_D(i+2);         \
  MUL_U32_S64_2_D(i+3)

#define MUL_U32_S64_E_8(i)      \
  MUL_U32_S64_2_E(i);           \
  MUL_U32_S64_2_E(i+1);         \
  MUL_U32_S64_2_E(i+2);         \
  MUL_U32_S64_2_E(i+3)

/***************************************************************/

/* Unrolled accumulate of eight consecutive words starting at word i. */
#define ADD_S64_U32_8(i)        \
  ADD_S64_U32(i);               \
  ADD_S64_U32(i+1);             \
  ADD_S64_U32(i+2);             \
  ADD_S64_U32(i+3);             \
  ADD_S64_U32(i+4);             \
  ADD_S64_U32(i+5);             \
  ADD_S64_U32(i+6);             \
  ADD_S64_U32(i+7)

#define ADD_S64_U32_D_8(i)      \
  ADD_S64_U32_D(i);             \
  ADD_S64_U32_D(i+1);           \
  ADD_S64_U32_D(i+2);           \
  ADD_S64_U32_D(i+3);           \
  ADD_S64_U32_D(i+4);           \
  ADD_S64_U32_D(i+5);           \
  ADD_S64_U32_D(i+6);           \
  ADD_S64_U32_D(i+7)

#define ADD_S64_U32_E_8(i)      \
  ADD_S64_U32_E(i);             \
  ADD_S64_U32_E(i+1);           \
  ADD_S64_U32_E(i+2);           \
  ADD_S64_U32_E(i+3);           \
  ADD_S64_U32_E(i+4);           \
  ADD_S64_U32_E(i+5);           \
  ADD_S64_U32_E(i+6);           \
  ADD_S64_U32_E(i+7)

/***************************************************************/

/*
 * mul_add - multiply a vector of 32-bit words by a scalar and add a
 * second vector:  for i = 0 .. n-1,
 *
 *     z[i] = low 32 bits of (x[i] + y[i] * a + carry)
 *
 * where carry is the upper part of the previous word's sum.
 *
 *   z  - destination, n words (may be the same array as x)
 *   x  - addend, n words
 *   y  - multiplicand; it is loaded through a t_d64 pointer two words
 *        at a time, so it presumably must be 8-byte aligned (confirm
 *        against callers).  For odd n the final load also reads the
 *        word following y[n-1]; its product is computed but not added.
 *   n  - number of words; n <= 0 performs no work and returns 0
 *   a  - scalar multiplier
 *
 * Returns the carry out of the most significant word.
 *
 * Three strategies are selected by the magnitude of a:
 *   a < 2^A_BITS : one double product per word
 *   a < 2^31     : a split into low/high parts, two products per word
 *   otherwise    : as above with a 2^63 bias on the high product so it
 *                  fits a signed 64-bit conversion, and an unsigned
 *                  carry
 * Each strategy has unrolled forms for n == 8 and n == 16 and a generic
 * loop.  The generic loop works through the input in passes of at most
 * BUFF_SIZE words so that the on-stack product buffer is never indexed
 * past its end, whatever the value of n.
 */
t_u32 mul_add(t_u32 *z, t_u32 *x, t_u32 *y, int n, t_u32 a)
{
  if (a < (1 << A_BITS)) {

    if (n == 8) {
      DEF_VARS(8);
      t_s32 c = 0;

      MUL_U32_S64_8(0);
      ADD_S64_U32_8(0);

      return c;

    } else if (n == 16) {
      DEF_VARS(16);
      t_s32 c = 0;

      MUL_U32_S64_8(0);
      MUL_U32_S64_8(4);
      ADD_S64_U32_8(0);
      ADD_S64_U32_8(8);

      return c;

    } else {
      DEF_VARS(BUFF_SIZE);
      t_s32 i, c = 0;

      while (n > 0) {
        /* words handled in this pass; only the last pass can be short */
        t_s32 len = (n < BUFF_SIZE) ? n : BUFF_SIZE;

#pragma pipeloop(0)
        for (i = 0; i < (len+1)/2; i ++) {
          MUL_U32_S64_2(i);
        }

#pragma pipeloop(0)
        for (i = 0; i < len; i ++) {
          ADD_S64_U32(i);
        }

        /* advance to the next pass; the carry c is kept across passes */
        py += len/2;
        x += len;
        z += len;
        n -= len;
      }

      return c;
    }

  } else if (a < (1u << 31)) {

    if (n == 8) {
      DEF_VARS(2*8);
      t_d64 d0, d1, db;
      t_s32 c = 0;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      MUL_U32_S64_D_8(0);
      ADD_S64_U32_D_8(0);

      return c;

    } else if (n == 16) {
      DEF_VARS(2*16);
      t_d64 d0, d1, db;
      t_s32 c = 0;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      MUL_U32_S64_D_8(0);
      MUL_U32_S64_D_8(4);
      ADD_S64_U32_D_8(0);
      ADD_S64_U32_D_8(8);

      return c;

    } else {
      DEF_VARS(2*BUFF_SIZE);
      t_d64 d0, d1, db;
      t_s32 i, c = 0;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      while (n > 0) {
        /* words handled in this pass; only the last pass can be short */
        t_s32 len = (n < BUFF_SIZE) ? n : BUFF_SIZE;

#pragma pipeloop(0)
        for (i = 0; i < (len+1)/2; i ++) {
          MUL_U32_S64_2_D(i);
        }

#pragma pipeloop(0)
        for (i = 0; i < len; i ++) {
          ADD_S64_U32_D(i);
        }

        /* advance to the next pass; the carry c is kept across passes */
        py += len/2;
        x += len;
        z += len;
        n -= len;
      }

      return c;
    }

  } else {

    if (n == 8) {
      DEF_VARS(2*8);
      t_d64 d0, d1, db;
      t_u32 uc = 0;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      MUL_U32_S64_E_8(0);
      ADD_S64_U32_E_8(0);

      return uc;

    } else if (n == 16) {
      DEF_VARS(2*16);
      t_d64 d0, d1, db;
      t_u32 uc = 0;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      MUL_U32_S64_E_8(0);
      MUL_U32_S64_E_8(4);
      ADD_S64_U32_E_8(0);
      ADD_S64_U32_E_8(8);

      return uc;

    } else {
      DEF_VARS(2*BUFF_SIZE);
      t_d64 d0, d1, db;
      t_u32 uc = 0;
      t_s32 i;

      da = (t_d64)(a &  A_MASK);
      db = (t_d64)(a &~ A_MASK);

      while (n > 0) {
        /* words handled in this pass; only the last pass can be short */
        t_s32 len = (n < BUFF_SIZE) ? n : BUFF_SIZE;

#pragma pipeloop(0)
        for (i = 0; i < (len+1)/2; i ++) {
          MUL_U32_S64_2_E(i);
        }

#pragma pipeloop(0)
        for (i = 0; i < len; i ++) {
          ADD_S64_U32_E(i);
        }

        /* advance to the next pass; the carry uc is kept across passes */
        py += len/2;
        x += len;
        z += len;
        n -= len;
      }

      return uc;
    }
  }
}

/***************************************************************/

/*
 * mul_add_inp - in-place form of mul_add(): x[i] += y[i] * a for
 * i = 0 .. n-1, with x serving as both addend and destination.
 * Returns the carry out of the most significant word.
 */
t_u32 mul_add_inp(t_u32 *x, t_u32 *y, int n, t_u32 a)
{
  return mul_add(x, x, y, n, a);
}

/***************************************************************/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?