⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_gemv_sse.c

📁 基于 BLAS / CLAPACK 的代码，用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 2 页
字号:
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                      (C) Copyright 2003 Camm Maguire * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include <stdio.h>#include <stdlib.h>#include "atlas_misc.h"#include "camm_util.h"#ifndef ATL_GAS_x8632   #error "This kernel requires gas x86-32 assembler!"#endif#define COPY_B#ifdef COPY_B#define plb(a_,b_,c_) pla(a_,b_,c_)#else#define plb(a_,b_,c_) pl(a_,b_,c_)#endif#undef p1_4_gemvT_1#define p1_4_gemvT_1(a_) \      pls(SS(a_,MM(0,RS4)),bx,4) \      pls(SS(a_,MM(0,RS4)),ax,0) \      pls(SS(a_,MM(0,RS4)),si,2) \      pmsr(4,0) \      pasr(0,6) \      pmsr(4,2) \      pasr(2,7)#undef p1_2_gemvT_1#define p1_2_gemvT_1(a_) \      px(4) \      pld(SS(a_,MM(0,RS4)),bx,4) \      px(0) \      pld(SS(a_,MM(0,RS4)),ax,0) \      px(2) \      pld(SS(a_,MM(0,RS4)),si,2) \      pm(4,0) \      pa(0,6) \      pm(4,2) \      pa(2,7)#undef p1_gemvT_1#define p1_gemvT_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(0,RS4)),si,2) \      pm(4,0) \      pa(0,6) \      pm(4,2) \      pa(2,7)#undef p2_gemvT_1#define p2_gemvT_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      plq(SS(a_,MM(0,RS4)),si,2) \      plq(SS(a_,MM(1,RS4)),si,3) \      pm(4,0) \      pa(0,6) \      pm(4,2) \      pa(2,7) \      pm(5,1) \      pa(1,6) \      pm(5,3) \      pa(3,7)#undef p4_gemvT_1#define p4_gemvT_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      plq(SS(a_,MM(0,RS4)),si,2) \      plq(SS(a_,MM(1,RS4)),si,3) \      pm(4,0) \      pa(0,6) \      pm(4,2) \      pa(2,7) \      pm(5,1) \      pa(1,6) \      pm(5,3) \      pa(3,7) \      plb(SS(a_,MM(2,RS4)),bx,4) \      plb(SS(a_,MM(3,RS4)),bx,5) \      plq(SS(a_,MM(2,RS4)),ax,0) \      plq(SS(a_,MM(3,RS4)),ax,1) \      f(nta,SS(a_,MM((SS(0,CL)),RS4)),si) \      plq(SS(a_,MM(2,RS4)),si,2) \      plq(SS(a_,MM(3,RS4)),si,3) \      pm(4,0) \      pa(0,6) \      pm(4,2) \     
 pa(2,7) \      pm(5,1) \      pa(1,6) \      pm(5,3) \      pa(3,7)#undef lpgemvT_1#define lpgemvT_1(a_)#undef dpgemvT_1#define dpgemvT_1(a_) p4_gemvT_1(a_)#undef plgemvT_1#define plgemvT_1 16#undef p1_4_gemvT_1_1#define p1_4_gemvT_1_1(a_) \      pls(SS(a_,MM(0,RS4)),bx,4) \      pls(SS(a_,MM(0,RS4)),ax,0) \      pmsr(4,0) \      pasr(0,6)#undef p1_2_gemvT_1_1#define p1_2_gemvT_1_1(a_) \      px(4) \      pld(SS(a_,MM(0,RS4)),bx,4) \      px(0) \      pld(SS(a_,MM(0,RS4)),ax,0) \      pm(4,0) \      pa(0,6)#undef p1_gemvT_1_1#define p1_gemvT_1_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plq(SS(a_,MM(0,RS4)),ax,0) \      pm(4,0) \      pa(0,6)#undef p2_gemvT_1_1#define p2_gemvT_1_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      pm(4,0) \      pa(0,6) \      pm(5,1) \      pa(1,6)#undef p4_gemvT_1_1#define p4_gemvT_1_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plb(SS(a_,MM(2,RS4)),bx,3) \      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      plq(SS(a_,MM(2,RS4)),ax,2) \      pm(4,0) \      pa(0,6) \      plb(SS(a_,MM(3,RS4)),bx,4) \      plq(SS(a_,MM(3,RS4)),ax,0) \      pm(5,1) \      pa(1,6) \      pm(3,2) \      pa(2,6) \      pm(4,0) \      pa(0,6)#undef lpgemvT_1_1#define lpgemvT_1_1(a_)#undef dpgemvT_1_1#define dpgemvT_1_1(a_) p4_gemvT_1_1(a_)#undef plgemvT_1_1#define plgemvT_1_1 16#undef p1_4_gemvT_1_3#define p1_4_gemvT_1_3(a_) \      pls(SS(a_,MM(0,RS4)),bx,4) \      pls(SS(a_,MM(0,RS4)),ax,0) \      pmsr(4,0) \      pasr(0,6)#undef p1_2_gemvT_1_3#define p1_2_gemvT_1_3(a_) \      px(4) \      pld(SS(a_,MM(0,RS4)),bx,4) \      px(0) \      pld(SS(a_,MM(0,RS4)),ax,0) \      pm(4,0) \      pa(0,6)#undef p1_gemvT_1_3#define p1_gemvT_1_3(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plq(SS(a_,MM(0,RS4)),ax,0) \      pm(4,0) \      pa(0,6)#undef p2_gemvT_1_3#define 
p2_gemvT_1_3(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      pm(4,0) \      pa(0,6) \      pm(5,1) \      pa(1,6)#undef p4_gemvT_1_3#define p4_gemvT_1_3(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plb(SS(a_,MM(2,RS4)),bx,3) \      plq(SS(a_,MM(1,RS4)),ax,1) \      plq(SS(a_,MM(2,RS4)),ax,2) \      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \      pm(4,0) \      pa(0,6) \      plb(SS(a_,MM(3,RS4)),bx,4) \      plq(SS(a_,MM(3,RS4)),ax,0) \      pm(5,1) \      pa(1,7) \      pm(3,2) \      pa(2,6) \      pm(4,0) \      pa(0,7)#undef p8_gemvT_1_3#define p8_gemvT_1_3(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plb(SS(a_,MM(2,RS4)),bx,3) \      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      plq(SS(a_,MM(2,RS4)),ax,2) \      pm(4,0) \      pa(0,6) \      plb(SS(a_,MM(3,RS4)),bx,4) \      plq(SS(a_,MM(3,RS4)),ax,0) \      pm(5,1) \      pa(1,7) \      pm(3,2) \      pa(2,6) \      pm(4,0) \      pa(0,7) \      plb(SS(a_,MM(4,RS4)),bx,4) \      plb(SS(a_,MM(5,RS4)),bx,5) \      plb(SS(a_,MM(6,RS4)),bx,3) \      plq(SS(a_,MM(4,RS4)),ax,0) \      plq(SS(a_,MM(5,RS4)),ax,1) \      plq(SS(a_,MM(6,RS4)),ax,2) \      pm(4,0) \      pa(0,6) \      plb(SS(a_,MM(7,RS4)),bx,4) \      plq(SS(a_,MM(7,RS4)),ax,0) \      pm(5,1) \      pa(1,7) \      pm(3,2) \      pa(2,6) \      pm(4,0) \      pa(0,7)#undef lpgemvT_1_3#define lpgemvT_1_3(a_)#undef dpgemvT_1_3#define dpgemvT_1_3(a_) p4_gemvT_1_3(a_)#undef plgemvT_1_3#define plgemvT_1_3 16#undef p1_4_gemvT_1_1c#define p1_4_gemvT_1_1c(a_)#undef p1_2_gemvT_1_1c#define p1_2_gemvT_1_1c(a_) \      px(4) \      pld(SS(a_,MM(0,RS4)),bx,4) \      px(0) \      pld(SS(a_,MM(0,RS4)),ax,0) \      pc(4,2) \      ps(CSHUF,4,4) \      pm(0,2) \      pa(2,6) \      pm(0,4) \      pa(4,7)#undef 
p1_gemvT_1_1c#define p1_gemvT_1_1c(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plq(SS(a_,MM(0,RS4)),ax,0) \      pc(4,2) \      ps(CSHUF,4,4) \      pm(0,2) \      pa(2,6) \      pm(0,4) \      pa(4,7)#undef p2_gemvT_1_1c#define p2_gemvT_1_1c(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      pc(4,2) \      pc(5,3) \      ps(CSHUF,4,4) \      ps(CSHUF,5,5) \      pm(0,2) \      pa(2,6) \      pm(0,4) \      pa(4,7) \      pm(1,3) \      pa(3,6) \      pm(1,5) \      pa(5,7)#undef p4_gemvT_1_1c#define p4_gemvT_1_1c(a_) \      plb(SS(a_,MM(0,RS4)),bx,4) \      plb(SS(a_,MM(1,RS4)),bx,5) \      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plq(SS(a_,MM(1,RS4)),ax,1) \      pc(4,2) \      pc(5,3) \      ps(CSHUF,4,4) \      ps(CSHUF,5,5) \      pm(0,2) \      pa(2,6) \      pm(0,4) \      pa(4,7) \      pm(1,3) \      pa(3,6) \      pm(1,5) \      pa(5,7) \      plb(SS(a_,MM(2,RS4)),bx,4) \      plb(SS(a_,MM(3,RS4)),bx,5) \      plq(SS(a_,MM(2,RS4)),ax,0) \      plq(SS(a_,MM(3,RS4)),ax,1) \      pc(4,2) \      pc(5,3) \      ps(CSHUF,4,4) \      ps(CSHUF,5,5) \      pm(0,2) \      pa(2,6) \      pm(0,4) \      pa(4,7) \      pm(1,3) \      pa(3,6) \      pm(1,5) \      pa(5,7)#undef lpgemvT_1_1c#define lpgemvT_1_1c(a_)#undef dpgemvT_1_1c#define dpgemvT_1_1c(a_) p4_gemvT_1_1c(a_)#undef plgemvT_1_1c#define plgemvT_1_1c 16#undef p1_4_gemvT_3_1#define p1_4_gemvT_3_1(a_) \      pls(SS(a_,MM(0,RS4)),bx,3) \      pls(SS(a_,MM(0,RS4)),ax,0) \      plsx(SS(a_,MM(0,RS4)),ax,bp,1,1) \      plsx(SS(a_,MM(0,RS4)),ax,bp,2,2) \      pmsr(3,0) \      pasr(0,4) \      pmsr(3,1) \      pasr(1,5) \      pmsr(3,2) \      pasr(2,6)#undef p1_2_gemvT_3_1#define p1_2_gemvT_3_1(a_) \      px(3) \      px(0) \      px(1) \      px(2) \      pld(SS(a_,MM(0,RS4)),bx,3) \      pld(SS(a_,MM(0,RS4)),ax,0) \      pldx(SS(a_,MM(0,RS4)),ax,bp,1,1) \      
pldx(SS(a_,MM(0,RS4)),ax,bp,2,2) \      pm(3,0) \      pa(0,4) \      pm(3,1) \      pa(1,5) \      pm(3,2) \      pa(2,6)#undef p1_gemvT_3_1#define p1_gemvT_3_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,3) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plqx(SS(a_,MM(0,RS4)),ax,bp,1,1) \      plqx(SS(a_,MM(0,RS4)),ax,bp,2,2) \      pm(3,0) \      pa(0,4) \      pm(3,1) \      pa(1,5) \      pm(3,2) \      pa(2,6)#undef p2_gemvT_3_1#define p2_gemvT_3_1(a_) \      plb(SS(a_,MM(0,RS4)),bx,3) \      plq(SS(a_,MM(0,RS4)),ax,0) \      plqx(SS(a_,MM(0,RS4)),ax,bp,1,1) \      plqx(SS(a_,MM(0,RS4)),ax,bp,2,2) \      pm(3,0) \      pa(0,4) \      pm(3,1) \      pa(1,5) \

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -