/* File: VIScsumcopy.S */
/* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $
 * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
 * copying utilizing the UltraSparc Visual Instruction Set.
 *
 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 *
 * Based on older sparc32/sparc64 checksum.S, which is:
 *
 * Copyright(C) 1995 Linus Torvalds
 * Copyright(C) 1995 Miguel de Icaza
 * Copyright(C) 1996,1997 David S. Miller
 * derived from:
 *	Linux/Alpha checksum c-code
 *	Linux/ix86 inline checksum assembly
 *	RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
 *	David Mosberger-Tang for optimized reference c-code
 *	BSD4.4 portable checksum routine
 */

/* Offset of a scratch doubleword slot in the caller's frame.  On v9 the
 * 64-bit stack is biased, so step over the 0x7ff bias plus the 128-byte
 * register save area; 32-bit frames need only the 64-byte save area. */
#ifdef __sparc_v9__
#define STACKOFF	0x7ff+128
#else
#define STACKOFF	64
#endif

#ifdef __KERNEL__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/page.h>
#include <asm/visasm.h>
#include <asm/thread_info.h>
#define ASI_BLK_XOR	0
#define ASI_BLK_XOR1	(ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
#define ASI_BLK_OR	(ASI_BLK_P & ~ASI_P)
#else
/* Userland build: supply the ASI and %fprs constants that the kernel
 * headers above would otherwise provide. */
#define ASI_P		0x80
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#define FRPS_FEF	FPRS_FEF	/* NOTE(review): historical misspelling of
					 * FPRS_FEF, kept as an alias so any code
					 * still using the old name builds. */
#define FPRS_DU		0x02
#define FPRS_DL		0x01
#define ASI_BLK_XOR	(ASI_BLK_P ^ ASI_P)
#endif

/* Symbolic names for the register roles used throughout. */
#define src	o0			/* source pointer */
#define dst	o1			/* destination pointer */
#define len	o2			/* bytes remaining */
#define sum	o3			/* running checksum accumulator */
/* %x1..%x8 hold 2-bit fcmpgt32 masks, later folded into carry counts. */
#define x1	g1
#define x2	g2
#define x3	o4
#define x4	g4
#define x5	g5
#define x6	g7
#define x7	g3
#define x8	o5

/* "Dobrou noc, SunSoft engineers. Spete sladce."
 * (Czech: "Good night, SunSoft engineers.  Sleep sweetly.")
 * This has a couple of tricks in and those
 * tricks are UltraLinux trade secrets :))
 * Once AGAIN, the SunSoft engineers are caught
 * asleep at the keyboard :)).
 * The main loop does about 20 superscalar cycles
 * per 64bytes checksummed/copied.
 */

#define LDBLK(O0) \
	ldda		[%src] %asi, %O0		/* Load Group */

#define STBLK \
	stda		%f48, [%dst] ASI_BLK_P		/* Store */

#define ST(fx,off) \
	std		%fx, [%dst + off]		/* Store */

#define SYNC \
	membar		#Sync

/* One software-pipelined 64-byte iteration: load a block, faligndata-copy
 * the previous block into the A-registers, and fpadd32 the data into eight
 * 2x32-bit partial sums (f0..f14).  Each fcmpgt32 old,new pair yields a
 * 2-bit mask of which 32-bit halves wrapped; the following inc + srl-by-1
 * converts that mask m (0..3) into its popcount ((m+1)>>1), i.e. the number
 * of carries to add back into %sum.  BRANCH is variadic (GNU named varargs)
 * because branch mnemonics such as "bne,pt %icc, label" contain commas.
 * Instruction order below is exact hand scheduling — do not reorder. */
#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \
	LOAD						/* Load (Group) */; \
	faligndata	%A14, %F0, %A14			/* FPA Group */; \
	inc		%x5				/* IEU0 */; \
	STORE1						/* Store (optional) */; \
	faligndata	%F0, %F2, %A0			/* FPA Group */; \
	srl		%x5, 1, %x5			/* IEU0 */; \
	add		%sum, %x4, %sum			/* IEU1 */; \
	fpadd32		%F0, %f0, %F0			/* FPA Group */; \
	inc		%x6				/* IEU0 */; \
	STORE2						/* Store (optional) */; \
	faligndata	%F2, %F4, %A2			/* FPA Group */; \
	srl		%x6, 1, %x6			/* IEU0 */; \
	add		%sum, %x5, %sum			/* IEU1 */; \
	fpadd32		%F2, %f2, %F2			/* FPA Group */; \
	add		%src, 64, %src			/* IEU0 */; \
	fcmpgt32	%f0, %F0, %x1			/* FPM */; \
	add		%dst, 64, %dst			/* IEU1 Group */; \
	inc		%x7				/* IEU0 */; \
	STORE3						/* Store (optional) */; \
	faligndata	%F4, %F6, %A4			/* FPA */; \
	fpadd32		%F4, %f4, %F4			/* FPA Group */; \
	add		%sum, %x6, %sum			/* IEU1 */; \
	fcmpgt32	%f2, %F2, %x2			/* FPM */; \
	srl		%x7, 1, %x7			/* IEU0 Group */; \
	inc		%x8				/* IEU1 */; \
	STORE4						/* Store (optional) */; \
	faligndata	%F6, %F8, %A6			/* FPA */; \
	fpadd32		%F6, %f6, %F6			/* FPA Group */; \
	srl		%x8, 1, %x8			/* IEU0 */; \
	fcmpgt32	%f4, %F4, %x3			/* FPM */; \
	add		%sum, %x7, %sum			/* IEU0 Group */; \
	inc		%x1				/* IEU1 */; \
	STORE5						/* Store (optional) */; \
	faligndata	%F8, %F10, %A8			/* FPA */; \
	fpadd32		%F8, %f8, %F8			/* FPA Group */; \
	srl		%x1, 1, %x1			/* IEU0 */; \
	fcmpgt32	%f6, %F6, %x4			/* FPM */; \
	add		%sum, %x8, %sum			/* IEU0 Group */; \
	inc		%x2				/* IEU1 */; \
	STORE6						/* Store (optional) */; \
	faligndata	%F10, %F12, %A10		/* FPA */; \
	fpadd32		%F10, %f10, %F10		/* FPA Group */; \
	srl		%x2, 1, %x2			/* IEU0 */; \
	fcmpgt32	%f8, %F8, %x5			/* FPM */; \
	add		%sum, %x1, %sum			/* IEU0 Group */; \
	inc		%x3				/* IEU1 */; \
	STORE7						/* Store (optional) */; \
	faligndata	%F12, %F14, %A12		/* FPA */; \
	fpadd32		%F12, %f12, %F12		/* FPA Group */; \
	srl		%x3, 1, %x3			/* IEU0 */; \
	fcmpgt32	%f10, %F10, %x6			/* FPM */; \
	add		%sum, %x2, %sum			/* IEU0 Group */; \
	inc		%x4				/* IEU1 */; \
	STORE8						/* Store (optional) */; \
	fmovd		%F14, %B14			/* FPA */; \
	fpadd32		%F14, %f14, %F14		/* FPA Group */; \
	srl		%x4, 1, %x4			/* IEU0 */; \
	fcmpgt32	%f12, %F12, %x7			/* FPM */; \
	add		%sum, %x3, %sum			/* IEU0 Group */; \
	subcc		%len, 64, %len			/* IEU1 */; \
	BRANCH						/* CTI */; \
	fcmpgt32	%f14, %F14, %x8			/* FPM Group */;

/* Loop epilogue: drain outstanding carry masks and reduce the eight 2x32-bit
 * partial sums (f0..f14) pairwise (S0..S3 -> T0,T1 -> U0), catching every
 * 32-bit wraparound with fcmpgt32 against fzero'd %fz, then branch to the
 * "ett" tail (defined later in this file; not visible in this excerpt). */
#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
	inc		%x5				/* IEU0 Group */; \
	fpadd32		%f2, %f0, %S0			/* FPA */; \
	add		%sum, %x4, %sum			/* IEU1 */; \
	srl		%x5, 1, %x5			/* IEU0 Group */; \
	fpadd32		%f6, %f4, %S1			/* FPA */; \
	inc		%x6				/* IEU1 */; \
	fpadd32		%f10, %f8, %S2			/* FPA Group */; \
	add		%sum, %x5, %sum			/* IEU0 */; \
	fcmpgt32	%f0, %S0, %x1			/* FPM */; \
	fpadd32		%f14, %f12, %S3			/* FPA Group */; \
	srl		%x6, 1, %x6			/* IEU0 */; \
	fcmpgt32	%f4, %S1, %x2			/* FPM */; \
	add		%sum, %x6, %sum			/* IEU0 Group */; \
	fzero		%fz				/* FPA */; \
	fcmpgt32	%f8, %S2, %x3			/* FPM */; \
	inc		%x7				/* IEU0 Group */; \
	inc		%x8				/* IEU1 */; \
	srl		%x7, 1, %x7			/* IEU0 Group */; \
	inc		%x1				/* IEU1 */; \
	fpadd32		%S0, %S1, %T0			/* FPA */; \
	fpadd32		%S2, %S3, %T1			/* FPA Group */; \
	add		%sum, %x7, %sum			/* IEU0 */; \
	fcmpgt32	%f12, %S3, %x4			/* FPM */; \
	srl		%x8, 1, %x8			/* IEU0 Group */; \
	inc		%x2				/* IEU1 */; \
	srl		%x1, 1, %x1			/* IEU0 Group */; \
	add		%sum, %x8, %sum			/* IEU1 */; \
	add		%sum, %x1, %sum			/* IEU0 Group */; \
	fcmpgt32	%S0, %T0, %x5			/* FPM */; \
	srl		%x2, 1, %x2			/* IEU0 Group */; \
	fcmpgt32	%S2, %T1, %x6			/* FPM */; \
	inc		%x3				/* IEU0 Group */; \
	add		%sum, %x2, %sum			/* IEU1 */; \
	srl		%x3, 1, %x3			/* IEU0 Group */; \
	inc		%x4				/* IEU1 */; \
	fpadd32		%T0, %T1, %U0			/* FPA Group */; \
	add		%sum, %x3, %sum			/* IEU0 */; \
	fcmpgt32	%fz, %f2, %x7			/* FPM */; \
	srl		%x4, 1, %x4			/* IEU0 Group */; \
	fcmpgt32	%fz, %f6, %x8			/* FPM */; \
	inc		%x5				/* IEU0 Group */; \
	add		%sum, %x4, %sum			/* IEU1 */; \
	srl		%x5, 1, %x5			/* IEU0 Group */; \
	fcmpgt32	%fz, %f10, %x1			/* FPM */; \
	inc		%x6				/* IEU0 Group */; \
	add		%sum, %x5, %sum			/* IEU1 */; \
	fmovd		%FA, %FB			/* FPA Group */; \
	fcmpgt32	%fz, %f14, %x2			/* FPM */; \
	srl		%x6, 1, %x6			/* IEU0 Group */; \
	ba,pt		%xcc, ett			/* CTI */; \
	inc		%x7				/* IEU1 */;

/* Convenience wrapper fixing the reduction temporaries to %f48..%f62. */
#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \
	END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)

/* Final reduction ("ett" tail): collapse U0+U1 into V0, bounce V0 through
 * the STACKOFF scratch slot to move it from the FPU into integer reg %x8,
 * apply the remaining carry fixups (note the sub/add mix matches the
 * deferred masks' signs), and fold the end-around carry at label 33. */
#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \
	fpadd32		%U0, %U1, %V0			/* FPA Group */; \
	srl		%x7, 1, %x7			/* IEU0 */; \
	add		%sum, %x6, %sum			/* IEU1 */; \
	std		%V0, [%sp + STACKOFF]		/* Store Group */; \
	inc		%x8				/* IEU0 */; \
	sub		%sum, %x7, %sum			/* IEU1 */; \
	srl		%x8, 1, %x8			/* IEU0 Group */; \
	fcmpgt32	%fz, %S1, %x3			/* FPM */; \
	inc		%x1				/* IEU0 Group */; \
	fcmpgt32	%fz, %S3, %x4			/* FPM */; \
	srl		%x1, 1, %x1			/* IEU0 Group */; \
	sub		%sum, %x8, %sum			/* IEU1 */; \
	ldx		[%sp + STACKOFF], %x8		/* Load Group */; \
	inc		%x2				/* IEU0 */; \
	sub		%sum, %x1, %sum			/* IEU1 */; \
	srl		%x2, 1, %x2			/* IEU0 Group */; \
	fcmpgt32	%fz, %T1, %x5			/* FPM */; \
	inc		%x3				/* IEU0 Group */; \
	fcmpgt32	%T0, %U0, %x6			/* FPM */; \
	srl		%x3, 1, %x3			/* IEU0 Group */; \
	sub		%sum, %x2, %sum			/* IEU1 */; \
	inc		%x4				/* IEU0 Group */; \
	sub		%sum, %x3, %sum			/* IEU1 */; \
	srl		%x4, 1, %x4			/* IEU0 Group */; \
	fcmpgt32	%fz, %U1, %x7			/* FPM */; \
	inc		%x5				/* IEU0 Group */; \
	fcmpgt32	%U0, %V0, %x1			/* FPM */; \
	srl		%x5, 1, %x5			/* IEU0 Group */; \
	sub		%sum, %x4, %sum			/* IEU1 */; \
	sub		%sum, %x5, %sum			/* IEU0 Group */; \
	fcmpgt32	%fz, %V0, %x2			/* FPM */; \
	inc		%x6				/* IEU0 Group */; \
	inc		%x7				/* IEU1 */; \
	srl		%x6, 1, %x6			/* IEU0 Group */; \
	inc		%x1				/* IEU1 */; \
	srl		%x7, 1, %x7			/* IEU0 Group */; \
	add		%sum, %x6, %sum			/* IEU1 */; \
	srl		%x1, 1, %x1			/* IEU0 Group */; \
	sub		%sum, %x7, %sum			/* IEU1 */; \
	inc		%x2				/* IEU0 Group */; \
	add		%sum, %x1, %sum			/* IEU1 */; \
	srl		%x2, 1, %x2			/* IEU0 Group */; \
	sub		%sum, %x2, %sum			/* IEU0 Group */; \
	addcc		%sum, %x8, %sum			/* IEU1 Group */; \
	bcs,a,pn	%xcc, 33f			/* CTI */; \
	 add		%sum, 1, %sum			/* IEU0 (Group) */; \
33:							/* That's it */;

	.text
	.globl	csum_partial_copy_vis
	.align	32
/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp.
* csum_partial_copy_from_user * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 */csum_partial_copy_vis: andcc %dst, 7, %g0 /* IEU1 Group */ be,pt %icc, 4f /* CTI */ and %dst, 0x38, %o4 /* IEU0 */ mov 1, %g5 /* IEU0 Group */ andcc %dst, 2, %g0 /* IEU1 */ be,pt %icc, 1f /* CTI */ and %dst, 4, %g7 /* IEU0 Group */ lduha [%src] %asi, %g2 /* Load */ sub %len, 2, %len /* IEU0 Group */ add %dst, 2, %dst /* IEU1 */ andcc %dst, 4, %g7 /* IEU1 Group */ sll %g5, 16, %g5 /* IEU0 */ sth %g2, [%dst - 2] /* Store Group */ sll %g2, 16, %g2 /* IEU0 */ add %src, 2, %src /* IEU1 */ addcc %g2, %sum, %sum /* IEU1 Group */ bcs,a,pn %icc, 1f /* CTI */ add %sum, %g5, %sum /* IEU0 */1: lduwa [%src] %asi, %g2 /* Load */ brz,a,pn %g7, 4f /* CTI+IEU1 Group */ and %dst, 0x38, %o4 /* IEU0 */ add %dst, 4, %dst /* IEU0 Group */ sub %len, 4, %len /* IEU1 */ addcc %g2, %sum, %sum /* IEU1 Group */ bcs,a,pn %icc, 1f /* CTI */ add %sum, 1, %sum /* IEU0 */1: and %dst, 0x38, %o4 /* IEU0 Group */ stw %g2, [%dst - 4] /* Store */ add %src, 4, %src /* IEU1 */4:#ifdef __KERNEL__ VISEntry#endif mov %src, %g7 /* IEU1 Group */ fzero %f48 /* FPA */ alignaddr %src, %g0, %src /* Single Group */ subcc %g7, %src, %g7 /* IEU1 Group */ be,pt %xcc, 1f /* CTI */ mov 0x40, %g1 /* IEU0 */ lduwa [%src] %asi, %g2 /* Load Group */ subcc %sum, %g2, %sum /* IEU1 Group+load stall*/ bcs,a,pn %icc, 1f /* CTI */ sub %sum, 1, %sum /* IEU0 */1: srl %sum, 0, %sum /* IEU0 Group */ clr %g5 /* IEU1 */
/* NOTE(review): the source was truncated at this point by the code-viewer
 * it was copied from (viewer keyboard-shortcut chrome removed here).  The
 * body of csum_partial_copy_vis continues past this point in the original
 * file and must be restored from a complete copy before assembling. */