;mmxoctave.asm
;/**************************************************************************
;
;Mixed Rendering
;
; **************************************************************************/
;/***************************************************************
;*
;* This program has been developed by Intel Corporation.
;* You have Intel's permission to incorporate this code
;* into your product, royalty free. Intel has various
;* intellectual property rights which it may assert under
;* certain circumstances, such as if another manufacturer's
;* processor mis-identifies itself as being "GenuineIntel"
;* when the CPUID instruction is executed.
;*
;* Intel specifically disclaims all warranties, express or
;* implied, and all liability, including consequential and
;* other indirect damages, for the use of this code,
;* including liability for infringement of any proprietary
;* rights, and including the warranties of merchantability
;* and fitness for a particular purpose. Intel does not
;* assume any responsibility for any errors which may
;* appear in this code nor any responsibility to update it.
;*
;* * Other brands and names are the property of their respective
;* owners.
;*
;* Copyright (c) 1995, Intel Corporation. All rights reserved.
;***************************************************************/
TITLE Modified form of Perlin's Noise Basis function using MMX(tm) technology
;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT
;***********************************************************************
; Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA
;-----------------------------------------------------------------------
;Working storage and constant masks used by MMX_Octave.
;The segment is paragraph (16-byte) aligned, so the ALIGN 8 directives
;below place every 64-bit item on an 8-byte boundary.
;-----------------------------------------------------------------------
;KEY for comments
;P0, P1, P# = Pixel number 0, Pixel number 1, Pixel number # respectively.
;Pix = Pixel
;DU = Derivative of the variable U.
;DDU = Derivative of the variable DU.
;Texel = A point in the texture to be mapped onto the screen. Given by U, V.
;
;Note: Even though the assembly writes four pixel values on each pass of the
;inner loop, only two of the pixels are directly calculated. The other two
;pixels are averaged from neighboring pixels. According to the current scheme,
;
;      |--- 16 bit ---|
;      +--------------+--------------+--------------+--------------+
;MMX = |   Pixel #0   |   Pixel #1   |   Pixel #2   |   Pixel #3   |
;      +--------------+--------------+--------------+--------------+
;
;Pixels #1 and #3 are directly calculated. Pixel #2 is averaged from pixel #1
;and pixel #3. Pixel #0 is averaged from pixel #1 and the pixel preceding #0.
;
;Also, the programmer realizes that the pixels are labeled 0, 1, 2, 3 instead
;of 3, 2, 1, 0, which would follow the conventional format of Intel
;Architecture. This was an oversight and was not noticed until it was too late.
;
;Variables u, v, du, dv, ddu, ddv each contain parameters for two texels.
;Since u, v, ..., ddv are 64 bit, each texel parameter is 32 bit
;(32 bits per texel * two texels = 64 bits). This enables us to work with
;two pixels at one time using MMX technology.
ALIGN 8
u                 QWORD ?                   ;current U for pixels 1 and 3
du                QWORD ?                   ;per-pass U increment (4du in each half)
ddu               QWORD ?
v                 QWORD ?                   ;current V for pixels 1 and 3
dv                QWORD ?                   ;per-pass V increment (4dv in each half)
ddv               QWORD ?
firstU            QWORD ?                   ;starting U pair, reloaded at start_octave
firstV            QWORD ?                   ;starting V pair, reloaded at start_octave
;Since the program only calculates odd pixel values, the even pixel values
;must be averaged. Therefore, for each pass through the inner loop, four
;pixels will be drawn. In order to draw the first pixel, the pixel before
;it must be known for the averaging. That pixel's color is kept here.
prev_color        DWORD 255
;octShift and turbShift are each two DWORDs (64 bits). octShift is loaded
;with MOVQ and used as the packed-shift count inside the scan-line loop, so
;its second DWORD must stay zero. prev_color is only four bytes long; without
;the ALIGN below, octShift would start on a 4-byte boundary and every MOVQ of
;it would be a misaligned 64-bit access.
ALIGN 8
octShift          DWORD 0,0                 ;set to 14 on entry to MMX_Octave
turbShift         DWORD 0,0                 ;set to 0 on entry (octave number)
;Various masks. Set up to filter out unwanted bits in MMX registers.
;"quad"/"four" masks repeat a 16-bit value four times; "double" masks and
;mask_32_to_15 repeat a 32-bit value twice.
ALIGN 8
mask_32_to_15     QWORD 00007FFF00007FFFh   ;keeps low 15 bits of each dword
mask_quad_1       QWORD 0001000100010001h   ;1 in each word
mask_quad_255     QWORD 00FF00FF00FF00FFh   ;255 in each word
mask_quad_256     QWORD 0100010001000100h   ;256 in each word
mask_quad_510     QWORD 01FE01FE01FE01FEh   ;510 in each word
mask_quad_511     QWORD 01FF01FF01FF01FFh   ;511 in each word
mask_quad_1536    QWORD 0600060006000600h   ;1536 in each word
mask_double_255   QWORD 000000FF000000FFh   ;255 in each dword
mask_double_FFFF  QWORD 0000FFFF0000FFFFh   ;0FFFFh in each dword
mask_double_65536 QWORD 0001000000010000h   ;65536 in each dword
mask_four_255     QWORD 00FF00FF00FF00FFh   ;same value as mask_quad_255
DSEG ENDS
;***********************************************************************
; Constant Segment Declarations
;***********************************************************************
.const
;***********************************************************************
; Code Segment Declarations
;***********************************************************************
.code
COMMENT^
void MMX_Octave(unsigned long u_init, unsigned long v_init,
long du_init, long dv_init,
unsigned long Num_Pix,
unsigned _int16* turb_buffer,
unsigned long num_octaves);
^
MMX_Octave PROC NEAR C USES ebx ecx edi esi,
u_init:DWORD , v_init:DWORD,
du_init:DWORD , dv_init:DWORD,
num_pixels:DWORD , turb_buffer:DWORD,
num_octaves:DWORD
;Initialization
MOVD MM0, u_init
MOVD MM1, v_init
PUNPCKLDQ MM0, MM0 ;U p1 = u, p3 = u
MOVD MM2, du_init
PUNPCKLDQ MM1, MM1 ;V p1 = v, p3 = v
MOVD MM3, dv_init
PADDD MM0, MM2 ;U p1 = u, p3 = u + du
PADDD MM1, MM3 ;V p1 = v, p3 = v + dv
PADDD MM0, MM2 ;U p1 = u, p3 = u + 2du
PADDD MM1, MM3 ;V p1 = v, p3 = v + 2dv
PUNPCKLDQ MM2, MM2
PUNPCKLDQ MM3, MM3
PADDD MM0, MM2 ;U p1 = u + du, p3 = u + 3du
MOV [turbShift] , 0 ; turbShift is the octave number 0,1,2,....
XOR ESI,ESI
MOVQ DWORD PTR firstU , MM0
PADDD MM1, MM3 ;V p1 = v + dv, p3 = v + 3dv
MOV [octShift] , 14 ; octshift is (14 - esi (octave number))
PSLLD MM2, 2 ;DU p1 = 4du, p3 = 4du
MOVQ DWORD PTR firstV, MM1
PSLLD MM3, 2 ;DU p1 = 4dv, p3 = 4dv
MOVQ DWORD PTR du, MM2
MOVQ DWORD PTR dv, MM3
start_octave :
MOV EBX, prev_color
MOV EDI, turb_buffer ;EDI will always be pointer to screen buffer
MOV ECX, num_pixels
SUB EDI, 8
;Get the UV parameters in MMX(tm) technology form.
;Note: UV texel values are stored in 10.22 fixed integer format.
;This sets up the U parameters for pixels 1 and 3 in MM0 register and
;V parameter in MM1 register. After setup, the registers will contain:
; |--------- 32 bit ------------|
; +-------------------------------------------------------------------+
;MM0 = | U texel for pix #1 = u + du | U texel for pix #3 = u + 3du + 3ddu |
; +-------------------------------------------------------------------+
; +-------------------------------------------------------------------+
;MM1 = | V texel for pix #1 = v + dv | V texel for pix #3 = v + 3dv + 3ddv |
; +-------------------------------------------------------------------+
;This is because the first four pixels drawn on the screen will have the
;U and V texel values of:
;Pixel #0 = u
;Pixel #1 = u + du
;Pixel #2 = u + 2du + ddu
;Pixel #3 = u + 3du + 3ddu
;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.
movq mm0, DWORD PTR firstU
movq mm1, DWORD PTR firstV
movq DWORD PTR u , mm0
movq DWORD PTR v , mm1
;Get the du dv parameters in MMX(tm) technology form
;Note: du dv texel values are stored in 10.22 fixed integer format.
;This sets up the du parameters for pixels 1 and 3 in MM0 register and
;dv parameter in MM1 register. After setup, the registers will contain:
; |--------- 32 bit --------------|
; +---------------------------------------------------------------+
;MM0 = | DU texel for p1 = 4du + 10ddu | DU texel for p3 = 4du + 18ddu |
; +---------------------------------------------------------------+
; +---------------------------------------------------------------+
;MM1 = | DV texel for p1 = 4dv + 10ddv | DV texel for p3 = 4dv + 18ddv |
; +---------------------------------------------------------------+
;This is because after the first four pixels are drawn on the screen, the
;loop repeats to draw the next four pixels. In order to get the next u, v
;texel coordinates, appropriate du, dv values need to be summed to u and v.
;The correct starting values of du and dv are:
;Pixel #0 = 4du + 6ddu ;Note: these have been mathematically proven.
;Pixel #1 = 4du + 10ddu
;Pixel #2 = 4du + 14ddu
;Pixel #3 = 4du + 18ddu
;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.
;nop
;nop
;nop
;nop
;nop
start_scan_line:
;First, the program converts the u and v texel coordinates
;from 10.22 format to 8.8 format. 10.22 format is used for
;decimal accuracy but only 16 of the 32 bits are actually used.
;Because the final format will fit in a 16 bit result, u and v
;values are converted from 4, 32 bit packed values
;to 4, 16 bit packed values that will fit in one MMX register. Output:
; |--- 16 bit ---|
; +-----------------------------------------------------------+
;MM0 = | U texel - p1 | U texel - p3 | V texel - p1 | V texel - p3 |
; +-----------------------------------------------------------+
;This code correlates to the following "C" code in the "C_Noise()" function.
;u_16bit = u_init >> 14;
;v_16bit = v_init >> 14;
MOVQ MM1, DWORD PTR u;
MOVQ MM3, DWORD PTR octShift
MOVQ MM0, DWORD PTR v;
PSRLD MM1,MM3 ;Convert from 10.22 to 10.8
MOVQ MM2, DWORD PTR mask_32_to_15 ;Uses 15 instead of 16 because of signed saturation.
PSRLD MM0,MM3 ;Convert from 10.22 to 10.8
;PSRLD MM1, [octShift]
;PSRLD MM0, [octShift]
PAND MM1, MM2 ;Convert from 10.8 to 7.8 integer format
PAND MM0, MM2 ;Convert from 10.8 to 7.8 integer format
MOVQ MM3, DWORD PTR mask_quad_1
PACKSSDW MM0, MM1 ;Pack the result into one register
;Calculation of the bx0, by0, bx1, by1 values for both pixels. Output:
; |-8 bit-|
; +-----------------------------------------------------------+
;MM2 = | |BX0 p1 | |BX0 p3 | |BY0 p1 | |BY0 p3 |
; +-----------------------------------------------------------+
; +-----------------------------------------------------------+
;MM3 = | |BX1 p1 | |BX1 p3 | |BY1 p1 | |BY1 p3 |
; +-----------------------------------------------------------+
;This code correlates to the following "C" code in the "C_Noise()" function.
;bx0 = u_16bit >> 8;
;by0 = v_16bit >> 8;
;bx1 = bx0 + 1;
;by1 = by0 + 1;
MOVQ MM1, DWORD PTR u ;Used for incrementing u for next 4 pix.
MOVQ MM2, MM0
PSRLW MM2, 8
;PADDD MM1, MM4 ;Used for incrementing u for next 4 pix.
PADDD MM1, DWORD PTR du ;Used for incrementing u for next 4 pix.
PADDUSB MM3, MM2 ;mm3 = 0:BX1(1):0:BX1(3):0:BY1(1):0:BY1(3)
;Calculation of the rx0, ry0 values for both pixels. Final output:
; |-8 bit-|
; +-----------------------------------------------------------+
;MM0 = | |RX0 p1 | |RX0 p3 | |RY0 p1 | |RY0 p3 |
; +-----------------------------------------------------------+
;This code correlates to the following "C" code in the "C_Noise()" function.
;rx0 = u_16bit & 255;
;ry0 = v_16bit & 255;
PSLLW MM0, 8
MOVQ MM4, MM3
MOVQ MM6, DWORD PTR mask_quad_1
PUNPCKHWD MM4, MM2 ;mm4 = 0:BX0(1):0:BX1(1):0:BX0(3):0:BX1(3)
PUNPCKLWD MM3, MM2 ;mm3 = 0:BY0(1):0:BY1(1):0:BY0(3):0:BY1(3)
PMULLW MM4, MM4 ;mm4 = BX0^2(1):BX1^2(1):BX0^2(3):BX1^2(3)
PSRLW MM0, 8 ;MM0 = rx0 and ry0 param for pix 1, 3
;This section includes calculation of b00, b01, b10, b11. Output:
; |--- 16 bit ---|
; +-----------------------------------------------------------+
;MM4 = | b01 for p1 | b11 for p1 | b01 for p3 | b11 for p3 |
; +-----------------------------------------------------------+
; +-----------------------------------------------------------+
;MM5 = | b00 for p1 | b10 for p1 | b00 for p3 | b10 for p3 |
; +-----------------------------------------------------------+
;This code correlates to the following "C" code in the "C_Noise()" function.
;b00 = random1((random1(bx0) + by0));
;b01 = random1((random1(bx0) + by1));
;b10 = random1((random1(bx1) + by0));
;b11 = random1((random1(bx1) + by1));
MOVQ MM2, MM3
PUNPCKLDQ MM3, MM3 ;mm3 = 0:BY0(3):0:BY1(3):0:BY0(3):0:BY1(3)
PUNPCKHDQ MM2, MM2 ;mm2 = 0:BY0(1):0:BY1(1):0:BY0(1):0:BY1(1)
MOVQ MM5, MM4
MOVQ DWORD PTR u, MM1 ;Used for incrementing u for next 4 pix.
PUNPCKLWD MM4, MM4 ;mm4 = BX0^2(3):BX0^2(3):BX1^2(3):BX1^2(3)
PUNPCKHWD MM5, MM5 ;mm5 = BX0^2(1):BX0^2(1):BX1^2(1):BX1^2(1)
PADDW MM4, MM3
PADDW MM5, MM2
;This section calculates g_b00_0, g_b01_0, g_b10_0, g_b11_0 for pix 1 and 3.
;Output:
; |--- 16 bit ---|
; +-----------------------------------------------------------+
;MM2 = | g_b00_1 p3 | g_b01_1 p3 | g_b10_1 p3 | g_b11_1 p3 |
; +-----------------------------------------------------------+
; +-----------------------------------------------------------+
;MM3 = | g_b00_1 p1 | g_b01_1 p1 | g_b10_1 p1 | g_b11_1 p1 |
; +-----------------------------------------------------------+
; +-----------------------------------------------------------+
;MM4 = | g_b00_0 p3 | g_b01_0 p3 | g_b10_0 p3 | g_b11_0 p3 |
; +-----------------------------------------------------------+
; +-----------------------------------------------------------+
;MM5 = | g_b00_0 p1 | g_b01_0 p1 | g_b10_0 p1 | g_b11_0 p1 |
; +-----------------------------------------------------------+
;This code correlates to the following "C" code in the "C_Noise()" function.
;g_b00_0 = (random2(b00) & 511) - 256;
;g_b01_0 = (random2(b01) & 511) - 256;
;g_b10_0 = (random2(b10) & 511) - 256;
;g_b11_0 = (random2(b11) & 511) - 256;
;g_b00_1 = (random2(b00 + 1) & 511) - 256;
;g_b01_1 = (random2(b01 + 1) & 511) - 256;
;g_b10_1 = (random2(b10 + 1) & 511) - 256;
;g_b11_1 = (random2(b11 + 1) & 511) - 256;
PMULLW MM4, MM4 ;random1
PMULLW MM5, MM5 ;random1
MOVQ MM2, MM6
MOVQ MM3, MM6
PADDUSW MM2, MM4
PMULLW MM2, MM2 ;random2
PADDUSW MM3, MM5
MOVQ MM1, DWORD PTR mask_quad_256
PMULLW MM3, MM3 ;random2
MOVQ MM7, DWORD PTR mask_quad_511
PMULLW MM4, MM4 ;random2
PMULLW MM5, MM5 ;random2
PSRLW MM2, 2
PSRLW MM3, 2
PAND MM2, MM7
PSRLW MM4, 2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -