📄 immintrin.h
字号:
* **** VLDDQU ymm1, m256
* The instruction is functionally similar to VMOVDQU YMM, m256 for loading
* from memory. That is: 32 bytes of data starting at an address specified by
* the source memory operand are fetched from memory and placed in a
* destination
*/
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *a);
/*
* Store Packed Integers Using Non-Temporal Hint
* **** VMOVNTDQ m256, ymm1
* Moves the packed integers in the source operand to the destination using a
* non-temporal hint to prevent caching of the data during the write to memory
*/
extern void __cdecl _mm256_stream_si256(__m256i *p, __m256i a);
/*
* Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPD m256, ymm1
* Moves the packed double-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
extern void __cdecl _mm256_stream_pd(double *p, __m256d a);
/*
* Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPS m256, ymm1
* Moves the packed single-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
extern void __cdecl _mm256_stream_ps(float *p, __m256 a);
/*
* Compute Approximate Reciprocals of Packed Single-Precision Floating-Point Values
* **** VRCPPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the eight
* packed single precision floating-point values in the source operand and
* stores the packed single-precision floating-point results in the destination
*/
extern __m256 __cdecl _mm256_rcp_ps(__m256 a);
/*
* Compute Approximate Reciprocals of Square Roots of
* Packed Single-Precision Floating-point Values
* **** VRSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the square
* roots of the eight packed single precision floating-point values in the
* source operand and stores the packed single-precision floating-point results
* in the destination
*/
extern __m256 __cdecl _mm256_rsqrt_ps(__m256 a);
/*
* Square Root of Double-Precision Floating-Point Values
* **** VSQRTPD ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the four packed
* double-precision floating-point values in the source operand and stores
* the packed double-precision floating-point results in the destination
*/
extern __m256d __cdecl _mm256_sqrt_pd(__m256d a);
/*
* Square Root of Single-Precision Floating-Point Values
* **** VSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the eight packed
* single-precision floating-point values in the source operand and stores the
* packed single-precision floating-point results in the destination
*/
extern __m256 __cdecl _mm256_sqrt_ps(__m256 a);
/*
* Round Packed Double-Precision Floating-Point Values
* **** VROUNDPD ymm1,ymm2/m256,imm8
* Round the four double-precision floating-point values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a double-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
extern __m256d __cdecl _mm256_round_pd(__m256d a, int iRoundMode);
/*
 * Convenience wrappers around _mm256_round_pd with a fixed rounding mode.
 * The expansions deliberately carry no trailing semicolon so that they can be
 * used as ordinary expressions (function arguments, operands, initializers).
 * NOTE(review): 0x0A / 0x09 appear to be the SSE4.1 rounding-control encodings
 * for "toward +infinity" / "toward -infinity" combined with the
 * suppress-precision-exception bit (0x08) -- confirm against the ROUNDPD
 * immediate definition.
 */
#define _mm256_ceil_pd(val) _mm256_round_pd((val), 0x0A)
#define _mm256_floor_pd(val) _mm256_round_pd((val), 0x09)
/*
* Round Packed Single-Precision Floating-Point Values
* **** VROUNDPS ymm1,ymm2/m256,imm8
* Round the eight single-precision floating-point values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a single-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
extern __m256 __cdecl _mm256_round_ps(__m256 a, int iRoundMode);
/*
 * Convenience wrappers around _mm256_round_ps with a fixed rounding mode.
 * The expansions deliberately carry no trailing semicolon so that they can be
 * used as ordinary expressions (function arguments, operands, initializers).
 * NOTE(review): 0x0A / 0x09 appear to be the SSE4.1 rounding-control encodings
 * for "toward +infinity" / "toward -infinity" combined with the
 * suppress-precision-exception bit (0x08) -- confirm against the ROUNDPS
 * immediate definition.
 */
#define _mm256_ceil_ps(val) _mm256_round_ps((val), 0x0A)
#define _mm256_floor_ps(val) _mm256_round_ps((val), 0x09)
/*
* Unpack and Interleave High Packed Double-Precision Floating-Point Values
* **** VUNPCKHPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the high double-precision floating-point
* values from the first source operand and the second source operand.
*/
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d m1, __m256d m2);
/*
* Unpack and Interleave High Packed Single-Precision Floating-Point Values
* **** VUNPCKHPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the high single-precision floating-point
* values from the first source operand and the second source operand
*/
extern __m256 __cdecl _mm256_unpackhi_ps(__m256 m1, __m256 m2);
/*
* Unpack and Interleave Low Packed Double-Precision Floating-Point Values
* **** VUNPCKLPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the low double-precision floating-point
* values from the first source operand and the second source operand
*/
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d m1, __m256d m2);
/*
* Unpack and Interleave Low Packed Single-Precision Floating-Point Values
* **** VUNPCKLPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the low single-precision floating-point
* values from the first source operand and the second source operand
*/
extern __m256 __cdecl _mm256_unpacklo_ps(__m256 m1, __m256 m2);
/*
* Packed Bit Test
* **** VPTEST ymm1, ymm2/m256
* VPTEST sets the ZF flag if all bits in the result of the bitwise AND of
* the first source operand and the second source operand are 0. VPTEST sets
* the CF flag if all bits in the result of the bitwise AND of the second
* source operand and the logical NOT of the destination are 0.
*/
extern int __cdecl _mm256_testz_si256(__m256i s1, __m256i s2);
extern int __cdecl _mm256_testc_si256(__m256i s1, __m256i s2);
extern int __cdecl _mm256_testnzc_si256(__m256i s1, __m256i s2);
/*
* Packed Bit Test
* **** VTESTPD ymm1, ymm2/m256
* **** VTESTPD xmm1, xmm2/m128
* VTESTPD performs a bitwise comparison of all the sign bits of the
* double-precision elements in the first source operand and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int __cdecl _mm256_testz_pd(__m256d s1, __m256d s2);
extern int __cdecl _mm256_testc_pd(__m256d s1, __m256d s2);
extern int __cdecl _mm256_testnzc_pd(__m256d s1, __m256d s2);
extern int __cdecl _mm_testz_pd(__m128d s1, __m128d s2);
extern int __cdecl _mm_testc_pd(__m128d s1, __m128d s2);
extern int __cdecl _mm_testnzc_pd(__m128d s1, __m128d s2);
/*
* Packed Bit Test
* **** VTESTPS ymm1, ymm2/m256
* **** VTESTPS xmm1, xmm2/m128
* VTESTPS performs a bitwise comparison of all the sign bits of the packed
* single-precision elements in the first source operand and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int __cdecl _mm256_testz_ps(__m256 s1, __m256 s2);
extern int __cdecl _mm256_testc_ps(__m256 s1, __m256 s2);
extern int __cdecl _mm256_testnzc_ps(__m256 s1, __m256 s2);
extern int __cdecl _mm_testz_ps(__m128 s1, __m128 s2);
extern int __cdecl _mm_testc_ps(__m128 s1, __m128 s2);
extern int __cdecl _mm_testnzc_ps(__m128 s1, __m128 s2);
/*
* Extract Double-Precision Floating-Point Sign mask
* **** VMOVMSKPD r32, ymm2
* Extracts the sign bits from the packed double-precision floating-point
* values in the source operand, formats them into a 4-bit mask, and stores
* the mask in the destination
*/
extern int __cdecl _mm256_movemask_pd(__m256d a);
/*
* Extract Single-Precision Floating-Point Sign mask
* **** VMOVMSKPS r32, ymm2
* Extracts the sign bits from the packed single-precision floating-point
* values in the source operand, formats them into an 8-bit mask, and stores
* the mask in the destination
*/
extern int __cdecl _mm256_movemask_ps(__m256 a);
/*
* Return 256-bit vector with all elements set to 0
*/
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256 __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);
/*
* Return 256-bit vector initialized to specified arguments
*/
/*
 * _mm256_set_* : build a 256-bit vector from one scalar argument per element
 * (4 doubles, 8 floats, 32 chars, 16 shorts, 8 ints or 4 long longs).
 * NOTE(review): parameter names are omitted here, so the element order is not
 * visible in this header. Intel's intrinsics documentation lists the set_*
 * arguments from the most significant element down to the least significant
 * one -- confirm before relying on it.
 */
extern __m256d __cdecl _mm256_set_pd(double, double, double, double);
extern __m256 __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short, short, short, short, short,
short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(long long, long long, long long, long long);
/*
 * _mm256_setr_* : same element counts and types as the _mm256_set_* forms.
 * NOTE(review): the "r" suffix presumably means the arguments are taken in
 * reversed order (least significant element first) -- confirm against the
 * Intel intrinsics documentation.
 */
extern __m256d __cdecl _mm256_setr_pd(double, double, double, double);
extern __m256 __cdecl _mm256_setr_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char,
char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short, short, short, short, short,
short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(long long, long long, long long, long long);
/*
* Return 256-bit vector with all elements initialized to specified scalar
*/
extern __m256d __cdecl _mm256_set1_pd(double);
extern __m256 __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);
/*
* Support intrinsics to do vector type casts. These intrinsics do not introduce
* extra moves to generated code. When cast is done from a 128 to 256-bit type
* the low 128 bits of the 256-bit result contain source parameter value; the
* upper 128 bits of the result are undefined
*/
extern __m256 __cdecl _mm256_castpd_ps(__m256d a);
extern __m256d __cdecl _mm256_castps_pd(__m256 a);
extern __m256i __cdecl _mm256_castps_si256(__m256 a);
extern __m256i __cdecl _mm256_castpd_si256(__m256d a);
extern __m256 __cdecl _mm256_castsi256_ps(__m256i a);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i a);
extern __m128 __cdecl _mm256_castps256_ps128(__m256 a);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d a);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i a);
extern __m256 __cdecl _mm256_castps128_ps256(__m128 a);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d a);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i a);
#if defined __cplusplus
}; /* End "C" */
#endif /* defined __cplusplus */
#endif /* defined (_M_CEE_PURE) */
#endif /* _INCLUDED_IMM */
#endif /* __midl */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -