immintrin.h

来自「C语言库函数的原型,有用的拿去」· C头文件代码 · 共 1,080 行 · 第 1/4 页
1,080 行
 * **** VLDDQU ymm1, m256
 * The instruction is functionally similar to VMOVDQU YMM, m256 for loading
 * from memory. That is: 32 bytes of data starting at an address specified by
 * the source memory operand are fetched from memory and placed in a
 * destination
 */
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *a);

/*
 * Store Packed Integers Using Non-Temporal Hint
 * **** VMOVNTDQ m256, ymm1
 * Moves the packed integers in the source operand to the destination using a
 * non-temporal hint to prevent caching of the data during the write to memory
 */
extern void    __cdecl _mm256_stream_si256(__m256i *p, __m256i a);

/*
 * Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPD m256, ymm1
 * Moves the packed double-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void    __cdecl _mm256_stream_pd(double *p, __m256d a);

/*
 * Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
 * **** VMOVNTPS m256, ymm1
 * Moves the packed single-precision floating-point values in the source
 * operand to the destination operand using a non-temporal hint to prevent
 * caching of the data during the write to memory
 */
extern void    __cdecl _mm256_stream_ps(float *p, __m256 a);

/*
 * Compute Approximate Reciprocals of Packed Single-Precision Floating-Point Values
 * **** VRCPPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the eight
 * packed single precision floating-point values in the source operand and
 * stores the packed single-precision floating-point results in the destination
 */
extern __m256  __cdecl _mm256_rcp_ps(__m256 a);

/*
 * Compute Approximate Reciprocals of Square Roots of
 * Packed Single-Precision Floating-point Values
 * **** VRSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the approximate reciprocals of the square
 * roots of the eight packed single precision floating-point values in the
 * source operand and stores the packed single-precision floating-point results
 * in the destination
 */
extern __m256  __cdecl _mm256_rsqrt_ps(__m256 a);

/*
 * Square Root of Double-Precision Floating-Point Values
 * **** VSQRTPD ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the two or four packed
 * double-precision floating-point values in the source operand and stores
 * the packed double-precision floating-point results in the destination
 */
extern __m256d __cdecl _mm256_sqrt_pd(__m256d a);

/*
 * Square Root of Single-Precision Floating-Point Values
 * **** VSQRTPS ymm1, ymm2/m256
 * Performs an SIMD computation of the square roots of the eight packed
 * single-precision floating-point values in the source operand stores the
 * packed double-precision floating-point results in the destination
 */
extern __m256  __cdecl _mm256_sqrt_ps(__m256 a);

/*
 * Round Packed Double-Precision Floating-Point Values
 * **** VROUNDPD ymm1,ymm2/m256,imm8
 * Round the four Double-Precision Floating-Point Values values in the source
 * operand by the rounding mode specified in the immediate operand and place
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a double-precision floating-point
 * value. The Precision Floating Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256d __cdecl _mm256_round_pd(__m256d a, int iRoundMode);
#define _mm256_ceil_pd(val)   _mm256_round_pd((val), 0x0A);
#define _mm256_floor_pd(val)  _mm256_round_pd((val), 0x09);

/*
 * Round Packed Single-Precision Floating-Point Values
 * **** VROUNDPS ymm1,ymm2/m256,imm8
 * Round the four single-precision floating-point values values in the source
 * operand by the rounding mode specified in the immediate operand and place
 * the result in the destination. The rounding process rounds the input to an
 * integral value and returns the result as a double-precision floating-point
 * value. The Precision Floating Point Exception is signaled according to the
 * immediate operand. If any source operand is an SNaN then it will be
 * converted to a QNaN.
 */
extern __m256  __cdecl _mm256_round_ps(__m256 a, int iRoundMode);
#define _mm256_ceil_ps(val)   _mm256_round_ps((val), 0x0A);
#define _mm256_floor_ps(val)  _mm256_round_ps((val), 0x09);

/*
 * Unpack and Interleave High Packed Double-Precision Floating-Point Values
 * **** VUNPCKHPD ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the high double-precision floating-point
 * values from the first source operand and the second source operand.
 */
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d m1, __m256d m2);

/*
 * Unpack and Interleave High Packed Single-Precision Floating-Point Values
 * **** VUNPCKHPS ymm1,ymm2,ymm3
 * Performs an interleaved unpack of the high single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256  __cdecl _mm256_unpackhi_ps(__m256 m1, __m256 m2);

/*
 * Unpack and Interleave Low Packed Double-Precision Floating-Point Values
 * **** VUNPCKLPD ymm1,ymm2,ymm3/m256
 * Performs an interleaved unpack of the low double-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d m1, __m256d m2);

/*
 * Unpack and Interleave Low Packed Single-Precision Floating-Point Values
 * **** VUNPCKLPS ymm1,ymm2,ymm3
 * Performs an interleaved unpack of the low single-precision floating-point
 * values from the first source operand and the second source operand
 */
extern __m256  __cdecl _mm256_unpacklo_ps(__m256 m1, __m256 m2);

/*
 * Packed Bit Test
 * **** VPTEST ymm1, ymm2/m256
 * VPTEST set the ZF flag if all bits in the result are 0 of the bitwise AND
 * of the first source operand and the second source operand. VPTEST sets the
 * CF flag if all bits in the result are 0 of the bitwise AND of the second
 * source operand and the logical NOT of the destination.
 */
extern int     __cdecl _mm256_testz_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testc_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testnzc_si256(__m256i s1, __m256i s2);

/*
 * Packed Bit Test
 * **** VTESTPD ymm1, ymm2/m256
 * **** VTESTPD xmm1, xmm2/m128
 * VTESTPD performs a bitwise comparison of all the sign bits of the
 * double-precision elements in the first source operation and corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros the CF is
 * set else the CF is clear
 */
extern int     __cdecl _mm256_testz_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testnzc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm_testz_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testc_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testnzc_pd(__m128d s1, __m128d s2);

/*
 * Packed Bit Test
 * **** VTESTPS ymm1, ymm2/m256
 * **** VTESTPS xmm1, xmm2/m128
 * VTESTPS performs a bitwise comparison of all the sign bits of the packed
 * single-precision elements in the first source operation and corresponding
 * sign bits in the second source operand. If the AND of the two sets of bits
 * produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
 * the source sign bits with the dest sign bits produces all zeros the CF is
 * set else the CF is clear
 */
extern int     __cdecl _mm256_testz_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testnzc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm_testz_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testc_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testnzc_ps(__m128 s1, __m128 s2);

/*
 * Extract Double-Precision Floating-Point Sign mask
 * **** VMOVMSKPD r32, ymm2
 * Extracts the sign bits from the packed double-precision floating-point
 * values in the source operand, formats them into a 4-bit mask, and stores
 * the mask in the destination
 */
extern int     __cdecl _mm256_movemask_pd(__m256d a);

/*
 * Extract Single-Precision Floating-Point Sign mask
 * **** VMOVMSKPS r32, ymm2
 * Extracts the sign bits from the packed single-precision floating-point
 * values in the source operand, formats them into a 8-bit mask, and stores
 * the mask in the destination
 */
extern int     __cdecl _mm256_movemask_ps(__m256 a);

/*
 * Return 256-bit vector with all elements set to 0
 */
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256  __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);

/*
 * Return 256-bit vector intialized to specified arguments
 */
extern __m256d __cdecl _mm256_set_pd(double, double, double, double);
extern __m256  __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char, char, char, char, char,
                                       char, char, char, char, char, char, char, char,
                                       char, char, char, char, char, char, char, char,
                                       char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short, short, short, short, short,
                                        short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(long long, long long, long long, long long);

extern __m256d __cdecl _mm256_setr_pd(double, double, double, double);
extern __m256  __cdecl _mm256_setr_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char, char, char, char, char,
                                        char, char, char, char, char, char, char, char,
                                        char, char, char, char, char, char, char, char,
                                        char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short, short, short, short, short,
                                         short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(long long, long long, long long, long long);

/*
 * Return 256-bit vector with all elements intialized to specified scalar
 */
extern __m256d __cdecl _mm256_set1_pd(double);
extern __m256  __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);

/*
 * Support intrinsics to do vector type casts. These intrinsics do not introduce
 * extra moves to generated code. When cast is done from a 128 to 256-bit type
 * the low 128 bits of the 256-bit result contain source parameter value; the
 * upper 128 bits of the result are undefined
 */
extern __m256  __cdecl _mm256_castpd_ps(__m256d a);
extern __m256d __cdecl _mm256_castps_pd(__m256 a);
extern __m256i __cdecl _mm256_castps_si256(__m256 a);
extern __m256i __cdecl _mm256_castpd_si256(__m256d a);
extern __m256  __cdecl _mm256_castsi256_ps(__m256i a);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i a);
extern __m128  __cdecl _mm256_castps256_ps128(__m256 a);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d a);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i a);
extern __m256  __cdecl _mm256_castps128_ps256(__m128 a);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d a);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i a);

#if defined __cplusplus
}; /* End "C" */
#endif  /* defined __cplusplus */

#endif  /* defined (_M_CEE_PURE) */

#endif  /* _INCLUDED_IMM */
#endif  /* __midl */
immintrin.h - 源码说明

本页面展示了「C语言库函数的原型,有用的拿去」中的 immintrin.h 源码文件，采用 C头文件编程语言编写，共 1,080 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与C语言相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?