📄 integer_8cpp-source.html
字号:
00807 sum = _mm_add_epi64(a2b1, a0b3);00808 C[3] = _mm_add_epi64(sum, sum1);00809 00810 __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);00811 __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);00812 __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);00813 __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);00814 sum = _mm_add_epi64(a2b2, a3b1);00815 C[4] = _mm_add_epi64(sum, a1b3);00816 00817 __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));00818 __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));00819 __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);00820 __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);00821 __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);00822 C[5] = _mm_add_epi64(a3b2, a2b3);00823 }

// Multiply4 writes eight words to C from the seven __m128i values that
// P4_Mul(product, A, B) leaves in a local scratch buffer.
//
// C[0] is copied from word 0 of the buffer.  Every later word is the low
// 32 bits of a running 64-bit MMX sum; after each store the sum is shifted
// right by 32 so that its upper half is carried into the next word.
//
// The buffer is read both as words (w[i]) and as 64-bit lanes (mw[k]); the
// indexing below relies on mw[k] overlaying w[2k] and w[2k+1].
//
// Presumably the result is the full 8-word product of two 4-word operands
// (the name and the output size suggest it); the meaning of each lane is
// fixed by P4_Mul, most of which lies outside this view -- confirm there.
// NOTE(review): A and B are reinterpreted as __m128i *.  Whether they must
// be 16-byte aligned depends on how P4_Mul loads them -- confirm.
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
{
	__m128i product[7];
	const word *w = (word *)product;	// word view of the scratch buffer
	const __m64 *mw = (__m64 *)w;		// 64-bit lane view of the same bytes

	P4_Mul(product, (__m128i *)A, (__m128i *)B);

	C[0] = w[0];

	// acc holds the current column sum; after the shift it holds the carry.
	__m64 acc = _mm_add_si64(_m_from_int(w[1]), mw[2]);
	C[1] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(mw[3], mw[4]));
	C[2] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(mw[5], mw[6]));
	C[3] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(mw[7], mw[8]));
	C[4] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(mw[9], mw[10]));
	C[5] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	// The last __m128i is consumed as single words (w[26], w[27]) rather
	// than as 64-bit lanes.
	acc = _mm_add_si64(acc, _mm_add_si64(mw[11], _m_from_int(w[26])));
	C[6] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	C[7] = _m_to_int(acc) + w[27];

	// __m64 arithmetic was used above; leave the MMX state before returning.
	_mm_empty();
}

// Multiply8 writes sixteen words to C by combining four P4_Mul results.
//
// The scratch buffer holds four groups of 7 __m128i (28 words, or 14 64-bit
// lanes, each), filled by:
//   group 00 at temp     : P4_Mul on A[0..3] and B[0..3]
//   group 10 at temp+7   : P4_Mul on A[4..7] and B[0..3]
//   group 01 at temp+14  : P4_Mul on A[0..3] and B[4..7]
//   group 11 at temp+21  : P4_Mul on A[4..7] and B[4..7]
// pXY is the word view of a group and qXY the 64-bit lane view of the same
// bytes, so qXY[k] overlays pXY[2k] and pXY[2k+1].
//
// As in Multiply4, each output word is the low 32 bits of a running 64-bit
// sum whose upper half is carried forward by a 32-bit right shift.  Groups
// 10 and 01 enter the sum from C[4] on, and group 11 from C[8] on.
//
// Presumably the result is the full 16-word product of two 8-word operands
// (the name and the output size suggest it); the meaning of each lane is
// fixed by P4_Mul, most of which lies outside this view -- confirm there.
// NOTE(review): A and B are reinterpreted as __m128i *.  Whether they must
// be 16-byte aligned depends on how P4_Mul loads them -- confirm.
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
{
	__m128i temp[28];

	const word *p00 = (word *)temp;
	const word *p10 = (word *)temp+7*4;
	const word *p01 = (word *)temp+7*4*2;
	const word *p11 = (word *)temp+7*4*3;
	const __m64 *q00 = (__m64 *)p00;
	const __m64 *q10 = (__m64 *)p10;
	const __m64 *q01 = (__m64 *)p01;
	const __m64 *q11 = (__m64 *)p11;

	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
	P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);

	C[0] = p00[0];

	__m64 acc;		// current column sum; after the shift, the carry
	__m64 cross;	// what groups 10 and 01 add to the current column

	// C[1]..C[3]: group 00 only.
	acc = _mm_add_si64(_m_from_int(p00[1]), q00[2]);
	C[1] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(q00[3], q00[4]));
	C[2] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(q00[5], q00[6]));
	C[3] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	// C[4]..C[7]: rest of group 00 plus the first part of groups 10 and 01.
	cross = _mm_add_si64(_m_from_int(p10[0]), _m_from_int(p01[0]));
	acc = _mm_add_si64(acc, cross);
	acc = _mm_add_si64(acc, _mm_add_si64(q00[7], q00[8]));
	C[4] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(
		_mm_add_si64(_m_from_int(p10[1]), _m_from_int(p01[1])),
		_mm_add_si64(q10[2], q01[2]));
	acc = _mm_add_si64(acc, _mm_add_si64(q00[9], q00[10]));
	acc = _mm_add_si64(acc, cross);
	C[5] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(
		_mm_add_si64(q10[3], q01[3]),
		_mm_add_si64(q10[4], q01[4]));
	acc = _mm_add_si64(acc, _mm_add_si64(q00[11], _m_from_int(p00[26])));
	acc = _mm_add_si64(acc, cross);
	C[6] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(
		_mm_add_si64(q10[5], q01[5]),
		_mm_add_si64(q10[6], q01[6]));
	acc = _mm_add_si64(acc, _m_from_int(p00[27]));
	acc = _mm_add_si64(acc, cross);
	C[7] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	// C[8]..C[11]: rest of groups 10 and 01 plus the first part of group 11.
	cross = _mm_add_si64(
		_mm_add_si64(q10[7], q01[7]),
		_mm_add_si64(q10[8], q01[8]));
	acc = _mm_add_si64(acc, _m_from_int(p11[0]));
	acc = _mm_add_si64(acc, cross);
	C[8] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(
		_mm_add_si64(q10[9], q01[9]),
		_mm_add_si64(q10[10], q01[10]));
	acc = _mm_add_si64(acc, _mm_add_si64(_m_from_int(p11[1]), q11[2]));
	acc = _mm_add_si64(acc, cross);
	C[9] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(
		_mm_add_si64(q10[11], q01[11]),
		_mm_add_si64(_m_from_int(p10[26]), _m_from_int(p01[26])));
	acc = _mm_add_si64(acc, _mm_add_si64(q11[3], q11[4]));
	acc = _mm_add_si64(acc, cross);
	C[10] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	cross = _mm_add_si64(_m_from_int(p10[27]), _m_from_int(p01[27]));
	acc = _mm_add_si64(acc, _mm_add_si64(q11[5], q11[6]));
	acc = _mm_add_si64(acc, cross);
	C[11] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	// C[12]..C[15]: group 11 only.
	acc = _mm_add_si64(acc, _mm_add_si64(q11[7], q11[8]));
	C[12] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(q11[9], q11[10]));
	C[13] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	acc = _mm_add_si64(acc, _mm_add_si64(q11[11], _m_from_int(p11[26])));
	C[14] = _m_to_int(acc);
	acc = _m_psrlqi(acc, 32);

	C[15] = p11[27] + _m_to_int(acc);

	// __m64 arithmetic was used above; leave the MMX state before returning.
	_mm_empty();
}

01064 <span class="keywordtype">void</span> P4Optimized::Multiply8Bottom(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B)01065 {01066 __m128i temp[21];01067 <span class="keyword">const</span> word *w = (word *)temp;01068 <span class="keyword">const</span> __m64 *mw = (__m64 *)w;01069 <span class="keyword">const</span> word *x = (word *)temp+7*4;01070 <span class="keyword">const</span> __m64 *mx = (__m64 *)x;01071 <span class="keyword">const</span> word *y = (word *)temp+7*4*2;01072 <span 
class="keyword">const</span> __m64 *my = (__m64 *)y;01073 01074 P4_Mul(temp, (__m128i *)A, (__m128i *)B);01075 01076 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);01077
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -