📄 integer_8cpp-source.html
字号:
01078 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);01079 01080 C[0] = w[0];01081 01082 __m64 s1, s2, s3, s4;01083 01084 __m64 w1 = _m_from_int(w[1]);01085 __m64 w4 = mw[2];01086 __m64 w6 = mw[3];01087 __m64 w8 = mw[4];01088 __m64 w10 = mw[5];01089 __m64 w12 = mw[6];01090 __m64 w14 = mw[7];01091 __m64 w16 = mw[8];01092 __m64 w18 = mw[9];01093 __m64 w20 = mw[10];01094 __m64 w22 = mw[11];01095 __m64 w26 = _m_from_int(w[26]);01096 01097 __m64 x0 = _m_from_int(x[0]);01098 __m64 x1 = _m_from_int(x[1]);01099 __m64 x4 = mx[2];01100 __m64 x6 = mx[3];01101 __m64 x8 = mx[4];01102 01103 __m64 y0 = _m_from_int(y[0]);01104 __m64 y1 = _m_from_int(y[1]);01105 __m64 y4 = my[2];01106 __m64 y6 = my[3];01107 __m64 y8 = my[4];01108 01109 s1 = _mm_add_si64(w1, w4);01110 C[1] = _m_to_int(s1);01111 s1 = _m_psrlqi(s1, 32);01112 01113 s2 = _mm_add_si64(w6, w8);01114 s1 = _mm_add_si64(s1, s2);01115 C[2] = _m_to_int(s1);01116 s1 = _m_psrlqi(s1, 32);01117 01118 s2 = _mm_add_si64(w10, w12);01119 s1 = _mm_add_si64(s1, s2);01120 C[3] = _m_to_int(s1);01121 s1 = _m_psrlqi(s1, 32);01122 01123 s3 = _mm_add_si64(x0, y0);01124 s2 = _mm_add_si64(w14, w16);01125 s1 = _mm_add_si64(s1, s3);01126 s1 = _mm_add_si64(s1, s2);01127 C[4] = _m_to_int(s1);01128 s1 = _m_psrlqi(s1, 32);01129 01130 s3 = _mm_add_si64(x1, y1);01131 s4 = _mm_add_si64(x4, y4);01132 s1 = _mm_add_si64(s1, w18);01133 s3 = _mm_add_si64(s3, s4);01134 s1 = _mm_add_si64(s1, w20);01135 s1 = _mm_add_si64(s1, s3);01136 C[5] = _m_to_int(s1);01137 s1 = _m_psrlqi(s1, 32);01138 01139 s3 = _mm_add_si64(x6, y6);01140 s4 = _mm_add_si64(x8, y8);01141 s1 = _mm_add_si64(s1, w22);01142 s3 = _mm_add_si64(s3, s4);01143 s1 = _mm_add_si64(s1, w26);01144 s1 = _mm_add_si64(s1, s3);01145 C[6] = _m_to_int(s1);01146 s1 = _m_psrlqi(s1, 32);01147 01148 C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];01149 _mm_empty();01150 }01151 01152 __declspec(naked) word __fastcall P4Optimized::Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N)01153 {01154 __asm01155 {01156 sub esp, 1601157 xor eax, eax01158 mov [esp], edi01159 mov [esp+4], esi01160 mov [esp+8], ebx01161 mov [esp+12], ebp01162 01163 mov ebx, [esp+20] <span class="comment">// B</span>01164 mov esi, [esp+24] <span class="comment">// N</span>01165 01166 <span class="comment">// now: ebx = B, ecx = C, edx = A, esi = N</span>01167 01168 neg esi01169 jz loopend <span class="comment">// if no dwords then nothing to do</span>01170 01171 mov edi, [edx]01172 mov ebp, [ebx]01173 01174 loopstart:01175 add edi, eax01176 jc carry101177 01178 xor eax, eax01179 01180 carry1continue:01181 add edi, ebp01182 mov ebp, 101183 mov [ecx], edi01184 mov edi, [edx+4]01185 cmovc eax, ebp01186 mov ebp, [ebx+4]01187 lea ebx, [ebx+8]01188 add edi, eax01189 jc carry201190 01191 xor eax, eax01192 01193 carry2continue:01194 add edi, ebp01195 mov ebp, 101196 cmovc eax, ebp01197 mov [ecx+4], edi01198 add ecx, 801199 mov edi, [edx+8]01200 add edx, 801201 add esi, 201202 mov ebp, [ebx]01203 jnz loopstart01204 01205 loopend:01206 mov edi, [esp]01207 mov esi, [esp+4]01208 mov ebx, [esp+8]01209 mov ebp, [esp+12]01210 add esp, 1601211 ret 801212 01213 carry1:01214 mov eax, 101215 jmp carry1continue01216 01217 carry2:01218 mov eax, 101219 jmp carry2continue01220 }01221 }01222 01223 __declspec(naked) word __fastcall P4Optimized::Subtract(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N)01224 {01225 __asm01226 {01227 sub esp, 1601228 xor eax, eax01229 mov [esp], edi01230 mov [esp+4], esi01231 mov [esp+8], ebx01232 mov [esp+12], ebp01233 01234 mov ebx, [esp+20] <span class="comment">// B</span>01235 mov esi, [esp+24] <span class="comment">// N</span>01236 01237 <span class="comment">// now: ebx = B, ecx = C, edx = A, esi = N</span>01238 01239 neg esi01240 jz loopend <span class="comment">// if no dwords then nothing to do</span>01241 01242 mov edi, [edx]01243 mov ebp, [ebx]01244 01245 loopstart:01246 sub edi, eax01247 jc carry101248 01249 xor eax, eax01250 01251 carry1continue:01252 sub edi, ebp01253 mov ebp, 101254 mov [ecx], edi01255 mov edi, [edx+4]01256 cmovc eax, ebp01257 mov ebp, [ebx+4]01258 lea ebx, [ebx+8]01259 sub edi, eax01260 jc carry201261 01262 xor eax, eax01263 01264 carry2continue:01265 sub edi, ebp01266 mov ebp, 101267 cmovc eax, ebp01268 mov [ecx+4], edi01269 add ecx, 801270 mov edi, [edx+8]01271 add edx, 801272 add esi, 201273 mov ebp, [ebx]01274 jnz loopstart01275 01276 loopend:01277 mov edi, [esp]01278 mov esi, [esp+4]01279 mov ebx, [esp+8]01280 mov ebp, [esp+12]01281 add esp, 1601282 ret 801283 01284 carry1:01285 mov eax, 101286 jmp carry1continue01287 01288 carry2:01289 mov eax, 101290 jmp carry2continue01291 }01292 }01293 01294 <span class="preprocessor">#endif // #ifdef SSE2_INTRINSICS_AVAILABLE</span>01295 <span class="preprocessor"></span>01296 <span class="preprocessor">#elif defined(__GNUC__) && defined(__i386__)</span>01297 <span class="preprocessor"></span>01298 <span class="keyword">class </span>PentiumOptimized : <span class="keyword">public</span> Portable01299 {01300 <span class="keyword">public</span>:01301 <span class="keyword">static</span> word Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);01302 <span class="keyword">static</span> word Subtract(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);01303 <span class="keyword">static</span> <span class="keywordtype">void</span> Square4(word *R, <span class="keyword">const</span> word *A);01304 <span class="keyword">static</span> <span class="keywordtype">void</span> Multiply4(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B);01305 <span class="keyword">static</span> <span class="keywordtype">void</span> Multiply8(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B);01306 };01307 01308 <span class="keyword">typedef</span> PentiumOptimized LowLevel;01309 01310 <span class="comment">// Add and Subtract assembly code originally contributed by Alister Lee</span>01311 01312 __attribute__((regparm(3))) word PentiumOptimized::Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N)01313 {01314 assert (N%2 == 0);01315 01316 <span class="keyword">register</span> word carry, temp;01317 01318 __asm__ __volatile__(01319 <span class="stringliteral">"push %%ebp;"</span>01320 <span class="stringliteral">"sub %3, %2;"</span>01321 <span class="stringliteral">"xor %0, %0;"</span>01322 <span class="stringliteral">"sub %4, %0;"</span>01323 <span class="stringliteral">"lea (%1,%4,4), %1;"</span>01324 <span class="stringliteral">"sar $1, %0;"</span>01325 <span class="stringliteral">"jz 1f;"</span>01326 01327 <span class="stringliteral">"0:;"</span>01328 <span class="stringliteral">"mov 0(%3), %4;"</span>01329 <span class="stringliteral">"mov 4(%3), %%ebp;"</span>01330 <span class="stringliteral">"mov (%1,%0,8), %5;"</span>01331 <span class="stringliteral">"lea 8(%3), %3;"</span>01332 <span class="stringliteral">"adc %5, %4;"</span>01333 <span class="stringliteral">"mov 4(%1,%0,8), %5;"</span>01334 <span class="stringliteral">"adc %5, %%ebp;"</span>01335 <span class="stringliteral">"inc %0;"</span>01336 <span class="stringliteral">"mov %4, -8(%3, %2);"</span>01337 <span class="stringliteral">"mov %%ebp, -4(%3, %2);"</span>01338 <span class="stringliteral">"jnz 0b;"</span>01339 01340 <span class="stringliteral">"1:;"</span>01341 <span class="stringliteral">"adc $0, %0;"</span>01342 <span class="stringliteral">"pop %%ebp;"</span>01343 01344 : <span class="stringliteral">"=aSD"</span> (carry), <span class="stringliteral">"+r"</span> (B), <span class="stringliteral">"+r"</span> (C), <span class="stringliteral">"+r"</span> (A), <span clas
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -